Spaces:
Running
Running
oceansweep
committed on
Commit
•
fa9a583
1
Parent(s):
086d7f1
Syncing latest changes, let's see what breaks
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- App_Function_Libraries/Article_Summarization_Lib.py +220 -291
- App_Function_Libraries/Audio_Files.py +691 -691
- App_Function_Libraries/Audio_Transcription_Lib.py +191 -191
- App_Function_Libraries/Chat.py +273 -0
- App_Function_Libraries/Chat_related_functions.py +41 -0
- App_Function_Libraries/ChromaDB_Library.py +225 -0
- App_Function_Libraries/Chunk_Lib.py +586 -582
- App_Function_Libraries/DB_Manager.py +472 -0
- App_Function_Libraries/Diarization_Lib.py +1 -6
- App_Function_Libraries/Gradio_Related.py +0 -0
- App_Function_Libraries/Gradio_UI/Audio_ingestion_tab.py +152 -0
- App_Function_Libraries/Gradio_UI/Chat_ui.py +1017 -0
- App_Function_Libraries/Gradio_UI/Explain_summarize_tab.py +193 -0
- App_Function_Libraries/Gradio_UI/Export_Functionality.py +314 -0
- App_Function_Libraries/Gradio_UI/Gradio_Shared.py +284 -0
- App_Function_Libraries/Gradio_UI/Import_Functionality.py +473 -0
- App_Function_Libraries/Gradio_UI/Introduction_tab.py +161 -0
- App_Function_Libraries/Gradio_UI/Keywords.py +65 -0
- App_Function_Libraries/Gradio_UI/Llamafile_tab.py +122 -0
- App_Function_Libraries/Gradio_UI/Media_edit.py +273 -0
- App_Function_Libraries/Gradio_UI/PDF_ingestion_tab.py +152 -0
- App_Function_Libraries/Gradio_UI/Podcast_tab.py +164 -0
- App_Function_Libraries/Gradio_UI/Re_summarize_tab.py +268 -0
- App_Function_Libraries/Gradio_UI/Search_Tab.py +487 -0
- App_Function_Libraries/Gradio_UI/Transcript_comparison.py +94 -0
- App_Function_Libraries/Gradio_UI/Trash.py +134 -0
- App_Function_Libraries/Gradio_UI/Utilities.py +118 -0
- App_Function_Libraries/Gradio_UI/Video_transcription_tab.py +691 -0
- App_Function_Libraries/Gradio_UI/Website_scraping_tab.py +113 -0
- App_Function_Libraries/Gradio_UI/Writing.py +700 -0
- App_Function_Libraries/Gradio_UI/__init__.py +0 -0
- App_Function_Libraries/Gradio_UI/__pycache__/Audio_ingestion_tab.cpython-312.pyc +0 -0
- App_Function_Libraries/Gradio_UI/__pycache__/Chat_ui.cpython-312.pyc +0 -0
- App_Function_Libraries/Gradio_UI/__pycache__/Explain_summarize_tab.cpython-312.pyc +0 -0
- App_Function_Libraries/Gradio_UI/__pycache__/Export_Functionality.cpython-312.pyc +0 -0
- App_Function_Libraries/Gradio_UI/__pycache__/Gradio_Shared.cpython-312.pyc +0 -0
- App_Function_Libraries/Gradio_UI/__pycache__/Import_Functionality.cpython-312.pyc +0 -0
- App_Function_Libraries/Gradio_UI/__pycache__/Introduction_tab.cpython-312.pyc +0 -0
- App_Function_Libraries/Gradio_UI/__pycache__/Keywords.cpython-312.pyc +0 -0
- App_Function_Libraries/Gradio_UI/__pycache__/Llamafile_tab.cpython-312.pyc +0 -0
- App_Function_Libraries/Gradio_UI/__pycache__/Media_edit.cpython-312.pyc +0 -0
- App_Function_Libraries/Gradio_UI/__pycache__/PDF_ingestion_tab.cpython-312.pyc +0 -0
- App_Function_Libraries/Gradio_UI/__pycache__/Podcast_tab.cpython-312.pyc +0 -0
- App_Function_Libraries/Gradio_UI/__pycache__/Re_summarize_tab.cpython-312.pyc +0 -0
- App_Function_Libraries/Gradio_UI/__pycache__/Search_Tab.cpython-312.pyc +0 -0
- App_Function_Libraries/Gradio_UI/__pycache__/Trash.cpython-312.pyc +0 -0
- App_Function_Libraries/Gradio_UI/__pycache__/Utilities.cpython-312.pyc +0 -0
- App_Function_Libraries/Gradio_UI/__pycache__/Video_transcription_tab.cpython-312.pyc +0 -0
- App_Function_Libraries/Gradio_UI/__pycache__/Website_scraping_tab.cpython-312.pyc +0 -0
- App_Function_Libraries/Gradio_UI/__pycache__/Writing.cpython-312.pyc +0 -0
App_Function_Libraries/Article_Summarization_Lib.py
CHANGED
@@ -1,292 +1,221 @@
|
|
1 |
-
# Article_Summarization_Lib.py
|
2 |
-
#########################################
|
3 |
-
# Article Summarization Library
|
4 |
-
# This library is used to handle summarization of articles.
|
5 |
-
|
6 |
-
#
|
7 |
-
####
|
8 |
-
#
|
9 |
-
####################
|
10 |
-
# Function List
|
11 |
-
#
|
12 |
-
# 1.
|
13 |
-
#
|
14 |
-
####################
|
15 |
-
#
|
16 |
-
# Import necessary libraries
|
17 |
-
import datetime
|
18 |
-
from datetime import datetime
|
19 |
-
import gradio as gr
|
20 |
-
import json
|
21 |
-
import os
|
22 |
-
import logging
|
23 |
-
import requests
|
24 |
-
# 3rd-Party Imports
|
25 |
-
from tqdm import tqdm
|
26 |
-
|
27 |
-
from App_Function_Libraries.Utils import sanitize_filename
|
28 |
-
# Local Imports
|
29 |
-
from Article_Extractor_Lib import scrape_article
|
30 |
-
from Local_Summarization_Lib import summarize_with_llama, summarize_with_oobabooga, summarize_with_tabbyapi, \
|
31 |
-
summarize_with_vllm, summarize_with_kobold, save_summary_to_file, summarize_with_local_llm
|
32 |
-
from Summarization_General_Lib import summarize_with_openai, summarize_with_anthropic, summarize_with_cohere, \
|
33 |
-
summarize_with_groq, summarize_with_openrouter, summarize_with_deepseek, summarize_with_huggingface, \
|
34 |
-
summarize_with_mistral
|
35 |
-
from
|
36 |
-
#
|
37 |
-
#######################################################################################################################
|
38 |
-
# Function Definitions
|
39 |
-
#
|
40 |
-
|
41 |
-
def
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
if not
|
78 |
-
|
79 |
-
|
80 |
-
if
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
#
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
def
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
def
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
|
195 |
-
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
elif api_name.lower() == "tabbyapi":
|
222 |
-
# def summarize_with_tabbyapi(input_data, tabby_model, custom_prompt_input, api_key=None, api_IP):
|
223 |
-
summary = summarize_with_tabbyapi(json_file_path, article_custom_prompt, system_message)
|
224 |
-
|
225 |
-
elif api_name.lower() == "vllm":
|
226 |
-
logging.debug(f"MAIN: Trying to summarize with VLLM")
|
227 |
-
# def summarize_with_vllm(api_key, input_data, custom_prompt_input):
|
228 |
-
summary = summarize_with_vllm(json_file_path, article_custom_prompt, system_message)
|
229 |
-
|
230 |
-
elif api_name.lower() == "local-llm":
|
231 |
-
logging.debug(f"MAIN: Trying to summarize with Local LLM")
|
232 |
-
summary = summarize_with_local_llm(json_file_path, article_custom_prompt, system_message)
|
233 |
-
|
234 |
-
elif api_name.lower() == "huggingface":
|
235 |
-
logging.debug(f"MAIN: Trying to summarize with huggingface")
|
236 |
-
# def summarize_with_huggingface(api_key, input_data, custom_prompt_arg):
|
237 |
-
summarize_with_huggingface(api_key, json_file_path, article_custom_prompt, system_message)
|
238 |
-
# Add additional API handlers here...
|
239 |
-
except requests.exceptions.ConnectionError as e:
|
240 |
-
logging.error(f"Connection error while trying to summarize with {api_name}: {str(e)}")
|
241 |
-
|
242 |
-
if summary:
|
243 |
-
logging.info(f"Article_Summarizer: Summary generated using {api_name} API")
|
244 |
-
save_summary_to_file(summary, json_file_path)
|
245 |
-
else:
|
246 |
-
summary = "Summary not available"
|
247 |
-
logging.warning(f"Failed to generate summary using {api_name} API")
|
248 |
-
|
249 |
-
else:
|
250 |
-
summary = "Article Summarization: No API provided for summarization."
|
251 |
-
|
252 |
-
print(f"Summary: {summary}") # Debugging statement
|
253 |
-
|
254 |
-
# Step 3: Ingest the article into the database
|
255 |
-
ingestion_result = ingest_article_to_db(url, title, author, content, keywords, summary, ingestion_date,
|
256 |
-
article_custom_prompt)
|
257 |
-
|
258 |
-
return f"Title: {title}\nAuthor: {author}\nIngestion Result: {ingestion_result}\n\nSummary: {summary}\n\nArticle Contents: {content}"
|
259 |
-
except Exception as e:
|
260 |
-
logging.error(f"Error processing URL {url}: {str(e)}")
|
261 |
-
return f"Failed to process URL {url}: {str(e)}"
|
262 |
-
|
263 |
-
|
264 |
-
def ingest_unstructured_text(text, custom_prompt, api_name, api_key, keywords, custom_article_title, system_message=None):
    """Summarize a block of free-form text (optionally) and ingest it into the database.

    Args:
        text: The raw text to store (and summarize, when an API is given).
        custom_prompt: Prompt passed to the summarizer and stored with the entry.
        api_name: Name of the summarization API; only ``openai`` is handled here.
            A falsy value skips summarization.
        api_key: API key forwarded to the summarizer.
        keywords: Keywords forwarded to ``ingest_article_to_db``.
        custom_article_title: Title for the entry; blank or missing values fall
            back to ``"Unstructured Text"``.
        system_message: Optional system message forwarded to the summarizer.

    Returns:
        A human-readable string containing the title, the summary (or the
        reason none was produced) and the ingestion result.
    """
    # A whitespace-only title is treated like a missing one instead of
    # producing an empty title.
    title = (custom_article_title or "").strip() or "Unstructured Text"
    author = "Unknown"
    ingestion_date = datetime.now().strftime('%Y-%m-%d')

    # Summarize the unstructured text
    if not api_name:
        summary = "No API provided for summarization."
    elif api_name.lower() == 'openai':
        # Build the path the same way scrape_and_summarize does so that titles
        # containing path separators cannot escape or break the Results folder.
        # NOTE(review): relies on sanitize_filename producing a filesystem-safe
        # name -- confirm against App_Function_Libraries.Utils.
        os.makedirs("Results", exist_ok=True)
        json_file_path = os.path.join("Results", f"{sanitize_filename(title)}_segments.json")
        with open(json_file_path, 'w', encoding='utf-8') as json_file:
            json.dump([{'text': text}], json_file, indent=2)

        summary = summarize_with_openai(api_key, json_file_path, custom_prompt, system_message)
        # Add other APIs as needed
    else:
        # No segments file is written for an API this function cannot call.
        summary = "Unsupported API."

    # Ingest the unstructured text into the database
    ingestion_result = ingest_article_to_db('Unstructured Text', title, author, text, keywords, summary,
                                            ingestion_date, custom_prompt)
    return f"Title: {title}\nSummary: {summary}\nIngestion Result: {ingestion_result}"
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
#
|
291 |
-
#
|
292 |
#######################################################################################################################
|
|
|
1 |
+
# Article_Summarization_Lib.py
|
2 |
+
#########################################
|
3 |
+
# Article Summarization Library
|
4 |
+
# This library is used to handle summarization of articles.
|
5 |
+
|
6 |
+
#
|
7 |
+
####
|
8 |
+
#
|
9 |
+
####################
|
10 |
+
# Function List
|
11 |
+
#
|
12 |
+
# 1.
|
13 |
+
#
|
14 |
+
####################
|
15 |
+
#
|
16 |
+
# Import necessary libraries
|
17 |
+
import datetime
|
18 |
+
from datetime import datetime
|
19 |
+
import gradio as gr
|
20 |
+
import json
|
21 |
+
import os
|
22 |
+
import logging
|
23 |
+
import requests
|
24 |
+
# 3rd-Party Imports
|
25 |
+
from tqdm import tqdm
|
26 |
+
|
27 |
+
from App_Function_Libraries.Utils import sanitize_filename
|
28 |
+
# Local Imports
|
29 |
+
from Article_Extractor_Lib import scrape_article
|
30 |
+
from Local_Summarization_Lib import summarize_with_llama, summarize_with_oobabooga, summarize_with_tabbyapi, \
|
31 |
+
summarize_with_vllm, summarize_with_kobold, save_summary_to_file, summarize_with_local_llm
|
32 |
+
from Summarization_General_Lib import summarize_with_openai, summarize_with_anthropic, summarize_with_cohere, \
|
33 |
+
summarize_with_groq, summarize_with_openrouter, summarize_with_deepseek, summarize_with_huggingface, \
|
34 |
+
summarize_with_mistral
|
35 |
+
from App_Function_Libraries.DB_Manager import ingest_article_to_db
|
36 |
+
#
|
37 |
+
#######################################################################################################################
|
38 |
+
# Function Definitions
|
39 |
+
#
|
40 |
+
|
41 |
+
def scrape_and_summarize_multiple(urls, custom_prompt_arg, api_name, api_key, keywords, custom_article_titles, system_message=None):
    """Run ``scrape_and_summarize`` over a newline-separated list of URLs.

    Args:
        urls: Newline-separated URLs; blank lines are ignored.
        custom_prompt_arg: Prompt forwarded to each summarization call.
        api_name: Summarization API name forwarded to each call.
        api_key: API key forwarded to each call.
        keywords: Keywords forwarded to each call.
        custom_article_titles: Optional newline-separated titles, matched to
            the URLs by position; URLs without a matching title get ``None``.
        system_message: Optional system message forwarded to each call.

    Returns:
        One string with a section per URL, followed by an
        ``Errors encountered`` section when any call raised.
    """
    url_list = [line.strip() for line in urls.split('\n') if line.strip()]
    title_list = custom_article_titles.split('\n') if custom_article_titles else []
    url_count = len(url_list)

    report_sections = []
    failures = []

    # Gradio progress tracker, updated once per processed URL
    tracker = gr.Progress()

    for number, url in tqdm(enumerate(url_list, start=1), total=url_count, desc="Processing URLs"):
        title_index = number - 1
        if title_index < len(title_list):
            custom_title = title_list[title_index]
        else:
            custom_title = None

        try:
            outcome = scrape_and_summarize(url, custom_prompt_arg, api_name, api_key, keywords, custom_title,
                                           system_message)
        except Exception as exc:
            failures.append(f"Error processing URL {number} ({url}): {str(exc)}")
            report_sections.append(f"Failed to process URL {number}: {url}")
        else:
            report_sections.append(f"Results for URL {number}:\n{outcome}")

        tracker(number / url_count, desc=f"Processed {number}/{url_count} URLs")

    # Per-URL sections first, then the collected error messages (if any)
    output_parts = ["\n".join(report_sections)]
    if failures:
        output_parts.append("Errors encountered:\n" + "\n".join(failures))

    return "\n\n".join(output_parts)
|
70 |
+
|
71 |
+
|
72 |
+
def scrape_and_summarize(url, custom_prompt_arg, api_name, api_key, keywords, custom_article_title, system_message=None):
    """Scrape one article, optionally summarize it, and ingest it into the database.

    Args:
        url: Address of the article to scrape.
        custom_prompt_arg: Prompt for the summarizer; a default prompt is used
            when this is falsy.
        api_name: Name of the summarization backend (case-insensitive). A falsy
            value skips summarization.
        api_key: API key forwarded to backends that take one.
        keywords: Keywords forwarded to ``ingest_article_to_db``.
        custom_article_title: Title overriding the scraped one; blank or
            missing values fall back to the scraped title.
        system_message: System message for the summarizer; a default is used
            when this is falsy.

    Returns:
        A human-readable report (title, author, ingestion result, summary and
        article contents), or a failure message. Exceptions are logged and
        converted into the failure message rather than propagated.
    """
    try:
        # Step 1: Scrape the article
        article_data = scrape_article(url)
        print(f"Scraped Article Data: {article_data}")  # Debugging statement
        if not article_data:
            return "Failed to scrape the article."

        # Use the custom title if provided, otherwise use the scraped title.
        # A whitespace-only custom title counts as "not provided".
        custom_title = (custom_article_title or "").strip()
        title = custom_title or article_data.get('title', 'Untitled')
        author = article_data.get('author', 'Unknown')
        content = article_data.get('content', '')
        ingestion_date = datetime.now().strftime('%Y-%m-%d')

        print(f"Title: {title}, Author: {author}, Content Length: {len(content)}")  # Debugging statement

        # Custom system prompt for the article
        system_message = system_message or "Act as a professional summarizer and summarize this article."
        # Custom prompt for the article
        article_custom_prompt = custom_prompt_arg or "Act as a professional summarizer and summarize this article."

        # Step 2: Summarize the article
        summary = None
        if api_name:
            logging.debug(f"Article_Summarizer: Summarization being performed by {api_name}")

            # Sanitize filename for saving the JSON file
            sanitized_title = sanitize_filename(title)
            # open() below fails if the output directory does not exist yet
            os.makedirs("Results", exist_ok=True)
            json_file_path = os.path.join("Results", f"{sanitized_title}_segments.json")

            with open(json_file_path, 'w', encoding='utf-8') as json_file:
                json.dump([{'text': content}], json_file, indent=2)

            api = api_name.lower()

            # FIXME - Swap out this if/else to use the dedicated function....
            try:
                if api == 'openai':
                    summary = summarize_with_openai(api_key, json_file_path, article_custom_prompt, system_message)

                elif api == "anthropic":
                    summary = summarize_with_anthropic(api_key, json_file_path, article_custom_prompt, system_message)

                elif api == "cohere":
                    summary = summarize_with_cohere(api_key, json_file_path, article_custom_prompt, system_message)

                elif api == "groq":
                    logging.debug("MAIN: Trying to summarize with groq")
                    summary = summarize_with_groq(api_key, json_file_path, article_custom_prompt, system_message)

                elif api == "openrouter":
                    logging.debug("MAIN: Trying to summarize with OpenRouter")
                    summary = summarize_with_openrouter(api_key, json_file_path, article_custom_prompt, system_message)

                elif api == "deepseek":
                    logging.debug("MAIN: Trying to summarize with DeepSeek")
                    summary = summarize_with_deepseek(api_key, json_file_path, article_custom_prompt, system_message)

                elif api == "mistral":
                    summary = summarize_with_mistral(api_key, json_file_path, article_custom_prompt, system_message)

                elif api == "llama.cpp":
                    logging.debug("MAIN: Trying to summarize with Llama.cpp")
                    summary = summarize_with_llama(json_file_path, article_custom_prompt, system_message)

                elif api == "kobold":
                    logging.debug("MAIN: Trying to summarize with Kobold.cpp")
                    summary = summarize_with_kobold(json_file_path, api_key, article_custom_prompt, system_message)

                elif api == "ooba":
                    summary = summarize_with_oobabooga(json_file_path, api_key, article_custom_prompt, system_message)

                elif api == "tabbyapi":
                    summary = summarize_with_tabbyapi(json_file_path, article_custom_prompt, system_message)

                elif api == "vllm":
                    logging.debug("MAIN: Trying to summarize with VLLM")
                    summary = summarize_with_vllm(json_file_path, article_custom_prompt, system_message)

                elif api == "local-llm":
                    logging.debug("MAIN: Trying to summarize with Local LLM")
                    summary = summarize_with_local_llm(json_file_path, article_custom_prompt, system_message)

                elif api == "huggingface":
                    logging.debug("MAIN: Trying to summarize with huggingface")
                    # The result must be kept: without the assignment the
                    # summary was always reported as unavailable.
                    summary = summarize_with_huggingface(api_key, json_file_path, article_custom_prompt,
                                                         system_message)

                else:
                    logging.warning(f"Article_Summarizer: Unrecognized API name '{api_name}'")
                # Add additional API handlers here...
            except requests.exceptions.ConnectionError as e:
                logging.error(f"Connection error while trying to summarize with {api_name}: {str(e)}")

            if summary:
                logging.info(f"Article_Summarizer: Summary generated using {api_name} API")
                save_summary_to_file(summary, json_file_path)
            else:
                summary = "Summary not available"
                logging.warning(f"Failed to generate summary using {api_name} API")

        else:
            summary = "Article Summarization: No API provided for summarization."

        print(f"Summary: {summary}")  # Debugging statement

        # Step 3: Ingest the article into the database
        ingestion_result = ingest_article_to_db(url, title, author, content, keywords, summary, ingestion_date,
                                                article_custom_prompt)

        return f"Title: {title}\nAuthor: {author}\nIngestion Result: {ingestion_result}\n\nSummary: {summary}\n\nArticle Contents: {content}"
    except Exception as e:
        logging.error(f"Error processing URL {url}: {str(e)}")
        return f"Failed to process URL {url}: {str(e)}"
|
191 |
+
|
192 |
+
|
193 |
+
def ingest_unstructured_text(text, custom_prompt, api_name, api_key, keywords, custom_article_title, system_message=None):
    """Summarize a block of free-form text (optionally) and ingest it into the database.

    Args:
        text: The raw text to store (and summarize, when an API is given).
        custom_prompt: Prompt passed to the summarizer and stored with the entry.
        api_name: Name of the summarization API; only ``openai`` is handled here.
            A falsy value skips summarization.
        api_key: API key forwarded to the summarizer.
        keywords: Keywords forwarded to ``ingest_article_to_db``.
        custom_article_title: Title for the entry; blank or missing values fall
            back to ``"Unstructured Text"``.
        system_message: Optional system message forwarded to the summarizer.

    Returns:
        A human-readable string containing the title, the summary (or the
        reason none was produced) and the ingestion result.
    """
    # A whitespace-only title is treated like a missing one instead of
    # producing an empty title.
    title = (custom_article_title or "").strip() or "Unstructured Text"
    author = "Unknown"
    ingestion_date = datetime.now().strftime('%Y-%m-%d')

    # Summarize the unstructured text
    if not api_name:
        summary = "No API provided for summarization."
    elif api_name.lower() == 'openai':
        # Build the path the same way scrape_and_summarize does so that titles
        # containing path separators cannot escape or break the Results folder.
        # NOTE(review): relies on sanitize_filename producing a filesystem-safe
        # name -- confirm against App_Function_Libraries.Utils.
        os.makedirs("Results", exist_ok=True)
        json_file_path = os.path.join("Results", f"{sanitize_filename(title)}_segments.json")
        with open(json_file_path, 'w', encoding='utf-8') as json_file:
            json.dump([{'text': text}], json_file, indent=2)

        summary = summarize_with_openai(api_key, json_file_path, custom_prompt, system_message)
        # Add other APIs as needed
    else:
        # No segments file is written for an API this function cannot call.
        summary = "Unsupported API."

    # Ingest the unstructured text into the database
    ingestion_result = ingest_article_to_db('Unstructured Text', title, author, text, keywords, summary,
                                            ingestion_date, custom_prompt)
    return f"Title: {title}\nSummary: {summary}\nIngestion Result: {ingestion_result}"
|
216 |
+
|
217 |
+
|
218 |
+
|
219 |
+
#
|
220 |
+
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
221 |
#######################################################################################################################
|
App_Function_Libraries/Audio_Files.py
CHANGED
@@ -1,692 +1,692 @@
|
|
1 |
-
# Audio_Files.py
|
2 |
-
#########################################
|
3 |
-
# Audio Processing Library
|
4 |
-
# This library is used to download or load audio files from a local directory.
|
5 |
-
#
|
6 |
-
####
|
7 |
-
#
|
8 |
-
# Functions:
|
9 |
-
#
|
10 |
-
# download_audio_file(url, save_path)
|
11 |
-
# process_audio(
|
12 |
-
# process_audio_file(audio_url, audio_file, whisper_model="small.en", api_name=None, api_key=None)
|
13 |
-
#
|
14 |
-
#
|
15 |
-
#########################################
|
16 |
-
# Imports
|
17 |
-
import json
|
18 |
-
import logging
|
19 |
-
import os
|
20 |
-
import subprocess
|
21 |
-
import tempfile
|
22 |
-
import uuid
|
23 |
-
from datetime import datetime
|
24 |
-
from pathlib import Path
|
25 |
-
|
26 |
-
import requests
|
27 |
-
import yt_dlp
|
28 |
-
|
29 |
-
from App_Function_Libraries.Audio_Transcription_Lib import speech_to_text
|
30 |
-
from App_Function_Libraries.Chunk_Lib import improved_chunking_process
|
31 |
-
#
|
32 |
-
# Local Imports
|
33 |
-
from App_Function_Libraries.
|
34 |
-
check_media_and_whisper_model
|
35 |
-
from App_Function_Libraries.Summarization_General_Lib import save_transcription_and_summary, perform_transcription, \
|
36 |
-
perform_summarization
|
37 |
-
from App_Function_Libraries.Utils import create_download_directory, save_segments_to_json, downloaded_files, \
|
38 |
-
sanitize_filename
|
39 |
-
from App_Function_Libraries.Video_DL_Ingestion_Lib import extract_metadata
|
40 |
-
|
41 |
-
#
|
42 |
-
#######################################################################################################################
|
43 |
-
# Function Definitions
|
44 |
-
#
|
45 |
-
|
46 |
-
# Upper bound, in bytes (500 * 1024 * 1024), on the size of an audio file;
# process_single_audio skips files whose on-disk size exceeds it.
MAX_FILE_SIZE = 500 * 1024 * 1024
|
47 |
-
|
48 |
-
|
49 |
-
def download_audio_file(url, current_whisper_model="", use_cookies=False, cookies=None):
    """Download an audio file over HTTP into the local ``downloads`` directory.

    Args:
        url: Address of the audio file.
        current_whisper_model: Whisper model name passed to
            ``check_media_and_whisper_model`` to decide whether a download is
            needed at all.
        use_cookies: When true (and ``cookies`` is given), send the cookies
            with the request.
        cookies: JSON object string mapping cookie names to values. Input that
            cannot be parsed into a mapping is ignored with a warning.

    Returns:
        The path of the downloaded file, or ``None`` when
        ``check_media_and_whisper_model`` reports that no download is needed.

    Raises:
        requests.RequestException: On network or HTTP errors, including timeouts.
        ValueError: When the file is larger than ``MAX_FILE_SIZE``.
        Exception: Any other error is logged and re-raised unchanged.
    """
    save_path = None
    completed = False
    try:
        # Check if media already exists in the database and compare whisper models
        should_download, reason = check_media_and_whisper_model(
            url=url,
            current_whisper_model=current_whisper_model
        )

        if not should_download:
            logging.info(f"Skipping audio download: {reason}")
            return None

        logging.info(f"Proceeding with audio download: {reason}")

        # Set up the request headers
        headers = {}
        if use_cookies and cookies:
            try:
                cookie_dict = json.loads(cookies)
                headers['Cookie'] = '; '.join(f'{k}={v}' for k, v in cookie_dict.items())
            except (json.JSONDecodeError, TypeError, AttributeError):
                # Covers malformed JSON, non-string input, and JSON that is
                # not an object (no .items()).
                logging.warning("Invalid cookie format. Proceeding without cookies.")

        # (connect, read) timeouts keep a stalled server from hanging the call;
        # the context manager releases the streamed connection on every path.
        with requests.get(url, headers=headers, stream=True, timeout=(10, 60)) as response:
            # Raise an exception for bad status codes
            response.raise_for_status()

            # Reject early when the server announces an oversized body
            file_size = int(response.headers.get('content-length', 0))
            if file_size > MAX_FILE_SIZE:
                raise ValueError("File size exceeds the 500MB limit.")

            # Generate a unique filename
            file_name = f"audio_{uuid.uuid4().hex[:8]}.mp3"

            # Ensure the downloads directory exists
            os.makedirs('downloads', exist_ok=True)
            save_path = os.path.join('downloads', file_name)

            # Download the file. Content-Length may be absent or wrong, so the
            # limit is enforced on the bytes actually received as well.
            bytes_written = 0
            with open(save_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if not chunk:
                        continue
                    bytes_written += len(chunk)
                    if bytes_written > MAX_FILE_SIZE:
                        raise ValueError("File size exceeds the 500MB limit.")
                    f.write(chunk)

        logging.info(f"Audio file downloaded successfully: {save_path}")
        completed = True
        return save_path

    except requests.RequestException as e:
        logging.error(f"Error downloading audio file: {str(e)}")
        raise
    except ValueError as e:
        logging.error(str(e))
        raise
    except Exception as e:
        logging.error(f"Unexpected error downloading audio file: {str(e)}")
        raise
    finally:
        # Do not leave a truncated file behind when the download failed midway
        if not completed and save_path and os.path.exists(save_path):
            try:
                os.remove(save_path)
            except OSError as cleanup_error:
                logging.warning(f"Could not remove partial download {save_path}: {cleanup_error}")
|
108 |
-
|
109 |
-
|
110 |
-
def process_audio(
        audio_file_path,
        num_speakers=2,
        whisper_model="small.en",
        custom_prompt_input=None,
        offset=0,
        api_name=None,
        api_key=None,
        vad_filter=False,
        rolling_summarization=False,
        detail_level=0.01,
        keywords="default,no_keyword_set",
        chunk_text_by_words=False,
        max_words=0,
        chunk_text_by_sentences=False,
        max_sentences=0,
        chunk_text_by_paragraphs=False,
        max_paragraphs=0,
        chunk_text_by_tokens=False,
        max_tokens=0
):
    """Transcribe an audio file, optionally summarize it, and store the results.

    Args:
        audio_file_path: Path of the audio file to transcribe.
        whisper_model: Transcription model name passed to
            ``perform_transcription`` and stored with the database entry.
        custom_prompt_input: Prompt passed to the summarizer and stored with
            the database entry.
        offset: Offset forwarded to ``perform_transcription``.
        api_name: Summarization API name; a falsy value skips summarization.
        api_key: API key forwarded to the summarizer.
        vad_filter: Flag forwarded to ``perform_transcription``.
        rolling_summarization: Requests rolling summarization, which is not
            implemented yet; when true no summary is generated.
        keywords: Keywords stored with the database entry.
        num_speakers, detail_level, chunk_text_by_*, max_*: Accepted for
            interface compatibility; not used by this function at present.

    Returns:
        A 6-tuple ``(transcription, summary, json_file_path,
        summary_file_path, None, None)``. On failure the first element is an
        error message and the remaining five are ``None``.
    """
    try:
        # Perform transcription
        audio_file_path, segments = perform_transcription(audio_file_path, offset, whisper_model, vad_filter)

        if audio_file_path is None or segments is None:
            logging.error("Process_Audio: Transcription failed or segments not available.")
            return "Process_Audio: Transcription failed.", None, None, None, None, None

        logging.debug(f"Process_Audio: Transcription audio_file: {audio_file_path}")
        logging.debug(f"Process_Audio: Transcription segments: {segments}")

        transcription_text = {'audio_file': audio_file_path, 'transcription': segments}
        logging.debug(f"Process_Audio: Transcription text: {transcription_text}")

        # Save segments to JSON
        segments_json_path = save_segments_to_json(segments)

        # Perform summarization
        summary_text = None
        if api_name:
            # Test truthiness: the default is False, and an "is not None"
            # check sent every call down the unimplemented rolling path, so
            # no summary was ever produced.
            if rolling_summarization:
                # FIXME rolling summarization is not implemented yet
                logging.warning("Process_Audio: Rolling summarization is not implemented; no summary generated.")
            else:
                summary_text = perform_summarization(api_name, segments_json_path, custom_prompt_input, api_key)
                if summary_text is None:
                    logging.error("Summary text is None. Check summarization function.")
        else:
            summary_text = 'Summary not available'

        # Save transcription and summary
        download_path = create_download_directory("Audio_Processing")
        json_file_path, summary_file_path = save_transcription_and_summary(transcription_text, summary_text,
                                                                           download_path)

        # Update function call to add_media_to_database so that it properly applies the title, author and file type
        # Add to database
        add_media_to_database(None, {'title': 'Audio File', 'author': 'Unknown'}, segments, summary_text, keywords,
                              custom_prompt_input, whisper_model)

        return transcription_text, summary_text, json_file_path, summary_file_path, None, None

    except Exception as e:
        logging.error(f"Error in process_audio: {str(e)}")
        return str(e), None, None, None, None, None
|
195 |
-
|
196 |
-
|
197 |
-
def process_single_audio(audio_file_path, whisper_model, api_name, api_key, keep_original, custom_keywords, source,
                         custom_prompt_input, chunk_method, max_chunk_size, chunk_overlap, use_adaptive_chunking,
                         use_multi_level_chunking, chunk_language):
    """Transcribe one audio file, optionally summarize it, and add it to the database.

    Args:
        audio_file_path: Path of the audio file on disk.
        whisper_model: Transcription model name passed to ``speech_to_text``.
        api_name: Summarization API name; summarization runs only when both
            ``api_name`` and ``api_key`` are truthy.
        api_key: API key forwarded to the summarizer.
        keep_original: When false, the file is deleted after processing
            (unless ``source`` is ``"Uploaded File"``).
        custom_keywords: Extra comma-separated keywords appended to
            ``"audio,transcription"``.
        source: Origin of the file; stored as the entry's URL.
        custom_prompt_input: Summarization prompt; a default prompt is used
            when this is falsy.
        chunk_method, max_chunk_size, chunk_overlap, use_adaptive_chunking,
        use_multi_level_chunking, chunk_language: Accepted for interface
            compatibility; not used by this function at present.

    Returns:
        A 3-tuple ``(progress_log, transcription, summary)``. Errors are
        reported through the progress log and the transcription text instead
        of being raised.
    """
    progress = []
    transcription = ""
    summary = ""

    def update_progress(message):
        progress.append(message)
        return "\n".join(progress)

    # Honor the caller's prompt; it used to be accepted and then ignored in
    # favour of this hard-coded default.
    summary_prompt = custom_prompt_input or "Summarize the following audio transcript"

    try:
        # Check file size before processing
        file_size = os.path.getsize(audio_file_path)
        if file_size > MAX_FILE_SIZE:
            update_progress(f"File size ({file_size / (1024 * 1024):.2f} MB) exceeds the maximum limit of {MAX_FILE_SIZE / (1024 * 1024):.2f} MB. Skipping this file.")
            return "\n".join(progress), "", ""

        # Perform transcription
        update_progress("Starting transcription...")
        segments = speech_to_text(audio_file_path, whisper_model=whisper_model)
        transcription = " ".join(segment['Text'] for segment in segments)
        update_progress("Audio transcribed successfully.")

        # Perform summarization if API is provided
        if api_name and api_key:
            update_progress("Starting summarization...")
            summary = perform_summarization(api_name, transcription, summary_prompt, api_key)
            update_progress("Audio summarized successfully.")
        else:
            summary = "No summary available"

        # Prepare keywords
        keywords = "audio,transcription"
        if custom_keywords:
            keywords += f",{custom_keywords}"

        # Add to database
        add_media_with_keywords(
            url=source,
            title=os.path.basename(audio_file_path),
            media_type='audio',
            content=transcription,
            keywords=keywords,
            prompt=summary_prompt,
            summary=summary,
            transcription_model=whisper_model,
            author="Unknown",
            ingestion_date=None  # This will use the current date
        )
        update_progress("Audio file added to database successfully.")

        if not keep_original and source != "Uploaded File":
            os.remove(audio_file_path)
            update_progress(f"Temporary file {audio_file_path} removed.")
        elif keep_original and source != "Uploaded File":
            update_progress(f"Original audio file kept at: {audio_file_path}")

    except Exception as e:
        update_progress(f"Error processing {source}: {str(e)}")
        transcription = f"Error: {str(e)}"
        summary = "No summary due to error"

    return "\n".join(progress), transcription, summary
|
262 |
-
|
263 |
-
|
264 |
-
def process_audio_files(audio_urls, audio_file, whisper_model, api_name, api_key, use_cookies, cookies, keep_original,
                        custom_keywords, custom_prompt_input, chunk_method, max_chunk_size, chunk_overlap,
                        use_adaptive_chunking, use_multi_level_chunking, chunk_language, diarize):
    """Transcribe, summarize and store audio from a list of URLs and/or one uploaded file.

    Each source is re-encoded to MP3, converted to WAV with ffmpeg, transcribed with
    ``speech_to_text``, chunked, optionally summarized, and written to the database via
    ``add_media_with_keywords``.

    Args:
        audio_urls: Newline-separated URLs; ``None`` or an empty string means "no URLs".
        audio_file: Uploaded file object exposing a ``.name`` path, or a falsy value.
        whisper_model: Transcription model name; also stored with the media record.
        api_name: Summarization backend name; falsy disables summarization.
        api_key: Key for the summarization backend.
        use_cookies: Whether ``cookies`` should be sent when downloading.
        cookies: Cookie payload forwarded to ``download_audio_file``.
        keep_original: When falsy, every intermediate file is removed on success.
        custom_keywords: Keywords stored with each media record.
        custom_prompt_input: Prompt passed to the summarizer and stored with the record.
        chunk_method, max_chunk_size, chunk_overlap, use_adaptive_chunking,
        use_multi_level_chunking, chunk_language: Forwarded to ``improved_chunking_process``.
        diarize: Request speaker diarization from ``speech_to_text``.

    Returns:
        A ``(progress_log, transcriptions, summaries)`` tuple of strings. On failure the
        last two items are empty strings and the log ends with the failure reason.
    """
    # Declared before the nested helpers so it is obvious they read the module-level name.
    global ffmpeg_cmd

    progress = []
    temp_files = []
    all_transcriptions = []
    all_summaries = []

    def update_progress(message):
        """Append a message to the progress log and return the whole log."""
        progress.append(message)
        return "\n".join(progress)

    def cleanup_files():
        """Best-effort removal of every intermediate file recorded so far."""
        for file in temp_files:
            try:
                if os.path.exists(file):
                    os.remove(file)
                    update_progress(f"Temporary file {file} removed.")
            except Exception as e:
                update_progress(f"Failed to remove temporary file {file}: {str(e)}")

    def reencode_mp3(mp3_file_path):
        """Re-encode the input with libmp3lame and return the new file's path."""
        try:
            # splitext instead of str.replace: an input without a ".mp3" suffix must not
            # map onto itself (ffmpeg cannot read and write the same file).
            base_path, _ = os.path.splitext(mp3_file_path)
            reencoded_mp3_path = f"{base_path}_reencoded.mp3"
            subprocess.run([ffmpeg_cmd, '-i', mp3_file_path, '-codec:a', 'libmp3lame', reencoded_mp3_path],
                           check=True)
            update_progress(f"Re-encoded {mp3_file_path} to {reencoded_mp3_path}.")
            return reencoded_mp3_path
        except subprocess.CalledProcessError as e:
            update_progress(f"Error re-encoding {mp3_file_path}: {str(e)}")
            raise

    def convert_mp3_to_wav(mp3_file_path):
        """Convert an MP3 file to WAV next to it and return the WAV path."""
        try:
            base_path, _ = os.path.splitext(mp3_file_path)
            wav_file_path = f"{base_path}.wav"
            subprocess.run([ffmpeg_cmd, '-i', mp3_file_path, wav_file_path], check=True)
            update_progress(f"Converted {mp3_file_path} to {wav_file_path}.")
            return wav_file_path
        except subprocess.CalledProcessError as e:
            update_progress(f"Error converting {mp3_file_path} to WAV: {str(e)}")
            raise

    def prepare_wav(source_path):
        """Re-encode then convert ``source_path``; return ``(wav_path, None)`` or ``(None, reason)``."""
        reencoded_mp3_path = reencode_mp3(source_path)
        if not os.path.exists(reencoded_mp3_path):
            update_progress(f"Re-encoded file not found: {reencoded_mp3_path}")
            return None, "Re-encoded file not found"
        temp_files.append(reencoded_mp3_path)

        wav_file_path = convert_mp3_to_wav(reencoded_mp3_path)
        if not os.path.exists(wav_file_path):
            update_progress(f"Converted WAV file not found: {wav_file_path}")
            return None, "Converted WAV file not found"
        temp_files.append(wav_file_path)
        return wav_file_path, None

    def transcribe(wav_file_path):
        """Return the joined transcript text, or ``None`` when the segment payload is malformed."""
        if diarize:
            segments = speech_to_text(wav_file_path, whisper_model=whisper_model, diarize=True)
        else:
            segments = speech_to_text(wav_file_path, whisper_model=whisper_model)

        # Handle segments nested under 'segments' key
        if isinstance(segments, dict) and 'segments' in segments:
            segments = segments['segments']

        if not isinstance(segments, list):
            update_progress("Unexpected segments format received from speech_to_text.")
            logging.error(f"Unexpected segments format: {segments}")
            return None
        return " ".join([segment.get('Text', '') for segment in segments])

    def summarize(chunked_text):
        """Summarize the chunked text; a summarizer failure yields a placeholder, not an abort."""
        try:
            result = perform_summarization(api_name, chunked_text, custom_prompt_input, api_key)
            update_progress("Audio summarized successfully.")
            return result
        except Exception as e:
            logging.error(f"Error during summarization: {str(e)}")
            return "Summary generation failed"

    def store(source_url, wav_file_path, transcription, summary):
        """Collect the results for the return value and persist them to the database."""
        all_transcriptions.append(transcription)
        all_summaries.append(summary)
        add_media_with_keywords(
            url=source_url,
            title=os.path.basename(wav_file_path),
            media_type='audio',
            content=transcription,
            keywords=custom_keywords,
            prompt=custom_prompt_input,
            summary=summary,
            transcription_model=whisper_model,
            author="Unknown",
            ingestion_date=datetime.now().strftime('%Y-%m-%d')
        )

    try:
        # Check and set the ffmpeg command
        if os.name == "nt":
            logging.debug("Running on Windows")
            ffmpeg_cmd = os.path.join(os.getcwd(), "Bin", "ffmpeg.exe")
        else:
            ffmpeg_cmd = 'ffmpeg'  # Assume 'ffmpeg' is in PATH for non-Windows systems

        # Ensure ffmpeg is accessible
        if not os.path.exists(ffmpeg_cmd) and os.name == "nt":
            raise FileNotFoundError(f"ffmpeg executable not found at path: {ffmpeg_cmd}")

        # Define chunk options early to avoid undefined errors
        chunk_options = {
            'method': chunk_method,
            'max_size': max_chunk_size,
            'overlap': chunk_overlap,
            'adaptive': use_adaptive_chunking,
            'multi_level': use_multi_level_chunking,
            'language': chunk_language
        }

        # Process multiple URLs (tolerate None so "upload only" calls do not fail)
        urls = [url.strip() for url in (audio_urls or "").split('\n') if url.strip()]

        for i, url in enumerate(urls):
            update_progress(f"Processing URL {i + 1}/{len(urls)}: {url}")

            # Keyword arguments: the second positional parameter of download_audio_file
            # is current_whisper_model, not use_cookies.
            audio_file_path = download_audio_file(url, current_whisper_model=whisper_model,
                                                  use_cookies=use_cookies, cookies=cookies)
            if not audio_file_path:
                # download_audio_file returns None when it decides not to download.
                update_progress(f"Download skipped for {url}.")
                continue
            if not os.path.exists(audio_file_path):
                update_progress(f"Downloaded file not found: {audio_file_path}")
                continue

            temp_files.append(audio_file_path)
            update_progress("Audio file downloaded successfully.")

            wav_file_path, _ = prepare_wav(audio_file_path)
            if wav_file_path is None:
                continue

            transcription = transcribe(wav_file_path)
            if transcription is None:
                continue
            update_progress("Audio transcribed successfully.")

            if not transcription.strip():
                # Nothing to chunk, summarize or store for this URL.
                update_progress("Transcription is empty.")
                continue

            chunked_text = improved_chunking_process(transcription, chunk_options)

            if api_name:
                summary = summarize(chunked_text)
            else:
                summary = "No summary available (API not provided)"

            store(url, wav_file_path, transcription, summary)
            update_progress("Audio file processed and added to database.")

        # Process uploaded file if provided
        if audio_file:
            if os.path.getsize(audio_file.name) > MAX_FILE_SIZE:
                update_progress(
                    f"Uploaded file size exceeds the maximum limit of {MAX_FILE_SIZE / (1024 * 1024):.2f}MB. Skipping this file.")
            else:
                wav_file_path, failure = prepare_wav(audio_file.name)
                if wav_file_path is None:
                    # Do not leak intermediates created before the failure.
                    if not keep_original:
                        cleanup_files()
                    return update_progress(f"Processing failed: {failure}"), "", ""

                # A malformed segment payload is stored as an empty transcript here.
                transcription = transcribe(wav_file_path) or ""

                chunked_text = improved_chunking_process(transcription, chunk_options)

                if api_name and api_key:
                    summary = summarize(chunked_text)
                else:
                    summary = "No summary available (API not provided)"

                store("Uploaded File", wav_file_path, transcription, summary)
                update_progress("Uploaded file processed and added to database.")

        # Final cleanup
        if not keep_original:
            cleanup_files()

        final_progress = update_progress("All processing complete.")
        final_transcriptions = "\n\n".join(all_transcriptions)
        final_summaries = "\n\n".join(all_summaries)

        return final_progress, final_transcriptions, final_summaries

    except Exception as e:
        logging.error(f"Error processing audio files: {str(e)}")
        cleanup_files()
        return update_progress(f"Processing failed: {str(e)}"), "", ""
|
498 |
-
|
499 |
-
|
500 |
-
def download_youtube_audio(url):
    """Download a video's audio track with yt-dlp and save it as an MP3 under ``downloads/``.

    The media is fetched into a temporary directory, converted to a 192k MP3 with
    ffmpeg, then moved to the persistent ``downloads`` directory. The final path is
    also appended to the shared ``downloaded_files`` list.

    Args:
        url: URL understood by yt-dlp.

    Returns:
        ``(path, message)`` on success, or ``(None, error_message)`` on any failure.
    """
    # Local import: shutil is only needed here and is not imported at module level.
    import shutil

    try:
        # Determine ffmpeg path based on the operating system.
        ffmpeg_path = './Bin/ffmpeg.exe' if os.name == 'nt' else 'ffmpeg'

        # Create a temporary directory
        with tempfile.TemporaryDirectory() as temp_dir:
            # Extract information about the video
            with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
                info_dict = ydl.extract_info(url, download=False)
                sanitized_title = sanitize_filename(info_dict['title'])

            # Setup the temporary filenames
            temp_video_path = Path(temp_dir) / f"{sanitized_title}_temp.mp4"
            temp_audio_path = Path(temp_dir) / f"{sanitized_title}.mp3"

            # Initialize yt-dlp with options for downloading
            ydl_opts = {
                'format': 'bestaudio[ext=m4a]/best[height<=480]',  # Prefer best audio, or video up to 480p
                'ffmpeg_location': ffmpeg_path,
                'outtmpl': str(temp_video_path),
                'noplaylist': True,
                'quiet': True
            }

            # Execute yt-dlp to download the video/audio
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])

            # Check if the file exists
            if not temp_video_path.exists():
                raise FileNotFoundError(f"Expected file was not found: {temp_video_path}")

            # Use ffmpeg to extract audio
            ffmpeg_command = [
                ffmpeg_path,
                '-i', str(temp_video_path),
                '-vn',  # No video
                '-acodec', 'libmp3lame',
                '-b:a', '192k',
                str(temp_audio_path)
            ]
            subprocess.run(ffmpeg_command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

            # Check if the audio file was created
            if not temp_audio_path.exists():
                raise FileNotFoundError(f"Expected audio file was not found: {temp_audio_path}")

            # Create a persistent directory for the download if it doesn't exist
            persistent_dir = Path("downloads")
            persistent_dir.mkdir(exist_ok=True)

            # Move the file out of the temporary directory. shutil.move falls back to
            # copy+delete when the temp dir and ./downloads are on different
            # filesystems, where os.replace would raise OSError.
            persistent_file_path = persistent_dir / f"{sanitized_title}.mp3"
            shutil.move(str(temp_audio_path), str(persistent_file_path))

            # Add the file to the list of downloaded files
            downloaded_files.append(str(persistent_file_path))

            return str(persistent_file_path), f"Audio downloaded successfully: {sanitized_title}.mp3"
    except Exception as e:
        return None, f"Error downloading audio: {str(e)}"
|
562 |
-
|
563 |
-
|
564 |
-
def process_podcast(url, title, author, keywords, custom_prompt, api_name, api_key, whisper_model,
                    keep_original=False, enable_diarization=False, use_cookies=False, cookies=None,
                    chunk_method=None, max_chunk_size=300, chunk_overlap=0, use_adaptive_chunking=False,
                    use_multi_level_chunking=False, chunk_language='english'):
    """Download, transcribe, optionally summarize, and store a single podcast episode.

    Args:
        url: Podcast audio URL.
        title: Title override; falls back to the extracted metadata.
        author: Author override; falls back to the metadata ``uploader``.
        keywords: Comma-separated keywords; series/episode/season tags are appended.
        custom_prompt: Prompt passed to the summarizer and stored with the record.
        api_name: Summarization backend; summarization runs only if ``api_key`` is set too.
        api_key: Key for the summarization backend.
        whisper_model: Transcription model name.
        keep_original: When falsy, the downloaded file is removed afterwards.
        enable_diarization: Request speaker diarization from ``speech_to_text``.
        use_cookies: Whether ``cookies`` should be sent when downloading.
        cookies: Cookie payload forwarded to ``download_audio_file``.
        chunk_method, max_chunk_size, chunk_overlap, use_adaptive_chunking,
        use_multi_level_chunking, chunk_language: Forwarded to ``improved_chunking_process``.

    Returns:
        A 7-tuple ``(progress_log, full_content, summary, title, author, keywords,
        error_message)``. On failure the middle five items are empty strings and
        ``error_message`` names the stage that failed when one was recorded.
    """
    progress = []
    error_message = ""
    temp_files = []

    def update_progress(message):
        """Append a message to the progress log and return the whole log."""
        progress.append(message)
        return "\n".join(progress)

    def cleanup_files():
        """Best-effort removal of downloaded files unless the caller asked to keep them."""
        if not keep_original:
            for file in temp_files:
                try:
                    if os.path.exists(file):
                        os.remove(file)
                        update_progress(f"Temporary file {file} removed.")
                except Exception as e:
                    update_progress(f"Failed to remove temporary file {file}: {str(e)}")

    try:
        # Download podcast. Keyword arguments are required here: the second positional
        # parameter of download_audio_file is current_whisper_model, not use_cookies.
        audio_file = download_audio_file(url, current_whisper_model=whisper_model,
                                         use_cookies=use_cookies, cookies=cookies)
        if not audio_file:
            # download_audio_file returns None when it decides not to download.
            error_message = "Podcast download was skipped or returned no file."
            raise ValueError(error_message)
        temp_files.append(audio_file)
        update_progress("Podcast downloaded successfully.")

        # Extract metadata
        metadata = extract_metadata(url)
        title = title or metadata.get('title', 'Unknown Podcast')
        author = author or metadata.get('uploader', 'Unknown Author')

        # Format metadata for storage
        metadata_text = f"""
Metadata:
Title: {title}
Author: {author}
Series: {metadata.get('series', 'N/A')}
Episode: {metadata.get('episode', 'N/A')}
Season: {metadata.get('season', 'N/A')}
Upload Date: {metadata.get('upload_date', 'N/A')}
Duration: {metadata.get('duration', 'N/A')} seconds
Description: {metadata.get('description', 'N/A')}
"""

        # Update keywords
        new_keywords = []
        if metadata.get('series'):
            new_keywords.append(f"series:{metadata['series']}")
        if metadata.get('episode'):
            new_keywords.append(f"episode:{metadata['episode']}")
        if metadata.get('season'):
            new_keywords.append(f"season:{metadata['season']}")

        # Join only non-empty parts so no trailing comma is produced when the
        # metadata contributes no keywords.
        keyword_parts = [keywords] if keywords else []
        keyword_parts.extend(new_keywords)
        keywords = ",".join(keyword_parts)

        update_progress(f"Metadata extracted - Title: {title}, Author: {author}, Keywords: {keywords}")

        # Transcribe the podcast
        try:
            if enable_diarization:
                segments = speech_to_text(audio_file, whisper_model=whisper_model, diarize=True)
            else:
                segments = speech_to_text(audio_file, whisper_model=whisper_model)
            transcription = " ".join([segment['Text'] for segment in segments])
            update_progress("Podcast transcribed successfully.")
        except Exception as e:
            error_message = f"Transcription failed: {str(e)}"
            raise

        # Apply chunking
        chunk_options = {
            'method': chunk_method,
            'max_size': max_chunk_size,
            'overlap': chunk_overlap,
            'adaptive': use_adaptive_chunking,
            'multi_level': use_multi_level_chunking,
            'language': chunk_language
        }
        chunked_text = improved_chunking_process(transcription, chunk_options)

        # Combine metadata and transcription
        full_content = metadata_text + "\n\nTranscription:\n" + transcription

        # Summarize if API is provided
        summary = None
        if api_name and api_key:
            try:
                summary = perform_summarization(api_name, chunked_text, custom_prompt, api_key)
                update_progress("Podcast summarized successfully.")
            except Exception as e:
                error_message = f"Summarization failed: {str(e)}"
                raise

        # Add to database
        try:
            add_media_with_keywords(
                url=url,
                title=title,
                media_type='podcast',
                content=full_content,
                keywords=keywords,
                prompt=custom_prompt,
                summary=summary or "No summary available",
                transcription_model=whisper_model,
                author=author,
                ingestion_date=datetime.now().strftime('%Y-%m-%d')
            )
            update_progress("Podcast added to database successfully.")
        except Exception as e:
            error_message = f"Error adding podcast to database: {str(e)}"
            raise

        # Cleanup
        cleanup_files()

        return (update_progress("Processing complete."), full_content, summary or "No summary generated.",
                title, author, keywords, error_message)

    except Exception as e:
        logging.error(f"Error processing podcast: {str(e)}")
        cleanup_files()
        # Prefer the stage-specific message recorded above over the bare exception text.
        return (update_progress(f"Processing failed: {str(e)}"), "", "", "", "", "",
                error_message or str(e))
|
688 |
-
|
689 |
-
|
690 |
-
#
|
691 |
-
#
|
692 |
#######################################################################################################################
|
|
|
1 |
+
# Audio_Files.py
|
2 |
+
#########################################
|
3 |
+
# Audio Processing Library
|
4 |
+
# This library is used to download or load audio files from a local directory.
|
5 |
+
#
|
6 |
+
####
|
7 |
+
#
|
8 |
+
# Functions:
|
9 |
+
#
|
10 |
+
# download_audio_file(url, current_whisper_model="", use_cookies=False, cookies=None)
|
11 |
+
# process_audio(audio_file_path, num_speakers=2, whisper_model="small.en", ...)
|
12 |
+
# process_audio_file(audio_url, audio_file, whisper_model="small.en", api_name=None, api_key=None)
|
13 |
+
#
|
14 |
+
#
|
15 |
+
#########################################
|
16 |
+
# Imports
|
17 |
+
import json
|
18 |
+
import logging
|
19 |
+
import os
|
20 |
+
import subprocess
|
21 |
+
import tempfile
|
22 |
+
import uuid
|
23 |
+
from datetime import datetime
|
24 |
+
from pathlib import Path
|
25 |
+
|
26 |
+
import requests
|
27 |
+
import yt_dlp
|
28 |
+
|
29 |
+
from App_Function_Libraries.Audio_Transcription_Lib import speech_to_text
|
30 |
+
from App_Function_Libraries.Chunk_Lib import improved_chunking_process
|
31 |
+
#
|
32 |
+
# Local Imports
|
33 |
+
from App_Function_Libraries.DB_Manager import add_media_to_database, add_media_with_keywords, \
|
34 |
+
check_media_and_whisper_model
|
35 |
+
from App_Function_Libraries.Summarization_General_Lib import save_transcription_and_summary, perform_transcription, \
|
36 |
+
perform_summarization
|
37 |
+
from App_Function_Libraries.Utils import create_download_directory, save_segments_to_json, downloaded_files, \
|
38 |
+
sanitize_filename
|
39 |
+
from App_Function_Libraries.Video_DL_Ingestion_Lib import extract_metadata
|
40 |
+
|
41 |
+
#
|
42 |
+
#######################################################################################################################
|
43 |
+
# Function Definitions
|
44 |
+
#
|
45 |
+
|
46 |
+
# Upper bound, in bytes (500 * 1024 * 1024), compared against os.path.getsize() results
# before an audio file is processed.
MAX_FILE_SIZE = 500 * 1024 * 1024
|
47 |
+
|
48 |
+
|
49 |
+
def download_audio_file(url, current_whisper_model="", use_cookies=False, cookies=None):
    """Download an audio file into ``downloads/`` under a unique name.

    The download is skipped when ``check_media_and_whisper_model`` reports that it
    is not needed for this URL and whisper model.

    Args:
        url: Address of the audio file.
        current_whisper_model: Model name passed to ``check_media_and_whisper_model``.
        use_cookies: Send ``cookies`` with the request when true.
        cookies: JSON object (as a string) mapping cookie names to values.

    Returns:
        Path of the saved file, or ``None`` when the download was skipped.

    Raises:
        requests.RequestException: On network or HTTP errors (including timeouts).
        ValueError: When the file exceeds ``MAX_FILE_SIZE``.
    """
    save_path = None
    completed = False
    size_limit_message = f"File size exceeds the {MAX_FILE_SIZE // (1024 * 1024)}MB limit."

    try:
        # Check if media already exists in the database and compare whisper models
        should_download, reason = check_media_and_whisper_model(
            url=url,
            current_whisper_model=current_whisper_model
        )

        if not should_download:
            logging.info(f"Skipping audio download: {reason}")
            return None

        logging.info(f"Proceeding with audio download: {reason}")

        # Set up the request headers
        headers = {}
        if use_cookies and cookies:
            try:
                cookie_dict = json.loads(cookies)
                headers['Cookie'] = '; '.join([f'{k}={v}' for k, v in cookie_dict.items()])
            except (json.JSONDecodeError, AttributeError, TypeError):
                # AttributeError/TypeError: valid JSON that is not an object, or a non-string payload.
                logging.warning("Invalid cookie format. Proceeding without cookies.")

        # The context manager releases the connection; the timeout keeps a stalled
        # server from blocking the caller indefinitely.
        with requests.get(url, headers=headers, stream=True, timeout=60) as response:
            # Raise an exception for bad status codes
            response.raise_for_status()

            # Reject early when the server announces an oversized body
            file_size = int(response.headers.get('content-length', 0))
            if file_size > MAX_FILE_SIZE:
                raise ValueError(size_limit_message)

            # Generate a unique filename
            file_name = f"audio_{uuid.uuid4().hex[:8]}.mp3"

            # Ensure the downloads directory exists
            os.makedirs('downloads', exist_ok=True)
            save_path = os.path.join('downloads', file_name)

            # Download the file, enforcing the limit on the bytes actually received
            # because Content-Length may be absent.
            bytes_written = 0
            with open(save_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        bytes_written += len(chunk)
                        if bytes_written > MAX_FILE_SIZE:
                            raise ValueError(size_limit_message)
                        f.write(chunk)

        logging.info(f"Audio file downloaded successfully: {save_path}")
        completed = True
        return save_path

    except requests.RequestException as e:
        logging.error(f"Error downloading audio file: {str(e)}")
        raise
    except ValueError as e:
        logging.error(str(e))
        raise
    except Exception as e:
        logging.error(f"Unexpected error downloading audio file: {str(e)}")
        raise
    finally:
        # Do not leave a truncated file behind after a failed download.
        if not completed and save_path and os.path.exists(save_path):
            try:
                os.remove(save_path)
            except OSError as cleanup_error:
                logging.warning(f"Could not remove partial download {save_path}: {str(cleanup_error)}")
|
108 |
+
|
109 |
+
|
110 |
+
def process_audio(
        audio_file_path,
        num_speakers=2,
        whisper_model="small.en",
        custom_prompt_input=None,
        offset=0,
        api_name=None,
        api_key=None,
        vad_filter=False,
        rolling_summarization=False,
        detail_level=0.01,
        keywords="default,no_keyword_set",
        chunk_text_by_words=False,
        max_words=0,
        chunk_text_by_sentences=False,
        max_sentences=0,
        chunk_text_by_paragraphs=False,
        max_paragraphs=0,
        chunk_text_by_tokens=False,
        max_tokens=0
):
    """Transcribe an audio file, optionally summarize it, save both, and add them to the database.

    Args:
        audio_file_path: Path handed to ``perform_transcription``.
        num_speakers: Currently unused by this function.
        whisper_model: Transcription model name.
        custom_prompt_input: Prompt passed to the summarizer and stored with the record.
        offset: Offset forwarded to ``perform_transcription``.
        api_name: Summarization backend; falsy disables summarization.
        api_key: Key for the summarization backend.
        vad_filter: Forwarded to ``perform_transcription``.
        rolling_summarization: When true, selects rolling summarization, which is not
            implemented yet and therefore produces no summary.
        detail_level, chunk_text_by_*, max_*: Reserved for rolling summarization;
            currently unused.
        keywords: Keywords stored with the media record.

    Returns:
        A 6-tuple ``(transcription_text, summary_text, json_file_path,
        summary_file_path, None, None)``. On failure the first item is an error
        message string and the remaining five are ``None``.
    """
    try:
        # Perform transcription
        audio_file_path, segments = perform_transcription(audio_file_path, offset, whisper_model, vad_filter)

        if audio_file_path is None or segments is None:
            logging.error("Process_Audio: Transcription failed or segments not available.")
            return "Process_Audio: Transcription failed.", None, None, None, None, None

        logging.debug(f"Process_Audio: Transcription audio_file: {audio_file_path}")
        logging.debug(f"Process_Audio: Transcription segments: {segments}")

        transcription_text = {'audio_file': audio_file_path, 'transcription': segments}
        logging.debug(f"Process_Audio: Transcription text: {transcription_text}")

        # Save segments to JSON
        segments_json_path = save_segments_to_json(segments)

        # Perform summarization
        summary_text = None
        if api_name:
            # Truthiness test: the previous "is not None" check was always true for
            # the default False, so the regular summarization branch never ran.
            if rolling_summarization:
                # FIXME rolling summarization is not implemented; it is meant to use
                # detail_level and the chunk_text_by_* / max_* options.
                pass
            else:
                summary_text = perform_summarization(api_name, segments_json_path, custom_prompt_input, api_key)

            if summary_text is None:
                logging.error("Summary text is None. Check summarization function.")
        else:
            summary_text = 'Summary not available'

        # Save transcription and summary
        download_path = create_download_directory("Audio_Processing")
        json_file_path, summary_file_path = save_transcription_and_summary(transcription_text, summary_text,
                                                                           download_path)

        # Update function call to add_media_to_database so that it properly applies the title, author and file type
        # Add to database
        add_media_to_database(None, {'title': 'Audio File', 'author': 'Unknown'}, segments, summary_text, keywords,
                              custom_prompt_input, whisper_model)

        return transcription_text, summary_text, json_file_path, summary_file_path, None, None

    except Exception as e:
        logging.error(f"Error in process_audio: {str(e)}")
        return str(e), None, None, None, None, None
|
195 |
+
|
196 |
+
|
197 |
+
def process_single_audio(audio_file_path, whisper_model, api_name, api_key, keep_original, custom_keywords, source,
                         custom_prompt_input, chunk_method, max_chunk_size, chunk_overlap, use_adaptive_chunking,
                         use_multi_level_chunking, chunk_language):
    """Transcribe one local audio file, optionally summarize it, and store it in the database.

    Args:
        audio_file_path: Path of the audio file to process.
        whisper_model: Transcription model name.
        api_name: Summarization backend; summarization runs only if ``api_key`` is set too.
        api_key: Key for the summarization backend.
        keep_original: When falsy and the file is not an upload, the file is deleted afterwards.
        custom_keywords: Extra comma-separated keywords appended to ``audio,transcription``.
        source: Origin label stored as the record URL; ``"Uploaded File"`` marks uploads.
        custom_prompt_input: Summarization prompt; a stock prompt is used when falsy.
        chunk_method, max_chunk_size, chunk_overlap, use_adaptive_chunking,
        use_multi_level_chunking, chunk_language: Accepted for signature compatibility;
            currently unused by this function.

    Returns:
        A ``(progress_log, transcription, summary)`` tuple of strings. Errors are
        reported inside the tuple rather than raised.
    """
    progress = []
    transcription = ""
    summary = ""

    def update_progress(message):
        """Append a message to the progress log and return the whole log."""
        progress.append(message)
        return "\n".join(progress)

    # Honor the caller's prompt; previously it was accepted but silently ignored.
    prompt = custom_prompt_input or "Summarize the following audio transcript"

    try:
        # Check file size before processing
        file_size = os.path.getsize(audio_file_path)
        if file_size > MAX_FILE_SIZE:
            update_progress(f"File size ({file_size / (1024 * 1024):.2f} MB) exceeds the maximum limit of {MAX_FILE_SIZE / (1024 * 1024):.2f} MB. Skipping this file.")
            return "\n".join(progress), "", ""

        # Perform transcription
        update_progress("Starting transcription...")
        segments = speech_to_text(audio_file_path, whisper_model=whisper_model)
        # Handle segments nested under 'segments' key, as process_audio_files does
        if isinstance(segments, dict) and 'segments' in segments:
            segments = segments['segments']
        transcription = " ".join([segment['Text'] for segment in segments])
        update_progress("Audio transcribed successfully.")

        # Perform summarization if API is provided
        if api_name and api_key:
            update_progress("Starting summarization...")
            summary = perform_summarization(api_name, transcription, prompt, api_key)
            update_progress("Audio summarized successfully.")
        else:
            summary = "No summary available"

        # Prepare keywords
        keywords = "audio,transcription"
        if custom_keywords:
            keywords += f",{custom_keywords}"

        # Add to database
        add_media_with_keywords(
            url=source,
            title=os.path.basename(audio_file_path),
            media_type='audio',
            content=transcription,
            keywords=keywords,
            prompt=prompt,
            summary=summary,
            transcription_model=whisper_model,
            author="Unknown",
            ingestion_date=None  # This will use the current date
        )
        update_progress("Audio file added to database successfully.")

        # Uploaded files are never touched; for other sources either delete or report.
        if source != "Uploaded File":
            if keep_original:
                update_progress(f"Original audio file kept at: {audio_file_path}")
            else:
                os.remove(audio_file_path)
                update_progress(f"Temporary file {audio_file_path} removed.")

    except Exception as e:
        update_progress(f"Error processing {source}: {str(e)}")
        transcription = f"Error: {str(e)}"
        summary = "No summary due to error"

    return "\n".join(progress), transcription, summary
|
262 |
+
|
263 |
+
|
264 |
+
def process_audio_files(audio_urls, audio_file, whisper_model, api_name, api_key, use_cookies, cookies, keep_original,
                        custom_keywords, custom_prompt_input, chunk_method, max_chunk_size, chunk_overlap,
                        use_adaptive_chunking, use_multi_level_chunking, chunk_language, diarize):
    """Transcribe, optionally summarize, and store audio from URLs and/or an uploaded file.

    Every source is re-encoded to MP3 and converted to WAV with ffmpeg,
    transcribed with ``speech_to_text``, chunked with
    ``improved_chunking_process``, summarized with ``perform_summarization``
    and saved with ``add_media_with_keywords``.

    Args:
        audio_urls: Newline-separated URLs to download. ``None`` or an empty
            string means there are no URLs to process.
        audio_file: Uploaded file object exposing a ``.name`` path, or a falsy
            value when nothing was uploaded.
        whisper_model: Model name forwarded to ``speech_to_text``.
        api_name: Summarization backend name; a falsy value disables summaries.
        api_key: Key forwarded to ``perform_summarization``. The uploaded-file
            path only summarizes when both ``api_name`` and ``api_key`` are set.
        use_cookies: Forwarded to ``download_audio_file``.
        cookies: Forwarded to ``download_audio_file``.
        keep_original: When true, downloaded and intermediate files are kept,
            on success and on failure alike.
        custom_keywords: Stored with each media entry.
        custom_prompt_input: Stored with each entry and used for summarization.
        chunk_method, max_chunk_size, chunk_overlap, use_adaptive_chunking,
        use_multi_level_chunking, chunk_language: Packed into the options dict
            handed to ``improved_chunking_process``.
        diarize: Truthy requests ``diarize=True`` from ``speech_to_text``.

    Returns:
        A ``(progress_log, transcriptions, summaries)`` tuple of strings. When
        processing fails the last two items are empty strings and the log ends
        with the failure message.
    """
    # The resolved ffmpeg command is published as a module-level global, as before.
    global ffmpeg_cmd

    progress = []
    temp_files = []
    all_transcriptions = []
    all_summaries = []

    chunk_options = {
        'method': chunk_method,
        'max_size': max_chunk_size,
        'overlap': chunk_overlap,
        'adaptive': use_adaptive_chunking,
        'multi_level': use_multi_level_chunking,
        'language': chunk_language
    }

    def update_progress(message):
        """Append a message to the log and return the whole log as text."""
        progress.append(message)
        return "\n".join(progress)

    def cleanup_files():
        """Delete tracked temporary files unless the caller asked to keep them."""
        if keep_original:
            return
        for file in temp_files:
            try:
                if os.path.exists(file):
                    os.remove(file)
                    update_progress(f"Temporary file {file} removed.")
            except Exception as e:
                update_progress(f"Failed to remove temporary file {file}: {str(e)}")

    def run_ffmpeg(arguments):
        """Run ffmpeg non-interactively; '-y' plus a closed stdin prevents overwrite prompts from blocking."""
        subprocess.run([ffmpeg_cmd, '-y', *arguments], check=True, stdin=subprocess.DEVNULL)

    def reencode_mp3(mp3_file_path):
        """Re-encode the input with libmp3lame and return the new file's path."""
        # splitext keeps the output distinct from the input even when the source is not an .mp3
        reencoded_mp3_path = os.path.splitext(mp3_file_path)[0] + "_reencoded.mp3"
        try:
            run_ffmpeg(['-i', mp3_file_path, '-codec:a', 'libmp3lame', reencoded_mp3_path])
            update_progress(f"Re-encoded {mp3_file_path} to {reencoded_mp3_path}.")
            return reencoded_mp3_path
        except subprocess.CalledProcessError as e:
            update_progress(f"Error re-encoding {mp3_file_path}: {str(e)}")
            raise

    def convert_mp3_to_wav(mp3_file_path):
        """Convert an MP3 to WAV next to it and return the WAV path."""
        wav_file_path = os.path.splitext(mp3_file_path)[0] + ".wav"
        try:
            run_ffmpeg(['-i', mp3_file_path, wav_file_path])
            update_progress(f"Converted {mp3_file_path} to {wav_file_path}.")
            return wav_file_path
        except subprocess.CalledProcessError as e:
            update_progress(f"Error converting {mp3_file_path} to WAV: {str(e)}")
            raise

    def prepare_wav(source_path):
        """Re-encode and convert a source file; return ``(wav_path, None)`` or ``(None, reason)``."""
        reencoded_mp3_path = reencode_mp3(source_path)
        if not os.path.exists(reencoded_mp3_path):
            update_progress(f"Re-encoded file not found: {reencoded_mp3_path}")
            return None, "Re-encoded file not found"
        temp_files.append(reencoded_mp3_path)

        wav_file_path = convert_mp3_to_wav(reencoded_mp3_path)
        if not os.path.exists(wav_file_path):
            update_progress(f"Converted WAV file not found: {wav_file_path}")
            return None, "Converted WAV file not found"
        temp_files.append(wav_file_path)
        return wav_file_path, None

    def transcribe_and_store(wav_file_path, media_url, can_summarize, success_message):
        """Transcribe one WAV file, summarize it if enabled, and add it to the database."""
        segments = speech_to_text(wav_file_path, whisper_model=whisper_model, diarize=bool(diarize))

        # Handle segments nested under 'segments' key
        if isinstance(segments, dict) and 'segments' in segments:
            segments = segments['segments']

        if not isinstance(segments, list):
            update_progress("Unexpected segments format received from speech_to_text.")
            logging.error(f"Unexpected segments format: {segments}")
            return

        transcription = " ".join(segment.get('Text', '') for segment in segments)
        update_progress("Audio transcribed successfully.")

        # Nothing useful to chunk, summarize or store.
        if not transcription.strip():
            update_progress("Transcription is empty.")
            return

        chunked_text = improved_chunking_process(transcription, chunk_options)

        if can_summarize:
            try:
                summary = perform_summarization(api_name, chunked_text, custom_prompt_input, api_key)
                update_progress("Audio summarized successfully.")
            except Exception as e:
                # A failed summary must not discard a good transcript.
                logging.error(f"Error during summarization: {str(e)}")
                summary = "Summary generation failed"
        else:
            summary = "No summary available (API not provided)"

        all_transcriptions.append(transcription)
        all_summaries.append(summary)

        add_media_with_keywords(
            url=media_url,
            title=os.path.basename(wav_file_path),
            media_type='audio',
            content=transcription,
            keywords=custom_keywords,
            prompt=custom_prompt_input,
            summary=summary,
            transcription_model=whisper_model,
            author="Unknown",
            ingestion_date=datetime.now().strftime('%Y-%m-%d')
        )
        update_progress(success_message)

    try:
        # Check and set the ffmpeg command
        if os.name == "nt":
            logging.debug("Running on Windows")
            ffmpeg_cmd = os.path.join(os.getcwd(), "Bin", "ffmpeg.exe")
        else:
            ffmpeg_cmd = 'ffmpeg'  # Assume 'ffmpeg' is in PATH for non-Windows systems

        # Ensure ffmpeg is accessible
        if not os.path.exists(ffmpeg_cmd) and os.name == "nt":
            raise FileNotFoundError(f"ffmpeg executable not found at path: {ffmpeg_cmd}")

        # Process multiple URLs; a missing URL box (None) is treated as "no URLs"
        urls = [url.strip() for url in (audio_urls or "").split('\n') if url.strip()]

        for i, url in enumerate(urls):
            update_progress(f"Processing URL {i + 1}/{len(urls)}: {url}")

            # Download and process audio file
            audio_file_path = download_audio_file(url, use_cookies, cookies)
            if not audio_file_path or not os.path.exists(audio_file_path):
                update_progress(f"Downloaded file not found: {audio_file_path}")
                continue

            temp_files.append(audio_file_path)
            update_progress("Audio file downloaded successfully.")

            wav_file_path, _ = prepare_wav(audio_file_path)
            if wav_file_path is None:
                continue

            transcribe_and_store(wav_file_path, url, bool(api_name),
                                 "Audio file processed and added to database.")

        # Process uploaded file if provided
        if audio_file:
            if os.path.getsize(audio_file.name) > MAX_FILE_SIZE:
                update_progress(
                    f"Uploaded file size exceeds the maximum limit of {MAX_FILE_SIZE / (1024 * 1024):.2f}MB. Skipping this file.")
            else:
                wav_file_path, failure = prepare_wav(audio_file.name)
                if wav_file_path is None:
                    # Remove whatever intermediates were produced before bailing out.
                    cleanup_files()
                    return update_progress(f"Processing failed: {failure}"), "", ""

                transcribe_and_store(wav_file_path, "Uploaded File", bool(api_name and api_key),
                                     "Uploaded file processed and added to database.")

        # Final cleanup
        cleanup_files()

        final_progress = update_progress("All processing complete.")
        final_transcriptions = "\n\n".join(all_transcriptions)
        final_summaries = "\n\n".join(all_summaries)

        return final_progress, final_transcriptions, final_summaries

    except Exception as e:
        logging.error(f"Error processing audio files: {str(e)}")
        cleanup_files()
        return update_progress(f"Processing failed: {str(e)}"), "", ""
|
498 |
+
|
499 |
+
|
500 |
+
def download_youtube_audio(url):
    """Download a video's audio with yt-dlp and save it as an MP3 under ``downloads/``.

    The media is fetched into a temporary directory, converted to a 192k MP3
    with ffmpeg, moved into the ``downloads`` directory (created if missing)
    and its path is appended to the module-level ``downloaded_files`` list.

    Args:
        url: URL handed to yt-dlp.

    Returns:
        ``(path, message)`` on success, where ``path`` is the saved MP3 as a
        string; ``(None, message)`` with the error text when anything fails.
    """
    # Local import: needed for a move that works across filesystems.
    import shutil

    try:
        # Determine ffmpeg path based on the operating system.
        ffmpeg_path = './Bin/ffmpeg.exe' if os.name == 'nt' else 'ffmpeg'

        # Create a temporary directory
        with tempfile.TemporaryDirectory() as temp_dir:
            # Extract information about the video
            with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
                info_dict = ydl.extract_info(url, download=False)
                sanitized_title = sanitize_filename(info_dict['title'])

            # Setup the temporary filenames
            temp_video_path = Path(temp_dir) / f"{sanitized_title}_temp.mp4"
            temp_audio_path = Path(temp_dir) / f"{sanitized_title}.mp3"

            # Initialize yt-dlp with options for downloading
            ydl_opts = {
                'format': 'bestaudio[ext=m4a]/best[height<=480]',  # Prefer best audio, or video up to 480p
                'ffmpeg_location': ffmpeg_path,
                'outtmpl': str(temp_video_path),
                'noplaylist': True,
                'quiet': True
            }

            # Execute yt-dlp to download the video/audio
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])

            # Check if the file exists
            if not temp_video_path.exists():
                raise FileNotFoundError(f"Expected file was not found: {temp_video_path}")

            # Use ffmpeg to extract audio
            ffmpeg_command = [
                ffmpeg_path,
                '-i', str(temp_video_path),
                '-vn',  # No video
                '-acodec', 'libmp3lame',
                '-b:a', '192k',
                str(temp_audio_path)
            ]
            subprocess.run(ffmpeg_command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

            # Check if the audio file was created
            if not temp_audio_path.exists():
                raise FileNotFoundError(f"Expected audio file was not found: {temp_audio_path}")

            # Create a persistent directory for the download if it doesn't exist
            persistent_dir = Path("downloads")
            persistent_dir.mkdir(exist_ok=True)

            # Move the file out of the temporary directory. shutil.move falls back to
            # copy-and-delete when the temp dir is on another filesystem, where a plain
            # rename/replace raises OSError.
            persistent_file_path = persistent_dir / f"{sanitized_title}.mp3"
            shutil.move(str(temp_audio_path), str(persistent_file_path))

            # Add the file to the list of downloaded files
            downloaded_files.append(str(persistent_file_path))

            return str(persistent_file_path), f"Audio downloaded successfully: {sanitized_title}.mp3"
    except Exception as e:
        # Best-effort contract: report the failure to the caller, but leave a trace in the log.
        logging.error(f"Error downloading audio: {str(e)}")
        return None, f"Error downloading audio: {str(e)}"
|
562 |
+
|
563 |
+
|
564 |
+
def process_podcast(url, title, author, keywords, custom_prompt, api_name, api_key, whisper_model,
                    keep_original=False, enable_diarization=False, use_cookies=False, cookies=None,
                    chunk_method=None, max_chunk_size=300, chunk_overlap=0, use_adaptive_chunking=False,
                    use_multi_level_chunking=False, chunk_language='english'):
    """Download, transcribe, optionally summarize, and store a single podcast episode.

    Args:
        url: Episode URL passed to ``download_audio_file`` and ``extract_metadata``.
        title: Title override; falls back to the metadata title.
        author: Author override; falls back to the metadata uploader.
        keywords: Comma-separated keywords; series/episode/season tags taken
            from the metadata are appended.
        custom_prompt: Prompt forwarded to ``perform_summarization`` and stored.
        api_name: Summarization backend; summaries are produced only when both
            ``api_name`` and ``api_key`` are set.
        api_key: Key forwarded to ``perform_summarization``.
        whisper_model: Model name forwarded to ``speech_to_text``.
        keep_original: When true, the downloaded file is not deleted.
        enable_diarization: Truthy requests ``diarize=True`` from ``speech_to_text``.
        use_cookies: Forwarded to ``download_audio_file``.
        cookies: Forwarded to ``download_audio_file``.
        chunk_method, max_chunk_size, chunk_overlap, use_adaptive_chunking,
        use_multi_level_chunking, chunk_language: Packed into the options dict
            handed to ``improved_chunking_process``.

    Returns:
        A 7-tuple ``(progress_log, full_content, summary, title, author,
        keywords, error_message)``. On failure the middle five items are empty
        strings and ``error_message`` names the stage that failed when known.
    """
    progress = []
    error_message = ""
    temp_files = []

    def update_progress(message):
        """Append a message to the log and return the whole log as text."""
        progress.append(message)
        return "\n".join(progress)

    def cleanup_files():
        """Delete tracked temporary files unless ``keep_original`` is set."""
        if not keep_original:
            for file in temp_files:
                try:
                    if os.path.exists(file):
                        os.remove(file)
                        update_progress(f"Temporary file {file} removed.")
                except Exception as e:
                    update_progress(f"Failed to remove temporary file {file}: {str(e)}")

    try:
        # Download podcast
        audio_file = download_audio_file(url, use_cookies, cookies)
        temp_files.append(audio_file)
        update_progress("Podcast downloaded successfully.")

        # Extract metadata; tolerate a missing result so the defaults below apply
        metadata = extract_metadata(url) or {}
        title = title or metadata.get('title', 'Unknown Podcast')
        author = author or metadata.get('uploader', 'Unknown Author')

        # Format metadata for storage
        metadata_text = f"""
Metadata:
Title: {title}
Author: {author}
Series: {metadata.get('series', 'N/A')}
Episode: {metadata.get('episode', 'N/A')}
Season: {metadata.get('season', 'N/A')}
Upload Date: {metadata.get('upload_date', 'N/A')}
Duration: {metadata.get('duration', 'N/A')} seconds
Description: {metadata.get('description', 'N/A')}
"""

        # Update keywords
        new_keywords = []
        if metadata.get('series'):
            new_keywords.append(f"series:{metadata['series']}")
        if metadata.get('episode'):
            new_keywords.append(f"episode:{metadata['episode']}")
        if metadata.get('season'):
            new_keywords.append(f"season:{metadata['season']}")

        # Join only the non-empty parts so no stray leading/trailing comma is stored
        keyword_parts = ([str(keywords)] if keywords else []) + new_keywords
        keywords = ','.join(keyword_parts)

        update_progress(f"Metadata extracted - Title: {title}, Author: {author}, Keywords: {keywords}")

        # Transcribe the podcast
        try:
            if enable_diarization:
                segments = speech_to_text(audio_file, whisper_model=whisper_model, diarize=True)
            else:
                segments = speech_to_text(audio_file, whisper_model=whisper_model)
            # A cached transcript may come back wrapped as {'segments': [...]}
            if isinstance(segments, dict) and 'segments' in segments:
                segments = segments['segments']
            transcription = " ".join([segment['Text'] for segment in segments])
            update_progress("Podcast transcribed successfully.")
        except Exception as e:
            error_message = f"Transcription failed: {str(e)}"
            raise

        # Apply chunking
        chunk_options = {
            'method': chunk_method,
            'max_size': max_chunk_size,
            'overlap': chunk_overlap,
            'adaptive': use_adaptive_chunking,
            'multi_level': use_multi_level_chunking,
            'language': chunk_language
        }
        chunked_text = improved_chunking_process(transcription, chunk_options)

        # Combine metadata and transcription
        full_content = metadata_text + "\n\nTranscription:\n" + transcription

        # Summarize if API is provided
        summary = None
        if api_name and api_key:
            try:
                summary = perform_summarization(api_name, chunked_text, custom_prompt, api_key)
                update_progress("Podcast summarized successfully.")
            except Exception as e:
                error_message = f"Summarization failed: {str(e)}"
                raise

        # Add to database
        try:
            add_media_with_keywords(
                url=url,
                title=title,
                media_type='podcast',
                content=full_content,
                keywords=keywords,
                prompt=custom_prompt,
                summary=summary or "No summary available",
                transcription_model=whisper_model,
                author=author,
                ingestion_date=datetime.now().strftime('%Y-%m-%d')
            )
            update_progress("Podcast added to database successfully.")
        except Exception as e:
            error_message = f"Error adding podcast to database: {str(e)}"
            raise

        # Cleanup
        cleanup_files()

        return (update_progress("Processing complete."), full_content, summary or "No summary generated.",
                title, author, keywords, error_message)

    except Exception as e:
        logging.error(f"Error processing podcast: {str(e)}")
        cleanup_files()
        # Prefer the stage-specific message recorded above; fall back to the raw error text.
        return update_progress(f"Processing failed: {str(e)}"), "", "", "", "", "", error_message or str(e)
|
688 |
+
|
689 |
+
|
690 |
+
#
|
691 |
+
#
|
692 |
#######################################################################################################################
|
App_Function_Libraries/Audio_Transcription_Lib.py
CHANGED
@@ -1,192 +1,192 @@
|
|
1 |
-
# Audio_Transcription_Lib.py
|
2 |
-
#########################################
|
3 |
-
# Transcription Library
|
4 |
-
# This library is used to perform transcription of audio files.
|
5 |
-
# Currently, uses faster_whisper for transcription.
|
6 |
-
#
|
7 |
-
####
|
8 |
-
import configparser
|
9 |
-
####################
|
10 |
-
# Function List
|
11 |
-
#
|
12 |
-
# 1. convert_to_wav(video_file_path, offset=0, overwrite=False)
|
13 |
-
# 2. speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False)
|
14 |
-
#
|
15 |
-
####################
|
16 |
-
#
|
17 |
-
# Import necessary libraries to run solo for testing
|
18 |
-
import gc
|
19 |
-
import json
|
20 |
-
import logging
|
21 |
-
import os
|
22 |
-
import sys
|
23 |
-
import subprocess
|
24 |
-
import time
|
25 |
-
|
26 |
-
# DEBUG Imports
|
27 |
-
#from memory_profiler import profile
|
28 |
-
|
29 |
-
# Import Local
|
30 |
-
#
|
31 |
-
#######################################################################################################################
|
32 |
-
# Function Definitions
|
33 |
-
#
|
34 |
-
|
35 |
-
# Convert video .m4a into .wav using ffmpeg
|
36 |
-
# ffmpeg -i "example.mp4" -ar 16000 -ac 1 -c:a pcm_s16le "output.wav"
|
37 |
-
# https://www.gyan.dev/ffmpeg/builds/
|
38 |
-
#
|
39 |
-
|
40 |
-
|
41 |
-
whisper_model_instance = None
# Retrieve processing choice from the configuration file
config = configparser.ConfigParser()
config.read('config.txt')
processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')

# (model_name, device) the cached instance was created for; None until the first load
_whisper_model_key = None


def get_whisper_model(model_name, device):
    """Return a cached faster_whisper ``WhisperModel`` for ``model_name`` on ``device``.

    The cached instance is reused only when it was created for the same model
    name and device. Otherwise the old instance is released and a new one is
    loaded, so a request for a different model never gets the previous one.

    Args:
        model_name: Model size/name passed to ``WhisperModel``.
        device: Device string passed to ``WhisperModel`` as ``device``.

    Returns:
        The ``WhisperModel`` instance now stored in ``whisper_model_instance``.
    """
    global whisper_model_instance, _whisper_model_key
    requested = (model_name, device)
    if whisper_model_instance is None or _whisper_model_key != requested:
        from faster_whisper import WhisperModel
        # Drop the previous model before loading so both are not held in memory at once.
        whisper_model_instance = None
        gc.collect()
        logging.info(f"Initializing new WhisperModel with size {model_name} on device {device}")
        whisper_model_instance = WhisperModel(model_name, device=device)
        _whisper_model_key = requested
    return whisper_model_instance
|
57 |
-
|
58 |
-
|
59 |
-
# os.system(r'.\Bin\ffmpeg.exe -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
|
60 |
-
#DEBUG
|
61 |
-
#@profile
|
62 |
-
def convert_to_wav(video_file_path, offset=0, overwrite=False):
    """Convert a media file to a 16 kHz mono PCM WAV file with ffmpeg.

    The output is written next to the input, with the extension replaced by
    ``.wav``.

    Args:
        video_file_path: Path of the media file to convert.
        offset: Start position passed to ffmpeg's ``-ss`` option (seconds, or
            any ffmpeg time string). Defaults to the start of the file.
        overwrite: When false and the WAV file already exists, conversion is
            skipped and the existing path is returned.

    Returns:
        The WAV path on success or when conversion is skipped. When conversion
        fails, a dict of the form ``{"error": message}`` is returned instead
        of raising.
    """
    out_path = os.path.splitext(video_file_path)[0] + ".wav"

    if os.path.exists(out_path) and not overwrite:
        print(f"File '{out_path}' already exists. Skipping conversion.")
        logging.info(f"Skipping conversion as file already exists: {out_path}")
        return out_path
    print("Starting conversion process of .m4a to .WAV")

    try:
        if os.name == "nt":
            logging.debug("ffmpeg being ran on windows")
            ffmpeg_cmd = ".\\Bin\\ffmpeg.exe"  # Assumes the working directory contains .\Bin
        elif os.name == "posix":
            ffmpeg_cmd = 'ffmpeg'  # Assume 'ffmpeg' is in PATH for non-Windows systems
        else:
            raise RuntimeError("Unsupported operating system")
        logging.debug(f"ffmpeg_cmd: {ffmpeg_cmd}")

        command = [
            ffmpeg_cmd,
            "-y",  # We only get here when overwriting is intended or the file is absent
            "-ss", str(offset),  # Start position
            "-i", video_file_path,
            "-ar", "16000",  # Audio sample rate
            "-ac", "1",  # Number of audio channels
            "-c:a", "pcm_s16le",  # Audio codec
            out_path
        ]
        # Argument list (no shell) keeps odd file names safe; stdin is closed so ffmpeg never waits for input.
        result = subprocess.run(command, stdin=subprocess.DEVNULL, text=True, capture_output=True)
        if result.returncode != 0:
            logging.error("Error in running FFmpeg")
            logging.error("FFmpeg stderr: %s", result.stderr)
            raise RuntimeError(f"FFmpeg error: {result.stderr}")
        logging.info("FFmpeg executed successfully")
        logging.debug("FFmpeg output: %s", result.stdout)
        logging.info("Conversion to WAV completed: %s", out_path)
    except Exception as e:
        logging.error("convert_to_wav: Error converting file to WAV: %s", str(e))
        return {"error": str(e)}
    gc.collect()
    return out_path
|
118 |
-
|
119 |
-
|
120 |
-
# Transcribe .wav into .segments.json
|
121 |
-
#DEBUG
|
122 |
-
#@profile
|
123 |
-
def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False):
    """Transcribe an audio file with faster_whisper and return its segments.

    Results are cached next to the audio file as ``<name>.segments.json``
    (plus an indented ``<name>.segments_pretty.json``). When the cache file
    exists it is loaded instead of transcribing again.

    Args:
        audio_file_path: Path of the audio file to transcribe.
        selected_source_lang: Language code passed to the model.
        whisper_model: Model name passed to ``get_whisper_model``.
        vad_filter: Passed through to the model's ``transcribe`` call.
        diarize: Accepted for caller compatibility; not used by this function.

    Returns:
        A list of dicts with ``Time_Start``, ``Time_End`` and ``Text`` keys.
        The first segment's text is prefixed with a note naming the model.
        Cached results are unwrapped so they have the same list shape.

    Raises:
        ValueError: If ``audio_file_path`` is ``None``.
        RuntimeError: If transcription fails or produces no segments.
    """
    global whisper_model_instance, processing_choice
    logging.info('speech-to-text: Loading faster_whisper model: %s', whisper_model)

    time_start = time.time()
    if audio_file_path is None:
        raise ValueError("speech-to-text: No audio file provided")
    logging.info("speech-to-text: Audio file path: %s", audio_file_path)

    try:
        # Build sibling paths from the stem; str.replace on the extension breaks when the
        # extension is empty or also occurs elsewhere in the path.
        base_path = os.path.splitext(audio_file_path)[0]
        out_file = base_path + ".segments.json"
        prettified_out_file = base_path + ".segments_pretty.json"
        if os.path.exists(out_file):
            logging.info("speech-to-text: Segments file already exists: %s", out_file)
            with open(out_file) as f:
                cached = json.load(f)
            # The cache is written as {'segments': [...]}; return the bare list like a fresh run.
            if isinstance(cached, dict) and 'segments' in cached:
                cached = cached['segments']
            return cached

        logging.info('speech-to-text: Starting transcription...')
        options = dict(language=selected_source_lang, beam_size=5, best_of=5, vad_filter=vad_filter)
        transcribe_options = dict(task="transcribe", **options)
        # use function and config at top of file
        whisper_model_instance = get_whisper_model(whisper_model, processing_choice)
        segments_raw, info = whisper_model_instance.transcribe(audio_file_path, **transcribe_options)

        segments = []
        for segment_chunk in segments_raw:
            chunk = {
                "Time_Start": segment_chunk.start,
                "Time_End": segment_chunk.end,
                "Text": segment_chunk.text
            }
            logging.debug("Segment: %s", chunk)
            segments.append(chunk)

        if not segments:
            raise RuntimeError("No transcription produced. The audio file may be invalid or empty.")

        segments[0]["Text"] = f"This text was transcribed using whisper model: {whisper_model}\n\n" + segments[0]["Text"]
        logging.info("speech-to-text: Transcription completed in %.2f seconds", time.time() - time_start)

        # Save the segments to a JSON file - prettified and non-prettified
        logging.info("speech-to-text: Saving segments to JSON file")
        output_data = {'segments': segments}

        logging.info("speech-to-text: Saving prettified JSON to %s", prettified_out_file)
        with open(prettified_out_file, 'w') as f:
            json.dump(output_data, f, indent=2)

        logging.info("speech-to-text: Saving JSON to %s", out_file)
        with open(out_file, 'w') as f:
            json.dump(output_data, f)

        logging.debug(f"speech-to-text: returning {segments[:500]}")
        gc.collect()
        return segments

    except Exception as e:
        logging.error("speech-to-text: Error transcribing audio: %s", str(e))
        # Same exception type as before, with the original cause attached for debugging.
        raise RuntimeError("speech-to-text: Error transcribing audio") from e
|
189 |
-
|
190 |
-
#
|
191 |
-
#
|
192 |
#######################################################################################################################
|
|
|
1 |
+
# Audio_Transcription_Lib.py
|
2 |
+
#########################################
|
3 |
+
# Transcription Library
|
4 |
+
# This library is used to perform transcription of audio files.
|
5 |
+
# Currently, uses faster_whisper for transcription.
|
6 |
+
#
|
7 |
+
####
|
8 |
+
import configparser
|
9 |
+
####################
|
10 |
+
# Function List
|
11 |
+
#
|
12 |
+
# 1. convert_to_wav(video_file_path, offset=0, overwrite=False)
|
13 |
+
# 2. speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False)
|
14 |
+
#
|
15 |
+
####################
|
16 |
+
#
|
17 |
+
# Import necessary libraries to run solo for testing
|
18 |
+
import gc
|
19 |
+
import json
|
20 |
+
import logging
|
21 |
+
import os
|
22 |
+
import sys
|
23 |
+
import subprocess
|
24 |
+
import time
|
25 |
+
|
26 |
+
# DEBUG Imports
|
27 |
+
#from memory_profiler import profile
|
28 |
+
|
29 |
+
# Import Local
|
30 |
+
#
|
31 |
+
#######################################################################################################################
|
32 |
+
# Function Definitions
|
33 |
+
#
|
34 |
+
|
35 |
+
# Convert video .m4a into .wav using ffmpeg
|
36 |
+
# ffmpeg -i "example.mp4" -ar 16000 -ac 1 -c:a pcm_s16le "output.wav"
|
37 |
+
# https://www.gyan.dev/ffmpeg/builds/
|
38 |
+
#
|
39 |
+
|
40 |
+
|
41 |
+
whisper_model_instance = None
# Retrieve processing choice from the configuration file
config = configparser.ConfigParser()
config.read('config.txt')
processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')

# (model_name, device) the cached instance was created for; None until the first load
_whisper_model_key = None


def get_whisper_model(model_name, device):
    """Return a cached faster_whisper ``WhisperModel`` for ``model_name`` on ``device``.

    The cached instance is reused only when it was created for the same model
    name and device. Otherwise the old instance is released and a new one is
    loaded, so a request for a different model never gets the previous one.

    Args:
        model_name: Model size/name passed to ``WhisperModel``.
        device: Device string passed to ``WhisperModel`` as ``device``.

    Returns:
        The ``WhisperModel`` instance now stored in ``whisper_model_instance``.
    """
    global whisper_model_instance, _whisper_model_key
    requested = (model_name, device)
    if whisper_model_instance is None or _whisper_model_key != requested:
        from faster_whisper import WhisperModel
        # Drop the previous model before loading so both are not held in memory at once.
        whisper_model_instance = None
        gc.collect()
        logging.info(f"Initializing new WhisperModel with size {model_name} on device {device}")
        whisper_model_instance = WhisperModel(model_name, device=device)
        _whisper_model_key = requested
    return whisper_model_instance
|
57 |
+
|
58 |
+
|
59 |
+
# os.system(r'.\Bin\ffmpeg.exe -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
|
60 |
+
#DEBUG
|
61 |
+
#@profile
|
62 |
+
def convert_to_wav(video_file_path, offset=0, overwrite=False):
    """Convert a media file to a 16 kHz mono PCM WAV file with ffmpeg.

    The output is written next to the input, with the extension replaced by
    ``.wav``.

    Args:
        video_file_path: Path of the media file to convert.
        offset: Start position passed to ffmpeg's ``-ss`` option (seconds, or
            any ffmpeg time string). Defaults to the start of the file.
        overwrite: When false and the WAV file already exists, conversion is
            skipped and the existing path is returned.

    Returns:
        The WAV path on success or when conversion is skipped. When conversion
        fails, a dict of the form ``{"error": message}`` is returned instead
        of raising.
    """
    out_path = os.path.splitext(video_file_path)[0] + ".wav"

    if os.path.exists(out_path) and not overwrite:
        print(f"File '{out_path}' already exists. Skipping conversion.")
        logging.info(f"Skipping conversion as file already exists: {out_path}")
        return out_path
    print("Starting conversion process of .m4a to .WAV")

    try:
        if os.name == "nt":
            logging.debug("ffmpeg being ran on windows")
            ffmpeg_cmd = ".\\Bin\\ffmpeg.exe"  # Assumes the working directory contains .\Bin
        elif os.name == "posix":
            ffmpeg_cmd = 'ffmpeg'  # Assume 'ffmpeg' is in PATH for non-Windows systems
        else:
            raise RuntimeError("Unsupported operating system")
        logging.debug(f"ffmpeg_cmd: {ffmpeg_cmd}")

        command = [
            ffmpeg_cmd,
            "-y",  # We only get here when overwriting is intended or the file is absent
            "-ss", str(offset),  # Start position
            "-i", video_file_path,
            "-ar", "16000",  # Audio sample rate
            "-ac", "1",  # Number of audio channels
            "-c:a", "pcm_s16le",  # Audio codec
            out_path
        ]
        # Argument list (no shell) keeps odd file names safe; stdin is closed so ffmpeg never waits for input.
        result = subprocess.run(command, stdin=subprocess.DEVNULL, text=True, capture_output=True)
        if result.returncode != 0:
            logging.error("Error in running FFmpeg")
            logging.error("FFmpeg stderr: %s", result.stderr)
            raise RuntimeError(f"FFmpeg error: {result.stderr}")
        logging.info("FFmpeg executed successfully")
        logging.debug("FFmpeg output: %s", result.stdout)
        logging.info("Conversion to WAV completed: %s", out_path)
    except Exception as e:
        logging.error("convert_to_wav: Error converting file to WAV: %s", str(e))
        return {"error": str(e)}
    gc.collect()
    return out_path
|
118 |
+
|
119 |
+
|
120 |
+
# Transcribe .wav into .segments.json
|
121 |
+
#DEBUG
|
122 |
+
#@profile
|
123 |
+
def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False):
    """Transcribe an audio file with faster_whisper and return its segments.

    Results are cached next to the audio file as ``<name>.segments.json``
    (plus an indented ``<name>.segments_pretty.json``). When the cache file
    exists it is loaded instead of transcribing again.

    Args:
        audio_file_path: Path of the audio file to transcribe.
        selected_source_lang: Language code passed to the model.
        whisper_model: Model name passed to ``get_whisper_model``.
        vad_filter: Passed through to the model's ``transcribe`` call.
        diarize: Accepted for caller compatibility; not used by this function.

    Returns:
        A list of dicts with ``Time_Start``, ``Time_End`` and ``Text`` keys.
        The first segment's text is prefixed with a note naming the model.
        Cached results are unwrapped so they have the same list shape.

    Raises:
        ValueError: If ``audio_file_path`` is ``None``.
        RuntimeError: If transcription fails or produces no segments.
    """
    global whisper_model_instance, processing_choice
    logging.info('speech-to-text: Loading faster_whisper model: %s', whisper_model)

    time_start = time.time()
    if audio_file_path is None:
        raise ValueError("speech-to-text: No audio file provided")
    logging.info("speech-to-text: Audio file path: %s", audio_file_path)

    try:
        # Build sibling paths from the stem; str.replace on the extension breaks when the
        # extension is empty or also occurs elsewhere in the path.
        base_path = os.path.splitext(audio_file_path)[0]
        out_file = base_path + ".segments.json"
        prettified_out_file = base_path + ".segments_pretty.json"
        if os.path.exists(out_file):
            logging.info("speech-to-text: Segments file already exists: %s", out_file)
            with open(out_file) as f:
                cached = json.load(f)
            # The cache is written as {'segments': [...]}; return the bare list like a fresh run.
            if isinstance(cached, dict) and 'segments' in cached:
                cached = cached['segments']
            return cached

        logging.info('speech-to-text: Starting transcription...')
        options = dict(language=selected_source_lang, beam_size=5, best_of=5, vad_filter=vad_filter)
        transcribe_options = dict(task="transcribe", **options)
        # use function and config at top of file
        whisper_model_instance = get_whisper_model(whisper_model, processing_choice)
        segments_raw, info = whisper_model_instance.transcribe(audio_file_path, **transcribe_options)

        segments = []
        for segment_chunk in segments_raw:
            chunk = {
                "Time_Start": segment_chunk.start,
                "Time_End": segment_chunk.end,
                "Text": segment_chunk.text
            }
            logging.debug("Segment: %s", chunk)
            segments.append(chunk)

        if not segments:
            raise RuntimeError("No transcription produced. The audio file may be invalid or empty.")

        segments[0]["Text"] = f"This text was transcribed using whisper model: {whisper_model}\n\n" + segments[0]["Text"]
        logging.info("speech-to-text: Transcription completed in %.2f seconds", time.time() - time_start)

        # Save the segments to a JSON file - prettified and non-prettified
        logging.info("speech-to-text: Saving segments to JSON file")
        output_data = {'segments': segments}

        logging.info("speech-to-text: Saving prettified JSON to %s", prettified_out_file)
        with open(prettified_out_file, 'w') as f:
            json.dump(output_data, f, indent=2)

        logging.info("speech-to-text: Saving JSON to %s", out_file)
        with open(out_file, 'w') as f:
            json.dump(output_data, f)

        logging.debug(f"speech-to-text: returning {segments[:500]}")
        gc.collect()
        return segments

    except Exception as e:
        logging.error("speech-to-text: Error transcribing audio: %s", str(e))
        # Same exception type as before, with the original cause attached for debugging.
        raise RuntimeError("speech-to-text: Error transcribing audio") from e
|
189 |
+
|
190 |
+
#
|
191 |
+
#
|
192 |
#######################################################################################################################
|
App_Function_Libraries/Chat.py
ADDED
@@ -0,0 +1,273 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Chat.py
|
2 |
+
# Chat functions for interacting with the LLMs as chatbots
|
3 |
+
|
4 |
+
# Imports
|
5 |
+
import json
|
6 |
+
import logging
|
7 |
+
import os
|
8 |
+
import re
|
9 |
+
import tempfile
|
10 |
+
from datetime import datetime
|
11 |
+
|
12 |
+
from App_Function_Libraries.DB_Manager import get_conversation_name, save_chat_history_to_database
|
13 |
+
from App_Function_Libraries.LLM_API_Calls import chat_with_openai, chat_with_anthropic, chat_with_cohere, \
|
14 |
+
chat_with_groq, chat_with_openrouter, chat_with_deepseek, chat_with_mistral, chat_with_huggingface, chat_with_vllm
|
15 |
+
from App_Function_Libraries.LLM_API_Calls_Local import chat_with_aphrodite, chat_with_local_llm, chat_with_ollama, \
|
16 |
+
chat_with_kobold, chat_with_llama, chat_with_oobabooga, chat_with_tabbyapi
|
17 |
+
from App_Function_Libraries.SQLite_DB import load_media_content
|
18 |
+
from App_Function_Libraries.Utils import generate_unique_filename
|
19 |
+
|
20 |
+
|
21 |
+
#
|
22 |
+
# External Imports
|
23 |
+
#
|
24 |
+
# Local Imports
|
25 |
+
#
|
26 |
+
|
27 |
+
####################################################################################################
|
28 |
+
def chat(message, history, media_content, selected_parts, api_endpoint, api_key, prompt, temperature,
         system_message=None):
    """Send one chat turn to the selected LLM backend and return its reply.

    On the first turn (empty ``history``) the selected parts of ``media_content``
    are prepended to the user message; on later turns only the message is sent.

    Args:
        message: The user's message for this turn.
        history: Prior conversation turns; only its truthiness is inspected here.
        media_content: Mapping of part name (e.g. "content", "summary") to text.
        selected_parts: A part name, or a list/tuple of part names, to include.
        api_endpoint: Backend name, matched case-insensitively.
        api_key: API key for the backend; may be None (local backends).
        prompt: Prompt forwarded unchanged to the backend function.
        temperature: Sampling temperature; falsy values fall back to 0.7.
        system_message: Optional system message forwarded to the backend.

    Returns:
        The backend's response, or an "An error occurred: ..." string if
        anything fails (including an unsupported endpoint).
    """
    try:
        logging.info(f"Debug - Chat Function - Message: {message}")
        logging.info(f"Debug - Chat Function - Media Content: {media_content}")
        logging.info(f"Debug - Chat Function - Selected Parts: {selected_parts}")
        logging.info(f"Debug - Chat Function - API Endpoint: {api_endpoint}")

        # Ensure selected_parts is a list
        if not isinstance(selected_parts, (list, tuple)):
            selected_parts = [selected_parts] if selected_parts else []

        # Combine the selected parts of the media content
        combined_content = "\n\n".join(
            f"{part.capitalize()}: {media_content.get(part, '')}"
            for part in selected_parts
            if part in media_content
        )

        # Media content is only sent with the first message of a conversation
        if not history:
            input_data = f"{combined_content}\n\nUser: {message}\n"
        else:
            input_data = f"User: {message}\n"

        if system_message:
            logging.debug(f"Debug - Chat Function - System Message: {system_message}")
        temp = float(temperature) if temperature else 0.7

        logging.debug(f"Debug - Chat Function - Temperature: {temp}")
        # Never slice or log the key itself: it may be None for local backends,
        # and even a prefix of a secret does not belong in log files.
        logging.debug("Debug - Chat Function - API Key provided: %s", bool(api_key))
        logging.debug(f"Debug - Chat Function - Prompt: {prompt}")

        # Lazily-evaluated dispatch table: only the chosen backend is called.
        backends = {
            "openai": lambda: chat_with_openai(api_key, input_data, prompt, temp, system_message),
            "anthropic": lambda: chat_with_anthropic(api_key, input_data, prompt, temp, system_message),
            "cohere": lambda: chat_with_cohere(api_key, input_data, prompt, temp, system_message),
            "groq": lambda: chat_with_groq(api_key, input_data, prompt, temp, system_message),
            "openrouter": lambda: chat_with_openrouter(api_key, input_data, prompt, temp, system_message),
            "deepseek": lambda: chat_with_deepseek(api_key, input_data, prompt, temp, system_message),
            "mistral": lambda: chat_with_mistral(api_key, input_data, prompt, temp, system_message),
            "llama.cpp": lambda: chat_with_llama(input_data, prompt, temp, system_message),
            "kobold": lambda: chat_with_kobold(input_data, api_key, prompt, temp, system_message),
            "ooba": lambda: chat_with_oobabooga(input_data, api_key, prompt, temp, system_message),
            "tabbyapi": lambda: chat_with_tabbyapi(input_data, prompt, temp, system_message),
            # NOTE(review): vllm is the only backend called without a temperature - confirm intended
            "vllm": lambda: chat_with_vllm(input_data, prompt, system_message),
            "local-llm": lambda: chat_with_local_llm(input_data, prompt, temp, system_message),
            # huggingface backend does not take a system message
            "huggingface": lambda: chat_with_huggingface(api_key, input_data, prompt, temp),
            "ollama": lambda: chat_with_ollama(input_data, prompt, temp, system_message),
            "aphrodite": lambda: chat_with_aphrodite(input_data, prompt, temp, system_message),
        }

        logging.info(f"Debug - Chat Function - API Endpoint: {api_endpoint}")
        call_backend = backends.get(api_endpoint.lower())
        if call_backend is None:
            raise ValueError(f"Unsupported API endpoint: {api_endpoint}")

        return call_backend()

    except Exception as e:
        logging.error(f"Error in chat function: {str(e)}")
        return f"An error occurred: {str(e)}"
|
109 |
+
|
110 |
+
|
111 |
+
def save_chat_history_to_db_wrapper(chatbot, conversation_id, media_content):
    """Persist a chat history to the database under a generated conversation name.

    The media id and name are read from ``media_content['content']`` (a JSON
    string or a dict) via its ``webpage_url`` and ``title`` keys; placeholders
    are used when either cannot be extracted.

    Returns:
        ``(new_conversation_id, status_message)`` on success, or
        ``(conversation_id, error_message)`` if saving fails.
    """
    logging.info(f"Attempting to save chat history. Media content type: {type(media_content)}")
    try:
        media_id = media_name = None

        if not isinstance(media_content, dict):
            logging.warning(f"media_content is not a dictionary. Type: {type(media_content)}")
        else:
            logging.debug(f"Media content keys: {media_content.keys()}")
            if 'content' not in media_content:
                logging.warning("'content' key not found in media_content")
            else:
                try:
                    raw = media_content['content']
                    if isinstance(raw, dict):
                        parsed = raw
                    elif isinstance(raw, str):
                        parsed = json.loads(raw)
                    else:
                        raise ValueError(f"Unexpected content type: {type(raw)}")

                    # webpage_url doubles as the media id, title as the media name
                    media_id = parsed.get('webpage_url')
                    media_name = parsed.get('title')

                    logging.info(f"Extracted media_id: {media_id}, media_name: {media_name}")
                except json.JSONDecodeError:
                    logging.error("Failed to decode JSON from media_content['content']")
                except Exception as e:
                    logging.error(f"Error processing media_content: {str(e)}")

        # Fall back to placeholders so the save can still proceed
        if media_id is None:
            media_id = "unknown_media"
            logging.warning(f"Unable to extract media_id from media_content. Using placeholder: {media_id}")

        if media_name is None:
            media_name = "Unnamed Media"
            logging.warning(f"Unable to extract media_name from media_content. Using placeholder: {media_name}")

        # Conversation name = media id + current timestamp
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        conversation_name = f"Chat_{media_id}_{stamp}"

        saved_id = save_chat_history_to_database(chatbot, conversation_id, media_id, media_name,
                                                 conversation_name)
        return saved_id, f"Chat history saved successfully as {conversation_name}!"
    except Exception as e:
        error_message = f"Failed to save chat history: {str(e)}"
        logging.error(error_message, exc_info=True)
        return conversation_id, error_message
|
164 |
+
|
165 |
+
|
166 |
+
def save_chat_history(history, conversation_id, media_content):
    """Write the chat history as JSON to a uniquely named file in the temp directory.

    Returns:
        The path of the written file, or None if anything fails. On failure
        the intermediate temporary file is removed instead of being leaked.
    """
    temp_file_path = None
    try:
        content, conversation_name = generate_chat_history_content(history, conversation_id, media_content)

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        # Keep only filename-safe characters
        safe_conversation_name = re.sub(r'[^a-zA-Z0-9_-]', '_', conversation_name)
        base_filename = f"{safe_conversation_name}_{timestamp}.json"

        # Create a temporary file (delete=False: it is renamed below, not discarded)
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.json') as temp_file:
            temp_file_path = temp_file.name
            temp_file.write(content)

        # Generate a unique filename in the same directory as the temp file
        target_dir = os.path.dirname(temp_file_path)
        unique_filename = generate_unique_filename(target_dir, base_filename)
        final_path = os.path.join(target_dir, unique_filename)

        # Rename the temporary file to the unique filename
        os.rename(temp_file_path, final_path)

        return final_path
    except Exception as e:
        logging.error(f"Error saving chat history: {str(e)}")
        # The temp file was created with delete=False; clean it up ourselves
        if temp_file_path and os.path.exists(temp_file_path):
            try:
                os.remove(temp_file_path)
            except OSError as cleanup_error:
                logging.warning(f"Could not remove temporary file {temp_file_path}: {cleanup_error}")
        return None
|
190 |
+
|
191 |
+
|
192 |
+
def generate_chat_history_content(history, conversation_id, media_content):
    """Serialize a chat history to pretty-printed JSON.

    The conversation name comes from ``get_conversation_name``; if that is
    empty it falls back to "<media name>-chat", then to "chat-<timestamp>".

    NOTE(review): roles alternate by list index and only element 0 of a tuple
    entry is kept. If ``history`` holds (user, bot) pairs, the bot half is
    dropped and roles are mislabelled - confirm the expected history format.

    Returns:
        ``(json_text, conversation_name)``.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    conversation_name = get_conversation_name(conversation_id)
    if not conversation_name:
        media_name = extract_media_name(media_content)
        # Fallback name when no media title is available
        conversation_name = f"{media_name}-chat" if media_name else f"chat-{timestamp}"

    entries = []
    for index, msg in enumerate(history):
        role = "bot" if index % 2 else "user"
        text = msg[0] if isinstance(msg, tuple) else msg
        entries.append({"role": role, "content": text})

    payload = {
        "conversation_id": conversation_id,
        "conversation_name": conversation_name,
        "timestamp": timestamp,
        "history": entries,
    }

    return json.dumps(payload, indent=2), conversation_name
|
218 |
+
|
219 |
+
|
220 |
+
def extract_media_name(media_content):
    """Return the media title (or name) stored in ``media_content['content']``.

    ``content`` may be a dict or a JSON string encoding one. Returns None when
    the input is not a dict, the JSON cannot be parsed, or neither a ``title``
    nor a ``name`` is present.
    """
    if not isinstance(media_content, dict):
        logging.warning(f"Unexpected media_content format: {type(media_content)}")
        return None

    payload = media_content.get('content', {})
    if isinstance(payload, str):
        try:
            payload = json.loads(payload)
        except json.JSONDecodeError:
            logging.warning("Failed to parse media_content JSON string")
            return None

    # Prefer the title, fall back to the name
    if isinstance(payload, dict):
        return payload.get('title') or payload.get('name')

    logging.warning(f"Unexpected media_content format: {type(media_content)}")
    return None
|
236 |
+
|
237 |
+
|
238 |
+
def update_chat_content(selected_item, use_content, use_summary, use_prompt, item_mapping):
    """Load the media record for ``selected_item`` and list which parts to use.

    Returns:
        ``(content, selected_parts)`` where ``selected_parts`` holds each of
        "content", "summary", "prompt" whose flag is set and which is present
        in the loaded record; ``({}, [])`` when nothing valid is selected.
    """
    logging.debug(f"Debug - Update Chat Content - Selected Item: {selected_item}\n")
    logging.debug(f"Debug - Update Chat Content - Use Content: {use_content}\n\n\n\n")
    logging.debug(f"Debug - Update Chat Content - Use Summary: {use_summary}\n\n")
    logging.debug(f"Debug - Update Chat Content - Use Prompt: {use_prompt}\n\n")
    logging.debug(f"Debug - Update Chat Content - Item Mapping: {item_mapping}\n\n")

    # Guard: nothing selected, or the selection is unknown
    if not (selected_item and selected_item in item_mapping):
        print(f"Debug - Update Chat Content - No item selected or item not in mapping")
        return {}, []

    loaded = load_media_content(item_mapping[selected_item])

    wanted = (("content", use_content), ("summary", use_summary), ("prompt", use_prompt))
    selected_parts = [part for part, enabled in wanted if enabled and part in loaded]

    # Debug output: a preview of what was loaded
    if not isinstance(loaded, dict):
        print(f"Debug - Update Chat Content - Content(first 500 char): {str(loaded)[:500]}\n\n\n\n")
    else:
        print(f"Debug - Update Chat Content - Content keys: {list(loaded.keys())}")
        for key, value in loaded.items():
            print(f"Debug - Update Chat Content - {key} (first 500 char): {str(value)[:500]}\n\n\n\n")

    print(f"Debug - Update Chat Content - Selected Parts: {selected_parts}")
    return loaded, selected_parts
|
269 |
+
|
270 |
+
|
271 |
+
#
|
272 |
+
# End of Chat.py
|
273 |
+
##########################################################################################################################
|
App_Function_Libraries/Chat_related_functions.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Chat_related_functions.py
|
2 |
+
# Contains functions related to chat
|
3 |
+
# WIP.
|
4 |
+
#
|
5 |
+
# Importing required libraries
|
6 |
+
import json
|
7 |
+
import os
|
8 |
+
from pathlib import Path
|
9 |
+
import json
|
10 |
+
#
|
11 |
+
########################################################################################################################
|
12 |
+
# Set globals
|
13 |
+
# JSON file holding all saved character cards (path is relative to the working directory)
CHARACTERS_FILE = Path('.') / 'Helper_Scripts' / 'Character_Cards' / 'Characters.json'
|
14 |
+
|
15 |
+
def save_character(character_data):
    """Add or replace one character in the characters JSON file.

    Args:
        character_data: Mapping describing the character; its ``'name'`` value
            is used as the key, so an existing entry with that name is overwritten.

    Raises:
        KeyError: If ``character_data`` has no ``'name'`` key.
    """
    if CHARACTERS_FILE.exists():
        with CHARACTERS_FILE.open('r') as f:
            characters = json.load(f)
    else:
        characters = {}

    characters[character_data['name']] = character_data

    # On a first save the character-card directory may not exist yet;
    # opening the file for writing would then fail with FileNotFoundError.
    CHARACTERS_FILE.parent.mkdir(parents=True, exist_ok=True)

    with CHARACTERS_FILE.open('w') as f:
        json.dump(characters, f, indent=2)
|
26 |
+
|
27 |
+
|
28 |
+
def load_characters():
    """Return the parsed contents of the characters file, or {} if it does not exist."""
    if not os.path.exists(CHARACTERS_FILE):
        return {}
    with open(CHARACTERS_FILE, 'r') as handle:
        return json.load(handle)
|
33 |
+
|
34 |
+
|
35 |
+
def get_character_names():
    """Return the names (top-level keys) of all saved characters as a list."""
    return list(load_characters().keys())
|
38 |
+
|
39 |
+
|
40 |
+
|
41 |
+
|
App_Function_Libraries/ChromaDB_Library.py
ADDED
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import configparser
|
2 |
+
import logging
|
3 |
+
import sqlite3
|
4 |
+
from typing import List, Dict, Any
|
5 |
+
|
6 |
+
import chromadb
|
7 |
+
import requests
|
8 |
+
|
9 |
+
from App_Function_Libraries.Chunk_Lib import improved_chunking_process
|
10 |
+
|
11 |
+
#######################################################################################################################
|
12 |
+
#
|
13 |
+
# Functions for ChromaDB
|
14 |
+
|
15 |
+
# Get ChromaDB settings
|
16 |
+
# Load configuration
|
17 |
+
# Parse config.txt; every lookup below supplies a fallback for missing entries
config = configparser.ConfigParser()
config.read('config.txt')

# Persistent ChromaDB client rooted at the configured directory
chroma_db_path = config.get('Database', 'chroma_db_path', fallback='chroma_db')
chroma_client = chromadb.PersistentClient(path=chroma_db_path)

# Embedding backend settings
embedding_provider = config.get('Embeddings', 'provider', fallback='openai')
embedding_model = config.get('Embeddings', 'model', fallback='text-embedding-3-small')
embedding_api_key = config.get('Embeddings', 'api_key', fallback='')
embedding_api_url = config.get('Embeddings', 'api_url', fallback='')

# Options handed to improved_chunking_process
chunk_options = dict(
    method=config.get('Chunking', 'method', fallback='words'),
    max_size=config.getint('Chunking', 'max_size', fallback=400),
    overlap=config.getint('Chunking', 'overlap', fallback=200),
    adaptive=config.getboolean('Chunking', 'adaptive', fallback=False),
    multi_level=config.getboolean('Chunking', 'multi_level', fallback=False),
    language=config.get('Chunking', 'language', fallback='english'),
)
|
37 |
+
|
38 |
+
|
39 |
+
def auto_update_chroma_embeddings(media_id: int, content: str):
    """
    Automatically update ChromaDB embeddings when a new item is ingested into the SQLite database.

    Embeddings are stored in a per-item collection named ``media_<media_id>``;
    nothing is done if that collection already contains the item's first chunk.

    :param media_id: The ID of the newly ingested media item
    :param content: The content of the newly ingested media item
    """
    collection_name = f"media_{media_id}"

    # Initialize or get the ChromaDB collection
    collection = chroma_client.get_or_create_collection(name=collection_name)

    # Chunk IDs are "<media_id>_chunk_<i>" starting at 0, so looking up chunk 0
    # is enough (no need to build one ID per character of the content).
    existing = collection.get(ids=[f"{media_id}_chunk_0"])

    # collection.get() returns a dict that is never empty, so the result
    # itself is always truthy; the matches are listed under its 'ids' key.
    if existing and existing.get('ids'):
        logging.info(f"Embeddings already exist for media ID {media_id}, skipping...")
    else:
        # Process and store content if embeddings do not already exist
        process_and_store_content(content, collection_name, media_id)
        logging.info(f"Updated ChromaDB embeddings for media ID: {media_id}")
|
60 |
+
|
61 |
+
|
62 |
+
# Function to process content, create chunks, embeddings, and store in ChromaDB and SQLite
|
63 |
+
def process_and_store_content(content: str, collection_name: str, media_id: int):
    """Chunk ``content``, embed each chunk, and store the chunks in ChromaDB and SQLite FTS.

    Args:
        content: Full text of the media item.
        collection_name: ChromaDB collection that receives the chunks.
        media_id: Media ID used to build the chunk IDs ``<media_id>_chunk_<i>``.
    """
    # Process the content into chunks
    chunks = improved_chunking_process(content, chunk_options)
    texts = [chunk['text'] for chunk in chunks]

    # Nothing to embed (e.g. empty content): avoid handing empty lists to ChromaDB
    if not texts:
        logging.warning(f"No chunks produced for media ID {media_id}; nothing stored.")
        return

    # Generate embeddings for each chunk
    embeddings = [create_embedding(text) for text in texts]

    # Create unique IDs for each chunk using the media_id and chunk index
    ids = [f"{media_id}_chunk_{i}" for i in range(len(texts))]

    # Store the texts, embeddings, and IDs in ChromaDB
    store_in_chroma(collection_name, texts, embeddings, ids)

    # Store the chunks in SQLite FTS as well.
    # Imported here rather than at module top, as in the original code.
    from App_Function_Libraries.DB_Manager import db
    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.executemany("INSERT INTO media_fts (content) VALUES (?)", [(text,) for text in texts])
        conn.commit()
|
84 |
+
|
85 |
+
|
86 |
+
# Function to store documents and their embeddings in ChromaDB
|
87 |
+
def store_in_chroma(collection_name: str, texts: List[str], embeddings: List[List[float]], ids: List[str]):
    """Add documents with their embeddings and IDs to the named collection (created if missing)."""
    target = chroma_client.get_or_create_collection(name=collection_name)
    target.add(documents=texts, embeddings=embeddings, ids=ids)
|
94 |
+
|
95 |
+
# Function to perform vector search using ChromaDB
|
96 |
+
def vector_search(collection_name: str, query: str, k: int = 10) -> List[str]:
    """Embed ``query`` and return the documents of its ``k`` nearest neighbours in the collection."""
    # Embed first, then open the collection (same order as before)
    embedded_query = create_embedding(query)
    target = chroma_client.get_collection(name=collection_name)
    hits = target.query(query_embeddings=[embedded_query], n_results=k)
    # Results are grouped per query embedding; only one was sent
    return hits['documents'][0]
|
104 |
+
|
105 |
+
|
106 |
+
# Loaded (tokenizer, model) pairs keyed by model name, so the HuggingFace
# model is not reloaded from disk for every single chunk.
_HF_EMBEDDING_MODELS = {}


def create_embedding(text: str) -> List[float]:
    """Return the embedding vector for ``text`` using the configured provider.

    The provider is read from the module-level ``embedding_provider``
    ('openai', 'local' or 'huggingface').

    Raises:
        ValueError: If the configured provider is not supported.
    """
    if embedding_provider == 'openai':
        import openai
        if hasattr(openai, 'OpenAI'):
            # openai>=1.0: the module-level Embedding API was removed; use a client.
            # An empty configured key becomes None so the client can fall back
            # to the OPENAI_API_KEY environment variable.
            client = openai.OpenAI(api_key=embedding_api_key or None)
            response = client.embeddings.create(input=text, model=embedding_model)
            return response.data[0].embedding
        # Legacy openai<1.0 interface
        openai.api_key = embedding_api_key
        response = openai.Embedding.create(input=text, model=embedding_model)
        return response['data'][0]['embedding']
    elif embedding_provider == 'local':
        # FIXME - This is a placeholder for API calls to a local embedding model
        response = requests.post(
            embedding_api_url,
            json={"text": text, "model": embedding_model},
            headers={"Authorization": f"Bearer {embedding_api_key}"}
        )
        # Surface HTTP errors directly instead of failing later on a missing key
        response.raise_for_status()
        return response.json()['embedding']
    elif embedding_provider == 'huggingface':
        from transformers import AutoTokenizer, AutoModel
        import torch

        if embedding_model not in _HF_EMBEDDING_MODELS:
            _HF_EMBEDDING_MODELS[embedding_model] = (
                AutoTokenizer.from_pretrained(embedding_model),
                AutoModel.from_pretrained(embedding_model),
            )
        tokenizer, model = _HF_EMBEDDING_MODELS[embedding_model]

        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)

        # Use the mean of the last hidden state as the sentence embedding
        embeddings = outputs.last_hidden_state.mean(dim=1)
        return embeddings[0].tolist()  # Convert to list for consistency
    else:
        raise ValueError(f"Unsupported embedding provider: {embedding_provider}")
|
137 |
+
|
138 |
+
|
139 |
+
def create_all_embeddings(api_choice: str) -> str:
    """Create and store embeddings for every media item that does not have one yet.

    Sets the module-level ``embedding_provider`` to ``api_choice`` and stores
    new embeddings in the "all_content_embeddings" collection under IDs
    ``doc_<media id>``.

    Returns:
        A status message; errors are reported as "Error: ..." instead of raised.
    """
    try:
        global embedding_provider
        embedding_provider = api_choice

        all_content = get_all_content_from_database()

        if not all_content:
            return "No content found in the database."

        texts_to_embed = []
        embeddings_to_store = []
        ids_to_store = []
        collection_name = "all_content_embeddings"

        # Initialize or get the ChromaDB collection
        collection = chroma_client.get_or_create_collection(name=collection_name)

        for content_item in all_content:
            media_id = content_item['id']
            text = content_item['content']
            doc_id = f"doc_{media_id}"

            # collection.get() returns a dict that is always truthy; whether the
            # ID exists is told by its 'ids' list being non-empty.
            existing = collection.get(ids=[doc_id])
            if existing and existing.get('ids'):
                logging.info(f"Embedding already exists for media ID {media_id}, skipping...")
                continue  # Skip if embedding already exists

            # One empty item must not abort the whole batch
            if not text:
                logging.warning(f"Media ID {media_id} has no content, skipping...")
                continue

            # Create the embedding
            embedding = create_embedding(text)

            # Collect the text, embedding, and ID for batch storage
            texts_to_embed.append(text)
            embeddings_to_store.append(embedding)
            ids_to_store.append(doc_id)

        # Store all new embeddings in ChromaDB
        if texts_to_embed and embeddings_to_store:
            store_in_chroma(collection_name, texts_to_embed, embeddings_to_store, ids_to_store)

        return "Embeddings created and stored successfully for all new content."
    except Exception as e:
        logging.error(f"Error during embedding creation: {str(e)}")
        return f"Error: {str(e)}"
|
184 |
+
|
185 |
+
|
186 |
+
def get_all_content_from_database() -> List[Dict[str, Any]]:
    """
    Fetch every non-trashed row of the Media table.

    Returns:
        List[Dict[str, Any]]: One dictionary per row with the keys
        'id', 'content', 'title', 'author' and 'type'.

    Raises:
        DatabaseError: If the underlying SQLite query fails.
    """
    try:
        from App_Function_Libraries.DB_Manager import db
        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT id, content, title, author, type
                FROM Media
                WHERE is_trash = 0 -- Exclude items marked as trash
            """)
            rows = cursor.fetchall()

        # Map each row's five selected columns onto named keys
        records = []
        for row in rows:
            records.append({
                'id': row[0],
                'content': row[1],
                'title': row[2],
                'author': row[3],
                'type': row[4],
            })
        return records

    except sqlite3.Error as e:
        logging.error(f"Error retrieving all content from database: {e}")
        from App_Function_Libraries.SQLite_DB import DatabaseError
        raise DatabaseError(f"Error retrieving all content from database: {e}")
|
222 |
+
|
223 |
+
#
|
224 |
+
# End of Functions for ChromaDB
|
225 |
+
#######################################################################################################################
|
App_Function_Libraries/Chunk_Lib.py
CHANGED
@@ -1,583 +1,587 @@
|
|
1 |
-
# Chunk_Lib.py
|
2 |
-
#########################################
|
3 |
-
# Chunking Library
|
4 |
-
# This library is used to perform chunking of input files.
|
5 |
-
# Currently, uses naive approaches. Nothing fancy.
|
6 |
-
#
|
7 |
-
####
|
8 |
-
# Import necessary libraries
|
9 |
-
import logging
|
10 |
-
import re
|
11 |
-
|
12 |
-
from typing import List, Optional, Tuple, Dict, Any
|
13 |
-
|
14 |
-
from openai import OpenAI
|
15 |
-
from tqdm import tqdm
|
16 |
-
#
|
17 |
-
# Import 3rd party
|
18 |
-
from transformers import GPT2Tokenizer
|
19 |
-
import nltk
|
20 |
-
from nltk.tokenize import sent_tokenize, word_tokenize
|
21 |
-
from sklearn.feature_extraction.text import TfidfVectorizer
|
22 |
-
from sklearn.metrics.pairwise import cosine_similarity
|
23 |
-
#
|
24 |
-
# Import Local
|
25 |
-
from App_Function_Libraries.Tokenization_Methods_Lib import openai_tokenize
|
26 |
-
from App_Function_Libraries.Utils import load_comprehensive_config
|
27 |
-
|
28 |
-
|
29 |
-
#
|
30 |
-
#######################################################################################################################
|
31 |
-
# Function Definitions
|
32 |
-
#
|
33 |
-
|
34 |
-
# Ensure NLTK data is downloaded
def ntlk_prep():
    """Make sure the NLTK 'punkt' tokenizer data is available.

    The data is looked up locally first and downloaded only when missing.
    """
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')
|
38 |
-
|
39 |
-
# Load GPT2 tokenizer
|
40 |
-
# NOTE(review): appears to be a module-level statement, i.e. the "gpt2" tokenizer is loaded at import time - confirm
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
|
41 |
-
|
42 |
-
# Load Config file for API keys
|
43 |
-
# load_comprehensive_config is imported from App_Function_Libraries.Utils
config = load_comprehensive_config()
|
44 |
-
# None when the [API] section has no openai_api_key entry
openai_api_key = config.get('API', 'openai_api_key', fallback=None)
|
45 |
-
|
46 |
-
def load_document(file_path, encoding='utf-8'):
    """Read a text file and return its content with all whitespace runs collapsed.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding of the file. Given explicitly so the result
            does not depend on the platform's default encoding.

    Returns:
        The file's text with every run of whitespace replaced by a single
        space and leading/trailing whitespace removed.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        text = file.read()
    return re.sub(r'\s+', ' ', text).strip()
|
50 |
-
|
51 |
-
|
52 |
-
def improved_chunking_process(text: str, chunk_options: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Split ``text`` according to ``chunk_options`` and attach metadata to each chunk.

    Recognised options: 'method' (words, sentences, paragraphs, tokens,
    chapters; anything else leaves the text unchunked), 'max_size', 'overlap',
    'language', 'adaptive' and 'multi_level'.

    Returns:
        A list of ``{'text': ..., 'metadata': ...}`` dicts. For the 'chapters'
        method (without multi_level) the result of ``chunk_ebook_by_chapters``
        is returned directly.
    """
    method = chunk_options.get('method', 'words')
    size = chunk_options.get('max_size', 300)
    overlap = chunk_options.get('overlap', 0)
    language = chunk_options.get('language', 'english')

    if chunk_options.get('adaptive', False):
        size = adaptive_chunk_size(text, size)

    if chunk_options.get('multi_level', False):
        pieces = multi_level_chunking(text, method, size, overlap, language)
    elif method == 'chapters':
        return chunk_ebook_by_chapters(text, chunk_options)
    elif method == 'words':
        pieces = chunk_text_by_words(text, size, overlap)
    elif method == 'sentences':
        pieces = chunk_text_by_sentences(text, size, overlap, language)
    elif method == 'paragraphs':
        pieces = chunk_text_by_paragraphs(text, size, overlap)
    elif method == 'tokens':
        pieces = chunk_text_by_tokens(text, size, overlap)
    else:
        # No chunking applied
        pieces = [text]

    result = []
    for piece in pieces:
        result.append({'text': piece, 'metadata': get_chunk_metadata(piece, text)})
    return result
|
81 |
-
|
82 |
-
|
83 |
-
def adaptive_chunk_size(text: str, base_size: int) -> int:
    """Shrink the chunk size for text made of long words.

    Args:
        text: Text to inspect.
        base_size: Chunk size to start from.

    Returns:
        ``int(base_size * 0.8)`` when the average word length exceeds 6
        characters, otherwise ``base_size``. Text without any words keeps
        ``base_size``.
    """
    words = text.split()
    if not words:
        # Empty or whitespace-only text: no average to compute
        return base_size

    avg_word_length = sum(len(word) for word in words) / len(words)
    if avg_word_length > 6:  # Arbitrary threshold for "complex" text
        return int(base_size * 0.8)  # Reduce chunk size for complex text
    return base_size
|
89 |
-
|
90 |
-
|
91 |
-
def multi_level_chunking(text: str, method: str, max_size: int, overlap: int, language: str) -> List[str]:
    """Chunk by paragraphs first, then re-chunk every paragraph group with ``method``.

    Only 'words' and 'sentences' trigger the second pass; for any other
    method the paragraph-level chunks are returned unchanged.
    """
    # First level: paragraph groups, using twice the requested size
    sections = chunk_text_by_paragraphs(text, max_size * 2, overlap)

    # Second level: split each group further
    result = []
    for section in sections:
        if method == 'sentences':
            result.extend(chunk_text_by_sentences(section, max_size, overlap, language))
        elif method == 'words':
            result.extend(chunk_text_by_words(section, max_size, overlap))
        else:
            result.append(section)
    return result
|
106 |
-
|
107 |
-
|
108 |
-
def chunk_text_by_words(text: str, max_words: int = 300, overlap: int = 0) -> List[str]:
    """Split ``text`` into chunks of at most ``max_words`` whitespace-separated words.

    Args:
        text: Text to split.
        max_words: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``max_words``. The
            window could then never advance (a non-positive step previously
            produced either a cryptic range() error or silently no chunks).
    """
    step = max_words - overlap
    if step <= 0:
        raise ValueError(f"overlap ({overlap}) must be smaller than max_words ({max_words})")

    words = text.split()
    chunks = [' '.join(words[i:i + max_words]) for i in range(0, len(words), step)]
    return post_process_chunks(chunks)
|
115 |
-
|
116 |
-
|
117 |
-
def chunk_text_by_sentences(text: str, max_sentences: int = 10, overlap: int = 0, language: str = 'english') -> List[
    str]:
    """Split ``text`` into chunks of at most ``max_sentences`` sentences.

    Sentences are detected with NLTK's ``sent_tokenize`` for ``language``.

    Args:
        text: Text to split.
        max_sentences: Maximum number of sentences per chunk.
        overlap: Number of sentences shared between consecutive chunks.
        language: Language name passed to the NLTK tokenizer.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``max_sentences``. The
            window could then never advance (a non-positive step previously
            produced either a cryptic range() error or silently no chunks).
    """
    step = max_sentences - overlap
    if step <= 0:
        raise ValueError(f"overlap ({overlap}) must be smaller than max_sentences ({max_sentences})")

    nltk.download('punkt', quiet=True)
    sentences = nltk.sent_tokenize(text, language=language)
    chunks = [' '.join(sentences[i:i + max_sentences]) for i in range(0, len(sentences), step)]
    return post_process_chunks(chunks)
|
126 |
-
|
127 |
-
|
128 |
-
def chunk_text_by_paragraphs(text: str, max_paragraphs: int = 5, overlap: int = 0) -> List[str]:
|
129 |
-
paragraphs = re.split(r'\n\s*\n', text)
|
130 |
-
chunks = []
|
131 |
-
for i in range(0, len(paragraphs), max_paragraphs - overlap):
|
132 |
-
chunk = '\n\n'.join(paragraphs[i:i + max_paragraphs])
|
133 |
-
chunks.append(chunk)
|
134 |
-
return post_process_chunks(chunks)
|
135 |
-
|
136 |
-
|
137 |
-
def chunk_text_by_tokens(text: str, max_tokens: int = 1000, overlap: int = 0) -> List[str]:
|
138 |
-
# This is a simplified token-based chunking. For more accurate tokenization,
|
139 |
-
# consider using a proper tokenizer like GPT-2 TokenizerFast
|
140 |
-
words = text.split()
|
141 |
-
chunks = []
|
142 |
-
current_chunk = []
|
143 |
-
current_token_count = 0
|
144 |
-
|
145 |
-
for word in words:
|
146 |
-
word_token_count = len(word) // 4 + 1 # Rough estimate of token count
|
147 |
-
if current_token_count + word_token_count > max_tokens and current_chunk:
|
148 |
-
chunks.append(' '.join(current_chunk))
|
149 |
-
current_chunk = current_chunk[-overlap:] if overlap > 0 else []
|
150 |
-
current_token_count = sum(len(w) // 4 + 1 for w in current_chunk)
|
151 |
-
|
152 |
-
current_chunk.append(word)
|
153 |
-
current_token_count += word_token_count
|
154 |
-
|
155 |
-
if current_chunk:
|
156 |
-
chunks.append(' '.join(current_chunk))
|
157 |
-
|
158 |
-
return post_process_chunks(chunks)
|
159 |
-
|
160 |
-
|
161 |
-
def post_process_chunks(chunks: List[str]) -> List[str]:
|
162 |
-
return [chunk.strip() for chunk in chunks if chunk.strip()]
|
163 |
-
|
164 |
-
|
165 |
-
def get_chunk_metadata(chunk: str, full_text: str, chunk_type: str = "generic", chapter_number: Optional[int] = None, chapter_pattern: Optional[str] = None) -> Dict[str, Any]:
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
-
|
191 |
-
|
192 |
-
|
193 |
-
|
194 |
-
current_chunk
|
195 |
-
current_length
|
196 |
-
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
|
205 |
-
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
combined_chunks
|
212 |
-
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
#
|
253 |
-
# print(
|
254 |
-
#
|
255 |
-
#
|
256 |
-
# print(
|
257 |
-
#
|
258 |
-
#
|
259 |
-
# print(
|
260 |
-
#
|
261 |
-
#
|
262 |
-
# print(
|
263 |
-
|
264 |
-
|
265 |
-
|
266 |
-
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
|
271 |
-
#
|
272 |
-
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
|
277 |
-
|
278 |
-
return len(text)
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
291 |
-
|
292 |
-
|
293 |
-
|
294 |
-
|
295 |
-
|
296 |
-
|
297 |
-
|
298 |
-
|
299 |
-
|
300 |
-
|
301 |
-
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
307 |
-
|
308 |
-
|
309 |
-
|
310 |
-
|
311 |
-
|
312 |
-
|
313 |
-
|
314 |
-
|
315 |
-
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
-
|
326 |
-
|
327 |
-
|
328 |
-
|
329 |
-
|
330 |
-
|
331 |
-
|
332 |
-
|
333 |
-
|
334 |
-
|
335 |
-
|
336 |
-
|
337 |
-
|
338 |
-
|
339 |
-
|
340 |
-
|
341 |
-
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
|
356 |
-
|
357 |
-
|
358 |
-
|
359 |
-
|
360 |
-
|
361 |
-
|
362 |
-
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
|
367 |
-
|
368 |
-
|
369 |
-
|
370 |
-
|
371 |
-
|
372 |
-
|
373 |
-
|
374 |
-
|
375 |
-
|
376 |
-
|
377 |
-
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
|
383 |
-
|
384 |
-
|
385 |
-
|
386 |
-
|
387 |
-
|
388 |
-
|
389 |
-
|
390 |
-
|
391 |
-
|
392 |
-
candidate
|
393 |
-
candidate_indices
|
394 |
-
|
395 |
-
|
396 |
-
|
397 |
-
|
398 |
-
|
399 |
-
|
400 |
-
|
401 |
-
|
402 |
-
|
403 |
-
|
404 |
-
|
405 |
-
|
406 |
-
|
407 |
-
|
408 |
-
|
409 |
-
|
410 |
-
|
411 |
-
|
412 |
-
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
-
|
417 |
-
|
418 |
-
-
|
419 |
-
|
420 |
-
|
421 |
-
|
422 |
-
-
|
423 |
-
|
424 |
-
|
425 |
-
|
426 |
-
|
427 |
-
|
428 |
-
|
429 |
-
|
430 |
-
|
431 |
-
|
432 |
-
|
433 |
-
|
434 |
-
|
435 |
-
|
436 |
-
|
437 |
-
|
438 |
-
|
439 |
-
|
440 |
-
|
441 |
-
|
442 |
-
|
443 |
-
|
444 |
-
|
445 |
-
|
446 |
-
|
447 |
-
|
448 |
-
|
449 |
-
|
450 |
-
|
451 |
-
|
452 |
-
|
453 |
-
|
454 |
-
|
455 |
-
|
456 |
-
|
457 |
-
|
458 |
-
|
459 |
-
|
460 |
-
|
461 |
-
|
462 |
-
|
463 |
-
|
464 |
-
|
465 |
-
|
466 |
-
|
467 |
-
|
468 |
-
|
469 |
-
|
470 |
-
|
471 |
-
|
472 |
-
|
473 |
-
|
474 |
-
|
475 |
-
|
476 |
-
|
477 |
-
|
478 |
-
|
479 |
-
|
480 |
-
#
|
481 |
-
|
482 |
-
|
483 |
-
|
484 |
-
|
485 |
-
|
486 |
-
|
487 |
-
|
488 |
-
|
489 |
-
|
490 |
-
|
491 |
-
|
492 |
-
|
493 |
-
|
494 |
-
|
495 |
-
|
496 |
-
|
497 |
-
|
498 |
-
|
499 |
-
|
500 |
-
|
501 |
-
|
502 |
-
|
503 |
-
|
504 |
-
|
505 |
-
if
|
506 |
-
|
507 |
-
|
508 |
-
|
509 |
-
|
510 |
-
|
511 |
-
|
512 |
-
|
513 |
-
#
|
514 |
-
|
515 |
-
|
516 |
-
|
517 |
-
|
518 |
-
|
519 |
-
|
520 |
-
|
521 |
-
|
522 |
-
|
523 |
-
|
524 |
-
|
525 |
-
|
526 |
-
|
527 |
-
|
528 |
-
|
529 |
-
|
530 |
-
|
531 |
-
|
532 |
-
|
533 |
-
|
534 |
-
|
535 |
-
|
536 |
-
|
537 |
-
|
538 |
-
|
539 |
-
|
540 |
-
#
|
541 |
-
#
|
542 |
-
#
|
543 |
-
#
|
544 |
-
#
|
545 |
-
#
|
546 |
-
#
|
547 |
-
#
|
548 |
-
#
|
549 |
-
#
|
550 |
-
#
|
551 |
-
#
|
552 |
-
#
|
553 |
-
#
|
554 |
-
#
|
555 |
-
#
|
556 |
-
#
|
557 |
-
#
|
558 |
-
#
|
559 |
-
#
|
560 |
-
#
|
561 |
-
# This is the
|
562 |
-
#
|
563 |
-
#
|
564 |
-
#
|
565 |
-
#
|
566 |
-
#
|
567 |
-
#
|
568 |
-
#
|
569 |
-
#
|
570 |
-
#
|
571 |
-
#
|
572 |
-
#
|
573 |
-
#
|
574 |
-
#
|
575 |
-
#
|
576 |
-
#
|
577 |
-
|
578 |
-
|
579 |
-
|
580 |
-
|
581 |
-
|
582 |
-
|
|
|
|
|
|
|
|
|
583 |
#######################################################################################################################
|
|
|
1 |
+
# Chunk_Lib.py
|
2 |
+
#########################################
|
3 |
+
# Chunking Library
|
4 |
+
# This library is used to perform chunking of input files.
|
5 |
+
# Currently, uses naive approaches. Nothing fancy.
|
6 |
+
#
|
7 |
+
####
|
8 |
+
# Import necessary libraries
|
9 |
+
import logging
|
10 |
+
import re
|
11 |
+
|
12 |
+
from typing import List, Optional, Tuple, Dict, Any
|
13 |
+
|
14 |
+
from openai import OpenAI
|
15 |
+
from tqdm import tqdm
|
16 |
+
#
|
17 |
+
# Import 3rd party
|
18 |
+
from transformers import GPT2Tokenizer
|
19 |
+
import nltk
|
20 |
+
from nltk.tokenize import sent_tokenize, word_tokenize
|
21 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
22 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
23 |
+
#
|
24 |
+
# Import Local
|
25 |
+
from App_Function_Libraries.Tokenization_Methods_Lib import openai_tokenize
|
26 |
+
from App_Function_Libraries.Utils import load_comprehensive_config
|
27 |
+
|
28 |
+
|
29 |
+
#
|
30 |
+
#######################################################################################################################
|
31 |
+
# Function Definitions
|
32 |
+
#
|
33 |
+
|
34 |
+
# Ensure NLTK data is downloaded (fetched only when it is not already installed)
def ntlk_prep():
    """Make sure the NLTK 'punkt' tokenizer data is available.

    The resource is looked up first; ``nltk.download`` is only called when the
    lookup fails, so repeated calls do not trigger a download attempt each time.
    """
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        # Resource is missing locally: fetch it once.
        nltk.download('punkt')
|
38 |
+
|
39 |
+
# Load GPT2 tokenizer
|
40 |
+
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
|
41 |
+
|
42 |
+
# Load Config file for API keys
|
43 |
+
config = load_comprehensive_config()
|
44 |
+
openai_api_key = config.get('API', 'openai_api_key', fallback=None)
|
45 |
+
|
46 |
+
def load_document(file_path):
    """Read a text file and return its content on a single line.

    Parameters:
        file_path: Path of the file to read.

    Returns:
        The file content with every run of whitespace (including newlines)
        collapsed to one space and leading/trailing whitespace removed.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the platform's
    # default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        text = file.read()
    return re.sub('\\s+', ' ', text).strip()
|
50 |
+
|
51 |
+
|
52 |
+
def improved_chunking_process(text: str, chunk_options: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Split ``text`` according to ``chunk_options`` and attach metadata to each chunk.

    Recognised option keys (with defaults): 'method' ('words'), 'max_size' (300),
    'overlap' (0), 'language' ('english'), 'adaptive' (False), 'multi_level' (False).

    Returns:
        A list of ``{'text': ..., 'metadata': ...}`` dictionaries. For the
        'chapters' method the result of ``chunk_ebook_by_chapters`` is returned
        directly. An unrecognised method yields the whole text as a single chunk.
    """
    method = chunk_options.get('method', 'words')
    size = chunk_options.get('max_size', 300)
    overlap = chunk_options.get('overlap', 0)
    language = chunk_options.get('language', 'english')

    # Optionally shrink the chunk size based on the text itself.
    if chunk_options.get('adaptive', False):
        size = adaptive_chunk_size(text, size)

    if chunk_options.get('multi_level', False):
        pieces = multi_level_chunking(text, method, size, overlap, language)
    elif method == 'words':
        pieces = chunk_text_by_words(text, size, overlap)
    elif method == 'sentences':
        pieces = chunk_text_by_sentences(text, size, overlap, language)
    elif method == 'paragraphs':
        pieces = chunk_text_by_paragraphs(text, size, overlap)
    elif method == 'tokens':
        pieces = chunk_text_by_tokens(text, size, overlap)
    elif method == 'chapters':
        # Chapter chunking builds its own metadata, so hand its result straight back.
        return chunk_ebook_by_chapters(text, chunk_options)
    else:
        # No chunking applied
        pieces = [text]

    results = []
    for piece in pieces:
        results.append({'text': piece, 'metadata': get_chunk_metadata(piece, text)})
    return results
|
81 |
+
|
82 |
+
|
83 |
+
def adaptive_chunk_size(text: str, base_size: int) -> int:
    """Return a chunk size adjusted for the average word length of ``text``.

    Parameters:
        text: The text that is about to be chunked.
        base_size: The chunk size requested by the caller.

    Returns:
        ``int(base_size * 0.8)`` when the average word length exceeds 6
        characters, otherwise ``base_size``. Text without any words (empty or
        whitespace only) leaves ``base_size`` unchanged.
    """
    words = text.split()
    if not words:
        # Nothing to measure; also avoids dividing by zero below.
        return base_size

    avg_word_length = sum(len(word) for word in words) / len(words)
    if avg_word_length > 6:  # Arbitrary threshold for "complex" text
        return int(base_size * 0.8)  # Reduce chunk size for complex text
    return base_size
|
89 |
+
|
90 |
+
|
91 |
+
def multi_level_chunking(text: str, method: str, max_size: int, overlap: int, language: str) -> List[str]:
    """Chunk ``text`` in two passes: paragraph groups first, then ``method`` within each group.

    The first pass groups up to ``max_size * 2`` paragraphs per block. The second
    pass splits every block by words or sentences; any other ``method`` keeps the
    paragraph blocks as they are.
    """
    # First level: chunk by paragraphs
    blocks = chunk_text_by_paragraphs(text, max_size * 2, overlap)

    # Second level: chunk each paragraph block further
    if method == 'words':
        return [piece
                for block in blocks
                for piece in chunk_text_by_words(block, max_size, overlap)]
    if method == 'sentences':
        return [piece
                for block in blocks
                for piece in chunk_text_by_sentences(block, max_size, overlap, language)]
    return list(blocks)
|
106 |
+
|
107 |
+
|
108 |
+
def chunk_text_by_words(text: str, max_words: int = 300, overlap: int = 0) -> List[str]:
    """Split ``text`` into chunks of at most ``max_words`` whitespace-separated words.

    Parameters:
        text: Text to split. Words are joined back with single spaces.
        max_words: Maximum number of words per chunk.
        overlap: Number of words shared between consecutive chunks.

    Returns:
        The non-empty, stripped chunks.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``max_words``. The window
            could then never advance; previously this surfaced as an obscure
            ``range()`` error or silently produced no chunks at all.
    """
    step = max_words - overlap
    if step <= 0:
        raise ValueError(f"overlap ({overlap}) must be smaller than max_words ({max_words})")

    words = text.split()
    chunks = [' '.join(words[i:i + max_words]) for i in range(0, len(words), step)]
    return post_process_chunks(chunks)
|
115 |
+
|
116 |
+
|
117 |
+
def chunk_text_by_sentences(text: str, max_sentences: int = 10, overlap: int = 0,
                            language: str = 'english') -> List[str]:
    """Split ``text`` into chunks of at most ``max_sentences`` sentences.

    Parameters:
        text: Text to split.
        max_sentences: Maximum number of sentences per chunk.
        overlap: Number of sentences shared between consecutive chunks.
        language: Language name passed to ``nltk.sent_tokenize``.

    Returns:
        The non-empty, stripped chunks; sentences are joined with single spaces.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``max_sentences``. The
            window could then never advance; previously this surfaced as an
            obscure ``range()`` error or silently produced no chunks at all.
    """
    step = max_sentences - overlap
    if step <= 0:
        raise ValueError(f"overlap ({overlap}) must be smaller than max_sentences ({max_sentences})")

    nltk.download('punkt', quiet=True)
    sentences = nltk.sent_tokenize(text, language=language)
    chunks = [' '.join(sentences[i:i + max_sentences]) for i in range(0, len(sentences), step)]
    return post_process_chunks(chunks)
|
126 |
+
|
127 |
+
|
128 |
+
def chunk_text_by_paragraphs(text: str, max_paragraphs: int = 5, overlap: int = 0) -> List[str]:
    """Split ``text`` into chunks of at most ``max_paragraphs`` paragraphs.

    Paragraphs are separated by a blank line (a newline, optional whitespace,
    and another newline) and are re-joined with a blank line inside each chunk.

    Parameters:
        text: Text to split.
        max_paragraphs: Maximum number of paragraphs per chunk.
        overlap: Number of paragraphs shared between consecutive chunks.

    Returns:
        The non-empty, stripped chunks.

    Raises:
        ValueError: If ``overlap`` is not smaller than ``max_paragraphs``. The
            window could then never advance; previously this surfaced as an
            obscure ``range()`` error or silently produced no chunks at all.
    """
    step = max_paragraphs - overlap
    if step <= 0:
        raise ValueError(f"overlap ({overlap}) must be smaller than max_paragraphs ({max_paragraphs})")

    paragraphs = re.split(r'\n\s*\n', text)
    chunks = ['\n\n'.join(paragraphs[i:i + max_paragraphs]) for i in range(0, len(paragraphs), step)]
    return post_process_chunks(chunks)
|
135 |
+
|
136 |
+
|
137 |
+
def chunk_text_by_tokens(text: str, max_tokens: int = 1000, overlap: int = 0) -> List[str]:
    """Split ``text`` into chunks whose estimated token count stays within ``max_tokens``.

    This is a simplified token-based chunking: each whitespace-separated word is
    estimated at ``len(word) // 4 + 1`` tokens. For more accurate tokenization,
    consider using a proper tokenizer like GPT-2 TokenizerFast.

    Parameters:
        text: Text to split.
        max_tokens: Estimated token budget per chunk.
        overlap: Number of trailing *words* of a finished chunk that are carried
            over to the start of the next one.

    Returns:
        The non-empty, stripped chunks.
    """
    def estimate(word: str) -> int:
        # Rough estimate of token count
        return len(word) // 4 + 1

    finished = []
    window = []
    window_tokens = 0

    for word in text.split():
        cost = estimate(word)
        if window_tokens + cost > max_tokens and window:
            finished.append(' '.join(window))
            # Seed the next chunk with the overlap words and re-count their tokens.
            window = window[-overlap:] if overlap > 0 else []
            window_tokens = sum(estimate(kept) for kept in window)

        window.append(word)
        window_tokens += cost

    if window:
        finished.append(' '.join(window))

    return post_process_chunks(finished)
|
159 |
+
|
160 |
+
|
161 |
+
def post_process_chunks(chunks: List[str]) -> List[str]:
    """Strip surrounding whitespace from every chunk and drop chunks that end up empty."""
    cleaned = []
    for chunk in chunks:
        trimmed = chunk.strip()
        if trimmed:
            cleaned.append(trimmed)
    return cleaned
|
163 |
+
|
164 |
+
|
165 |
+
def get_chunk_metadata(chunk: str, full_text: str, chunk_type: str = "generic", chapter_number: Optional[int] = None, chapter_pattern: Optional[str] = None) -> Dict[str, Any]:
    """Describe where ``chunk`` sits inside ``full_text``.

    Parameters:
        chunk: The chunk text; it must occur verbatim in ``full_text``.
        full_text: The text the chunk was taken from.
        chunk_type: Label stored under 'chunk_type'.
        chapter_number: Stored under 'chapter_number' when ``chunk_type`` is "chapter".
        chapter_pattern: Stored under 'chapter_pattern' when ``chunk_type`` is "chapter".

    Returns:
        A dict with 'start_index' (position of the first occurrence of the
        chunk), 'end_index', 'word_count', 'char_count' and 'chunk_type', plus
        the two chapter keys for chapter chunks.

    Raises:
        ValueError: If ``chunk`` does not occur in ``full_text`` (logged, then re-raised).
    """
    try:
        offset = full_text.index(chunk)
    except ValueError:
        logging.error(f"Chunk not found in full_text: {chunk[:50]}... Full text length: {len(full_text)}")
        raise

    length = len(chunk)
    info = dict(
        start_index=offset,
        end_index=offset + length,
        word_count=len(chunk.split()),
        char_count=length,
        chunk_type=chunk_type,
    )
    if chunk_type == "chapter":
        info['chapter_number'] = chapter_number
        info['chapter_pattern'] = chapter_pattern
    return info
|
182 |
+
|
183 |
+
|
184 |
+
# Hybrid approach, chunk each sentence while ensuring total token size does not exceed a maximum number
|
185 |
+
def chunk_text_hybrid(text, max_tokens=1000):
    """Group whole sentences into chunks whose token total stays within ``max_tokens``.

    Sentences come from NLTK's sentence tokenizer and are measured with the
    module-level ``tokenizer``. A sentence is never split, so a single sentence
    longer than ``max_tokens`` becomes a chunk of its own.

    Parameters:
        text: Text to split.
        max_tokens: Token budget per chunk.

    Returns:
        A list of chunks, each made of sentences joined with single spaces.
    """
    sentences = nltk.tokenize.sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        token_count = len(tokenizer.encode(sentence))
        # Only close the running chunk when it actually holds something; otherwise
        # an over-long first sentence would emit an empty-string chunk.
        if current_chunk and current_length + token_count > max_tokens:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_length = 0

        current_chunk.append(sentence)
        current_length += token_count

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks
|
205 |
+
|
206 |
+
# Thanks openai
|
207 |
+
def chunk_on_delimiter(input_string: str,
                       max_tokens: int,
                       delimiter: str) -> List[str]:
    """Split ``input_string`` on ``delimiter`` and re-combine the pieces up to ``max_tokens``.

    The pieces are merged by ``combine_chunks_with_no_minimum`` (with ellipsis
    markers enabled for pieces that had to be dropped). A warning is printed
    when any piece was dropped, and ``delimiter`` is appended to every
    combined chunk.
    """
    pieces = input_string.split(delimiter)
    merged, _indices, dropped_chunk_count = combine_chunks_with_no_minimum(
        pieces,
        max_tokens,
        chunk_delimiter=delimiter,
        add_ellipsis_for_overflow=True,
    )
    if dropped_chunk_count > 0:
        print(f"Warning: {dropped_chunk_count} chunks were dropped due to exceeding the token limit.")

    with_delimiter = []
    for chunk in merged:
        with_delimiter.append(f"{chunk}{delimiter}")
    return with_delimiter
|
217 |
+
|
218 |
+
# ????FIXME
|
219 |
+
def recursive_summarize_chunks(chunks, summarize_func, custom_prompt, temp=None, system_prompt=None):
    """Summarize ``chunks`` in order, feeding each new summary into the next step.

    The first chunk is summarized on its own. Every later chunk is appended to
    the previous summary (separated by a blank line) and that combined text is
    summarized.

    Parameters:
        chunks: The text chunks, in order.
        summarize_func: Callable invoked as
            ``summarize_func(text, custom_prompt, temp, system_prompt)``.
        custom_prompt, temp, system_prompt: Passed through to ``summarize_func``.

    Returns:
        A list holding the running summary after each chunk.
    """
    logging.debug(f"recursive_summarize_chunks: Summarizing {len(chunks)} chunks recursively...")
    logging.debug(f"recursive_summarize_chunks: temperature is @ {temp}")

    running_summaries = []
    latest = ""
    for chunk in chunks:
        # Before any summary exists the chunk is summarized alone.
        source = chunk if not running_summaries else latest + "\n\n" + chunk
        latest = summarize_func(source, custom_prompt, temp, system_prompt)
        running_summaries.append(latest)

    return running_summaries
|
235 |
+
|
236 |
+
|
237 |
+
# Sample text for testing
|
238 |
+
sample_text = """
|
239 |
+
Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence
|
240 |
+
concerned with the interactions between computers and human language, in particular how to program computers
|
241 |
+
to process and analyze large amounts of natural language data. The result is a computer capable of "understanding"
|
242 |
+
the contents of documents, including the contextual nuances of the language within them. The technology can then
|
243 |
+
accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.
|
244 |
+
|
245 |
+
Challenges in natural language processing frequently involve speech recognition, natural language understanding,
|
246 |
+
and natural language generation.
|
247 |
+
|
248 |
+
Natural language processing has its roots in the 1950s. Already in 1950, Alan Turing published an article titled
|
249 |
+
"Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence.
|
250 |
+
"""
|
251 |
+
|
252 |
+
# Example usage of different chunking methods
|
253 |
+
# print("Chunking by words:")
|
254 |
+
# print(chunk_text_by_words(sample_text, max_words=50))
|
255 |
+
#
|
256 |
+
# print("\nChunking by sentences:")
|
257 |
+
# print(chunk_text_by_sentences(sample_text, max_sentences=2))
|
258 |
+
#
|
259 |
+
# print("\nChunking by paragraphs:")
|
260 |
+
# print(chunk_text_by_paragraphs(sample_text, max_paragraphs=1))
|
261 |
+
#
|
262 |
+
# print("\nChunking by tokens:")
|
263 |
+
# print(chunk_text_by_tokens(sample_text, max_tokens=50))
|
264 |
+
#
|
265 |
+
# print("\nHybrid chunking:")
|
266 |
+
# print(chunk_text_hybrid(sample_text, max_tokens=50))
|
267 |
+
|
268 |
+
|
269 |
+
|
270 |
+
#######################################################################################################################
|
271 |
+
#
|
272 |
+
# Experimental Semantic Chunking
|
273 |
+
#
|
274 |
+
|
275 |
+
# Chunk text into segments based on semantic similarity
|
276 |
+
def count_units(text, unit='tokens'):
    """Measure ``text`` in the requested ``unit``.

    Parameters:
        text: The text to measure.
        unit: 'words' (whitespace-separated words), 'tokens' (NLTK
            ``word_tokenize`` tokens) or 'characters'.

    Returns:
        The number of units in ``text``.

    Raises:
        ValueError: For any other ``unit``.
    """
    if unit == 'words':
        return len(text.split())
    if unit == 'tokens':
        return len(word_tokenize(text))
    if unit == 'characters':
        return len(text)
    raise ValueError("Invalid unit. Choose 'words', 'tokens', or 'characters'.")
|
285 |
+
|
286 |
+
|
287 |
+
def semantic_chunking(text, max_chunk_size=2000, unit='words'):
    """Chunk ``text`` by sentence, breaking on size and on drops in TF-IDF similarity.

    Sentences are accumulated until adding the next one would exceed
    ``max_chunk_size`` (measured with ``count_units`` in ``unit``). A chunk is
    also closed early when the cosine similarity between the current and the
    next sentence is below 0.5 and the chunk is at least half full. After every
    break the last three sentences are carried over as overlap.

    Parameters:
        text: Text to split.
        max_chunk_size: Size budget per chunk, in ``unit``.
        unit: Unit understood by ``count_units`` ('words', 'tokens' or 'characters').

    Returns:
        A list of chunks (sentences joined with single spaces); an empty list
        when ``text`` contains no sentences.
    """
    nltk.download('punkt', quiet=True)
    sentences = sent_tokenize(text)
    if not sentences:
        # Nothing to chunk; vectorising an empty corpus would raise.
        return []

    try:
        sentence_vectors = TfidfVectorizer().fit_transform(sentences)
    except ValueError:
        # The vectorizer found no usable terms (empty vocabulary), so similarity
        # cannot be computed. Fall back to purely size-based chunking.
        sentence_vectors = None

    chunks = []
    current_chunk = []
    current_size = 0

    def flush():
        """Emit the running chunk and keep its last 3 sentences as overlap."""
        nonlocal current_chunk, current_size
        chunks.append(' '.join(current_chunk))
        current_chunk = current_chunk[-3:]
        current_size = count_units(' '.join(current_chunk), unit)

    for i, sentence in enumerate(sentences):
        sentence_size = count_units(sentence, unit)
        if current_size + sentence_size > max_chunk_size and current_chunk:
            flush()

        current_chunk.append(sentence)
        current_size += sentence_size

        if sentence_vectors is not None and i + 1 < len(sentences):
            similarity = cosine_similarity(sentence_vectors[i], sentence_vectors[i + 1])[0][0]
            # Topic shift: close the chunk early once it is at least half full.
            if similarity < 0.5 and current_size >= max_chunk_size // 2:
                flush()

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks
|
322 |
+
|
323 |
+
|
324 |
+
def semantic_chunk_long_file(file_path, max_chunk_size=1000, overlap=100):
    """Read a UTF-8 text file and split it with ``semantic_chunking``.

    Parameters:
        file_path: Path of the file to read.
        max_chunk_size: Size budget per chunk, forwarded to ``semantic_chunking``.
        overlap: Kept for backward compatibility and currently unused;
            ``semantic_chunking`` has no overlap parameter.

    Returns:
        The list of chunks, or None when reading or chunking fails (the error
        is logged).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Do not forward `overlap`: the third parameter of semantic_chunking is
        # `unit`. Passing the overlap there made count_units reject it, so this
        # function logged an error and returned None for any non-empty file.
        chunks = semantic_chunking(content, max_chunk_size)
        return chunks
    except Exception as e:
        logging.error(f"Error chunking text file: {str(e)}")
        return None
|
334 |
+
#######################################################################################################################
|
335 |
+
|
336 |
+
|
337 |
+
|
338 |
+
|
339 |
+
|
340 |
+
|
341 |
+
#######################################################################################################################
|
342 |
+
#
|
343 |
+
# OpenAI Rolling Summarization
|
344 |
+
#
|
345 |
+
|
346 |
+
client = OpenAI(api_key=openai_api_key)
|
347 |
+
def get_chat_completion(messages, model='gpt-4-turbo'):
    """Send ``messages`` to the chat completions endpoint and return the reply text.

    Uses the module-level ``client`` with temperature 0 and returns the message
    content of the first choice in the response.
    """
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
|
354 |
+
|
355 |
+
|
356 |
+
# This function combines text chunks into larger blocks without exceeding a specified token count.
|
357 |
+
# It returns the combined chunks, their original indices, and the number of dropped chunks due to overflow.
|
358 |
+
def combine_chunks_with_no_minimum(
        chunks: List[str],
        max_tokens: int,
        chunk_delimiter="\n\n",
        header: Optional[str] = None,
        add_ellipsis_for_overflow=False,
) -> Tuple[List[str], List[List[int]], int]:
    """Greedily merge ``chunks`` into larger blocks of at most ``max_tokens`` tokens.

    Token counts are measured with ``openai_tokenize``.

    Parameters:
        chunks: The pieces to combine, in order.
        max_tokens: Token budget for every combined block.
        chunk_delimiter: String placed between pieces inside a block.
        header: Optional text placed at the start of every block.
        add_ellipsis_for_overflow: When True, a piece that is too large on its
            own is replaced by "..." in the running block if that still fits.

    Returns:
        A 3-tuple ``(output, output_indices, dropped_chunk_count)``: the combined
        blocks, for each block the indices of the original pieces it contains,
        and the number of pieces dropped because they alone exceed ``max_tokens``.
    """
    dropped_chunk_count = 0
    output = []  # list to hold the final combined chunks
    output_indices = []  # list to hold the indices of the final combined chunks
    candidate = [] if header is None else [header]  # current combined chunk candidate
    candidate_indices = []

    for chunk_i, chunk in enumerate(chunks):
        chunk_with_header = [chunk] if header is None else [header, chunk]
        # FIXME MAKE NOT OPENAI SPECIFIC
        if len(openai_tokenize(chunk_delimiter.join(chunk_with_header))) > max_tokens:
            # The piece cannot fit even on its own: drop it.
            print("warning: chunk overflow")
            if (
                    add_ellipsis_for_overflow
                    # FIXME MAKE NOT OPENAI SPECIFIC
                    and len(openai_tokenize(chunk_delimiter.join(candidate + ["..."]))) <= max_tokens
            ):
                candidate.append("...")
                dropped_chunk_count += 1
            continue  # this case would break downstream assumptions

        # estimate token count with the current chunk added
        # FIXME MAKE NOT OPENAI SPECIFIC
        extended_candidate_token_count = len(openai_tokenize(chunk_delimiter.join(candidate + [chunk])))
        if extended_candidate_token_count > max_tokens:
            # The candidate is full: emit it and start a new one with this piece.
            output.append(chunk_delimiter.join(candidate))
            output_indices.append(candidate_indices)
            candidate = chunk_with_header  # re-initialize candidate
            candidate_indices = [chunk_i]
        else:
            # otherwise keep extending the candidate
            candidate.append(chunk)
            candidate_indices.append(chunk_i)

    # add the remaining candidate to output if it's not empty
    if (header is not None and len(candidate) > 1) or (header is None and len(candidate) > 0):
        output.append(chunk_delimiter.join(candidate))
        output_indices.append(candidate_indices)

    return output, output_indices, dropped_chunk_count
|
403 |
+
|
404 |
+
|
405 |
+
def rolling_summarize(text: str,
                      detail: float = 0,
                      model: str = 'gpt-4-turbo',
                      additional_instructions: Optional[str] = None,
                      minimum_chunk_size: Optional[int] = 500,
                      chunk_delimiter: str = ".",
                      summarize_recursively=False,
                      verbose=False):
    """
    Summarize a text chunk by chunk and return the joined chunk summaries.

    Parameters:
    - text (str): The text to be summarized.
    - detail (float, optional): A value between 0 and 1 indicating the desired level of detail.
      0 leads to a higher level summary, and 1 results in a more detailed summary. Defaults to 0.
    - model (str, optional): The chat model used for every summarization request.
    - additional_instructions (Optional[str], optional): Extra instructions appended to the system
      message to customize the summaries.
    - minimum_chunk_size (Optional[int], optional): The minimum size for text chunks. Defaults to 500.
    - chunk_delimiter (str, optional): The delimiter used to split the text into chunks. Defaults to ".".
    - summarize_recursively (bool, optional): If True, each request also contains the previous
      chunk's summary for context.
    - verbose (bool, optional): If True, prints information about the chunking process.

    Returns:
    - str: The chunk summaries joined with blank lines.

    The number of chunks is interpolated between 1 and the count obtained when chunking at
    `minimum_chunk_size`, based on `detail`. The chunk size is then derived from the document's
    token length, the text is re-chunked, and each chunk is summarized in turn.
    """
    # check detail is set correctly
    assert 0 <= detail <= 1

    # interpolate the number of chunks to get the specified level of detail
    finest_chunk_count = len(chunk_on_delimiter(text, minimum_chunk_size, chunk_delimiter))
    target_chunk_count = int(1 + detail * (finest_chunk_count - 1))

    # adjust chunk size based on the interpolated number of chunks
    # FIXME MAKE NOT OPENAI SPECIFIC
    total_tokens = len(openai_tokenize(text))
    tokens_per_chunk = max(minimum_chunk_size, total_tokens // target_chunk_count)
    pieces = chunk_on_delimiter(text, tokens_per_chunk, chunk_delimiter)
    if verbose:
        print(f"Splitting the text into {len(pieces)} chunks to be summarized.")
        # FIXME MAKE NOT OPENAI SPECIFIC
        print(f"Chunk lengths are {[len(openai_tokenize(x)) for x in pieces]}")

    # set system message - FIXME
    system_prompt = "Rewrite this text in summarized form."
    if additional_instructions is not None:
        system_prompt = system_prompt + f"\n\n{additional_instructions}"

    summaries = []
    for piece in tqdm(pieces):
        user_prompt = piece
        if summarize_recursively and summaries:
            # Give the model the previous summary as context for this chunk.
            combined_text = summaries[-1] + "\n\n" + piece
            user_prompt = f"Previous summary and new content to summarize:\n\n{combined_text}"

        reply = get_chat_completion(
            [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            model=model,
        )
        summaries.append(reply)

    return '\n\n'.join(summaries)
|
479 |
+
|
480 |
+
#
|
481 |
+
#
|
482 |
+
#######################################################################################################################
|
483 |
+
#
|
484 |
+
# Ebook Chapter Chunking
|
485 |
+
|
486 |
+
|
487 |
+
def chunk_ebook_by_chapters(text: str, chunk_options: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Split ``text`` into chapters using the first heading pattern that matches.

    Option keys read from ``chunk_options``: 'overlap' (default 0) and
    'custom_chapter_pattern' (default None). The custom pattern, if given, is
    tried first, followed by the built-in heading patterns.

    Returns:
        A list of ``{'text': ..., 'metadata': ...}`` dictionaries, one per
        non-empty chapter, with chunk type "chapter". When no pattern matches,
        a single "whole_document" chunk holding the entire text is returned.
        Text before the first matched heading is not part of any chapter.
    """
    overlap = chunk_options.get('overlap', 0)
    custom_pattern = chunk_options.get('custom_chapter_pattern', None)

    case_insensitive = re.MULTILINE | re.IGNORECASE
    # Chapter heading patterns to try, in order, with the flags each needs.
    chapter_patterns = [
        (custom_pattern, case_insensitive),
        (r'^#{1,2}\s+', case_insensitive),  # Markdown style: '# ' or '## '
        (r'^Chapter\s+\d+', case_insensitive),  # 'Chapter ' followed by numbers
        (r'^\d+\.\s+', case_insensitive),  # Numbered chapters: '1. ', '2. ', etc.
        # All caps headings: must stay case-sensitive, otherwise every line made
        # only of letters and spaces would count as a heading.
        (r'^[A-Z\s]+$', re.MULTILINE),
    ]

    chapter_positions = []
    used_pattern = None

    for pattern, flags in chapter_patterns:
        if pattern is None:
            continue
        chapter_positions = [match.start() for match in re.compile(pattern, flags).finditer(text)]
        if chapter_positions:
            used_pattern = pattern
            break

    # If no chapters found, return the entire content as one chunk
    if not chapter_positions:
        return [{'text': text, 'metadata': get_chunk_metadata(text, text, chunk_type="whole_document")}]

    # Split content into chapters
    chunks = []
    for index, start in enumerate(chapter_positions):
        end = chapter_positions[index + 1] if index + 1 < len(chapter_positions) else None
        # Apply overlap if specified: every chapter after the first also
        # includes the last `overlap` characters before its heading.
        if overlap > 0 and index > 0:
            start = max(0, start - overlap)
        chunks.append(text[start:end])

    # Post-process chunks
    processed_chunks = post_process_chunks(chunks)

    # Add metadata to chunks
    return [{'text': chunk, 'metadata': get_chunk_metadata(chunk, text, chunk_type="chapter", chapter_number=i + 1,
                                                           chapter_pattern=used_pattern)}
            for i, chunk in enumerate(processed_chunks)]
|
538 |
+
|
539 |
+
|
540 |
+
# # Example usage
|
541 |
+
# if __name__ == "__main__":
|
542 |
+
# sample_ebook_content = """
|
543 |
+
# # Chapter 1: Introduction
|
544 |
+
#
|
545 |
+
# This is the introduction.
|
546 |
+
#
|
547 |
+
# ## Section 1.1
|
548 |
+
#
|
549 |
+
# Some content here.
|
550 |
+
#
|
551 |
+
# # Chapter 2: Main Content
|
552 |
+
#
|
553 |
+
# This is the main content.
|
554 |
+
#
|
555 |
+
# ## Section 2.1
|
556 |
+
#
|
557 |
+
# More content here.
|
558 |
+
#
|
559 |
+
# CHAPTER THREE
|
560 |
+
#
|
561 |
+
# This is the third chapter.
|
562 |
+
#
|
563 |
+
# 4. Fourth Chapter
|
564 |
+
#
|
565 |
+
# This is the fourth chapter.
|
566 |
+
# """
|
567 |
+
#
|
568 |
+
# chunk_options = {
|
569 |
+
# 'method': 'chapters',
|
570 |
+
# 'max_size': 500,
|
571 |
+
# 'overlap': 50,
|
572 |
+
# 'custom_chapter_pattern': r'^CHAPTER\s+[A-Z]+' # Custom pattern for 'CHAPTER THREE' style
|
573 |
+
# }
|
574 |
+
#
|
575 |
+
# chunked_chapters = improved_chunking_process(sample_ebook_content, chunk_options)
|
576 |
+
#
|
577 |
+
# for i, chunk in enumerate(chunked_chapters, 1):
|
578 |
+
# print(f"Chunk {i}:")
|
579 |
+
# print(chunk['text'])
|
580 |
+
# print(f"Metadata: {chunk['metadata']}\n")
|
581 |
+
|
582 |
+
|
583 |
+
|
584 |
+
|
585 |
+
#
|
586 |
+
# End of Chunking Library
|
587 |
#######################################################################################################################
|
App_Function_Libraries/DB_Manager.py
ADDED
@@ -0,0 +1,472 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import configparser
|
2 |
+
import logging
|
3 |
+
import os
|
4 |
+
from contextlib import contextmanager
|
5 |
+
from time import sleep
|
6 |
+
from typing import Tuple
|
7 |
+
import sqlite3
|
8 |
+
# 3rd-Party Libraries
|
9 |
+
from elasticsearch import Elasticsearch
|
10 |
+
|
11 |
+
############################################################################################################
|
12 |
+
#
|
13 |
+
# This file contains the Database class (a small SQLite connection pool) and module-level wrapper functions that route each database call to the configured backend, i.e. either SQLite or Elasticsearch.
|
14 |
+
|
15 |
+
####
|
16 |
+
# The DatabaseManager class provides the following methods:
|
17 |
+
# - add_media: Add a new media item to the database
|
18 |
+
# - fetch_items_by_keyword: Fetch media items from the database based on a keyword
|
19 |
+
# - fetch_item_details: Fetch details of a specific media item from the database
|
20 |
+
# - update_media_content: Update the content of a specific media item in the database
|
21 |
+
# - search_and_display_items: Search for media items in the database and display the results
|
22 |
+
# - close_connection: Close the database connection
|
23 |
+
####
|
24 |
+
|
25 |
+
# Import your existing SQLite functions
|
26 |
+
from SQLite_DB import (
|
27 |
+
update_media_content as sqlite_update_media_content,
|
28 |
+
list_prompts as sqlite_list_prompts,
|
29 |
+
search_and_display as sqlite_search_and_display,
|
30 |
+
fetch_prompt_details as sqlite_fetch_prompt_details,
|
31 |
+
keywords_browser_interface as sqlite_keywords_browser_interface,
|
32 |
+
add_keyword as sqlite_add_keyword,
|
33 |
+
delete_keyword as sqlite_delete_keyword,
|
34 |
+
export_keywords_to_csv as sqlite_export_keywords_to_csv,
|
35 |
+
ingest_article_to_db as sqlite_ingest_article_to_db,
|
36 |
+
add_media_to_database as sqlite_add_media_to_database,
|
37 |
+
import_obsidian_note_to_db as sqlite_import_obsidian_note_to_db,
|
38 |
+
add_prompt as sqlite_add_prompt,
|
39 |
+
delete_chat_message as sqlite_delete_chat_message,
|
40 |
+
update_chat_message as sqlite_update_chat_message,
|
41 |
+
add_chat_message as sqlite_add_chat_message,
|
42 |
+
get_chat_messages as sqlite_get_chat_messages,
|
43 |
+
search_chat_conversations as sqlite_search_chat_conversations,
|
44 |
+
create_chat_conversation as sqlite_create_chat_conversation,
|
45 |
+
save_chat_history_to_database as sqlite_save_chat_history_to_database,
|
46 |
+
view_database as sqlite_view_database,
|
47 |
+
get_transcripts as sqlite_get_transcripts,
|
48 |
+
get_trashed_items as sqlite_get_trashed_items,
|
49 |
+
user_delete_item as sqlite_user_delete_item,
|
50 |
+
empty_trash as sqlite_empty_trash,
|
51 |
+
create_automated_backup as sqlite_create_automated_backup,
|
52 |
+
add_or_update_prompt as sqlite_add_or_update_prompt,
|
53 |
+
load_prompt_details as sqlite_load_prompt_details,
|
54 |
+
load_preset_prompts as sqlite_load_preset_prompts,
|
55 |
+
insert_prompt_to_db as sqlite_insert_prompt_to_db,
|
56 |
+
delete_prompt as sqlite_delete_prompt,
|
57 |
+
search_and_display_items as sqlite_search_and_display_items,
|
58 |
+
get_conversation_name as sqlite_get_conversation_name,
|
59 |
+
add_media_with_keywords as sqlite_add_media_with_keywords,
|
60 |
+
check_media_and_whisper_model as sqlite_check_media_and_whisper_model,
|
61 |
+
DatabaseError
|
62 |
+
)
|
63 |
+
|
64 |
+
class Database:
    """Small SQLite connection pool with lock-aware connection acquisition.

    Connections are opened lazily with ``check_same_thread=False`` and are
    returned to ``self.pool`` for reuse when a ``get_connection`` block ends.
    """

    def __init__(self, db_path=None):
        """Remember the database path and start with an empty pool.

        Args:
            db_path: Path of the SQLite file. When falsy, the ``DB_NAME``
                environment variable is used, defaulting to
                ``'media_summary.db'``.
        """
        self.db_path = db_path or os.getenv('DB_NAME', 'media_summary.db')
        self.pool = []
        self.pool_size = 10

    def _acquire_connection(self, retry_count=5, retry_delay=1):
        """Return a pooled connection, or open a new one, retrying while locked.

        Raises:
            DatabaseError: If opening fails, or the database is still locked
                after ``retry_count`` attempts.
        """
        if self.pool:
            return self.pool.pop()
        for _ in range(retry_count):
            try:
                return sqlite3.connect(self.db_path, check_same_thread=False)
            except sqlite3.OperationalError as e:
                if 'database is locked' not in str(e):
                    raise DatabaseError(f"Database error: {e}") from e
                logging.warning(f"Database is locked, retrying in {retry_delay} seconds...")
                sleep(retry_delay)
            except Exception as e:
                raise DatabaseError(f"Unexpected error: {e}") from e
        raise DatabaseError("Database is locked and retries have been exhausted")

    def _release_connection(self, conn):
        """Return ``conn`` to the pool, or close it when the pool is full."""
        if len(self.pool) < self.pool_size:
            self.pool.append(conn)
        else:
            conn.close()

    @contextmanager
    def get_connection(self):
        """Yield a SQLite connection and hand it back to the pool afterwards.

        Only the acquisition is retried. The ``yield`` happens exactly once,
        because a ``@contextmanager`` generator must not yield again after an
        exception has been thrown into it from the ``with`` body.

        Raises:
            DatabaseError: If no connection could be acquired, or if the
                ``with`` body raised (the original error is chained).
        """
        conn = self._acquire_connection()
        try:
            yield conn
        except DatabaseError:
            # Already in the type callers expect; do not wrap it a second time.
            raise
        except sqlite3.OperationalError as e:
            raise DatabaseError(f"Database error: {e}") from e
        except Exception as e:
            raise DatabaseError(f"Unexpected error: {e}") from e
        finally:
            # The connection goes back to the pool on success and on failure.
            self._release_connection(conn)

    def execute_query(self, query: str, params: Tuple = ()) -> None:
        """Execute a single statement with ``params`` and commit it.

        Raises:
            DatabaseError: If SQLite reports an error; the message includes
                the failing query text.
        """
        with self.get_connection() as conn:
            try:
                cursor = conn.cursor()
                cursor.execute(query, params)
                conn.commit()
            except sqlite3.Error as e:
                raise DatabaseError(f"Database error: {e}, Query: {query}")

    def close_all_connections(self):
        """Close every pooled connection and empty the pool."""
        for conn in self.pool:
            conn.close()
        self.pool.clear()
|
109 |
+
|
110 |
+
def get_db_config(config_path='config.txt'):
    """Read the ``[Database]`` section of the config file into a plain dict.

    Every key has a fallback, so a missing file, section or option yields the
    defaults instead of raising at import time.

    Args:
        config_path: Path of the INI-style config file to read.

    Returns:
        dict with the keys ``'type'``, ``'sqlite_path'``,
        ``'elasticsearch_host'`` and ``'elasticsearch_port'`` (an int).
    """
    config = configparser.ConfigParser()
    # ConfigParser.read() silently skips files that cannot be opened.
    config.read(config_path)
    return {
        'type': config.get('Database', 'type', fallback='sqlite'),
        'sqlite_path': config.get('Database', 'sqlite_path', fallback='media_summary.db'),
        'elasticsearch_host': config.get('Database', 'elasticsearch_host', fallback='localhost'),
        'elasticsearch_port': config.getint('Database', 'elasticsearch_port', fallback=9200)
    }
|
119 |
+
|
120 |
+
# Backend selection happens once, at import time.
db_config = get_db_config()
db_type = db_config['type']

if db_type == 'sqlite':
    # Use the config path if provided, otherwise fall back to default
    db = Database(db_config.get('sqlite_path'))
    # NOTE(review): nothing in this file uses or closes this module-level
    # connection/cursor; presumably kept for importers - confirm before removing.
    conn = sqlite3.connect(db_config['sqlite_path'])
    cursor = conn.cursor()
elif db_type == 'elasticsearch':
    es = Elasticsearch([{
        'host': db_config['elasticsearch_host'],
        'port': db_config['elasticsearch_port']
    }])
else:
    raise ValueError(f"Unsupported database type: {db_type}")

db_path = db_config['sqlite_path']

# Update this path to the directory where you want to store the database backups
backup_dir = os.environ.get('DB_BACKUP_DIR', 'path/to/backup/directory')
|
152 |
+
|
153 |
+
############################################################################################################
|
154 |
+
#
|
155 |
+
# DB-Searching functions
|
156 |
+
|
157 |
+
def view_database(*args, **kwargs):
    """Forward the call to the active backend's ``view_database`` implementation.

    Raises:
        NotImplementedError: If ``db_type`` is ``'elasticsearch'``.
        ValueError: If ``db_type`` is not a supported backend.
    """
    if db_type == 'sqlite':
        return sqlite_view_database(*args, **kwargs)
    if db_type == 'elasticsearch':
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of view_database not yet implemented")
    raise ValueError(f"Unsupported database type: {db_type}")
|
163 |
+
|
164 |
+
def search_and_display_items(*args, **kwargs):
    """Forward the call to the active backend's ``search_and_display_items`` implementation.

    Raises:
        NotImplementedError: If ``db_type`` is ``'elasticsearch'``.
        ValueError: If ``db_type`` is not a supported backend.
    """
    if db_type == 'sqlite':
        return sqlite_search_and_display_items(*args, **kwargs)
    if db_type == 'elasticsearch':
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of search_and_display_items not yet implemented")
    raise ValueError(f"Unsupported database type: {db_type}")
|
170 |
+
|
171 |
+
def search_and_display(*args, **kwargs):
    """Forward the call to the active backend's ``search_and_display`` implementation.

    Raises:
        NotImplementedError: If ``db_type`` is ``'elasticsearch'``.
        ValueError: If ``db_type`` is not a supported backend.
    """
    if db_type == 'sqlite':
        return sqlite_search_and_display(*args, **kwargs)
    if db_type == 'elasticsearch':
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of search_and_display not yet implemented")
    raise ValueError(f"Unsupported database type: {db_type}")
|
177 |
+
|
178 |
+
#
|
179 |
+
# End of DB-Searching functions
|
180 |
+
############################################################################################################
|
181 |
+
|
182 |
+
############################################################################################################
|
183 |
+
#
|
184 |
+
# Transcript-related Functions
|
185 |
+
|
186 |
+
def get_transcripts(*args, **kwargs):
    """Forward the call to the active backend's ``get_transcripts`` implementation.

    Raises:
        NotImplementedError: If ``db_type`` is ``'elasticsearch'``.
        ValueError: If ``db_type`` is not a supported backend.
    """
    if db_type == 'sqlite':
        return sqlite_get_transcripts(*args, **kwargs)
    if db_type == 'elasticsearch':
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of get_transcripts not yet implemented")
    raise ValueError(f"Unsupported database type: {db_type}")
|
192 |
+
|
193 |
+
#
|
194 |
+
# End of Transcript-related Functions
|
195 |
+
############################################################################################################
|
196 |
+
|
197 |
+
############################################################################################################
|
198 |
+
#
|
199 |
+
# DB-Ingestion functions
|
200 |
+
|
201 |
+
def add_media_to_database(*args, **kwargs):
    """Forward the call to the active backend's ``add_media_to_database`` implementation.

    Raises:
        NotImplementedError: If ``db_type`` is ``'elasticsearch'``.
        ValueError: If ``db_type`` is not a supported backend.
    """
    if db_type == 'sqlite':
        return sqlite_add_media_to_database(*args, **kwargs)
    if db_type == 'elasticsearch':
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of add_media_to_database not yet implemented")
    raise ValueError(f"Unsupported database type: {db_type}")
|
207 |
+
|
208 |
+
|
209 |
+
def import_obsidian_note_to_db(*args, **kwargs):
    """Forward the call to the active backend's ``import_obsidian_note_to_db`` implementation.

    Raises:
        NotImplementedError: If ``db_type`` is ``'elasticsearch'``.
        ValueError: If ``db_type`` is not a supported backend.
    """
    if db_type == 'sqlite':
        return sqlite_import_obsidian_note_to_db(*args, **kwargs)
    if db_type == 'elasticsearch':
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of import_obsidian_note_to_db not yet implemented")
    raise ValueError(f"Unsupported database type: {db_type}")
|
215 |
+
|
216 |
+
def update_media_content(*args, **kwargs):
    """Forward the call to the active backend's ``update_media_content`` implementation.

    Raises:
        NotImplementedError: If ``db_type`` is ``'elasticsearch'``.
        ValueError: If ``db_type`` is not a supported backend.
    """
    if db_type == 'sqlite':
        return sqlite_update_media_content(*args, **kwargs)
    if db_type == 'elasticsearch':
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of update_media_content not yet implemented")
    raise ValueError(f"Unsupported database type: {db_type}")
|
222 |
+
|
223 |
+
def add_media_with_keywords(*args, **kwargs):
    """Forward the call to the active backend's ``add_media_with_keywords`` implementation.

    Raises:
        NotImplementedError: If ``db_type`` is ``'elasticsearch'``.
        ValueError: If ``db_type`` is not a supported backend.
    """
    if db_type == 'sqlite':
        return sqlite_add_media_with_keywords(*args, **kwargs)
    if db_type == 'elasticsearch':
        raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented")
    # Match ingest_article_to_db instead of silently returning None.
    raise ValueError(f"Unsupported database type: {db_type}")
|
228 |
+
|
229 |
+
def check_media_and_whisper_model(*args, **kwargs):
    """Forward the call to the active backend's ``check_media_and_whisper_model`` implementation.

    Raises:
        NotImplementedError: If ``db_type`` is ``'elasticsearch'``.
        ValueError: If ``db_type`` is not a supported backend.
    """
    if db_type == 'sqlite':
        return sqlite_check_media_and_whisper_model(*args, **kwargs)
    if db_type == 'elasticsearch':
        raise NotImplementedError("Elasticsearch version of check_media_and_whisper_model not yet implemented")
    # Match ingest_article_to_db instead of silently returning None.
    raise ValueError(f"Unsupported database type: {db_type}")
|
234 |
+
|
235 |
+
def ingest_article_to_db(url, title, author, content, keywords, summary, ingestion_date, custom_prompt):
    """Store an article through the configured database backend.

    The arguments are passed, in the same order, to the SQLite
    implementation when ``db_type`` is ``'sqlite'``.

    Raises:
        NotImplementedError: If ``db_type`` is ``'elasticsearch'``.
        ValueError: If ``db_type`` is any other unsupported value.
    """
    if db_type == 'elasticsearch':
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of ingest_article_to_db not yet implemented")
    if db_type != 'sqlite':
        raise ValueError(f"Unsupported database type: {db_type}")
    return sqlite_ingest_article_to_db(
        url, title, author, content, keywords, summary, ingestion_date, custom_prompt
    )
|
243 |
+
|
244 |
+
#
|
245 |
+
# End of DB-Ingestion functions
|
246 |
+
############################################################################################################
|
247 |
+
|
248 |
+
|
249 |
+
############################################################################################################
|
250 |
+
#
|
251 |
+
# Prompt-related functions
|
252 |
+
|
253 |
+
def list_prompts(*args, **kwargs):
    """Forward the call to the active backend's ``list_prompts`` implementation.

    Raises:
        NotImplementedError: If ``db_type`` is ``'elasticsearch'``.
        ValueError: If ``db_type`` is not a supported backend.
    """
    if db_type == 'sqlite':
        return sqlite_list_prompts(*args, **kwargs)
    if db_type == 'elasticsearch':
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of list_prompts not yet implemented")
    raise ValueError(f"Unsupported database type: {db_type}")
|
259 |
+
|
260 |
+
|
261 |
+
def fetch_prompt_details(*args, **kwargs):
    """Forward the call to the active backend's ``fetch_prompt_details`` implementation.

    Raises:
        NotImplementedError: If ``db_type`` is ``'elasticsearch'``.
        ValueError: If ``db_type`` is not a supported backend.
    """
    if db_type == 'sqlite':
        return sqlite_fetch_prompt_details(*args, **kwargs)
    if db_type == 'elasticsearch':
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of fetch_prompt_details not yet implemented")
    raise ValueError(f"Unsupported database type: {db_type}")
|
267 |
+
|
268 |
+
def add_prompt(*args, **kwargs):
    """Forward the call to the active backend's ``add_prompt`` implementation.

    Raises:
        NotImplementedError: If ``db_type`` is ``'elasticsearch'``.
        ValueError: If ``db_type`` is not a supported backend.
    """
    if db_type == 'sqlite':
        return sqlite_add_prompt(*args, **kwargs)
    if db_type == 'elasticsearch':
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of add_prompt not yet implemented")
    raise ValueError(f"Unsupported database type: {db_type}")
|
274 |
+
|
275 |
+
|
276 |
+
def add_or_update_prompt(*args, **kwargs):
    """Forward the call to the active backend's ``add_or_update_prompt`` implementation.

    Raises:
        NotImplementedError: If ``db_type`` is ``'elasticsearch'``.
        ValueError: If ``db_type`` is not a supported backend.
    """
    if db_type == 'sqlite':
        return sqlite_add_or_update_prompt(*args, **kwargs)
    if db_type == 'elasticsearch':
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of add_or_update_prompt not yet implemented")
    raise ValueError(f"Unsupported database type: {db_type}")
|
282 |
+
|
283 |
+
def load_prompt_details(*args, **kwargs):
    """Forward the call to the active backend's ``load_prompt_details`` implementation.

    Raises:
        NotImplementedError: If ``db_type`` is ``'elasticsearch'``.
        ValueError: If ``db_type`` is not a supported backend.
    """
    if db_type == 'sqlite':
        return sqlite_load_prompt_details(*args, **kwargs)
    if db_type == 'elasticsearch':
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of load_prompt_details not yet implemented")
    raise ValueError(f"Unsupported database type: {db_type}")
|
289 |
+
|
290 |
+
def load_preset_prompts(*args, **kwargs):
    """Forward the call to the active backend's ``load_preset_prompts`` implementation.

    Raises:
        NotImplementedError: If ``db_type`` is ``'elasticsearch'``.
        ValueError: If ``db_type`` is not a supported backend.
    """
    if db_type == 'sqlite':
        return sqlite_load_preset_prompts(*args, **kwargs)
    if db_type == 'elasticsearch':
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of load_preset_prompts not yet implemented")
    raise ValueError(f"Unsupported database type: {db_type}")
|
296 |
+
|
297 |
+
def insert_prompt_to_db(*args, **kwargs):
    """Forward the call to the active backend's ``insert_prompt_to_db`` implementation.

    Raises:
        NotImplementedError: If ``db_type`` is ``'elasticsearch'``.
        ValueError: If ``db_type`` is not a supported backend.
    """
    if db_type == 'sqlite':
        return sqlite_insert_prompt_to_db(*args, **kwargs)
    if db_type == 'elasticsearch':
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of insert_prompt_to_db not yet implemented")
    raise ValueError(f"Unsupported database type: {db_type}")
|
303 |
+
|
304 |
+
def delete_prompt(*args, **kwargs):
    """Forward the call to the active backend's ``delete_prompt`` implementation.

    Raises:
        NotImplementedError: If ``db_type`` is ``'elasticsearch'``.
        ValueError: If ``db_type`` is not a supported backend.
    """
    if db_type == 'sqlite':
        return sqlite_delete_prompt(*args, **kwargs)
    if db_type == 'elasticsearch':
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of delete_prompt not yet implemented")
    raise ValueError(f"Unsupported database type: {db_type}")
|
310 |
+
|
311 |
+
#
|
312 |
+
# End of Prompt-related functions
|
313 |
+
############################################################################################################
|
314 |
+
|
315 |
+
############################################################################################################
|
316 |
+
#
|
317 |
+
# Keywords-related Functions
|
318 |
+
|
319 |
+
def keywords_browser_interface(*args, **kwargs):
    """Forward the call to the active backend's ``keywords_browser_interface`` implementation.

    Raises:
        NotImplementedError: If ``db_type`` is ``'elasticsearch'``.
        ValueError: If ``db_type`` is not a supported backend.
    """
    if db_type == 'sqlite':
        return sqlite_keywords_browser_interface(*args, **kwargs)
    if db_type == 'elasticsearch':
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of keywords_browser_interface not yet implemented")
    raise ValueError(f"Unsupported database type: {db_type}")
|
325 |
+
|
326 |
+
def add_keyword(*args, **kwargs):
    """Forward the call to the active backend's ``add_keyword`` implementation.

    Raises:
        NotImplementedError: If ``db_type`` is ``'elasticsearch'``.
        ValueError: If ``db_type`` is not a supported backend.
    """
    if db_type == 'sqlite':
        # NOTE(review): the checked-out connection is not passed to
        # sqlite_add_keyword. The checkout is kept because get_connection
        # re-raises errors from this block as DatabaseError - confirm whether
        # it is still needed.
        with db.get_connection():
            return sqlite_add_keyword(*args, **kwargs)
    if db_type == 'elasticsearch':
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of add_keyword not yet implemented")
    raise ValueError(f"Unsupported database type: {db_type}")
|
334 |
+
|
335 |
+
def delete_keyword(*args, **kwargs):
    """Forward the call to the active backend's ``delete_keyword`` implementation.

    Raises:
        NotImplementedError: If ``db_type`` is ``'elasticsearch'``.
        ValueError: If ``db_type`` is not a supported backend.
    """
    if db_type == 'sqlite':
        return sqlite_delete_keyword(*args, **kwargs)
    if db_type == 'elasticsearch':
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of delete_keyword not yet implemented")
    raise ValueError(f"Unsupported database type: {db_type}")
|
341 |
+
|
342 |
+
def export_keywords_to_csv(*args, **kwargs):
    """Forward the call to the active backend's ``export_keywords_to_csv`` implementation.

    Raises:
        NotImplementedError: If ``db_type`` is ``'elasticsearch'``.
        ValueError: If ``db_type`` is not a supported backend.
    """
    if db_type == 'sqlite':
        return sqlite_export_keywords_to_csv(*args, **kwargs)
    if db_type == 'elasticsearch':
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of export_keywords_to_csv not yet implemented")
    raise ValueError(f"Unsupported database type: {db_type}")
|
348 |
+
|
349 |
+
#
|
350 |
+
# End of Keywords-related Functions
|
351 |
+
############################################################################################################
|
352 |
+
|
353 |
+
############################################################################################################
|
354 |
+
#
|
355 |
+
# Chat-related Functions
|
356 |
+
|
357 |
+
def delete_chat_message(*args, **kwargs):
    """Forward the call to the active backend's ``delete_chat_message`` implementation.

    Raises:
        NotImplementedError: If ``db_type`` is ``'elasticsearch'``.
        ValueError: If ``db_type`` is not a supported backend.
    """
    if db_type == 'sqlite':
        return sqlite_delete_chat_message(*args, **kwargs)
    if db_type == 'elasticsearch':
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of delete_chat_message not yet implemented")
    raise ValueError(f"Unsupported database type: {db_type}")
|
363 |
+
|
364 |
+
def update_chat_message(*args, **kwargs):
    """Forward the call to the active backend's ``update_chat_message`` implementation.

    Raises:
        NotImplementedError: If ``db_type`` is ``'elasticsearch'``.
        ValueError: If ``db_type`` is not a supported backend.
    """
    if db_type == 'sqlite':
        return sqlite_update_chat_message(*args, **kwargs)
    if db_type == 'elasticsearch':
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of update_chat_message not yet implemented")
    raise ValueError(f"Unsupported database type: {db_type}")
|
370 |
+
|
371 |
+
def add_chat_message(*args, **kwargs):
    """Forward the call to the active backend's ``add_chat_message`` implementation.

    Raises:
        NotImplementedError: If ``db_type`` is ``'elasticsearch'``.
        ValueError: If ``db_type`` is not a supported backend.
    """
    if db_type == 'sqlite':
        return sqlite_add_chat_message(*args, **kwargs)
    if db_type == 'elasticsearch':
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of add_chat_message not yet implemented")
    raise ValueError(f"Unsupported database type: {db_type}")
|
377 |
+
|
378 |
+
def get_chat_messages(*args, **kwargs):
    """Forward the call to the active backend's ``get_chat_messages`` implementation.

    Raises:
        NotImplementedError: If ``db_type`` is ``'elasticsearch'``.
        ValueError: If ``db_type`` is not a supported backend.
    """
    if db_type == 'sqlite':
        return sqlite_get_chat_messages(*args, **kwargs)
    if db_type == 'elasticsearch':
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of get_chat_messages not yet implemented")
    raise ValueError(f"Unsupported database type: {db_type}")
|
384 |
+
|
385 |
+
def search_chat_conversations(*args, **kwargs):
    """Forward the call to the active backend's ``search_chat_conversations`` implementation.

    Raises:
        NotImplementedError: If ``db_type`` is ``'elasticsearch'``.
        ValueError: If ``db_type`` is not a supported backend.
    """
    if db_type == 'sqlite':
        return sqlite_search_chat_conversations(*args, **kwargs)
    if db_type == 'elasticsearch':
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of search_chat_conversations not yet implemented")
    raise ValueError(f"Unsupported database type: {db_type}")
|
391 |
+
|
392 |
+
def create_chat_conversation(*args, **kwargs):
    """Forward the call to the active backend's ``create_chat_conversation`` implementation.

    Raises:
        NotImplementedError: If ``db_type`` is ``'elasticsearch'``.
        ValueError: If ``db_type`` is not a supported backend.
    """
    if db_type == 'sqlite':
        return sqlite_create_chat_conversation(*args, **kwargs)
    if db_type == 'elasticsearch':
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of create_chat_conversation not yet implemented")
    raise ValueError(f"Unsupported database type: {db_type}")
|
398 |
+
|
399 |
+
def save_chat_history_to_database(*args, **kwargs):
    """Forward the call to the active backend's ``save_chat_history_to_database`` implementation.

    Raises:
        NotImplementedError: If ``db_type`` is ``'elasticsearch'``.
        ValueError: If ``db_type`` is not a supported backend.
    """
    if db_type == 'sqlite':
        return sqlite_save_chat_history_to_database(*args, **kwargs)
    if db_type == 'elasticsearch':
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of save_chat_history_to_database not yet implemented")
    raise ValueError(f"Unsupported database type: {db_type}")
|
405 |
+
|
406 |
+
def get_conversation_name(*args, **kwargs):
    """Forward the call to the active backend's ``get_conversation_name`` implementation.

    Raises:
        NotImplementedError: If ``db_type`` is ``'elasticsearch'``.
        ValueError: If ``db_type`` is not a supported backend.
    """
    if db_type == 'sqlite':
        return sqlite_get_conversation_name(*args, **kwargs)
    if db_type == 'elasticsearch':
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of get_conversation_name not yet implemented")
    raise ValueError(f"Unsupported database type: {db_type}")
|
412 |
+
|
413 |
+
#
|
414 |
+
# End of Chat-related Functions
|
415 |
+
############################################################################################################
|
416 |
+
|
417 |
+
############################################################################################################
|
418 |
+
#
|
419 |
+
# Trash-related Functions
|
420 |
+
|
421 |
+
def get_trashed_items(*args, **kwargs):
    """Forward the call to the active backend's ``get_trashed_items`` implementation.

    Raises:
        NotImplementedError: If ``db_type`` is ``'elasticsearch'``.
        ValueError: If ``db_type`` is not a supported backend.
    """
    if db_type == 'sqlite':
        return sqlite_get_trashed_items(*args, **kwargs)
    if db_type == 'elasticsearch':
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of get_trashed_items not yet implemented")
    raise ValueError(f"Unsupported database type: {db_type}")
|
427 |
+
|
428 |
+
def user_delete_item(*args, **kwargs):
    """Forward the call to the active backend's ``user_delete_item`` implementation.

    Raises:
        NotImplementedError: If ``db_type`` is ``'elasticsearch'``.
        ValueError: If ``db_type`` is not a supported backend.
    """
    if db_type == 'sqlite':
        return sqlite_user_delete_item(*args, **kwargs)
    if db_type == 'elasticsearch':
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of user_delete_item not yet implemented")
    raise ValueError(f"Unsupported database type: {db_type}")
|
434 |
+
|
435 |
+
def empty_trash(*args, **kwargs):
    """Forward the call to the active backend's ``empty_trash`` implementation.

    Raises:
        NotImplementedError: If ``db_type`` is ``'elasticsearch'``.
        ValueError: If ``db_type`` is not a supported backend.
    """
    if db_type == 'sqlite':
        return sqlite_empty_trash(*args, **kwargs)
    if db_type == 'elasticsearch':
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of empty_trash not yet implemented")
    raise ValueError(f"Unsupported database type: {db_type}")
|
441 |
+
|
442 |
+
#
|
443 |
+
# End of Trash-related Functions
|
444 |
+
############################################################################################################
|
445 |
+
|
446 |
+
############################################################################################################
|
447 |
+
#
|
448 |
+
# DB-Backup Functions
|
449 |
+
|
450 |
+
def create_automated_backup(*args, **kwargs):
    """Forward the call to the active backend's ``create_automated_backup`` implementation.

    Raises:
        NotImplementedError: If ``db_type`` is ``'elasticsearch'``.
        ValueError: If ``db_type`` is not a supported backend.
    """
    if db_type == 'sqlite':
        return sqlite_create_automated_backup(*args, **kwargs)
    if db_type == 'elasticsearch':
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of create_automated_backup not yet implemented")
    raise ValueError(f"Unsupported database type: {db_type}")
|
456 |
+
|
457 |
+
#
|
458 |
+
# End of DB-Backup Functions
|
459 |
+
############################################################################################################
|
460 |
+
|
461 |
+
############################################################################################################
|
462 |
+
#
|
463 |
+
# Function to close the database connection for SQLite
|
464 |
+
|
465 |
+
def close_connection():
    """Close all pooled SQLite connections; a no-op for other backends."""
    if db_type != 'sqlite':
        # Elasticsearch doesn't need explicit closing
        return
    db.close_all_connections()
|
469 |
+
|
470 |
+
#
|
471 |
+
# End of file
|
472 |
+
############################################################################################################
|
App_Function_Libraries/Diarization_Lib.py
CHANGED
@@ -33,7 +33,7 @@ import yaml
|
|
33 |
|
34 |
def load_pipeline_from_pretrained(path_to_config: str | Path) -> SpeakerDiarization:
|
35 |
path_to_config = Path(path_to_config).resolve()
|
36 |
-
|
37 |
|
38 |
if not path_to_config.exists():
|
39 |
raise FileNotFoundError(f"Config file not found: {path_to_config}")
|
@@ -45,11 +45,6 @@ def load_pipeline_from_pretrained(path_to_config: str | Path) -> SpeakerDiarizat
|
|
45 |
# Store current working directory
|
46 |
cwd = Path.cwd().resolve()
|
47 |
|
48 |
-
# Change to the directory containing the config file
|
49 |
-
cd_to = path_to_config.parent.resolve()
|
50 |
-
print(f"Changing working directory to {cd_to}")
|
51 |
-
os.chdir(cd_to)
|
52 |
-
|
53 |
try:
|
54 |
# Create a SpeakerDiarization pipeline
|
55 |
pipeline = SpeakerDiarization()
|
|
|
33 |
|
34 |
def load_pipeline_from_pretrained(path_to_config: str | Path) -> SpeakerDiarization:
|
35 |
path_to_config = Path(path_to_config).resolve()
|
36 |
+
logging.debug(f"Loading pyannote pipeline from {path_to_config}...")
|
37 |
|
38 |
if not path_to_config.exists():
|
39 |
raise FileNotFoundError(f"Config file not found: {path_to_config}")
|
|
|
45 |
# Store current working directory
|
46 |
cwd = Path.cwd().resolve()
|
47 |
|
|
|
|
|
|
|
|
|
|
|
48 |
try:
|
49 |
# Create a SpeakerDiarization pipeline
|
50 |
pipeline = SpeakerDiarization()
|
App_Function_Libraries/Gradio_Related.py
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
App_Function_Libraries/Gradio_UI/Audio_ingestion_tab.py
ADDED
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Audio_ingestion_tab.py
|
2 |
+
# Description: Gradio UI for ingesting audio files into the database
|
3 |
+
#
|
4 |
+
# Imports
|
5 |
+
#
|
6 |
+
# External Imports
|
7 |
+
import gradio as gr
|
8 |
+
#
|
9 |
+
# Local Imports
|
10 |
+
from App_Function_Libraries.Audio_Files import process_audio_files
|
11 |
+
from App_Function_Libraries.DB_Manager import load_preset_prompts
|
12 |
+
from App_Function_Libraries.Gradio_UI.Chat_ui import update_user_prompt
|
13 |
+
from App_Function_Libraries.Gradio_UI.Gradio_Shared import whisper_models
|
14 |
+
#
|
15 |
+
#######################################################################################################################
|
16 |
+
# Functions:
|
17 |
+
|
18 |
+
def create_audio_processing_tab():
|
19 |
+
with gr.TabItem("Audio File Transcription + Summarization"):
|
20 |
+
gr.Markdown("# Transcribe & Summarize Audio Files from URLs or Local Files!")
|
21 |
+
with gr.Row():
|
22 |
+
with gr.Column():
|
23 |
+
audio_url_input = gr.Textbox(label="Audio File URL(s)", placeholder="Enter the URL(s) of the audio file(s), one per line")
|
24 |
+
audio_file_input = gr.File(label="Upload Audio File", file_types=["audio/*"])
|
25 |
+
|
26 |
+
use_cookies_input = gr.Checkbox(label="Use cookies for authenticated download", value=False)
|
27 |
+
cookies_input = gr.Textbox(
|
28 |
+
label="Audio Download Cookies",
|
29 |
+
placeholder="Paste your cookies here (JSON format)",
|
30 |
+
lines=3,
|
31 |
+
visible=False
|
32 |
+
)
|
33 |
+
|
34 |
+
use_cookies_input.change(
|
35 |
+
fn=lambda x: gr.update(visible=x),
|
36 |
+
inputs=[use_cookies_input],
|
37 |
+
outputs=[cookies_input]
|
38 |
+
)
|
39 |
+
|
40 |
+
diarize_input = gr.Checkbox(label="Enable Speaker Diarization", value=False)
|
41 |
+
whisper_model_input = gr.Dropdown(choices=whisper_models, value="medium", label="Whisper Model")
|
42 |
+
|
43 |
+
with gr.Row():
|
44 |
+
custom_prompt_checkbox = gr.Checkbox(label="Use a Custom Prompt",
|
45 |
+
value=False,
|
46 |
+
visible=True)
|
47 |
+
preset_prompt_checkbox = gr.Checkbox(label="Use a pre-set Prompt",
|
48 |
+
value=False,
|
49 |
+
visible=True)
|
50 |
+
with gr.Row():
|
51 |
+
preset_prompt = gr.Dropdown(label="Select Preset Prompt",
|
52 |
+
choices=load_preset_prompts(),
|
53 |
+
visible=False)
|
54 |
+
with gr.Row():
|
55 |
+
custom_prompt_input = gr.Textbox(label="Custom Prompt",
|
56 |
+
placeholder="Enter custom prompt here",
|
57 |
+
lines=3,
|
58 |
+
visible=False)
|
59 |
+
with gr.Row():
|
60 |
+
system_prompt_input = gr.Textbox(label="System Prompt",
|
61 |
+
value="""<s>You are a bulleted notes specialist. [INST]```When creating comprehensive bulleted notes, you should follow these guidelines: Use multiple headings based on the referenced topics, not categories like quotes or terms. Headings should be surrounded by bold formatting and not be listed as bullet points themselves. Leave no space between headings and their corresponding list items underneath. Important terms within the content should be emphasized by setting them in bold font. Any text that ends with a colon should also be bolded. Before submitting your response, review the instructions, and make any corrections necessary to adhered to the specified format. Do not reference these instructions within the notes.``` \nBased on the content between backticks create comprehensive bulleted notes.[/INST]
|
62 |
+
**Bulleted Note Creation Guidelines**
|
63 |
+
|
64 |
+
**Headings**:
|
65 |
+
- Based on referenced topics, not categories like quotes or terms
|
66 |
+
- Surrounded by **bold** formatting
|
67 |
+
- Not listed as bullet points
|
68 |
+
- No space between headings and list items underneath
|
69 |
+
|
70 |
+
**Emphasis**:
|
71 |
+
- **Important terms** set in bold font
|
72 |
+
- **Text ending in a colon**: also bolded
|
73 |
+
|
74 |
+
**Review**:
|
75 |
+
- Ensure adherence to specified format
|
76 |
+
- Do not reference these instructions in your response.</s>[INST] {{ .Prompt }} [/INST]
|
77 |
+
""",
|
78 |
+
lines=3,
|
79 |
+
visible=False)
|
80 |
+
|
81 |
+
custom_prompt_checkbox.change(
|
82 |
+
fn=lambda x: (gr.update(visible=x), gr.update(visible=x)),
|
83 |
+
inputs=[custom_prompt_checkbox],
|
84 |
+
outputs=[custom_prompt_input, system_prompt_input]
|
85 |
+
)
|
86 |
+
preset_prompt_checkbox.change(
|
87 |
+
fn=lambda x: gr.update(visible=x),
|
88 |
+
inputs=[preset_prompt_checkbox],
|
89 |
+
outputs=[preset_prompt]
|
90 |
+
)
|
91 |
+
|
92 |
+
def update_prompts(preset_name):
|
93 |
+
prompts = update_user_prompt(preset_name)
|
94 |
+
return (
|
95 |
+
gr.update(value=prompts["user_prompt"], visible=True),
|
96 |
+
gr.update(value=prompts["system_prompt"], visible=True)
|
97 |
+
)
|
98 |
+
|
99 |
+
preset_prompt.change(
|
100 |
+
update_prompts,
|
101 |
+
inputs=preset_prompt,
|
102 |
+
outputs=[custom_prompt_input, system_prompt_input]
|
103 |
+
)
|
104 |
+
|
105 |
+
api_name_input = gr.Dropdown(
|
106 |
+
choices=[None, "Local-LLM", "OpenAI", "Anthropic", "Cohere", "Groq", "DeepSeek", "Mistral", "OpenRouter",
|
107 |
+
"Llama.cpp", "Kobold", "Ooba", "Tabbyapi", "VLLM","ollama", "HuggingFace"],
|
108 |
+
value=None,
|
109 |
+
label="API for Summarization (Optional)"
|
110 |
+
)
|
111 |
+
api_key_input = gr.Textbox(label="API Key (if required)", placeholder="Enter your API key here", type="password")
|
112 |
+
custom_keywords_input = gr.Textbox(label="Custom Keywords", placeholder="Enter custom keywords, comma-separated")
|
113 |
+
keep_original_input = gr.Checkbox(label="Keep original audio file", value=False)
|
114 |
+
|
115 |
+
chunking_options_checkbox = gr.Checkbox(label="Show Chunking Options", value=False)
|
116 |
+
with gr.Row(visible=False) as chunking_options_box:
|
117 |
+
gr.Markdown("### Chunking Options")
|
118 |
+
with gr.Column():
|
119 |
+
chunk_method = gr.Dropdown(choices=['words', 'sentences', 'paragraphs', 'tokens'], label="Chunking Method")
|
120 |
+
max_chunk_size = gr.Slider(minimum=100, maximum=1000, value=300, step=50, label="Max Chunk Size")
|
121 |
+
chunk_overlap = gr.Slider(minimum=0, maximum=100, value=0, step=10, label="Chunk Overlap")
|
122 |
+
use_adaptive_chunking = gr.Checkbox(label="Use Adaptive Chunking")
|
123 |
+
use_multi_level_chunking = gr.Checkbox(label="Use Multi-level Chunking")
|
124 |
+
chunk_language = gr.Dropdown(choices=['english', 'french', 'german', 'spanish'], label="Chunking Language")
|
125 |
+
|
126 |
+
chunking_options_checkbox.change(
|
127 |
+
fn=lambda x: gr.update(visible=x),
|
128 |
+
inputs=[chunking_options_checkbox],
|
129 |
+
outputs=[chunking_options_box]
|
130 |
+
)
|
131 |
+
|
132 |
+
process_audio_button = gr.Button("Process Audio File(s)")
|
133 |
+
|
134 |
+
with gr.Column():
|
135 |
+
audio_progress_output = gr.Textbox(label="Progress")
|
136 |
+
audio_transcription_output = gr.Textbox(label="Transcription")
|
137 |
+
audio_summary_output = gr.Textbox(label="Summary")
|
138 |
+
download_transcription = gr.File(label="Download All Transcriptions as JSON")
|
139 |
+
download_summary = gr.File(label="Download All Summaries as Text")
|
140 |
+
|
141 |
+
process_audio_button.click(
|
142 |
+
fn=process_audio_files,
|
143 |
+
inputs=[audio_url_input, audio_file_input, whisper_model_input, api_name_input, api_key_input,
|
144 |
+
use_cookies_input, cookies_input, keep_original_input, custom_keywords_input, custom_prompt_input,
|
145 |
+
chunk_method, max_chunk_size, chunk_overlap, use_adaptive_chunking, use_multi_level_chunking,
|
146 |
+
chunk_language, diarize_input],
|
147 |
+
outputs=[audio_progress_output, audio_transcription_output, audio_summary_output]
|
148 |
+
)
|
149 |
+
|
150 |
+
#
|
151 |
+
# End of Audio_ingestion_tab.py
|
152 |
+
#######################################################################################################################
|
App_Function_Libraries/Gradio_UI/Chat_ui.py
ADDED
@@ -0,0 +1,1017 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Chat_ui.py
|
2 |
+
# Description: Chat interface functions for Gradio
|
3 |
+
#
|
4 |
+
# Imports
|
5 |
+
import html
|
6 |
+
import json
|
7 |
+
import logging
|
8 |
+
import os
|
9 |
+
import sqlite3
|
10 |
+
from datetime import datetime
|
11 |
+
from pathlib import Path
|
12 |
+
#
|
13 |
+
# External Imports
|
14 |
+
import gradio as gr
|
15 |
+
#
|
16 |
+
# Local Imports
|
17 |
+
from App_Function_Libraries.Chat import chat, save_chat_history, update_chat_content, save_chat_history_to_db_wrapper
|
18 |
+
from App_Function_Libraries.DB_Manager import add_chat_message, search_chat_conversations, create_chat_conversation, \
|
19 |
+
get_chat_messages, update_chat_message, delete_chat_message, load_preset_prompts, db
|
20 |
+
from App_Function_Libraries.Gradio_UI.Gradio_Shared import update_dropdown, update_user_prompt
|
21 |
+
|
22 |
+
|
23 |
+
#
|
24 |
+
#
|
25 |
+
########################################################################################################################
|
26 |
+
#
|
27 |
+
# Functions:
|
28 |
+
|
29 |
+
|
30 |
+
def show_edit_message(selected=None):
    """Return Gradio updates that show or hide the message-editing controls.

    Args:
        selected: Indexable pair describing the chosen chat message. Element 0
            becomes the value of the first output and element 1 the value of
            the second output. A falsy value hides all three controls.
            NOTE(review): the callers in this file register this handler as
            ``chatbot.select(show_edit_message, None, [...])``; with no inputs
            Gradio presumably invokes it without arguments, so the parameter
            defaults to ``None`` instead of raising ``TypeError`` -- confirm
            how the selection is meant to be passed in.

    Returns:
        A 3-tuple of ``gr.update`` objects, one per output component.
    """
    if not selected:
        return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)

    return (
        gr.update(value=selected[0], visible=True),
        gr.update(value=selected[1], visible=True),
        gr.update(visible=True),
    )
|
35 |
+
|
36 |
+
|
37 |
+
def show_delete_message(selected=None):
    """Return Gradio updates that show or hide the message-deletion controls.

    Args:
        selected: Indexable pair describing the chosen chat message; element 1
            becomes the value of the first output. A falsy value hides both
            controls.
            NOTE(review): this handler is registered with ``inputs=None`` in
            this file, so Gradio presumably calls it without arguments; the
            ``None`` default avoids a ``TypeError`` in that case -- confirm
            how the selection is meant to be passed in.

    Returns:
        A 2-tuple of ``gr.update`` objects, one per output component.
    """
    if not selected:
        return gr.update(visible=False), gr.update(visible=False)

    return gr.update(value=selected[1], visible=True), gr.update(visible=True)
|
41 |
+
|
42 |
+
|
43 |
+
def debug_output(media_content, selected_parts):
    """Print the current media content and selected parts for debugging.

    Args:
        media_content: Value to print as the media content.
        selected_parts: Value to print as the selected parts.

    Returns:
        Always the empty string.
    """
    print(f"Debug - Media Content: {media_content}")
    print(f"Debug - Selected Parts: {selected_parts}")
    return ""
|
47 |
+
|
48 |
+
|
49 |
+
def update_selected_parts(use_content, use_summary, use_prompt):
    """Translate the three "use ..." checkbox values into a list of part names.

    Args:
        use_content: Truthy to include ``"content"``.
        use_summary: Truthy to include ``"summary"``.
        use_prompt: Truthy to include ``"prompt"``.

    Returns:
        The enabled part names, always in the order content, summary, prompt.
    """
    flags = (
        ("content", use_content),
        ("summary", use_summary),
        ("prompt", use_prompt),
    )
    selected_parts = [name for name, enabled in flags if enabled]
    print(f"Debug - Update Selected Parts: {selected_parts}")
    return selected_parts
|
59 |
+
|
60 |
+
|
61 |
+
# Old update_user_prompt shim for backwards compatibility
|
62 |
+
def get_system_prompt(preset_name):
    """Return only the system prompt of a preset (backwards-compatibility shim).

    Args:
        preset_name: Preset name forwarded unchanged to ``update_user_prompt``.

    Returns:
        The ``"system_prompt"`` entry of the mapping returned by
        ``update_user_prompt``.

    Raises:
        KeyError: If that mapping has no ``"system_prompt"`` key.
    """
    prompts = update_user_prompt(preset_name)
    return prompts["system_prompt"]
|
66 |
+
|
67 |
+
def clear_chat():
    """Reset the chat display and forget the active conversation.

    Returns:
        A 2-tuple: a ``gr.update`` that sets the chatbot value to an empty
        list, and ``None`` for the conversation ID.
    """
    return gr.update(value=[]), None
|
70 |
+
|
71 |
+
|
72 |
+
# FIXME - add additional features....
|
73 |
+
def chat_wrapper(message, history, media_content, selected_parts, api_endpoint, api_key, custom_prompt, conversation_id,
                 save_conversation, temperature, system_prompt, max_tokens=None, top_p=None, frequency_penalty=None,
                 presence_penalty=None, stop_sequence=None):
    """Send one user message to the chat backend and optionally persist the exchange.

    Args:
        message: The user's message text.
        history: List of ``(user_message, bot_message)`` tuples; the new
            exchange is appended to it in place. ``None`` is treated as an
            empty history.
        media_content: Mapping for the selected media item; its ``'id'`` and
            ``'title'`` entries are read when a new conversation is created.
        selected_parts: List of strings; on the first message they are joined
            with newlines and appended to the outgoing message.
        api_endpoint: Forwarded to ``chat``.
        api_key: Forwarded to ``chat``.
        custom_prompt: Optional text placed in front of the user message.
        conversation_id: Existing conversation ID, or ``None`` to create one
            when ``save_conversation`` is truthy.
        save_conversation: When truthy, the user and assistant messages are
            written with ``add_chat_message``.
        temperature: Forwarded to ``chat``.
        system_prompt: Forwarded to ``chat``.
        max_tokens, top_p, frequency_penalty, presence_penalty, stop_sequence:
            Accepted for interface compatibility; not used by this function.

    Returns:
        ``(bot_message, history, conversation_id)``. If anything raises, the
        error is logged and ``("An error occurred.", history, conversation_id)``
        is returned instead.
    """
    # Normalise up front so both the success and the error path return a list.
    if history is None:
        history = []

    try:
        if save_conversation:
            if conversation_id is None:
                # Create a new conversation
                media_id = media_content.get('id', None)
                conversation_name = f"Chat about {media_content.get('title', 'Unknown Media')} - {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
                conversation_id = create_chat_conversation(media_id, conversation_name)

            # Add user message to the database
            add_chat_message(conversation_id, "user", message)

        # Include the selected parts and custom_prompt only for the first message
        if not history and selected_parts:
            message_body = "\n".join(selected_parts)
            full_message = f"{custom_prompt}\n\n{message}\n\n{message_body}"
        elif custom_prompt:
            full_message = f"{custom_prompt}\n\n{message}"
        else:
            full_message = message

        # Generate bot response
        bot_message = chat(full_message, history, media_content, selected_parts, api_endpoint, api_key, custom_prompt,
                           temperature, system_prompt)

        if save_conversation:
            # Add assistant message to the database
            add_chat_message(conversation_id, "assistant", bot_message)

        # Update history with the raw user message (not the prompt-augmented one)
        history.append((message, bot_message))

        return bot_message, history, conversation_id
    except Exception as e:
        logging.error(f"Error in chat wrapper: {str(e)}")
        return "An error occurred.", history, conversation_id
|
111 |
+
|
112 |
+
|
113 |
+
def search_conversations(query):
    """Search saved conversations and return a dropdown update listing them.

    Args:
        query: Search text forwarded to ``search_chat_conversations``.

    Returns:
        A ``gr.update`` whose ``choices`` are ``(label, conversation_id)``
        pairs built from each result's ``'conversation_name'``,
        ``'media_title'`` and ``'id'`` entries. The choices are empty when
        nothing matches or when the search raises.
    """
    try:
        conversations = search_chat_conversations(query)
        if not conversations:
            print(f"Debug - Search Conversations - No results found for query: {query}")
            return gr.update(choices=[])

        conversation_options = [
            (f"{c['conversation_name']} (Media: {c['media_title']}, ID: {c['id']})", c['id'])
            for c in conversations
        ]
        print(f"Debug - Search Conversations - Options: {conversation_options}")
        return gr.update(choices=conversation_options)
    except Exception as e:
        # Report failures through logging, like chat_wrapper does, rather than
        # as a debug print; the UI still gets an empty choice list.
        logging.error(f"Debug - Search Conversations - Error: {str(e)}")
        return gr.update(choices=[])
|
129 |
+
|
130 |
+
|
131 |
+
def load_conversation(conversation_id):
    """Load a stored conversation as a chatbot history.

    Args:
        conversation_id: ID forwarded to ``get_chat_messages``. Any falsy value
            yields an empty result without touching the database.

    Returns:
        ``(history, conversation_id)``. Each stored message becomes its own
        history row: ``(text, None)`` when its ``'sender'`` is ``'user'``,
        otherwise ``(None, text)``. For a falsy ID the result is ``([], None)``.
    """
    if not conversation_id:
        return [], None

    messages = get_chat_messages(conversation_id)
    history = [
        (msg['message'], None) if msg['sender'] == 'user' else (None, msg['message'])
        for msg in messages
    ]
    return history, conversation_id
|
141 |
+
|
142 |
+
|
143 |
+
def update_message_in_chat(message_id, new_text, history):
    """Persist an edited message and return the history with its text replaced.

    The matching logic expects each side of a history pair to be a
    ``(text, message_id)`` sequence. Sides that are plain strings or ``None``
    cannot carry an ID and are passed through unchanged.

    Args:
        message_id: ID of the message to edit; forwarded to
            ``update_chat_message``.
        new_text: Replacement text.
        history: Iterable of 2-item pairs; ``None`` is treated as empty.

    Returns:
        A new list of pairs. Every side whose ID equals ``message_id`` becomes
        ``(new_text, message_id)``; the other side of the pair is preserved
        (previously the whole pair was collapsed into a single tuple).
    """
    update_chat_message(message_id, new_text)

    def _replace_if_match(entry):
        # Only (text, id) sequences are candidates; indexing a str or None
        # here would either compare a character or raise TypeError.
        if isinstance(entry, (tuple, list)) and len(entry) > 1 and entry[1] == message_id:
            return (new_text, entry[1])
        return entry

    return [
        (_replace_if_match(first), _replace_if_match(second))
        for first, second in (history or [])
    ]
|
149 |
+
|
150 |
+
|
151 |
+
def delete_message_from_chat(message_id, history):
    """Delete a stored message and return the history without its pair.

    The matching logic expects each side of a history pair to be a
    ``(text, message_id)`` sequence. Sides that are plain strings or ``None``
    never match.

    Args:
        message_id: ID of the message to delete; forwarded to
            ``delete_chat_message``.
        history: Iterable of 2-item pairs; ``None`` is treated as empty.

    Returns:
        A new list containing every pair in which neither side carries
        ``message_id``.
    """
    delete_chat_message(message_id)

    def _carries_id(entry):
        # Guard against None placeholders and plain-string messages.
        return isinstance(entry, (tuple, list)) and len(entry) > 1 and entry[1] == message_id

    return [
        (first, second)
        for first, second in (history or [])
        if not (_carries_id(first) or _carries_id(second))
    ]
|
155 |
+
|
156 |
+
|
157 |
+
def create_chat_interface():
    """Build the "Remote LLM Chat (Horizontal)" tab and wire its event handlers.

    The tab has a left column (media search, part selection, conversation
    search/load, API and prompt settings) and a right column (chatbot, message
    box, save/export controls). Must be called inside an open Gradio layout
    context, since it opens a ``gr.TabItem``.

    NOTE(review): the nesting of rows and columns below was reconstructed from
    a source whose indentation had been stripped -- confirm against the
    intended layout.
    """
    # NOTE(review): this CSS is not applied anywhere in this function; it only
    # documents the styling intended for the "chatbot-container" class.
    custom_css = """
    .chatbot-container .message-wrap .message {
        font-size: 14px !important;
    }
    """
    with gr.TabItem("Remote LLM Chat (Horizontal)"):
        gr.Markdown("# Chat with a designated LLM Endpoint, using your selected item as starting context")
        chat_history = gr.State([])
        media_content = gr.State({})
        selected_parts = gr.State([])
        conversation_id = gr.State(None)

        with gr.Row():
            with gr.Column(scale=1):
                search_query_input = gr.Textbox(label="Search Query", placeholder="Enter your search query here...")
                search_type_input = gr.Radio(choices=["Title", "URL", "Keyword", "Content"], value="Title",
                                             label="Search By")
                search_button = gr.Button("Search")
                items_output = gr.Dropdown(label="Select Item", choices=[], interactive=True)
                item_mapping = gr.State({})
                with gr.Row():
                    use_content = gr.Checkbox(label="Use Content")
                    use_summary = gr.Checkbox(label="Use Summary")
                    use_prompt = gr.Checkbox(label="Use Prompt")
                    save_conversation = gr.Checkbox(label="Save Conversation", value=False, visible=True)
                with gr.Row():
                    temperature = gr.Slider(label="Temperature", minimum=0.00, maximum=1.0, step=0.05, value=0.7)
                with gr.Row():
                    conversation_search = gr.Textbox(label="Search Conversations")
                with gr.Row():
                    search_conversations_btn = gr.Button("Search Conversations")
                with gr.Row():
                    previous_conversations = gr.Dropdown(label="Select Conversation", choices=[], interactive=True)
                with gr.Row():
                    load_conversations_btn = gr.Button("Load Selected Conversation")

                api_endpoint = gr.Dropdown(label="Select API Endpoint",
                                           choices=["Local-LLM", "OpenAI", "Anthropic", "Cohere", "Groq", "DeepSeek",
                                                    "Mistral", "OpenRouter",
                                                    "Llama.cpp", "Kobold", "Ooba", "Tabbyapi", "VLLM", "ollama",
                                                    "HuggingFace"])
                api_key = gr.Textbox(label="API Key (if required)", type="password")
                custom_prompt_checkbox = gr.Checkbox(label="Use a Custom Prompt",
                                                     value=False,
                                                     visible=True)
                preset_prompt_checkbox = gr.Checkbox(label="Use a pre-set Prompt",
                                                     value=False,
                                                     visible=True)
                preset_prompt = gr.Dropdown(label="Select Preset Prompt",
                                            choices=load_preset_prompts(),
                                            visible=False)
                user_prompt = gr.Textbox(label="Custom Prompt",
                                         placeholder="Enter custom prompt here",
                                         lines=3,
                                         visible=False)
                system_prompt_input = gr.Textbox(label="System Prompt",
                                                 value="You are a helpful AI assistant",
                                                 lines=3,
                                                 visible=False)
            with gr.Column():
                chatbot = gr.Chatbot(height=600, elem_classes="chatbot-container")
                msg = gr.Textbox(label="Enter your message")
                submit = gr.Button("Submit")
                clear_chat_button = gr.Button("Clear Chat")

                edit_message_id = gr.Number(label="Message ID to Edit", visible=False)
                edit_message_text = gr.Textbox(label="Edit Message", visible=False)
                update_message_button = gr.Button("Update Message", visible=False)

                delete_message_id = gr.Number(label="Message ID to Delete", visible=False)
                delete_message_button = gr.Button("Delete Message", visible=False)

                save_chat_history_to_db = gr.Button("Save Chat History to DataBase")
                save_chat_history_as_file = gr.Button("Save Chat History as File")
                download_file = gr.File(label="Download Chat History")
                save_status = gr.Textbox(label="Save Status", interactive=False)

        # Restore original functionality
        search_button.click(
            fn=update_dropdown,
            inputs=[search_query_input, search_type_input],
            outputs=[items_output, item_mapping]
        )

        def save_chat_wrapper(history, conversation_id, media_content):
            # Turn the saved file path into (file, status message) for the UI.
            file_path = save_chat_history(history, conversation_id, media_content)
            if file_path:
                return file_path, f"Chat history saved successfully as {os.path.basename(file_path)}!"
            else:
                return None, "Error saving chat history. Please check the logs and try again."

        # Registered once only: a second registration that called
        # save_chat_history directly with two inputs has been removed.
        save_chat_history_as_file.click(
            save_chat_wrapper,
            inputs=[chatbot, conversation_id, media_content],
            outputs=[download_file, save_status]
        )

        def update_prompts(preset_name):
            # Fill both prompt boxes from the chosen preset and reveal them.
            prompts = update_user_prompt(preset_name)
            return (
                gr.update(value=prompts["user_prompt"], visible=True),
                gr.update(value=prompts["system_prompt"], visible=True)
            )

        def load_selected_conversation(selected_conversation_id):
            # Return the history twice so the visible chatbot and the
            # chat_history state read by the edit/delete handlers stay in sync.
            history, loaded_id = load_conversation(selected_conversation_id)
            return history, history, loaded_id

        # Uses the module-level clear_chat (no local redefinition).
        clear_chat_button.click(
            clear_chat,
            outputs=[chatbot, conversation_id]
        )
        preset_prompt.change(
            update_prompts,
            inputs=preset_prompt,
            outputs=[user_prompt, system_prompt_input]
        )
        custom_prompt_checkbox.change(
            fn=lambda x: (gr.update(visible=x), gr.update(visible=x)),
            inputs=[custom_prompt_checkbox],
            outputs=[user_prompt, system_prompt_input]
        )
        preset_prompt_checkbox.change(
            fn=lambda x: gr.update(visible=x),
            inputs=[preset_prompt_checkbox],
            outputs=[preset_prompt]
        )

        submit.click(
            chat_wrapper,
            inputs=[msg, chatbot, media_content, selected_parts, api_endpoint, api_key, user_prompt,
                    conversation_id, save_conversation, temperature, system_prompt_input],
            outputs=[msg, chatbot, conversation_id]
        ).then(  # Clear the message box after submission
            lambda x: gr.update(value=""),
            inputs=[chatbot],
            outputs=[msg]
        ).then(  # Clear the user prompt after the first message
            lambda: (gr.update(value=""), gr.update(value="")),
            outputs=[user_prompt, system_prompt_input]
        )

        items_output.change(
            update_chat_content,
            inputs=[items_output, use_content, use_summary, use_prompt, item_mapping],
            outputs=[media_content, selected_parts]
        )
        use_content.change(update_selected_parts, inputs=[use_content, use_summary, use_prompt],
                           outputs=[selected_parts])
        use_summary.change(update_selected_parts, inputs=[use_content, use_summary, use_prompt],
                           outputs=[selected_parts])
        use_prompt.change(update_selected_parts, inputs=[use_content, use_summary, use_prompt],
                          outputs=[selected_parts])
        items_output.change(debug_output, inputs=[media_content, selected_parts], outputs=[])

        search_conversations_btn.click(
            search_conversations,
            inputs=[conversation_search],
            outputs=[previous_conversations]
        )

        # Clear with empty lists (not None) so chat_history stays iterable,
        # then load the conversation into the chatbot and both states.
        load_conversations_btn.click(
            lambda: ([], []),
            outputs=[chatbot, chat_history]
        ).then(
            load_selected_conversation,
            inputs=[previous_conversations],
            outputs=[chatbot, chat_history, conversation_id]
        )

        # load_conversation returns (history, id); only the history belongs in
        # the single chat_history output.
        previous_conversations.change(
            lambda selected_conversation_id: load_conversation(selected_conversation_id)[0],
            inputs=[previous_conversations],
            outputs=[chat_history]
        )

        update_message_button.click(
            update_message_in_chat,
            inputs=[edit_message_id, edit_message_text, chat_history],
            outputs=[chatbot]
        )

        delete_message_button.click(
            delete_message_from_chat,
            inputs=[delete_message_id, chat_history],
            outputs=[chatbot]
        )

        # Report into the existing status box instead of creating a second,
        # unnamed "Save Status" textbox inside the outputs list.
        save_chat_history_to_db.click(
            save_chat_history_to_db_wrapper,
            inputs=[chatbot, conversation_id, media_content],
            outputs=[conversation_id, save_status]
        )

        chatbot.select(show_edit_message, None, [edit_message_text, edit_message_id, update_message_button])
        chatbot.select(show_delete_message, None, [delete_message_id, delete_message_button])
|
359 |
+
|
360 |
+
|
361 |
+
def create_chat_interface_stacked():
    """Build the "Remote LLM Chat - Stacked" tab and wire its event handlers.

    The settings (media search, part selection, conversation search/load, API
    and prompts) are placed above the chat window, with the submit and
    save/export controls below it. Must be called inside an open Gradio layout
    context, since it opens a ``gr.TabItem``.

    NOTE(review): the nesting of rows and columns below was reconstructed from
    a source whose indentation had been stripped -- confirm against the
    intended layout.
    """
    # NOTE(review): this CSS is not applied anywhere in this function; it only
    # documents the styling intended for the "chatbot-container" class.
    custom_css = """
    .chatbot-container .message-wrap .message {
        font-size: 14px !important;
    }
    """
    with gr.TabItem("Remote LLM Chat - Stacked"):
        gr.Markdown("# Stacked Chat")
        chat_history = gr.State([])
        media_content = gr.State({})
        selected_parts = gr.State([])
        conversation_id = gr.State(None)

        with gr.Row():
            with gr.Column():
                search_query_input = gr.Textbox(label="Search Query", placeholder="Enter your search query here...")
                search_type_input = gr.Radio(choices=["Title", "URL", "Keyword", "Content"], value="Title",
                                             label="Search By")
                search_button = gr.Button("Search")
                items_output = gr.Dropdown(label="Select Item", choices=[], interactive=True)
                item_mapping = gr.State({})
                with gr.Row():
                    use_content = gr.Checkbox(label="Use Content")
                    use_summary = gr.Checkbox(label="Use Summary")
                    use_prompt = gr.Checkbox(label="Use Prompt")
                    save_conversation = gr.Checkbox(label="Save Conversation", value=False, visible=True)
                    temp = gr.Slider(label="Temperature", minimum=0.00, maximum=1.0, step=0.05, value=0.7)
                with gr.Row():
                    conversation_search = gr.Textbox(label="Search Conversations")
                with gr.Row():
                    previous_conversations = gr.Dropdown(label="Select Conversation", choices=[], interactive=True)
                with gr.Row():
                    search_conversations_btn = gr.Button("Search Conversations")
                    load_conversations_btn = gr.Button("Load Selected Conversation")
            with gr.Column():
                api_endpoint = gr.Dropdown(label="Select API Endpoint",
                                           choices=["Local-LLM", "OpenAI", "Anthropic", "Cohere", "Groq", "DeepSeek",
                                                    "OpenRouter", "Mistral", "Llama.cpp", "Kobold", "Ooba", "Tabbyapi",
                                                    "VLLM", "ollama", "HuggingFace"])
                api_key = gr.Textbox(label="API Key (if required)", type="password")
                preset_prompt = gr.Dropdown(label="Select Preset Prompt",
                                            choices=load_preset_prompts(),
                                            visible=True)
                system_prompt = gr.Textbox(label="System Prompt",
                                           value="You are a helpful AI assistant.",
                                           lines=3,
                                           visible=True)
                user_prompt = gr.Textbox(label="Custom User Prompt",
                                         placeholder="Enter custom prompt here",
                                         lines=3,
                                         visible=True)
                gr.Markdown("Scroll down for the chat window...")
        with gr.Row():
            with gr.Column(scale=1):
                chatbot = gr.Chatbot(height=600, elem_classes="chatbot-container")
                msg = gr.Textbox(label="Enter your message")
        with gr.Row():
            with gr.Column():
                submit = gr.Button("Submit")
                clear_chat_button = gr.Button("Clear Chat")

                edit_message_id = gr.Number(label="Message ID to Edit", visible=False)
                edit_message_text = gr.Textbox(label="Edit Message", visible=False)
                update_message_button = gr.Button("Update Message", visible=False)

                delete_message_id = gr.Number(label="Message ID to Delete", visible=False)
                delete_message_button = gr.Button("Delete Message", visible=False)
                save_chat_history_to_db = gr.Button("Save Chat History to DataBase")
                save_chat_history_as_file = gr.Button("Save Chat History as File")
            with gr.Column():
                download_file = gr.File(label="Download Chat History")

        # Named status box for the database-save handler; created at tab level,
        # where the previously inline gr.Textbox was instantiated.
        save_status = gr.Textbox(label="Save Status")

        # Restore original functionality
        search_button.click(
            fn=update_dropdown,
            inputs=[search_query_input, search_type_input],
            outputs=[items_output, item_mapping]
        )

        def update_prompts(preset_name):
            # Fill both prompt boxes from the chosen preset and reveal them.
            prompts = update_user_prompt(preset_name)
            return (
                gr.update(value=prompts["user_prompt"], visible=True),
                gr.update(value=prompts["system_prompt"], visible=True)
            )

        def load_selected_conversation(selected_conversation_id):
            # Return the history twice so the visible chatbot and the
            # chat_history state read by the edit/delete handlers stay in sync.
            history, loaded_id = load_conversation(selected_conversation_id)
            return history, history, loaded_id

        clear_chat_button.click(
            clear_chat,
            outputs=[chatbot, conversation_id]
        )
        preset_prompt.change(
            update_prompts,
            inputs=preset_prompt,
            outputs=[user_prompt, system_prompt]
        )

        submit.click(
            chat_wrapper,
            inputs=[msg, chatbot, media_content, selected_parts, api_endpoint, api_key, user_prompt,
                    conversation_id, save_conversation, temp, system_prompt],
            outputs=[msg, chatbot, conversation_id]
        ).then(  # Clear the message box after submission
            lambda x: gr.update(value=""),
            inputs=[chatbot],
            outputs=[msg]
        ).then(  # Clear the user prompt after the first message
            # One update per output component: two outputs need two values.
            lambda: (gr.update(value=""), gr.update(value="")),
            outputs=[user_prompt, system_prompt]
        )

        items_output.change(
            update_chat_content,
            inputs=[items_output, use_content, use_summary, use_prompt, item_mapping],
            outputs=[media_content, selected_parts]
        )
        use_content.change(update_selected_parts, inputs=[use_content, use_summary, use_prompt],
                           outputs=[selected_parts])
        use_summary.change(update_selected_parts, inputs=[use_content, use_summary, use_prompt],
                           outputs=[selected_parts])
        use_prompt.change(update_selected_parts, inputs=[use_content, use_summary, use_prompt],
                          outputs=[selected_parts])
        items_output.change(debug_output, inputs=[media_content, selected_parts], outputs=[])

        search_conversations_btn.click(
            search_conversations,
            inputs=[conversation_search],
            outputs=[previous_conversations]
        )

        # Clear with empty lists (not None) so chat_history stays iterable,
        # then load the conversation into the chatbot and both states.
        load_conversations_btn.click(
            lambda: ([], []),
            outputs=[chatbot, chat_history]
        ).then(
            load_selected_conversation,
            inputs=[previous_conversations],
            outputs=[chatbot, chat_history, conversation_id]
        )

        # load_conversation returns (history, id); only the history belongs in
        # the single chat_history output.
        previous_conversations.change(
            lambda selected_conversation_id: load_conversation(selected_conversation_id)[0],
            inputs=[previous_conversations],
            outputs=[chat_history]
        )

        update_message_button.click(
            update_message_in_chat,
            inputs=[edit_message_id, edit_message_text, chat_history],
            outputs=[chatbot]
        )

        delete_message_button.click(
            delete_message_from_chat,
            inputs=[delete_message_id, chat_history],
            outputs=[chatbot]
        )

        # media_content is passed as the third argument, matching how the
        # horizontal tab's save_chat_wrapper calls save_chat_history.
        save_chat_history_as_file.click(
            save_chat_history,
            inputs=[chatbot, conversation_id, media_content],
            outputs=[download_file]
        )

        save_chat_history_to_db.click(
            save_chat_history_to_db_wrapper,
            inputs=[chatbot, conversation_id, media_content],
            outputs=[conversation_id, save_status]
        )

        chatbot.select(show_edit_message, None, [edit_message_text, edit_message_id, update_message_button])
        chatbot.select(show_delete_message, None, [delete_message_id, delete_message_button])
|
531 |
+
|
532 |
+
|
533 |
+
# FIXME - System prompts
|
534 |
+
def create_chat_interface_multi_api():
|
535 |
+
custom_css = """
|
536 |
+
.chatbot-container .message-wrap .message {
|
537 |
+
font-size: 14px !important;
|
538 |
+
}
|
539 |
+
.chat-window {
|
540 |
+
height: 400px;
|
541 |
+
overflow-y: auto;
|
542 |
+
}
|
543 |
+
"""
|
544 |
+
with gr.TabItem("One Prompt - Multiple APIs"):
|
545 |
+
gr.Markdown("# One Prompt but Multiple API Chat Interface")
|
546 |
+
|
547 |
+
with gr.Row():
|
548 |
+
with gr.Column(scale=1):
|
549 |
+
search_query_input = gr.Textbox(label="Search Query", placeholder="Enter your search query here...")
|
550 |
+
search_type_input = gr.Radio(choices=["Title", "URL", "Keyword", "Content"], value="Title",
|
551 |
+
label="Search By")
|
552 |
+
search_button = gr.Button("Search")
|
553 |
+
items_output = gr.Dropdown(label="Select Item", choices=[], interactive=True)
|
554 |
+
item_mapping = gr.State({})
|
555 |
+
with gr.Row():
|
556 |
+
use_content = gr.Checkbox(label="Use Content")
|
557 |
+
use_summary = gr.Checkbox(label="Use Summary")
|
558 |
+
use_prompt = gr.Checkbox(label="Use Prompt")
|
559 |
+
with gr.Column():
|
560 |
+
preset_prompt = gr.Dropdown(label="Select Preset Prompt", choices=load_preset_prompts(), visible=True)
|
561 |
+
system_prompt = gr.Textbox(label="System Prompt", value="You are a helpful AI assistant.", lines=5)
|
562 |
+
user_prompt = gr.Textbox(label="Modify Prompt", lines=5, value=".")
|
563 |
+
|
564 |
+
with gr.Row():
|
565 |
+
chatbots = []
|
566 |
+
api_endpoints = []
|
567 |
+
api_keys = []
|
568 |
+
temperatures = []
|
569 |
+
for i in range(3):
|
570 |
+
with gr.Column():
|
571 |
+
gr.Markdown(f"### Chat Window {i + 1}")
|
572 |
+
api_endpoint = gr.Dropdown(label=f"API Endpoint {i + 1}",
|
573 |
+
choices=["Local-LLM", "OpenAI", "Anthropic", "Cohere", "Groq",
|
574 |
+
"DeepSeek", "Mistral", "OpenRouter", "Llama.cpp", "Kobold",
|
575 |
+
"Ooba",
|
576 |
+
"Tabbyapi", "VLLM", "ollama", "HuggingFace"])
|
577 |
+
api_key = gr.Textbox(label=f"API Key {i + 1} (if required)", type="password")
|
578 |
+
temperature = gr.Slider(label=f"Temperature {i + 1}", minimum=0.0, maximum=1.0, step=0.05,
|
579 |
+
value=0.7)
|
580 |
+
chatbot = gr.Chatbot(height=800, elem_classes="chat-window")
|
581 |
+
chatbots.append(chatbot)
|
582 |
+
api_endpoints.append(api_endpoint)
|
583 |
+
api_keys.append(api_key)
|
584 |
+
temperatures.append(temperature)
|
585 |
+
|
586 |
+
with gr.Row():
|
587 |
+
msg = gr.Textbox(label="Enter your message", scale=4)
|
588 |
+
submit = gr.Button("Submit", scale=1)
|
589 |
+
# FIXME - clear chat
|
590 |
+
# clear_chat_button = gr.Button("Clear Chat")
|
591 |
+
#
|
592 |
+
# clear_chat_button.click(
|
593 |
+
# clear_chat,
|
594 |
+
# outputs=[chatbot]
|
595 |
+
# )
|
596 |
+
|
597 |
+
# State variables
|
598 |
+
chat_history = [gr.State([]) for _ in range(3)]
|
599 |
+
media_content = gr.State({})
|
600 |
+
selected_parts = gr.State([])
|
601 |
+
conversation_id = gr.State(None)
|
602 |
+
|
603 |
+
# Event handlers
|
604 |
+
search_button.click(
|
605 |
+
fn=update_dropdown,
|
606 |
+
inputs=[search_query_input, search_type_input],
|
607 |
+
outputs=[items_output, item_mapping]
|
608 |
+
)
|
609 |
+
|
610 |
+
preset_prompt.change(update_user_prompt, inputs=preset_prompt, outputs=user_prompt)
|
611 |
+
|
612 |
+
def chat_wrapper_multi(message, custom_prompt, system_prompt, *args):
|
613 |
+
chat_histories = args[:3]
|
614 |
+
chatbots = args[3:6]
|
615 |
+
api_endpoints = args[6:9]
|
616 |
+
api_keys = args[9:12]
|
617 |
+
temperatures = args[12:15]
|
618 |
+
media_content = args[15]
|
619 |
+
selected_parts = args[16]
|
620 |
+
|
621 |
+
new_chat_histories = []
|
622 |
+
new_chatbots = []
|
623 |
+
|
624 |
+
for i in range(3):
|
625 |
+
# Call chat_wrapper with dummy values for conversation_id and save_conversation
|
626 |
+
bot_message, new_history, _ = chat_wrapper(
|
627 |
+
message, chat_histories[i], media_content, selected_parts,
|
628 |
+
api_endpoints[i], api_keys[i], custom_prompt, None, # None for conversation_id
|
629 |
+
False, # False for save_conversation
|
630 |
+
temperature=temperatures[i],
|
631 |
+
system_prompt=system_prompt
|
632 |
+
)
|
633 |
+
|
634 |
+
new_chatbot = chatbots[i] + [(message, bot_message)]
|
635 |
+
|
636 |
+
new_chat_histories.append(new_history)
|
637 |
+
new_chatbots.append(new_chatbot)
|
638 |
+
|
639 |
+
return [gr.update(value="")] + new_chatbots + new_chat_histories
|
640 |
+
|
641 |
+
# In the create_chat_interface_multi_api function:
|
642 |
+
submit.click(
|
643 |
+
chat_wrapper_multi,
|
644 |
+
inputs=[msg, user_prompt,
|
645 |
+
system_prompt] + chat_history + chatbots + api_endpoints + api_keys + temperatures +
|
646 |
+
[media_content, selected_parts],
|
647 |
+
outputs=[msg] + chatbots + chat_history
|
648 |
+
).then(
|
649 |
+
lambda: (gr.update(value=""), gr.update(value="")),
|
650 |
+
outputs=[msg, user_prompt]
|
651 |
+
)
|
652 |
+
|
653 |
+
items_output.change(
|
654 |
+
update_chat_content,
|
655 |
+
inputs=[items_output, use_content, use_summary, use_prompt, item_mapping],
|
656 |
+
outputs=[media_content, selected_parts]
|
657 |
+
)
|
658 |
+
|
659 |
+
for checkbox in [use_content, use_summary, use_prompt]:
|
660 |
+
checkbox.change(
|
661 |
+
update_selected_parts,
|
662 |
+
inputs=[use_content, use_summary, use_prompt],
|
663 |
+
outputs=[selected_parts]
|
664 |
+
)
|
665 |
+
|
666 |
+
|
667 |
+
def create_chat_interface_four():
    """Build the "Four Independent API Chats" tab.

    The tab has one shared preset/user prompt row followed by a 2x2 grid of
    chat windows. Every window owns its endpoint dropdown, API key box,
    temperature slider, chatbot, message box, submit button and history state,
    and each submit button is wired to its own handler so the four
    conversations never share history.
    """
    # NOTE(review): this stylesheet is defined but never passed to Gradio inside
    # this function; it is kept as the reference for the "chat-window" class
    # used by the chatbots below.
    custom_css = """
    .chatbot-container .message-wrap .message {
        font-size: 14px !important;
    }
    .chat-window {
        height: 400px;
        overflow-y: auto;
    }
    """
    with gr.TabItem("Four Independent API Chats"):
        gr.Markdown("# Four Independent API Chat Interfaces")

        with gr.Row():
            with gr.Column():
                preset_prompt = gr.Dropdown(label="Select Preset Prompt", choices=load_preset_prompts(), visible=True)
                user_prompt = gr.Textbox(label="Modify Prompt", lines=3, value=".")
            with gr.Column():
                gr.Markdown("Scroll down for the chat windows...")

        chat_interfaces = []
        for row in range(2):
            with gr.Row():
                for col in range(2):
                    i = row * 2 + col
                    with gr.Column():
                        gr.Markdown(f"### Chat Window {i + 1}")
                        api_endpoint = gr.Dropdown(label=f"API Endpoint {i + 1}",
                                                   choices=["Local-LLM", "OpenAI", "Anthropic", "Cohere", "Groq",
                                                            "DeepSeek", "Mistral", "OpenRouter", "Llama.cpp", "Kobold",
                                                            "Ooba",
                                                            "Tabbyapi", "VLLM", "ollama", "HuggingFace"])
                        api_key = gr.Textbox(label=f"API Key {i + 1} (if required)", type="password")
                        temperature = gr.Slider(label=f"Temperature {i + 1}", minimum=0.0, maximum=1.0, step=0.05,
                                                value=0.7)
                        chatbot = gr.Chatbot(height=400, elem_classes="chat-window")
                        msg = gr.Textbox(label=f"Enter your message for Chat {i + 1}")
                        submit = gr.Button(f"Submit to Chat {i + 1}")

                    chat_interfaces.append({
                        'api_endpoint': api_endpoint,
                        'api_key': api_key,
                        'temperature': temperature,
                        'chatbot': chatbot,
                        'msg': msg,
                        'submit': submit,
                        'chat_history': gr.State([])
                    })

        preset_prompt.change(update_user_prompt, inputs=preset_prompt, outputs=user_prompt)

        def chat_wrapper_single(message, chat_history, api_endpoint, api_key, temperature, user_prompt):
            """Send one message for a single chat window.

            Returns a triple matching the click outputs: an empty string that
            clears the message box, the updated transcript for the chatbot,
            and the same transcript for the history state.
            """
            logging.debug(f"Chat Wrapper Single - Message: {message}, Chat History: {chat_history}")
            new_msg, new_history, _ = chat_wrapper(
                message, chat_history, {}, [],  # Empty media_content and selected_parts
                api_endpoint, api_key, user_prompt, None,  # No conversation_id
                False,  # Not saving conversation
                temperature=temperature, system_prompt=""
            )
            # Build a new list instead of appending to the incoming state value,
            # so the handler never mutates its argument in place.
            updated_history = list(chat_history or []) + [(message, new_msg)]
            return "", updated_history, updated_history

        for interface in chat_interfaces:
            # This runs once while the UI is being built, not on a click.
            logging.debug(f"Chat Interface - Wiring Submit handler for Chat {interface['chatbot']}")
            interface['submit'].click(
                chat_wrapper_single,
                inputs=[
                    interface['msg'],
                    interface['chat_history'],
                    interface['api_endpoint'],
                    interface['api_key'],
                    interface['temperature'],
                    user_prompt
                ],
                outputs=[
                    interface['msg'],
                    interface['chatbot'],
                    interface['chat_history']
                ]
            )
|
746 |
+
|
747 |
+
|
748 |
+
def chat_wrapper_single(message, chat_history, chatbot, api_endpoint, api_key, temperature, media_content,
                        selected_parts, conversation_id, save_conversation, user_prompt):
    """Forward one message to ``chat_wrapper`` and extend the visible transcript.

    Returns ``(reply, transcript, history, conversation_id)``. The transcript
    is ``chatbot`` with a ``(message, reply)`` pair appended when the reply is
    truthy; otherwise ``chatbot`` is returned unchanged.
    """
    reply, history_out, conv_id_out = chat_wrapper(
        message, chat_history, media_content, selected_parts,
        api_endpoint, api_key, user_prompt, conversation_id,
        save_conversation, temperature, system_prompt=""
    )

    # A falsy reply leaves the transcript exactly as it was passed in.
    if not reply:
        return reply, chatbot, history_out, conv_id_out

    return reply, chatbot + [(message, reply)], history_out, conv_id_out
|
762 |
+
|
763 |
+
|
764 |
+
# FIXME - Finish implementing functions + testing/validation
|
765 |
+
def create_chat_management_tab():
    """Build the "Chat Management" tab for searching, viewing and editing chats.

    A conversation is picked from search results, shown as editable JSON plus
    an HTML preview, and can be written back to the ``ChatMessages`` table.

    Returns:
        The tuple ``(search_query, search_button, conversation_list,
        conversation_mapping, chat_content, save_button, result_message,
        chat_preview)`` of created components.
    """
    with gr.TabItem("Chat Management"):
        gr.Markdown("# Chat Management")

        with gr.Row():
            search_query = gr.Textbox(label="Search Conversations")
            search_button = gr.Button("Search")

        conversation_list = gr.Dropdown(label="Select Conversation", choices=[])
        conversation_mapping = gr.State({})

        with gr.Tabs():
            with gr.TabItem("Edit"):
                chat_content = gr.TextArea(label="Chat Content (JSON)", lines=20, max_lines=50)
                save_button = gr.Button("Save Changes")

            with gr.TabItem("Preview"):
                chat_preview = gr.HTML(label="Chat Preview")
        result_message = gr.Markdown("")

        def render_preview(messages):
            """Render messages as an HTML transcript with every field escaped."""
            parts = ["<div style='max-height: 500px; overflow-y: auto;'>"]
            for msg in messages:
                sender = msg['sender']
                try:
                    timestamp = msg['timestamp']
                except (KeyError, IndexError):
                    timestamp = 'N/A'
                sender_style = "background-color: #e6f3ff;" if sender == 'user' else "background-color: #f0f0f0;"
                parts.append(
                    f"<div style='margin-bottom: 10px; padding: 10px; border-radius: 5px; {sender_style}'>")
                # Sender and timestamp come from user-editable JSON, so they are
                # escaped just like the message body.
                parts.append(
                    f"<strong>{html.escape(str(sender))}:</strong> {html.escape(str(msg['message']))}<br>")
                parts.append(f"<small>Timestamp: {html.escape(str(timestamp))}</small>")
                parts.append("</div>")
            parts.append("</div>")
            return "".join(parts)

        def search_conversations(query):
            """Search stored conversations and refresh the dropdown and its label -> id mapping."""
            conversations = search_chat_conversations(query)
            choices = [f"{conv['conversation_name']} (Media: {conv['media_title']}, ID: {conv['id']})" for conv in
                       conversations]
            mapping = {choice: conv['id'] for choice, conv in zip(choices, conversations)}
            return gr.update(choices=choices), mapping

        def load_conversations(selected, conversation_mapping):
            """Return ``(json_text, html_preview)`` for the selected conversation."""
            logging.info(f"Selected: {selected}")
            logging.info(f"Conversation mapping: {conversation_mapping}")

            try:
                if selected and selected in conversation_mapping:
                    conversation_id = conversation_mapping[selected]
                    messages = get_chat_messages(conversation_id)
                    conversation_data = {
                        "conversation_id": conversation_id,
                        "messages": messages
                    }
                    json_content = json.dumps(conversation_data, indent=2)
                    html_preview = render_preview(messages)

                    logging.info("Returning json_content and html_preview")
                    return json_content, html_preview
                else:
                    logging.warning("No conversation selected or not in mapping")
                    return "", "<p>No conversation selected</p>"
            except Exception as e:
                logging.error(f"Error in load_conversations: {str(e)}")
                return f"Error: {str(e)}", "<p>Error loading conversation</p>"

        def validate_conversation_json(content):
            """Parse and validate edited JSON; return ``(True, data)`` or ``(False, reason)``."""
            try:
                data = json.loads(content)
            except json.JSONDecodeError as e:
                return False, f"Invalid JSON: {str(e)}"
            if not isinstance(data, dict):
                return False, "Invalid JSON structure: root should be an object"
            conversation_id = data.get("conversation_id")
            # bool is a subclass of int, so reject it explicitly.
            if isinstance(conversation_id, bool) or not isinstance(conversation_id, int):
                return False, "Missing or invalid conversation_id"
            if not isinstance(data.get("messages"), list):
                return False, "Missing or invalid messages array"
            for msg in data["messages"]:
                # A non-dict entry would otherwise hit substring/iteration
                # semantics of `in` or raise TypeError.
                if not isinstance(msg, dict) or not all(key in msg for key in ["sender", "message"]):
                    return False, "Invalid message structure: missing required fields"
            return True, data

        def save_conversation(selected, conversation_mapping, content):
            """Replace the stored messages of the selected conversation with the edited JSON.

            Returns ``(status_message, html_preview)``.
            """
            if not selected or selected not in conversation_mapping:
                return "Please select a conversation before saving.", "<p>No changes made</p>"

            conversation_id = conversation_mapping[selected]
            is_valid, result = validate_conversation_json(content)

            if not is_valid:
                return f"Error: {result}", "<p>No changes made due to error</p>"

            conversation_data = result
            if conversation_data["conversation_id"] != conversation_id:
                return "Error: Conversation ID mismatch.", "<p>No changes made due to ID mismatch</p>"

            try:
                with db.get_connection() as conn:
                    # Roll back while the connection is still open and bound; if
                    # get_connection() itself fails there is nothing to roll back.
                    try:
                        conn.execute("BEGIN TRANSACTION")
                        cursor = conn.cursor()

                        # TODO: persist a backup of the existing rows before deleting them.

                        # Delete existing messages
                        cursor.execute("DELETE FROM ChatMessages WHERE conversation_id = ?", (conversation_id,))

                        # Insert updated messages
                        for message in conversation_data["messages"]:
                            cursor.execute('''
                                INSERT INTO ChatMessages (conversation_id, sender, message, timestamp)
                                VALUES (?, ?, ?, COALESCE(?, CURRENT_TIMESTAMP))
                            ''', (conversation_id, message["sender"], message["message"], message.get("timestamp")))

                        conn.commit()
                    except Exception:
                        conn.rollback()
                        raise
            except sqlite3.Error as e:
                logging.error(f"Database error in save_conversation: {e}")
                return f"Error updating conversation: {str(e)}", "<p>Error occurred while saving</p>"
            except Exception as e:
                logging.error(f"Unexpected error in save_conversation: {e}")
                return f"Unexpected error: {str(e)}", "<p>Unexpected error occurred</p>"

            # Rendered after the commit so a preview problem cannot be reported
            # as a failed save.
            return "Conversation updated successfully.", render_preview(conversation_data["messages"])

        def parse_formatted_content(formatted_content):
            """Convert the "Role:/Content:" text export into a JSON string.

            Not wired to any event in this tab.
            """
            lines = formatted_content.split('\n')
            conversation_id = int(lines[0].split(': ')[1])
            timestamp = lines[1].split(': ')[1]
            history = []
            current_role = None
            current_content = None
            for line in lines[3:]:
                if line.startswith("Role: "):
                    if current_role is not None:
                        history.append({"role": current_role, "content": ["", current_content]})
                    current_role = line.split(': ')[1]
                elif line.startswith("Content: "):
                    current_content = line.split(': ', 1)[1]
            if current_role is not None:
                history.append({"role": current_role, "content": ["", current_content]})
            return json.dumps({
                "conversation_id": conversation_id,
                "timestamp": timestamp,
                "history": history
            }, indent=2)

        search_button.click(
            search_conversations,
            inputs=[search_query],
            outputs=[conversation_list, conversation_mapping]
        )

        conversation_list.change(
            load_conversations,
            inputs=[conversation_list, conversation_mapping],
            outputs=[chat_content, chat_preview]
        )

        save_button.click(
            save_conversation,
            inputs=[conversation_list, conversation_mapping, chat_content],
            outputs=[result_message, chat_preview]
        )

    return search_query, search_button, conversation_list, conversation_mapping, chat_content, save_button, result_message, chat_preview
|
942 |
+
|
943 |
+
|
944 |
+
# FIXME - busted and incomplete
|
945 |
+
# Mock function to simulate LLM processing
|
946 |
+
def process_with_llm(workflow, context, prompt):
    """Mock LLM call: echo the workflow name and the first 30 characters of context and prompt."""
    context_head = context[:30]
    prompt_head = prompt[:30]
    return f"LLM output for {workflow} with context: {context_head}... and prompt: {prompt_head}..."
|
948 |
+
|
949 |
+
|
950 |
+
# Load workflows from a JSON file
|
951 |
+
json_path = Path('./Helper_Scripts/Workflows/Workflows.json')
# Loaded once at import time. The encoding is pinned to UTF-8 so the result
# does not depend on the platform's default locale encoding.
with json_path.open('r', encoding='utf-8') as f:
    workflows = json.load(f)
|
954 |
+
|
955 |
+
|
956 |
+
# FIXME - broken Completely. Doesn't work.
|
957 |
+
def chat_workflows_tab():
    """Build the "Chat Workflows" tab and return its ``gr.Blocks`` container.

    One prompt/output/button section is pre-created per step of the longest
    workflow in ``workflows``. Selecting a workflow shows as many sections as
    it has prompts and fills them in. Processing step N sends the context
    plus the current text of outputs 1..N-1 to ``process_with_llm``.
    """
    with gr.TabItem("Chat Workflows"):
        with gr.Blocks() as chat_workflows_block:
            gr.Markdown("# Workflows using LLMs")

            workflow_selector = gr.Dropdown(label="Select Workflow", choices=[wf['name'] for wf in workflows])
            context_input = gr.Textbox(label="Context", lines=5)

            # Create lists to hold UI components
            prompt_inputs = []
            process_buttons = []
            output_boxes = []
            # default=0 keeps the tab buildable when no workflows are defined.
            max_prompts = max((len(wf['prompts']) for wf in workflows), default=0)

            def find_workflow(workflow_name):
                """Return the workflow dict with this name, or None if there is none."""
                return next((wf for wf in workflows if wf['name'] == workflow_name), None)

            def process(context, prompt, workflow_name, *previous_outputs):
                """Run one step; the current text of earlier outputs is appended to the context."""
                selected_workflow = find_workflow(workflow_name)
                if selected_workflow is None:
                    return "Please select a workflow first."
                for previous in previous_outputs:
                    context += f"\n\n{previous}"
                return process_with_llm(selected_workflow['name'], context, prompt)

            # Pre-create the maximum number of prompt sections
            for i in range(max_prompts):
                prompt_input = gr.Textbox(label=f"Prompt {i + 1}", lines=2, visible=False)
                output_box = gr.Textbox(label=f"Output {i + 1}", lines=5, visible=False)
                process_button = gr.Button(f"Process Prompt {i + 1}", visible=False)

                # Earlier output boxes are passed as inputs so the handler sees
                # what they currently contain, not their construction-time value.
                process_button.click(
                    fn=process,
                    inputs=[context_input, prompt_input, workflow_selector] + output_boxes,
                    outputs=[output_box]
                )

                prompt_inputs.append(prompt_input)
                output_boxes.append(output_box)
                process_buttons.append(process_button)

            def update_prompt_sections(workflow_name):
                """Return one update per component, ordered prompts, then buttons, then outputs."""
                selected_workflow = find_workflow(workflow_name)
                prompts = selected_workflow['prompts'] if selected_workflow else []

                prompt_updates = []
                button_updates = []
                output_updates = []
                for i in range(max_prompts):
                    shown = i < len(prompts)
                    if shown:
                        prompt_updates.append(gr.update(visible=True, value=prompts[i]))
                    else:
                        prompt_updates.append(gr.update(visible=False))
                    button_updates.append(gr.update(visible=shown))
                    output_updates.append(gr.update(visible=shown))
                return prompt_updates + button_updates + output_updates

            # Bind the workflow selector to update the UI. Changes must be
            # returned as updates for listed outputs; setting attributes on the
            # component objects does not reach the browser.
            if max_prompts:
                workflow_selector.change(
                    update_prompt_sections,
                    inputs=[workflow_selector],
                    outputs=prompt_inputs + process_buttons + output_boxes
                )

    return chat_workflows_block
|
1014 |
+
|
1015 |
+
#
|
1016 |
+
# End of Chat_ui.py
|
1017 |
+
#######################################################################################################################
|
App_Function_Libraries/Gradio_UI/Explain_summarize_tab.py
ADDED
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Explain_summarize_tab.py
|
2 |
+
# Gradio UI for explaining and summarizing text
|
3 |
+
#
|
4 |
+
# Imports
|
5 |
+
import logging
|
6 |
+
#
|
7 |
+
# External Imports
|
8 |
+
import gradio as gr
|
9 |
+
#
|
10 |
+
# Local Imports
|
11 |
+
from App_Function_Libraries.Local_Summarization_Lib import summarize_with_llama, summarize_with_kobold, \
|
12 |
+
summarize_with_oobabooga, summarize_with_tabbyapi, summarize_with_vllm, summarize_with_local_llm, \
|
13 |
+
summarize_with_ollama
|
14 |
+
from App_Function_Libraries.Summarization_General_Lib import summarize_with_openai, summarize_with_anthropic, \
|
15 |
+
summarize_with_cohere, summarize_with_groq, summarize_with_openrouter, summarize_with_deepseek, \
|
16 |
+
summarize_with_huggingface
|
17 |
+
#
|
18 |
+
#
|
19 |
+
############################################################################################################
|
20 |
+
#
|
21 |
+
# Functions:
|
22 |
+
|
23 |
+
def create_summarize_explain_tab():
    """Build the "Explain/Summarize Text" tab.

    The left column holds the text box, the two task checkboxes, the API
    selector, the key field and the action button. The right column holds the
    summary and explanation outputs. The button calls
    ``summarize_explain_text``.
    """
    endpoint_choices = [None, "Local-LLM", "OpenAI", "Anthropic", "Cohere", "Groq", "DeepSeek", "Mistral",
                        "OpenRouter",
                        "Llama.cpp", "Kobold", "Ooba", "Tabbyapi", "VLLM", "ollama", "HuggingFace"]

    with gr.TabItem("Explain/Summarize Text"):
        gr.Markdown("# Explain or Summarize Text without ingesting it into the DB")
        with gr.Row():
            # Input side
            with gr.Column():
                text_to_work_input = gr.Textbox(
                    label="Text to be Explained or Summarized",
                    placeholder="Enter the text you want explained or summarized here",
                    lines=20,
                )
                with gr.Row():
                    explanation_checkbox = gr.Checkbox(label="Explain Text", value=True)
                    summarization_checkbox = gr.Checkbox(label="Summarize Text", value=True)
                api_endpoint = gr.Dropdown(
                    choices=endpoint_choices,
                    value=None,
                    label="API for Summarization (Optional)",
                )
                api_key_input = gr.Textbox(
                    label="API Key (if required)",
                    placeholder="Enter your API key here",
                    type="password",
                )
                explain_summarize_button = gr.Button("Explain/Summarize")

            # Output side
            with gr.Column():
                summarization_output = gr.Textbox(label="Summary:", lines=20)
                explanation_output = gr.Textbox(label="Explanation:", lines=50)

        handler_inputs = [
            text_to_work_input,
            api_endpoint,
            api_key_input,
            summarization_checkbox,
            explanation_checkbox,
        ]
        handler_outputs = [summarization_output, explanation_output]
        explain_summarize_button.click(
            fn=summarize_explain_text,
            inputs=handler_inputs,
            outputs=handler_outputs,
        )
|
54 |
+
|
55 |
+
|
56 |
+
# System prompt used for the summarization request.
_BULLETED_NOTES_SYSTEM_PROMPT = """<s>You are a bulleted notes specialist. [INST]```When creating comprehensive bulleted notes, you should follow these guidelines: Use multiple headings based on the referenced topics, not categories like quotes or terms. Headings should be surrounded by bold formatting and not be listed as bullet points themselves. Leave no space between headings and their corresponding list items underneath. Important terms within the content should be emphasized by setting them in bold font. Any text that ends with a colon should also be bolded. Before submitting your response, review the instructions, and make any corrections necessary to adhered to the specified format. Do not reference these instructions within the notes.``` \nBased on the content between backticks create comprehensive bulleted notes.[/INST]
**Bulleted Note Creation Guidelines**

**Headings**:
- Based on referenced topics, not categories like quotes or terms
- Surrounded by **bold** formatting
- Not listed as bullet points
- No space between headings and list items underneath

**Emphasis**:
- **Important terms** set in bold font
- **Text ending in a colon**: also bolded

**Review**:
- Ensure adherence to specified format
- Do not reference these instructions in your response.</s>[INST] {{ .Prompt }} [/INST]"""

# System prompt used for the explanation request.
_EXPLANATION_SYSTEM_PROMPT = """You are a professional teacher. Please explain the content presented in an easy to digest fashion so that a non-specialist may understand it."""


def _request_llm_text(api_endpoint, api_key, input_data, user_prompt, temp, system_prompt):
    """Send one request to the backend named by ``api_endpoint`` and return its response.

    Raises:
        ValueError: if ``api_endpoint`` does not name a supported backend.
    """
    endpoint = api_endpoint.lower()
    # Argument order differs between backends; each call mirrors the
    # signature that backend is invoked with in this module.
    if endpoint == 'openai':
        return summarize_with_openai(api_key, input_data, user_prompt, temp, system_prompt)
    elif endpoint == "anthropic":
        return summarize_with_anthropic(api_key, input_data, user_prompt, temp, system_prompt)
    elif endpoint == "cohere":
        return summarize_with_cohere(api_key, input_data, user_prompt, temp, system_prompt)
    elif endpoint == "groq":
        return summarize_with_groq(api_key, input_data, user_prompt, temp, system_prompt)
    elif endpoint == "openrouter":
        return summarize_with_openrouter(api_key, input_data, user_prompt, temp, system_prompt)
    elif endpoint == "deepseek":
        return summarize_with_deepseek(api_key, input_data, user_prompt, temp, system_prompt)
    elif endpoint == "llama.cpp":
        return summarize_with_llama(input_data, user_prompt, temp, system_prompt)
    elif endpoint == "kobold":
        return summarize_with_kobold(input_data, api_key, user_prompt, temp, system_prompt)
    elif endpoint == "ooba":
        return summarize_with_oobabooga(input_data, api_key, user_prompt, temp, system_prompt)
    elif endpoint == "tabbyapi":
        return summarize_with_tabbyapi(input_data, user_prompt, temp, system_prompt)
    elif endpoint == "vllm":
        return summarize_with_vllm(input_data, user_prompt, system_prompt)
    elif endpoint == "local-llm":
        return summarize_with_local_llm(input_data, user_prompt, temp, system_prompt)
    elif endpoint == "huggingface":
        return summarize_with_huggingface(api_key, input_data, user_prompt, temp)  # , system_prompt)
    elif endpoint == "ollama":
        return summarize_with_ollama(input_data, user_prompt, temp, system_prompt)
    raise ValueError(f"Unsupported API endpoint: {api_endpoint}")


def _run_text_task(requested, label, noun, action, api_endpoint, api_key, input_data, user_prompt, temp,
                   system_prompt):
    """Run one optional task (summary or explanation) and return its display string."""
    if not requested:
        return f"{label}: No {noun} requested"
    try:
        logging.info(f"Debug - Chat Function - API Endpoint: {api_endpoint}")
        result = _request_llm_text(api_endpoint, api_key, input_data, user_prompt, temp, system_prompt)
    except Exception as e:
        # Best effort per task: report the failure in this task's output box
        # and let the other task still run.
        logging.error(f"Error in {action}: {str(e)}")
        return f"An error occurred during {action}: {str(e)}"
    if result:
        return f"{label}: {result}"
    return f"{label}: No {noun} returned"


def summarize_explain_text(message, api_endpoint, api_key, summarization, explanation):
    """Summarize and/or explain ``message`` using the selected API endpoint.

    Args:
        message: Text to work on.
        api_endpoint: Name of the backend chosen in the UI; falsy if none.
        api_key: API key for the backend; may be empty or None.
        summarization: Whether a summary is requested.
        explanation: Whether an explanation is requested.

    Returns:
        Always a 2-tuple ``(summary_text, explanation_text)``, so the result
        fits the two output components of the calling tab. Each element is
        the task's result, a "not requested" notice, or an error message.
    """
    temp = 0.7
    user_prompt = " "
    try:
        logging.info(f"Debug - summarize_explain_text Function - Message: {message}")
        logging.info(f"Debug - summarize_explain_text Function - API Endpoint: {api_endpoint}")

        # Prepare the input for the API
        input_data = f"User: {message}\n"
        # Print first 500 chars
        logging.info(f"Debug - Chat Function - Input Data: {input_data[:500]}...")
        # Only record whether a key was supplied: the key may be None, and no
        # part of a secret belongs in the logs.
        logging.debug(f"Debug - Chat Function - API Key provided: {bool(api_key)}")

        if not api_endpoint:
            return "Please select an API endpoint", "Please select an API endpoint"

        response1 = _run_text_task(summarization, "Summary", "summary", "summarization",
                                   api_endpoint, api_key, input_data, user_prompt, temp,
                                   _BULLETED_NOTES_SYSTEM_PROMPT)
        response2 = _run_text_task(explanation, "Explanation", "explanation", "explanation",
                                   api_endpoint, api_key, input_data, user_prompt, temp,
                                   _EXPLANATION_SYSTEM_PROMPT)
        return response1, response2

    except Exception as e:
        logging.error(f"Error in chat function: {str(e)}")
        error_message = f"An error occurred: {str(e)}"
        return error_message, error_message
|
App_Function_Libraries/Gradio_UI/Export_Functionality.py
ADDED
@@ -0,0 +1,314 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Export_Functionality.py
|
2 |
+
# Functionality for exporting items as markdown files
|
3 |
+
import os
|
4 |
+
import json
|
5 |
+
import math
|
6 |
+
import logging
|
7 |
+
import shutil
|
8 |
+
import tempfile
|
9 |
+
from typing import List, Dict, Optional, Tuple
|
10 |
+
import gradio as gr
|
11 |
+
from App_Function_Libraries.DB_Manager import DatabaseError, create_automated_backup, db_path, backup_dir
|
12 |
+
from App_Function_Libraries.Gradio_UI.Gradio_Shared import fetch_item_details, fetch_items_by_keyword, browse_items
|
13 |
+
|
14 |
+
logger = logging.getLogger(__name__)
|
15 |
+
|
16 |
+
def export_item_as_markdown(media_id: int) -> Tuple[Optional[str], str]:
    """Write a single media item to ``export_item_<media_id>.md``.

    The file is created relative to the current working directory.

    Returns:
        ``(filename, status_message)`` on success, or ``(None, error_message)``
        if fetching the item or writing the file raises.
    """
    try:
        content, prompt, summary = fetch_item_details(media_id)
        # The stored title is not looked up here; a placeholder heading is used.
        heading = f"Item {media_id}"
        document = f"# {heading}\n\n## Prompt\n{prompt}\n\n## Summary\n{summary}\n\n## Content\n{content}"

        filename = f"export_item_{media_id}.md"
        with open(filename, "w", encoding='utf-8') as out_file:
            out_file.write(document)

        status = f"Successfully exported item {media_id} to {filename}"
        logger.info(status)
        return filename, status
    except Exception as e:
        error_message = f"Error exporting item {media_id}: {str(e)}"
        logger.error(error_message)
        return None, error_message
|
32 |
+
|
33 |
+
|
34 |
+
def _safe_filename_component(text, max_length: int = 50) -> str:
    """Return *text* reduced to characters that are safe inside a file name."""
    cleaned = "".join(ch if ch.isalnum() or ch in " -_." else "_" for ch in str(text))
    cleaned = cleaned.strip(" .")[:max_length]
    return cleaned or "untitled"


def export_items_by_keyword(keyword: str) -> Optional[str]:
    """Export every item matching *keyword* as markdown files bundled in a zip.

    Each item becomes ``<id>_<title>.md`` inside a folder named after the
    keyword; the folder is zipped and the archive is moved into the current
    working directory.

    Returns:
        Path of the zip file, or ``None`` when no items match or an error occurs
        (the error is logged).
    """
    try:
        items = fetch_items_by_keyword(keyword)
        if not items:
            logger.warning(f"No items found for keyword: {keyword}")
            return None

        # The keyword is user input: keep path separators out of the folder name.
        folder_name = f"export_keyword_{_safe_filename_component(keyword)}"

        # Build the individual markdown files inside a throw-away directory.
        with tempfile.TemporaryDirectory() as temp_dir:
            export_folder = os.path.join(temp_dir, folder_name)
            os.makedirs(export_folder)

            for item in items:
                # Rows are (id, title, url); positional access works for plain
                # tuples as well as sqlite3.Row objects.
                item_id, title = item[0], item[1]
                title = title if title else "untitled"
                content, prompt, summary = fetch_item_details(item_id)
                markdown_content = (
                    f"# {title}\n\n## Prompt\n{prompt}\n\n## Summary\n{summary}\n\n## Content\n{content}"
                )

                # Titles may contain characters that are invalid in file names.
                file_name = f"{item_id}_{_safe_filename_component(title)}.md"
                file_path = os.path.join(export_folder, file_name)
                with open(file_path, "w", encoding='utf-8') as f:
                    f.write(markdown_content)

            # Zip the folder, then move the archive out before temp_dir is removed.
            zip_filename = f"{folder_name}.zip"
            archive_path = shutil.make_archive(os.path.join(temp_dir, folder_name), 'zip', export_folder)
            final_zip_path = os.path.join(os.getcwd(), zip_filename)
            shutil.move(archive_path, final_zip_path)

        logger.info(f"Successfully exported {len(items)} items for keyword '{keyword}' to {zip_filename}")
        return final_zip_path
    except Exception as e:
        logger.error(f"Error exporting items for keyword '{keyword}': {str(e)}")
        return None
|
70 |
+
|
71 |
+
|
72 |
+
def export_selected_items(selected_items: List[Dict]) -> Tuple[Optional[str], str]:
    """Combine the selected items into one ``export_selected_items.md`` file.

    Each entry may be a JSON string, a dict wrapping the payload under
    ``'value'`` (dict or JSON string), or the payload dict itself. Entries that
    fail to process are replaced by an error section; entries without an
    ``'id'`` are skipped.

    Returns:
        ``(filename, status_message)`` on success, otherwise
        ``(None, error_message)``.
    """
    try:
        logger.debug(f"Received selected_items: {selected_items}")
        if not selected_items:
            logger.warning("No items selected for export")
            return None, "No items selected for export"

        sections = ["# Selected Items\n\n"]
        for item in selected_items:
            logger.debug(f"Processing item: {item}")
            try:
                # Normalise the three accepted input shapes into one dict.
                if isinstance(item, str):
                    item_data = json.loads(item)
                elif isinstance(item, dict) and 'value' in item:
                    wrapped = item['value']
                    item_data = wrapped if isinstance(wrapped, dict) else json.loads(wrapped)
                else:
                    item_data = item

                logger.debug(f"Item data after processing: {item_data}")

                if 'id' not in item_data:
                    logger.error(f"'id' not found in item data: {item_data}")
                    continue

                content, prompt, summary = fetch_item_details(item_data['id'])
                heading = item_data.get('title', 'Item {}'.format(item_data['id']))
                sections.append(
                    f"## {heading}\n\n### Prompt\n{prompt}\n\n### Summary\n{summary}\n\n### Content\n{content}\n\n---\n\n"
                )
            except Exception as e:
                logger.error(f"Error processing item {item}: {str(e)}")
                sections.append("## Error\n\nUnable to process this item.\n\n---\n\n")

        filename = "export_selected_items.md"
        with open(filename, "w", encoding='utf-8') as out_file:
            out_file.write("".join(sections))

        logger.info(f"Successfully exported {len(selected_items)} selected items to {filename}")
        return filename, f"Successfully exported {len(selected_items)} items to {filename}"
    except Exception as e:
        error_message = f"Error exporting selected items: {str(e)}"
        logger.error(error_message)
        return None, error_message
|
113 |
+
|
114 |
+
|
115 |
+
def display_search_results_export_tab(search_query: str, search_type: str, page: int = 1, items_per_page: int = 10):
    """Search the media store and return one page of checkbox choices.

    Args:
        search_query: Text to search for.
        search_type: Search mode forwarded to ``browse_items``.
        page: 1-based page number; clamped into ``[1, total_pages]`` so that a
            stale page number from an earlier search cannot produce an empty page.
        items_per_page: Page size; values below 1 are treated as 1.

    Returns:
        ``(checkbox_data, status_message, page, total_pages)``. On no results or
        on error the first element is ``[]`` and both page values are 1.
    """
    logger.info(f"Searching with query: '{search_query}', type: '{search_type}', page: {page}")
    try:
        results = browse_items(search_query, search_type)

        # Check emptiness before len() so a None result is handled as well.
        if not results:
            logger.info("browse_items returned 0 results")
            return [], f"No results found for query: '{search_query}'", 1, 1
        logger.info(f"browse_items returned {len(results)} results")

        items_per_page = max(1, int(items_per_page))
        total_pages = math.ceil(len(results) / items_per_page)
        page = max(1, min(total_pages, int(page or 1)))

        start_index = (page - 1) * items_per_page
        paginated_results = results[start_index:start_index + items_per_page]

        # Rows are (id, title, url).
        checkbox_data = [
            {
                "name": f"Name: {item[1]}\nURL: {item[2]}",
                "value": {"id": item[0], "title": item[1], "url": item[2]}
            }
            for item in paginated_results
        ]

        logger.info(f"Returning {len(checkbox_data)} items for checkbox (page {page} of {total_pages})")
        return checkbox_data, f"Found {len(results)} results (showing page {page} of {total_pages})", page, total_pages

    except DatabaseError as e:
        error_message = f"Error in display_search_results_export_tab: {str(e)}"
        logger.error(error_message)
        return [], error_message, 1, 1
    except Exception as e:
        error_message = f"Unexpected error in display_search_results_export_tab: {str(e)}"
        logger.error(error_message)
        return [], error_message, 1, 1
|
148 |
+
|
149 |
+
|
150 |
+
def create_export_tab():
    """Build the "Search and Export" tab and wire up its event handlers.

    Every handler returns exactly as many values as its ``outputs`` list
    declares: the search handlers update the checkbox choices, status text,
    current page and total pages; the export handlers update the download file
    and the status text.
    """
    with gr.Tab("Search and Export"):
        with gr.Row():
            with gr.Column():
                gr.Markdown("# Search and Export Items")
                gr.Markdown("Search for items and export them as markdown files")
                gr.Markdown("You can also export items by keyword")
                search_query = gr.Textbox(label="Search Query")
                search_type = gr.Radio(["Title", "URL", "Keyword", "Content"], label="Search By")
                search_button = gr.Button("Search")

            with gr.Column():
                prev_button = gr.Button("Previous Page")
                next_button = gr.Button("Next Page")

        current_page = gr.State(1)
        total_pages = gr.State(1)

        search_results = gr.CheckboxGroup(label="Search Results", choices=[])
        export_selected_button = gr.Button("Export Selected Items")

        keyword_input = gr.Textbox(label="Enter keyword for export")
        export_by_keyword_button = gr.Button("Export items by keyword")

        export_output = gr.File(label="Download Exported File")
        error_output = gr.Textbox(label="Status/Error Messages", interactive=False)

        def search_and_update(query, search_type, page):
            """Run the search and return one value per entry in ``search_outputs``."""
            results, message, current, total = display_search_results_export_tab(query, search_type, page)
            logger.debug(f"search_and_update results: {results}")
            # Replace the choices and clear any selection left from the previous page.
            return gr.update(choices=results, value=[]), message, current, total

        # Shared by the search button and both paging chains so they cannot drift apart.
        search_inputs = [search_query, search_type, current_page]
        search_outputs = [search_results, error_output, current_page, total_pages]

        search_button.click(
            fn=search_and_update,
            inputs=search_inputs,
            outputs=search_outputs,
            show_progress="full"
        )

        def update_page(current, total, direction):
            """Move one page in *direction*, staying within ``[1, total]``."""
            return max(1, min(total, current + direction))

        prev_button.click(
            fn=update_page,
            inputs=[current_page, total_pages, gr.State(-1)],
            outputs=[current_page]
        ).then(
            fn=search_and_update,
            inputs=search_inputs,
            outputs=search_outputs,
            show_progress=True
        )

        next_button.click(
            fn=update_page,
            inputs=[current_page, total_pages, gr.State(1)],
            outputs=[current_page]
        ).then(
            fn=search_and_update,
            inputs=search_inputs,
            outputs=search_outputs,
            show_progress=True
        )

        def handle_export_selected(selected_items):
            logger.debug(f"Exporting selected items: {selected_items}")
            return export_selected_items(selected_items)

        export_selected_button.click(
            fn=handle_export_selected,
            inputs=[search_results],
            outputs=[export_output, error_output],
            show_progress="full"
        )

        def handle_export_by_keyword(keyword):
            """Adapt the single-value keyword export to the (file, status) outputs."""
            zip_path = export_items_by_keyword(keyword)
            if zip_path is None:
                return None, f"No items exported for keyword '{keyword}' (see log for details)"
            return zip_path, f"Exported items for keyword '{keyword}' to {zip_path}"

        export_by_keyword_button.click(
            fn=handle_export_by_keyword,
            inputs=[keyword_input],
            outputs=[export_output, error_output],
            show_progress="full"
        )

        def handle_item_selection(selected_items):
            """Export the first selected item as its own markdown file."""
            logger.debug(f"Selected items: {selected_items}")
            if not selected_items:
                return None, "No item selected"

            try:
                # NOTE(review): assumes each selected entry is a dict carrying a
                # 'value' payload (dict or JSON string) - confirm against the
                # installed Gradio version.
                selected_item = selected_items[0]
                logger.debug(f"First selected item: {selected_item}")

                if isinstance(selected_item['value'], str):
                    item_data = json.loads(selected_item['value'])
                else:
                    item_data = selected_item['value']

                logger.debug(f"Item data: {item_data}")

                item_id = item_data['id']
                return export_item_as_markdown(item_id)
            except Exception as e:
                error_message = f"Error processing selected item: {str(e)}"
                logger.error(error_message)
                return None, error_message

        search_results.select(
            fn=handle_item_selection,
            inputs=[search_results],
            outputs=[export_output, error_output],
            show_progress="full"
        )
|
265 |
+
|
266 |
+
|
267 |
+
|
268 |
+
def create_backup():
    """Create a database backup and return a status line naming the file."""
    created = create_automated_backup(db_path, backup_dir)
    return f"Backup created: {created}"
|
271 |
+
|
272 |
+
def list_backups():
    """Return the ``.db`` file names in the backup directory, one per line.

    Names are sorted so the listing is stable between calls. An empty string is
    returned when the backup directory does not exist yet.
    """
    if not os.path.isdir(backup_dir):
        return ""
    backups = sorted(f for f in os.listdir(backup_dir) if f.endswith('.db'))
    return "\n".join(backups)
|
275 |
+
|
276 |
+
def restore_backup(backup_name):
    """Overwrite the live database with the named backup file.

    Args:
        backup_name: Bare file name of a backup inside the backup directory.
            Names containing directory components are rejected so the restore
            cannot read files outside that directory.

    Returns:
        A status message: success, or "Backup file not found".
    """
    # Reject empty names and anything that is not a plain file name.
    if not backup_name or os.path.basename(backup_name) != backup_name:
        return "Backup file not found"

    backup_path = os.path.join(backup_dir, backup_name)
    # isfile (not exists) so a directory is never handed to copy2.
    if os.path.isfile(backup_path):
        shutil.copy2(backup_path, db_path)
        return f"Database restored from {backup_name}"
    return "Backup file not found"
|
283 |
+
|
284 |
+
|
285 |
+
def create_backup_tab():
    """Build the "Create Backup" tab: a button plus a result textbox."""
    with gr.Tab("Create Backup"):
        gr.Markdown("# Create a backup of the database")
        with gr.Row():
            with gr.Column():
                create_button = gr.Button("Create Backup")
                create_output = gr.Textbox(label="Result")
            # The second column holds no components; only the click handler is
            # registered while it is open.
            with gr.Column():
                create_button.click(create_backup, inputs=[], outputs=create_output)
|
294 |
+
|
295 |
+
def create_view_backups_tab():
    """Build the "View Backups" tab: a button that lists available backups."""
    with gr.TabItem("View Backups"):
        gr.Markdown("# Browse available backups")
        with gr.Row():
            with gr.Column():
                view_button = gr.Button("View Backups")
            with gr.Column():
                backup_list = gr.Textbox(label="Available Backups")
                # Fill the textbox with the output of list_backups on click.
                view_button.click(list_backups, inputs=[], outputs=backup_list)
|
304 |
+
|
305 |
+
|
306 |
+
def create_restore_backup_tab():
    """Build the "Restore Backup" tab: filename input, button and result box."""
    with gr.TabItem("Restore Backup"):
        gr.Markdown("# Restore a backup of the database")
        with gr.Column():
            backup_input = gr.Textbox(label="Backup Filename")
            restore_button = gr.Button("Restore")
        with gr.Column():
            restore_output = gr.Textbox(label="Result")
            # Pass the typed filename to restore_backup and show its status message.
            restore_button.click(restore_backup, inputs=[backup_input], outputs=restore_output)
|
App_Function_Libraries/Gradio_UI/Gradio_Shared.py
ADDED
@@ -0,0 +1,284 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Gradio_Shared.py
|
2 |
+
# Gradio UI functions that are shared across multiple tabs
|
3 |
+
#
|
4 |
+
# Imports
|
5 |
+
import logging
|
6 |
+
import sqlite3
|
7 |
+
import traceback
|
8 |
+
from functools import wraps
|
9 |
+
from typing import List, Tuple
|
10 |
+
#
|
11 |
+
# External Imports
|
12 |
+
import gradio as gr
|
13 |
+
#
|
14 |
+
# Local Imports
|
15 |
+
from App_Function_Libraries.DB_Manager import list_prompts, db, search_and_display, fetch_prompt_details
|
16 |
+
from App_Function_Libraries.SQLite_DB import DatabaseError
|
17 |
+
from App_Function_Libraries.Utils import format_transcription
|
18 |
+
#
|
19 |
+
##############################################################################################################
|
20 |
+
#
|
21 |
+
# Functions:
|
22 |
+
|
23 |
+
# Whisper model identifiers; each name appears exactly once.
whisper_models = ["small", "medium", "small.en", "medium.en", "large", "large-v1", "large-v2", "large-v3",
                  "distil-large-v2", "distil-medium.en", "distil-small.en"]

# Sample data
prompts_category_1 = [
    "What are the key points discussed in the video?",
    "Summarize the main arguments made by the speaker.",
    "Describe the conclusions of the study presented."
]

prompts_category_2 = [
    "How does the proposed solution address the problem?",
    "What are the implications of the findings?",
    "Can you explain the theory behind the observed phenomenon?"
]

# Both sample categories combined, category 1 first.
all_prompts = prompts_category_1 + prompts_category_2
|
40 |
+
|
41 |
+
|
42 |
+
|
43 |
+
#FIXME - SQL Functions that need to be addressed/added to DB manager
|
44 |
+
def search_media(query, fields, keyword, page):
    """Forward a search to ``search_and_display`` and return its result.

    On any exception the error is logged on the root logger and the exception
    text is returned instead of results.
    """
    try:
        return search_and_display(query, fields, keyword, page)
    except Exception as e:
        logging.getLogger().error(f"Error searching media: {e}")
        return str(e)
|
52 |
+
|
53 |
+
def fetch_items_by_title_or_url(search_query: str, search_type: str):
    """Return ``(id, title, url)`` rows whose title or URL contains *search_query*.

    Args:
        search_query: Substring to look for (wrapped in ``%`` for ``LIKE``).
        search_type: ``'Title'`` or ``'URL'``; any other value yields ``[]``.

    Raises:
        DatabaseError: If the underlying sqlite3 call fails.
    """
    # Fixed statements per search type; the user's text is only ever bound as a parameter.
    statements = {
        'Title': "SELECT id, title, url FROM Media WHERE title LIKE ?",
        'URL': "SELECT id, title, url FROM Media WHERE url LIKE ?",
    }
    statement = statements.get(search_type)
    if statement is None:
        return []

    try:
        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute(statement, (f'%{search_query}%',))
            return cursor.fetchall()
    except sqlite3.Error as e:
        raise DatabaseError(f"Error fetching items by {search_type}: {e}") from e
|
65 |
+
|
66 |
+
def fetch_items_by_keyword(search_query: str):
    """Return ``(id, title, url)`` rows for media tagged with a matching keyword.

    A keyword matches when it contains *search_query* (SQL ``LIKE``).

    Raises:
        DatabaseError: If the underlying sqlite3 call fails.
    """
    try:
        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT m.id, m.title, m.url
                FROM Media m
                JOIN MediaKeywords mk ON m.id = mk.media_id
                JOIN Keywords k ON mk.keyword_id = k.id
                WHERE k.keyword LIKE ?
            """, (f'%{search_query}%',))
            return cursor.fetchall()
    except sqlite3.Error as e:
        # Chain the cause so the sqlite3 traceback is preserved.
        raise DatabaseError(f"Error fetching items by keyword: {e}") from e
|
81 |
+
|
82 |
+
# FIXME - Raw SQL not using DB_Manager...
|
83 |
+
def fetch_items_by_content(search_query: str):
    """Return ``(id, title, url)`` rows whose content contains *search_query*.

    Raises:
        DatabaseError: If the underlying sqlite3 call fails.
    """
    try:
        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("SELECT id, title, url FROM Media WHERE content LIKE ?", (f'%{search_query}%',))
            return cursor.fetchall()
    except sqlite3.Error as e:
        # Chain the cause so the sqlite3 traceback is preserved.
        raise DatabaseError(f"Error fetching items by content: {e}") from e
|
92 |
+
|
93 |
+
|
94 |
+
|
95 |
+
# FIXME - RAW SQL not using DB_Manager...
|
96 |
+
def fetch_item_details_single(media_id: int):
    """Return ``(prompt, summary, content)`` for one media item.

    Prompt and summary come from the most recent ``MediaModifications`` row for
    the item; content comes from ``Media``. A missing row yields ``""`` for the
    corresponding fields.

    Raises:
        DatabaseError: If the underlying sqlite3 call fails. This matches the
            other fetch helpers in this module; callers catching ``Exception``
            are unaffected.
    """
    try:
        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT prompt, summary
                FROM MediaModifications
                WHERE media_id = ?
                ORDER BY modification_date DESC
                LIMIT 1
            """, (media_id,))
            prompt_summary_result = cursor.fetchone()
            cursor.execute("SELECT content FROM Media WHERE id = ?", (media_id,))
            content_result = cursor.fetchone()

            prompt, summary = prompt_summary_result[:2] if prompt_summary_result else ("", "")
            content = content_result[0] if content_result else ""

            return prompt, summary, content
    except sqlite3.Error as e:
        raise DatabaseError(f"Error fetching item details: {e}") from e
|
118 |
+
|
119 |
+
|
120 |
+
# FIXME - RAW SQL not using DB_Manager...
|
121 |
+
def fetch_item_details(media_id: int):
    """Return ``(content, prompt, summary)`` for one media item.

    Prompt and summary come from the latest ``MediaModifications`` row, content
    from ``Media``; missing rows produce empty strings. A sqlite3 error is
    logged and reported as three empty strings rather than raised.
    """
    latest_modification_sql = """
        SELECT prompt, summary
        FROM MediaModifications
        WHERE media_id = ?
        ORDER BY modification_date DESC
        LIMIT 1
    """
    try:
        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute(latest_modification_sql, (media_id,))
            modification_row = cursor.fetchone()
            cursor.execute("SELECT content FROM Media WHERE id = ?", (media_id,))
            media_row = cursor.fetchone()

            if modification_row:
                prompt = modification_row[0]
                summary = modification_row[1]
            else:
                prompt = ""
                summary = ""
            content = media_row[0] if media_row else ""

            return content, prompt, summary
    except sqlite3.Error as e:
        logging.error(f"Error fetching item details: {e}")
        return "", "", ""  # Return empty strings if there's an error
|
144 |
+
|
145 |
+
# Handle prompt selection
|
146 |
+
def handle_prompt_selection(prompt):
    """Return a confirmation message echoing the selected prompt."""
    message = "You selected: " + f"{prompt}"
    return message
|
148 |
+
|
149 |
+
|
150 |
+
def update_user_prompt(preset_name):
    """Look up a preset prompt and return its fields as a dict.

    The dict has the keys ``title``, ``details``, ``system_prompt`` and
    ``user_prompt``; every value is ``""`` when the preset is not found.
    """
    details = fetch_prompt_details(preset_name)
    if not details:
        return {"title": "", "details": "", "system_prompt": "", "user_prompt": ""}

    # The user prompt is optional: only read it when a fourth element exists.
    user_prompt = details[3] if len(details) > 3 else ""
    return {
        "title": details[0],
        "details": details[1],
        "system_prompt": details[2],
        "user_prompt": user_prompt
    }
|
161 |
+
|
162 |
+
def browse_items(search_query, search_type):
    """Dispatch a search to the fetch helper that matches *search_type*.

    ``'Keyword'`` and ``'Content'`` have dedicated helpers; every other value
    is passed on to ``fetch_items_by_title_or_url``.
    """
    if search_type == 'Keyword':
        return fetch_items_by_keyword(search_query)
    if search_type == 'Content':
        return fetch_items_by_content(search_query)
    return fetch_items_by_title_or_url(search_query, search_type)
|
170 |
+
|
171 |
+
|
172 |
+
def update_dropdown(search_query, search_type):
    """Search and build dropdown labels plus a label-to-id mapping.

    Returns:
        ``(gr.update(choices=labels), mapping)`` where each label is
        ``"<title> (<url>)"`` and the mapping resolves a label to its media id.
    """
    item_options = []
    new_item_mapping = {}
    for item in browse_items(search_query, search_type):
        label = f"{item[1]} ({item[2]})"
        item_options.append(label)
        new_item_mapping[label] = item[0]
    print(f"Debug - Update Dropdown - New Item Mapping: {new_item_mapping}")
    return gr.update(choices=item_options), new_item_mapping
|
178 |
+
|
179 |
+
|
180 |
+
|
181 |
+
def get_media_id(selected_item, item_mapping):
    """Return the media id mapped to *selected_item*, or ``None`` if unmapped."""
    media_id = item_mapping.get(selected_item)
    return media_id
|
183 |
+
|
184 |
+
|
185 |
+
def update_detailed_view(item, item_mapping):
    """Build the HTML shown in the detailed view for the selected item.

    Args:
        item: Label of the selected item (falsy when nothing is selected).
        item_mapping: Mapping from label to media id.

    Returns:
        ``(details_html, content_html)``. Both elements are
        ``"No item selected"`` when no id can be resolved, and
        ``"No details available."`` when the item has no content, prompt or summary.
    """
    if not item:
        return "No item selected", "No item selected"

    item_id = item_mapping.get(item)
    if not item_id:
        return "No item selected", "No item selected"

    content, prompt, summary = fetch_item_details(item_id)
    if not (content or prompt or summary):
        return "No details available.", "No details available."

    details_html = "<h4>Details:</h4>"
    # Each section is wrapped in a matching <p>...</p> pair.
    if prompt:
        formatted_prompt = format_transcription(prompt)
        details_html += f"<h4>Prompt:</h4><p>{formatted_prompt}</p>"
    if summary:
        formatted_summary = format_transcription(summary)
        details_html += f"<h4>Summary:</h4><p>{formatted_summary}</p>"

    # Format the transcription content for better readability
    formatted_content = format_transcription(content)
    content_html = f"<h4>Transcription:</h4><div style='white-space: pre-wrap;'>{formatted_content}</div>"
    return details_html, content_html
|
210 |
+
|
211 |
+
|
212 |
+
def format_content(content):
    """Wrap *content* in a markdown fenced code block."""
    fence = "```"
    return f"{fence}\n{content}\n{fence}"
|
216 |
+
|
217 |
+
|
218 |
+
def update_prompt_dropdown():
    """Return a Gradio update whose choices are the names from ``list_prompts``."""
    return gr.update(choices=list_prompts())
|
221 |
+
|
222 |
+
|
223 |
+
def display_prompt_details(selected_prompt):
    """Render the details of a preset prompt as an HTML fragment.

    Returns ``"No details available."`` when nothing is selected or the preset
    has no title; the user section is left out when the user prompt is empty.
    """
    if not selected_prompt:
        return "No details available."

    prompts = update_user_prompt(selected_prompt)
    if not prompts["title"]:  # Check if we have any details
        return "No details available."

    sections = [
        f"<h4>Details:</h4><p>{prompts['details']}</p>",
        f"<h4>System:</h4><p>{prompts['system_prompt']}</p>",
    ]
    if prompts['user_prompt']:
        sections.append(f"<h4>User:</h4><p>{prompts['user_prompt']}</p>")
    return "".join(sections)
|
232 |
+
|
233 |
+
def search_media_database(query: str) -> List[Tuple[int, str, str]]:
    """Search media by title; a thin wrapper around ``browse_items``."""
    matches = browse_items(query, 'Title')
    return matches
|
235 |
+
|
236 |
+
|
237 |
+
def load_media_content(media_id: int) -> dict:
    """Load content, prompt and summary for a media item as a dict.

    Empty fields are replaced by a "No ... available" placeholder. If anything
    raises, a dict with three empty strings is returned instead.
    """
    try:
        print(f"Debug - Load Media Content - Media ID: {media_id}")
        item_details = fetch_item_details(media_id)
        print(f"Debug - Load Media Content - Item Details: \n\n{item_details}\n\n\n\n")

        well_formed = isinstance(item_details, tuple) and len(item_details) == 3
        if not well_formed:
            print(f"Debug - Load Media Content - Unexpected item_details format: \n\n{item_details}\n\n\n\n")
            item_details = ("", "", "")
        content, prompt, summary = item_details

        return {
            "content": content or "No content available",
            "prompt": prompt or "No prompt available",
            "summary": summary or "No summary available"
        }
    except Exception as e:
        print(f"Debug - Load Media Content - Error: {str(e)}")
        return {"content": "", "prompt": "", "summary": ""}
|
257 |
+
|
258 |
+
|
259 |
+
def error_handler(func):
    """Decorator that turns any exception from *func* into an error dict.

    The wrapped function returns *func*'s result unchanged on success. On an
    exception the error is logged with its traceback and
    ``{"error": ..., "details": <traceback>}`` is returned instead.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            result = func(*args, **kwargs)
        except Exception as e:
            error_message = f"Error in {func.__name__}: {str(e)}"
            trace_text = traceback.format_exc()
            logging.error(f"{error_message}\n{trace_text}")
            return {"error": error_message, "details": trace_text}
        return result
    return wrapper
|
269 |
+
|
270 |
+
|
271 |
+
def create_chunking_inputs():
    """Create the chunking option widgets and return the first eight of them.

    The returned list alternates checkbox and limit for words, sentences,
    paragraphs and tokens. The three semantic-chunking components are
    instantiated as well but are not part of the returned list.
    """
    words_toggle = gr.Checkbox(label="Chunk Text by Words", value=False, visible=True)
    words_limit = gr.Number(label="Max Words", value=300, precision=0, visible=True)
    sentences_toggle = gr.Checkbox(label="Chunk Text by Sentences", value=False, visible=True)
    sentences_limit = gr.Number(label="Max Sentences", value=10, precision=0, visible=True)
    paragraphs_toggle = gr.Checkbox(label="Chunk Text by Paragraphs", value=False, visible=True)
    paragraphs_limit = gr.Number(label="Max Paragraphs", value=5, precision=0, visible=True)
    tokens_toggle = gr.Checkbox(label="Chunk Text by Tokens", value=False, visible=True)
    tokens_limit = gr.Number(label="Max Tokens", value=1000, precision=0, visible=True)
    # Instantiated but intentionally left out of the return value, as before.
    gr.Checkbox(label="Semantic Chunking by Sentence similarity", value=False, visible=True)
    gr.Number(label="Max Chunk Size", value=2000, visible=True)
    gr.Number(label="Max Chunk Overlap Size", value=100, visible=True)
    return [
        words_toggle, words_limit,
        sentences_toggle, sentences_limit,
        paragraphs_toggle, paragraphs_limit,
        tokens_toggle, tokens_limit,
    ]
|
App_Function_Libraries/Gradio_UI/Import_Functionality.py
ADDED
@@ -0,0 +1,473 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Import_Functionality.py
|
2 |
+
# Functionality to import content into the DB
|
3 |
+
#
|
4 |
+
# Imports
|
5 |
+
from time import sleep
|
6 |
+
import logging
|
7 |
+
import re
|
8 |
+
import shutil
|
9 |
+
import tempfile
|
10 |
+
import os
|
11 |
+
import traceback
|
12 |
+
import zipfile
|
13 |
+
#
|
14 |
+
# External Imports
|
15 |
+
import gradio as gr
|
16 |
+
import pypandoc
|
17 |
+
#
|
18 |
+
# Local Imports
|
19 |
+
from App_Function_Libraries.DB_Manager import insert_prompt_to_db, load_preset_prompts, import_obsidian_note_to_db, \
|
20 |
+
add_media_to_database
|
21 |
+
from App_Function_Libraries.Prompt_Handling import import_prompt_from_file, import_prompts_from_zip#
|
22 |
+
from App_Function_Libraries.Summarization_General_Lib import perform_summarization
|
23 |
+
|
24 |
+
###################################################################################################################
|
25 |
+
#
|
26 |
+
# Functions:
|
27 |
+
|
28 |
+
logger = logging.getLogger()
|
29 |
+
|
30 |
+
|
31 |
+
def import_data(file, title, author, keywords, custom_prompt, summary, auto_summarize, api_name, api_key):
    """Import an uploaded text file into the media database.

    Args:
        file: Uploaded file - a string, an object with ``read()``, or anything
            else (converted with ``str``). ``None`` means nothing was uploaded.
        title: Title to store; ``'Untitled'`` when empty.
        author: Author to store as uploader; ``'Unknown'`` when empty.
        keywords: Comma-separated keywords; ``None`` or empty means no keywords.
        custom_prompt: Prompt stored with the item and used for summarisation.
        summary: Summary supplied by the user; replaced by a placeholder when
            empty and no automatic summary is produced.
        auto_summarize: When truthy and both *api_name* and *api_key* are set,
            the summary is generated with ``perform_summarization``.
        api_name: Name of the summarisation API.
        api_key: Key for the summarisation API.

    Returns:
        A status message describing success or the error that occurred.
    """
    if file is None:
        return "No file uploaded. Please upload a file."

    temp_path = None
    try:
        logging.debug(f"File object type: {type(file)}")
        logging.debug(f"File object attributes: {dir(file)}")

        file_name = getattr(file, 'name', 'unknown_file')

        # Round-trip the text through a temporary file, as the original flow did.
        with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.txt', encoding='utf-8') as temp_file:
            temp_path = temp_file.name
            if isinstance(file, str):
                # If file is a string, it's likely file content
                temp_file.write(file)
            elif hasattr(file, 'read'):
                # If file has a 'read' method, it's likely a file-like object
                content = file.read()
                if isinstance(content, bytes):
                    content = content.decode('utf-8')
                temp_file.write(content)
            else:
                # If it's neither a string nor a file-like object, try converting it to a string
                temp_file.write(str(file))

            temp_file.seek(0)
            file_content = temp_file.read()

        logging.debug(f"File name: {file_name}")
        logging.debug(f"File content (first 100 chars): {file_content[:100]}")

        info_dict = {
            'title': title or 'Untitled',
            'uploader': author or 'Unknown',
        }

        # FIXME - Add chunking support... I added chapter chunking specifically for this...
        # Create segments (assuming one segment for the entire content)
        segments = [{'Text': file_content}]

        # An empty keyword box may arrive as None.
        keyword_list = [kw.strip() for kw in (keywords or "").split(',') if kw.strip()]

        if auto_summarize and api_name and api_key:
            summary = perform_summarization(api_name, file_content, custom_prompt, api_key)
        elif not summary:
            summary = "No summary provided"

        add_media_to_database(
            url=file_name,  # Using filename as URL
            info_dict=info_dict,
            segments=segments,
            summary=summary,
            keywords=keyword_list,
            custom_prompt_input=custom_prompt,
            whisper_model="Imported",  # Indicating this was an imported file
            media_type="document"
        )

        return f"File '{file_name}' successfully imported with title '{title}' and author '{author}'."
    except Exception as e:
        logging.error(f"Error importing file: {str(e)}")
        return f"Error importing file: {str(e)}"
    finally:
        # Remove the temporary file on every path, not only after a successful import.
        if temp_path is not None:
            try:
                os.unlink(temp_path)
            except OSError:
                logging.debug(f"Could not remove temporary file: {temp_path}")
|
103 |
+
|
104 |
+
|
105 |
+
|
106 |
+
|
107 |
+
|
108 |
+
def process_obsidian_zip(zip_file):
    """Extract an uploaded Obsidian vault archive and import its notes.

    Args:
        zip_file: Path (or file-like object) accepted by ``zipfile.ZipFile``.

    Returns:
        A ``(imported_files, total_files, errors)`` tuple as produced by
        ``import_obsidian_vault``. When the archive cannot be read or any other
        exception occurs, ``(0, 0, [error_message])`` is returned instead.
    """
    # TemporaryDirectory removes the directory on exit, so no manual rmtree is
    # needed; deleting it a second time is redundant and, on older
    # interpreters, can make the context manager's own cleanup fail.
    with tempfile.TemporaryDirectory() as temp_dir:
        try:
            with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                zip_ref.extractall(temp_dir)

            # Import while the extracted files still exist on disk.
            return import_obsidian_vault(temp_dir)
        except zipfile.BadZipFile:
            error_msg = "The uploaded file is not a valid zip file."
            logger.error(error_msg)
            return 0, 0, [error_msg]
        except Exception as e:
            error_msg = f"Error processing zip file: {str(e)}\n{traceback.format_exc()}"
            logger.error(error_msg)
            return 0, 0, [error_msg]
|
127 |
+
|
128 |
+
|
129 |
+
|
130 |
+
def scan_obsidian_vault(vault_path):
    """Recursively collect the paths of all Markdown notes under a vault.

    Args:
        vault_path: Root directory of the Obsidian vault to walk.

    Returns:
        A list with one path per file whose name ends in ``.md``, each joined
        onto the directory it was found in, in ``os.walk`` order.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(vault_path)
        for name in names
        if name.endswith('.md')
    ]
|
137 |
+
|
138 |
+
|
139 |
+
def parse_obsidian_note(file_path):
    """Parse one Obsidian Markdown note into its components.

    Args:
        file_path: Path to a UTF-8 encoded note.

    Returns:
        A dict with keys ``title`` (file name without a trailing ``.md``),
        ``content`` (note text with any leading frontmatter removed),
        ``frontmatter`` (parsed YAML, ``{}`` when absent or empty), ``tags``
        (words following ``#``), ``links`` (targets of ``[[...]]``) and
        ``file_path``.

    Raises:
        OSError: If the file cannot be read.
        yaml.YAMLError: If the frontmatter block is not valid YAML.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    frontmatter = {}
    frontmatter_match = re.match(r'^---\s*\n(.*?)\n---\s*\n', content, re.DOTALL)
    if frontmatter_match:
        # Imported lazily, so notes without frontmatter never need PyYAML.
        import yaml
        # safe_load returns None for an empty/comment-only block; keep a dict.
        frontmatter = yaml.safe_load(frontmatter_match.group(1)) or {}
        content = content[frontmatter_match.end():]

    tags = re.findall(r'#(\w+)', content)
    links = re.findall(r'\[\[(.*?)\]\]', content)

    # Strip only the extension; str.replace would also remove any '.md'
    # occurring in the middle of the file name.
    title = os.path.basename(file_path)
    if title.endswith('.md'):
        title = title[:-len('.md')]

    return {
        'title': title,
        'content': content,
        'frontmatter': frontmatter,
        'tags': tags,
        'links': links,
        'file_path': file_path
    }
|
162 |
+
|
163 |
+
def create_import_single_prompt_tab():
    """Build the "Import a Prompt" tab.

    Lays out a file upload plus editable title/author/system/user/keyword
    fields. "Import Prompt" parses the uploaded file into those fields;
    "Save to Database" stores the (possibly edited) values through
    ``insert_prompt_to_db``.
    """
    with gr.TabItem("Import a Prompt"):
        gr.Markdown("# Import a prompt into the database")

        with gr.Row():
            with gr.Column():
                # Extensions carry a leading dot, matching the ".epub" filter
                # used by the e-book tab in this module.
                import_file = gr.File(label="Upload file for import", file_types=[".txt", ".md"])
                title_input = gr.Textbox(label="Title", placeholder="Enter the title of the content")
                author_input = gr.Textbox(label="Author", placeholder="Enter the author's name")
                system_input = gr.Textbox(label="System", placeholder="Enter the system message for the prompt", lines=3)
                user_input = gr.Textbox(label="User", placeholder="Enter the user message for the prompt", lines=3)
                keywords_input = gr.Textbox(label="Keywords", placeholder="Enter keywords separated by commas")
                import_button = gr.Button("Import Prompt")

            with gr.Column():
                import_output = gr.Textbox(label="Import Status")
                save_button = gr.Button("Save to Database")
                save_output = gr.Textbox(label="Save Status")

        def handle_import(file):
            """Parse the uploaded file and fill the editable fields.

            A 5-tuple result is treated as success; anything else is shown
            verbatim as the status message and the fields are left untouched.
            """
            result = import_prompt_from_file(file)
            if isinstance(result, tuple) and len(result) == 5:
                title, author, system, user, keywords = result
                return gr.update(value="File successfully imported. You can now edit the content before saving."), \
                    gr.update(value=title), gr.update(value=author), gr.update(value=system), \
                    gr.update(value=user), gr.update(value=", ".join(keywords))
            else:
                return gr.update(value=result), gr.update(), gr.update(), gr.update(), gr.update(), gr.update()

        import_button.click(
            fn=handle_import,
            inputs=[import_file],
            outputs=[import_output, title_input, author_input, system_input, user_input, keywords_input]
        )

        def save_prompt_to_db(title, author, system, user, keywords):
            """Split the comma-separated keywords and store the prompt."""
            keyword_list = [k.strip() for k in keywords.split(',') if k.strip()]
            return insert_prompt_to_db(title, author, system, user, keyword_list)

        save_button.click(
            fn=save_prompt_to_db,
            inputs=[title_input, author_input, system_input, user_input, keywords_input],
            outputs=save_output
        )

        def update_prompt_dropdown():
            """Return an update carrying the current preset prompt choices."""
            return gr.update(choices=load_preset_prompts())

        # NOTE(review): the output below is a brand-new Dropdown created inside
        # this tab, not an existing preset-prompt dropdown from another tab, so
        # this presumably only refreshes that local component - confirm intent.
        save_button.click(
            fn=update_prompt_dropdown,
            inputs=[],
            outputs=[gr.Dropdown(label="Select Preset Prompt")]
        )
|
216 |
+
|
217 |
+
def create_import_item_tab():
    """Build the "Import Markdown/Text Files" tab.

    Collects a file plus title, author, keywords, an optional custom prompt
    and summary, and the auto-summarization settings, then passes them all to
    ``import_data`` when "Import Data" is clicked. The returned status text is
    shown in the "Import Status" box.
    """
    with gr.TabItem("Import Markdown/Text Files"):
        gr.Markdown("# Import a markdown file or text file into the database")
        gr.Markdown("...and have it tagged + summarized")
        with gr.Row():
            with gr.Column():
                # Extensions carry a leading dot, matching the ".epub" filter
                # used by the e-book tab in this module.
                import_file = gr.File(label="Upload file for import", file_types=[".txt", ".md"])
                title_input = gr.Textbox(label="Title", placeholder="Enter the title of the content")
                author_input = gr.Textbox(label="Author", placeholder="Enter the author's name")
                keywords_input = gr.Textbox(label="Keywords", placeholder="Enter keywords, comma-separated")
                custom_prompt_input = gr.Textbox(label="Custom Prompt",
                                                 placeholder="Enter a custom prompt for summarization (optional)")
                summary_input = gr.Textbox(label="Summary",
                                           placeholder="Enter a summary or leave blank for auto-summarization", lines=3)
                auto_summarize_checkbox = gr.Checkbox(label="Auto-summarize", value=False)
                api_name_input = gr.Dropdown(
                    choices=[None, "Local-LLM", "OpenAI", "Anthropic", "Cohere", "Groq", "DeepSeek", "Mistral", "OpenRouter",
                             "Llama.cpp", "Kobold", "Ooba", "Tabbyapi", "VLLM","ollama", "HuggingFace"],
                    label="API for Auto-summarization"
                )
                api_key_input = gr.Textbox(label="API Key", type="password")
            with gr.Column():
                import_button = gr.Button("Import Data")
                import_output = gr.Textbox(label="Import Status")

        # The input order here is the positional argument order import_data receives.
        import_button.click(
            fn=import_data,
            inputs=[import_file, title_input, author_input, keywords_input, custom_prompt_input,
                    summary_input, auto_summarize_checkbox, api_name_input, api_key_input],
            outputs=import_output
        )
|
248 |
+
|
249 |
+
|
250 |
+
def create_import_multiple_prompts_tab():
    """Build the "Import Multiple Prompts" tab.

    A zip of prompt files is parsed by ``import_prompts_from_zip``; the
    resulting prompts are kept in a ``gr.State`` and offered in a dropdown.
    Selecting one fills the editable fields, and "Save to Database" stores the
    current field values through ``insert_prompt_to_db``.
    """
    with gr.TabItem("Import Multiple Prompts"):
        gr.Markdown("# Import multiple prompts into the database")
        gr.Markdown("Upload a zip file containing multiple prompt files (txt or md)")

        with gr.Row():
            with gr.Column():
                # Extensions carry a leading dot, matching the ".epub" filter
                # used by the e-book tab in this module.
                zip_file = gr.File(label="Upload zip file for import", file_types=[".zip"])
                import_button = gr.Button("Import Prompts")
                prompts_dropdown = gr.Dropdown(label="Select Prompt to Edit", choices=[])
                title_input = gr.Textbox(label="Title", placeholder="Enter the title of the content")
                author_input = gr.Textbox(label="Author", placeholder="Enter the author's name")
                system_input = gr.Textbox(label="System", placeholder="Enter the system message for the prompt",
                                          lines=3)
                user_input = gr.Textbox(label="User", placeholder="Enter the user message for the prompt", lines=3)
                keywords_input = gr.Textbox(label="Keywords", placeholder="Enter keywords separated by commas")

            with gr.Column():
                import_output = gr.Textbox(label="Import Status")
                save_button = gr.Button("Save to Database")
                save_output = gr.Textbox(label="Save Status")
                prompts_display = gr.Textbox(label="Identified Prompts")

        def handle_zip_import(zip_file):
            """Parse the zip and publish its prompts to the dropdown and state.

            A list result is treated as success; anything else is shown
            verbatim as the status message.
            """
            result = import_prompts_from_zip(zip_file)
            if isinstance(result, list):
                prompt_titles = [prompt['title'] for prompt in result]
                # The titles must be sent as *choices*: returning the bare list
                # would set the dropdown's value and leave it with no options.
                return (
                    gr.update(value="Zip file successfully imported. Select a prompt to edit from the dropdown."),
                    gr.update(choices=prompt_titles),
                    gr.update(value="\n".join(prompt_titles)),
                    result,
                )
            else:
                return gr.update(value=result), gr.update(choices=[]), gr.update(value=""), []

        def handle_prompt_selection(selected_title, prompts):
            """Return the field values of the prompt whose title was selected.

            Returns five empty strings when no prompt with that title exists.
            """
            selected_prompt = next((prompt for prompt in prompts if prompt['title'] == selected_title), None)
            if selected_prompt:
                return (
                    selected_prompt['title'],
                    selected_prompt.get('author', ''),
                    # Read like the other optional fields so a prompt without a
                    # system message does not raise KeyError.
                    selected_prompt.get('system', ''),
                    selected_prompt.get('user', ''),
                    ", ".join(selected_prompt.get('keywords', []))
                )
            else:
                return "", "", "", "", ""

        # Holds the parsed prompt dicts between the import and selection events.
        zip_import_state = gr.State([])

        import_button.click(
            fn=handle_zip_import,
            inputs=[zip_file],
            outputs=[import_output, prompts_dropdown, prompts_display, zip_import_state]
        )

        prompts_dropdown.change(
            fn=handle_prompt_selection,
            inputs=[prompts_dropdown, zip_import_state],
            outputs=[title_input, author_input, system_input, user_input, keywords_input]
        )

        def save_prompt_to_db(title, author, system, user, keywords):
            """Split the comma-separated keywords and store the prompt."""
            keyword_list = [k.strip() for k in keywords.split(',') if k.strip()]
            return insert_prompt_to_db(title, author, system, user, keyword_list)

        save_button.click(
            fn=save_prompt_to_db,
            inputs=[title_input, author_input, system_input, user_input, keywords_input],
            outputs=save_output
        )

        def update_prompt_dropdown():
            """Return an update carrying the current preset prompt choices."""
            return gr.update(choices=load_preset_prompts())

        # NOTE(review): the output below is a brand-new Dropdown created inside
        # this tab, not an existing preset-prompt dropdown from another tab, so
        # this presumably only refreshes that local component - confirm intent.
        save_button.click(
            fn=update_prompt_dropdown,
            inputs=[],
            outputs=[gr.Dropdown(label="Select Preset Prompt")]
        )
|
328 |
+
|
329 |
+
|
330 |
+
def create_import_obsidian_vault_tab():
    """Build the "Import Obsidian Vault" tab.

    The user supplies either a local vault path or an uploaded zip; the
    button runs the import and writes a summary into the status box.
    """
    with gr.TabItem("Import Obsidian Vault"):
        gr.Markdown("## Import Obsidian Vault")
        with gr.Row():
            with gr.Column():
                path_box = gr.Textbox(label="Obsidian Vault Path (Local)")
                zip_upload = gr.File(label="Upload Obsidian Vault (Zip)")
            with gr.Column():
                run_button = gr.Button("Import Obsidian Vault")
                status_box = gr.Textbox(label="Import Status", interactive=False)

        def import_vault(vault_path, vault_zip):
            """Import from the zip if one was uploaded, else from the path."""
            # An uploaded archive takes precedence over a typed path.
            if vault_zip:
                outcome = process_obsidian_zip(vault_zip.name)
            elif vault_path:
                outcome = import_obsidian_vault(vault_path)
            else:
                return "Please provide either a local vault path or upload a zip file."

            imported, total, errors = outcome
            report = [f"Imported {imported} out of {total} files.\n"]
            if errors:
                report.append(f"Encountered {len(errors)} errors:\n" + "\n".join(errors))
            return "".join(report)

        run_button.click(
            fn=import_vault,
            inputs=[path_box, zip_upload],
            outputs=[status_box],
            show_progress=True
        )
|
362 |
+
|
363 |
+
|
364 |
+
|
365 |
+
# Using pypandoc to convert EPUB to Markdown
|
366 |
+
def create_import_book_tab():
    """Build the "Import .epub/ebook Files" tab.

    The uploaded EPUB is converted to Markdown with pypandoc inside a
    temporary directory, and the converted text is then handed to
    ``import_data`` together with the metadata and summarization settings.
    """
    with gr.TabItem("Import .epub/ebook Files"):
        with gr.Row():
            with gr.Column():
                gr.Markdown("# Ingest an .epub file using pypandoc")
                gr.Markdown("...and have it tagged + summarized")
                gr.Markdown(
                    "How to remove DRM from your ebooks: https://www.reddit.com/r/Calibre/comments/1ck4w8e/2024_guide_on_removing_drm_from_kobo_kindle_ebooks/")
                import_file = gr.File(label="Upload file for import", file_types=[".epub"])
                title_input = gr.Textbox(label="Title", placeholder="Enter the title of the content")
                author_input = gr.Textbox(label="Author", placeholder="Enter the author's name")
                keywords_input = gr.Textbox(label="Keywords(like genre or publish year)",
                                            placeholder="Enter keywords, comma-separated")
                # The literal previously opened with four quote characters,
                # which put a stray '"' at the start of the default prompt.
                system_prompt_input = gr.Textbox(label="System Prompt",
                                                 lines=3,
                                                 value="""<s>You are a bulleted notes specialist. [INST]```When creating comprehensive bulleted notes, you should follow these guidelines: Use multiple headings based on the referenced topics, not categories like quotes or terms. Headings should be surrounded by bold formatting and not be listed as bullet points themselves. Leave no space between headings and their corresponding list items underneath. Important terms within the content should be emphasized by setting them in bold font. Any text that ends with a colon should also be bolded. Before submitting your response, review the instructions, and make any corrections necessary to adhered to the specified format. Do not reference these instructions within the notes.``` \nBased on the content between backticks create comprehensive bulleted notes.[/INST]
**Bulleted Note Creation Guidelines**

**Headings**:
- Based on referenced topics, not categories like quotes or terms
- Surrounded by **bold** formatting
- Not listed as bullet points
- No space between headings and list items underneath

**Emphasis**:
- **Important terms** set in bold font
- **Text ending in a colon**: also bolded

**Review**:
- Ensure adherence to specified format
- Do not reference these instructions in your response.</s>[INST] {{ .Prompt }} [/INST]
""", )
                custom_prompt_input = gr.Textbox(label="Custom User Prompt",
                                                 placeholder="Enter a custom user prompt for summarization (optional)")
                auto_summarize_checkbox = gr.Checkbox(label="Auto-summarize", value=False)
                api_name_input = gr.Dropdown(
                    choices=[None, "Local-LLM", "OpenAI", "Anthropic", "Cohere", "Groq", "DeepSeek", "Mistral",
                             "OpenRouter",
                             "Llama.cpp", "Kobold", "Ooba", "Tabbyapi", "VLLM", "ollama", "HuggingFace"],
                    label="API for Auto-summarization"
                )
                api_key_input = gr.Textbox(label="API Key", type="password")
                import_button = gr.Button("Import eBook")
            with gr.Column():
                with gr.Row():
                    import_output = gr.Textbox(label="Import Status")

        def import_epub(epub_file, title, author, keywords, system_prompt, user_prompt, auto_summarize, api_name,
                        api_key):
            """Convert the uploaded EPUB to Markdown and import the text.

            Returns the status string from ``import_data``, or an error
            message if no file was given or conversion/import raised.
            """
            if epub_file is None:
                # Fail with a readable message, not an AttributeError on `.name`.
                return "Error processing EPUB: no file was uploaded."
            try:
                # Create a temporary directory to store the converted file
                with tempfile.TemporaryDirectory() as temp_dir:
                    epub_path = epub_file.name
                    md_path = os.path.join(temp_dir, "converted.md")

                    # Use pypandoc to convert EPUB to Markdown
                    output = pypandoc.convert_file(epub_path, 'md', outputfile=md_path)

                    # Any non-empty return value is reported as a conversion error.
                    if output != "":
                        return f"Error converting EPUB: {output}"

                    # Read the converted markdown content
                    with open(md_path, "r", encoding="utf-8") as md_file:
                        content = md_file.read()

                    # NOTE(review): the Markdown/Text tab calls import_data with
                    # (..., custom_prompt, summary, ...) in these two positions;
                    # here system_prompt and user_prompt land there instead.
                    # Confirm against import_data's signature.
                    return import_data(content, title, author, keywords, system_prompt,
                                       user_prompt, auto_summarize, api_name, api_key)
            except Exception as e:
                return f"Error processing EPUB: {str(e)}"

        import_button.click(
            fn=import_epub,
            inputs=[import_file, title_input, author_input, keywords_input, system_prompt_input,
                    custom_prompt_input, auto_summarize_checkbox, api_name_input, api_key_input],
            outputs=import_output
        )
|
444 |
+
|
445 |
+
def import_obsidian_vault(vault_path, progress=gr.Progress()):
    """Import every Markdown note found under an Obsidian vault.

    Args:
        vault_path: Root directory of the vault.
        progress: Gradio progress tracker, updated after each file.

    Returns:
        ``(imported_files, total_files, errors)`` where ``errors`` is a list
        of messages for notes that failed. If scanning itself fails,
        ``(0, 0, [error_message])`` is returned.
    """
    try:
        # scan_obsidian_vault is defined in this module, so it is used
        # directly and no other module has to be imported for it.
        markdown_files = scan_obsidian_vault(vault_path)
        total_files = len(markdown_files)
        imported_files = 0
        errors = []

        for i, file_path in enumerate(markdown_files):
            # A failure on one note is recorded and the loop moves on.
            try:
                note_data = parse_obsidian_note(file_path)
                success, error_msg = import_obsidian_note_to_db(note_data)
                if success:
                    imported_files += 1
                else:
                    errors.append(error_msg)
            except Exception as e:
                error_msg = f"Error processing {file_path}: {str(e)}"
                logger.error(error_msg)
                errors.append(error_msg)

            progress((i + 1) / total_files, f"Imported {imported_files} of {total_files} files")
            sleep(0.1)  # Small delay to prevent UI freezing

        return imported_files, total_files, errors
    except Exception as e:
        error_msg = f"Error scanning vault: {str(e)}\n{traceback.format_exc()}"
        logger.error(error_msg)
        return 0, 0, [error_msg]
|
App_Function_Libraries/Gradio_UI/Introduction_tab.py
ADDED
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Introduction_tab.py
|
2 |
+
# Gradio UI functions for the Introduction tab
|
3 |
+
|
4 |
+
# Imports
|
5 |
+
#
|
6 |
+
# External Imports
|
7 |
+
import gradio as gr
|
8 |
+
|
9 |
+
from App_Function_Libraries.DB_Manager import get_db_config
|
10 |
+
|
11 |
+
|
12 |
+
#
|
13 |
+
# Local Imports
|
14 |
+
|
15 |
+
|
16 |
+
def create_introduction_tab():
    """Build the "Introduction" tab.

    Renders static Markdown: a feature overview and usage instructions side
    by side, followed by a glossary and a getting-started guide. The heading
    names the database type read from ``get_db_config()``.
    """
    with gr.TabItem("Introduction"):
        db_config = get_db_config()
        db_type = db_config['type']
        gr.Markdown(f"# tldw: Your LLM-powered Research Multi-tool (Using {db_type.capitalize()} Database)")
        with gr.Row():
            with gr.Column():
                gr.Markdown("""### What can it do?
- Transcribe and summarize videos from URLs/Local files
- Transcribe and Summarize Audio files/Podcasts (URL/local file)
- Summarize articles from URLs/Local notes
- Ingest and summarize books(epub/PDF)
- Ingest and summarize research papers (PDFs - WIP)
- Search and display ingested content + summaries
- Create and manage custom prompts
- Chat with an LLM of your choice to generate content using the selected item + Prompts
- Keyword support for content search and display
- Export keywords/items to markdown/CSV(csv is wip)
- Import existing notes from Obsidian to the database (Markdown/txt files or a zip containing a collection of files)
- View and manage chat history
- Writing Tools: Grammar & Style check, Tone Analyzer & Editor, more planned...
- RAG (Retrieval-Augmented Generation) support for content generation(think about asking questions about your entire library of items)
- More features planned...
- All powered by your choice of LLM.
- Currently supports: Local-LLM(llamafile-server), OpenAI, Anthropic, Cohere, Groq, DeepSeek, OpenRouter, Llama.cpp, Kobold, Ooba, Tabbyapi, VLLM and more to come...
- All data is stored locally in a SQLite database for easy access and management.
- No trackers (Gradio has some analytics but it's disabled here...)
- No ads, no tracking, no BS. Just you and your content.
- Open-source and free to use. Contributions welcome!
- If you have any thoughts or feedback, please let me know on github or via email.
""")
                gr.Markdown(
                    """Follow this project at [tl/dw: Too Long, Didn't Watch - Your Personal Research Multi-Tool - GitHub](https://github.com/rmusser01/tldw)""")
            with gr.Column():
                gr.Markdown("""### How to use:
##### Quick Start: Just click on the appropriate tab for what you're trying to do and fill in the required fields. Click "Process <video/audio/article/etc>" and wait for the results.
#### Simple Instructions
- Basic Usage:
- If you don't have an API key/don't know what an LLM is/don't know what an API key is, please look further down the page for information on getting started.
- If you want summaries/chat with an LLM, you'll need:
1. An API key for the LLM API service you want to use, or,
2. A local inference server running an LLM (like llamafile-server/llama.cpp - for instructions on how to do so see the projects README or below), or,
3. A "local" inference server you have access to running an LLM.
- If you just want transcriptions you can ignore the above.
- Select the tab for the task you want to perform
- Fill in the required fields
- Click the "Process" button
- Wait for the results to appear
- Download the results if needed
- Repeat as needed
- As of writing this, the UI is still a work in progress.
- That being said, I plan to replace it all eventually. In the meantime, please have patience.
- The UI is divided into tabs for different tasks.
- Each tab has a set of fields that you can fill in to perform the task.
- Some fields are mandatory, some are optional.
- The fields are mostly self-explanatory, but I will try to add more detailed instructions as I go.
#### Detailed Usage:
- There are 8 Top-level tabs in the UI. Each tab has a specific set of tasks that you can perform by selecting one of the 'sub-tabs' made available by clicking on the top tab.
- The tabs are as follows:
1. Transcription / Summarization / Ingestion - This tab is for processing videos, audio files, articles, books, and PDFs/office docs.
2. Search / Detailed View - This tab is for searching and displaying content from the database. You can also view detailed information about the selected item.
3. Chat with an LLM - This tab is for chatting with an LLM to generate content based on the selected item and prompts.
4. Edit Existing Items - This tab is for editing existing items in the database (Prompts + ingested items).
5. Writing Tools - This tab is for using various writing tools like Grammar & Style check, Tone Analyzer & Editor, etc.
6. Keywords - This tab is for managing keywords for content search and display.
7. Import/Export - This tab is for importing notes from Obsidian and exporting keywords/items to markdown/CSV.
8. Utilities - This tab contains some random utilities that I thought might be useful.
- Each sub-tab is responsible for that set of functionality. This is reflected in the codebase as well, where I have split the functionality into separate files for each tab/larger goal.
""")
        with gr.Row():
            gr.Markdown("""### HELP! I don't know what any of this shit is!
### DON'T PANIC
#### Its ok, you're not alone, most people have no clue what any of this stuff is.
- So let's try and fix that.

#### Introduction to LLMs:
- Non-Technical introduction to Generative AI and LLMs: https://paruir.medium.com/understanding-generative-ai-and-llms-a-non-technical-overview-part-1-788c0eb0dd64
- Google's Intro to LLMs: https://developers.google.com/machine-learning/resources/intro-llms#llm_considerations
- LLMs 101(coming from a tech background): https://vinija.ai/models/LLM/
- LLM Fundamentals / LLM Scientist / LLM Engineer courses(Free): https://github.com/mlabonne/llm-course

#### Various Phrases & Terms to know
- **LLM** - Large Language Model - A type of neural network that can generate human-like text.
- **API** - Application Programming Interface - A set of rules and protocols that allows one software application to communicate with another.
* Think of it like a post address for a piece of software. You can send messages to and from it.
- **API Key** - A unique identifier that is used to authenticate a user, developer, or calling program to an API.
* Like the key to a post office box. You need it to access the contents.
- **GUI** - Graphical User Interface - the thing facilitating your interaction with this application.
- **DB** - Database
- **Prompt Engineering** - The process of designing prompts that are used to guide the output of a language model. Is a meme but also very much not.
- **Quantization** - The process of converting a continuous range of values into a finite range of discrete values.
* https://github.com/ggerganov/llama.cpp/blob/cddae4884c853b1a7ab420458236d666e2e34423/examples/quantize/README.md#L27
- **GGUF Files** - GGUF is a binary format that is designed for fast loading and saving of models, and for ease of reading. Models are traditionally developed using PyTorch or another framework, and then converted to GGUF for use in GGML. https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
- **Inference Engine** - A software system that is designed to execute a model that has been trained by a machine learning algorithm. Llama.cpp and Kobold.cpp are examples of inference engines.
- **Abliteration** - https://huggingface.co/blog/mlabonne/abliteration
""")
        with gr.Row():
            gr.Markdown("""### Ok cool, but how do I get started? I don't have an API key or a local server running...
#### Great, glad you asked! Getting Started:
- **Getting an API key for a commercial services provider:**
- **OpenAI:**
* https://platform.openai.com/docs/quickstart
- **Anthropic:**
* https://docs.anthropic.com/en/api/getting-started
- **Cohere:**
* https://docs.cohere.com/
* They offer 1k free requests a month(up to 1million tokens total I think?), so you can try it out without paying.
- **Groq:**
* https://console.groq.com/keys
* Offer an account with free credits to try out their service. No idea how much you get.
- **DeepSeek:**
* https://platform.deepseek.com/ (Chinese-hosted/is in english)
- **OpenRouter:**
* https://openrouter.ai/
- **Mistral:**
* https://console.mistral.ai/
- **Choosing a Model to download**
- You'll first need to select a model you want to use with the server.
- Keep in mind that the model you select will determine the quality of the output you get, and that models run fastest when offloaded fully to your GPU.
* So this means that you can run a large model (Command-R) on CPU+System RAM, but you're gonna see a massive performance hit. Not saying its unusable, but it's not ideal.
* With that in mind, I would recommend an abliterated version of Meta's Llama3.1 model for most tasks. (Abliterated since it won't refuse requests)
* I say this because of the general quality of the model + it's context size.
* You can find the model here: https://huggingface.co/mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated-GGUF
* And the Q8 quant(total size 8.6GB): https://huggingface.co/mlabonne/Meta-Llama-3.1-8B-Instruct-abliterated-GGUF/resolve/main/meta-llama-3.1-8b-instruct-abliterated.Q8_0.gguf?download=true
- **Local Inference Server:**
- **Llamafile-Server (wrapper for llama.cpp):**
* Run this script with the `--local_llm` argument next time, and you'll be walked through setting up a local instance of llamafile-server.
- **Llama.cpp Inference Engine:**
* Download the latest release for your platform here: https://github.com/ggerganov/llama.cpp/releases
* Windows: `llama-<release_number>-bin-win-cuda-cu<11.7.1 or 12.2.0 - version depends on installed cuda>-x64.zip`
* Run it: `llama-server.exe --model <path_to_model> -ctx 8192 -ngl 999`
- `-ctx 8192` sets the context size to 8192 tokens, `-ngl 999` sets the number of layers to offload to the GPU to 999. (essentially ensuring we only use our GPU and not CPU for processing)
* Macos: `llama-<release_number>-bin-macos-arm64.zip` - for Apple Silicon / `llama-<release_number>-bin-macos-x64.zip` - for Intel Macs
* Run it: `llama-server --model <path_to_model> -ctx 8192 -ngl 999`
- `-ctx 8192` sets the context size to 8192 tokens, `-ngl 999` sets the number of layers to offload to the GPU to 999. (essentially ensuring we only use our GPU and not CPU for processing)
* Linux: You can probably figure it out.
- **Kobold.cpp Server:**
1. Download from here: https://github.com/LostRuins/koboldcpp/releases/latest
2. `Double click KoboldCPP.exe and select model OR run "KoboldCPP.exe --help" in CMD prompt to get command line arguments for more control.`
3. `Generally you don't have to change much besides the Presets and GPU Layers. Run with CuBLAS or CLBlast for GPU acceleration.`
4. `Select your GGUF or GGML model you downloaded earlier, and connect to the displayed URL once it finishes loading.`
- **Linux**
1. `On Linux, we provide a koboldcpp-linux-x64 PyInstaller prebuilt binary on the releases page for modern systems. Simply download and run the binary.`
* Alternatively, you can also install koboldcpp to the current directory by running the following terminal command: `curl -fLo koboldcpp https://github.com/LostRuins/koboldcpp/releases/latest/download/koboldcpp-linux-x64 && chmod +x koboldcpp`
2. When you can't use the precompiled binary directly, we provide an automated build script which uses conda to obtain all dependencies, and generates (from source) a ready-to-use a pyinstaller binary for linux users. Simply execute the build script with `./koboldcpp.sh dist` and run the generated binary.
""")
|
App_Function_Libraries/Gradio_UI/Keywords.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Keywords.py
|
2 |
+
# Purpose: This file contains the functions to create the Keywords tab in the Gradio UI.
|
3 |
+
#
|
4 |
+
# The Keywords tab allows the user to add, delete, view, and export keywords from the database.
|
5 |
+
#
|
6 |
+
# Imports:
|
7 |
+
|
8 |
+
#
|
9 |
+
# External Imports
|
10 |
+
import gradio as gr
|
11 |
+
#
|
12 |
+
# Internal Imports
|
13 |
+
from App_Function_Libraries.DB_Manager import add_keyword, delete_keyword, keywords_browser_interface, export_keywords_to_csv
|
14 |
+
#
|
15 |
+
#
|
16 |
+
######################################################################################################################
|
17 |
+
#
|
18 |
+
# Functions:
|
19 |
+
|
20 |
+
|
21 |
+
def create_export_keywords_tab():
    """Build the "Export Keywords" tab.

    Lays out an export button next to a file-download component and a status
    textbox, then routes the button click to ``export_keywords_to_csv``, whose
    outputs are written to the status box and the file component (in that
    order).
    """
    # NOTE(review): container nesting was reconstructed from a flattened
    # listing -- confirm against the rendered layout.
    with gr.Tab("Export Keywords"):
        with gr.Row():
            with gr.Column():
                trigger = gr.Button("Export Keywords")
            with gr.Column():
                download = gr.File(label="Download Exported Keywords")
                status = gr.Textbox(label="Export Status")

        # No inputs: the export handler is called without arguments.
        trigger.click(fn=export_keywords_to_csv, outputs=[status, download])
|
34 |
+
|
35 |
+
def create_view_keywords_tab():
    """Build the "View Keywords" tab.

    Shows a Markdown area plus a button; clicking the button renders the
    result of ``keywords_browser_interface`` into the Markdown area.
    """
    # NOTE(review): container nesting was reconstructed from a flattened
    # listing -- confirm against the rendered layout.
    with gr.TabItem("View Keywords"):
        gr.Markdown("# Browse Keywords")
        with gr.Column():
            listing = gr.Markdown()
            show_button = gr.Button("View Existing Keywords")
            show_button.click(
                fn=keywords_browser_interface,
                outputs=listing,
            )
|
42 |
+
|
43 |
+
|
44 |
+
def create_add_keyword_tab():
    """Build the "Add Keywords" tab.

    The user types a comma-separated list of keywords; clicking the button
    passes that text to ``add_keyword`` and shows its return value in the
    "Result" textbox.
    """
    # NOTE(review): container nesting was reconstructed from a flattened
    # listing -- confirm against the rendered layout.
    with gr.TabItem("Add Keywords"):
        with gr.Row():
            with gr.Column():
                gr.Markdown("# Add Keywords to the Database")
                keywords_box = gr.Textbox(
                    label="Add Keywords (comma-separated)",
                    placeholder="Enter keywords here...",
                )
                submit = gr.Button("Add Keywords")
            with gr.Row():
                result_box = gr.Textbox(label="Result")
                submit.click(fn=add_keyword, inputs=keywords_box, outputs=result_box)
|
54 |
+
|
55 |
+
|
56 |
+
def create_delete_keyword_tab():
    """Build the "Delete Keywords" tab.

    The user types a keyword; clicking the button passes it to
    ``delete_keyword`` and shows the returned message in the "Result" textbox.
    """
    # NOTE(review): container nesting was reconstructed from a flattened
    # listing -- confirm against the rendered layout.
    with gr.Tab("Delete Keywords"):
        with gr.Row():
            with gr.Column():
                gr.Markdown("# Delete Keywords from the Database")
                keyword_box = gr.Textbox(
                    label="Delete Keyword",
                    placeholder="Enter keyword to delete here...",
                )
                submit = gr.Button("Delete Keyword")
            with gr.Row():
                result_box = gr.Textbox(label="Result")
                submit.click(fn=delete_keyword, inputs=keyword_box, outputs=result_box)
|
App_Function_Libraries/Gradio_UI/Llamafile_tab.py
ADDED
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Llamafile_tab.py
|
2 |
+
# Description: Functions relating to the Llamafile tab
|
3 |
+
#
|
4 |
+
# Imports
|
5 |
+
import os
|
6 |
+
import glob
|
7 |
+
#
|
8 |
+
# External Imports
|
9 |
+
import gradio as gr
|
10 |
+
#
|
11 |
+
# Local Imports
|
12 |
+
from App_Function_Libraries.Llamafile import start_llamafile
|
13 |
+
#
|
14 |
+
#######################################################################################################################
|
15 |
+
#
|
16 |
+
# Functions:
|
17 |
+
|
18 |
+
|
19 |
+
def create_chat_with_llamafile_tab():
    """Build the "Local LLM with Llamafile" tab and wire up its event handlers.

    The tab offers two dropdowns listing model files found in the current
    directory and its parent, a refresh button that rescans both directories,
    a textbox holding the resolved model path, GPU-layer settings, the
    advanced inputs from ``create_llamafile_advanced_inputs`` and a start
    button that forwards every setting to ``start_llamafile``.

    The stop button and the "Advanced Mode" checkbox are created but are not
    connected to any handler here.
    """
    # File extensions that count as selectable local models.
    model_extensions = ("gguf", "llamafile")

    def get_model_files(directory):
        """Return the sorted base names of model files directly inside *directory*."""
        # glob.glob() has no shell-style brace expansion, so a single
        # "*.{gguf,llamafile}" pattern only matches that literal suffix.
        # Query each extension separately instead.
        matches = []
        for extension in model_extensions:
            matches.extend(glob.glob(os.path.join(directory, f"*.{extension}")))
        return sorted(os.path.basename(path) for path in matches)

    def update_dropdowns():
        """Rescan "." and ".." and return refreshed choices for both dropdowns."""
        current_dir_models = get_model_files(".")
        parent_dir_models = get_model_files("..")
        # A plain dict is not a component update; gr.update() is what replaces
        # a Dropdown's choices and clears its current selection.
        return (
            gr.update(choices=current_dir_models, value=None),
            gr.update(choices=parent_dir_models, value=None),
        )

    # NOTE(review): container nesting was reconstructed from a flattened
    # listing -- confirm against the rendered layout.
    with gr.TabItem("Local LLM with Llamafile"):
        gr.Markdown("# Settings for Llamafile")
        with gr.Row():
            with gr.Column():
                am_noob = gr.Checkbox(label="Check this to enable sane defaults", value=False, visible=True)
                # FIXME - these get deleted at some point?
                advanced_mode_toggle = gr.Checkbox(label="Advanced Mode - Enable to show all settings", value=False)

            with gr.Column():
                # FIXME - make this actually work
                model_checked = gr.Checkbox(label="Enable Setting Local LLM Model Path", value=False, visible=True)
                current_dir_dropdown = gr.Dropdown(
                    label="Select Model from Current Directory (.)",
                    choices=[],  # Start with an empty list
                    visible=True
                )
                parent_dir_dropdown = gr.Dropdown(
                    label="Select Model from Parent Directory (..)",
                    choices=[],  # Start with an empty list
                    visible=True
                )
                refresh_button = gr.Button("Refresh Model Lists")
                model_value = gr.Textbox(label="Selected Model File", value="", visible=True)
        with gr.Row():
            with gr.Column():
                ngl_checked = gr.Checkbox(label="Enable Setting GPU Layers", value=False, visible=True)
                ngl_value = gr.Number(label="Number of GPU Layers", value=None, precision=0, visible=True)
                advanced_inputs = create_llamafile_advanced_inputs()
            with gr.Column():
                start_button = gr.Button("Start Llamafile")
                stop_button = gr.Button("Stop Llamafile (doesn't work)")
                output_display = gr.Markdown()

        def update_model_value(current_dir_model, parent_dir_model):
            """Resolve the model path; the current-directory choice takes precedence."""
            if current_dir_model:
                return current_dir_model
            elif parent_dir_model:
                # Parent-directory models are addressed relative to the working dir.
                return os.path.join("..", parent_dir_model)
            else:
                return ""

        # Either dropdown changing recomputes the resolved model path.
        current_dir_dropdown.change(
            fn=update_model_value,
            inputs=[current_dir_dropdown, parent_dir_dropdown],
            outputs=model_value
        )
        parent_dir_dropdown.change(
            fn=update_model_value,
            inputs=[current_dir_dropdown, parent_dir_dropdown],
            outputs=model_value
        )

        refresh_button.click(
            fn=update_dropdowns,
            inputs=[],
            outputs=[current_dir_dropdown, parent_dir_dropdown]
        )

        # The positional order of these inputs is the argument order
        # start_llamafile receives.
        start_button.click(
            fn=start_llamafile,
            inputs=[am_noob, model_checked, model_value, ngl_checked, ngl_value] + advanced_inputs,
            outputs=output_display
        )
|
97 |
+
|
98 |
+
|
99 |
+
def create_llamafile_advanced_inputs():
    """Create the hidden advanced Llamafile settings components.

    Every component starts with ``visible=False``. The returned list keeps
    the creation order: each "enable" checkbox is followed by the value
    component it governs (the verbose checkbox has no value component).

    Returns:
        list: the fifteen Gradio components, in creation order.
    """
    # Components are instantiated in list order, which is also the order in
    # which they are appended to the enclosing layout.
    return [
        # Verbose output
        gr.Checkbox(label="Enable Verbose Output", value=False, visible=False),
        # CPU threads
        gr.Checkbox(label="Set CPU Threads", value=False, visible=False),
        gr.Number(label="Number of CPU Threads", value=None, precision=0, visible=False),
        # HTTP server threads
        gr.Checkbox(label="Set HTTP Server Threads", value=False, visible=False),
        gr.Number(label="Number of HTTP Server Threads", value=None, precision=0, visible=False),
        # Huggingface repository
        gr.Checkbox(label="Use Huggingface Repo Model", value=False, visible=False),
        gr.Textbox(label="Huggingface Repo Name", value="", visible=False),
        # Huggingface model file
        gr.Checkbox(label="Set Huggingface Model File", value=False, visible=False),
        gr.Textbox(label="Huggingface Model File", value="", visible=False),
        # Prompt context size
        gr.Checkbox(label="Set Prompt Context Size", value=False, visible=False),
        gr.Number(label="Prompt Context Size", value=8124, precision=0, visible=False),
        # Listen address
        gr.Checkbox(label="Set IP to Listen On", value=False, visible=False),
        gr.Textbox(label="Host IP Address", value="", visible=False),
        # Server port
        gr.Checkbox(label="Set Server Port", value=False, visible=False),
        gr.Number(label="Port Number", value=None, precision=0, visible=False),
    ]
|
119 |
+
|
120 |
+
#
|
121 |
+
# End of Llamafile_tab.py
|
122 |
+
#########################################################################################################################
|
App_Function_Libraries/Gradio_UI/Media_edit.py
ADDED
@@ -0,0 +1,273 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Media_edit.py
|
2 |
+
# Functions for Gradio Media_Edit UI
|
3 |
+
|
4 |
+
# Imports
|
5 |
+
import logging
|
6 |
+
import uuid
|
7 |
+
|
8 |
+
# External Imports
|
9 |
+
import gradio as gr
|
10 |
+
#
|
11 |
+
# Local Imports
|
12 |
+
from App_Function_Libraries.DB_Manager import add_prompt, update_media_content, db, add_or_update_prompt, \
|
13 |
+
load_prompt_details
|
14 |
+
from App_Function_Libraries.Gradio_UI.Gradio_Shared import update_dropdown, update_prompt_dropdown
|
15 |
+
from App_Function_Libraries.SQLite_DB import fetch_item_details
|
16 |
+
|
17 |
+
|
18 |
+
def create_media_edit_tab():
    """Build the "Edit Existing Items" tab.

    Flow: a search fills the item dropdown (and a label -> media-id mapping
    kept in ``gr.State``); selecting an item loads its content, prompt and
    summary into editable textboxes; the update button hands the edited
    values to ``update_media_content``.
    """
    # NOTE(review): container nesting was reconstructed from a flattened
    # listing -- confirm against the rendered layout.
    with gr.TabItem("Edit Existing Items"):
        gr.Markdown("# Search and Edit Media Items")

        with gr.Row():
            query_box = gr.Textbox(label="Search Query", placeholder="Enter your search query here...")
            search_mode = gr.Radio(choices=["Title", "URL", "Keyword", "Content"], value="Title", label="Search By")
            find_button = gr.Button("Search")

        with gr.Row():
            item_picker = gr.Dropdown(label="Select Item", choices=[], interactive=True)
            id_by_label = gr.State({})

        content_box = gr.Textbox(label="Edit Content", lines=10)
        prompt_box = gr.Textbox(label="Edit Prompt", lines=3)
        summary_box = gr.Textbox(label="Edit Summary", lines=5)

        save_button = gr.Button("Update Media Content")
        status_box = gr.Textbox(label="Status", interactive=False)

        find_button.click(
            fn=update_dropdown,
            inputs=[query_box, search_mode],
            outputs=[item_picker, id_by_label],
        )

        def load_selected_media_content(selected_item, item_mapping):
            """Return (content, prompt, summary) for the selected dropdown entry."""
            selection_is_valid = bool(
                selected_item and item_mapping and selected_item in item_mapping
            )
            if not selection_is_valid:
                return "No item selected or invalid selection", "", ""

            media_id = item_mapping[selected_item]
            # FIXME - fetch_item_details is not handled by DB_Manager!
            content, prompt, summary = fetch_item_details(media_id)
            return content, prompt, summary

        item_picker.change(
            fn=load_selected_media_content,
            inputs=[item_picker, id_by_label],
            outputs=[content_box, prompt_box, summary_box],
        )

        save_button.click(
            fn=update_media_content,
            inputs=[item_picker, id_by_label, content_box, prompt_box, summary_box],
            outputs=status_box,
        )
|
63 |
+
|
64 |
+
|
65 |
+
def create_media_edit_and_clone_tab():
    """Build the "Clone and Edit Existing Items" tab.

    Flow: search -> select an item (loads its content/prompt/summary) ->
    "Clone Item" reveals a new-title field and the save button -> "Save
    Cloned Item" inserts a new Media row with the edited text, a
    MediaModifications row, copies of the original's keyword links and a
    full-text-search entry.
    """
    # NOTE(review): container nesting was reconstructed from a flattened
    # listing -- confirm against the rendered layout.
    with gr.TabItem("Clone and Edit Existing Items"):
        gr.Markdown("# Search, Edit, and Clone Existing Items")

        with gr.Row():
            with gr.Column():
                search_query_input = gr.Textbox(label="Search Query", placeholder="Enter your search query here...")
                search_type_input = gr.Radio(choices=["Title", "URL", "Keyword", "Content"], value="Title",
                                             label="Search By")
            with gr.Column():
                search_button = gr.Button("Search")
                clone_button = gr.Button("Clone Item")
            save_clone_button = gr.Button("Save Cloned Item", visible=False)
        with gr.Row():
            items_output = gr.Dropdown(label="Select Item", choices=[], interactive=True)
            item_mapping = gr.State({})

        content_input = gr.Textbox(label="Edit Content", lines=10)
        prompt_input = gr.Textbox(label="Edit Prompt", lines=3)
        summary_input = gr.Textbox(label="Edit Summary", lines=5)
        new_title_input = gr.Textbox(label="New Title (for cloning)", visible=False)
        status_message = gr.Textbox(label="Status", interactive=False)

        search_button.click(
            fn=update_dropdown,
            inputs=[search_query_input, search_type_input],
            outputs=[items_output, item_mapping]
        )

        def load_selected_media_content(selected_item, item_mapping):
            """Load the selected item's fields and toggle the clone/save buttons.

            Returns values for (content, prompt, summary, clone button, save
            button). A valid selection shows the clone button and hides the
            save button; otherwise both are hidden.
            """
            if selected_item and item_mapping and selected_item in item_mapping:
                media_id = item_mapping[selected_item]
                content, prompt, summary = fetch_item_details(media_id)
                return content, prompt, summary, gr.update(visible=True), gr.update(visible=False)
            return "No item selected or invalid selection", "", "", gr.update(visible=False), gr.update(visible=False)

        items_output.change(
            fn=load_selected_media_content,
            inputs=[items_output, item_mapping],
            outputs=[content_input, prompt_input, summary_input, clone_button, save_clone_button]
        )

        def prepare_for_cloning(selected_item):
            """Reveal the new-title field (pre-filled) and the save button."""
            if not selected_item:
                # Nothing selected: do not offer a "Copy of None" title or a
                # save button that can only fail.
                return gr.update(visible=False), gr.update(visible=False)
            return gr.update(value=f"Copy of {selected_item}", visible=True), gr.update(visible=True)

        clone_button.click(
            fn=prepare_for_cloning,
            inputs=[items_output],
            outputs=[new_title_input, save_clone_button]
        )

        def save_cloned_item(selected_item, item_mapping, content, prompt, summary, new_title):
            """Persist a clone of the selected item.

            Returns a status message plus visibility updates for the
            new-title field and the save button: both are hidden on success
            and left visible on any failure so the user can retry.
            """
            if selected_item and item_mapping and selected_item in item_mapping:
                original_media_id = item_mapping[selected_item]
                try:
                    with db.get_connection() as conn:
                        cursor = conn.cursor()

                        # Fetch the original item's details
                        cursor.execute("SELECT type, url FROM Media WHERE id = ?", (original_media_id,))
                        original_row = cursor.fetchone()
                        if original_row is None:
                            # The mapping can be stale (e.g. the item was removed
                            # after the search); report that explicitly instead
                            # of failing on tuple unpacking.
                            logging.error(f"Error saving cloned item: original media ID {original_media_id} not found")
                            return (f"Error saving cloned item: original item (ID {original_media_id}) not found",
                                    gr.update(visible=True), gr.update(visible=True))
                        original_type, original_url = original_row

                        # Generate a new unique URL
                        new_url = f"{original_url}_clone_{uuid.uuid4().hex[:8]}"

                        # Insert new item into Media table
                        cursor.execute("""
                            INSERT INTO Media (title, content, url, type)
                            VALUES (?, ?, ?, ?)
                        """, (new_title, content, new_url, original_type))

                        new_media_id = cursor.lastrowid

                        # Insert new item into MediaModifications table
                        cursor.execute("""
                            INSERT INTO MediaModifications (media_id, prompt, summary, modification_date)
                            VALUES (?, ?, ?, CURRENT_TIMESTAMP)
                        """, (new_media_id, prompt, summary))

                        # Copy keywords from the original item
                        cursor.execute("""
                            INSERT INTO MediaKeywords (media_id, keyword_id)
                            SELECT ?, keyword_id
                            FROM MediaKeywords
                            WHERE media_id = ?
                        """, (new_media_id, original_media_id))

                        # Update full-text search index
                        cursor.execute("""
                            INSERT INTO media_fts (rowid, title, content)
                            VALUES (?, ?, ?)
                        """, (new_media_id, new_title, content))

                        conn.commit()

                        return f"Cloned item saved successfully with ID: {new_media_id}", gr.update(
                            visible=False), gr.update(visible=False)
                except Exception as e:
                    logging.error(f"Error saving cloned item: {e}")
                    return f"Error saving cloned item: {str(e)}", gr.update(visible=True), gr.update(visible=True)
            else:
                return "No item selected or invalid selection", gr.update(visible=True), gr.update(visible=True)

        save_clone_button.click(
            fn=save_cloned_item,
            inputs=[items_output, item_mapping, content_input, prompt_input, summary_input, new_title_input],
            outputs=[status_message, new_title_input, save_clone_button]
        )
|
173 |
+
|
174 |
+
|
175 |
+
def create_prompt_edit_tab():
    """Build the "Edit Prompts" tab.

    The left column lists stored prompts; the right column holds editable
    title/description/system/user fields. Selecting a prompt loads its
    details via ``load_prompt_details``; the add/update button submits the
    four fields to ``add_or_update_prompt``.
    """
    # NOTE(review): container nesting was reconstructed from a flattened
    # listing -- confirm against the rendered layout.
    with gr.TabItem("Edit Prompts"):
        with gr.Row():
            with gr.Column():
                prompt_picker = gr.Dropdown(label="Select Prompt", choices=[], interactive=True)
                list_button = gr.Button("List Prompts")

            with gr.Column():
                title_box = gr.Textbox(label="Title", placeholder="Enter the prompt title")
                description_box = gr.Textbox(label="Description", placeholder="Enter the prompt description", lines=3)
                system_box = gr.Textbox(label="System Prompt", placeholder="Enter the system prompt", lines=3)
                user_box = gr.Textbox(label="User Prompt", placeholder="Enter the user prompt", lines=3)
                submit_button = gr.Button("Add/Update Prompt")
                submit_result = gr.HTML()

        # The four editable fields, in the order both handlers below use them.
        prompt_fields = [title_box, description_box, system_box, user_box]

        # Populate the dropdown on demand.
        list_button.click(fn=update_prompt_dropdown, outputs=prompt_picker)

        # Save the edited (or new) prompt.
        submit_button.click(
            fn=add_or_update_prompt,
            inputs=prompt_fields,
            outputs=submit_result,
        )

        # Load prompt details when selected
        prompt_picker.change(
            fn=load_prompt_details,
            inputs=[prompt_picker],
            outputs=prompt_fields,
        )
|
212 |
+
|
213 |
+
|
214 |
+
def create_prompt_clone_tab():
    """Build the "Clone and Edit Prompts" tab.

    Selecting a prompt loads its details into the editable fields; "Clone
    Selected Prompt" prefixes the title with "Copy of" and reveals the save
    button; "Save Cloned Prompt" stores the fields as a new prompt via
    ``add_prompt`` and refreshes the prompt dropdown on success.
    """
    # NOTE(review): container nesting was reconstructed from a flattened
    # listing -- confirm against the rendered layout.
    with gr.TabItem("Clone and Edit Prompts"):
        with gr.Row():
            with gr.Column():
                gr.Markdown("# Clone and Edit Prompts")
                prompt_dropdown = gr.Dropdown(
                    label="Select Prompt",
                    choices=[],
                    interactive=True
                )
                prompt_list_button = gr.Button("List Prompts")

            with gr.Column():
                title_input = gr.Textbox(label="Title", placeholder="Enter the prompt title")
                description_input = gr.Textbox(label="Description", placeholder="Enter the prompt description", lines=3)
                system_prompt_input = gr.Textbox(label="System Prompt", placeholder="Enter the system prompt", lines=3)
                user_prompt_input = gr.Textbox(label="User Prompt", placeholder="Enter the user prompt", lines=3)
                clone_prompt_button = gr.Button("Clone Selected Prompt")
                save_cloned_prompt_button = gr.Button("Save Cloned Prompt", visible=False)
                add_prompt_output = gr.HTML()

        # Event handlers
        prompt_list_button.click(
            fn=update_prompt_dropdown,
            outputs=prompt_dropdown
        )

        # Load prompt details when selected
        prompt_dropdown.change(
            fn=load_prompt_details,
            inputs=[prompt_dropdown],
            outputs=[title_input, description_input, system_prompt_input, user_prompt_input]
        )

        def prepare_for_cloning(selected_prompt):
            """Pre-fill the title with "Copy of ..." and reveal the save button."""
            if selected_prompt:
                return gr.update(value=f"Copy of {selected_prompt}"), gr.update(visible=True)
            # Nothing selected: leave the title untouched and keep save hidden.
            return gr.update(), gr.update(visible=False)

        clone_prompt_button.click(
            fn=prepare_for_cloning,
            inputs=[prompt_dropdown],
            outputs=[title_input, save_cloned_prompt_button]
        )

        def save_cloned_prompt(title, description, system_prompt, user_prompt):
            """Store the edited fields as a new prompt.

            Returns a (status message, dropdown update) pair. The dropdown is
            refreshed only when ``add_prompt`` reports success.
            """
            try:
                result = add_prompt(title, description, system_prompt, user_prompt)
                if result == "Prompt added successfully.":
                    # update_prompt_dropdown is wired above as a direct handler
                    # for prompt_dropdown, so its return value already is the
                    # dropdown payload; wrapping it in gr.update(choices=...)
                    # would pass that payload where a list of choices belongs.
                    return result, update_prompt_dropdown()
                else:
                    return result, gr.update()
            except Exception as e:
                return f"Error saving cloned prompt: {str(e)}", gr.update()

        save_cloned_prompt_button.click(
            fn=save_cloned_prompt,
            inputs=[title_input, description_input, system_prompt_input, user_prompt_input],
            outputs=[add_prompt_output, prompt_dropdown]
        )
|
App_Function_Libraries/Gradio_UI/PDF_ingestion_tab.py
ADDED
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# PDF_ingestion_tab.py
|
2 |
+
# Gradio UI for ingesting PDFs into the database
|
3 |
+
import os
|
4 |
+
import shutil
|
5 |
+
import tempfile
|
6 |
+
|
7 |
+
# Imports
|
8 |
+
#
|
9 |
+
# External Imports
|
10 |
+
import gradio as gr
|
11 |
+
#
|
12 |
+
# Local Imports
|
13 |
+
from App_Function_Libraries.DB_Manager import load_preset_prompts
|
14 |
+
from App_Function_Libraries.Gradio_UI.Chat_ui import update_user_prompt
|
15 |
+
from App_Function_Libraries.PDF_Ingestion_Lib import extract_metadata_from_pdf, extract_text_and_format_from_pdf, \
|
16 |
+
process_and_cleanup_pdf
|
17 |
+
#
|
18 |
+
#
|
19 |
+
########################################################################################################################
|
20 |
+
#
|
21 |
+
# Functions:
|
22 |
+
|
23 |
+
def create_pdf_ingestion_tab():
    """Build the "PDF Ingestion" tab.

    The left column collects the uploaded PDF, optional title/author/keywords
    and the prompt options (custom prompt, preset prompt, system prompt); the
    right column shows the ingestion result. The ingest button passes the
    file, title, author and keywords to ``process_and_cleanup_pdf``.
    """
    # Default text of the (initially hidden) system prompt textbox.
    default_system_prompt = """
<s>You are a bulleted notes specialist.
[INST]```When creating comprehensive bulleted notes, you should follow these guidelines: Use multiple headings based on the referenced topics, not categories like quotes or terms. Headings should be surrounded by bold formatting and not be listed as bullet points themselves. Leave no space between headings and their corresponding list items underneath. Important terms within the content should be emphasized by setting them in bold font. Any text that ends with a colon should also be bolded. Before submitting your response, review the instructions, and make any corrections necessary to adhered to the specified format. Do not reference these instructions within the notes.``` \nBased on the content between backticks create comprehensive bulleted notes.[/INST]
**Bulleted Note Creation Guidelines**

**Headings**:
- Based on referenced topics, not categories like quotes or terms
- Surrounded by **bold** formatting
- Not listed as bullet points
- No space between headings and list items underneath

**Emphasis**:
- **Important terms** set in bold font
- **Text ending in a colon**: also bolded

**Review**:
- Ensure adherence to specified format
- Do not reference these instructions in your response.</s>[INST] {{ .Prompt }} [/INST]"""

    # NOTE(review): container nesting was reconstructed from a flattened
    # listing -- confirm against the rendered layout.
    with gr.TabItem("PDF Ingestion"):
        # TODO - Add functionality to extract metadata from pdf as part of conversion process in marker
        gr.Markdown("# Ingest PDF Files and Extract Metadata")
        with gr.Row():
            with gr.Column():
                # Hidden holder for the uploaded file; filled by the upload button.
                uploaded_pdf = gr.File(label="Uploaded PDF File", file_types=[".pdf"], visible=False)
                upload_button = gr.UploadButton("Click to Upload PDF", file_types=[".pdf"])
                title_box = gr.Textbox(label="Title (Optional)")
                author_box = gr.Textbox(label="Author (Optional)")
                keywords_box = gr.Textbox(label="Keywords (Optional, comma-separated)")
                with gr.Row():
                    use_custom_prompt = gr.Checkbox(label="Use a Custom Prompt", value=False, visible=True)
                    use_preset_prompt = gr.Checkbox(label="Use a pre-set Prompt", value=False, visible=True)
                with gr.Row():
                    preset_picker = gr.Dropdown(label="Select Preset Prompt",
                                                choices=load_preset_prompts(),
                                                visible=False)
                with gr.Row():
                    custom_prompt_box = gr.Textbox(label="Custom Prompt",
                                                   placeholder="Enter custom prompt here",
                                                   lines=3,
                                                   visible=False)
                with gr.Row():
                    system_prompt_box = gr.Textbox(label="System Prompt",
                                                   value=default_system_prompt,
                                                   lines=3,
                                                   visible=False)

                def toggle_custom_prompt_fields(checked):
                    """Show or hide the custom and system prompt boxes together."""
                    return gr.update(visible=checked), gr.update(visible=checked)

                def toggle_preset_picker(checked):
                    """Show or hide the preset prompt dropdown."""
                    return gr.update(visible=checked)

                use_custom_prompt.change(
                    fn=toggle_custom_prompt_fields,
                    inputs=[use_custom_prompt],
                    outputs=[custom_prompt_box, system_prompt_box],
                )
                use_preset_prompt.change(
                    fn=toggle_preset_picker,
                    inputs=[use_preset_prompt],
                    outputs=[preset_picker],
                )

                def update_prompts(preset_name):
                    """Fill both prompt boxes from the chosen preset and reveal them."""
                    preset = update_user_prompt(preset_name)
                    user_update = gr.update(value=preset["user_prompt"], visible=True)
                    system_update = gr.update(value=preset["system_prompt"], visible=True)
                    return user_update, system_update

                preset_picker.change(
                    update_prompts,
                    inputs=preset_picker,
                    outputs=[custom_prompt_box, system_prompt_box],
                )

                ingest_button = gr.Button("Ingest PDF")

                def forward_upload(file):
                    """Pass the uploaded file straight through to the hidden File component."""
                    return file

                upload_button.upload(fn=forward_upload, inputs=upload_button, outputs=uploaded_pdf)
            with gr.Column():
                result_box = gr.Textbox(label="Result")

            ingest_button.click(
                fn=process_and_cleanup_pdf,
                inputs=[uploaded_pdf, title_box, author_box, keywords_box],
                outputs=result_box,
            )
|
108 |
+
|
109 |
+
|
110 |
+
def test_pdf_ingestion(pdf_file):
    """Dry-run PDF ingestion on an uploaded file without storing anything.

    Args:
        pdf_file: uploaded file object exposing a ``name`` path attribute, or
            ``None`` when nothing was uploaded.

    Returns:
        tuple: ``(status message, extracted markdown text)``; the text is an
        empty string when no file was given or any step raised.
    """
    if pdf_file is None:
        return "No file uploaded", ""

    try:
        # Work on a copy inside a throw-away directory that is removed when
        # the ``with`` block exits.
        with tempfile.TemporaryDirectory() as scratch_dir:
            working_copy = os.path.join(scratch_dir, "temp.pdf")
            shutil.copy(pdf_file.name, working_copy)

            # Extract text and convert to Markdown
            markdown_text = extract_text_and_format_from_pdf(working_copy)

            # Extract metadata from PDF
            metadata = extract_metadata_from_pdf(working_copy)

            # Fall back to the uploaded file's base name (without extension)
            # and to 'Unknown' when the metadata lacks these keys.
            fallback_title = os.path.splitext(os.path.basename(pdf_file.name))[0]
            title = metadata.get('title', fallback_title)
            author = metadata.get('author', 'Unknown')

            return f"PDF '{title}' by {author} processed successfully.", markdown_text
    except Exception as e:
        return f"Error ingesting PDF: {str(e)}", ""
|
137 |
+
|
138 |
+
def create_pdf_ingestion_test_tab():
    """Build the "Test PDF Ingestion" tab.

    The user uploads a PDF and clicks the test button; ``test_pdf_ingestion``
    then fills the result textbox and the extracted-content textbox.
    """
    # NOTE(review): container nesting was reconstructed from a flattened
    # listing -- confirm against the rendered layout.
    with gr.TabItem("Test PDF Ingestion"):
        with gr.Row():
            with gr.Column():
                upload = gr.File(label="Upload PDF for testing")
                run_button = gr.Button("Test PDF Ingestion")
            with gr.Column():
                outcome_box = gr.Textbox(label="Test Result")
                content_box = gr.Textbox(label="PDF Content", lines=200)
        run_button.click(
            fn=test_pdf_ingestion,
            inputs=[upload],
            outputs=[outcome_box, content_box],
        )
|
152 |
+
|
App_Function_Libraries/Gradio_UI/Podcast_tab.py
ADDED
@@ -0,0 +1,164 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Podcast_tab.py
|
2 |
+
# Description: Gradio UI for ingesting podcasts into the database
|
3 |
+
#
|
4 |
+
# Imports
|
5 |
+
#
|
6 |
+
#
|
7 |
+
# External Imports
|
8 |
+
import gradio as gr
|
9 |
+
#
|
10 |
+
# Local Imports
|
11 |
+
from App_Function_Libraries.Audio_Files import process_podcast
|
12 |
+
from App_Function_Libraries.DB_Manager import load_preset_prompts
|
13 |
+
from App_Function_Libraries.Gradio_UI.Gradio_Shared import whisper_models, update_user_prompt
|
14 |
+
|
15 |
+
|
16 |
+
#
|
17 |
+
########################################################################################################################
|
18 |
+
#
|
19 |
+
# Functions:
|
20 |
+
|
21 |
+
|
22 |
+
def create_podcast_tab():
|
23 |
+
with gr.TabItem("Podcast"):
|
24 |
+
gr.Markdown("# Podcast Transcription and Ingestion")
|
25 |
+
with gr.Row():
|
26 |
+
with gr.Column():
|
27 |
+
podcast_url_input = gr.Textbox(label="Podcast URL", placeholder="Enter the podcast URL here")
|
28 |
+
podcast_title_input = gr.Textbox(label="Podcast Title", placeholder="Will be auto-detected if possible")
|
29 |
+
podcast_author_input = gr.Textbox(label="Podcast Author", placeholder="Will be auto-detected if possible")
|
30 |
+
|
31 |
+
podcast_keywords_input = gr.Textbox(
|
32 |
+
label="Keywords",
|
33 |
+
placeholder="Enter keywords here (comma-separated, include series name if applicable)",
|
34 |
+
value="podcast,audio",
|
35 |
+
elem_id="podcast-keywords-input"
|
36 |
+
)
|
37 |
+
|
38 |
+
with gr.Row():
|
39 |
+
podcast_custom_prompt_checkbox = gr.Checkbox(label="Use a Custom Prompt",
|
40 |
+
value=False,
|
41 |
+
visible=True)
|
42 |
+
preset_prompt_checkbox = gr.Checkbox(label="Use a pre-set Prompt",
|
43 |
+
value=False,
|
44 |
+
visible=True)
|
45 |
+
with gr.Row():
|
46 |
+
preset_prompt = gr.Dropdown(label="Select Preset Prompt",
|
47 |
+
choices=load_preset_prompts(),
|
48 |
+
visible=False)
|
49 |
+
with gr.Row():
|
50 |
+
podcast_custom_prompt_input = gr.Textbox(label="Custom Prompt",
|
51 |
+
placeholder="Enter custom prompt here",
|
52 |
+
lines=3,
|
53 |
+
visible=False)
|
54 |
+
with gr.Row():
|
55 |
+
system_prompt_input = gr.Textbox(label="System Prompt",
|
56 |
+
value="""<s>You are a bulleted notes specialist. [INST]```When creating comprehensive bulleted notes, you should follow these guidelines: Use multiple headings based on the referenced topics, not categories like quotes or terms. Headings should be surrounded by bold formatting and not be listed as bullet points themselves. Leave no space between headings and their corresponding list items underneath. Important terms within the content should be emphasized by setting them in bold font. Any text that ends with a colon should also be bolded. Before submitting your response, review the instructions, and make any corrections necessary to adhered to the specified format. Do not reference these instructions within the notes.``` \nBased on the content between backticks create comprehensive bulleted notes.[/INST]
|
57 |
+
**Bulleted Note Creation Guidelines**
|
58 |
+
|
59 |
+
**Headings**:
|
60 |
+
- Based on referenced topics, not categories like quotes or terms
|
61 |
+
- Surrounded by **bold** formatting
|
62 |
+
- Not listed as bullet points
|
63 |
+
- No space between headings and list items underneath
|
64 |
+
|
65 |
+
**Emphasis**:
|
66 |
+
- **Important terms** set in bold font
|
67 |
+
- **Text ending in a colon**: also bolded
|
68 |
+
|
69 |
+
**Review**:
|
70 |
+
- Ensure adherence to specified format
|
71 |
+
- Do not reference these instructions in your response.</s>[INST] {{ .Prompt }} [/INST]
|
72 |
+
""",
|
73 |
+
lines=3,
|
74 |
+
visible=False)
|
75 |
+
|
76 |
+
podcast_custom_prompt_checkbox.change(
|
77 |
+
fn=lambda x: (gr.update(visible=x), gr.update(visible=x)),
|
78 |
+
inputs=[podcast_custom_prompt_checkbox],
|
79 |
+
outputs=[podcast_custom_prompt_input, system_prompt_input]
|
80 |
+
)
|
81 |
+
preset_prompt_checkbox.change(
|
82 |
+
fn=lambda x: gr.update(visible=x),
|
83 |
+
inputs=[preset_prompt_checkbox],
|
84 |
+
outputs=[preset_prompt]
|
85 |
+
)
|
86 |
+
|
87 |
+
def update_prompts(preset_name):
|
88 |
+
prompts = update_user_prompt(preset_name)
|
89 |
+
return (
|
90 |
+
gr.update(value=prompts["user_prompt"], visible=True),
|
91 |
+
gr.update(value=prompts["system_prompt"], visible=True)
|
92 |
+
)
|
93 |
+
|
94 |
+
preset_prompt.change(
|
95 |
+
update_prompts,
|
96 |
+
inputs=preset_prompt,
|
97 |
+
outputs=[podcast_custom_prompt_input, system_prompt_input]
|
98 |
+
)
|
99 |
+
|
100 |
+
podcast_api_name_input = gr.Dropdown(
|
101 |
+
choices=[None, "Local-LLM", "OpenAI", "Anthropic", "Cohere", "Groq", "DeepSeek", "Mistral", "OpenRouter", "Llama.cpp",
|
102 |
+
"Kobold", "Ooba", "Tabbyapi", "VLLM","ollama", "HuggingFace"],
|
103 |
+
value=None,
|
104 |
+
label="API Name for Summarization (Optional)"
|
105 |
+
)
|
106 |
+
podcast_api_key_input = gr.Textbox(label="API Key (if required)", type="password")
|
107 |
+
podcast_whisper_model_input = gr.Dropdown(choices=whisper_models, value="medium", label="Whisper Model")
|
108 |
+
|
109 |
+
keep_original_input = gr.Checkbox(label="Keep original audio file", value=False)
|
110 |
+
enable_diarization_input = gr.Checkbox(label="Enable speaker diarization", value=False)
|
111 |
+
|
112 |
+
use_cookies_input = gr.Checkbox(label="Use cookies for yt-dlp", value=False)
|
113 |
+
cookies_input = gr.Textbox(
|
114 |
+
label="yt-dlp Cookies",
|
115 |
+
placeholder="Paste your cookies here (JSON format)",
|
116 |
+
lines=3,
|
117 |
+
visible=False
|
118 |
+
)
|
119 |
+
|
120 |
+
use_cookies_input.change(
|
121 |
+
fn=lambda x: gr.update(visible=x),
|
122 |
+
inputs=[use_cookies_input],
|
123 |
+
outputs=[cookies_input]
|
124 |
+
)
|
125 |
+
|
126 |
+
chunking_options_checkbox = gr.Checkbox(label="Show Chunking Options", value=False)
|
127 |
+
with gr.Row(visible=False) as chunking_options_box:
|
128 |
+
gr.Markdown("### Chunking Options")
|
129 |
+
with gr.Column():
|
130 |
+
chunk_method = gr.Dropdown(choices=['words', 'sentences', 'paragraphs', 'tokens'], label="Chunking Method")
|
131 |
+
max_chunk_size = gr.Slider(minimum=100, maximum=1000, value=300, step=50, label="Max Chunk Size")
|
132 |
+
chunk_overlap = gr.Slider(minimum=0, maximum=100, value=0, step=10, label="Chunk Overlap")
|
133 |
+
use_adaptive_chunking = gr.Checkbox(label="Use Adaptive Chunking")
|
134 |
+
use_multi_level_chunking = gr.Checkbox(label="Use Multi-level Chunking")
|
135 |
+
chunk_language = gr.Dropdown(choices=['english', 'french', 'german', 'spanish'], label="Chunking Language")
|
136 |
+
|
137 |
+
chunking_options_checkbox.change(
|
138 |
+
fn=lambda x: gr.update(visible=x),
|
139 |
+
inputs=[chunking_options_checkbox],
|
140 |
+
outputs=[chunking_options_box]
|
141 |
+
)
|
142 |
+
|
143 |
+
podcast_process_button = gr.Button("Process Podcast")
|
144 |
+
|
145 |
+
with gr.Column():
|
146 |
+
podcast_progress_output = gr.Textbox(label="Progress")
|
147 |
+
podcast_error_output = gr.Textbox(label="Error Messages")
|
148 |
+
podcast_transcription_output = gr.Textbox(label="Transcription")
|
149 |
+
podcast_summary_output = gr.Textbox(label="Summary")
|
150 |
+
download_transcription = gr.File(label="Download Transcription as JSON")
|
151 |
+
download_summary = gr.File(label="Download Summary as Text")
|
152 |
+
|
153 |
+
podcast_process_button.click(
|
154 |
+
fn=process_podcast,
|
155 |
+
inputs=[podcast_url_input, podcast_title_input, podcast_author_input,
|
156 |
+
podcast_keywords_input, podcast_custom_prompt_input, podcast_api_name_input,
|
157 |
+
podcast_api_key_input, podcast_whisper_model_input, keep_original_input,
|
158 |
+
enable_diarization_input, use_cookies_input, cookies_input,
|
159 |
+
chunk_method, max_chunk_size, chunk_overlap, use_adaptive_chunking,
|
160 |
+
use_multi_level_chunking, chunk_language],
|
161 |
+
outputs=[podcast_progress_output, podcast_transcription_output, podcast_summary_output,
|
162 |
+
podcast_title_input, podcast_author_input, podcast_keywords_input, podcast_error_output,
|
163 |
+
download_transcription, download_summary]
|
164 |
+
)
|
App_Function_Libraries/Gradio_UI/Re_summarize_tab.py
ADDED
@@ -0,0 +1,268 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Re_summarize_tab.py
|
2 |
+
# Gradio UI for Re-summarizing items in the database
|
3 |
+
#
|
4 |
+
# Imports
|
5 |
+
import json
|
6 |
+
import logging
|
7 |
+
#
|
8 |
+
# External Imports
|
9 |
+
import gradio as gr
|
10 |
+
#
|
11 |
+
# Local Imports
|
12 |
+
from App_Function_Libraries.Chunk_Lib import improved_chunking_process
|
13 |
+
from App_Function_Libraries.DB_Manager import update_media_content, load_preset_prompts
|
14 |
+
from App_Function_Libraries.Gradio_UI.Chat_ui import update_user_prompt
|
15 |
+
from App_Function_Libraries.Gradio_UI.Gradio_Shared import fetch_item_details, fetch_items_by_keyword, \
|
16 |
+
fetch_items_by_content, fetch_items_by_title_or_url
|
17 |
+
from App_Function_Libraries.Summarization_General_Lib import summarize_chunk
|
18 |
+
from App_Function_Libraries.Utils import load_comprehensive_config
|
19 |
+
#
|
20 |
+
#
|
21 |
+
######################################################################################################################
|
22 |
+
#
|
23 |
+
# Functions:
|
24 |
+
|
25 |
+
def create_resummary_tab():
    """Build the "Re-Summarize" tab and connect its event handlers.

    Presumably called from inside the application's Gradio Blocks/Tabs
    context (confirm against the caller).

    Returns:
        A tuple of the created components, in this order:
        search_query_input, search_type_input, search_button, items_output,
        item_mapping, api_name_input, api_key_input,
        chunking_options_checkbox, chunking_options_box, chunk_method,
        max_chunk_size, chunk_overlap, custom_prompt_checkbox,
        custom_prompt_input, resummarize_button, result_output.
    """
    # NOTE(review): the nesting of the layout contexts below was reconstructed
    # from statement order; confirm the column/row placement against the UI.
    with gr.TabItem("Re-Summarize"):
        gr.Markdown("# Re-Summarize Existing Content")
        with gr.Row():
            with gr.Column():
                search_query_input = gr.Textbox(label="Search Query", placeholder="Enter your search query here...")
                search_type_input = gr.Radio(choices=["Title", "URL", "Keyword", "Content"], value="Title", label="Search By")
                search_button = gr.Button("Search")

                items_output = gr.Dropdown(label="Select Item", choices=[], interactive=True)
                # Maps the dropdown label of each search hit to its media id.
                item_mapping = gr.State({})

                with gr.Row():
                    api_name_input = gr.Dropdown(
                        choices=["Local-LLM", "OpenAI", "Anthropic", "Cohere", "Groq", "DeepSeek", "Mistral", "OpenRouter",
                                 "Llama.cpp", "Kobold", "Ooba", "Tabbyapi", "VLLM", "ollama", "HuggingFace"],
                        value="Local-LLM", label="API Name")
                    api_key_input = gr.Textbox(label="API Key", placeholder="Enter your API key here", type="password")

                chunking_options_checkbox = gr.Checkbox(label="Use Chunking", value=False)
                with gr.Row(visible=False) as chunking_options_box:
                    chunk_method = gr.Dropdown(choices=['words', 'sentences', 'paragraphs', 'tokens', 'chapters'],
                                               label="Chunking Method", value='words')
                    max_chunk_size = gr.Slider(minimum=100, maximum=1000, value=300, step=50, label="Max Chunk Size")
                    chunk_overlap = gr.Slider(minimum=0, maximum=100, value=0, step=10, label="Chunk Overlap")

                with gr.Row():
                    custom_prompt_checkbox = gr.Checkbox(label="Use a Custom Prompt",
                                                         value=False,
                                                         visible=True)
                    preset_prompt_checkbox = gr.Checkbox(label="Use a pre-set Prompt",
                                                         value=False,
                                                         visible=True)
                with gr.Row():
                    preset_prompt = gr.Dropdown(label="Select Preset Prompt",
                                                choices=load_preset_prompts(),
                                                visible=False)
                with gr.Row():
                    custom_prompt_input = gr.Textbox(label="Custom Prompt",
                                                     placeholder="Enter custom prompt here",
                                                     lines=3,
                                                     visible=False)
                with gr.Row():
                    system_prompt_input = gr.Textbox(label="System Prompt",
                                                     value="""<s>You are a bulleted notes specialist. [INST]```When creating comprehensive bulleted notes, you should follow these guidelines: Use multiple headings based on the referenced topics, not categories like quotes or terms. Headings should be surrounded by bold formatting and not be listed as bullet points themselves. Leave no space between headings and their corresponding list items underneath. Important terms within the content should be emphasized by setting them in bold font. Any text that ends with a colon should also be bolded. Before submitting your response, review the instructions, and make any corrections necessary to adhered to the specified format. Do not reference these instructions within the notes.``` \nBased on the content between backticks create comprehensive bulleted notes.[/INST]
**Bulleted Note Creation Guidelines**

**Headings**:
- Based on referenced topics, not categories like quotes or terms
- Surrounded by **bold** formatting
- Not listed as bullet points
- No space between headings and list items underneath

**Emphasis**:
- **Important terms** set in bold font
- **Text ending in a colon**: also bolded

**Review**:
- Ensure adherence to specified format
- Do not reference these instructions in your response.</s>[INST] {{ .Prompt }} [/INST]
""",
                                                     lines=3,
                                                     visible=False)

                def update_prompts(preset_name):
                    """Load the chosen preset into the prompt boxes and reveal them."""
                    prompts = update_user_prompt(preset_name)
                    return (
                        gr.update(value=prompts["user_prompt"], visible=True),
                        gr.update(value=prompts["system_prompt"], visible=True)
                    )

                preset_prompt.change(
                    update_prompts,
                    inputs=preset_prompt,
                    outputs=[custom_prompt_input, system_prompt_input]
                )

                resummarize_button = gr.Button("Re-Summarize")
            with gr.Column():
                result_output = gr.Textbox(label="Result")

        # Toggle both prompt boxes with the "custom prompt" checkbox. This
        # handler used to be registered twice; a single registration suffices.
        custom_prompt_checkbox.change(
            fn=lambda x: (gr.update(visible=x), gr.update(visible=x)),
            inputs=[custom_prompt_checkbox],
            outputs=[custom_prompt_input, system_prompt_input]
        )
        preset_prompt_checkbox.change(
            fn=lambda x: gr.update(visible=x),
            inputs=[preset_prompt_checkbox],
            outputs=[preset_prompt]
        )

        # Connect the UI elements
        search_button.click(
            fn=update_resummarize_dropdown,
            inputs=[search_query_input, search_type_input],
            outputs=[items_output, item_mapping]
        )

        chunking_options_checkbox.change(
            fn=lambda x: gr.update(visible=x),
            inputs=[chunking_options_checkbox],
            outputs=[chunking_options_box]
        )

        resummarize_button.click(
            fn=resummarize_content_wrapper,
            inputs=[items_output, item_mapping, api_name_input, api_key_input, chunking_options_checkbox, chunk_method,
                    max_chunk_size, chunk_overlap, custom_prompt_checkbox, custom_prompt_input],
            outputs=result_output
        )

    return (search_query_input, search_type_input, search_button, items_output, item_mapping, api_name_input,
            api_key_input, chunking_options_checkbox, chunking_options_box, chunk_method, max_chunk_size,
            chunk_overlap, custom_prompt_checkbox, custom_prompt_input, resummarize_button, result_output)
|
144 |
+
|
145 |
+
|
146 |
+
def update_resummarize_dropdown(search_query, search_type):
    """Search the database and produce new choices for the item dropdown.

    Args:
        search_query: Text to search for.
        search_type: One of 'Title', 'URL', 'Keyword'; any other value is
            treated as a content search.

    Returns:
        A pair ``(dropdown_update, item_mapping)`` where ``dropdown_update`` is
        a ``gr.update`` carrying the option labels and ``item_mapping`` maps
        each label (``"<item[1]> (<item[2]>)"``) to ``item[0]``.
    """
    if search_type in ['Title', 'URL']:
        results = fetch_items_by_title_or_url(search_query, search_type)
    elif search_type == 'Keyword':
        results = fetch_items_by_keyword(search_query)
    else:  # Content
        results = fetch_items_by_content(search_query)

    # Tolerate a fetcher that yields None instead of an empty list.
    results = results or []

    # Build each label once and reuse it for both the choices and the mapping.
    item_options = [f"{item[1]} ({item[2]})" for item in results]
    item_mapping = {label: item[0] for label, item in zip(item_options, results)}
    logging.debug(f"item_options: {item_options}")
    logging.debug(f"item_mapping: {item_mapping}")
    return gr.update(choices=item_options), item_mapping
|
159 |
+
|
160 |
+
|
161 |
+
def resummarize_content_wrapper(selected_item, item_mapping, api_name, api_key=None, chunking_options_checkbox=None, chunk_method=None,
                                max_chunk_size=None, chunk_overlap=None, custom_prompt_checkbox=None, custom_prompt=None):
    """Validate the UI inputs and delegate to ``resummarize_content``.

    Args:
        selected_item: Label chosen in the item dropdown.
        item_mapping: Dict mapping labels to media ids, or a JSON string
            encoding such a dict.
        api_name: Name of the summarization API to use.
        api_key: Optional API key.
        chunking_options_checkbox: Truthy when chunking should be applied.
        chunk_method: Chunking method name passed through to the chunker.
        max_chunk_size: Maximum chunk size (converted to ``int`` when given).
        chunk_overlap: Chunk overlap (converted to ``int`` when given).
        custom_prompt_checkbox: Truthy when ``custom_prompt`` should be used.
        custom_prompt: User-supplied summarization prompt.

    Returns:
        A human-readable status/result string; validation problems are
        reported as messages rather than raised.
    """
    logging.debug(f"resummarize_content_wrapper called with item_mapping type: {type(item_mapping)}")
    logging.debug(f"selected_item: {selected_item}")

    if not selected_item or not api_name:
        return "Please select an item and provide API details."

    # Handle potential string representation of item_mapping
    if isinstance(item_mapping, str):
        try:
            item_mapping = json.loads(item_mapping)
        except json.JSONDecodeError:
            return f"Error: item_mapping is a string but not valid JSON. Value: {item_mapping[:100]}..."

    if not isinstance(item_mapping, dict):
        return f"Error: item_mapping is not a dictionary or valid JSON string. Type: {type(item_mapping)}"

    media_id = item_mapping.get(selected_item)
    if not media_id:
        return f"Invalid selection. Selected item: {selected_item}, Available items: {list(item_mapping.keys())[:5]}..."

    # Only the content is needed here; the stored prompt and summary are ignored.
    content, _, _ = fetch_item_details(media_id)

    if not content:
        return "No content available for re-summarization."

    # Prepare chunking options (None disables chunking downstream)
    chunk_options = None
    if chunking_options_checkbox:
        chunk_options = {
            'method': chunk_method,
            'max_size': int(max_chunk_size) if max_chunk_size is not None else None,
            'overlap': int(chunk_overlap) if chunk_overlap is not None else None,
            'language': 'english',
            'adaptive': True,
            'multi_level': False,
        }

    # Prepare summarization prompt (None lets the callee pick its default)
    summarization_prompt = custom_prompt if custom_prompt_checkbox and custom_prompt else None

    logging.debug(f"Calling resummarize_content with media_id: {media_id}")
    return resummarize_content(selected_item, item_mapping, content, api_name, api_key, chunk_options, summarization_prompt)
|
206 |
+
|
207 |
+
|
208 |
+
# FIXME - should be moved...
|
209 |
+
def resummarize_content(selected_item, item_mapping, content, api_name, api_key=None, chunk_options=None, summarization_prompt=None):
    """Summarize ``content`` again and store the new summary in the database.

    Args:
        selected_item: Dropdown label of the item being re-summarized.
        item_mapping: Mapping of dropdown labels to media ids.
        content: Full text to summarize.
        api_name: Name of the summarization API passed to ``summarize_chunk``.
        api_key: Optional API key.
        chunk_options: Options dict for ``improved_chunking_process``; when
            falsy the content is summarized as a single chunk.
        summarization_prompt: Prompt to use; when falsy, the configured
            ``default_summary_prompt`` (or a built-in fallback) is used.

    Returns:
        A status string describing success or the failure that occurred.
    """
    logging.debug(f"resummarize_content called with selected_item: {selected_item}")

    # Chunking logic
    if chunk_options:
        chunks = improved_chunking_process(content, chunk_options)
    else:
        chunks = [{'text': content, 'metadata': {}}]

    # Use default prompt if not provided; the configuration is only loaded
    # when it is actually needed.
    if not summarization_prompt:
        config = load_comprehensive_config()
        summarization_prompt = config.get('Prompts', 'default_summary_prompt', fallback="""<s>You are a bulleted notes specialist. [INST]```When creating comprehensive bulleted notes, you should follow these guidelines: Use multiple headings based on the referenced topics, not categories like quotes or terms. Headings should be surrounded by bold formatting and not be listed as bullet points themselves. Leave no space between headings and their corresponding list items underneath. Important terms within the content should be emphasized by setting them in bold font. Any text that ends with a colon should also be bolded. Before submitting your response, review the instructions, and make any corrections necessary to adhered to the specified format. Do not reference these instructions within the notes.``` \nBased on the content between backticks create comprehensive bulleted notes.[/INST]
**Bulleted Note Creation Guidelines**

**Headings**:
- Based on referenced topics, not categories like quotes or terms
- Surrounded by **bold** formatting
- Not listed as bullet points
- No space between headings and list items underneath

**Emphasis**:
- **Important terms** set in bold font
- **Text ending in a colon**: also bolded

**Review**:
- Ensure adherence to specified format
- Do not reference these instructions in your response.</s>[INST] {{ .Prompt }} [/INST]""")

    # Summarization logic: any exception aborts the whole run, while an empty
    # summary for a chunk is only logged and skipped.
    summaries = []
    for chunk in chunks:
        chunk_text = chunk['text']
        try:
            chunk_summary = summarize_chunk(api_name, chunk_text, summarization_prompt, api_key)
        except Exception as e:
            logging.error(f"Error during summarization: {str(e)}", exc_info=True)
            return f"Error during summarization: {str(e)}"
        if chunk_summary:
            summaries.append(chunk_summary)
        else:
            logging.warning(f"Summarization failed for chunk: {chunk_text[:100]}...")

    if not summaries:
        return "Summarization failed for all chunks."

    new_summary = " ".join(summaries)

    # Update the database with the new summary
    try:
        update_result = update_media_content(selected_item, item_mapping, content, summarization_prompt, new_summary)
        if "successfully" in update_result.lower():
            return f"Re-summarization complete. New summary: {new_summary}..."
        return f"Error during database update: {update_result}"
    except Exception as e:
        logging.error(f"Error updating database: {str(e)}", exc_info=True)
        return f"Error updating database: {str(e)}"
|
App_Function_Libraries/Gradio_UI/Search_Tab.py
ADDED
@@ -0,0 +1,487 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Search_Tab.py
|
2 |
+
# Description: This file contains the code for the search tab in the Gradio UI
|
3 |
+
#
|
4 |
+
# Imports
|
5 |
+
import html
|
6 |
+
import logging
|
7 |
+
import sqlite3
|
8 |
+
|
9 |
+
#
|
10 |
+
# External Imports
|
11 |
+
import gradio as gr
|
12 |
+
|
13 |
+
from App_Function_Libraries.DB_Manager import view_database, search_and_display_items
|
14 |
+
from App_Function_Libraries.Gradio_UI.Gradio_Shared import update_dropdown, update_detailed_view
|
15 |
+
from App_Function_Libraries.RAG_Libary_2 import rag_search
|
16 |
+
|
17 |
+
#
|
18 |
+
# Local Imports
|
19 |
+
#
|
20 |
+
#
|
21 |
+
###################################################################################################
|
22 |
+
#
|
23 |
+
# Functions:
|
24 |
+
|
25 |
+
logger = logging.getLogger()
|
26 |
+
|
27 |
+
|
28 |
+
|
29 |
+
|
30 |
+
# FIXME - SQL functions to be moved to DB_Manager
|
31 |
+
def search_prompts(query):
    """Search the prompts database by name or details.

    Performs a substring (``LIKE %query%``) match against the ``name`` and
    ``details`` columns of the ``Prompts`` table in ``prompts.db``.

    Args:
        query: Text to look for.

    Returns:
        A list of ``(name, details, system, user)`` tuples; an empty list when
        nothing matches or a SQLite error occurs (the error is logged).
    """
    conn = None
    try:
        conn = sqlite3.connect('prompts.db')
        cursor = conn.cursor()
        pattern = f"%{query}%"
        cursor.execute("SELECT name, details, system, user FROM Prompts WHERE name LIKE ? OR details LIKE ?",
                       (pattern, pattern))
        return cursor.fetchall()
    except sqlite3.Error as e:
        logging.error(f"Error searching prompts: {e}")
        return []
    finally:
        # Close the connection on the error path as well, so it is never leaked.
        if conn is not None:
            conn.close()
|
43 |
+
|
44 |
+
|
45 |
+
|
46 |
+
|
47 |
+
|
48 |
+
|
49 |
+
|
50 |
+
|
51 |
+
|
52 |
+
|
53 |
+
|
54 |
+
|
55 |
+
def create_rag_tab():
    """Build the "RAG Search" tab: a question box, API selector and answer view.

    Clicking "Search" calls ``rag_search`` with the question and chosen API
    and shows the returned answer; the retrieved context is written to a
    textbox that is created hidden.
    """
    # NOTE(review): layout nesting reconstructed from statement order; confirm.
    with gr.TabItem("RAG Search"):
        gr.Markdown("# Retrieval-Augmented Generation (RAG) Search")

        with gr.Row():
            with gr.Column():
                search_query = gr.Textbox(label="Enter your question", placeholder="What would you like to know?")
                api_choice = gr.Dropdown(
                    choices=["Local-LLM", "OpenAI", "Anthropic", "Cohere", "Groq", "DeepSeek", "Mistral", "OpenRouter", "Llama.cpp", "Kobold", "Ooba", "Tabbyapi", "VLLM", "ollama", "HuggingFace"],
                    label="Select API for RAG",
                    value="OpenAI"
                )
                search_button = gr.Button("Search")

            with gr.Column():
                result_output = gr.Textbox(label="Answer", lines=10)
                context_output = gr.Textbox(label="Context", lines=10, visible=False)

        def perform_rag_search(query, api_choice):
            """Run the RAG search and split the result into answer and context."""
            result = rag_search(query, api_choice)
            return result['answer'], result['context']

        search_button.click(perform_rag_search, inputs=[search_query, api_choice], outputs=[result_output, context_output])
|
78 |
+
|
79 |
+
# FIXME - under construction
|
80 |
+
def create_embeddings_tab():
    """Build the "Create Embeddings" tab.

    Clicking the button calls ``create_all_embeddings`` with the selected API
    and shows its status; any exception is reported as an ``"Error: ..."``
    string in the status box instead of being raised.
    """
    # NOTE(review): layout nesting reconstructed from statement order; confirm.
    with gr.TabItem("Create Embeddings"):
        gr.Markdown("# Create Embeddings for All Content")

        with gr.Row():
            with gr.Column():
                embedding_api_choice = gr.Dropdown(
                    choices=["OpenAI", "Local", "HuggingFace"],
                    label="Select API for Embeddings",
                    value="OpenAI"
                )
                create_button = gr.Button("Create Embeddings")

            with gr.Column():
                status_output = gr.Textbox(label="Status", lines=10)

        def create_embeddings(api_choice):
            """Create embeddings via the ChromaDB library and return a status string."""
            try:
                # Imported lazily, inside the handler, as in the original code.
                from App_Function_Libraries.ChromaDB_Library import create_all_embeddings
                return create_all_embeddings(api_choice)
            except Exception as e:
                return f"Error: {str(e)}"

        create_button.click(create_embeddings, inputs=[embedding_api_choice], outputs=status_output)
|
106 |
+
|
107 |
+
|
108 |
+
|
109 |
+
|
110 |
+
def create_search_tab():
    """Build the "Search / Detailed View" tab.

    The search button fills the item dropdown (and its label-to-id mapping)
    via ``update_dropdown``; selecting an item renders its prompt/summary and
    content via ``update_detailed_view``.
    """
    # NOTE(review): layout nesting reconstructed from statement order; confirm.
    with gr.TabItem("Search / Detailed View"):
        with gr.Row():
            with gr.Column():
                gr.Markdown("# Search across all ingested items in the Database")
                gr.Markdown(" by Title / URL / Keyword / or Content via SQLite Full-Text-Search")
                search_query_input = gr.Textbox(label="Search Query", placeholder="Enter your search query here...")
                search_type_input = gr.Radio(choices=["Title", "URL", "Keyword", "Content"], value="Title", label="Search By")
                search_button = gr.Button("Search")
                items_output = gr.Dropdown(label="Select Item", choices=[])
                # Holds the mapping produced by update_dropdown for later lookups.
                item_mapping = gr.State({})
                prompt_summary_output = gr.HTML(label="Prompt & Summary", visible=True)

                search_button.click(
                    fn=update_dropdown,
                    inputs=[search_query_input, search_type_input],
                    outputs=[items_output, item_mapping]
                )
            with gr.Column():
                content_output = gr.Markdown(label="Content", visible=True)
                items_output.change(
                    fn=update_detailed_view,
                    inputs=[items_output, item_mapping],
                    outputs=[prompt_summary_output, content_output]
                )
|
135 |
+
|
136 |
+
|
137 |
+
def display_search_results(query):
    """Search the prompt database and format the hits as Markdown.

    Args:
        query: Search text; ``None``, empty and whitespace-only values are
            rejected with a hint message.

    Returns:
        A Markdown string listing each hit, ``"No results found."`` when the
        search returns nothing, or ``"Please enter a search query."`` for a
        blank query.
    """
    if not query or not query.strip():
        return "Please enter a search query."

    results = search_prompts(query)
    logging.debug("Processed search results for query '%s': %s", query, results)

    if not results:
        return "No results found."

    # Collect the fragments and join once instead of repeated string +=.
    parts = ["## Search Results:\n"]
    for result in results:
        logging.debug("Result item: %s", result)

        if len(result) == 2:
            name, details = result
            parts.append(f"**Title:** {name}\n\n**Description:** {details}\n\n---\n")
        elif len(result) == 4:
            name, details, system, user = result
            parts.append(f"**Title:** {name}\n\n")
            parts.append(f"**Description:** {details}\n\n")
            parts.append(f"**System Prompt:** {system}\n\n")
            parts.append(f"**User Prompt:** {user}\n\n")
            parts.append("---\n")
        else:
            parts.append("Error: Unexpected result format.\n\n---\n")
    return "".join(parts)
|
167 |
+
|
168 |
+
|
169 |
+
def create_viewing_tab():
    """Build the "View Database" tab with simple page-by-page navigation.

    The three buttons all refresh the results, the pagination text, the page
    number field, and the enabled state of the next/previous buttons.
    """
    # NOTE(review): layout nesting reconstructed from statement order; confirm.
    with gr.TabItem("View Database"):
        gr.Markdown("# View Database Entries")
        with gr.Row():
            with gr.Column():
                entries_per_page = gr.Dropdown(choices=[10, 20, 50, 100], label="Entries per Page", value=10)
                page_number = gr.Number(value=1, label="Page Number", precision=0)
                view_button = gr.Button("View Page")
                next_page_button = gr.Button("Next Page")
                previous_page_button = gr.Button("Previous Page")
            with gr.Column():
                results_display = gr.HTML()
                pagination_info = gr.Textbox(label="Pagination Info", interactive=False)

        def update_page(page, entries_per_page):
            """Fetch one page and compute which navigation buttons stay enabled."""
            results, pagination, total_pages = view_database(page, entries_per_page)
            next_disabled = page >= total_pages
            prev_disabled = page <= 1
            return results, pagination, page, gr.update(interactive=not next_disabled), gr.update(interactive=not prev_disabled)

        def go_to_next_page(current_page, entries_per_page):
            """Advance by one page."""
            return update_page(current_page + 1, entries_per_page)

        def go_to_previous_page(current_page, entries_per_page):
            """Step back one page, never going below page 1."""
            return update_page(max(1, current_page - 1), entries_per_page)

        # All three handlers share the same input and output components.
        page_inputs = [page_number, entries_per_page]
        page_outputs = [results_display, pagination_info, page_number, next_page_button, previous_page_button]

        view_button.click(
            fn=update_page,
            inputs=page_inputs,
            outputs=page_outputs
        )

        next_page_button.click(
            fn=go_to_next_page,
            inputs=page_inputs,
            outputs=page_outputs
        )

        previous_page_button.click(
            fn=go_to_previous_page,
            inputs=page_inputs,
            outputs=page_outputs
        )
|
214 |
+
|
215 |
+
|
216 |
+
def create_search_summaries_tab():
    """Build the "Search/View Title+Summary" tab with paged search results.

    Searching and paging both call ``search_and_display_items`` and refresh
    the results HTML, the pagination text, the page number field, and the
    enabled state of the next/previous buttons.
    """
    # NOTE(review): layout nesting (in particular where the results HTML sits)
    # was reconstructed from statement order; confirm against the UI.
    with gr.TabItem("Search/View Title+Summary "):
        gr.Markdown("# Search across all ingested items in the Database and review their summaries")
        gr.Markdown("Search by Title / URL / Keyword / or Content via SQLite Full-Text-Search")
        with gr.Row():
            with gr.Column():
                search_query_input = gr.Textbox(label="Search Query", placeholder="Enter your search query here...")
                search_type_input = gr.Radio(choices=["Title", "URL", "Keyword", "Content"], value="Title",
                                             label="Search By")
                entries_per_page = gr.Dropdown(choices=[10, 20, 50, 100], label="Entries per Page", value=10)
                page_number = gr.Number(value=1, label="Page Number", precision=0)
                char_count_input = gr.Number(value=5000, label="Amount of characters to display from the main content",
                                             precision=0)
            with gr.Column():
                search_button = gr.Button("Search")
                next_page_button = gr.Button("Next Page")
                previous_page_button = gr.Button("Previous Page")
                pagination_info = gr.Textbox(label="Pagination Info", interactive=False)
        search_results_output = gr.HTML()

        def update_search_page(query, search_type, page, entries_per_page, char_count):
            """Run the search for one page and compute the button states."""
            # Ensure char_count is a positive integer (falls back to 5000 when unset)
            char_count = max(1, int(char_count)) if char_count else 5000
            results, pagination, total_pages = search_and_display_items(query, search_type, page, entries_per_page, char_count)
            next_disabled = page >= total_pages
            prev_disabled = page <= 1
            return results, pagination, page, gr.update(interactive=not next_disabled), gr.update(
                interactive=not prev_disabled)

        def go_to_next_search_page(query, search_type, current_page, entries_per_page, char_count):
            """Advance by one page of search results."""
            return update_search_page(query, search_type, current_page + 1, entries_per_page, char_count)

        def go_to_previous_search_page(query, search_type, current_page, entries_per_page, char_count):
            """Step back one page of search results, never going below page 1."""
            return update_search_page(query, search_type, max(1, current_page - 1), entries_per_page, char_count)

        # All three handlers share the same input and output components.
        search_inputs = [search_query_input, search_type_input, page_number, entries_per_page, char_count_input]
        search_outputs = [search_results_output, pagination_info, page_number, next_page_button, previous_page_button]

        search_button.click(
            fn=update_search_page,
            inputs=search_inputs,
            outputs=search_outputs
        )

        next_page_button.click(
            fn=go_to_next_search_page,
            inputs=search_inputs,
            outputs=search_outputs
        )

        previous_page_button.click(
            fn=go_to_previous_search_page,
            inputs=search_inputs,
            outputs=search_outputs
        )
|
271 |
+
|
272 |
+
|
273 |
+
|
274 |
+
def create_prompt_view_tab():
    """Build the "View Prompt Database" tab: a paginated listing of stored prompts.

    Creates the paging controls and an HTML results pane, and wires the
    "View Page", "Next Page" and "Previous Page" buttons to handlers that read
    from the ``prompts.db`` SQLite file in the current working directory.

    NOTE(review): the layout nesting below was reconstructed from a copy of
    this code that had lost its indentation -- confirm against the rendered UI.
    """
    # Local import: only this function needs it, and the file's import block
    # is left untouched.
    from contextlib import closing

    with gr.TabItem("View Prompt Database"):
        gr.Markdown("# View Prompt Database Entries")
        with gr.Row():
            with gr.Column():
                entries_per_page = gr.Dropdown(choices=[10, 20, 50, 100], label="Entries per Page", value=10)
                page_number = gr.Number(value=1, label="Page Number", precision=0)
                view_button = gr.Button("View Page")
                next_page_button = gr.Button("Next Page")
                previous_page_button = gr.Button("Previous Page")
            with gr.Column():
                pagination_info = gr.Textbox(label="Pagination Info", interactive=False)
                results_display = gr.HTML()

        def as_page(value):
            """Coerce a page-number widget value to an int >= 1 (a cleared field arrives as None)."""
            try:
                return max(1, int(value))
            except (TypeError, ValueError):
                return 1

        def render_prompt(prompt):
            """Return the HTML card for one (name, details, system, user, keywords) row."""
            # Escape HTML special characters and replace newlines with <br> tags
            title = html.escape(prompt[0] or '').replace('\n', '<br>')
            details = html.escape(prompt[1] or '').replace('\n', '<br>')
            # The two prompt bodies are shown inside <pre>, so newlines are kept as-is.
            system_prompt = html.escape(prompt[2] or '')
            user_prompt = html.escape(prompt[3] or '')
            keywords = html.escape(prompt[4] or '').replace('\n', '<br>')
            return f"""
            <div style="border: 1px solid #ddd; padding: 10px; margin-bottom: 20px;">
                <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 10px;">
                    <div><strong>Title:</strong> {title}</div>
                    <div><strong>Details:</strong> {details}</div>
                </div>
                <div style="margin-top: 10px;">
                    <strong>User Prompt:</strong>
                    <pre style="white-space: pre-wrap; word-wrap: break-word;">{user_prompt}</pre>
                </div>
                <div style="margin-top: 10px;">
                    <strong>System Prompt:</strong>
                    <pre style="white-space: pre-wrap; word-wrap: break-word;">{system_prompt}</pre>
                </div>
                <div style="margin-top: 10px;">
                    <strong>Keywords:</strong> {keywords}
                </div>
            </div>
            """

        # FIXME - SQL functions to be moved to DB_Manager
        def view_database(page, entries_per_page):
            """Fetch one page of prompts.

            Returns:
                ``(results_html, pagination_text, page, total_pages)`` where
                ``page`` is the page actually shown after clamping to the valid
                range. On a database error ``total_pages`` is 0.
            """
            page = as_page(page)
            try:
                entries_per_page = max(1, int(entries_per_page))
            except (TypeError, ValueError):
                entries_per_page = 10  # same as the dropdown's default value
            try:
                # sqlite3's own context manager only commits/rolls back; closing()
                # is what actually releases the connection.
                with closing(sqlite3.connect('prompts.db')) as conn:
                    cursor = conn.cursor()
                    cursor.execute('SELECT COUNT(*) FROM Prompts')
                    total_prompts = cursor.fetchone()[0]

                    total_pages = (total_prompts + entries_per_page - 1) // entries_per_page
                    # Count first so a request past the end shows the last page
                    # instead of an empty "Page 7 of 3".
                    page = min(page, max(1, total_pages))
                    offset = (page - 1) * entries_per_page

                    cursor.execute('''
                        SELECT p.name, p.details, p.system, p.user, GROUP_CONCAT(k.keyword, ', ') as keywords
                        FROM Prompts p
                        LEFT JOIN PromptKeywords pk ON p.id = pk.prompt_id
                        LEFT JOIN Keywords k ON pk.keyword_id = k.id
                        GROUP BY p.id
                        ORDER BY p.name
                        LIMIT ? OFFSET ?
                    ''', (entries_per_page, offset))
                    prompts = cursor.fetchall()
            except sqlite3.Error as e:
                return f"<p>Error fetching prompts: {html.escape(str(e))}</p>", "Error", page, 0

            results = "".join(render_prompt(prompt) for prompt in prompts)
            pagination = f"Page {page} of {total_pages} (Total prompts: {total_prompts})"
            return results, pagination, page, total_pages

        def update_page(page, entries_per_page):
            """Render ``page`` and enable/disable the paging buttons to match."""
            results, pagination, page, total_pages = view_database(page, entries_per_page)
            next_disabled = page >= total_pages
            prev_disabled = page <= 1
            return results, pagination, page, gr.update(interactive=not next_disabled), gr.update(
                interactive=not prev_disabled)

        def go_to_next_page(current_page, entries_per_page):
            """Advance one page; view_database clamps at the last page."""
            return update_page(as_page(current_page) + 1, entries_per_page)

        def go_to_previous_page(current_page, entries_per_page):
            """Go back one page, never below page 1."""
            return update_page(max(1, as_page(current_page) - 1), entries_per_page)

        view_button.click(
            fn=update_page,
            inputs=[page_number, entries_per_page],
            outputs=[results_display, pagination_info, page_number, next_page_button, previous_page_button]
        )

        next_page_button.click(
            fn=go_to_next_page,
            inputs=[page_number, entries_per_page],
            outputs=[results_display, pagination_info, page_number, next_page_button, previous_page_button]
        )

        previous_page_button.click(
            fn=go_to_previous_page,
            inputs=[page_number, entries_per_page],
            outputs=[results_display, pagination_info, page_number, next_page_button, previous_page_button]
        )
|
376 |
+
|
377 |
+
|
378 |
+
|
379 |
+
def create_prompt_search_tab():
    """Build the "Search Prompts" tab: a paginated substring search over stored prompts.

    Creates the query box, paging controls and an HTML results pane, and wires
    the "Search Prompts", "Next Page" and "Previous Page" buttons to handlers
    that query the ``prompts.db`` SQLite file in the current working directory.

    NOTE(review): the layout nesting below was reconstructed from a copy of
    this code that had lost its indentation -- confirm against the rendered UI.
    """
    # Local import: only this function needs it, and the file's import block
    # is left untouched.
    from contextlib import closing

    with gr.TabItem("Search Prompts"):
        gr.Markdown("# Search and View Prompt Details")
        gr.Markdown("Currently has all of the https://github.com/danielmiessler/fabric prompts already available")
        with gr.Row():
            with gr.Column():
                search_query_input = gr.Textbox(label="Search Prompts", placeholder="Enter your search query...")
                entries_per_page = gr.Dropdown(choices=[10, 20, 50, 100], label="Entries per Page", value=10)
                page_number = gr.Number(value=1, label="Page Number", precision=0)
            with gr.Column():
                search_button = gr.Button("Search Prompts")
                next_page_button = gr.Button("Next Page")
                previous_page_button = gr.Button("Previous Page")
                pagination_info = gr.Textbox(label="Pagination Info", interactive=False)
        search_results_output = gr.HTML()

        def as_page(value):
            """Coerce a page-number widget value to an int >= 1 (a cleared field arrives as None)."""
            try:
                return max(1, int(value))
            except (TypeError, ValueError):
                return 1

        def render_prompt(prompt):
            """Return the HTML card for one (name, details, system, user, keywords) row."""
            # Escape HTML special characters; newlines become <br> outside <pre>.
            title = html.escape(prompt[0] or '').replace('\n', '<br>')
            details = html.escape(prompt[1] or '').replace('\n', '<br>')
            system_prompt = html.escape(prompt[2] or '')
            user_prompt = html.escape(prompt[3] or '')
            keywords = html.escape(prompt[4] or '').replace('\n', '<br>')
            return f"""
            <div style="border: 1px solid #ddd; padding: 10px; margin-bottom: 20px;">
                <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 10px;">
                    <div><strong>Title:</strong> {title}</div>
                    <div><strong>Details:</strong> {details}</div>
                </div>
                <div style="margin-top: 10px;">
                    <strong>User Prompt:</strong>
                    <pre style="white-space: pre-wrap; word-wrap: break-word;">{user_prompt}</pre>
                </div>
                <div style="margin-top: 10px;">
                    <strong>System Prompt:</strong>
                    <pre style="white-space: pre-wrap; word-wrap: break-word;">{system_prompt}</pre>
                </div>
                <div style="margin-top: 10px;">
                    <strong>Keywords:</strong> {keywords}
                </div>
            </div>
            """

        def search_and_display_prompts(query, page, entries_per_page):
            """Fetch one page of prompts whose name, details, prompts or keywords contain ``query``.

            Returns:
                ``(results_html, pagination_text, page, total_pages)`` where
                ``page`` is the page actually shown after clamping to the valid
                range. On a database error ``total_pages`` is 0.
            """
            page = as_page(page)
            try:
                entries_per_page = max(1, int(entries_per_page))
            except (TypeError, ValueError):
                entries_per_page = 10  # same as the dropdown's default value
            # One LIKE pattern, bound once per searched column.
            like = f"%{query or ''}%"
            where_params = (like,) * 5
            try:
                # FIXME - SQL functions to be moved to DB_Manager
                # sqlite3's own context manager only commits/rolls back; closing()
                # is what actually releases the connection.
                with closing(sqlite3.connect('prompts.db')) as conn:
                    cursor = conn.cursor()
                    cursor.execute('''
                        SELECT COUNT(DISTINCT p.id)
                        FROM Prompts p
                        LEFT JOIN PromptKeywords pk ON p.id = pk.prompt_id
                        LEFT JOIN Keywords k ON pk.keyword_id = k.id
                        WHERE p.name LIKE ? OR p.details LIKE ? OR p.system LIKE ? OR p.user LIKE ? OR k.keyword LIKE ?
                    ''', where_params)
                    total_prompts = cursor.fetchone()[0]

                    total_pages = (total_prompts + entries_per_page - 1) // entries_per_page
                    # Count first so a request past the end shows the last page
                    # instead of an empty "Page 7 of 3".
                    page = min(page, max(1, total_pages))
                    offset = (page - 1) * entries_per_page

                    cursor.execute('''
                        SELECT p.name, p.details, p.system, p.user, GROUP_CONCAT(k.keyword, ', ') as keywords
                        FROM Prompts p
                        LEFT JOIN PromptKeywords pk ON p.id = pk.prompt_id
                        LEFT JOIN Keywords k ON pk.keyword_id = k.id
                        WHERE p.name LIKE ? OR p.details LIKE ? OR p.system LIKE ? OR p.user LIKE ? OR k.keyword LIKE ?
                        GROUP BY p.id
                        ORDER BY p.name
                        LIMIT ? OFFSET ?
                    ''', where_params + (entries_per_page, offset))
                    prompts = cursor.fetchall()
            except sqlite3.Error as e:
                return f"<p>Error searching prompts: {html.escape(str(e))}</p>", "Error", page, 0

            results = "".join(render_prompt(prompt) for prompt in prompts)
            pagination = f"Page {page} of {total_pages} (Total prompts: {total_prompts})"
            return results, pagination, page, total_pages

        def update_search_page(query, page, entries_per_page):
            """Render ``page`` of the search and enable/disable the paging buttons to match."""
            results, pagination, page, total_pages = search_and_display_prompts(query, page, entries_per_page)
            next_disabled = page >= total_pages
            prev_disabled = page <= 1
            return results, pagination, page, gr.update(interactive=not next_disabled), gr.update(interactive=not prev_disabled)

        def go_to_next_search_page(query, current_page, entries_per_page):
            """Advance one page; search_and_display_prompts clamps at the last page."""
            return update_search_page(query, as_page(current_page) + 1, entries_per_page)

        def go_to_previous_search_page(query, current_page, entries_per_page):
            """Go back one page, never below page 1."""
            return update_search_page(query, max(1, as_page(current_page) - 1), entries_per_page)

        search_button.click(
            fn=update_search_page,
            inputs=[search_query_input, page_number, entries_per_page],
            outputs=[search_results_output, pagination_info, page_number, next_page_button, previous_page_button]
        )

        next_page_button.click(
            fn=go_to_next_search_page,
            inputs=[search_query_input, page_number, entries_per_page],
            outputs=[search_results_output, pagination_info, page_number, next_page_button, previous_page_button]
        )

        previous_page_button.click(
            fn=go_to_previous_search_page,
            inputs=[search_query_input, page_number, entries_per_page],
            outputs=[search_results_output, pagination_info, page_number, next_page_button, previous_page_button]
        )
|
App_Function_Libraries/Gradio_UI/Transcript_comparison.py
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Transcript_comparison.py
|
2 |
+
# Description: Gradio UI tab for comparing transcripts
|
3 |
+
#
|
4 |
+
# Imports
|
5 |
+
import logging
|
6 |
+
|
7 |
+
#
|
8 |
+
# External Imports
|
9 |
+
import gradio as gr
|
10 |
+
|
11 |
+
from App_Function_Libraries.DB_Manager import get_transcripts
|
12 |
+
from App_Function_Libraries.Gradio_UI.Gradio_Shared import browse_items
|
13 |
+
from App_Function_Libraries.Utils import format_transcription
|
14 |
+
|
15 |
+
|
16 |
+
#
|
17 |
+
# Local Imports
|
18 |
+
|
19 |
+
def get_transcript_options(media_id):
    """Return one dropdown label per transcript of ``media_id``.

    Each label is built from fields 0, 1 and 3 of a row returned by
    ``get_transcripts`` in the form ``"<field0>: <field1> (<field3>)"``.
    """
    labels = []
    for row in get_transcripts(media_id):
        labels.append(f"{row[0]}: {row[1]} ({row[3]})")
    return labels
|
22 |
+
|
23 |
+
|
24 |
+
def update_transcript_options(media_id):
    """Return two Gradio updates that load the same transcript choices into both dropdowns."""
    choices = get_transcript_options(media_id)
    first_dropdown = gr.update(choices=choices)
    second_dropdown = gr.update(choices=choices)
    return first_dropdown, second_dropdown
|
27 |
+
|
28 |
+
def compare_transcripts(media_id, transcript1_id, transcript2_id):
    """Return a text rendering of two transcripts of one media item, one after the other.

    Args:
        media_id: ID passed to ``get_transcripts`` to load the candidate rows.
        transcript1_id: First selection. Either a bare transcript ID or a
            dropdown label of the form ``"<id>: <model> (<created>)"`` as
            produced by ``get_transcript_options``.
        transcript2_id: Second selection, same accepted forms.

    Returns:
        The comparison text, or a human-readable message when a selection is
        missing/unknown or when anything raises (the error is also logged).
    """
    def selection_to_id(selection):
        """Extract the integer transcript ID from a bare ID or a dropdown label."""
        if isinstance(selection, (int, float)):
            return int(selection)
        # The dropdowns hand back the whole label, so only the part before the
        # first ':' is the ID; int() on the full label always raised ValueError.
        return int(str(selection).split(":", 1)[0].strip())

    try:
        # Nothing chosen in one of the dropdowns.
        if transcript1_id in (None, "") or transcript2_id in (None, ""):
            return "One or both selected transcripts not found."

        first_id = selection_to_id(transcript1_id)
        second_id = selection_to_id(transcript2_id)

        transcripts = get_transcripts(media_id)
        transcript1 = next((t for t in transcripts if t[0] == first_id), None)
        transcript2 = next((t for t in transcripts if t[0] == second_id), None)

        if transcript1 is None or transcript2 is None:
            return "One or both selected transcripts not found."

        comparison = f"Transcript 1 (Model: {transcript1[1]}, Created: {transcript1[3]}):\n\n"
        comparison += format_transcription(transcript1[2])
        comparison += f"\n\nTranscript 2 (Model: {transcript2[1]}, Created: {transcript2[3]}):\n\n"
        comparison += format_transcription(transcript2[2])

        return comparison
    except Exception as e:
        # UI boundary: report the failure in the output box instead of raising.
        logging.error(f"Error in compare_transcripts: {str(e)}")
        return f"Error comparing transcripts: {str(e)}"
|
46 |
+
|
47 |
+
|
48 |
+
def create_compare_transcripts_tab():
    """Build the "Compare Transcripts" tab and wire its events.

    Flow: search -> pick a media item -> its ID lands in a hidden Number ->
    both transcript dropdowns are refilled -> "Compare Transcripts" renders
    the two selected transcripts into a textbox.

    NOTE(review): the layout nesting below was reconstructed from a copy of
    this code that had lost its indentation -- confirm against the rendered UI.
    """
    with gr.TabItem("Compare Transcripts"):
        gr.Markdown("# Compare Transcripts")

        with gr.Row():
            query_box = gr.Textbox(label="Search Query", placeholder="Enter your search query here...")
            search_by = gr.Radio(choices=["Title", "URL", "Keyword", "Content"], value="Title", label="Search By")
            search_btn = gr.Button("Search")

        with gr.Row():
            media_picker = gr.Dropdown(label="Select Media Item", choices=[], interactive=True)
            # Maps each dropdown label to its media ID between events.
            label_to_media_id = gr.State({})

        hidden_media_id = gr.Number(label="Media ID", visible=False)
        first_transcript = gr.Dropdown(label="Transcript 1")
        second_transcript = gr.Dropdown(label="Transcript 2")
        compare_btn = gr.Button("Compare Transcripts")
        comparison_box = gr.Textbox(label="Comparison Result", lines=20)

        def update_media_dropdown(search_query, search_type):
            """Search media and return (dropdown update, label -> id mapping)."""
            found = browse_items(search_query, search_type)
            labels = [f"{entry[1]} ({entry[2]})" for entry in found]
            mapping = {f"{entry[1]} ({entry[2]})": entry[0] for entry in found}
            return gr.update(choices=labels), mapping

        search_btn.click(
            fn=update_media_dropdown,
            inputs=[query_box, search_by],
            outputs=[media_picker, label_to_media_id]
        )

        def load_selected_media_id(selected_media, media_mapping):
            """Translate the chosen label to its media ID, or None if it is unknown."""
            if not selected_media or not media_mapping:
                return None
            if selected_media not in media_mapping:
                return None
            return media_mapping[selected_media]

        media_picker.change(
            fn=load_selected_media_id,
            inputs=[media_picker, label_to_media_id],
            outputs=[hidden_media_id]
        )

        hidden_media_id.change(
            update_transcript_options,
            inputs=[hidden_media_id],
            outputs=[first_transcript, second_transcript],
        )
        compare_btn.click(
            compare_transcripts,
            inputs=[hidden_media_id, first_transcript, second_transcript],
            outputs=[comparison_box],
        )
|
App_Function_Libraries/Gradio_UI/Trash.py
ADDED
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Trash.py
|
2 |
+
# Gradio UI for deleting items from the database
|
3 |
+
import html
|
4 |
+
import sqlite3
|
5 |
+
|
6 |
+
# Imports
|
7 |
+
|
8 |
+
# External Imports
|
9 |
+
import gradio as gr
|
10 |
+
#
|
11 |
+
# Local Imports
|
12 |
+
from App_Function_Libraries.DB_Manager import delete_prompt, empty_trash, get_trashed_items, user_delete_item
|
13 |
+
|
14 |
+
|
15 |
+
def delete_item(media_id, force):
    """Forward a delete request to ``user_delete_item`` and return its result unchanged."""
    outcome = user_delete_item(media_id, force)
    return outcome
|
17 |
+
|
18 |
+
def list_trash():
    """Return the trashed items as text, one ``ID / Title / Trashed on`` line per item."""
    lines = []
    for entry in get_trashed_items():
        lines.append(f"ID: {entry['id']}, Title: {entry['title']}, Trashed on: {entry['trash_date']}")
    return "\n".join(lines)
|
22 |
+
|
23 |
+
def empty_trash_ui(days):
    """Call ``empty_trash(days)`` and summarise the two counts it returns as a status line."""
    removed_count, left_count = empty_trash(days)
    summary = f"Deleted {removed_count} items. {left_count} items remain in trash."
    return summary
|
26 |
+
|
27 |
+
def create_view_trash_tab():
    """Build the "View Trash" tab: a button that fills a textbox with ``list_trash()`` output."""
    with gr.TabItem("View Trash"):
        show_trash_btn = gr.Button("View Trash")
        trashed_items_box = gr.Textbox(label="Trashed Items")
        show_trash_btn.click(
            list_trash,
            inputs=[],
            outputs=trashed_items_box,
        )
|
32 |
+
|
33 |
+
|
34 |
+
|
35 |
+
|
36 |
+
def search_prompts_for_deletion(query):
    """Search ``prompts.db`` for prompts whose name or details contain ``query``.

    Args:
        query: Substring to look for; it is bound as a ``LIKE '%query%'``
            parameter, never interpolated into the SQL text.

    Returns:
        An HTML fragment listing up to 10 matches (ID, name, first 100
        characters of the details), the plain text
        ``"No matching prompts found."`` when nothing matches, or an error
        message when SQLite raises.
    """
    # Local import: only this function needs it, and the file's import block
    # is left untouched.
    from contextlib import closing

    pattern = f'%{query}%'
    try:
        # sqlite3's own context manager only commits/rolls back; closing() is
        # what actually releases the connection.
        with closing(sqlite3.connect('prompts.db')) as conn:
            cursor = conn.cursor()
            cursor.execute('''
                SELECT id, name, details
                FROM Prompts
                WHERE name LIKE ? OR details LIKE ?
                LIMIT 10
            ''', (pattern, pattern))
            results = cursor.fetchall()
    except sqlite3.Error as e:
        return f"An error occurred while searching prompts: {e}"

    if not results:
        return "No matching prompts found."

    parts = ["<h3>Matching Prompts:</h3>"]
    for prompt_id, name, details in results:
        # Either column may be NULL; slicing None used to raise a TypeError
        # that the sqlite3.Error handler did not catch.
        safe_name = html.escape(name or '')
        safe_details = html.escape((details or '')[:100])
        parts.append(
            f"<p><strong>ID:</strong> {prompt_id} | <strong>Name:</strong> {safe_name} | <strong>Details:</strong> {safe_details}...</p>"
        )
    return "".join(parts)
|
57 |
+
|
58 |
+
|
59 |
+
def search_media_for_deletion(query):
    """Search ``media.db`` for media whose title or description contain ``query``.

    Args:
        query: Substring to look for; it is bound as a ``LIKE '%query%'``
            parameter, never interpolated into the SQL text.

    Returns:
        An HTML fragment listing up to 10 matches (ID, title, first 100
        characters of the description), the plain text
        ``"No matching media found."`` when nothing matches, or an error
        message when SQLite raises.
    """
    # Local import: only this function needs it, and the file's import block
    # is left untouched.
    from contextlib import closing

    pattern = f'%{query}%'
    try:
        # sqlite3's own context manager only commits/rolls back; closing() is
        # what actually releases the connection.
        with closing(sqlite3.connect('media.db')) as conn:
            cursor = conn.cursor()
            cursor.execute('''
                SELECT id, title, description
                FROM media
                WHERE title LIKE ? OR description LIKE ?
                LIMIT 10
            ''', (pattern, pattern))
            results = cursor.fetchall()
    except sqlite3.Error as e:
        return f"An error occurred while searching media: {e}"

    if not results:
        return "No matching media found."

    parts = ["<h3>Matching Media:</h3>"]
    for media_id, title, description in results:
        # Either column may be NULL; slicing None used to raise a TypeError
        # that the sqlite3.Error handler did not catch.
        safe_title = html.escape(title or '')
        safe_description = html.escape((description or '')[:100])
        parts.append(
            f"<p><strong>ID:</strong> {media_id} | <strong>Title:</strong> {safe_title} | <strong>Description:</strong> {safe_description}...</p>"
        )
    return "".join(parts)
|
80 |
+
|
81 |
+
def create_delete_trash_tab():
    """Build the "Delete DB Item" tab with side-by-side prompt and media sections.

    Each section has a search box whose results are shown as HTML, an ID field,
    and a delete button. Prompt deletion calls ``delete_prompt``; media
    deletion calls ``delete_item`` with the "Force Delete" checkbox value.

    NOTE(review): the layout nesting below was reconstructed from a copy of
    this code that had lost its indentation -- confirm against the rendered UI.
    """
    with gr.TabItem("Delete DB Item"):
        gr.Markdown("# Search and Delete Items from Databases")

        with gr.Row():
            # Left column: prompts.
            with gr.Column():
                gr.Markdown("## Search and Delete Prompts")
                prompt_query = gr.Textbox(label="Search Prompts")
                prompt_search_btn = gr.Button("Search Prompts")
                prompt_hits = gr.HTML()
                prompt_id_field = gr.Number(label="Prompt ID")
                prompt_delete_btn = gr.Button("Delete Prompt")
                prompt_delete_status = gr.Textbox(label="Delete Result")

            # Right column: media.
            with gr.Column():
                gr.Markdown("## Search and Delete Media")
                media_query = gr.Textbox(label="Search Media")
                media_search_btn = gr.Button("Search Media")
                media_hits = gr.HTML()
                media_id_field = gr.Number(label="Media ID")
                force_delete_box = gr.Checkbox(label="Force Delete")
                media_delete_btn = gr.Button("Delete Media")
                media_delete_status = gr.Textbox(label="Delete Result")

        prompt_search_btn.click(
            search_prompts_for_deletion,
            inputs=[prompt_query],
            outputs=prompt_hits
        )

        prompt_delete_btn.click(
            delete_prompt,
            inputs=[prompt_id_field],
            outputs=prompt_delete_status
        )

        media_search_btn.click(
            search_media_for_deletion,
            inputs=[media_query],
            outputs=media_hits
        )

        media_delete_btn.click(
            delete_item,
            inputs=[media_id_field, force_delete_box],
            outputs=media_delete_status
        )
|
128 |
+
|
129 |
+
def create_empty_trash_tab():
    """Build the "Empty Trash" tab: an age slider (15-90 days, step 5) plus a button.

    Clicking the button passes the slider value to ``empty_trash_ui`` and shows
    its status line in the result textbox.
    """
    with gr.TabItem("Empty Trash"):
        age_slider = gr.Slider(minimum=15, maximum=90, step=5, label="Delete items older than (days)")
        purge_btn = gr.Button("Empty Trash")
        purge_status = gr.Textbox(label="Result")
        purge_btn.click(
            empty_trash_ui,
            inputs=[age_slider],
            outputs=purge_status,
        )
|
App_Function_Libraries/Gradio_UI/Utilities.py
ADDED
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import shutil
|
3 |
+
import tempfile
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
import gradio as gr
|
7 |
+
import yt_dlp
|
8 |
+
|
9 |
+
from App_Function_Libraries.Utils import sanitize_filename, downloaded_files
|
10 |
+
|
11 |
+
|
12 |
+
def create_utilities_yt_video_tab():
    """Build the "YouTube Video Downloader" tab.

    The left column holds the explanation, the URL box and the download
    button; the right column holds the downloadable file and a status box.
    The button calls ``gradio_download_youtube_video`` with the URL.

    NOTE(review): the layout nesting below was reconstructed from a copy of
    this code that had lost its indentation -- confirm against the rendered UI.
    """
    with gr.Tab("YouTube Video Downloader"):
        with gr.Row():
            with gr.Column():
                gr.Markdown(
                    "<h3>Youtube Video Downloader</h3><p>This Input takes a Youtube URL as input and creates a webm file for you to download. </br><em>If you want a full-featured one:</em> <strong><em>https://github.com/StefanLobbenmeier/youtube-dl-gui</strong></em> or <strong><em>https://github.com/yt-dlg/yt-dlg</em></strong></p>")
                video_url_box = gr.Textbox(label="YouTube URL", placeholder="Enter YouTube video URL here")
                fetch_video_btn = gr.Button("Download Video")
            with gr.Column():
                video_file_out = gr.File(label="Download Video")
                video_status_out = gr.Textbox(label="Status")

        fetch_video_btn.click(
            fn=gradio_download_youtube_video,
            inputs=video_url_box,
            outputs=[video_file_out, video_status_out]
        )
|
29 |
+
|
30 |
+
def create_utilities_yt_audio_tab():
    """Build the "YouTube Audio Downloader" tab.

    The left column holds the explanation, the URL box and the download
    button; the right column holds the downloadable file and a status box.
    The button calls ``download_youtube_audio`` with the URL.

    NOTE(review): the layout nesting below was reconstructed from a copy of
    this code that had lost its indentation -- confirm against the rendered UI.
    """
    with gr.Tab("YouTube Audio Downloader"):
        with gr.Row():
            with gr.Column():
                gr.Markdown(
                    "<h3>Youtube Audio Downloader</h3><p>This Input takes a Youtube URL as input and creates an audio file for you to download.</p>"
                    +"\n<em>If you want a full-featured one:</em> <strong><em>https://github.com/StefanLobbenmeier/youtube-dl-gui</strong></em>\n or \n<strong><em>https://github.com/yt-dlg/yt-dlg</em></strong></p>")
                audio_url_box = gr.Textbox(label="YouTube URL", placeholder="Enter YouTube video URL here")
                fetch_audio_btn = gr.Button("Download Audio")
            with gr.Column():
                audio_file_out = gr.File(label="Download Audio")
                audio_status_out = gr.Textbox(label="Status")

        # Imported here, at tab-construction time, exactly as the original did.
        from App_Function_Libraries.Audio_Files import download_youtube_audio
        fetch_audio_btn.click(
            fn=download_youtube_audio,
            inputs=audio_url_box,
            outputs=[audio_file_out, audio_status_out]
        )
|
49 |
+
|
50 |
+
def create_utilities_yt_timestamp_tab():
    """Build the "YouTube Timestamp URL Generator" tab.

    Collects a URL plus hours/minutes/seconds and, on click, passes them to
    ``generate_timestamped_url``; the result is shown in a textbox.

    NOTE(review): the layout nesting below was reconstructed from a copy of
    this code that had lost its indentation -- confirm against the rendered UI.
    """
    with gr.Tab("YouTube Timestamp URL Generator"):
        gr.Markdown("## Generate YouTube URL with Timestamp")
        with gr.Row():
            with gr.Column():
                source_url = gr.Textbox(label="YouTube URL")
                hh = gr.Number(label="Hours", value=0, minimum=0, precision=0)
                mm = gr.Number(label="Minutes", value=0, minimum=0, maximum=59, precision=0)
                ss = gr.Number(label="Seconds", value=0, minimum=0, maximum=59, precision=0)
                make_url_btn = gr.Button("Generate URL")
            with gr.Column():
                stamped_url = gr.Textbox(label="Timestamped URL")

        # Imported here, at tab-construction time, exactly as the original did.
        from App_Function_Libraries.Video_DL_Ingestion_Lib import generate_timestamped_url
        make_url_btn.click(
            fn=generate_timestamped_url,
            inputs=[source_url, hh, mm, ss],
            outputs=stamped_url
        )
|
69 |
+
|
70 |
+
|
71 |
+
def gradio_download_youtube_video(url):
    """Download the video at ``url`` with yt-dlp and return it for a Gradio File output.

    The video is first downloaded into a temporary directory, then moved into
    a ``downloads`` directory (created if missing) relative to the current
    working directory, and its path is appended to ``downloaded_files``.

    Returns:
        ``(path, message)`` on success, or ``(None, error message)`` if any
        step raises.
    """
    try:
        # Windows uses the ffmpeg binary under ./Bin; elsewhere 'ffmpeg' is
        # passed to yt-dlp as-is.
        if os.name == 'nt':
            ffmpeg_location = './Bin/ffmpeg.exe'
        else:
            ffmpeg_location = 'ffmpeg'

        with tempfile.TemporaryDirectory() as scratch_dir:
            # First pass: metadata only, to learn the title and extension.
            with yt_dlp.YoutubeDL({'quiet': True}) as probe:
                metadata = probe.extract_info(url, download=False)
                safe_title = sanitize_filename(metadata['title'])
                extension = metadata['ext']

            file_name = f"{safe_title}.{extension}"
            staged_path = Path(scratch_dir) / file_name

            # Second pass: the actual download, written to the staged path.
            download_options = {
                'format': 'bestvideo+bestaudio/best',
                'ffmpeg_location': ffmpeg_location,
                'outtmpl': str(staged_path),
                'noplaylist': True,
                'quiet': True
            }
            with yt_dlp.YoutubeDL(download_options) as downloader:
                downloader.download([url])

            # Final check to ensure file exists
            if not staged_path.exists():
                raise FileNotFoundError(f"Expected file was not found: {staged_path}")

            # Move the file out before the temporary directory is removed.
            keep_dir = Path("downloads")
            keep_dir.mkdir(exist_ok=True)
            final_path = keep_dir / file_name
            shutil.move(str(staged_path), str(final_path))

            # Add the file to the list of downloaded files
            downloaded_files.append(str(final_path))

            return str(final_path), f"Video downloaded successfully: {file_name}"
    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        return None, f"Error downloading video: {str(e)}"
|
118 |
+
|
App_Function_Libraries/Gradio_UI/Video_transcription_tab.py
ADDED
@@ -0,0 +1,691 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Video_transcription_tab.py
|
2 |
+
# Description: This file contains the code for the video transcription tab in the Gradio UI.
|
3 |
+
#
|
4 |
+
# Imports
|
5 |
+
import json
|
6 |
+
import logging
|
7 |
+
import os
|
8 |
+
#
|
9 |
+
# External Imports
|
10 |
+
import gradio as gr
|
11 |
+
import yt_dlp
|
12 |
+
#
|
13 |
+
# Local Imports
|
14 |
+
from App_Function_Libraries.DB_Manager import load_preset_prompts, add_media_to_database
|
15 |
+
from App_Function_Libraries.Gradio_UI.Gradio_Shared import whisper_models, update_user_prompt
|
16 |
+
from App_Function_Libraries.Gradio_UI.Gradio_Shared import error_handler
|
17 |
+
from App_Function_Libraries.Summarization_General_Lib import perform_transcription, perform_summarization, \
|
18 |
+
save_transcription_and_summary
|
19 |
+
from App_Function_Libraries.Utils import convert_to_seconds, safe_read_file, format_transcription, \
|
20 |
+
create_download_directory, generate_unique_identifier, extract_text_from_segments
|
21 |
+
from App_Function_Libraries.Video_DL_Ingestion_Lib import parse_and_expand_urls, extract_metadata, download_video
|
22 |
+
#
|
23 |
+
################################################################################################################################################################
|
24 |
+
#
|
25 |
+
# Functions:
|
26 |
+
|
27 |
+
def create_video_transcription_tab():
    """Build the "Video Transcription + Summarization" tab of the Gradio UI."""
    # NOTE(review): block nesting is reconstructed from statement order - confirm against the repo.

    def _toggle_prompt_boxes(show):
        # One update per output component: the custom prompt box, then the system prompt box.
        return (gr.update(visible=show), gr.update(visible=show))

    def _toggle_preset_dropdown(show):
        return gr.update(visible=show)

    with (gr.TabItem("Video Transcription + Summarization")):
        gr.Markdown("# Transcribe & Summarize Videos from URLs")
        with gr.Row():
            gr.Markdown("""Follow this project at [tldw - GitHub](https://github.com/rmusser01/tldw)""")
        with gr.Row():
            gr.Markdown(
                """If you're wondering what all this is, please see the 'Introduction/Help' tab up above for more detailed information and how to obtain an API Key.""")
        with gr.Row():
            with gr.Column():
                # --- Media sources and transcription settings ---
                url_input = gr.Textbox(
                    label="URL(s) (Mandatory)",
                    placeholder="Enter video URLs here, one per line. Supports YouTube, Vimeo, other video sites and Youtube playlists.",
                    lines=5,
                )
                video_file_input = gr.File(label="Upload Video File (Optional)", file_types=["video/*"])
                diarize_input = gr.Checkbox(label="Enable Speaker Diarization", value=False)
                whisper_model_input = gr.Dropdown(choices=whisper_models, value="medium", label="Whisper Model")

                # --- Prompt selection: two checkboxes reveal the optional prompt widgets ---
                with gr.Row():
                    custom_prompt_checkbox = gr.Checkbox(
                        label="Use a Custom Prompt",
                        value=False,
                        visible=True,
                    )
                    preset_prompt_checkbox = gr.Checkbox(
                        label="Use a pre-set Prompt",
                        value=False,
                        visible=True,
                    )
                with gr.Row():
                    preset_prompt = gr.Dropdown(
                        label="Select Preset Prompt",
                        choices=load_preset_prompts(),
                        visible=False,
                    )
                with gr.Row():
                    custom_prompt_input = gr.Textbox(
                        label="Custom Prompt",
                        placeholder="Enter custom prompt here",
                        lines=3,
                        visible=False,
                    )
                with gr.Row():
                    system_prompt_input = gr.Textbox(label="System Prompt",
                                                     value="""<s>You are a bulleted notes specialist. [INST]```When creating comprehensive bulleted notes, you should follow these guidelines: Use multiple headings based on the referenced topics, not categories like quotes or terms. Headings should be surrounded by bold formatting and not be listed as bullet points themselves. Leave no space between headings and their corresponding list items underneath. Important terms within the content should be emphasized by setting them in bold font. Any text that ends with a colon should also be bolded. Before submitting your response, review the instructions, and make any corrections necessary to adhered to the specified format. Do not reference these instructions within the notes.``` \nBased on the content between backticks create comprehensive bulleted notes.[/INST]
**Bulleted Note Creation Guidelines**

**Headings**:
- Based on referenced topics, not categories like quotes or terms
- Surrounded by **bold** formatting
- Not listed as bullet points
- No space between headings and list items underneath

**Emphasis**:
- **Important terms** set in bold font
- **Text ending in a colon**: also bolded

**Review**:
- Ensure adherence to specified format
- Do not reference these instructions in your response.</s>[INST] {{ .Prompt }} [/INST]
""",
                                                     lines=3,
                                                     visible=False,
                                                     interactive=True)

                # Ticking "Use a Custom Prompt" shows/hides both prompt text boxes together.
                custom_prompt_checkbox.change(
                    fn=_toggle_prompt_boxes,
                    inputs=[custom_prompt_checkbox],
                    outputs=[custom_prompt_input, system_prompt_input],
                )
                # Ticking "Use a pre-set Prompt" shows/hides the preset dropdown.
                preset_prompt_checkbox.change(
                    fn=_toggle_preset_dropdown,
                    inputs=[preset_prompt_checkbox],
                    outputs=[preset_prompt],
                )
|
92 |
+
|
93 |
+
def update_prompts(preset_name):
    """Fetch the prompts of the selected preset and push them into the prompt boxes.

    Returns a pair of Gradio updates - user prompt first, system prompt second -
    each of which also makes its target component visible.
    """
    preset = update_user_prompt(preset_name)
    user_box_update = gr.update(value=preset["user_prompt"], visible=True)
    system_box_update = gr.update(value=preset["system_prompt"], visible=True)
    return user_box_update, system_box_update
|
99 |
+
|
100 |
+
def _show_if_checked(checked):
    # Shared callback for the "reveal on tick" checkboxes below.
    return gr.update(visible=checked)


# Choosing a preset loads its user/system prompts into the two prompt boxes.
preset_prompt.change(
    update_prompts,
    inputs=preset_prompt,
    outputs=[custom_prompt_input, system_prompt_input],
)

# --- Summarization backend and general options ---
api_name_input = gr.Dropdown(
    choices=[
        None, "Local-LLM", "OpenAI", "Anthropic", "Cohere", "Groq", "DeepSeek", "Mistral",
        "OpenRouter",
        "Llama.cpp", "Kobold", "Ooba", "Tabbyapi", "VLLM", "ollama", "HuggingFace",
    ],
    value=None,
    label="API Name (Mandatory)",
)
api_key_input = gr.Textbox(
    label="API Key (Mandatory)",
    placeholder="Enter your API key here",
    type="password",
)
keywords_input = gr.Textbox(
    label="Keywords",
    placeholder="Enter keywords here (comma-separated)",
    value="default,no_keyword_set",
)
batch_size_input = gr.Slider(
    minimum=1, maximum=10, value=1, step=1,
    label="Batch Size (Number of videos to process simultaneously)",
)
timestamp_option = gr.Radio(
    choices=["Include Timestamps", "Exclude Timestamps"],
    value="Include Timestamps",
    label="Timestamp Option",
)
keep_original_video = gr.Checkbox(label="Keep Original Video", value=False)

# Checkboxes that reveal the optional sections defined further down.
chunking_options_checkbox = gr.Checkbox(label="Show Chunking Options", value=False)
summarize_recursively = gr.Checkbox(label="Enable Recursive Summarization", value=False)
use_cookies_input = gr.Checkbox(label="Use cookies for authenticated download", value=False)
use_time_input = gr.Checkbox(label="Use Start and End Time", value=False)

# --- Optional start/end time section (hidden until "Use Start and End Time" is ticked) ---
with gr.Row(visible=False) as time_input_box:
    gr.Markdown("### Start and End time")
    with gr.Column():
        start_time_input = gr.Textbox(
            label="Start Time (Optional)",
            placeholder="e.g., 1:30 or 90 (in seconds)",
        )
        end_time_input = gr.Textbox(
            label="End Time (Optional)",
            placeholder="e.g., 5:45 or 345 (in seconds)",
        )

use_time_input.change(
    fn=_show_if_checked,
    inputs=[use_time_input],
    outputs=[time_input_box],
)

# --- Optional cookies box (hidden until "Use cookies..." is ticked) ---
cookies_input = gr.Textbox(
    label="User Session Cookies",
    placeholder="Paste your cookies here (JSON format)",
    lines=3,
    visible=False,
)

use_cookies_input.change(
    fn=_show_if_checked,
    inputs=[use_cookies_input],
    outputs=[cookies_input],
)

# --- Optional chunking section (hidden until "Show Chunking Options" is ticked) ---
with gr.Row(visible=False) as chunking_options_box:
    gr.Markdown("### Chunking Options")
    with gr.Column():
        chunk_method = gr.Dropdown(
            choices=['words', 'sentences', 'paragraphs', 'tokens'],
            label="Chunking Method",
        )
        max_chunk_size = gr.Slider(
            minimum=100, maximum=1000, value=300, step=50,
            label="Max Chunk Size",
        )
        chunk_overlap = gr.Slider(minimum=0, maximum=100, value=0, step=10, label="Chunk Overlap")
        use_adaptive_chunking = gr.Checkbox(
            label="Use Adaptive Chunking (Adjust chunking based on text complexity)")
        use_multi_level_chunking = gr.Checkbox(label="Use Multi-level Chunking")
        chunk_language = gr.Dropdown(
            choices=['english', 'french', 'german', 'spanish'],
            label="Chunking Language",
        )

# Python callback (not JavaScript) that toggles the chunking section's visibility.
chunking_options_checkbox.change(
    fn=_show_if_checked,
    inputs=[chunking_options_checkbox],
    outputs=[chunking_options_box],
)
process_button = gr.Button("Process Videos")
|
174 |
+
|
175 |
+
# Output side of the tab: status text, hidden error box, rendered results and download links.
with gr.Column():
    progress_output = gr.Textbox(label="Progress")
    # Created hidden; it is still listed as an output of the "Process Videos" click handler.
    error_output = gr.Textbox(
        label="Errors",
        visible=False,
    )
    results_output = gr.HTML(label="Results")
    download_transcription = gr.File(
        label="Download All Transcriptions as JSON",
    )
    download_summary = gr.File(
        label="Download All Summaries as Text",
    )
|
181 |
+
|
182 |
+
@error_handler
def process_videos_with_error_handling(inputs, start_time, end_time, diarize, whisper_model,
                                       custom_prompt_checkbox, custom_prompt, chunking_options_checkbox,
                                       chunk_method, max_chunk_size, chunk_overlap, use_adaptive_chunking,
                                       use_multi_level_chunking, chunk_language, api_name,
                                       api_key, keywords, use_cookies, cookies, batch_size,
                                       timestamp_option, keep_original_video, summarize_recursively,
                                       progress: gr.Progress = gr.Progress()) -> tuple:
    """Transcribe (and optionally summarise) every URL / local file in ``inputs``.

    String inputs starting with ``http://`` or ``https://`` are treated as URLs and expanded with
    ``parse_and_expand_urls``; every other string is treated as a local path. Local paths that
    do not exist are reported as errors and are NOT processed. Items are handled in slices of
    ``batch_size`` and each one is delegated to ``process_url_with_metadata``.

    Args:
        inputs: list of URL strings and/or local file paths.
        start_time, end_time: optional time strings, converted with ``convert_to_seconds``.
        diarize, whisper_model: forwarded to ``process_url_with_metadata``.
        custom_prompt_checkbox, custom_prompt: when the checkbox is falsy the built-in
            "bulleted notes" prompt is used instead of ``custom_prompt``.
        chunking_options_checkbox, chunk_method, max_chunk_size, chunk_overlap,
        use_adaptive_chunking, use_multi_level_chunking, chunk_language: collected into the
            ``chunk_options`` dict only when the checkbox is truthy.
        api_name, api_key, keywords: forwarded to ``process_url_with_metadata``.
        use_cookies, cookies: forwarded to ``extract_metadata`` for URL inputs.
        batch_size: items per progress step; non-integer or < 1 values fall back to 1.
        timestamp_option: ``"Include Timestamps"`` enables timestamps in the transcription.
        keep_original_video: forwarded to ``process_url_with_metadata``.
        summarize_recursively: accepted for interface compatibility; not used in this function.
        progress: Gradio progress tracker, called after each batch.

    Returns:
        A 5-tuple ``(status message, error summary, results HTML, transcriptions JSON path,
        summaries text path)``. If an unexpected exception aborts the run, the two paths are ``None``.
    """
    import html  # local import: only used to escape values interpolated into the results HTML

    # Prompt used whenever the "Use a Custom Prompt" checkbox is not ticked.
    default_prompt = ("""
<s>You are a bulleted notes specialist. [INST]```When creating comprehensive bulleted notes, you should follow these guidelines: Use multiple headings based on the referenced topics, not categories like quotes or terms. Headings should be surrounded by bold formatting and not be listed as bullet points themselves. Leave no space between headings and their corresponding list items underneath. Important terms within the content should be emphasized by setting them in bold font. Any text that ends with a colon should also be bolded. Before submitting your response, review the instructions, and make any corrections necessary to adhered to the specified format. Do not reference these instructions within the notes.``` \nBased on the content between backticks create comprehensive bulleted notes.[/INST]
**Bulleted Note Creation Guidelines**

**Headings**:
- Based on referenced topics, not categories like quotes or terms
- Surrounded by **bold** formatting
- Not listed as bullet points
- No space between headings and list items underneath

**Emphasis**:
- **Important terms** set in bold font
- **Text ending in a colon**: also bolded

**Review**:
- Ensure adherence to specified format
- Do not reference these instructions in your response.</s>[INST] {{ .Prompt }} [/INST]
""")

    try:
        logging.info("Entering process_videos_with_error_handling")
        logging.info(f"Received inputs: {inputs}")

        if not inputs:
            raise ValueError("No inputs provided")

        logging.debug("Input(s) is(are) valid")

        # Ensure batch_size is a positive integer; range() below rejects a step of 0.
        try:
            batch_size = int(batch_size)
        except (ValueError, TypeError):
            batch_size = 1  # Default to processing one video at a time if invalid
        if batch_size < 1:
            batch_size = 1

        # Separate URLs and local files
        url_prefixes = ('http://', 'https://')
        text_inputs = [item for item in inputs if isinstance(item, str)]
        urls = [item for item in text_inputs if item.startswith(url_prefixes)]
        local_files = [item for item in text_inputs if not item.startswith(url_prefixes)]

        # Parse and expand URLs if there are any
        expanded_urls = parse_and_expand_urls(urls) if urls else []

        results = []
        errors = []
        results_html = ""
        all_transcriptions = {}
        all_summaries = ""

        # Missing local files are reported to the user and excluded from processing.
        valid_local_files = []
        invalid_local_files = []
        for file_path in local_files:
            if os.path.exists(file_path):
                valid_local_files.append(file_path)
            else:
                invalid_local_files.append(file_path)
                error_message = f"Local file not found: {file_path}"
                logging.error(error_message)
                errors.append(error_message)
                results.append((file_path, error_message, "Error", {}, None, None))

        if invalid_local_files:
            logging.warning(f"Found {len(invalid_local_files)} invalid local file paths")

        all_inputs = expanded_urls + valid_local_files
        logging.info(f"Total valid inputs to process: {len(all_inputs)} "
                     f"({len(expanded_urls)} URLs, {len(valid_local_files)} local files)")

        chunk_options = {
            'method': chunk_method,
            'max_size': max_chunk_size,
            'overlap': chunk_overlap,
            'adaptive': use_adaptive_chunking,
            'multi_level': use_multi_level_chunking,
            'language': chunk_language
        } if chunking_options_checkbox else None

        prompt_to_use = custom_prompt if custom_prompt_checkbox else default_prompt

        for i in range(0, len(all_inputs), batch_size):
            batch = all_inputs[i:i + batch_size]
            batch_results = []

            for input_item in batch:
                try:
                    # Converted per item so that a malformed time string is reported per input.
                    start_seconds = convert_to_seconds(start_time)
                    end_seconds = convert_to_seconds(end_time) if end_time else None

                    logging.info(f"Attempting to extract metadata for {input_item}")

                    if input_item.startswith(url_prefixes):
                        logging.info(f"Attempting to extract metadata for URL: {input_item}")
                        video_metadata = extract_metadata(input_item, use_cookies, cookies)
                        if not video_metadata:
                            raise ValueError(f"Failed to extract metadata for {input_item}")
                    else:
                        logging.info(f"Processing local file: {input_item}")
                        video_metadata = {"title": os.path.basename(input_item), "url": input_item}

                    logging.debug("Gradio_Related.py: process_url_with_metadata being called")
                    result = process_url_with_metadata(
                        input_item, 2, whisper_model,
                        prompt_to_use,
                        start_seconds, api_name, api_key,
                        False, False, False, False, 0.01, None, keywords, None, diarize,
                        end_time=end_seconds,
                        include_timestamps=(timestamp_option == "Include Timestamps"),
                        metadata=video_metadata,
                        use_chunking=chunking_options_checkbox,
                        chunk_options=chunk_options,
                        keep_original_video=keep_original_video,
                        current_whisper_model=whisper_model,
                    )

                    if result[0] is None:
                        error_message = "Processing failed without specific error"
                        batch_results.append(
                            (input_item, error_message, "Error", video_metadata, None, None))
                        errors.append(f"Error processing {input_item}: {error_message}")
                    else:
                        url, transcription, summary, json_file, summary_file, result_metadata = result
                        if transcription is None:
                            error_message = f"Processing failed for {input_item}: Transcription is None"
                            batch_results.append(
                                (input_item, error_message, "Error", result_metadata, None, None))
                            errors.append(error_message)
                        else:
                            batch_results.append(
                                (input_item, transcription, "Success", result_metadata, json_file,
                                 summary_file))

                except Exception as e:
                    # Best effort: one failing item must not abort the rest of the run.
                    error_message = f"Error processing {input_item}: {str(e)}"
                    logging.error(error_message, exc_info=True)
                    batch_results.append((input_item, error_message, "Error", {}, None, None))
                    errors.append(error_message)

            results.extend(batch_results)
            logging.debug(f"Processed {len(batch_results)} videos in batch")
            if isinstance(progress, gr.Progress):
                progress((i + len(batch)) / len(all_inputs),
                         f"Processed {i + len(batch)}/{len(all_inputs)} videos")

        # Generate HTML for results
        logging.debug(f"Generating HTML for {len(results)} results")
        for url, transcription, status, metadata, json_file, summary_file in results:
            # URLs, titles and messages originate from user input / remote sites: escape for HTML.
            safe_url = html.escape(str(url))
            if status == "Success":
                title = metadata.get('title', 'Unknown Title')

                # Check if transcription is a string (which it should be now)
                if isinstance(transcription, str):
                    # Split the transcription into metadata and actual transcription
                    parts = transcription.split('\n\n', 1)
                    if len(parts) == 2:
                        metadata_text, transcription_text = parts
                    else:
                        metadata_text = "Metadata not found"
                        transcription_text = transcription
                else:
                    metadata_text = "Metadata format error"
                    transcription_text = "Transcription format error"

                summary = safe_read_file(summary_file) if summary_file else "No summary available"

                # FIXME - Add to other functions that generate HTML
                formatted_transcription = format_transcription(transcription_text)
                formatted_summary = format_transcription(summary)

                safe_title = html.escape(str(title))
                safe_metadata = html.escape(metadata_text)

                results_html += f"""
                <div class="result-box">
                    <gradio-accordion>
                        <gradio-accordion-item label="{safe_title}">
                            <p><strong>URL:</strong> <a href="{safe_url}" target="_blank">{safe_url}</a></p>
                            <h4>Metadata:</h4>
                            <pre>{safe_metadata}</pre>
                            <h4>Transcription:</h4>
                            <div class="transcription" style="white-space: pre-wrap; word-wrap: break-word;">
                                {formatted_transcription}
                            </div>
                            <h4>Summary:</h4>
                            <div class="summary">{formatted_summary}</div>
                        </gradio-accordion-item>
                    </gradio-accordion>
                </div>
                """
                logging.debug(f"Transcription for {url}: {transcription[:200]}...")
                all_transcriptions[url] = transcription
                all_summaries += f"Title: {title}\nURL: {url}\n\n{metadata_text}\n\nTranscription:\n{transcription_text}\n\nSummary:\n{summary}\n\n---\n\n"
            else:
                # For failed items the "transcription" slot carries the error message.
                results_html += f"""
                <div class="result-box error">
                    <h3>Error processing {safe_url}</h3>
                    <p>{html.escape(str(transcription))}</p>
                </div>
                """

        # Save all transcriptions and summaries to files
        logging.debug("Saving all transcriptions and summaries to files")
        with open('all_transcriptions.json', 'w', encoding='utf-8') as f:
            json.dump(all_transcriptions, f, indent=2, ensure_ascii=False)

        with open('all_summaries.txt', 'w', encoding='utf-8') as f:
            f.write(all_summaries)

        error_summary = "\n".join(errors) if errors else "No errors occurred."

        total_inputs = len(all_inputs)
        return (
            f"Processed {total_inputs} videos. {len(errors)} errors occurred.",
            error_summary,
            results_html,
            'all_transcriptions.json',
            'all_summaries.txt'
        )
    except Exception as e:
        logging.error(f"Unexpected error in process_videos_with_error_handling: {str(e)}", exc_info=True)
        return (
            f"An unexpected error occurred: {str(e)}",
            str(e),
            "<div class='result-box error'><h3>Unexpected Error</h3><p>" + html.escape(str(e)) + "</p></div>",
            None,
            None
        )
|
419 |
+
|
420 |
+
def process_videos_wrapper(url_input, video_file, start_time, end_time, diarize, whisper_model,
                           custom_prompt_checkbox, custom_prompt, chunking_options_checkbox,
                           chunk_method, max_chunk_size, chunk_overlap, use_adaptive_chunking,
                           use_multi_level_chunking, chunk_language, summarize_recursively, api_name,
                           api_key, keywords, use_cookies, cookies, batch_size,
                           timestamp_option, keep_original_video):
    """Click handler for "Process Videos": gather inputs and delegate the actual work.

    Deletes the aggregate output files of a previous run, merges the URL text box (one URL per
    line) with the optional uploaded file into a single input list and calls
    ``process_videos_with_error_handling``.

    Args:
        url_input: newline-separated URLs (may be empty).
        video_file: uploaded file or ``None``. An object with a ``name`` attribute contributes
            that name; any other value is appended as-is (presumably a path string - verify
            against the installed Gradio version).
        (remaining arguments): forwarded unchanged to ``process_videos_with_error_handling``.

    Returns:
        Always a 5-tuple matching the click outputs
        ``(progress, errors, results HTML, transcription file, summary file)``. On any failure
        the first three items describe the error and the two file entries are ``None``.
    """
    try:
        logging.info("process_videos_wrapper(): process_videos_wrapper called")

        # Delete the aggregate files left behind by a previous run, if any.
        for file_path in ('all_transcriptions.json', 'all_summaries.txt'):
            try:
                if os.path.exists(file_path):
                    os.remove(file_path)
                    logging.info(f"Deleted existing file: {file_path}")
            except Exception as e:
                # Best effort only: a stale file must not block processing.
                logging.warning(f"Failed to delete file {file_path}: {str(e)}")

        # Handle both URL input and file upload
        inputs = []
        if url_input:
            inputs.extend(line.strip() for line in url_input.split('\n') if line.strip())
        if video_file is not None:
            inputs.append(getattr(video_file, 'name', video_file))

        if not inputs:
            raise ValueError("No input provided. Please enter URLs or upload a video file.")

        # `result` is deliberately a local: a module-level global would let a failed run
        # return the outcome of an earlier request.
        try:
            result = process_videos_with_error_handling(
                inputs, start_time, end_time, diarize, whisper_model,
                custom_prompt_checkbox, custom_prompt, chunking_options_checkbox,
                chunk_method, max_chunk_size, chunk_overlap, use_adaptive_chunking,
                use_multi_level_chunking, chunk_language, api_name,
                api_key, keywords, use_cookies, cookies, batch_size,
                timestamp_option, keep_original_video, summarize_recursively
            )
        except Exception as e:
            logging.error(
                f"process_videos_wrapper(): Error in process_videos_with_error_handling: {str(e)}",
                exc_info=True)
            # Propagate to the outer handler so the real cause is what the user sees.
            raise

        logging.info("process_videos_wrapper(): process_videos_with_error_handling completed")

        # Ensure that result is a tuple with 5 elements
        if not isinstance(result, tuple) or len(result) != 5:
            raise ValueError(
                f"process_videos_wrapper(): Expected 5 outputs, but got {len(result) if isinstance(result, tuple) else 1}")

        return result
    except Exception as e:
        logging.error(f"process_videos_wrapper(): Error in process_videos_wrapper: {str(e)}", exc_info=True)
        # Return a tuple with 5 elements in case of any error
        return (
            # progress_output
            f"process_videos_wrapper(): An error occurred: {str(e)}",
            # error_output
            str(e),
            # results_output
            f"<div class='error'>Error: {str(e)}</div>",
            # download_transcription
            None,
            # download_summary
            None
        )
|
490 |
+
|
491 |
+
# FIXME - remove dead args for process_url_with_metadata
|
492 |
+
@error_handler
def process_url_with_metadata(input_item, num_speakers, whisper_model, custom_prompt, offset, api_name,
                              api_key,
                              vad_filter, download_video_flag, download_audio, rolling_summarization,
                              detail_level, question_box, keywords, local_file_path, diarize, end_time=None,
                              include_timestamps=True, metadata=None, use_chunking=False,
                              chunk_options=None, keep_original_video=False, current_whisper_model="Blank"):
    """Download (if needed), transcribe, optionally summarise and store one video.

    Steps: gather metadata (yt-dlp for URLs, basic info for an existing local file), download
    the media for URLs, transcribe it, write ``<audio>.segments.json``, clean up temporary
    files, summarise when ``api_name`` is set, save transcription/summary and add the item to
    the database.

    Args:
        input_item: URL or path of an existing local file.
        whisper_model, offset, vad_filter, diarize: forwarded to ``perform_transcription``.
        custom_prompt, api_name, api_key: used for ``perform_summarization``; no summary is
            produced when ``api_name`` is falsy.
        download_video_flag: forwarded to ``download_video``.
        keywords: comma-separated string or list/tuple of keywords stored with the media.
        include_timestamps: when False only the ``'Text'`` field of each segment is kept.
        keep_original_video: when True the video file is not deleted after transcription.
        num_speakers, download_audio, rolling_summarization, detail_level, question_box,
        local_file_path, end_time, metadata, use_chunking, chunk_options,
        current_whisper_model: accepted for interface compatibility; unused in this body.

    Returns:
        ``(webpage_url, full_text_with_metadata, summary_text, json_file_path,
        summary_file_path, info_dict)`` on success, or a tuple of six ``None`` values on any
        failure (the error is logged).
    """
    failure = (None, None, None, None, None, None)

    try:
        logging.info(f"Starting process_url_metadata for URL: {input_item}")
        # Create download path
        download_path = create_download_directory("Video_Downloads")
        logging.info(f"Download path created at: {download_path}")

        # Initialize info_dict
        info_dict = {}

        # Handle URL or local file
        if os.path.isfile(input_item):
            video_file_path = input_item
            unique_id = generate_unique_identifier(input_item)
            # Extract basic info from local file
            info_dict = {
                'webpage_url': unique_id,
                'title': os.path.basename(input_item),
                'description': "Local file",
                'channel_url': None,
                'duration': None,
                'channel': None,
                'uploader': None,
                'upload_date': None
            }
        else:
            # Extract video information
            with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
                try:
                    full_info = ydl.extract_info(input_item, download=False)

                    # Create a safe subset of info to log
                    safe_info = {
                        'title': full_info.get('title', 'No title'),
                        'duration': full_info.get('duration', 'Unknown duration'),
                        'upload_date': full_info.get('upload_date', 'Unknown upload date'),
                        'uploader': full_info.get('uploader', 'Unknown uploader'),
                        'view_count': full_info.get('view_count', 'Unknown view count')
                    }

                    logging.debug(f"Full info extracted for {input_item}: {safe_info}")
                except Exception as e:
                    logging.error(f"Error extracting video info: {str(e)}")
                    return failure

            # Filter the required metadata
            if full_info:
                info_dict = {
                    'webpage_url': full_info.get('webpage_url', input_item),
                    'title': full_info.get('title'),
                    'description': full_info.get('description'),
                    'channel_url': full_info.get('channel_url'),
                    'duration': full_info.get('duration'),
                    'channel': full_info.get('channel'),
                    'uploader': full_info.get('uploader'),
                    'upload_date': full_info.get('upload_date')
                }
                logging.debug(f"Filtered info_dict: {info_dict}")
            else:
                logging.error("Failed to extract video information")
                return failure

            # Download video/audio
            logging.info("Downloading video/audio...")
            # NOTE(review): the literal "Blank" is passed instead of the `current_whisper_model`
            # argument; kept as-is because download_video's handling of it is not visible here.
            video_file_path = download_video(input_item, download_path, full_info, download_video_flag,
                                             current_whisper_model="Blank")
            if not video_file_path:
                logging.error(f"Failed to download video/audio from {input_item}")
                return failure

        logging.info(f"Processing file: {video_file_path}")

        # Perform transcription
        logging.info("Starting transcription...")
        audio_file_path, segments = perform_transcription(video_file_path, offset, whisper_model,
                                                          vad_filter, diarize)

        if audio_file_path is None or segments is None:
            logging.error("Transcription failed or segments not available.")
            return failure

        logging.info(f"Transcription completed. Number of segments: {len(segments)}")

        # Add metadata to segments
        segments_with_metadata = {
            "metadata": info_dict,
            "segments": segments
        }

        # Save segments with metadata to JSON file
        segments_json_path = os.path.splitext(audio_file_path)[0] + ".segments.json"
        with open(segments_json_path, 'w', encoding='utf-8') as f:
            json.dump(segments_with_metadata, f, indent=2)

        # Cleanup in a single pass: the transcription audio file is always removed, the video
        # file only when the caller did not ask to keep it.
        files_to_delete = [audio_file_path]
        if keep_original_video:
            logging.info(f"Keeping original video file: {video_file_path}")
        else:
            files_to_delete.append(video_file_path)
        for file_path in files_to_delete:
            if file_path and os.path.exists(file_path):
                try:
                    os.remove(file_path)
                    logging.info(f"Successfully deleted file: {file_path}")
                except Exception as e:
                    # Best effort: leftover files must not fail the whole run.
                    logging.warning(f"Failed to delete file {file_path}: {str(e)}")

        # Process segments based on the timestamp option
        if not include_timestamps:
            segments = [{'Text': segment['Text']} for segment in segments]

        logging.info(f"Segments processed for timestamp inclusion: {segments}")

        # Extract text from segments
        transcription_text = extract_text_from_segments(segments)

        if transcription_text.startswith("Error:"):
            logging.error(f"Failed to extract transcription: {transcription_text}")
            return failure

        # Use transcription_text instead of segments for further processing
        full_text_with_metadata = f"{json.dumps(info_dict, indent=2)}\n\n{transcription_text}"

        logging.debug(f"Full text with metadata extracted: {full_text_with_metadata[:100]}...")

        # Perform summarization if API is provided
        summary_text = None
        if api_name:
            # Normalise an empty key to None before handing it to the summarizer.
            api_key = api_key if api_key else None
            logging.info(f"Starting summarization with {api_name}...")
            summary_text = perform_summarization(api_name, full_text_with_metadata, custom_prompt, api_key)
            if summary_text is None:
                logging.error("Summarization failed.")
                return failure
            logging.debug(f"Summarization completed: {summary_text[:100]}...")

        # Save transcription and summary
        logging.info("Saving transcription and summary...")
        download_path = create_download_directory("Audio_Processing")
        json_file_path, summary_file_path = save_transcription_and_summary(full_text_with_metadata,
                                                                           summary_text,
                                                                           download_path, info_dict)
        logging.info(f"Transcription saved to: {json_file_path}")
        logging.info(f"Summary saved to: {summary_file_path}")

        # Prepare keywords for database
        if isinstance(keywords, str):
            keywords_list = [kw.strip() for kw in keywords.split(',') if kw.strip()]
        elif isinstance(keywords, (list, tuple)):
            keywords_list = keywords
        else:
            keywords_list = []
        logging.info(f"Keywords prepared: {keywords_list}")

        # Add to database
        logging.info("Adding to database...")
        add_media_to_database(info_dict['webpage_url'], info_dict, full_text_with_metadata, summary_text,
                              keywords_list, custom_prompt, whisper_model)
        logging.info(f"Media added to database: {info_dict['webpage_url']}")

        return (info_dict['webpage_url'], full_text_with_metadata, summary_text,
                json_file_path, summary_file_path, info_dict)

    except Exception as e:
        logging.error(f"Error in process_url_with_metadata: {str(e)}", exc_info=True)
        return failure
|
679 |
+
|
680 |
+
# Components are listed in the exact positional order of process_videos_wrapper's parameters.
click_inputs = [
    url_input, video_file_input, start_time_input, end_time_input, diarize_input, whisper_model_input,
    custom_prompt_checkbox, custom_prompt_input, chunking_options_checkbox,
    chunk_method, max_chunk_size, chunk_overlap, use_adaptive_chunking,
    use_multi_level_chunking, chunk_language, summarize_recursively, api_name_input, api_key_input,
    keywords_input, use_cookies_input, cookies_input, batch_size_input,
    timestamp_option, keep_original_video,
]
# One component per element of the 5-tuple returned by process_videos_wrapper.
click_outputs = [
    progress_output,
    error_output,
    results_output,
    download_transcription,
    download_summary,
]
process_button.click(
    fn=process_videos_wrapper,
    inputs=click_inputs,
    outputs=click_outputs,
)
|
App_Function_Libraries/Gradio_UI/Website_scraping_tab.py
ADDED
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Website_scraping_tab.py
|
2 |
+
# Gradio UI for scraping websites
|
3 |
+
|
4 |
+
# Imports
|
5 |
+
#
|
6 |
+
# External Imports
|
7 |
+
import gradio as gr
|
8 |
+
|
9 |
+
from App_Function_Libraries.Article_Summarization_Lib import scrape_and_summarize_multiple
|
10 |
+
from App_Function_Libraries.DB_Manager import load_preset_prompts
|
11 |
+
from App_Function_Libraries.Gradio_UI.Chat_ui import update_user_prompt
|
12 |
+
|
13 |
+
|
14 |
+
#
|
15 |
+
# Local Imports
|
16 |
+
#
|
17 |
+
#
|
18 |
+
########################################################################################################################
|
19 |
+
#
|
20 |
+
# Functions:
|
21 |
+
|
22 |
+
|
23 |
+
def create_website_scraping_tab():
    """Build the "Website Scraping" tab of the Gradio UI.

    The left column collects article URLs, optional custom titles, prompt
    settings (custom or preset), the summarization API, its key and keywords.
    The right column shows the result returned by
    ``scrape_and_summarize_multiple`` when "Scrape and Summarize" is clicked.
    """
    # Default system prompt shown (once made visible) in the "System Prompt" box.
    default_system_prompt = """<s>You are a bulleted notes specialist. [INST]```When creating comprehensive bulleted notes, you should follow these guidelines: Use multiple headings based on the referenced topics, not categories like quotes or terms. Headings should be surrounded by bold formatting and not be listed as bullet points themselves. Leave no space between headings and their corresponding list items underneath. Important terms within the content should be emphasized by setting them in bold font. Any text that ends with a colon should also be bolded. Before submitting your response, review the instructions, and make any corrections necessary to adhered to the specified format. Do not reference these instructions within the notes.``` \nBased on the content between backticks create comprehensive bulleted notes.[/INST]
**Bulleted Note Creation Guidelines**

**Headings**:
- Based on referenced topics, not categories like quotes or terms
- Surrounded by **bold** formatting
- Not listed as bullet points
- No space between headings and list items underneath

**Emphasis**:
- **Important terms** set in bold font
- **Text ending in a colon**: also bolded

**Review**:
- Ensure adherence to specified format
- Do not reference these instructions in your response.</s>[INST] {{ .Prompt }} [/INST]
"""
    api_choices = [None, "Local-LLM", "OpenAI", "Anthropic", "Cohere", "Groq", "DeepSeek", "Mistral", "OpenRouter",
                   "Llama.cpp", "Kobold", "Ooba", "Tabbyapi", "VLLM","ollama", "HuggingFace"]

    def toggle_prompt_boxes(checked):
        # Show/hide the custom prompt and system prompt boxes together.
        return gr.update(visible=checked), gr.update(visible=checked)

    def toggle_preset_dropdown(checked):
        # Show/hide the preset prompt dropdown.
        return gr.update(visible=checked)

    def update_prompts(preset_name):
        # Fill both prompt boxes from the selected preset and reveal them.
        prompts = update_user_prompt(preset_name)
        user_box = gr.update(value=prompts["user_prompt"], visible=True)
        system_box = gr.update(value=prompts["system_prompt"], visible=True)
        return user_box, system_box

    with gr.TabItem("Website Scraping"):
        gr.Markdown("# Scrape Websites & Summarize Articles using a Headless Chrome Browser!")
        with gr.Row():
            with gr.Column():
                url_input = gr.Textbox(
                    label="Article URLs",
                    placeholder="Enter article URLs here, one per line",
                    lines=5,
                )
                custom_article_title_input = gr.Textbox(
                    label="Custom Article Titles (Optional, one per line)",
                    placeholder="Enter custom titles for the articles, one per line",
                    lines=5,
                )
                with gr.Row():
                    custom_prompt_checkbox = gr.Checkbox(label="Use a Custom Prompt", value=False, visible=True)
                    preset_prompt_checkbox = gr.Checkbox(label="Use a pre-set Prompt", value=False, visible=True)
                with gr.Row():
                    preset_prompt = gr.Dropdown(
                        label="Select Preset Prompt",
                        choices=load_preset_prompts(),
                        visible=False,
                    )
                with gr.Row():
                    website_custom_prompt_input = gr.Textbox(
                        label="Custom Prompt",
                        placeholder="Enter custom prompt here",
                        lines=3,
                        visible=False,
                    )
                with gr.Row():
                    system_prompt_input = gr.Textbox(
                        label="System Prompt",
                        value=default_system_prompt,
                        lines=3,
                        visible=False,
                    )

                custom_prompt_checkbox.change(
                    fn=toggle_prompt_boxes,
                    inputs=[custom_prompt_checkbox],
                    outputs=[website_custom_prompt_input, system_prompt_input]
                )
                preset_prompt_checkbox.change(
                    fn=toggle_preset_dropdown,
                    inputs=[preset_prompt_checkbox],
                    outputs=[preset_prompt]
                )
                preset_prompt.change(
                    update_prompts,
                    inputs=preset_prompt,
                    outputs=[website_custom_prompt_input, system_prompt_input]
                )

                api_name_input = gr.Dropdown(
                    choices=api_choices,
                    value=None,
                    label="API Name (Mandatory for Summarization)",
                )
                api_key_input = gr.Textbox(
                    label="API Key (Mandatory if API Name is specified)",
                    placeholder="Enter your API key here; Ignore if using Local API or Built-in API",
                    type="password",
                )
                keywords_input = gr.Textbox(
                    label="Keywords",
                    placeholder="Enter keywords here (comma-separated)",
                    value="default,no_keyword_set",
                    visible=True,
                )

                scrape_button = gr.Button("Scrape and Summarize")
            with gr.Column():
                result_output = gr.Textbox(label="Result", lines=20)

                # Input order must match the positional parameters of
                # scrape_and_summarize_multiple (defined in Article_Summarization_Lib).
                scrape_button.click(
                    fn=scrape_and_summarize_multiple,
                    inputs=[url_input, website_custom_prompt_input, api_name_input, api_key_input, keywords_input,
                            custom_article_title_input, system_prompt_input],
                    outputs=result_output
                )
|
112 |
+
|
113 |
+
|
App_Function_Libraries/Gradio_UI/Writing.py
ADDED
@@ -0,0 +1,700 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Writing.py
|
2 |
+
# Description: This file contains the functions that are used for writing in the Gradio UI.
|
3 |
+
#
|
4 |
+
# Imports
|
5 |
+
import base64
|
6 |
+
from datetime import datetime as datetime
|
7 |
+
import logging
|
8 |
+
import json
|
9 |
+
import os
|
10 |
+
#
|
11 |
+
# External Imports
|
12 |
+
import gradio as gr
|
13 |
+
from PIL import Image
|
14 |
+
import textstat
|
15 |
+
#
|
16 |
+
# Local Imports
|
17 |
+
from App_Function_Libraries.Summarization_General_Lib import perform_summarization
|
18 |
+
from App_Function_Libraries.Chat import chat
|
19 |
+
#
|
20 |
+
########################################################################################################################
|
21 |
+
#
|
22 |
+
# Functions:
|
23 |
+
|
24 |
+
def adjust_tone(text, concise, casual, api_name, api_key):
    """Ask the selected API to rewrite ``text`` in the two strongest tones.

    Args:
        text: The text to rewrite.
        concise: Weight for "concise"; "expanded" gets ``1 - concise``.
        casual: Weight for "casual"; "professional" gets ``1 - casual``.
        api_name: API name forwarded to ``perform_summarization``.
        api_key: API key forwarded to ``perform_summarization``.

    Returns:
        Whatever ``perform_summarization`` returns for the rewrite request.
    """
    weighted_tones = [
        {"tone": label, "weight": weight}
        for label, weight in (
            ("concise", concise),
            ("casual", casual),
            ("professional", 1 - casual),
            ("expanded", 1 - concise),
        )
    ]
    # Only the two highest-weighted tones are mentioned in the prompt.
    strongest = sorted(weighted_tones, key=lambda item: item['weight'], reverse=True)[:2]

    descriptions = [f"{item['tone']} (weight: {item['weight']:.2f})" for item in strongest]
    tone_prompt = " and ".join(descriptions)

    prompt = f"Rewrite the following text to match these tones: {tone_prompt}. Text: {text}"
    # Performing tone adjustment request...
    return perform_summarization(api_name, text, prompt, api_key)
|
40 |
+
|
41 |
+
|
42 |
+
def grammar_style_check(input_text, custom_prompt, api_name, api_key, system_prompt):
    """Send ``input_text`` to the selected API for a grammar and style review.

    The text is prefixed with ``custom_prompt`` when one is given, otherwise
    with a built-in grammar/style instruction. ``custom_prompt`` is also passed
    on unchanged as the prompt argument of ``perform_summarization``.

    Returns:
        Whatever ``perform_summarization`` returns.
    """
    default_prompt = "Please analyze the following text for grammar and style. Offer suggestions for improvement and point out any misused words or incorrect spellings:\n\n"
    instruction = custom_prompt or default_prompt
    text_with_instruction = instruction + input_text

    return perform_summarization(api_name, text_with_instruction, custom_prompt, api_key, system_prompt)
|
48 |
+
|
49 |
+
|
50 |
+
def create_grammar_style_check_tab():
    """Build the "Grammar and Style Check" tab of the Gradio UI.

    The left column takes the text, optional custom/system prompts and API
    settings; the right column shows the suggestions returned by
    ``grammar_style_check``.
    """
    # NOTE(review): this is the default *value* of the "user Prompt" box, so it is
    # sent to grammar_style_check even while the box is hidden, which means the
    # built-in grammar prompt there is never used. Looks like a copy-paste from
    # the summarization tabs; confirm the intended default.
    default_user_prompt = """<s>You are a bulleted notes specialist. [INST]```When creating comprehensive bulleted notes, you should follow these guidelines: Use multiple headings based on the referenced topics, not categories like quotes or terms. Headings should be surrounded by bold formatting and not be listed as bullet points themselves. Leave no space between headings and their corresponding list items underneath. Important terms within the content should be emphasized by setting them in bold font. Any text that ends with a colon should also be bolded. Before submitting your response, review the instructions, and make any corrections necessary to adhered to the specified format. Do not reference these instructions within the notes.``` \nBased on the content between backticks create comprehensive bulleted notes.[/INST]
**Bulleted Note Creation Guidelines**

**Headings**:
- Based on referenced topics, not categories like quotes or terms
- Surrounded by **bold** formatting
- Not listed as bullet points
- No space between headings and list items underneath

**Emphasis**:
- **Important terms** set in bold font
- **Text ending in a colon**: also bolded

**Review**:
- Ensure adherence to specified format
- Do not reference these instructions in your response.</s>[INST] {{ .Prompt }} [/INST]
"""
    api_choices = [None, "Local-LLM", "OpenAI", "Anthropic", "Cohere", "Groq", "DeepSeek", "Mistral", "OpenRouter",
                   "Llama.cpp", "Kobold", "Ooba", "Tabbyapi", "VLLM","ollama", "HuggingFace"]

    def toggle_prompt_boxes(checked):
        # Show/hide the user prompt and system prompt boxes together.
        return gr.update(visible=checked), gr.update(visible=checked)

    with gr.TabItem("Grammar and Style Check"):
        with gr.Row():
            with gr.Column():
                gr.Markdown("# Grammar and Style Check")
                gr.Markdown("This utility checks the grammar and style of the provided text by feeding it to an LLM and returning suggestions for improvement.")
                input_text = gr.Textbox(label="Input Text", lines=10)
                custom_prompt_checkbox = gr.Checkbox(label="Use Custom Prompt", value=False, visible=True)
                system_prompt_input = gr.Textbox(
                    label="System Prompt",
                    placeholder="Please analyze the provided text for grammar and style. Offer any suggestions or points to improve you can identify. Additionally please point out any misuses of any words or incorrect spellings.",
                    lines=5,
                    visible=False,
                )
                custom_prompt_input = gr.Textbox(
                    label="user Prompt",
                    value=default_user_prompt,
                    lines=3,
                    visible=False,
                )
                custom_prompt_checkbox.change(
                    fn=toggle_prompt_boxes,
                    inputs=[custom_prompt_checkbox],
                    outputs=[custom_prompt_input, system_prompt_input]
                )
                api_name_input = gr.Dropdown(
                    choices=api_choices,
                    value=None,
                    label="API for Grammar Check"
                )
                api_key_input = gr.Textbox(
                    label="API Key (if not set in config.txt)",
                    placeholder="Enter your API key here",
                    type="password",
                )
                check_grammar_button = gr.Button("Check Grammar and Style")

            with gr.Column():
                gr.Markdown("# Resulting Suggestions")
                gr.Markdown("(Keep in mind the API used can affect the quality of the suggestions)")

                output_text = gr.Textbox(label="Grammar and Style Suggestions", lines=15)

            # Input order matches grammar_style_check's positional parameters.
            check_grammar_button.click(
                fn=grammar_style_check,
                inputs=[input_text, custom_prompt_input, api_name_input, api_key_input, system_prompt_input],
                outputs=output_text
            )
|
105 |
+
|
106 |
+
|
107 |
+
def create_tone_adjustment_tab():
    """Build the "Tone Analyzer & Editor" tab of the Gradio UI.

    The left column takes the text, two tone sliders (concise vs expanded,
    casual vs professional) and API settings; the right column shows the text
    returned by ``adjust_tone``.
    """
    with gr.TabItem("Tone Analyzer & Editor"):
        with gr.Row():
            with gr.Column():
                input_text = gr.Textbox(label="Input Text", lines=10)
                concise_slider = gr.Slider(minimum=0, maximum=1, value=0.5, label="Concise vs Expanded")
                casual_slider = gr.Slider(minimum=0, maximum=1, value=0.5, label="Casual vs Professional")
                api_name_input = gr.Dropdown(
                    choices=[None, "Local-LLM", "OpenAI", "Anthropic", "Cohere", "Groq", "DeepSeek", "Mistral", "OpenRouter",
                             "Llama.cpp", "Kobold", "Ooba", "Tabbyapi", "VLLM","ollama", "HuggingFace"],
                    value=None,
                    label="API for Grammar Check"
                )
                api_key_input = gr.Textbox(label="API Key (if not set in config.txt)", placeholder="Enter your API key here",
                                           type="password")
                adjust_btn = gr.Button("Adjust Tone")

            with gr.Column():
                output_text = gr.Textbox(label="Adjusted Text", lines=15)

            # adjust_tone(text, concise, casual, api_name, api_key) takes five
            # positional arguments. The API name and key must be wired in too,
            # otherwise the click handler is called with too few arguments and
            # the selected API is never used.
            adjust_btn.click(
                adjust_tone,
                inputs=[input_text, concise_slider, casual_slider, api_name_input, api_key_input],
                outputs=output_text
            )
|
132 |
+
|
133 |
+
|
134 |
+
# Prompt prefixes for the built-in feedback personas, keyed by the persona name
# used in the UI dropdowns. generate_writing_feedback falls back to a generic
# "As <persona>, ..." prefix for names not listed here, and
# create_document_feedback_tab adds entries at runtime for custom personas.
persona_prompts = {
    "Hemingway": "As Ernest Hemingway, known for concise and straightforward prose, provide feedback on the following text:",
    "Shakespeare": "Channel William Shakespeare's poetic style and provide feedback on the following text:",
    "Jane Austen": "Embodying Jane Austen's wit and social commentary, critique the following text:",
    "Stephen King": "With Stephen King's flair for suspense and horror, analyze the following text:",
    "J.K. Rowling": "As J.K. Rowling, creator of the magical world of Harry Potter, review the following text:"
}
|
141 |
+
|
142 |
+
def generate_writing_feedback(text, persona, aspect, api_name, api_key):
    """Request feedback on ``text`` written in the voice of ``persona``.

    Args:
        text: The writing to critique.
        persona: Either a persona name (looked up in ``persona_prompts``, with a
            generic fallback) or a character-card dict providing ``name``,
            ``personality`` and ``scenario``.
        aspect: ``"Overall"`` for general feedback, otherwise the aspect the
            feedback should focus on.
        api_name: API name forwarded to ``perform_summarization``.
        api_key: API key forwarded to ``perform_summarization``.

    Returns:
        Whatever ``perform_summarization`` returns.
    """
    if not isinstance(persona, dict):
        # Regular persona: use its canned prefix, or a generic one.
        base_prompt = persona_prompts.get(persona, f"As {persona}, provide feedback on the following text:")
    else:
        # Character card: build the prefix from the card's fields.
        base_prompt = f"You are {persona['name']}. {persona['personality']}\n\nScenario: {persona['scenario']}\n\nRespond to the following message in character:"

    if aspect == "Overall":
        prompt = f"{base_prompt}\n\n{text}"
    else:
        prompt = f"{base_prompt}\n\nFocus specifically on the {aspect.lower()} in the following text:\n\n{text}"

    system_message = "You are a helpful AI assistant. You will respond to the user as if you were the persona declared in the user prompt."
    return perform_summarization(api_name, text, prompt, api_key, system_message=system_message)
|
154 |
+
|
155 |
+
def generate_writing_prompt(persona, api_name, api_key):
    """Ask the selected API for a writing prompt in the style of ``persona``.

    The request text is passed as the input argument of
    ``perform_summarization`` and the prompt argument is left empty.

    Returns:
        Whatever ``perform_summarization`` returns.
    """
    request_text = f"Generate a writing prompt in the style of {persona}. The prompt should inspire a short story or scene that reflects {persona}'s typical themes and writing style."
    system_message = "You are a helpful AI assistant. You will respond to the user as if you were the persona declared in the user prompt."
    #FIXME
    return perform_summarization(api_name, request_text, "", api_key, system_message=system_message)
|
159 |
+
|
160 |
+
def calculate_readability(text):
    """Return a one-line summary of two ``textstat`` readability scores.

    Reports the Flesch Reading Ease and the Flesch-Kincaid Grade Level of
    ``text``, each formatted to two decimal places.
    """
    reading_ease = textstat.flesch_reading_ease(text)
    grade_level = textstat.flesch_kincaid_grade(text)
    return "Readability: Flesch Reading Ease: {:.2f}, Flesch-Kincaid Grade Level: {:.2f}".format(
        reading_ease, grade_level
    )
|
164 |
+
|
165 |
+
|
166 |
+
def generate_feedback_history_html(history):
    """Render the feedback history as an HTML fragment, newest entry first.

    Args:
        history: Sequence of dicts with ``persona`` and ``text`` keys and an
            optional ``feedback`` key.

    Returns:
        An HTML string with one collapsible ``<details>`` element per entry,
        showing the first 100 characters of the text and the first 200
        characters of the feedback.
    """
    # Local import keeps this block self-contained without touching file imports.
    from html import escape

    def _safe(value):
        # The text comes from the user and the LLM and ends up in a gr.HTML
        # component, so markup characters must be neutralized. quote=False
        # leaves quotes alone; they are harmless in element content.
        return escape(str(value), quote=False)

    parts = ["<h3>Recent Feedback History</h3>"]
    for entry in reversed(history):
        parts.append(f"<details><summary>{_safe(entry['persona'])} Feedback</summary>")
        # Truncate first, then escape, so an entity is never cut in half.
        parts.append(f"<p><strong>Original Text:</strong> {_safe(entry['text'][:100])}...</p>")

        feedback = entry.get('feedback')
        if feedback:
            parts.append(f"<p><strong>Feedback:</strong> {_safe(feedback[:200])}...</p>")
        else:
            parts.append("<p><strong>Feedback:</strong> No feedback provided.</p>")

        parts.append("</details>")
    return "".join(parts)
|
180 |
+
|
181 |
+
|
182 |
+
# FIXME
|
183 |
+
def create_document_feedback_tab():
    """Build the "Writing Feedback" tab of the Gradio UI.

    The tab lets the user request persona-styled feedback on their writing,
    compare feedback from several personas, add custom personas, and generate
    a writing prompt. Readability metrics and a short feedback history are
    shown next to the feedback.

    Returns:
        The tuple ``(input_text, feedback_output, readability_output,
        feedback_history_display)`` of created components.
    """
    # Single source for both the dropdown and the comparison checkbox group.
    persona_choices = [
        "Agatha Christie",
        "Arthur Conan Doyle",
        "Charles Bukowski",
        "Charles Dickens",
        "Chinua Achebe",
        "Cormac McCarthy",
        "David Foster Wallace",
        "Edgar Allan Poe",
        "F. Scott Fitzgerald",
        "Flannery O'Connor",
        "Franz Kafka",
        "Fyodor Dostoevsky",
        "Gabriel Garcia Marquez",
        "George R.R. Martin",
        "George Orwell",
        "Haruki Murakami",
        "Hemingway",
        "Herman Melville",
        "Isabel Allende",
        "James Joyce",
        "Jane Austen",
        "J.K. Rowling",
        "J.R.R. Tolkien",
        "Jorge Luis Borges",
        "Kurt Vonnegut",
        "Leo Tolstoy",
        "Margaret Atwood",
        "Mark Twain",
        "Mary Shelley",
        "Milan Kundera",
        "Naguib Mahfouz",
        "Neil Gaiman",
        "Octavia Butler",
        "Philip K Dick",
        "Ray Bradbury",
        "Salman Rushdie",
        "Shakespeare",
        "Stephen King",
        "Toni Morrison",
        "T.S. Eliot",
        "Ursula K. Le Guin",
        "Virginia Woolf",
        "Zadie Smith",
    ]

    with gr.TabItem("Writing Feedback"):
        with gr.Row():
            with gr.Column(scale=2):
                input_text = gr.Textbox(label="Your Writing", lines=10)
                persona_dropdown = gr.Dropdown(
                    label="Select Persona",
                    choices=list(persona_choices),
                    value="Hemingway"
                )
                custom_persona_name = gr.Textbox(label="Custom Persona Name")
                custom_persona_description = gr.Textbox(label="Custom Persona Description", lines=3)
                add_custom_persona_button = gr.Button("Add Custom Persona")
                aspect_dropdown = gr.Dropdown(
                    label="Focus Feedback On",
                    choices=["Overall", "Grammar", "Word choice", "Structure of delivery", "Character Development", "Character Dialogue", "Descriptive Language", "Plot Structure"],
                    value="Overall"
                )
                api_name_input = gr.Dropdown(
                    choices=[None, "Local-LLM", "OpenAI", "Anthropic", "Cohere", "Groq", "DeepSeek", "Mistral", "OpenRouter",
                             "Llama.cpp", "Kobold", "Ooba", "Tabbyapi", "VLLM", "ollama", "HuggingFace"],
                    value=None,
                    label="API for Feedback"
                )
                api_key_input = gr.Textbox(label="API Key (if not set in config.txt)", type="password")
                get_feedback_button = gr.Button("Get Feedback")
                generate_prompt_button = gr.Button("Generate Writing Prompt")

            with gr.Column(scale=2):
                feedback_output = gr.Textbox(label="Feedback", lines=15)
                readability_output = gr.Textbox(label="Readability Metrics")
                feedback_history_display = gr.HTML(label="Feedback History")

        with gr.Row():
            compare_personas = gr.CheckboxGroup(
                choices=list(persona_choices),
                label="Compare Multiple Persona's Feedback at Once"
            )
        with gr.Row():
            compare_button = gr.Button("Compare Feedback")

        feedback_history = gr.State([])

        def add_custom_persona(name, description):
            # Ignore clicks with no usable name so no blank choice is added.
            if not name or not name.strip():
                return gr.update()
            updated_choices = persona_dropdown.choices + [name]
            persona_prompts[name] = f"As {name}, {description}, provide feedback on the following text:"
            return gr.update(choices=updated_choices)

        def update_feedback_history(current_text, persona, feedback):
            # NOTE(review): the history is kept on the State component's
            # ``value`` rather than passed through the event's inputs/outputs;
            # confirm this gives the per-session behaviour that is wanted.
            if feedback_history.value is None:
                feedback_history.value = []

            history = feedback_history.value

            # Append the new entry to the history
            history.append({"text": current_text, "persona": persona, "feedback": feedback})

            # Keep only the last 10 entries in the history
            feedback_history.value = history[-10:]

            # Generate and return the updated HTML
            return generate_feedback_history_html(feedback_history.value)

        def get_feedback(text, persona, aspect, api_name, api_key):
            # Request the feedback exactly once and reuse it, so the history
            # records the same response that is displayed and no second API
            # call is made.
            feedback = generate_writing_feedback(text, persona, aspect, api_name, api_key)
            readability = calculate_readability(text)
            history_html = update_feedback_history(text, persona, feedback)
            return feedback, readability, history_html

        def compare_feedback(text, selected_personas, api_name, api_key):
            # One "Overall" feedback request per selected persona.
            results = []
            for persona in selected_personas:
                feedback = generate_writing_feedback(text, persona, "Overall", api_name, api_key)
                results.append(f"### {persona}'s Feedback:\n{feedback}\n\n")
            return "\n".join(results)

        add_custom_persona_button.click(
            fn=add_custom_persona,
            inputs=[custom_persona_name, custom_persona_description],
            outputs=persona_dropdown
        )

        get_feedback_button.click(
            fn=get_feedback,
            inputs=[input_text, persona_dropdown, aspect_dropdown, api_name_input, api_key_input],
            outputs=[feedback_output, readability_output, feedback_history_display]
        )

        compare_button.click(
            fn=compare_feedback,
            inputs=[input_text, compare_personas, api_name_input, api_key_input],
            outputs=feedback_output
        )

        generate_prompt_button.click(
            fn=generate_writing_prompt,
            inputs=[persona_dropdown, api_name_input, api_key_input],
            outputs=input_text
        )

    return input_text, feedback_output, readability_output, feedback_history_display
|
371 |
+
|
372 |
+
|
373 |
+
def create_creative_writing_tab():
    """Build the "Creative Writing Assistant" tab; it only shows a placeholder notice."""
    placeholder_notice = "# Utility to be added..."
    with gr.TabItem("Creative Writing Assistant"):
        gr.Markdown(placeholder_notice)
|
376 |
+
|
377 |
+
|
378 |
+
#FIXME - change to use chat function
|
379 |
+
def chat_with_character(user_message, history, char_data, api_name_input, api_key):
    """Append one user/character exchange to ``history``.

    Args:
        user_message: The message typed by the user.
        history: List of ``(user_message, bot_message)`` tuples; mutated in place.
        char_data: Imported character-card dict, or ``None`` if none is loaded.
        api_name_input: API name forwarded to ``generate_writing_feedback``.
        api_key: API key forwarded to ``generate_writing_feedback``.

    Returns:
        ``(history, status)`` where ``status`` is empty on success or a
        message asking the user to import a character card first.
    """
    # Guard: nothing to chat with until a card has been imported.
    if char_data is None:
        return history, "Please import a character card first."

    # Only the character's name is forwarded as the persona.
    reply = generate_writing_feedback(
        user_message, char_data['name'], "Overall", api_name_input, api_key
    )
    exchange = (user_message, reply)
    history.append(exchange)
    return history, ""
|
387 |
+
|
388 |
+
def import_character_card(file):
    """Import a character card from an uploaded image or JSON file.

    Files whose name ends in ``.png`` or ``.webp`` are handed to
    ``extract_json_from_image``; anything else is read and decoded as UTF-8
    JSON text. The JSON is parsed by ``import_character_card_json``.

    Returns:
        The parsed card data, or ``None`` when no file was given, no JSON was
        found, or any error occurred (errors are logged, not raised).
    """
    if file is None:
        logging.warning("No file provided for character card import")
        return None

    try:
        is_image = file.name.lower().endswith(('.png', '.webp'))
        if not is_image:
            logging.info(f"Attempting to import character card from JSON file: {file.name}")
            content = file.read().decode('utf-8')
            return import_character_card_json(content)

        logging.info(f"Attempting to import character card from image: {file.name}")
        json_data = extract_json_from_image(file)
        if not json_data:
            logging.warning("No JSON data found in the image")
            return None

        logging.info("JSON data extracted from image, attempting to parse")
        return import_character_card_json(json_data)
    except Exception as e:
        logging.error(f"Error importing character card: {e}")
        return None
|
408 |
+
|
409 |
+
|
410 |
+
def import_character_card_json(json_content):
    """Parse character-card JSON text.

    Args:
        json_content: JSON text of a character card.

    Returns:
        The value under ``data`` when the card declares
        ``spec == 'chara_card_v2'``, otherwise the whole parsed object (treated
        as a V1 card). ``None`` if parsing fails for any reason; failures are
        logged, not raised.
    """
    try:
        # Remove any leading/trailing whitespace
        stripped = json_content.strip()

        # Log the first 100 characters of the content
        logging.debug(f"JSON content (first 100 chars): {stripped[:100]}...")

        try:
            card_data = json.loads(stripped)
        except json.JSONDecodeError as e:
            logging.error(f"JSON decode error: {e}")
            logging.error(f"Problematic JSON content: {stripped[:500]}...")
            return None

        logging.debug(f"Parsed JSON data keys: {list(card_data.keys())}")
        is_v2 = 'spec' in card_data and card_data['spec'] == 'chara_card_v2'
        if not is_v2:
            logging.info("Assuming V1 character card")
            return card_data

        logging.info("Detected V2 character card")
        return card_data['data']
    except Exception as e:
        logging.error(f"Unexpected error parsing JSON: {e}")
    return None
|
432 |
+
|
433 |
+
|
434 |
+
def extract_json_from_image(image_file):
    """Extract embedded character-card JSON text from an image.

    Two places are tried, in order:

    1. The ``chara`` entry of the image metadata, expected to hold
       base64-encoded UTF-8 JSON.
    2. The raw pixel bytes: the span between the first ``{`` and the last
       ``}`` is base64-decoded and accepted if the result looks like a JSON
       object.

    Args:
        image_file: File object accepted by ``PIL.Image.open``; its ``name``
            attribute is used for logging.

    Returns:
        The decoded JSON text, or ``None`` if nothing usable was found or an
        error occurred (errors are logged, not raised).
    """
    logging.debug(f"Attempting to extract JSON from image: {image_file.name}")
    try:
        with Image.open(image_file) as img:
            logging.debug("Image opened successfully")
            metadata = img.info
            if 'chara' in metadata:
                logging.debug("Found 'chara' in image metadata")
                chara_content = metadata['chara']
                logging.debug(f"Content of 'chara' metadata (first 100 chars): {chara_content[:100]}...")
                try:
                    decoded_content = base64.b64decode(chara_content).decode('utf-8')
                    logging.debug(f"Decoded content (first 100 chars): {decoded_content[:100]}...")
                    return decoded_content
                except Exception as e:
                    # Deliberate best effort: fall through to the raw-bytes scan.
                    logging.error(f"Error decoding base64 content: {e}")

            logging.debug("'chara' not found in metadata, checking for base64 encoded data")
            raw_data = img.tobytes()
            possible_json = raw_data.split(b'{', 1)[-1].rsplit(b'}', 1)[0]
            if possible_json:
                try:
                    decoded = base64.b64decode(possible_json).decode('utf-8')
                    if decoded.startswith('{') and decoded.endswith('}'):
                        logging.debug("Found and decoded base64 JSON data")
                        # ``decoded`` is already a complete ``{...}`` object;
                        # wrapping it in another pair of braces would make it
                        # invalid JSON for the caller.
                        return decoded
                except Exception as e:
                    logging.error(f"Error decoding base64 data: {e}")

            logging.warning("No JSON data found in the image")
    except Exception as e:
        logging.error(f"Error extracting JSON from image: {e}")
    return None
|
467 |
+
|
468 |
+
def load_chat_history(file):
    """Load a saved chat from an uploaded JSON file.

    Args:
        file: Object whose ``read()`` returns UTF-8 encoded JSON bytes holding
            ``history`` and ``character`` keys.

    Returns:
        ``(history, character)`` from the file, or ``(None, None)`` if the file
        cannot be read, decoded, parsed, or lacks either key (the error is
        logged, not raised).
    """
    try:
        raw_bytes = file.read()
        chat_data = json.loads(raw_bytes.decode('utf-8'))
        history = chat_data['history']
        character = chat_data['character']
    except Exception as e:
        logging.error(f"Error loading chat history: {e}")
        return None, None
    return history, character
|
476 |
+
|
477 |
+
def create_character_card_interaction_tab():
    """Build the "Chat with a Character Card" tab and wire up its event handlers.

    The tab lets the user import or select a character card, chat with the
    character through the chosen API, regenerate the last reply, and save or
    import a chat history as JSON.

    Returns:
        tuple: ``(character_data, chat_history, user_input)`` - the ``gr.State``
        holding the selected character, the ``gr.Chatbot`` component and the
        message ``gr.Textbox``.
    """
    from App_Function_Libraries.Chat_related_functions import get_character_names

    with gr.TabItem("Chat with a Character Card"):
        gr.Markdown("# Chat with a Character Card")
        with gr.Row():
            with gr.Column(scale=1):
                character_card_upload = gr.File(label="Upload Character Card")
                import_card_button = gr.Button("Import Character Card")
                load_characters_button = gr.Button("Load Existing Characters")
                character_dropdown = gr.Dropdown(label="Select Character", choices=get_character_names())
                api_name_input = gr.Dropdown(
                    choices=[None, "Local-LLM", "OpenAI", "Anthropic", "Cohere", "Groq", "DeepSeek", "Mistral",
                             "OpenRouter", "Llama.cpp", "Kobold", "Ooba", "Tabbyapi", "VLLM", "ollama", "HuggingFace"],
                    value=None,
                    label="API for Interaction"
                )
                api_key_input = gr.Textbox(label="API Key (if not set in config.txt)",
                                           placeholder="Enter your API key here", type="password")
                temperature_slider = gr.Slider(minimum=0.0, maximum=2.0, value=0.7, step=0.05, label="Temperature")
                import_chat_button = gr.Button("Import Chat History")
                chat_file_upload = gr.File(label="Upload Chat History JSON", visible=False)

            with gr.Column(scale=2):
                chat_history = gr.Chatbot(label="Conversation")
                user_input = gr.Textbox(label="Your message")
                send_message_button = gr.Button("Send Message")
                regenerate_button = gr.Button("Regenerate Last Message")
                save_chat_button = gr.Button("Save This Chat")
                save_status = gr.Textbox(label="Save Status", interactive=False)

        character_data = gr.State(None)

        def load_character(name):
            """Look up a stored character by name.

            Returns ``(char_data, initial_history)``; ``(None, [])`` when the
            character is unknown.
            """
            from App_Function_Libraries.Chat_related_functions import load_characters
            characters = load_characters()
            char_data = characters.get(name)
            if not char_data:
                return None, []
            first_message = char_data.get('first_mes', "Hello! I'm ready to chat.")
            # An explicitly empty 'first_mes' starts the chat with no greeting.
            return char_data, ([(None, first_message)] if first_message else [])

        def import_chat_history(file, current_history, char_data):
            """Load a saved chat and return ``(history, char_data, status_message)``."""
            if file is None:
                # The upload's change event also fires when the file is cleared;
                # that is not a failed import, so leave everything untouched.
                return current_history, char_data, gr.update()

            loaded_history, char_name = load_chat_history(file)
            if loaded_history is None:
                return current_history, char_data, "Failed to load chat history."

            # Check if the loaded chat is for the current character
            if char_data and char_data.get('name') != char_name:
                return current_history, char_data, f"Warning: Loaded chat is for character '{char_name}', but current character is '{char_data.get('name')}'. Chat not imported."

            # If no character is selected, try to load the character from the chat
            if not char_data:
                new_char_data, _ = load_character(char_name)
                if not new_char_data:
                    return current_history, char_data, f"Warning: Character '{char_name}' not found. Please select the character manually."
                char_data = new_char_data

            return loaded_history, char_data, f"Chat history for '{char_name}' imported successfully."

        def import_character(file):
            """Import a character card, persist it and refresh the dropdown choices."""
            card_data = import_character_card(file)
            if not card_data:
                return None, gr.update()
            from App_Function_Libraries.Chat_related_functions import save_character
            save_character(card_data)
            return card_data, gr.update(choices=get_character_names())

        def character_chat_wrapper(message, history, char_data, api_endpoint, api_key, temperature):
            """Send ``message`` to the selected character and return the updated history.

            Always returns a single list, because the handler is wired to the
            single ``chat_history`` output.
            """
            logging.debug("Entered character_chat_wrapper")
            # Work on a copy: the incoming value may be None and must not be mutated.
            history = list(history or [])

            if char_data is None:
                # Surface the notice in the conversation itself instead of
                # returning an extra value the event wiring cannot receive.
                history.append((None, "Please select a character first."))
                return history

            # Prepare the character's background information
            char_background = (
                "\n"
                f"Name: {char_data.get('name', 'Unknown')}\n"
                f"Description: {char_data.get('description', 'N/A')}\n"
                f"Personality: {char_data.get('personality', 'N/A')}\n"
                f"Scenario: {char_data.get('scenario', 'N/A')}\n"
            )

            # Prepare the system prompt for character impersonation
            system_message = (
                "You are roleplaying as the character described below. Respond to the user's messages in character, maintaining the personality and background provided. Do not break character or refer to yourself as an AI.\n"
                "\n"
                f"{char_background}\n"
                "\n"
                f"Additional instructions: {char_data.get('post_history_instructions', '')}\n"
            )

            # Prepare media_content and selected_parts
            media_content = {
                'id': char_data.get('name'),
                'title': char_data.get('name', 'Unknown Character'),
                'content': char_background,
                'description': char_data.get('description', ''),
                'personality': char_data.get('personality', ''),
                'scenario': char_data.get('scenario', '')
            }
            selected_parts = ['description', 'personality', 'scenario']

            prompt = char_data.get('post_history_instructions', '')

            # Call the chat function; the prompt is handed over separately, so
            # the user message is passed through unmodified.
            bot_message = chat(
                message,
                history,
                media_content,
                selected_parts,
                api_endpoint,
                api_key,
                prompt,
                temperature,
                system_message
            )

            # Update history
            history.append((message, bot_message))
            return history

        def save_chat_history(history, character_name):
            """Write the chat to ``Saved_Chats/`` as JSON.

            Returns the file path on success, or a string starting with
            ``"Error"`` on failure (the form ``save_current_chat`` checks for).
            """
            import re

            save_directory = "Saved_Chats"
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

            # Character names come from uploaded cards; strip anything that is
            # not safe in a file name so the file always lands in save_directory.
            safe_name = re.sub(r"[^\w.-]+", "_", str(character_name)).strip("._") or "Unknown"
            filename = f"chat_history_{safe_name}_{timestamp}.json"
            filepath = os.path.join(save_directory, filename)

            chat_data = {
                # Keep the original name here so a later import can match it.
                "character": character_name,
                "timestamp": timestamp,
                "history": history
            }

            try:
                # Create the Saved_Chats folder if it doesn't exist
                os.makedirs(save_directory, exist_ok=True)
                with open(filepath, 'w', encoding='utf-8') as f:
                    json.dump(chat_data, f, ensure_ascii=False, indent=2)
                return filepath
            except Exception as e:
                logging.error(f"Error saving chat: {e}")
                return f"Error saving chat: {str(e)}"

        def save_current_chat(history, char_data):
            """Save the current conversation and return a status message."""
            if not char_data or not history:
                return "No chat to save or character not selected."

            character_name = char_data.get('name', 'Unknown')
            result = save_chat_history(history, character_name)
            if result.startswith("Error"):
                return result
            return f"Chat saved successfully as {result}"

        def regenerate_last_message(history, char_data, api_name, api_key, temperature):
            """Drop the last exchange and ask the character to answer it again."""
            if not history:
                return history

            last_user_message = history[-1][0]
            if last_user_message is None:
                # The last entry is a greeting or notice with no user turn
                # behind it, so there is nothing to regenerate.
                return history

            return character_chat_wrapper(last_user_message, history[:-1], char_data, api_name, api_key, temperature)

        import_chat_button.click(
            fn=lambda: gr.update(visible=True),
            outputs=chat_file_upload
        )

        chat_file_upload.change(
            fn=import_chat_history,
            inputs=[chat_file_upload, chat_history, character_data],
            outputs=[chat_history, character_data, save_status]
        )

        import_card_button.click(
            fn=import_character,
            inputs=[character_card_upload],
            outputs=[character_data, character_dropdown]
        )

        load_characters_button.click(
            fn=lambda: gr.update(choices=get_character_names()),
            outputs=character_dropdown
        )

        character_dropdown.change(
            fn=load_character,
            inputs=[character_dropdown],
            outputs=[character_data, chat_history]
        )

        send_message_button.click(
            fn=character_chat_wrapper,
            inputs=[user_input, chat_history, character_data, api_name_input, api_key_input, temperature_slider],
            outputs=[chat_history]
        ).then(lambda: "", outputs=user_input)

        regenerate_button.click(
            fn=regenerate_last_message,
            inputs=[chat_history, character_data, api_name_input, api_key_input, temperature_slider],
            outputs=[chat_history]
        )

        save_chat_button.click(
            fn=save_current_chat,
            inputs=[chat_history, character_data],
            outputs=[save_status]
        )

    return character_data, chat_history, user_input
|
692 |
+
|
693 |
+
|
694 |
+
def create_mikupad_tab():
    """Create the placeholder "Mikupad" tab, which only displays a short note."""
    mikupad_tab = gr.TabItem("Mikupad")
    with mikupad_tab:
        gr.Markdown("I Wish. Gradio won't embed it successfully...")
|
697 |
+
|
698 |
+
#
|
699 |
+
# End of Writing.py
|
700 |
+
########################################################################################################################
|
App_Function_Libraries/Gradio_UI/__init__.py
ADDED
File without changes
|
App_Function_Libraries/Gradio_UI/__pycache__/Audio_ingestion_tab.cpython-312.pyc
ADDED
Binary file (9.42 kB). View file
|
|
App_Function_Libraries/Gradio_UI/__pycache__/Chat_ui.cpython-312.pyc
ADDED
Binary file (49.3 kB). View file
|
|
App_Function_Libraries/Gradio_UI/__pycache__/Explain_summarize_tab.cpython-312.pyc
ADDED
Binary file (10.5 kB). View file
|
|
App_Function_Libraries/Gradio_UI/__pycache__/Export_Functionality.cpython-312.pyc
ADDED
Binary file (18.4 kB). View file
|
|
App_Function_Libraries/Gradio_UI/__pycache__/Gradio_Shared.cpython-312.pyc
ADDED
Binary file (13.7 kB). View file
|
|
App_Function_Libraries/Gradio_UI/__pycache__/Import_Functionality.cpython-312.pyc
ADDED
Binary file (27.5 kB). View file
|
|
App_Function_Libraries/Gradio_UI/__pycache__/Introduction_tab.cpython-312.pyc
ADDED
Binary file (15 kB). View file
|
|
App_Function_Libraries/Gradio_UI/__pycache__/Keywords.cpython-312.pyc
ADDED
Binary file (4.58 kB). View file
|
|
App_Function_Libraries/Gradio_UI/__pycache__/Llamafile_tab.cpython-312.pyc
ADDED
Binary file (6.52 kB). View file
|
|
App_Function_Libraries/Gradio_UI/__pycache__/Media_edit.cpython-312.pyc
ADDED
Binary file (14.3 kB). View file
|
|
App_Function_Libraries/Gradio_UI/__pycache__/PDF_ingestion_tab.cpython-312.pyc
ADDED
Binary file (8.79 kB). View file
|
|
App_Function_Libraries/Gradio_UI/__pycache__/Podcast_tab.cpython-312.pyc
ADDED
Binary file (9.53 kB). View file
|
|
App_Function_Libraries/Gradio_UI/__pycache__/Re_summarize_tab.cpython-312.pyc
ADDED
Binary file (14.5 kB). View file
|
|
App_Function_Libraries/Gradio_UI/__pycache__/Search_Tab.cpython-312.pyc
ADDED
Binary file (26.2 kB). View file
|
|
App_Function_Libraries/Gradio_UI/__pycache__/Trash.cpython-312.pyc
ADDED
Binary file (7.73 kB). View file
|
|
App_Function_Libraries/Gradio_UI/__pycache__/Utilities.cpython-312.pyc
ADDED
Binary file (7.55 kB). View file
|
|
App_Function_Libraries/Gradio_UI/__pycache__/Video_transcription_tab.cpython-312.pyc
ADDED
Binary file (35.1 kB). View file
|
|
App_Function_Libraries/Gradio_UI/__pycache__/Website_scraping_tab.cpython-312.pyc
ADDED
Binary file (6.53 kB). View file
|
|
App_Function_Libraries/Gradio_UI/__pycache__/Writing.cpython-312.pyc
ADDED
Binary file (33.7 kB). View file
|
|