oceansweep committed on
Commit
1b24d60
·
verified ·
1 Parent(s): 34c34d3

Upload 2 files

Browse files
App_Function_Libraries/Summarization/Local_Summarization_Lib.py CHANGED
@@ -35,6 +35,25 @@ from App_Function_Libraries.Utils.Utils import load_and_log_configs, extract_tex
35
  logger = logging.getLogger()
36
 
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  # FIXME - temp is not used
39
  def summarize_with_local_llm(input_data, custom_prompt_arg, temp, system_message=None):
40
  try:
@@ -108,7 +127,7 @@ def summarize_with_local_llm(input_data, custom_prompt_arg, temp, system_message
108
  return "Local LLM: Error occurred while processing summary"
109
 
110
 
111
- def summarize_with_llama(input_data, custom_prompt, api_url="http://127.0.0.1:8080/completion", api_key=None, temp=None, system_message=None):
112
  try:
113
  logging.debug("Llama.cpp: Loading and validating configurations")
114
  loaded_config_data = load_and_log_configs()
@@ -138,12 +157,12 @@ def summarize_with_llama(input_data, custom_prompt, api_url="http://127.0.0.1:80
138
  logging.debug("Llama.cpp: Using provided string data for summarization")
139
  data = input_data
140
 
141
- logging.debug(f"Llama.cpp: Loaded data: {data}")
142
- logging.debug(f"Llama.cpp: Type of data: {type(data)}")
143
 
144
  if isinstance(data, dict) and 'summary' in data:
145
  # If the loaded data is a dictionary and already contains a summary, return it
146
- logging.debug("Llama.cpp: Summary already exists in the loaded data")
147
  return data['summary']
148
 
149
  # If the loaded data is a list of segment dictionaries or a string, proceed with summarization
@@ -153,7 +172,7 @@ def summarize_with_llama(input_data, custom_prompt, api_url="http://127.0.0.1:80
153
  elif isinstance(data, str):
154
  text = data
155
  else:
156
- raise ValueError("Llama.cpp: Invalid input data format")
157
 
158
  headers = {
159
  'accept': 'application/json',
@@ -162,13 +181,17 @@ def summarize_with_llama(input_data, custom_prompt, api_url="http://127.0.0.1:80
162
  if len(api_key) > 5:
163
  headers['Authorization'] = f'Bearer {api_key}'
164
 
165
- llama_prompt = f"{custom_prompt} \n\n\n\n{text}"
166
  if system_message is None:
167
  system_message = "You are a helpful AI assistant."
168
- logging.debug("llama: Prompt being sent is {llama_prompt}")
169
  if system_message is None:
170
  system_message = "You are a helpful AI assistant."
171
 
 
 
 
 
 
172
  data = {
173
  "messages": [
174
  {"role": "system", "content": system_message},
@@ -201,7 +224,7 @@ def summarize_with_llama(input_data, custom_prompt, api_url="http://127.0.0.1:80
201
 
202
 
203
  # https://lite.koboldai.net/koboldcpp_api#/api%2Fv1/post_api_v1_generate
204
- def summarize_with_kobold(input_data, api_key, custom_prompt_input, kobold_api_ip="http://127.0.0.1:5001/api/v1/generate", temp=None, system_message=None):
205
  logging.debug("Kobold: Summarization process starting...")
206
  try:
207
  logging.debug("Kobold: Loading and validating configurations")
@@ -253,9 +276,12 @@ def summarize_with_kobold(input_data, api_key, custom_prompt_input, kobold_api_i
253
  'accept': 'application/json',
254
  'content-type': 'application/json',
255
  }
 
 
 
 
256
 
257
- kobold_prompt = f"{custom_prompt_input}\n\n\n\n{text}"
258
- logging.debug("kobold: Prompt being sent is {kobold_prompt}")
259
 
260
  # FIXME
261
  # Values literally c/p from the api docs....
@@ -269,12 +295,12 @@ def summarize_with_kobold(input_data, api_key, custom_prompt_input, kobold_api_i
269
  #"rep_penalty": 1.0,
270
  }
271
 
272
- logging.debug("kobold: Submitting request to API endpoint")
273
- print("kobold: Submitting request to API endpoint")
274
  kobold_api_ip = loaded_config_data['local_api_ip']['kobold']
275
  try:
276
  response = requests.post(kobold_api_ip, headers=headers, json=data)
277
- logging.debug("kobold: API Response Status Code: %d", response.status_code)
278
 
279
  if response.status_code == 200:
280
  try:
@@ -303,7 +329,7 @@ def summarize_with_kobold(input_data, api_key, custom_prompt_input, kobold_api_i
303
 
304
 
305
  # https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API
306
- def summarize_with_oobabooga(input_data, api_key, custom_prompt, api_url="http://127.0.0.1:5000/v1/chat/completions", temp=None, system_message=None):
307
  logging.debug("Oobabooga: Summarization process starting...")
308
  try:
309
  logging.debug("Oobabooga: Loading and validating configurations")
@@ -356,9 +382,13 @@ def summarize_with_oobabooga(input_data, api_key, custom_prompt, api_url="http:/
356
  'content-type': 'application/json',
357
  }
358
 
359
- # prompt_text = "I like to eat cake and bake cakes. I am a baker. I work in a French bakery baking cakes. It
360
- # is a fun job. I have been baking cakes for ten years. I also bake lots of other baked goods, but cakes are
361
- # my favorite." prompt_text += f"\n\n{text}" # Uncomment this line if you want to include the text variable
 
 
 
 
362
  ooba_prompt = f"{text}" + f"\n\n\n\n{custom_prompt}"
363
  logging.debug("ooba: Prompt being sent is {ooba_prompt}")
364
 
@@ -392,8 +422,7 @@ def summarize_with_oobabooga(input_data, api_key, custom_prompt, api_url="http:/
392
  return f"ooba: Error occurred while processing summary with oobabooga: {str(e)}"
393
 
394
 
395
-
396
- def summarize_with_tabbyapi(input_data, custom_prompt_input, api_key=None, api_IP="http://127.0.0.1:5000/v1/chat/completions", temp=None, system_message=None):
397
  logging.debug("TabbyAPI: Summarization process starting...")
398
  try:
399
  logging.debug("TabbyAPI: Loading and validating configurations")
@@ -448,6 +477,11 @@ def summarize_with_tabbyapi(input_data, custom_prompt_input, api_key=None, api_I
448
  if system_message is None:
449
  system_message = "You are a helpful AI assistant."
450
 
 
 
 
 
 
451
  headers = {
452
  'Authorization': f'Bearer {api_key}',
453
  'Content-Type': 'application/json'
@@ -501,10 +535,10 @@ def summarize_with_vllm(
501
  input_data: Union[str, dict, list],
502
  custom_prompt_input: str,
503
  api_key: str = None,
504
- vllm_api_url: str = "http://127.0.0.1:8000/v1/chat/completions",
505
  model: str = None,
506
  system_prompt: str = None,
507
- temp: float = 0.7
 
508
  ) -> str:
509
  logging.debug("vLLM: Summarization process starting...")
510
  try:
@@ -556,6 +590,11 @@ def summarize_with_vllm(
556
  if system_prompt is None:
557
  system_prompt = "You are a helpful AI assistant."
558
 
 
 
 
 
 
559
  model = model or loaded_config_data['models']['vllm']
560
  if system_prompt is None:
561
  system_prompt = "You are a helpful AI assistant."
@@ -602,7 +641,7 @@ def summarize_with_vllm(
602
 
603
 
604
  # FIXME - update to be a summarize request
605
- def summarize_with_ollama(input_data, custom_prompt, api_url="http://127.0.0.1:11434/api/generate", api_key=None, temp=None, system_message=None, model=None):
606
  try:
607
  logging.debug("ollama: Loading and validating configurations")
608
  loaded_config_data = load_and_log_configs()
@@ -651,6 +690,11 @@ def summarize_with_ollama(input_data, custom_prompt, api_url="http://127.0.0.1:1
651
  else:
652
  raise ValueError("Ollama: Invalid input data format")
653
 
 
 
 
 
 
654
  headers = {
655
  'accept': 'application/json',
656
  'content-type': 'application/json',
@@ -761,6 +805,11 @@ def summarize_with_custom_openai(api_key, input_data, custom_prompt_arg, temp=No
761
  logging.debug(f"Custom OpenAI API: Extracted text (first 500 chars): {text[:500]}...")
762
  logging.debug(f"v: Custom prompt: {custom_prompt_arg}")
763
 
 
 
 
 
 
764
  openai_model = loaded_config_data['models']['openai'] or "gpt-4o"
765
  logging.debug(f"Custom OpenAI API: Using model: {openai_model}")
766
 
 
35
  logger = logging.getLogger()
36
 
37
 
38
+ summarizer_prompt = """
39
+ <s>You are a bulleted notes specialist. [INST]```When creating comprehensive bulleted notes, you should follow these guidelines: Use multiple headings based on the referenced topics, not categories like quotes or terms. Headings should be surrounded by bold formatting and not be listed as bullet points themselves. Leave no space between headings and their corresponding list items underneath. Important terms within the content should be emphasized by setting them in bold font. Any text that ends with a colon should also be bolded. Before submitting your response, review the instructions, and make any corrections necessary to adhered to the specified format. Do not reference these instructions within the notes.``` \nBased on the content between backticks create comprehensive bulleted notes.[/INST]
40
+ **Bulleted Note Creation Guidelines**
41
+
42
+ **Headings**:
43
+ - Based on referenced topics, not categories like quotes or terms
44
+ - Surrounded by **bold** formatting
45
+ - Not listed as bullet points
46
+ - No space between headings and list items underneath
47
+
48
+ **Emphasis**:
49
+ - **Important terms** set in bold font
50
+ - **Text ending in a colon**: also bolded
51
+
52
+ **Review**:
53
+ - Ensure adherence to specified format
54
+ - Do not reference these instructions in your response.</s>[INST] {{ .Prompt }} [/INST]
55
+ """
56
+
57
  # FIXME - temp is not used
58
  def summarize_with_local_llm(input_data, custom_prompt_arg, temp, system_message=None):
59
  try:
 
127
  return "Local LLM: Error occurred while processing summary"
128
 
129
 
130
+ def summarize_with_llama(input_data, custom_prompt, api_key=None, temp=None, system_message=None, api_url="http://127.0.0.1:8080/completion",):
131
  try:
132
  logging.debug("Llama.cpp: Loading and validating configurations")
133
  loaded_config_data = load_and_log_configs()
 
157
  logging.debug("Llama.cpp: Using provided string data for summarization")
158
  data = input_data
159
 
160
+ logging.debug(f"Llama Summarize: Loaded data: {data}")
161
+ logging.debug(f"Llama Summarize: Type of data: {type(data)}")
162
 
163
  if isinstance(data, dict) and 'summary' in data:
164
  # If the loaded data is a dictionary and already contains a summary, return it
165
+ logging.debug("Llama Summarize: Summary already exists in the loaded data")
166
  return data['summary']
167
 
168
  # If the loaded data is a list of segment dictionaries or a string, proceed with summarization
 
172
  elif isinstance(data, str):
173
  text = data
174
  else:
175
+ raise ValueError("Llama Summarize: Invalid input data format")
176
 
177
  headers = {
178
  'accept': 'application/json',
 
181
  if len(api_key) > 5:
182
  headers['Authorization'] = f'Bearer {api_key}'
183
 
 
184
  if system_message is None:
185
  system_message = "You are a helpful AI assistant."
186
+ logging.debug(f":Llama Summarize: System Prompt being sent is {system_message}")
187
  if system_message is None:
188
  system_message = "You are a helpful AI assistant."
189
 
190
+ if custom_prompt is None:
191
+ llama_prompt = f"{summarizer_prompt}\n\n\n\n{text}"
192
+ else:
193
+ llama_prompt = f"{custom_prompt}\n\n\n\n{text}"
194
+
195
  data = {
196
  "messages": [
197
  {"role": "system", "content": system_message},
 
224
 
225
 
226
  # https://lite.koboldai.net/koboldcpp_api#/api%2Fv1/post_api_v1_generate
227
+ def summarize_with_kobold(input_data, api_key, custom_prompt_input, system_message=None, temp=None, kobold_api_ip="http://127.0.0.1:5001/api/v1/generate"):
228
  logging.debug("Kobold: Summarization process starting...")
229
  try:
230
  logging.debug("Kobold: Loading and validating configurations")
 
276
  'accept': 'application/json',
277
  'content-type': 'application/json',
278
  }
279
+ if custom_prompt_input is None:
280
+ kobold_prompt = f"{summarizer_prompt}\n\n\n\n{text}"
281
+ else:
282
+ kobold_prompt = f"{custom_prompt_input}\n\n\n\n{text}"
283
 
284
+ logging.debug("Kobold summarization: Prompt being sent is {kobold_prompt}")
 
285
 
286
  # FIXME
287
  # Values literally c/p from the api docs....
 
295
  #"rep_penalty": 1.0,
296
  }
297
 
298
+ logging.debug("Kobold Summarization: Submitting request to API endpoint")
299
+ print("Kobold Summarization: Submitting request to API endpoint")
300
  kobold_api_ip = loaded_config_data['local_api_ip']['kobold']
301
  try:
302
  response = requests.post(kobold_api_ip, headers=headers, json=data)
303
+ logging.debug("Kobold Summarization: API Response Status Code: %d", response.status_code)
304
 
305
  if response.status_code == 200:
306
  try:
 
329
 
330
 
331
  # https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API
332
+ def summarize_with_oobabooga(input_data, api_key, custom_prompt, system_message=None, temp=None, api_url="http://127.0.0.1:5000/v1/chat/completions"):
333
  logging.debug("Oobabooga: Summarization process starting...")
334
  try:
335
  logging.debug("Oobabooga: Loading and validating configurations")
 
382
  'content-type': 'application/json',
383
  }
384
 
385
+ if custom_prompt is None:
386
+ custom_prompt = f"{summarizer_prompt}\n\n\n\n{text}"
387
+ else:
388
+ custom_prompt = f"{custom_prompt}\n\n\n\n{text}"
389
+
390
+ logging.debug("Ooba Summarize: Prompt being sent is {kobold_prompt}")
391
+
392
  ooba_prompt = f"{text}" + f"\n\n\n\n{custom_prompt}"
393
  logging.debug("ooba: Prompt being sent is {ooba_prompt}")
394
 
 
422
  return f"ooba: Error occurred while processing summary with oobabooga: {str(e)}"
423
 
424
 
425
+ def summarize_with_tabbyapi(input_data, custom_prompt_input, system_message=None, api_key=None, temp=None, api_IP="http://127.0.0.1:5000/v1/chat/completions"):
 
426
  logging.debug("TabbyAPI: Summarization process starting...")
427
  try:
428
  logging.debug("TabbyAPI: Loading and validating configurations")
 
477
  if system_message is None:
478
  system_message = "You are a helpful AI assistant."
479
 
480
+ if custom_prompt_input is None:
481
+ custom_prompt_input = f"{summarizer_prompt}\n\n\n\n{text}"
482
+ else:
483
+ custom_prompt_input = f"{custom_prompt_input}\n\n\n\n{text}"
484
+
485
  headers = {
486
  'Authorization': f'Bearer {api_key}',
487
  'Content-Type': 'application/json'
 
535
  input_data: Union[str, dict, list],
536
  custom_prompt_input: str,
537
  api_key: str = None,
 
538
  model: str = None,
539
  system_prompt: str = None,
540
+ temp: float = 0.7,
541
+ vllm_api_url: str = "http://127.0.0.1:8000/v1/chat/completions"
542
  ) -> str:
543
  logging.debug("vLLM: Summarization process starting...")
544
  try:
 
590
  if system_prompt is None:
591
  system_prompt = "You are a helpful AI assistant."
592
 
593
+ if custom_prompt_input is None:
594
+ custom_prompt_input = f"{summarizer_prompt}\n\n\n\n{text}"
595
+ else:
596
+ custom_prompt_input = f"{custom_prompt_input}\n\n\n\n{text}"
597
+
598
  model = model or loaded_config_data['models']['vllm']
599
  if system_prompt is None:
600
  system_prompt = "You are a helpful AI assistant."
 
641
 
642
 
643
  # FIXME - update to be a summarize request
644
+ def summarize_with_ollama(input_data, custom_prompt, api_key=None, temp=None, system_message=None, model=None, api_url="http://127.0.0.1:11434/api/generate",):
645
  try:
646
  logging.debug("ollama: Loading and validating configurations")
647
  loaded_config_data = load_and_log_configs()
 
690
  else:
691
  raise ValueError("Ollama: Invalid input data format")
692
 
693
+ if custom_prompt is None:
694
+ custom_prompt = f"{summarizer_prompt}\n\n\n\n{text}"
695
+ else:
696
+ custom_prompt = f"{custom_prompt}\n\n\n\n{text}"
697
+
698
  headers = {
699
  'accept': 'application/json',
700
  'content-type': 'application/json',
 
805
  logging.debug(f"Custom OpenAI API: Extracted text (first 500 chars): {text[:500]}...")
806
  logging.debug(f"v: Custom prompt: {custom_prompt_arg}")
807
 
808
+ if input_data is None:
809
+ input_data = f"{summarizer_prompt}\n\n\n\n{text}"
810
+ else:
811
+ input_data = f"{input_data}\n\n\n\n{text}"
812
+
813
  openai_model = loaded_config_data['models']['openai'] or "gpt-4o"
814
  logging.debug(f"Custom OpenAI API: Using model: {openai_model}")
815
 
App_Function_Libraries/Summarization/Summarization_General_Lib.py CHANGED
@@ -30,7 +30,8 @@ from App_Function_Libraries.Chunk_Lib import semantic_chunking, rolling_summariz
30
  improved_chunking_process
31
  from App_Function_Libraries.Audio.Diarization_Lib import combine_transcription_and_diarization
32
  from App_Function_Libraries.Summarization.Local_Summarization_Lib import summarize_with_llama, summarize_with_kobold, \
33
- summarize_with_oobabooga, summarize_with_tabbyapi, summarize_with_vllm, summarize_with_local_llm
 
34
  from App_Function_Libraries.DB.DB_Manager import add_media_to_database
35
  # Import Local
36
  from App_Function_Libraries.Utils.Utils import load_and_log_configs, load_comprehensive_config, sanitize_filename, \
@@ -1108,9 +1109,15 @@ def process_video_urls(url_list, num_speakers, whisper_model, custom_prompt_inpu
1108
 
1109
 
1110
  def perform_transcription(video_path, offset, whisper_model, vad_filter, diarize=False):
 
 
1111
  global segments_json_path
1112
  audio_file_path = convert_to_wav(video_path, offset)
 
 
 
1113
  segments_json_path = audio_file_path.replace('.wav', '.segments.json')
 
1114
 
1115
  if diarize:
1116
  diarized_json_path = audio_file_path.replace('.wav', '.diarized.json')
@@ -1521,16 +1528,22 @@ def process_url(
1521
  summary = summarize_with_deepseek(api_key, chunk, custom_prompt_input, temp, system_message)
1522
  elif api_name == "OpenRouter":
1523
  summary = summarize_with_openrouter(api_key, chunk, custom_prompt_input, temp, system_message)
 
1524
  elif api_name == "Llama.cpp":
1525
- summary = summarize_with_llama(chunk, custom_prompt_input, temp, system_message)
1526
  elif api_name == "Kobold":
1527
- summary = summarize_with_kobold(chunk, custom_prompt_input, temp, system_message)
1528
  elif api_name == "Ooba":
1529
- summary = summarize_with_oobabooga(chunk, custom_prompt_input, temp, system_message)
1530
  elif api_name == "Tabbyapi":
1531
- summary = summarize_with_tabbyapi(chunk, custom_prompt_input, temp, system_message)
1532
  elif api_name == "VLLM":
1533
- summary = summarize_with_vllm(chunk, custom_prompt_input, temp, system_message)
 
 
 
 
 
1534
  summarized_chunk_transcriptions.append(summary)
1535
 
1536
  # Combine chunked transcriptions into a single file
 
30
  improved_chunking_process
31
  from App_Function_Libraries.Audio.Diarization_Lib import combine_transcription_and_diarization
32
  from App_Function_Libraries.Summarization.Local_Summarization_Lib import summarize_with_llama, summarize_with_kobold, \
33
+ summarize_with_oobabooga, summarize_with_tabbyapi, summarize_with_vllm, summarize_with_local_llm, \
34
+ summarize_with_ollama, summarize_with_custom_openai
35
  from App_Function_Libraries.DB.DB_Manager import add_media_to_database
36
  # Import Local
37
  from App_Function_Libraries.Utils.Utils import load_and_log_configs, load_comprehensive_config, sanitize_filename, \
 
1109
 
1110
 
1111
  def perform_transcription(video_path, offset, whisper_model, vad_filter, diarize=False):
1112
+ temp_files = []
1113
+ logging.info(f"Processing media: {video_path}")
1114
  global segments_json_path
1115
  audio_file_path = convert_to_wav(video_path, offset)
1116
+ logging.debug(f"Converted audio file: {audio_file_path}")
1117
+ temp_files.append(audio_file_path)
1118
+ logging.debug("Replacing audio file with segments.json file")
1119
  segments_json_path = audio_file_path.replace('.wav', '.segments.json')
1120
+ temp_files.append(segments_json_path)
1121
 
1122
  if diarize:
1123
  diarized_json_path = audio_file_path.replace('.wav', '.diarized.json')
 
1528
  summary = summarize_with_deepseek(api_key, chunk, custom_prompt_input, temp, system_message)
1529
  elif api_name == "OpenRouter":
1530
  summary = summarize_with_openrouter(api_key, chunk, custom_prompt_input, temp, system_message)
1531
+ # Local LLM APIs
1532
  elif api_name == "Llama.cpp":
1533
+ summary = summarize_with_llama(chunk, custom_prompt_input, api_key, temp, system_message)
1534
  elif api_name == "Kobold":
1535
+ summary = summarize_with_kobold(chunk, None, custom_prompt_input, system_message, temp)
1536
  elif api_name == "Ooba":
1537
+ summary = summarize_with_oobabooga(chunk, None, custom_prompt_input, system_message, temp)
1538
  elif api_name == "Tabbyapi":
1539
+ summary = summarize_with_tabbyapi(chunk, custom_prompt_input, system_message, None, temp)
1540
  elif api_name == "VLLM":
1541
+ summary = summarize_with_vllm(chunk, custom_prompt_input, None, None, system_message)
1542
+ elif api_name == "Ollama":
1543
+ summary = summarize_with_ollama(chunk, custom_prompt_input, api_key, temp, system_message, None)
1544
+ elif api_name == "custom_openai_api":
1545
+ summary = summarize_with_custom_openai(chunk, custom_prompt_input, api_key, temp=None, system_message=None)
1546
+
1547
  summarized_chunk_transcriptions.append(summary)
1548
 
1549
  # Combine chunked transcriptions into a single file