oceansweep committed · verified
Commit 805099a · 1 Parent(s): df6eb12

Update app.py

Files changed (1): app.py +332 -362
app.py CHANGED
@@ -1,14 +1,22 @@
 #!/usr/bin/env python3
-import gradio as gr
-import argparse, configparser, datetime, json, logging, os, platform, requests, shutil, subprocess, sys, time, unicodedata
+import argparse
+import configparser
+import json
+import logging
+import os
+import platform
+import requests
+import shutil
+import subprocess
+import sys
+import time
+import unicodedata
 import zipfile
-from datetime import datetime
-import contextlib
-import ffmpeg
+
+import gradio as gr
 import torch
 import yt_dlp
 
-
 #######
 # Function Sections
 #
@@ -34,7 +42,6 @@ import yt_dlp
 #
 
 
-
 ####
 #
 # TL/DW: Too Long Didn't Watch
@@ -52,8 +59,10 @@ import yt_dlp
 # Download Audio+Video from URL -> Transcribe audio from Video:**
 # python summarize.py -v https://www.youtube.com/watch?v=4nd1CDZP21s`
 #
-# Download Audio only from URL -> Transcribe audio -> Summarize using (`anthropic`/`cohere`/`openai`/`llama` (llama.cpp)/`ooba` (oobabooga/text-gen-webui)/`kobold` (kobold.cpp)/`tabby` (Tabbyapi)) API:**
-# python summarize.py -v https://www.youtube.com/watch?v=4nd1CDZP21s -api <your choice of API>` - Make sure to put your API key into `config.txt` under the appropriate API variable
+# Download Audio only from URL -> Transcribe audio -> Summarize using (`anthropic`/`cohere`/`openai`/`llama` (
+# llama.cpp)/`ooba` (oobabooga/text-gen-webui)/`kobold` (kobold.cpp)/`tabby` (Tabbyapi)) API:** python summarize.py
+# -v https://www.youtube.com/watch?v=4nd1CDZP21s -api <your choice of API>` - Make sure to put your API key into
+# `config.txt` under the appropriate API variable
 #
 # Download Audio+Video from a list of videos in a text file (can be file paths or URLs) and have them all summarized:**
 # python summarize.py ./local/file_on_your/system --api_name <API_name>`
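Note: the header above tells users to drop API keys into `config.txt`. A plausible minimal sketch of that file, reconstructed from the `config.get('API', ...)` and `config.get('Processing', ...)` calls visible in this diff (values are placeholders; key names not read by the script are omitted):

    [API]
    anthropic_api_key = <your key>
    anthropic_model = claude-3-sonnet-20240229
    cohere_model = command-r-plus
    huggingface_api_key = <your key>

    [Processing]
    processing_choice = cpu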
@@ -89,7 +98,6 @@ logging.debug(f"Loaded openAI Face API Key: {openai_api_key}")
 huggingface_api_key = config.get('API', 'huggingface_api_key', fallback=None)
 logging.debug(f"Loaded HuggingFace Face API Key: {huggingface_api_key}")
 
-
 # Models
 anthropic_model = config.get('API', 'anthropic_model', fallback='claude-3-sonnet-20240229')
 cohere_model = config.get('API', 'cohere_model', fallback='command-r-plus')
@@ -119,9 +127,9 @@ processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
 #######################
 
 # Dirty hack - sue me.
-os.environ['KMP_DUPLICATE_LIB_OK']='True'
+os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
 
-whisper_models = ["small", "medium", "small.en","medium.en"]
+whisper_models = ["small", "medium", "small.en", "medium.en"]
 source_languages = {
     "en": "English",
     "zh": "Chinese",
@@ -133,10 +141,8 @@ source_languages = {
 }
 source_language_list = [key[0] for key in source_languages.items()]
 
-
-
-
-print(r"""_____ _ ________ _ _
+print(r"""
+ _____ _ ________ _ _
 |_ _|| | / /| _ \| | | | _
 | | | | / / | | | || | | |(_)
 | | | | / / | | | || |/\| |
@@ -167,6 +173,8 @@ print(r"""_____ _ ________ _ _
 
 # Perform Platform Check
 userOS = ""
+
+
 def platform_check():
     global userOS
     if platform.system() == "Linux":
@@ -180,7 +188,6 @@ def platform_check():
         exit()
 
 
-
 # Check for NVIDIA GPU and CUDA availability
 def cuda_check():
     global processing_choice
@@ -197,7 +204,6 @@ def cuda_check():
         processing_choice = "cpu"  # Set processing_choice to cpu if nvidia-smi command fails
 
 
-
 # Ask user if they would like to use either their GPU or their CPU for transcription
 def decide_cpugpu():
     global processing_choice
@@ -214,7 +220,6 @@ def decide_cpugpu():
             print("Invalid choice. Please select either GPU or CPU.")
 
 
-
 # check for existence of ffmpeg
 def check_ffmpeg():
     if shutil.which("ffmpeg") or (os.path.exists("Bin") and os.path.isfile(".\\Bin\\ffmpeg.exe")):
@@ -222,20 +227,21 @@ def check_ffmpeg():
         pass
     else:
         logging.debug("ffmpeg not installed on the local system/in local PATH")
-        print("ffmpeg is not installed.\n\n You can either install it manually, or through your package manager of choice.\n Windows users, builds are here: https://www.gyan.dev/ffmpeg/builds/")
+        print(
+            "ffmpeg is not installed.\n\n You can either install it manually, or through your package manager of choice.\n Windows users, builds are here: https://www.gyan.dev/ffmpeg/builds/")
         if userOS == "Windows":
             download_ffmpeg()
         elif userOS == "Linux":
-            print("You should install ffmpeg using your platform's appropriate package manager, 'apt install ffmpeg','dnf install ffmpeg' or 'pacman', etc.")
+            print(
+                "You should install ffmpeg using your platform's appropriate package manager, 'apt install ffmpeg','dnf install ffmpeg' or 'pacman', etc.")
         else:
             logging.debug("running an unsupported OS")
-            print("You're running an unspported/Un-tested OS")
+            print("You're running an unsupported/Un-tested OS")
             exit_script = input("Let's exit the script, unless you're feeling lucky? (y/n)")
             if exit_script == "y" or "yes" or "1":
                 exit()
 
 
-
 # Download ffmpeg
 def download_ffmpeg():
     user_choice = input("Do you want to download ffmpeg? (y)Yes/(n)No: ")
@@ -243,33 +249,33 @@ def download_ffmpeg():
         print("Downloading ffmpeg")
         url = "https://www.gyan.dev/ffmpeg/builds/ffmpeg-release-essentials.zip"
         response = requests.get(url)
-
+
         if response.status_code == 200:
             print("Saving ffmpeg zip file")
             logging.debug("Saving ffmpeg zip file")
             zip_path = "ffmpeg-release-essentials.zip"
             with open(zip_path, 'wb') as file:
                 file.write(response.content)
-
+
             logging.debug("Extracting the 'ffmpeg.exe' file from the zip")
             print("Extracting ffmpeg.exe from zip file to '/Bin' folder")
             with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                 ffmpeg_path = "ffmpeg-7.0-essentials_build/bin/ffmpeg.exe"
-
+
                 logging.debug("checking if the './Bin' folder exists, creating if not")
                 bin_folder = "Bin"
                 if not os.path.exists(bin_folder):
                     logging.debug("Creating a folder for './Bin', it didn't previously exist")
                     os.makedirs(bin_folder)
-
+
                 logging.debug("Extracting 'ffmpeg.exe' to the './Bin' folder")
                 zip_ref.extract(ffmpeg_path, path=bin_folder)
-
+
                 logging.debug("Moving 'ffmpeg.exe' to the './Bin' folder")
                 src_path = os.path.join(bin_folder, ffmpeg_path)
                 dst_path = os.path.join(bin_folder, "ffmpeg.exe")
                 shutil.move(src_path, dst_path)
-
+
                 logging.debug("Removing ffmpeg zip file")
                 print("Deleting zip file (we've already extracted ffmpeg.exe, no worries)")
                 os.remove(zip_path)
@@ -283,16 +289,12 @@ def download_ffmpeg():
         logging.debug("User chose to not download ffmpeg")
         print("ffmpeg will not be downloaded.")
 
-#
+
+#
 #
 ####################################################################################################################################
 
 
-
-
-
-
-
 ####################################################################################################################################
 # Processing Paths and local file handling
 #
@@ -304,13 +306,13 @@ def read_paths_from_file(file_path):
     with open(file_path, 'r') as file:
         for line in file:
             line = line.strip()
-            if line and not os.path.exists(os.path.join('results', normalize_title(line.split('/')[-1].split('.')[0]) + '.json')):
+            if line and not os.path.exists(
+                    os.path.join('results', normalize_title(line.split('/')[-1].split('.')[0]) + '.json')):
                 logging.debug("line successfully imported from file and added to list to be transcribed")
                 paths.append(line)
     return paths
 
 
-
 def process_path(path):
     """ Decides whether the path is a URL or a local file and processes accordingly. """
     if path.startswith('http'):
@@ -324,7 +326,6 @@ def process_path(path):
         return None
 
 
-
 # FIXME
 def process_local_file(file_path):
     logging.info(f"Processing local file: {file_path}")
@@ -334,31 +335,32 @@ def process_local_file(file_path):
     download_path = create_download_directory(title)
     logging.debug(f"Converting '{title}' to an audio file (wav).")
     audio_file = convert_to_wav(file_path)  # Assumes input files are videos needing audio extraction
-    logging.debug(f"'{title}' succesfully converted to an audio file (wav).")
+    logging.debug(f"'{title}' successfully converted to an audio file (wav).")
     return download_path, info_dict, audio_file
-#
-#
-####################################################################################################################################
-
-
 
 
+#
+#
+####################################################################################################################################
 
 
 ####################################################################################################################################
 # Video Download/Handling
 #
 
-def process_url(input_path, num_speakers=2, whisper_model="small.en", offset=0, api_name=None, api_key=None, vad_filter=False, download_video_flag=False,custom_prompt=None, demo_mode=True):
+def process_url(input_path, num_speakers=2, whisper_model="small.en", custom_prompt=None, offset=0, api_name=None,
+                api_key=None, vad_filter=False, download_video_flag=False, demo_mode=False):
     if demo_mode:
         api_name = "huggingface"
         api_key = os.environ.get(HF_TOKEN)
         print("HUGGINGFACE API KEY CHECK #3: " + api_key)
         vad_filter = False
         download_video_flag = False
-
+
     try:
-        results = main(input_path, api_name=api_name, api_key=api_key, num_speakers=num_speakers, whisper_model=whisper_model, offset=offset, vad_filter=vad_filter, download_video_flag=download_video_flag)
+        results = main(input_path, api_name=api_name, api_key=api_key, num_speakers=num_speakers,
+                       whisper_model=whisper_model, offset=offset, vad_filter=vad_filter,
+                       download_video_flag=download_video_flag)
 
         if results:
             transcription_result = results[0]
@@ -382,7 +384,6 @@ def process_url(input_path, num_speakers=2, whisper_model="small.en", offset=0,
         return None, error_message, None, None
 
 
-
 def create_download_directory(title):
     base_dir = "Results"
     # Remove characters that are illegal in Windows filenames and normalize
@@ -397,15 +398,15 @@ def create_download_directory(title):
     return session_path
 
 
-
 def normalize_title(title):
     # Normalize the string to 'NFKD' form and encode to 'ascii' ignoring non-ascii characters
     title = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore').decode('ascii')
-    title = title.replace('/', '_').replace('\\', '_').replace(':', '_').replace('"', '').replace('*', '').replace('?', '').replace('<', '').replace('>', '').replace('|', '')
+    title = title.replace('/', '_').replace('\\', '_').replace(':', '_').replace('"', '').replace('*', '').replace('?',
+                                                                                                                   '').replace(
+        '<', '').replace('>', '').replace('|', '')
     return title
 
 
-
 def get_youtube(video_url):
     ydl_opts = {
         'format': 'bestaudio[ext=m4a]',
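Aside: the chained `.replace()` calls in `normalize_title`, rewrapped above, can be collapsed into two regex substitutions with the same character policy. An equivalent sketch (not part of the commit; requires `re`):

    import re

    def normalize_title_re(title: str) -> str:
        # Same policy as normalize_title: path separators and ':' become '_',
        # the remaining Windows-illegal characters are dropped.
        title = re.sub(r'[/\\:]', '_', title)
        return re.sub(r'["*?<>|]', '', title)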
@@ -420,7 +421,6 @@ def get_youtube(video_url):
     return info_dict
 
 
-
 def get_playlist_videos(playlist_url):
     ydl_opts = {
         'extract_flat': True,
@@ -440,18 +440,16 @@ def get_playlist_videos(playlist_url):
         return [], None
 
 
-
 def save_to_file(video_urls, filename):
     with open(filename, 'w') as file:
         file.write('\n'.join(video_urls))
     print(f"Video URLs saved to {filename}")
 
 
-
 def download_video(video_url, download_path, info_dict, download_video_flag):
     logging.debug("About to normalize downloaded video title")
     title = normalize_title(info_dict['title'])
-
+
     if download_video_flag == False:
         file_path = os.path.join(download_path, f"{title}.m4a")
         ydl_opts = {
@@ -474,12 +472,12 @@ def download_video(video_url, download_path, info_dict, download_video_flag):
             'format': 'bestaudio[ext=m4a]',
             'outtmpl': audio_file_path,
         }
-
+
         with yt_dlp.YoutubeDL(ydl_opts_video) as ydl:
             logging.debug("yt_dlp: About to download video with youtube-dl")
             ydl.download([video_url])
             logging.debug("yt_dlp: Video successfully downloaded with youtube-dl")
-
+
         with yt_dlp.YoutubeDL(ydl_opts_audio) as ydl:
             logging.debug("yt_dlp: About to download audio with youtube-dl")
             ydl.download([video_url])
@@ -508,17 +506,14 @@ def download_video(video_url, download_path, info_dict, download_video_flag):
                 '-c:a', 'copy',
                 output_file_path
             ]
-            subprocess.run(ffmpeg_command, check=True)
+            subprocess.run(ffmpeg_command, check=True)
         else:
             logging.error("You shouldn't be here...")
             exit()
         os.remove(video_file_path)
         os.remove(audio_file_path)
-
-    return output_file_path
-
-
 
+    return output_file_path
 
 
 #
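For context, `ffmpeg_command` in the hunk above remuxes the separately downloaded video and audio streams without re-encoding; only its tail (`'-c:a', 'copy'` and the output path) is visible in this view. A plausible full invocation, with the leading arguments reconstructed as assumptions and placeholder filenames:

    import subprocess

    video_file_path = "video.mp4"     # placeholders for illustration
    audio_file_path = "audio.m4a"
    output_file_path = "merged.mp4"

    ffmpeg_command = [
        'ffmpeg',
        '-i', video_file_path,   # video-only download
        '-i', audio_file_path,   # audio-only download
        '-c:v', 'copy',          # assumed; not shown in the hunk
        '-c:a', 'copy',          # copy streams instead of re-encoding
        output_file_path,
    ]
    subprocess.run(ffmpeg_command, check=True)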
@@ -526,10 +521,6 @@ def download_video(video_url, download_path, info_dict, download_video_flag):
 ####################################################################################################################################
 
 
-
-
-
-
 ####################################################################################################################################
 # Audio Transcription
 #
@@ -553,12 +544,12 @@ def convert_to_wav(video_file_path, offset=0):
         ffmpeg_cmd = 'ffmpeg'  # Assume 'ffmpeg' is in PATH for non-Windows systems
 
     command = [
-        ffmpeg_cmd, # Assuming the working directory is correctly set where .\Bin exists
-        "-ss", "00:00:00", # Start at the beginning of the video
+        ffmpeg_cmd,  # Assuming the working directory is correctly set where .\Bin exists
+        "-ss", "00:00:00",  # Start at the beginning of the video
         "-i", video_file_path,
-        "-ar", "16000", # Audio sample rate
-        "-ac", "1", # Number of audio channels
-        "-c:a", "pcm_s16le", # Audio codec
+        "-ar", "16000",  # Audio sample rate
+        "-ac", "1",  # Number of audio channels
+        "-c:a", "pcm_s16le",  # Audio codec
         out_path
     ]
     try:
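The `command` list above is the usual recipe for Whisper-ready audio: 16 kHz sample rate (`-ar 16000`), mono (`-ac 1`), signed 16-bit PCM (`-c:a pcm_s16le`). With placeholder filenames, the equivalent shell invocation is `ffmpeg -ss 00:00:00 -i input_video.mp4 -ar 16000 -ac 1 -c:a pcm_s16le output.wav`.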
@@ -590,7 +581,6 @@ def convert_to_wav(video_file_path, offset=0):
     return out_path
 
 
-
 # Transcribe .wav into .segments.json
 def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='small.en', vad_filter=False):
     logging.info('Loading faster_whisper model: %s', whisper_model)
@@ -609,7 +599,7 @@ def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='small.en', vad_filter=False):
         with open(out_file) as f:
             segments = json.load(f)
         return segments
-
+
     logging.info('Starting transcription...')
     options = dict(language=selected_source_lang, beam_size=5, best_of=5, vad_filter=vad_filter)
     transcribe_options = dict(task="transcribe", **options)
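The `options` dict here mirrors faster_whisper's `transcribe()` keywords. A minimal standalone sketch of the same flow, assuming the `faster-whisper` package and the `small.en` model named in the defaults (file name is a placeholder):

    from faster_whisper import WhisperModel

    model = WhisperModel("small.en", device="cpu")  # app.py picks cpu/cuda via processing_choice
    segments, info = model.transcribe("audio.wav", language="en", beam_size=5,
                                      best_of=5, vad_filter=False)
    for seg in segments:
        print(f"[{seg.start:.2f} -> {seg.end:.2f}] {seg.text}")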
@@ -631,139 +621,133 @@ def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='small.en', vad_filter=False):
         logging.error("Error transcribing audio: %s", str(e))
         raise RuntimeError("Error transcribing audio")
     return segments
+
+
 #
 #
 ####################################################################################################################################
 
 
-
-
-
-
 ####################################################################################################################################
 # Diarization
 #
 # TODO: https://huggingface.co/pyannote/speaker-diarization-3.1
 # embedding_model = "pyannote/embedding", embedding_size=512
 # embedding_model = "speechbrain/spkrec-ecapa-voxceleb", embedding_size=192
-def speaker_diarize(video_file_path, segments, embedding_model = "pyannote/embedding", embedding_size=512, num_speakers=0):
-    """
-    1. Generating speaker embeddings for each segments.
-    2. Applying agglomerative clustering on the embeddings to identify the speaker for each segment.
-    """
-    try:
-        from pyannote.audio import Audio
-        from pyannote.core import Segment
-        from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
-        import numpy as np
-        import pandas as pd
-        from sklearn.cluster import AgglomerativeClustering
-        from sklearn.metrics import silhouette_score
-        import tqdm
-        import wave
-
-        embedding_model = PretrainedSpeakerEmbedding( embedding_model, device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
-
-
-        _,file_ending = os.path.splitext(f'{video_file_path}')
-        audio_file = video_file_path.replace(file_ending, ".wav")
-        out_file = video_file_path.replace(file_ending, ".diarize.json")
-
-        logging.debug("getting duration of audio file")
-        with contextlib.closing(wave.open(audio_file,'r')) as f:
-            frames = f.getnframes()
-            rate = f.getframerate()
-            duration = frames / float(rate)
-        logging.debug("duration of audio file obtained")
-        print(f"duration of audio file: {duration}")
-
-        def segment_embedding(segment):
-            logging.debug("Creating embedding")
-            audio = Audio()
-            start = segment["start"]
-            end = segment["end"]
-
-            # Enforcing a minimum segment length
-            if end-start < 0.3:
-                padding = 0.3-(end-start)
-                start -= padding/2
-                end += padding/2
-                print('Padded segment because it was too short:',segment)
-
-            # Whisper overshoots the end timestamp in the last segment
-            end = min(duration, end)
-            # clip audio and embed
-            clip = Segment(start, end)
-            waveform, sample_rate = audio.crop(audio_file, clip)
-            return embedding_model(waveform[None])
-
-        embeddings = np.zeros(shape=(len(segments), embedding_size))
-        for i, segment in enumerate(tqdm.tqdm(segments)):
-            embeddings[i] = segment_embedding(segment)
-        embeddings = np.nan_to_num(embeddings)
-        print(f'Embedding shape: {embeddings.shape}')
-
-        if num_speakers == 0:
-            # Find the best number of speakers
-            score_num_speakers = {}
-
-            for num_speakers in range(2, 10+1):
-                clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
-                score = silhouette_score(embeddings, clustering.labels_, metric='euclidean')
-                score_num_speakers[num_speakers] = score
-            best_num_speaker = max(score_num_speakers, key=lambda x:score_num_speakers[x])
-            print(f"The best number of speakers: {best_num_speaker} with {score_num_speakers[best_num_speaker]} score")
-        else:
-            best_num_speaker = num_speakers
-
-        # Assign speaker label
-        clustering = AgglomerativeClustering(best_num_speaker).fit(embeddings)
-        labels = clustering.labels_
-        for i in range(len(segments)):
-            segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
-
-        with open(out_file,'w') as f:
-            f.write(json.dumps(segments, indent=2))
-
-        # Make CSV output
-        def convert_time(secs):
-            return datetime.timedelta(seconds=round(secs))
-
-        objects = {
-            'Start' : [],
-            'End': [],
-            'Speaker': [],
-            'Text': []
-        }
-        text = ''
-        for (i, segment) in enumerate(segments):
-            if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
-                objects['Start'].append(str(convert_time(segment["start"])))
-                objects['Speaker'].append(segment["speaker"])
-                if i != 0:
-                    objects['End'].append(str(convert_time(segments[i - 1]["end"])))
-                    objects['Text'].append(text)
-                    text = ''
-            text += segment["text"] + ' '
-        objects['End'].append(str(convert_time(segments[i - 1]["end"])))
-        objects['Text'].append(text)
-
-        save_path = video_file_path.replace(file_ending, ".csv")
-        df_results = pd.DataFrame(objects)
-        df_results.to_csv(save_path)
-        return df_results, save_path
-
-    except Exception as e:
-        raise RuntimeError("Error Running inference with local model", e)
+# def speaker_diarize(video_file_path, segments, embedding_model = "pyannote/embedding", embedding_size=512, num_speakers=0):
+#     """
+#     1. Generating speaker embeddings for each segments.
+#     2. Applying agglomerative clustering on the embeddings to identify the speaker for each segment.
+#     """
+#     try:
+#         from pyannote.audio import Audio
+#         from pyannote.core import Segment
+#         from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
+#         import numpy as np
+#         import pandas as pd
+#         from sklearn.cluster import AgglomerativeClustering
+#         from sklearn.metrics import silhouette_score
+#         import tqdm
+#         import wave
+#
+#         embedding_model = PretrainedSpeakerEmbedding( embedding_model, device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
+#
+#
+#         _,file_ending = os.path.splitext(f'{video_file_path}')
+#         audio_file = video_file_path.replace(file_ending, ".wav")
+#         out_file = video_file_path.replace(file_ending, ".diarize.json")
+#
+#         logging.debug("getting duration of audio file")
+#         with contextlib.closing(wave.open(audio_file,'r')) as f:
+#             frames = f.getnframes()
+#             rate = f.getframerate()
+#             duration = frames / float(rate)
+#         logging.debug("duration of audio file obtained")
+#         print(f"duration of audio file: {duration}")
+#
+#         def segment_embedding(segment):
+#             logging.debug("Creating embedding")
+#             audio = Audio()
+#             start = segment["start"]
+#             end = segment["end"]
+#
+#             # Enforcing a minimum segment length
+#             if end-start < 0.3:
+#                 padding = 0.3-(end-start)
+#                 start -= padding/2
+#                 end += padding/2
+#                 print('Padded segment because it was too short:',segment)
+#
+#             # Whisper overshoots the end timestamp in the last segment
+#             end = min(duration, end)
+#             # clip audio and embed
+#             clip = Segment(start, end)
+#             waveform, sample_rate = audio.crop(audio_file, clip)
+#             return embedding_model(waveform[None])
+#
+#         embeddings = np.zeros(shape=(len(segments), embedding_size))
+#         for i, segment in enumerate(tqdm.tqdm(segments)):
+#             embeddings[i] = segment_embedding(segment)
+#         embeddings = np.nan_to_num(embeddings)
+#         print(f'Embedding shape: {embeddings.shape}')
+#
+#         if num_speakers == 0:
+#             # Find the best number of speakers
+#             score_num_speakers = {}
+#
+#             for num_speakers in range(2, 10+1):
+#                 clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
+#                 score = silhouette_score(embeddings, clustering.labels_, metric='euclidean')
+#                 score_num_speakers[num_speakers] = score
+#             best_num_speaker = max(score_num_speakers, key=lambda x:score_num_speakers[x])
+#             print(f"The best number of speakers: {best_num_speaker} with {score_num_speakers[best_num_speaker]} score")
+#         else:
+#             best_num_speaker = num_speakers
+#
+#         # Assign speaker label
+#         clustering = AgglomerativeClustering(best_num_speaker).fit(embeddings)
+#         labels = clustering.labels_
+#         for i in range(len(segments)):
+#             segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
+#
+#         with open(out_file,'w') as f:
+#             f.write(json.dumps(segments, indent=2))
+#
+#         # Make CSV output
+#         def convert_time(secs):
+#             return datetime.timedelta(seconds=round(secs))
+#
+#         objects = {
+#             'Start' : [],
+#             'End': [],
+#             'Speaker': [],
+#             'Text': []
+#         }
+#         text = ''
+#         for (i, segment) in enumerate(segments):
+#             if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
+#                 objects['Start'].append(str(convert_time(segment["start"])))
+#                 objects['Speaker'].append(segment["speaker"])
+#                 if i != 0:
+#                     objects['End'].append(str(convert_time(segments[i - 1]["end"])))
+#                     objects['Text'].append(text)
+#                     text = ''
+#             text += segment["text"] + ' '
+#         objects['End'].append(str(convert_time(segments[i - 1]["end"])))
+#         objects['Text'].append(text)
+#
+#         save_path = video_file_path.replace(file_ending, ".csv")
+#         df_results = pd.DataFrame(objects)
+#         df_results.to_csv(save_path)
+#         return df_results, save_path
+#
+#     except Exception as e:
+#         raise RuntimeError("Error Running inference with local model", e)
 #
 #
 ####################################################################################################################################
 
 
-
-
-
-
 ####################################################################################################################################
 #Summarizers
 #
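The diarizer being commented out above picks the speaker count by maximizing the silhouette score over candidate cluster counts. A condensed sketch of just that selection step, using the same scikit-learn calls as the code (assumes more segments than `hi` candidate speakers):

    import numpy as np
    from sklearn.cluster import AgglomerativeClustering
    from sklearn.metrics import silhouette_score

    def best_speaker_count(embeddings: np.ndarray, lo: int = 2, hi: int = 10) -> int:
        # Cluster at each candidate count; keep the count whose clusters are
        # most cleanly separated (highest silhouette score).
        scores = {}
        for k in range(lo, hi + 1):
            labels = AgglomerativeClustering(k).fit(embeddings).labels_
            scores[k] = silhouette_score(embeddings, labels, metric='euclidean')
        return max(scores, key=scores.get)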
@@ -776,13 +760,12 @@ def extract_text_from_segments(segments):
     return text
 
 
-
-def summarize_with_openai(api_key, file_path, model):
+def summarize_with_openai(api_key, file_path, model, custom_prompt):
     try:
         logging.debug("openai: Loading json data for summarization")
         with open(file_path, 'r') as file:
             segments = json.load(file)
-
+
         logging.debug("openai: Extracting text from the segments")
         text = extract_text_from_segments(segments)
 
@@ -790,9 +773,14 @@ def summarize_with_openai(api_key, file_path, model):
             'Authorization': f'Bearer {api_key}',
             'Content-Type': 'application/json'
         }
-
+        # headers = {
+        #     'Authorization': f'Bearer {api_key}',
+        #     'Content-Type': 'application/json'
+        # }
+
+        logging.debug(f"openai: API Key is: {api_key}")
         logging.debug("openai: Preparing data + prompt for submittal")
-        openai_prompt = f"{text} \n\n\n\n{prompt_text}"
+        openai_prompt = f"{text} \n\n\n\n{custom_prompt}"
         data = {
             "model": model,
             "messages": [
@@ -810,7 +798,7 @@ def summarize_with_openai(api_key, file_path, model):
         }
         logging.debug("openai: Posting request")
         response = requests.post('https://api.openai.com/v1/chat/completions', headers=headers, json=data)
-
+
         if response.status_code == 200:
             summary = response.json()['choices'][0]['message']['content'].strip()
             logging.debug("openai: Summarization successful")
@@ -826,13 +814,12 @@ def summarize_with_openai(api_key, file_path, model):
         return None
 
 
-
-def summarize_with_claude(api_key, file_path, model):
+def summarize_with_claude(api_key, file_path, model, custom_prompt):
     try:
         logging.debug("anthropic: Loading JSON data")
         with open(file_path, 'r') as file:
             segments = json.load(file)
-
+
         logging.debug("anthropic: Extracting text from the segments file")
         text = extract_text_from_segments(segments)
 
@@ -841,16 +828,17 @@ def summarize_with_claude(api_key, file_path, model):
             'anthropic-version': '2023-06-01',
             'Content-Type': 'application/json'
         }
-
-        logging.debug("anthropic: Prepping data + prompt for submittal")
+
+        anthropic_prompt = custom_prompt
+        logging.debug("anthropic: Prompt is {anthropic_prompt}")
         user_message = {
             "role": "user",
-            "content": f"{text} \n\n\n\n{prompt_text}"
+            "content": f"{text} \n\n\n\n{anthropic_prompt}"
         }
 
         data = {
             "model": model,
-            "max_tokens": 4096, # max _possible_ tokens to return
+            "max_tokens": 4096,  # max _possible_ tokens to return
             "messages": [user_message],
             "stop_sequences": ["\n\nHuman:"],
             "temperature": 0.7,
@@ -862,17 +850,17 @@ def summarize_with_claude(api_key, file_path, model):
             "stream": False,
             "system": "You are a professional summarizer."
         }
-
+
         logging.debug("anthropic: Posting request to API")
         response = requests.post('https://api.anthropic.com/v1/messages', headers=headers, json=data)
-
+
         # Check if the status code indicates success
         if response.status_code == 200:
             logging.debug("anthropic: Post submittal successful")
             response_data = response.json()
             try:
                 summary = response_data['content'][0]['text'].strip()
-                logging.debug("anthropic: Summarization succesful")
+                logging.debug("anthropic: Summarization successful")
                 print("Summary processed successfully.")
                 return summary
             except (IndexError, KeyError) as e:
@@ -894,9 +882,8 @@ def summarize_with_claude(api_key, file_path, model):
         return None
 
 
-
 # Summarize with Cohere
-def summarize_with_cohere(api_key, file_path, model):
+def summarize_with_cohere(api_key, file_path, model, custom_prompt):
     try:
         logging.basicConfig(level=logging.DEBUG)
         logging.debug("cohere: Loading JSON data")
@@ -912,7 +899,9 @@ def summarize_with_cohere(api_key, file_path, model):
             'Authorization': f'Bearer {api_key}'
         }
 
-        cohere_prompt = f"{text} \n\n\n\n{prompt_text}"
+        cohere_prompt = f"{text} \n\n\n\n{custom_prompt}"
         data = {
             "chat_history": [
                 {"role": "USER", "message": cohere_prompt}
@@ -938,7 +927,7 @@ def summarize_with_cohere(api_key, file_path, model):
                 logging.error("Expected data not found in API response.")
                 return "Expected data not found in API response."
         else:
-            logging.error(f"cohere: API request failed with status code {response.status_code}: {resposne.text}")
+            logging.error(f"cohere: API request failed with status code {response.status_code}: {response.text}")
             print(f"Failed to process summary, status code {response.status_code}: {response.text}")
             return f"cohere: API request failed: {response.text}"
 
@@ -947,9 +936,8 @@ def summarize_with_cohere(api_key, file_path, model):
         return f"cohere: Error occurred while processing summary with Cohere: {str(e)}"
 
 
-
 # https://console.groq.com/docs/quickstart
-def summarize_with_groq(api_key, file_path, model):
+def summarize_with_groq(api_key, file_path, model, custom_prompt):
     try:
         logging.debug("groq: Loading JSON data")
         with open(file_path, 'r') as file:
@@ -963,7 +951,9 @@ def summarize_with_groq(api_key, file_path, model):
             'Content-Type': 'application/json'
         }
 
-        groq_prompt = f"{text} \n\n\n\n{prompt_text}"
+        groq_prompt = f"{text} \n\n\n\n{custom_prompt}"
         data = {
             "messages": [
                 {
@@ -1003,7 +993,7 @@ def summarize_with_groq(api_key, file_path, model):
 #
 # Local Summarization
 
-def summarize_with_llama(api_url, file_path, token):
+def summarize_with_llama(api_url, file_path, token, custom_prompt):
     try:
         logging.debug("llama: Loading JSON data")
         with open(file_path, 'r') as file:
@@ -1016,17 +1006,17 @@ def summarize_with_llama(api_url, file_path, token):
             'accept': 'application/json',
             'content-type': 'application/json',
         }
-        if len(token)>5:
+        if len(token) > 5:
             headers['Authorization'] = f'Bearer {token}'
 
-        llama_prompt = f"{text} \n\n\n\n{prompt_text}"
-        logging.debug(f"llama: Complete prompt is: {llama_prompt}")
+        llama_prompt = f"{text} \n\n\n\n{custom_prompt}"
         data = {
             "prompt": llama_prompt
         }
 
-        #logging.debug(f"llama: Submitting request to API endpoint {llama_prompt}")
         print("llama: Submitting request to API endpoint")
         response = requests.post(api_url, headers=headers, json=data)
         response_data = response.json()
@@ -1048,9 +1038,8 @@ def summarize_with_llama(api_url, file_path, token):
         return f"llama: Error occurred while processing summary with llama: {str(e)}"
 
 
-
 # https://lite.koboldai.net/koboldcpp_api#/api%2Fv1/post_api_v1_generate
-def summarize_with_kobold(api_url, file_path):
+def summarize_with_kobold(api_url, file_path, custom_prompt):
     try:
         logging.debug("kobold: Loading JSON data")
         with open(file_path, 'r') as file:
@@ -1063,9 +1052,11 @@ def summarize_with_kobold(api_url, file_path):
             'accept': 'application/json',
             'content-type': 'application/json',
         }
         # FIXME
-        kobold_prompt = f"{text} \n\n\n\n{prompt_text}"
-        logging.debug(kobold_prompt)
+        kobold_prompt = f"{text} \n\n\n\n{custom_prompt}"
         # Values literally c/p from the api docs....
         data = {
             "max_context_length": 8096,
@@ -1097,9 +1088,8 @@ def summarize_with_kobold(api_url, file_path):
         return f"kobold: Error occurred while processing summary with kobold: {str(e)}"
 
 
-
 # https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API
-def summarize_with_oobabooga(api_url, file_path):
+def summarize_with_oobabooga(api_url, file_path, custom_prompt):
     try:
         logging.debug("ooba: Loading JSON data")
         with open(file_path, 'r') as file:
@@ -1114,14 +1104,15 @@ def summarize_with_oobabooga(api_url, file_path):
             'content-type': 'application/json',
         }
 
-        #prompt_text = "I like to eat cake and bake cakes. I am a baker. I work in a french bakery baking cakes. It is a fun job. I have been baking cakes for ten years. I also bake lots of other baked goods, but cakes are my favorite."
-        #prompt_text += f"\n\n{text}" # Uncomment this line if you want to include the text variable
-        ooba_prompt = f"{text}\n\n\n\n{prompt_text}"
+        ooba_prompt = f"{text}\n\n\n\n{custom_prompt}"
 
-        data = {
+        data = {
             "mode": "chat",
             "character": "Example",
-            "messages": [{"role": "user", "content": prompt_text}]
+            "messages": [{"role": "user", "content": ooba_prompt}]
         }
 
         logging.debug("ooba: Submitting request to API endpoint")
@@ -1144,7 +1135,6 @@ def summarize_with_oobabooga(api_url, file_path):
         return f"ooba: Error occurred while processing summary with oobabooga: {str(e)}"
 
 
-
 def save_summary_to_file(summary, file_path):
     summary_file_path = file_path.replace('.segments.json', '_summary.txt')
     logging.debug("Opening summary file for writing, *segments.json with *_summary.txt")
@@ -1152,15 +1142,12 @@ def save_summary_to_file(summary, file_path):
         file.write(summary)
     logging.info(f"Summary saved to file: {summary_file_path}")
 
+
 #
 #
 ####################################################################################################################################
 
 
-
-
-
-
 ####################################################################################################################################
 # Gradio UI
 #
@@ -1194,8 +1181,8 @@ def summarize_with_huggingface(api_key, file_path):
             response_data = response.json()
             wait_time = response_data.get('estimated_time', 10)
             return None, f"Model is loading, retrying in {int(wait_time)} seconds..."
-            # Sleep before retrying....
-            time.sleep(wait_time)
 
     if api_key == "":
        api_key = os.environ.get(HF_TOKEN)
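Note on the removed lines above: the `time.sleep(wait_time)` sat after a `return`, so it never executed. A sketch of a bounded retry loop that actually waits on the Inference API's reported `estimated_time` (retry count and payload shape beyond `inputs`/`summary_text`/`estimated_time` are assumptions):

    import time
    import requests

    API_URL = "https://api-inference.huggingface.co/models/..."  # placeholder
    headers = {"Authorization": "Bearer hf_..."}                 # placeholder
    data = {"inputs": "text to summarize"}

    summary = None
    for _ in range(3):  # bounded retries; the count is an assumption
        response = requests.post(API_URL, headers=headers, json=data)
        if response.status_code == 200:
            summary = response.json()[0]['summary_text']
            break
        wait_time = response.json().get('estimated_time', 10)
        time.sleep(wait_time)  # reachable here, unlike a sleep placed after a return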
@@ -1204,19 +1191,16 @@ def summarize_with_huggingface(api_key, file_path):
         logging.debug("huggingface: Loading json data for summarization")
         with open(file_path, 'r') as file:
             segments = json.load(file)
-
+
         logging.debug("huggingface: Extracting text from the segments")
         text = ' '.join([segment['text'] for segment in segments])
 
         api_key = os.environ.get(HF_TOKEN)
         logging.debug("HUGGINGFACE API KEY CHECK #2: " + api_key)
 
-
-
-
         logging.debug("huggingface: Submitting request...")
         response = requests.post(API_URL, headers=headers, json=data)
-
+
         if response.status_code == 200:
             summary = response.json()[0]['summary_text']
             logging.debug("huggingface: Summarization successful")
@@ -1230,13 +1214,10 @@ def summarize_with_huggingface(api_key, file_path):
         print(f"Error occurred while processing summary with huggingface: {str(e)}")
         return None
 
-
-
 def same_auth(username, password):
     return username == password
 
 
-
 def format_transcription(transcription_result):
     if transcription_result:
         json_data = transcription_result['transcription']
@@ -1245,79 +1226,83 @@ def format_transcription(transcription_result):
         return ""
 
 
-
-def process_text(api_key,text_file):
-    summary,message = summarize_with_huggingface(api_key,text_file)
+def process_text(api_key, text_file):
+    summary, message = summarize_with_huggingface(api_key, text_file)
     if summary:
         # Show summary on success
-        return "Summary:",summary
+        return "Summary:", summary
     else:
         # Inform user about load/wait time
-        return "Notice:",message
+        return "Notice:", message
 
 
 def launch_ui(demo_mode=False):
-    def process_transcription(json_data):
-        if json_data:
-            return json.dumps(json_data, indent=2)
-            #return "\n".join([item["text"] for item in json_data])
-        else:
-            return ""
 
     inputs = [
-        gr.components.Textbox(label="URL of video to be Transcribed/Summarized"),
-        gr.components.Number(value=2, label="Number of Speakers (for Diarization)"),
-        gr.components.Dropdown(choices=whisper_models, value="small.en", label="Whisper Model (Can ignore this)"),
-        gr.components.Textbox(label="Custom Prompt", value="Please provide a detailed, bulleted list of the points made throughout the transcribed video and any supporting arguments made for said points", lines=3),
-        gr.components.Number(value=0, label="Offset time to start transcribing from\n\n (helpful if you only want part of a video/lecture)")
     ]
 
-    if not demo_mode:
-        inputs.extend([
-            gr.components.Dropdown(choices=["huggingface", "openai", "anthropic", "cohere", "groq", "llama", "kobold", "ooba"], value="huggingface", label="API Name - What LLM service will summarize your transcription"),
-            gr.components.Textbox(label="API Key - Have to provide one, unless you're fine waiting on HuggingFace..."),
-            # gr.components.Checkbox(value=False, label="Download Video"),
-            # gr.components.Checkbox(value=False, label="VAD Filter")
-        ])
 
     iface = gr.Interface(
-        # fn=lambda url, num_speakers, whisper_model, offset, api_name, api_key: process_url(url, num_speakers, whisper_model, offset, api_name=api_name, api_key=api_key, demo_mode=demo_mode),
-        # fn=lambda url, num_speakers, whisper_model, offset, api_name, custom_prompt, api_key: process_url(url, num_speakers, whisper_model, offset, api_name, api_key, demo_mode=demo_mode),
-        fn=lambda *args: process_url(*args, demo_mode=demo_mode),
-
         inputs=inputs,
-        outputs=[
-            gr.components.Textbox(label="Transcription", value=lambda: "", max_lines=10),
-            gr.components.Textbox(label="Summary or Status Message"),
-            gr.components.File(label="Download Transcription as JSON"),
-            gr.components.File(label="Download Summary as text", visible=lambda summary_file_path: summary_file_path is not None)
-        ],
         title="Video Transcription and Summarization",
-        description="Submit a video URL for transcription and summarization.",
-        allow_flagging="never",
-        #https://huggingface.co/spaces/bethecloud/storj_theme
-        theme="bethecloud/storj_theme"
-        # FIXME - Figure out how to enable dark mode...
-        # other themes: https://huggingface.co/spaces/gradio/theme-gallery
     )
 
-    iface.launch(share=True)
 
 #
 #
 #####################################################################################################################################
 
 
-
-
-
-
-
 ####################################################################################################################################
 # Main()
 #
 
-def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model="small.en", offset=0, vad_filter=False, download_video_flag=False, demo_mode=False):
     if input_path is None and args.user_interface:
         return []
     start_time = time.monotonic()
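Most of `launch_ui`'s replacement lines did not survive in this view, but the removed lines show the standard Gradio `Interface` pattern. A self-contained sketch of that pattern with a stand-in function (abbreviated inputs/outputs, not the commit's exact UI):

    import gradio as gr

    def transcribe_and_summarize(url, num_speakers):
        # Stand-in for process_url(...); returns (transcription, summary).
        return f"transcript of {url} ({num_speakers} speakers)", "summary..."

    iface = gr.Interface(
        fn=transcribe_and_summarize,
        inputs=[
            gr.components.Textbox(label="URL of video to be Transcribed/Summarized"),
            gr.components.Number(value=2, label="Number of Speakers (for Diarization)"),
        ],
        outputs=[
            gr.components.Textbox(label="Transcription", max_lines=10),
            gr.components.Textbox(label="Summary or Status Message"),
        ],
        title="Video Transcription and Summarization",
        allow_flagging="never",
    )
    iface.launch(share=True)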
@@ -1330,7 +1315,8 @@ def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model="small.en", offset=0, vad_filter=False, download_video_flag=False, demo_mode=False):
         paths = [input_path]
     elif (info_dict := get_youtube(input_path)) and 'entries' in info_dict:
         logging.debug("MAIN: YouTube playlist detected")
-        print("\n\nSorry, but playlists aren't currently supported. You can run the following command to generate a text file that you can then pass into this script though! (It may not work... playlist support seems spotty)" + """\n\n\tpython Get_Playlist_URLs.py <Youtube Playlist URL>\n\n\tThen,\n\n\tpython diarizer.py <playlist text file name>\n\n""")
+        print(
+            "\n\nSorry, but playlists aren't currently supported. You can run the following command to generate a text file that you can then pass into this script though! (It may not work... playlist support seems spotty)" + """\n\n\tpython Get_Playlist_URLs.py <Youtube Playlist URL>\n\n\tThen,\n\n\tpython diarizer.py <playlist text file name>\n\n""")
         return
     else:
         paths = [input_path]
@@ -1350,7 +1336,7 @@ def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model="small.en", offset=0, vad_filter=False, download_video_flag=False, demo_mode=False):
                 logging.debug("MAIN: Video downloaded successfully")
                 logging.debug("MAIN: Converting video file to WAV...")
                 audio_file = convert_to_wav(video_path, offset)
-                logging.debug("MAIN: Audio file converted succesfully")
+                logging.debug("MAIN: Audio file converted successfully")
             else:
                 if os.path.exists(path):
                     logging.debug("MAIN: Local file path detected")
@@ -1370,85 +1356,69 @@ def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model="small.en", offset=0, vad_filter=False, download_video_flag=False, demo_mode=False):
             results.append(transcription_result)
             logging.info(f"Transcription complete: {audio_file}")
 
-            if path.startswith('http'):
-                # Delete the downloaded video file
-                os.remove(video_path)
-                logging.info(f"Deleted downloaded video file: {video_path}")
-
             # Perform summarization based on the specified API
             if api_name and api_key:
                 logging.debug(f"MAIN: Summarization being performed by {api_name}")
                 json_file_path = audio_file.replace('.wav', '.segments.json')
                 if api_name.lower() == 'openai':
                     try:
-                        logging.debug(f"MAIN: trying to summarize with openAI")
-                        api_key = openai_api_key
-                        logging.debug(f"OpenAI: OpenAI API Key: {api_key}")
-                        summary = summarize_with_openai(api_key, json_file_path, openai_model)
                     except requests.exceptions.ConnectionError:
-                        r.status_code = "Connection: "
-                elif api_name.lower() == 'anthropic':
                     try:
-                        logging.debug("MAIN: Trying to summarize with anthropic")
-                        api_key = anthropic_api_key
-                        logging.debug(f"Anthropic: Anthropic API Key: {api_key}")
-                        summary = summarize_with_claude(api_key, json_file_path, anthropic_model)
                     except requests.exceptions.ConnectionError:
-                        r.status_code = "Connection: "
-                elif api_name.lower() == 'cohere':
                     try:
-                        logging.debug("Main: Trying to summarize with cohere")
-                        api_key = cohere_api_key
-                        logging.debug(f"Cohere: Cohere API Key: {api_key}")
-                        summary = summarize_with_cohere(api_key, json_file_path, cohere_model)
                     except requests.exceptions.ConnectionError:
-                        r.status_code = "Connection: "
-                elif api_name.lower() == 'groq':
                     try:
-                        logging.debug("Main: Trying to summarize with Groq")
-                        api_key = groq_api_key
-                        logging.debug(f"Groq: Groq API Key: {api_key}")
-                        summary = summarize_with_groq(api_key, json_file_path, groq_model)
                     except requests.exceptions.ConnectionError:
-                        r.status_code = "Connection: "
-                elif api_name.lower() == 'llama':
                     try:
-                        logging.debug("Main: Trying to summarize with Llama.cpp")
-                        token = llama_api_key
-                        logging.debug(f"Llama.cpp: Llama.cpp API Key: {api_key}")
-                        llama_ip = llama_api_IP
-                        logging.debug(f"Llama.cpp: Llama.cpp API IP:Port : {llama_ip}")
-                        summary = summarize_with_llama(llama_ip, json_file_path, token)
                     except requests.exceptions.ConnectionError:
-                        r.status_code = "Connection: "
-                elif api_name.lower() == 'kobold':
                     try:
-                        logging.debug("Main: Trying to summarize with kobold.cpp")
-                        token = kobold_api_key
-                        logging.debug(f"kobold.cpp: Kobold.cpp API Key: {api_key}")
-                        kobold_ip = kobold_api_IP
-                        logging.debug(f"kobold.cpp: Kobold.cpp API IP:Port : {kobold_api_IP}")
-                        summary = summarize_with_kobold(kobold_ip, json_file_path)
                     except requests.exceptions.ConnectionError:
-                        r.status_code = "Connection: "
-                elif api_name.lower() == 'ooba':
                     try:
-                        logging.debug("Main: Trying to summarize with oobabooga")
-                        token = ooba_api_key
-                        logging.debug(f"oobabooga: ooba API Key: {api_key}")
-                        ooba_ip = ooba_api_IP
-                        logging.debug(f"oobabooga: ooba API IP:Port : {ooba_ip}")
-                        summary = summarize_with_oobabooga(ooba_ip, json_file_path)
                     except requests.exceptions.ConnectionError:
-                        r.status_code = "Connection: "
-                if api_name.lower() == 'huggingface':
                     try:
-                        logging.debug("MAIN: Trying to summarize with huggingface")
-                        api_key = huggingface_api_key
-                        logging.debug(f"huggingface: huggingface API Key: {api_key}")
-                        summarize_with_huggingface(api_key, json_file_path)
                     except requests.exceptions.ConnectionError:
-                        r.status_code = "Connection: "
 
                 else:
                     logging.warning(f"Unsupported API: {api_name}")
@@ -1471,28 +1441,28 @@ def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model="small.en", offset=0, vad_filter=False, download_video_flag=False, demo_mode=False):
     return results
 
 
-
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Transcribe and summarize videos.')
     parser.add_argument('input_path', type=str, help='Path or URL of the video', nargs='?')
-    parser.add_argument('-v','--video', action='store_true', help='Download the video instead of just the audio')
+    parser.add_argument('-v', '--video', action='store_true', help='Download the video instead of just the audio')
    parser.add_argument('-api', '--api_name', type=str, help='API name for summarization (optional)')
     parser.add_argument('-ns', '--num_speakers', type=int, default=2, help='Number of speakers (default: 2)')
-    parser.add_argument('-wm', '--whisper_model', type=str, default='small.en', help='Whisper model (default: small.en)')
+    parser.add_argument('-wm', '--whisper_model', type=str, default='small.en',
+                        help='Whisper model (default: small.en)')
     parser.add_argument('-off', '--offset', type=int, default=0, help='Offset in seconds (default: 0)')
     parser.add_argument('-vad', '--vad_filter', action='store_true', help='Enable VAD filter')
-    parser.add_argument('-log', '--log_level', type=str, default='INFO', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help='Log level (default: INFO)')
+    parser.add_argument('-log', '--log_level', type=str, default='INFO',
+                        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help='Log level (default: INFO)')
     parser.add_argument('-ui', '--user_interface', action='store_true', help='Launch the Gradio user interface')
     parser.add_argument('-demo', '--demo_mode', action='store_true', help='Enable demo mode')
     #parser.add_argument('--log_file', action=str, help='Where to save logfile (non-default)')
     args = parser.parse_args()
-
+
     print(f"Is CUDA available: {torch.cuda.is_available()}")
     # True
     print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
     # Tesla T4
 
-
     # Since this is running in HF....
     args.user_interface = True
     if args.user_interface:
@@ -1507,12 +1477,12 @@ if __name__ == "__main__":
         logging.info('Starting the transcription and summarization process.')
         logging.info(f'Input path: {args.input_path}')
         logging.info(f'API Name: {args.api_name}')
-        logging.debug(f'API Key: {args.api_key}') # ehhhhh
         logging.info(f'Number of speakers: {args.num_speakers}')
         logging.info(f'Whisper model: {args.whisper_model}')
         logging.info(f'Offset: {args.offset}')
         logging.info(f'VAD filter: {args.vad_filter}')
-        logging.info(f'Log Level: {args.log_level}') #lol
 
         if args.api_name and args.api_key:
             logging.info(f'API: {args.api_name}')
@@ -1529,13 +1499,13 @@ if __name__ == "__main__":
 
         # Hey, we're in HuggingFace
         launch_ui(demo_mode=args.demo_mode)
-
+
         try:
-            results = main(input_path, api_name=api_name, api_key=api_key, num_speakers=num_speakers, whisper_model="small.en", offset=offset, vad_filter=vad_filter, download_video_flag=download_video_flag)
-            results = main(args.input_path, api_name=args.api_name, api_key=args.api_key, num_speakers=args.num_speakers, whisper_model="small.en", offset=args.offset, vad_filter=args.vad_filter, download_video_flag=args.video)
+            results = main(args.input_path, api_name=args.api_name, api_key=args.api_key,
+                           num_speakers=args.num_speakers, whisper_model="small.en", offset=args.offset,
+                           vad_filter=args.vad_filter, download_video_flag=args.video)
             logging.info('Transcription process completed.')
         except Exception as e:
             logging.error('An error occurred during the transcription process.')
             logging.error(str(e))
             sys.exit(1)
-
1
  #!/usr/bin/env python3
2
+ import argparse
3
+ import configparser
4
+ import json
5
+ import logging
6
+ import os
7
+ import platform
8
+ import requests
9
+ import shutil
10
+ import subprocess
11
+ import sys
12
+ import time
13
+ import unicodedata
14
  import zipfile
15
+
16
+ import gradio as gr
 
17
  import torch
18
  import yt_dlp
19
 
 
20
  #######
21
  # Function Sections
22
  #
 
42
  #
43
 
44
 
 
45
  ####
46
  #
47
  # TL/DW: Too Long Didn't Watch
 
59
  # Download Audio+Video from URL -> Transcribe audio from Video:**
60
  # python summarize.py -v https://www.youtube.com/watch?v=4nd1CDZP21s`
61
  #
62
+ # Download Audio only from URL -> Transcribe audio -> Summarize using (`anthropic`/`cohere`/`openai`/`llama` (
63
+ # llama.cpp)/`ooba` (oobabooga/text-gen-webui)/`kobold` (kobold.cpp)/`tabby` (Tabbyapi)) API:** python summarize.py
64
+ # -v https://www.youtube.com/watch?v=4nd1CDZP21s -api <your choice of API>` - Make sure to put your API key into
65
+ # `config.txt` under the appropriate API variable
66
  #
67
  # Download Audio+Video from a list of videos in a text file (can be file paths or URLs) and have them all summarized:**
68
  # python summarize.py ./local/file_on_your/system --api_name <API_name>`
 
98
  huggingface_api_key = config.get('API', 'huggingface_api_key', fallback=None)
99
  logging.debug(f"Loaded HuggingFace Face API Key: {huggingface_api_key}")
100
 
 
101
  # Models
102
  anthropic_model = config.get('API', 'anthropic_model', fallback='claude-3-sonnet-20240229')
103
  cohere_model = config.get('API', 'cohere_model', fallback='command-r-plus')
 
127
  #######################
128
 
129
  # Dirty hack - sue me.
130
+ os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
131
 
132
+ whisper_models = ["small", "medium", "small.en", "medium.en"]
133
  source_languages = {
134
  "en": "English",
135
  "zh": "Chinese",
 
141
  }
142
  source_language_list = [key[0] for key in source_languages.items()]
143
 
144
+ print(r"""
145
+ _____ _ ________ _ _
 
 
146
  |_ _|| | / /| _ \| | | | _
147
  | | | | / / | | | || | | |(_)
148
  | | | | / / | | | || |/\| |
 
173
 
174
  # Perform Platform Check
175
  userOS = ""
176
+
177
+
178
  def platform_check():
179
  global userOS
180
  if platform.system() == "Linux":
 
188
  exit()
189
 
190
 
 
191
  # Check for NVIDIA GPU and CUDA availability
192
  def cuda_check():
193
  global processing_choice
 
204
  processing_choice = "cpu" # Set processing_choice to cpu if nvidia-smi command fails
205
 
206
 
 
207
  # Ask user if they would like to use either their GPU or their CPU for transcription
208
  def decide_cpugpu():
209
  global processing_choice
 
220
  print("Invalid choice. Please select either GPU or CPU.")
221
 
222
 
 
223
  # check for existence of ffmpeg
224
  def check_ffmpeg():
225
  if shutil.which("ffmpeg") or (os.path.exists("Bin") and os.path.isfile(".\\Bin\\ffmpeg.exe")):
 
227
  pass
228
  else:
229
  logging.debug("ffmpeg not installed on the local system/in local PATH")
230
+ print(
231
+ "ffmpeg is not installed.\n\n You can either install it manually, or through your package manager of choice.\n Windows users, builds are here: https://www.gyan.dev/ffmpeg/builds/")
232
  if userOS == "Windows":
233
  download_ffmpeg()
234
  elif userOS == "Linux":
235
+ print(
236
+ "You should install ffmpeg using your platform's appropriate package manager, 'apt install ffmpeg','dnf install ffmpeg' or 'pacman', etc.")
237
  else:
238
  logging.debug("running an unsupported OS")
239
+ print("You're running an unsupported/Un-tested OS")
240
  exit_script = input("Let's exit the script, unless you're feeling lucky? (y/n)")
241
  if exit_script == "y" or "yes" or "1":
242
  exit()
243
 
244
 
 
245
  # Download ffmpeg
246
  def download_ffmpeg():
247
  user_choice = input("Do you want to download ffmpeg? (y)Yes/(n)No: ")
 
249
  print("Downloading ffmpeg")
250
  url = "https://www.gyan.dev/ffmpeg/builds/ffmpeg-release-essentials.zip"
251
  response = requests.get(url)
252
+
253
  if response.status_code == 200:
254
  print("Saving ffmpeg zip file")
255
  logging.debug("Saving ffmpeg zip file")
256
  zip_path = "ffmpeg-release-essentials.zip"
257
  with open(zip_path, 'wb') as file:
258
  file.write(response.content)
259
+
260
  logging.debug("Extracting the 'ffmpeg.exe' file from the zip")
261
  print("Extracting ffmpeg.exe from zip file to '/Bin' folder")
262
  with zipfile.ZipFile(zip_path, 'r') as zip_ref:
263
  ffmpeg_path = "ffmpeg-7.0-essentials_build/bin/ffmpeg.exe"
264
+
265
  logging.debug("checking if the './Bin' folder exists, creating if not")
266
  bin_folder = "Bin"
267
  if not os.path.exists(bin_folder):
268
  logging.debug("Creating a folder for './Bin', it didn't previously exist")
269
  os.makedirs(bin_folder)
270
+
271
  logging.debug("Extracting 'ffmpeg.exe' to the './Bin' folder")
272
  zip_ref.extract(ffmpeg_path, path=bin_folder)
273
+
274
  logging.debug("Moving 'ffmpeg.exe' to the './Bin' folder")
275
  src_path = os.path.join(bin_folder, ffmpeg_path)
276
  dst_path = os.path.join(bin_folder, "ffmpeg.exe")
277
  shutil.move(src_path, dst_path)
278
+
279
  logging.debug("Removing ffmpeg zip file")
280
  print("Deleting zip file (we've already extracted ffmpeg.exe, no worries)")
281
  os.remove(zip_path)
 
289
  logging.debug("User chose to not download ffmpeg")
290
  print("ffmpeg will not be downloaded.")
291
 
292
+
293
+ #
294
  #
295
  ####################################################################################################################################
296
 
297
 
 
 
 
 
 
298
  ####################################################################################################################################
299
  # Processing Paths and local file handling
300
  #
 
306
  with open(file_path, 'r') as file:
307
  for line in file:
308
  line = line.strip()
309
+ if line and not os.path.exists(
310
+ os.path.join('results', normalize_title(line.split('/')[-1].split('.')[0]) + '.json')):
311
  logging.debug("line successfully imported from file and added to list to be transcribed")
312
  paths.append(line)
313
  return paths
314
 
315
 
 
316
  def process_path(path):
317
  """ Decides whether the path is a URL or a local file and processes accordingly. """
318
  if path.startswith('http'):
 
326
  return None
327
 
328
 
 
329
  # FIXME
330
  def process_local_file(file_path):
331
  logging.info(f"Processing local file: {file_path}")
 
335
  download_path = create_download_directory(title)
336
  logging.debug(f"Converting '{title}' to an audio file (wav).")
337
  audio_file = convert_to_wav(file_path) # Assumes input files are videos needing audio extraction
338
+ logging.debug(f"'{title}' successfully converted to an audio file (wav).")
339
  return download_path, info_dict, audio_file
 
 
 
 
 
340
 
341
 
342
+ #
343
+ #
344
+ ####################################################################################################################################
345
 
346
 
347
  ####################################################################################################################################
348
  # Video Download/Handling
349
  #
350
 
351
+ def process_url(input_path, num_speakers=2, whisper_model="small.en", custom_prompt=None, offset=0, api_name=None,
352
+ api_key=None, vad_filter=False, download_video_flag=False, demo_mode=False):
353
  if demo_mode:
354
  api_name = "huggingface"
355
  api_key = os.environ.get(HF_TOKEN)
356
  print("HUGGINGFACE API KEY CHECK #3: " + api_key)
357
  vad_filter = False
358
  download_video_flag = False
359
+
360
  try:
361
+ results = main(input_path, api_name=api_name, api_key=api_key, num_speakers=num_speakers,
362
+ whisper_model=whisper_model, offset=offset, vad_filter=vad_filter,
363
+ download_video_flag=download_video_flag)
364
 
365
  if results:
366
  transcription_result = results[0]
 
384
  return None, error_message, None, None
385
 
386
 
 
387
  def create_download_directory(title):
388
  base_dir = "Results"
389
  # Remove characters that are illegal in Windows filenames and normalize
 
398
  return session_path
399
 
400
 
 
401
  def normalize_title(title):
402
  # Normalize the string to 'NFKD' form and encode to 'ascii' ignoring non-ascii characters
403
  title = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore').decode('ascii')
404
+ title = title.replace('/', '_').replace('\\', '_').replace(':', '_').replace('"', '').replace('*', '').replace('?',
405
+ '').replace(
406
+ '<', '').replace('>', '').replace('|', '')
407
  return title
408
 
409
 
 
410
  def get_youtube(video_url):
411
  ydl_opts = {
412
  'format': 'bestaudio[ext=m4a]',
 
421
  return info_dict
422
 
423
 
 
424
  def get_playlist_videos(playlist_url):
425
  ydl_opts = {
426
  'extract_flat': True,
 
440
  return [], None
441
 
442
 
 
443
  def save_to_file(video_urls, filename):
444
  with open(filename, 'w') as file:
445
  file.write('\n'.join(video_urls))
446
  print(f"Video URLs saved to {filename}")
447
 
448
 
 
449
  def download_video(video_url, download_path, info_dict, download_video_flag):
450
  logging.debug("About to normalize downloaded video title")
451
  title = normalize_title(info_dict['title'])
452
+
453
  if download_video_flag == False:
454
  file_path = os.path.join(download_path, f"{title}.m4a")
455
  ydl_opts = {
 
472
  'format': 'bestaudio[ext=m4a]',
473
  'outtmpl': audio_file_path,
474
  }
475
+
476
  with yt_dlp.YoutubeDL(ydl_opts_video) as ydl:
477
  logging.debug("yt_dlp: About to download video with youtube-dl")
478
  ydl.download([video_url])
479
  logging.debug("yt_dlp: Video successfully downloaded with youtube-dl")
480
+
481
  with yt_dlp.YoutubeDL(ydl_opts_audio) as ydl:
482
  logging.debug("yt_dlp: About to download audio with youtube-dl")
483
  ydl.download([video_url])
 
506
  '-c:a', 'copy',
507
  output_file_path
508
  ]
509
+ subprocess.run(ffmpeg_command, check=True)
510
  else:
511
  logging.error("You shouldn't be here...")
512
  exit()
513
  os.remove(video_file_path)
514
  os.remove(audio_file_path)
 
 
 
 
515
 
516
+ return output_file_path
517
 
518
 
519
  #
 
521
  ####################################################################################################################################
522
 
523
 
 
 
 
 
524
  ####################################################################################################################################
525
  # Audio Transcription
526
  #
 
544
  ffmpeg_cmd = 'ffmpeg' # Assume 'ffmpeg' is in PATH for non-Windows systems
545
 
546
  command = [
547
+ ffmpeg_cmd, # Assuming the working directory is correctly set where .\Bin exists
548
+ "-ss", "00:00:00", # Start at the beginning of the video
549
  "-i", video_file_path,
550
+ "-ar", "16000", # Audio sample rate
551
+ "-ac", "1", # Number of audio channels
552
+ "-c:a", "pcm_s16le", # Audio codec
553
  out_path
554
  ]
555
  try:
 
581
  return out_path
582
 
583
 
 
584
  # Transcribe .wav into .segments.json
585
  def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='small.en', vad_filter=False):
586
  logging.info('Loading faster_whisper model: %s', whisper_model)
 
599
  with open(out_file) as f:
600
  segments = json.load(f)
601
  return segments
602
+
603
  logging.info('Starting transcription...')
604
  options = dict(language=selected_source_lang, beam_size=5, best_of=5, vad_filter=vad_filter)
605
  transcribe_options = dict(task="transcribe", **options)
 
621
  logging.error("Error transcribing audio: %s", str(e))
622
  raise RuntimeError("Error transcribing audio")
623
  return segments
624
+
625
+
626
  #
627
  #
628
  ####################################################################################################################################
629
 
630
 
 
 
 
 
631
  ####################################################################################################################################
632
  # Diarization
633
  #
634
  # TODO: https://huggingface.co/pyannote/speaker-diarization-3.1
635
  # embedding_model = "pyannote/embedding", embedding_size=512
636
  # embedding_model = "speechbrain/spkrec-ecapa-voxceleb", embedding_size=192
637
+ # def speaker_diarize(video_file_path, segments, embedding_model = "pyannote/embedding", embedding_size=512, num_speakers=0):
638
+ # """
639
+ # 1. Generating speaker embeddings for each segments.
640
+ # 2. Applying agglomerative clustering on the embeddings to identify the speaker for each segment.
641
+ # """
642
+ # try:
643
+ # from pyannote.audio import Audio
644
+ # from pyannote.core import Segment
645
+ # from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
646
+ # import numpy as np
647
+ # import pandas as pd
648
+ # from sklearn.cluster import AgglomerativeClustering
649
+ # from sklearn.metrics import silhouette_score
650
+ # import tqdm
651
+ # import wave
652
+ #
653
+ # embedding_model = PretrainedSpeakerEmbedding( embedding_model, device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
654
+ #
655
+ #
656
+ # _,file_ending = os.path.splitext(f'{video_file_path}')
657
+ # audio_file = video_file_path.replace(file_ending, ".wav")
658
+ # out_file = video_file_path.replace(file_ending, ".diarize.json")
659
+ #
660
+ # logging.debug("getting duration of audio file")
661
+ # with contextlib.closing(wave.open(audio_file,'r')) as f:
662
+ # frames = f.getnframes()
663
+ # rate = f.getframerate()
664
+ # duration = frames / float(rate)
665
+ # logging.debug("duration of audio file obtained")
666
+ # print(f"duration of audio file: {duration}")
667
+ #
668
+ # def segment_embedding(segment):
669
+ # logging.debug("Creating embedding")
670
+ # audio = Audio()
671
+ # start = segment["start"]
672
+ # end = segment["end"]
673
+ #
674
+ # # Enforcing a minimum segment length
675
+ # if end-start < 0.3:
676
+ # padding = 0.3-(end-start)
677
+ # start -= padding/2
678
+ # end += padding/2
679
+ # print('Padded segment because it was too short:',segment)
680
+ #
681
+ # # Whisper overshoots the end timestamp in the last segment
682
+ # end = min(duration, end)
683
+ # # clip audio and embed
684
+ # clip = Segment(start, end)
685
+ # waveform, sample_rate = audio.crop(audio_file, clip)
686
+ # return embedding_model(waveform[None])
687
+ #
688
+ # embeddings = np.zeros(shape=(len(segments), embedding_size))
689
+ # for i, segment in enumerate(tqdm.tqdm(segments)):
690
+ # embeddings[i] = segment_embedding(segment)
691
+ # embeddings = np.nan_to_num(embeddings)
692
+ # print(f'Embedding shape: {embeddings.shape}')
693
+ #
694
+ # if num_speakers == 0:
695
+ # # Find the best number of speakers
696
+ # score_num_speakers = {}
697
+ #
698
+ # for num_speakers in range(2, 10+1):
699
+ # clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
700
+ # score = silhouette_score(embeddings, clustering.labels_, metric='euclidean')
701
+ # score_num_speakers[num_speakers] = score
702
+ # best_num_speaker = max(score_num_speakers, key=lambda x:score_num_speakers[x])
703
+ # print(f"The best number of speakers: {best_num_speaker} with {score_num_speakers[best_num_speaker]} score")
704
+ # else:
705
+ # best_num_speaker = num_speakers
706
+ #
707
+ # # Assign speaker label
708
+ # clustering = AgglomerativeClustering(best_num_speaker).fit(embeddings)
709
+ # labels = clustering.labels_
710
+ # for i in range(len(segments)):
711
+ # segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
712
+ #
713
+ # with open(out_file,'w') as f:
714
+ # f.write(json.dumps(segments, indent=2))
715
+ #
716
+ # # Make CSV output
717
+ # def convert_time(secs):
718
+ # return datetime.timedelta(seconds=round(secs))
719
+ #
720
+ # objects = {
721
+ # 'Start' : [],
722
+ # 'End': [],
723
+ # 'Speaker': [],
724
+ # 'Text': []
725
+ # }
726
+ # text = ''
727
+ # for (i, segment) in enumerate(segments):
728
+ # if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
729
+ # objects['Start'].append(str(convert_time(segment["start"])))
730
+ # objects['Speaker'].append(segment["speaker"])
731
+ # if i != 0:
732
+ # objects['End'].append(str(convert_time(segments[i - 1]["end"])))
733
+ # objects['Text'].append(text)
734
+ # text = ''
735
+ # text += segment["text"] + ' '
736
+ # objects['End'].append(str(convert_time(segments[i - 1]["end"])))
737
+ # objects['Text'].append(text)
738
+ #
739
+ # save_path = video_file_path.replace(file_ending, ".csv")
740
+ # df_results = pd.DataFrame(objects)
741
+ # df_results.to_csv(save_path)
742
+ # return df_results, save_path
743
+ #
744
+ # except Exception as e:
745
+ # raise RuntimeError("Error Running inference with local model", e)
746
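If this diarizer is ever re-enabled, a minimal usage sketch would look like the following (assuming `segments` is the list of Whisper segments produced by the transcription step; "video.mp4" is a placeholder path, not a file this script provides):

    # num_speakers=0 asks speaker_diarize() to pick the speaker count by silhouette score
    df_results, csv_path = speaker_diarize("video.mp4", segments, num_speakers=0)
    print(f"Diarized transcript written to: {csv_path}")
    print(df_results.head())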
  #
  #
  ####################################################################################################################################


  ####################################################################################################################################
  # Summarizers
  #

      return text

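Every summarize_with_* helper below follows the same contract: load the *.segments.json file produced by transcription, flatten the segments to plain text, post the text plus the custom prompt to the service, and return a summary string (or an error string/None on failure). A minimal sketch of that shared shape, with an illustrative name only:

    def summarize_with_example(api_key, file_path, model, custom_prompt):
        # Load the transcript segments produced by the transcription step
        with open(file_path, 'r') as file:
            segments = json.load(file)
        text = extract_text_from_segments(segments)
        # Each backend then POSTs f"{text} \n\n\n\n{custom_prompt}" to its API
        # and returns the summary string parsed from the JSON response.
        ...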
+ def summarize_with_openai(api_key, file_path, model, custom_prompt):
      try:
          logging.debug("openai: Loading json data for summarization")
          with open(file_path, 'r') as file:
              segments = json.load(file)
+
          logging.debug("openai: Extracting text from the segments")
          text = extract_text_from_segments(segments)

          headers = {
              'Authorization': f'Bearer {api_key}',
              'Content-Type': 'application/json'
          }
+
+         logging.debug(f"openai: API Key is: {api_key}")
          logging.debug("openai: Preparing data + prompt for submittal")
+         openai_prompt = f"{text} \n\n\n\n{custom_prompt}"
          data = {
              "model": model,
              "messages": [

          }
          logging.debug("openai: Posting request")
          response = requests.post('https://api.openai.com/v1/chat/completions', headers=headers, json=data)
+
          if response.status_code == 200:
              summary = response.json()['choices'][0]['message']['content'].strip()
              logging.debug("openai: Summarization successful")

          return None

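For reference, summarize_with_openai() reads response.json()['choices'][0]['message']['content']; a trimmed sketch of the chat-completions response shape it expects (values illustrative, only the fields this script touches are shown), which the Anthropic helper below mirrors with response_data['content'][0]['text']:

    # Illustrative response body, not real API output
    example_openai_response = {
        "choices": [
            {"message": {"role": "assistant", "content": "The summary text..."}}
        ]
    }
    summary = example_openai_response['choices'][0]['message']['content'].strip()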
+ def summarize_with_claude(api_key, file_path, model, custom_prompt):
      try:
          logging.debug("anthropic: Loading JSON data")
          with open(file_path, 'r') as file:
              segments = json.load(file)
+
          logging.debug("anthropic: Extracting text from the segments file")
          text = extract_text_from_segments(segments)

          headers = {
              'anthropic-version': '2023-06-01',
              'Content-Type': 'application/json'
          }
+
+         anthropic_prompt = custom_prompt
+         logging.debug(f"anthropic: Prompt is {anthropic_prompt}")
          user_message = {
              "role": "user",
+             "content": f"{text} \n\n\n\n{anthropic_prompt}"
          }

          data = {
              "model": model,
+             "max_tokens": 4096,  # max _possible_ tokens to return
              "messages": [user_message],
              "stop_sequences": ["\n\nHuman:"],
              "temperature": 0.7,

              "stream": False,
              "system": "You are a professional summarizer."
          }
+
          logging.debug("anthropic: Posting request to API")
          response = requests.post('https://api.anthropic.com/v1/messages', headers=headers, json=data)
+
          # Check if the status code indicates success
          if response.status_code == 200:
              logging.debug("anthropic: Post submittal successful")
              response_data = response.json()
              try:
                  summary = response_data['content'][0]['text'].strip()
+                 logging.debug("anthropic: Summarization successful")
                  print("Summary processed successfully.")
                  return summary
              except (IndexError, KeyError) as e:

          return None


  # Summarize with Cohere
+ def summarize_with_cohere(api_key, file_path, model, custom_prompt):
      try:
          logging.basicConfig(level=logging.DEBUG)
          logging.debug("cohere: Loading JSON data")

              'Authorization': f'Bearer {api_key}'
          }

+         cohere_prompt = f"{text} \n\n\n\n{custom_prompt}"
+         logging.debug(f"cohere: Prompt being sent is {cohere_prompt}")
+
          data = {
              "chat_history": [
                  {"role": "USER", "message": cohere_prompt}

              logging.error("Expected data not found in API response.")
              return "Expected data not found in API response."
          else:
+             logging.error(f"cohere: API request failed with status code {response.status_code}: {response.text}")
              print(f"Failed to process summary, status code {response.status_code}: {response.text}")
              return f"cohere: API request failed: {response.text}"

          return f"cohere: Error occurred while processing summary with Cohere: {str(e)}"


  # https://console.groq.com/docs/quickstart
+ def summarize_with_groq(api_key, file_path, model, custom_prompt):
      try:
          logging.debug("groq: Loading JSON data")
          with open(file_path, 'r') as file:

              'Content-Type': 'application/json'
          }

+         groq_prompt = f"{text} \n\n\n\n{custom_prompt}"
+         logging.debug(f"groq: Prompt being sent is {groq_prompt}")
+
          data = {
              "messages": [
                  {

  #
  # Local Summarization

+ def summarize_with_llama(api_url, file_path, token, custom_prompt):
      try:
          logging.debug("llama: Loading JSON data")
          with open(file_path, 'r') as file:

              'accept': 'application/json',
              'content-type': 'application/json',
          }
+         if token and len(token) > 5:
              headers['Authorization'] = f'Bearer {token}'

+         llama_prompt = f"{text} \n\n\n\n{custom_prompt}"
+         logging.debug(f"llama: Prompt being sent is {llama_prompt}")

          data = {
              "prompt": llama_prompt
          }

+         logging.debug("llama: Submitting request to API endpoint")
          print("llama: Submitting request to API endpoint")
          response = requests.post(api_url, headers=headers, json=data)
          response_data = response.json()

          return f"llama: Error occurred while processing summary with llama: {str(e)}"


  # https://lite.koboldai.net/koboldcpp_api#/api%2Fv1/post_api_v1_generate
+ def summarize_with_kobold(api_url, file_path, custom_prompt):
      try:
          logging.debug("kobold: Loading JSON data")
          with open(file_path, 'r') as file:

              'accept': 'application/json',
              'content-type': 'application/json',
          }
+
+         kobold_prompt = f"{text} \n\n\n\n{custom_prompt}"
+         logging.debug(f"kobold: Prompt being sent is {kobold_prompt}")
+
          # FIXME

          # Values literally c/p from the api docs....
          data = {
              "max_context_length": 8096,

          return f"kobold: Error occurred while processing summary with kobold: {str(e)}"


  # https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API
+ def summarize_with_oobabooga(api_url, file_path, custom_prompt):
      try:
          logging.debug("ooba: Loading JSON data")
          with open(file_path, 'r') as file:

              'content-type': 'application/json',
          }

+         # prompt_text = "I like to eat cake and bake cakes. I am a baker. I work in a French bakery baking cakes. It is a fun job. I have been baking cakes for ten years. I also bake lots of other baked goods, but cakes are my favorite."
+         # prompt_text += f"\n\n{text}"  # Uncomment this line if you want to include the text variable
+         ooba_prompt = f"{text}\n\n\n\n{custom_prompt}"
+         logging.debug(f"ooba: Prompt being sent is {ooba_prompt}")

+         data = {
              "mode": "chat",
              "character": "Example",
+             "messages": [{"role": "user", "content": ooba_prompt}]
          }

          logging.debug("ooba: Submitting request to API endpoint")

          return f"ooba: Error occurred while processing summary with oobabooga: {str(e)}"


  def save_summary_to_file(summary, file_path):
      summary_file_path = file_path.replace('.segments.json', '_summary.txt')
      logging.debug("Opening summary file for writing; deriving *_summary.txt from *.segments.json")

          file.write(summary)
      logging.info(f"Summary saved to file: {summary_file_path}")

+
  #
  #
  ####################################################################################################################################


  ####################################################################################################################################
  # Gradio UI
  #

              response_data = response.json()
              wait_time = response_data.get('estimated_time', 10)
+             # Sleep before retrying....
+             time.sleep(wait_time)
              return None, f"Model is loading, retrying in {int(wait_time)} seconds..."

      if api_key == "":
          api_key = os.environ.get(HF_TOKEN)

      logging.debug("huggingface: Loading json data for summarization")
      with open(file_path, 'r') as file:
          segments = json.load(file)
+
      logging.debug("huggingface: Extracting text from the segments")
      text = ' '.join([segment['text'] for segment in segments])

      api_key = os.environ.get(HF_TOKEN)
      logging.debug("HUGGINGFACE API KEY CHECK #2: " + api_key)

      logging.debug("huggingface: Submitting request...")
      response = requests.post(API_URL, headers=headers, json=data)
+
      if response.status_code == 200:
          summary = response.json()[0]['summary_text']
          logging.debug("huggingface: Summarization successful")

          print(f"Error occurred while processing summary with huggingface: {str(e)}")
          return None

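One note on the Hugging Face branch above: the `estimated_time` field comes from the Inference API's "model is still loading" response, so the helper sleeps before reporting a retry message rather than returning a summary. A minimal sketch of handling that status explicitly, assuming (as the documented behavior suggests) the loading case arrives as a 503 with an `estimated_time` field:

    import time
    import requests

    def post_with_one_retry(api_url, headers, data):
        # 503 is the Inference API's "model is loading" response; it carries
        # an estimated_time field, as used by summarize_with_huggingface above.
        response = requests.post(api_url, headers=headers, json=data)
        if response.status_code == 503:
            time.sleep(response.json().get('estimated_time', 10))
            response = requests.post(api_url, headers=headers, json=data)  # one retry
        return response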
  def same_auth(username, password):
      return username == password


  def format_transcription(transcription_result):
      if transcription_result:
          json_data = transcription_result['transcription']

      return ""


+ def process_text(api_key, text_file):
+     summary, message = summarize_with_huggingface(api_key, text_file)

      if summary:
          # Show the summary on success
+         return "Summary:", summary
      else:
          # Inform the user about the load/wait time
+         return "Notice:", message

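One contract worth noting: process_text unpacks a (summary, message) pair, so it assumes summarize_with_huggingface returns two values, and its own return value is a (label, body) pair. An illustrative call (arguments here are placeholders):

    label, body = process_text(api_key="hf_...", text_file="talk.segments.json")
    print(label, body)  # either ("Summary:", <summary>) or ("Notice:", <status message>)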
  def launch_ui(demo_mode=False):
+     def process_url(url, num_speakers, whisper_model, custom_prompt, offset, api_name, api_key, vad_filter,
+                     download_video):
+         try:
+             # Assuming 'main' is the function that handles the processing logic.
+             # Adjust parameters as needed based on your actual 'main' function implementation.
+             results = main(url, api_name=api_name, api_key=api_key, num_speakers=num_speakers,
+                            whisper_model=whisper_model, offset=offset, vad_filter=vad_filter,
+                            download_video_flag=download_video, custom_prompt=custom_prompt)
+
+             if results:
+                 transcription_result = results[0]
+                 json_data = transcription_result['transcription']
+                 summary_file_path = transcription_result.get('summary', "Summary not available.")
+                 json_file_path = transcription_result['audio_file'].replace('.wav', '.segments.json')
+                 video_file_path = transcription_result.get('video_path', None)
+                 return json_data, summary_file_path, json_file_path, summary_file_path, video_file_path
+             else:
+                 return "No results found.", "No summary available.", None, None, None
+         except Exception as e:
+             return str(e), "Error processing the request.", None, None, None

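The 5-tuple returned by process_url fills the `outputs` list below positionally, so the mapping is:

    # Position in the returned 5-tuple -> output component below:
    #   json_data          -> Textbox "Transcription"
    #   summary_file_path  -> Textbox "Summary or Status Message"
    #   json_file_path     -> File    "Download Transcription as JSON"
    #   summary_file_path  -> File    "Download Summary as Text"
    #   video_file_path    -> File    "Download Video"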
      inputs = [
+         gr.components.Textbox(label="URL", placeholder="Enter the video URL here"),
+         gr.components.Number(value=2, label="Number of Speakers"),
+         gr.components.Dropdown(choices=whisper_models, value="small.en", label="Whisper Model"),
+         gr.components.Textbox(label="Custom Prompt", placeholder="Enter a custom prompt here", lines=3),
+         gr.components.Number(value=0, label="Offset"),
+         gr.components.Dropdown(
+             choices=["huggingface", "openai", "anthropic", "cohere", "groq", "llama", "kobold", "ooba"],
+             label="API Name"),
+         gr.components.Textbox(label="API Key", placeholder="Enter your API key here"),
+         gr.components.Checkbox(label="VAD Filter", value=False),
+         gr.components.Checkbox(label="Download Video", value=False)
      ]

+     outputs = [
+         gr.components.Textbox(label="Transcription"),
+         gr.components.Textbox(label="Summary or Status Message"),
+         gr.components.File(label="Download Transcription as JSON", visible=lambda x: x is not None),
+         gr.components.File(label="Download Summary as Text", visible=lambda x: x is not None),
+         gr.components.File(label="Download Video", visible=lambda x: x is not None)
+     ]

      iface = gr.Interface(
+         fn=process_url,
          inputs=inputs,
+         outputs=outputs,
          title="Video Transcription and Summarization",
+         description="Submit a video URL for transcription and summarization. Ensure you input all necessary information, including API keys.",
+         theme="bethecloud/storj_theme"  # Adjust the theme as necessary
      )

+     iface.launch(share=False)
+
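The same_auth helper defined earlier is not wired in here; if simple authentication is wanted, Gradio's launch() accepts an auth callable, so a sketch would be:

    iface.launch(share=False, auth=same_auth)  # same_auth as defined above; demo-grade check only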

  #
  #
  ####################################################################################################################################


  ####################################################################################################################################
  # Main()
  #

+ def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model="small.en", offset=0, vad_filter=False,
+          download_video_flag=False, demo_mode=False, custom_prompt=None):
      if input_path is None and args.user_interface:
          return []
      start_time = time.monotonic()

          paths = [input_path]
      elif (info_dict := get_youtube(input_path)) and 'entries' in info_dict:
          logging.debug("MAIN: YouTube playlist detected")
+         print(
+             "\n\nSorry, but playlists aren't currently supported. You can run the following command to generate a text file that you can then pass into this script, though! (It may not work... playlist support seems spotty)" + """\n\n\tpython Get_Playlist_URLs.py <Youtube Playlist URL>\n\n\tThen,\n\n\tpython diarizer.py <playlist text file name>\n\n""")
          return
      else:
          paths = [input_path]

                  logging.debug("MAIN: Video downloaded successfully")
                  logging.debug("MAIN: Converting video file to WAV...")
                  audio_file = convert_to_wav(video_path, offset)
+                 logging.debug("MAIN: Audio file converted successfully")
          else:
              if os.path.exists(path):
                  logging.debug("MAIN: Local file path detected")

              results.append(transcription_result)
              logging.info(f"Transcription complete: {audio_file}")

              # Perform summarization based on the specified API
              if api_name and api_key:
                  logging.debug(f"MAIN: Summarization being performed by {api_name}")
                  json_file_path = audio_file.replace('.wav', '.segments.json')
                  if api_name.lower() == 'openai':
+                     api_key = openai_api_key
                      try:
+                         logging.debug("MAIN: Trying to summarize with openAI")
+                         summary = summarize_with_openai(api_key, json_file_path, openai_model, custom_prompt)
                      except requests.exceptions.ConnectionError:
+                         logging.error("MAIN: Connection error while summarizing with openAI")
+                 elif api_name.lower() == "anthropic":
+                     api_key = anthropic_api_key
                      try:
+                         logging.debug("MAIN: Trying to summarize with anthropic")
+                         summary = summarize_with_claude(api_key, json_file_path, anthropic_model, custom_prompt)
                      except requests.exceptions.ConnectionError:
+                         logging.error("MAIN: Connection error while summarizing with anthropic")
+                 elif api_name.lower() == "cohere":
+                     api_key = cohere_api_key
                      try:
+                         logging.debug("MAIN: Trying to summarize with cohere")
+                         summary = summarize_with_cohere(api_key, json_file_path, cohere_model, custom_prompt)
                      except requests.exceptions.ConnectionError:
+                         logging.error("MAIN: Connection error while summarizing with cohere")
+                 elif api_name.lower() == "groq":
+                     api_key = groq_api_key
                      try:
+                         logging.debug("MAIN: Trying to summarize with Groq")
+                         summary = summarize_with_groq(api_key, json_file_path, groq_model, custom_prompt)
                      except requests.exceptions.ConnectionError:
+                         logging.error("MAIN: Connection error while summarizing with Groq")
+                 elif api_name.lower() == "llama":
+                     token = llama_api_key
+                     llama_ip = llama_api_IP
                      try:
+                         logging.debug("MAIN: Trying to summarize with Llama.cpp")
+                         summary = summarize_with_llama(llama_ip, json_file_path, token, custom_prompt)
                      except requests.exceptions.ConnectionError:
+                         logging.error("MAIN: Connection error while summarizing with Llama.cpp")
+                 elif api_name.lower() == "kobold":
+                     token = kobold_api_key
+                     kobold_ip = kobold_api_IP
                      try:
+                         logging.debug("MAIN: Trying to summarize with kobold.cpp")
+                         summary = summarize_with_kobold(kobold_ip, json_file_path, custom_prompt)
                      except requests.exceptions.ConnectionError:
+                         logging.error("MAIN: Connection error while summarizing with kobold.cpp")
+                 elif api_name.lower() == "ooba":
+                     token = ooba_api_key
+                     ooba_ip = ooba_api_IP
                      try:
+                         logging.debug("MAIN: Trying to summarize with oobabooga")
+                         summary = summarize_with_oobabooga(ooba_ip, json_file_path, custom_prompt)
                      except requests.exceptions.ConnectionError:
+                         logging.error("MAIN: Connection error while summarizing with oobabooga")
+                 elif api_name.lower() == "huggingface":
+                     api_key = huggingface_api_key
                      try:
+                         logging.debug("MAIN: Trying to summarize with huggingface")
+                         summary = summarize_with_huggingface(api_key, json_file_path, custom_prompt)
                      except requests.exceptions.ConnectionError:
+                         logging.error("MAIN: Connection error while summarizing with huggingface")

                  else:
                      logging.warning(f"Unsupported API: {api_name}")

      return results


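The if/elif chain above could equally be driven by a lookup table; a sketch of that refactor, using only functions and config names already defined in this file (the helper names here are hypothetical, not part of the script):

    # Hypothetical table-driven dispatch; keys mirror the api_name values above.
    SUMMARIZERS = {
        'openai':    lambda path, prompt: summarize_with_openai(openai_api_key, path, openai_model, prompt),
        'anthropic': lambda path, prompt: summarize_with_claude(anthropic_api_key, path, anthropic_model, prompt),
        'cohere':    lambda path, prompt: summarize_with_cohere(cohere_api_key, path, cohere_model, prompt),
        'kobold':    lambda path, prompt: summarize_with_kobold(kobold_api_IP, path, prompt),
    }

    def dispatch_summarizer(api_name, json_file_path, custom_prompt):
        fn = SUMMARIZERS.get(api_name.lower())
        if fn is None:
            logging.warning(f"Unsupported API: {api_name}")
            return None
        return fn(json_file_path, custom_prompt)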
  if __name__ == "__main__":
      parser = argparse.ArgumentParser(description='Transcribe and summarize videos.')
      parser.add_argument('input_path', type=str, help='Path or URL of the video', nargs='?')
+     parser.add_argument('-v', '--video', action='store_true', help='Download the video instead of just the audio')
      parser.add_argument('-api', '--api_name', type=str, help='API name for summarization (optional)')
+     parser.add_argument('-key', '--api_key', type=str, help='API key for summarization (optional)')  # needed by the args.api_key references below
      parser.add_argument('-ns', '--num_speakers', type=int, default=2, help='Number of speakers (default: 2)')
+     parser.add_argument('-wm', '--whisper_model', type=str, default='small.en',
+                         help='Whisper model (default: small.en)')
      parser.add_argument('-off', '--offset', type=int, default=0, help='Offset in seconds (default: 0)')
      parser.add_argument('-vad', '--vad_filter', action='store_true', help='Enable VAD filter')
+     parser.add_argument('-log', '--log_level', type=str, default='INFO',
+                         choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help='Log level (default: INFO)')
      parser.add_argument('-ui', '--user_interface', action='store_true', help='Launch the Gradio user interface')
      parser.add_argument('-demo', '--demo_mode', action='store_true', help='Enable demo mode')
      # parser.add_argument('--log_file', type=str, help='Where to save the logfile (non-default)')
      args = parser.parse_args()
+
      print(f"Is CUDA available: {torch.cuda.is_available()}")
      # True
      print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
      # Tesla T4

      # Since this is running in HF....
      args.user_interface = True
      if args.user_interface:

      logging.info('Starting the transcription and summarization process.')
      logging.info(f'Input path: {args.input_path}')
      logging.info(f'API Name: {args.api_name}')
+     logging.debug(f'API Key: {args.api_key}')  # ehhhhh
      logging.info(f'Number of speakers: {args.num_speakers}')
      logging.info(f'Whisper model: {args.whisper_model}')
      logging.info(f'Offset: {args.offset}')
      logging.info(f'VAD filter: {args.vad_filter}')
+     logging.info(f'Log Level: {args.log_level}')  # lol

      if args.api_name and args.api_key:
          logging.info(f'API: {args.api_name}')

      # Hey, we're in HuggingFace
      launch_ui(demo_mode=args.demo_mode)
+
      try:
+         results = main(args.input_path, api_name=args.api_name, api_key=args.api_key,
+                        num_speakers=args.num_speakers, whisper_model=args.whisper_model, offset=args.offset,
+                        vad_filter=args.vad_filter, download_video_flag=args.video)
          logging.info('Transcription process completed.')
      except Exception as e:
          logging.error('An error occurred during the transcription process.')
          logging.error(str(e))
          sys.exit(1)