Eason Lu committed on
Commit
f1ae450
1 Parent(s): 1db8078

add youtube task creation

Browse files

Former-commit-id: da4a1419b1c085934eba891dbf1079c2da05888b

.gitignore CHANGED
@@ -10,4 +10,7 @@ test.py
10
  test.srt
11
  test.txt
12
  log_*.csv
13
- log.csv
 
 
 
 
10
  test.srt
11
  test.txt
12
  log_*.csv
13
+ log.csv
14
+ .chroma
15
+ *.ini
16
+ local_dump/
configs/local_launch.yaml ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # launch config for local environment
2
+ model: "gpt-4"
3
+ local_dump: ./local_dump
4
+ output_type: srt
5
+ environ: local
entries/__init_lib_path.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import os
3
+
4
+ def add_path(custom_path):
5
+ if custom_path not in sys.path: sys.path.insert(0, custom_path)
6
+
7
+ this_dir = os.path.dirname(__file__)
8
+
9
+ lib_path = os.path.join(this_dir, '..')
10
+ add_path(lib_path)
entries/run.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import __init_lib_path
2
+ import logging
3
+ from yaml import Loader, Dumper, load, dump
4
+ from src.task import Task
5
+ import openai
6
+ import argparse
7
+ import os
8
+ from pathlib import Path
9
+ from datetime import datetime
10
+ import shutil
11
+
12
+ def parse_args():
13
+ parser = argparse.ArgumentParser()
14
+ parser.add_argument("--link", help="youtube video link here", default=None, type=str, required=False)
15
+ parser.add_argument("--video_file", help="local video path here", default=None, type=str, required=False)
16
+ parser.add_argument("--audio_file", help="local audio path here", default=None, type=str, required=False)
17
+ parser.add_argument("--srt_file", help="srt file input path here", default=None, type=str, required=False)
18
+ parser.add_argument("--continue", help="task_id that need to continue", default=None, type=str, required=False) # need implement
19
+ parser.add_argument("--launch_cfg", help="launch config path", default='./configs/local_launch.yaml', type=str, required=False)
20
+ args = parser.parse_args()
21
+
22
+ return args
23
+
24
+ if __name__ == "__main__":
25
+ args = parse_args()
26
+ launch_cfg = load(open(args.launch_cfg), Loader=Loader)
27
+
28
+ # initialize dir
29
+ local_dir = Path(launch_cfg['local_dump'])
30
+
31
+ # initialize task queue
32
+ if not local_dir.exists():
33
+ local_dir.mkdir(parents=False, exist_ok=False)
34
+ f = open(local_dir.joinpath("task_queue.yaml"), "w")
35
+ f.write("Task Queue: []\n")
36
+ f.close()
37
+
38
+ # get task id
39
+ tasks_queue = load(open(local_dir.joinpath("task_queue.yaml")), Loader = Loader)
40
+ task_list = tasks_queue['Task Queue']
41
+ task_id = len(task_list)
42
+
43
+ # create local dir for the task
44
+ task_dir = local_dir.joinpath(f"task_{task_id}")
45
+ task_dir.mkdir(parents=False, exist_ok=False)
46
+ task_dir.joinpath("results").mkdir(parents=False, exist_ok=False)
47
+ task_dir.joinpath("logs").mkdir(parents=False, exist_ok=False)
48
+ f = open(task_dir.joinpath("task_info.yaml"), "w")
49
+ f.write(f"task_id: {task_id}")
50
+ f.close()
51
+
52
+ logging.basicConfig(level=logging.INFO, handlers=[
53
+ logging.FileHandler(
54
+ "{}/{}_{}.log".format(task_dir.joinpath("logs"), f"task_{task_id}", datetime.now().strftime("%m%d%Y_%H%M%S")),
55
+ 'w', encoding='utf-8')])
56
+
57
+ # task create
58
+ if args.link is not None:
59
+ try:
60
+ task = Task.fromYoutubeLink(args.link, task_id, launch_cfg)
61
+ except:
62
+ shutil.rmtree(task_dir)
63
+ raise RuntimeError("failed to create task from youtube link")
64
+
65
+ # add task to the status queue
66
+ task_list.append({"id": task_id, "status": "created", "resource_status:": "local"})
67
+ stream = open(local_dir.joinpath("task_queue.yaml"), "w")
68
+ dump(tasks_queue, stream)
69
+
70
+ task.run_pipeline()
71
+
72
+
73
+
74
+
75
+
76
+
77
+
78
+
79
+
80
+
entries/web_backend.py ADDED
File without changes
src/task.py CHANGED
@@ -9,6 +9,9 @@ from os import getenv
9
  from enum import Enum
10
  from pathlib import Path
11
  from enum import Enum, auto
 
 
 
12
 
13
 
14
  """
@@ -42,6 +45,7 @@ SRT_Script : SrtScript
42
  """
43
 
44
  class TaskStatus(Enum):
 
45
  INITIALIZING_ASR = (auto(), None)
46
  PRE_PROCESSING = (auto(), None)
47
  TRANSLATING = (auto(), 0.0)
@@ -50,22 +54,43 @@ class TaskStatus(Enum):
50
 
51
 
52
  class Task:
53
- def __init__(self, task_id, audio_path, model, output_type):
54
- # openai.api_key = getenv("OPENAI_API_KEY")
55
- self.audio_path = audio_path
56
- self.model = model
57
  self.gpu_status = 0
58
- self.output_type = output_type
59
  self.task_id = task_id
60
  self.progress = NotImplemented
61
  self.SRT_Script = None
62
- self.local_dump = Path()
63
 
64
  @staticmethod
65
- def fromYoutubeLink(youtube_url):
66
  # convert to audio
 
 
 
67
 
68
- return Task(...)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
  @staticmethod
71
  def fromAudioFile():
 
9
  from enum import Enum
10
  from pathlib import Path
11
  from enum import Enum, auto
12
+ import logging
13
+ import subprocess
14
+
15
 
16
 
17
  """
 
45
  """
46
 
47
  class TaskStatus(Enum):
48
+ CREATED = (auto(), None)
49
  INITIALIZING_ASR = (auto(), None)
50
  PRE_PROCESSING = (auto(), None)
51
  TRANSLATING = (auto(), 0.0)
 
54
 
55
 
56
  class Task:
57
+ def __init__(self, task_id, task_local_dir, launch_info):
58
+ openai.api_key = getenv("OPENAI_API_KEY")
59
+ self.task_local_dir = task_local_dir
60
+ self.model = launch_info["model"]
61
  self.gpu_status = 0
62
+ self.output_type = launch_info["output_type"]
63
  self.task_id = task_id
64
  self.progress = NotImplemented
65
  self.SRT_Script = None
66
+
67
 
68
  @staticmethod
69
+ def fromYoutubeLink(youtube_url, task_id, launch_info):
70
  # convert to audio
71
+ local_dump = Path(launch_info['local_dump'])# should get from launch config
72
+ yt = YouTube(youtube_url)
73
+ video = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first()
74
 
75
+ if video:
76
+ video.download(str(local_dump.joinpath(f"task_{task_id}")), filename=f"task_{task_id}.mp4")
77
+ logging.info(f'Video download completed to {local_dump.joinpath(f"task_{task_id}")}!')
78
+ else:
79
+ raise FileNotFoundError(f"Video stream not found for link {youtube_url}")
80
+
81
+ audio = yt.streams.filter(only_audio=True).first()
82
+ if audio:
83
+ audio.download(str(local_dump.joinpath(f"task_{task_id}")), filename=f"task_{task_id}.mp3")
84
+ logging.info(f'Audio download completed to {local_dump.joinpath(f"task_{task_id}")}!')
85
+ else:
86
+ logging.info("download audio failed, using ffmpeg to extract audio")
87
+ subprocess.run(['ffmpeg', '-i', local_dump.joinpath(f"task_{task_id}").joinpath(f"task_{task_id}.mp4"), '-f', 'mp3', '-ab', '192000', '-vn', local_dump.joinpath(f"task_{task_id}").joinpath(f"task_{task_id}.mp3")])
88
+ logging.info("audio extraction finished")
89
+
90
+ logging.info("Task Creation Complete.")
91
+ logging.info("Task Creation method: Youtube Link")
92
+
93
+ return Task(task_id, local_dump.joinpath(f"task_{task_id}"), launch_info)
94
 
95
  @staticmethod
96
  def fromAudioFile():