Ligeng-Zhu
committed on
Upload files with huggingface_hub
Upload test.py
Upload panda70m_training_full.csv
Upload main.py
Upload panda70m_training_10m.csv
Upload panda70m_validation.csv
Upload panda70m_testing.csv
Upload panda70m_training_2m.csv
- .gitattributes +3 -0
- main.py +87 -0
- panda70m_testing.csv +0 -0
- panda70m_training_10m.csv +3 -0
- panda70m_training_2m.csv +3 -0
- panda70m_training_full.csv +3 -0
- panda70m_validation.csv +0 -0
- test.py +28 -0
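The commit message says these files were pushed with huggingface_hub; a minimal sketch of such an upload, assuming a dataset repo (the repo_id below is a placeholder, not taken from this page):

# Sketch only: push the working directory to a dataset repo with huggingface_hub.
# The repo_id is hypothetical; the folder is assumed to hold main.py, test.py, and the CSVs.
from huggingface_hub import HfApi

api = HfApi()
api.upload_folder(
    folder_path=".",                       # local directory with the files in this commit
    repo_id="your-username/panda70m",      # hypothetical repo id
    repo_type="dataset",
    commit_message="Upload files with huggingface_hub",
)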
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+panda70m_training_full.csv filter=lfs diff=lfs merge=lfs -text
+panda70m_training_10m.csv filter=lfs diff=lfs merge=lfs -text
+panda70m_training_2m.csv filter=lfs diff=lfs merge=lfs -text
main.py
ADDED
@@ -0,0 +1,87 @@
+import json
+import sys, os, os.path as osp
+import yt_dlp
+import asyncio
+
+import fire
+import pandas as pd
+from random import random
+from concurrent.futures import ProcessPoolExecutor
+
+
+def ytb_download(uid, url, json_info, output_dir="ytb_videos/"):
+    os.makedirs(output_dir, exist_ok=True)
+    # uid = url.split("?v=")[-1]
+    yt_opts = {
+        "format": "best",  # Download the best quality available
+        "outtmpl": osp.join(output_dir, f"{uid}.%(ext)s"),  # Set the output template
+        "postprocessors": [
+            {
+                "key": "FFmpegVideoConvertor",
+                "preferedformat": "mp4",  # Convert video to mp4 format
+            }
+        ],
+    }
+
+    video_path = osp.join(output_dir, f"{uid}.mp4")
+    meta_path = osp.join(output_dir, f"{uid}.json")
+    if osp.exists(video_path) and osp.exists(meta_path):
+        print(f"{uid} already labeled.")
+        return 0
+
+    try:
+        with yt_dlp.YoutubeDL(yt_opts) as ydl:
+            ydl.download([url])
+        with open(osp.join(output_dir, f"{uid}.json"), "w") as fp:
+            json.dump(json_info, fp, indent=2)
+        return 0
+    except:
+        return -1
+
+
+async def main(csv_path, max_workers=256, shards=0, total=-1, limit=False):
+    PPE = ProcessPoolExecutor(max_workers=max_workers)
+    loop = asyncio.get_event_loop()
+
+    df = pd.read_csv(csv_path)
+    output_dir = csv_path.split(".")[0]
+
+    tasks = []
+
+    data_list = list(df.iterrows())
+
+    if total > 0:
+        chunk = len(data_list) // total
+        begin_idx = shards * chunk
+        end_idx = (shards + 1) * chunk
+        if shards == total - 1:
+            end_idx = len(data_list)
+        data_list = data_list[begin_idx:end_idx]
+    print(f"download total {len(data_list)} videos")
+
+    for idx, (index, row) in enumerate(data_list):
+        uid = row["videoID"]
+        url = row["url"]
+
+        json_info = {
+            "timestamp": eval(row["timestamp"]),
+            "caption": eval(row["caption"]),
+            "matching_score": eval(row["matching_score"]),
+        }
+
+        tasks.append(
+            loop.run_in_executor(PPE, ytb_download, uid, url, json_info, output_dir)
+        )
+        if idx >= 20 and limit:
+            break
+    res = await asyncio.gather(*tasks)
+
+    print(f"[{sum(res)} / {len(res)}]")
+
+
+def entry(csv="panda70m_testing.csv", shards=0, total=-1, limit=False):
+    asyncio.run(main(csv, shards=shards, total=total, limit=limit))
+
+
+if __name__ == "__main__":
+    fire.Fire(entry)
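Because main.py hands entry to fire.Fire, its keyword arguments double as command-line flags. A small driver sketch, assuming a run over the 2M split with 8 shards and the 20-video limit enabled (the shard count is an illustrative choice, not taken from the commit):

# Sketch only: equivalent to `python main.py --csv=panda70m_training_2m.csv --shards=0 --total=8 --limit=True`.
# The shard count of 8 is an assumed example; limit=True stops queueing after roughly 20 videos.
from main import entry

entry(csv="panda70m_training_2m.csv", shards=0, total=8, limit=True)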
panda70m_testing.csv
ADDED
The diff for this file is too large to render. See raw diff.
panda70m_training_10m.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c7b0eae210bff532b9753a987a4911407d2bfb008cfca2dc3c02957082406e26
+size 1453874594
panda70m_training_2m.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b5ea91bfa797ad82a73d5a3ce354f3f1af80da7feb74837b87c470b2e739ae3d
+size 329152920
panda70m_training_full.csv
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1d32b0fb20505952004299d8be3dfa1b56436f75ce94dbd2c85923065d9238df
+size 8409689605
panda70m_validation.csv
ADDED
The diff for this file is too large to render. See raw diff.
test.py
ADDED
@@ -0,0 +1,28 @@
+from random import random
+import asyncio
+
+
+# task coroutine
+async def task(semaphore, number):
+    # acquire the semaphore
+    async with semaphore:
+        # generate a random delay between 2 and 3 seconds
+        value = random() + 2
+        # block for a moment
+        await asyncio.sleep(value)
+        # report a message
+        print(f"Task {number} got {value}")
+
+
+# main coroutine
+async def main():
+    # create the shared semaphore
+    semaphore = asyncio.Semaphore(2)
+    # create and schedule tasks
+    tasks = [asyncio.create_task(task(semaphore, i)) for i in range(10)]
+    # wait for all tasks to complete
+    _ = await asyncio.wait(tasks)
+
+
+# start the asyncio program
+asyncio.run(main())