Commit 9dfb729 • cocktailpeanut committed • Parent(s): c32e4c7
update

Files changed:
- app.py +40 -39
- requirements.txt +4 -4
app.py CHANGED
@@ -11,23 +11,10 @@ import torchvision.transforms as transforms
 from decord import VideoReader
 from PIL import Image, ImageDraw, ImageFont
 from transformers import AutoModel, AutoTokenizer
+import devicetorch
+
+#import spaces
 
-import spaces
-
-title_markdown = ("""
-<div style="display: flex; justify-content: flex-start; align-items: center; text-align: center;">
-  <div style="margin-right: 20px; display: flex; align-items: center;">
-    <a href="https://github.com/ShareGPT4Omni/ShareGPT4Video" style="text-decoration: none; display: flex; align-items: center;">
-      <img src="https://raw.githubusercontent.com/ShareGPT4V/ShareGPT4V-Resources/master/images/share4video_tight.png" alt="ShareGPT4Video🚀" style="max-width: 120px; height: auto;">
-    </a>
-  </div>
-  <div>
-    <h1>ShareGPT4Video: Improving Video Understanding and Generation with Better Captions</h1>
-    <h5 style="margin: 0;">If you like our project, please give us a star ✨ on Github for the latest update.</h5>
-    <h5 style="margin: 0;"> <a href="https://sharegpt4video.github.io/">[Project Page]</a> <a href="https://github.com/ShareGPT4Omni/ShareGPT4Video">[Code]</a> <a href="https://arxiv.org/abs/2406.04325v1">[Paper]</a>
-  </div>
-</div>
-""")
 
 block_css = """
 #buttons button {
@@ -35,17 +22,14 @@ block_css = """
 }
 """
 
-learn_more_markdown = ("""
-### License
-The service is a research preview intended for non-commercial use only, subject to the model [License](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) of LLaMA, [Terms of Use](https://openai.com/policies/terms-of-use) of the data generated by OpenAI, and [Privacy Practices](https://chrome.google.com/webstore/detail/sharegpt-share-your-chatg/daiacboceoaocpibfodeljbdfacokfjb) of ShareGPT. Please contact us if you find any potential violation.
-""")
-
-
+device = devicetorch.get(torch)
 new_path = 'Lin-Chen/ShareCaptioner-Video'
 tokenizer = AutoTokenizer.from_pretrained(new_path, trust_remote_code=True)
 model = AutoModel.from_pretrained(
-    new_path, torch_dtype=torch.float16, trust_remote_code=True).cuda().eval()
-
+    #new_path, torch_dtype=torch.float16, trust_remote_code=True).cuda().eval()
+    new_path, torch_dtype=torch.float16, trust_remote_code=True).to(device).eval()
+#model.cuda()
+model.to(device)
 model.tokenizer = tokenizer
 
 
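The setup hunk above swaps the hard-coded `.cuda()` placement for a device chosen at runtime via `devicetorch`. A minimal sketch of the pattern, assuming (as the new code implies) that `devicetorch.get(torch)` returns a device string such as "cuda", "mps", or "cpu":

    import torch
    import devicetorch  # helper used by this commit; assumed to expose get()

    device = devicetorch.get(torch)  # e.g. "cuda", "mps", or "cpu"

    # Hard-coded CUDA placement fails on machines without an NVIDIA GPU:
    #   tensor = torch.zeros(2, 3).cuda()
    # Device-agnostic placement, as app.py now does:
    tensor = torch.zeros(2, 3).to(device)
    print(tensor.device)

Note that the model is still loaded with torch_dtype=torch.float16; if `device` ends up being "cpu", half-precision weights may be slow or unsupported for some ops, so that is worth verifying on non-CUDA machines.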
@@ -120,7 +104,8 @@ def model_gen(model, text, images, need_bos=True, hd_num=25, max_new_token=2048,
             text_embeds = model.encode_text(
                 subtext, add_special_tokens=need_bos)
             embeds.append(text_embeds)
-            im_mask.append(torch.zeros(text_embeds.shape[:2]).cuda())
+            #im_mask.append(torch.zeros(text_embeds.shape[:2]).cuda())
+            im_mask.append(torch.zeros(text_embeds.shape[:2]).to(device))
             need_bos = False
         if i < len(images):
             try:
@@ -129,11 +114,13 @@
                 image = images[i].convert('RGB')
 
             image = HD_transform(image, hd_num=hd_num)
-            image = model.vis_processor(image).unsqueeze(0).cuda()
+            #image = model.vis_processor(image).unsqueeze(0).cuda()
+            image = model.vis_processor(image).unsqueeze(0).to(device)
             image_embeds = model.encode_img(image)
             print(image_embeds.shape)
             embeds.append(image_embeds)
-            im_mask.append(torch.ones(image_embeds.shape[:2]).cuda())
+            #im_mask.append(torch.ones(image_embeds.shape[:2]).cuda())
+            im_mask.append(torch.ones(image_embeds.shape[:2]).to(device))
             pt1 = pts
     embeds = torch.cat(embeds, dim=1)
     im_mask = torch.cat(im_mask, dim=1)
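The two model_gen hunks above move the mask tensors and the processed image onto the selected device instead of calling `.cuda()`. An equivalent, slightly leaner variant (a sketch, not what the commit uses) allocates directly on the target device and skips the intermediate CPU tensor:

    import torch

    # Stand-in for devicetorch.get(torch) so the snippet runs anywhere.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    shape = (1, 16)
    mask_via_copy = torch.zeros(shape).to(device)      # allocate on CPU, then copy
    mask_direct = torch.zeros(shape, device=device)    # allocate on the target device
    assert mask_via_copy.device == mask_direct.device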
@@ -232,14 +219,17 @@ def encode_resized_image(image_path, max_size=1024):
     return base64.b64encode(buffer.getvalue()).decode('utf-8')
 
 
-@spaces.GPU(duration=60)
+#@spaces.GPU(duration=60)
 def generate_slidingcaptioning(video_path):
     imgs = load_quota_video(video_path)
     q = 'This is the first frame of a video, describe it in detail.'
     query = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
     img = imgs[0]
-    with torch.cuda.amp.autocast():
-        response = model_gen(model, query, img, hd_num=9)
+    if device == "cuda":
+        with torch.cuda.amp.autocast():
+            response = model_gen(model, query, img, hd_num=9)
+    else:
+        response = model_gen(model, query, img, hd_num=9)
     print(response)
     responses = [response]
     images = [img]
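The `@spaces.GPU(duration=60)` decorator only exists on Hugging Face ZeroGPU Spaces, so this commit comments it out here (and drops `spaces` from requirements.txt). A hedged alternative, if the same file should also keep working on Spaces, is to apply the decorator only when the package is importable. `generate_stub` below is a hypothetical placeholder, not a function from app.py:

    try:
        import spaces                      # present on HF Spaces / ZeroGPU
        gpu_wrap = spaces.GPU(duration=60)
    except ImportError:
        def gpu_wrap(fn):                  # no-op decorator elsewhere
            return fn

    @gpu_wrap
    def generate_stub(video_path):
        return video_path                  # the real functions call model_gen(...)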
@@ -253,7 +243,10 @@ def generate_slidingcaptioning(video_path):
     new_img.paste(image1, (0, 0))
     new_img.paste(image2, (0, height+50))
     query = f'[UNUSED_TOKEN_146]user\n{prompt}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
-    with torch.cuda.amp.autocast():
+    if device == "cuda":
+        with torch.cuda.amp.autocast():
+            response = model_gen(model, query, new_img, hd_num=9)
+    else:
         response = model_gen(model, query, new_img, hd_num=9)
     responses.append(response)
     images.append(new_img)
@@ -263,29 +256,39 @@
         idx+1, idx*2, txt)
     query = f'[UNUSED_TOKEN_146]user\n{prompt}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
     print(query)
-    with torch.cuda.amp.autocast():
+    if device == "cuda":
+        with torch.cuda.amp.autocast():
+            summ = model_gen(model, query, None, hd_num=16)
+    else:
         summ = model_gen(model, query, None, hd_num=16)
     print(summ)
     return summ
 
 
-@spaces.GPU(duration=60)
+#@spaces.GPU(duration=60)
 def generate_fastcaptioning(video_path):
     q = 'Here are a few key frames of a video, discribe this video in detail.'
     query = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
     imgs = load_quota_video(video_path)
     img = img_process(imgs)
-    with torch.cuda.amp.autocast():
+    if device == "cuda":
+        with torch.cuda.amp.autocast():
+            response = model_gen(model, query, img, hd_num=16,
+                                 do_sample=False, beam=3)
+    else:
         response = model_gen(model, query, img, hd_num=16,
-                             do_sample=False, beam=3)
+                             do_sample=False, beam=3)
     return response
 
 
-@spaces.GPU(duration=60)
+#@spaces.GPU(duration=60)
 def generate_promptrecaptioning(text):
     q = f'Translate this brief generation prompt into a detailed caption: {text}'
     query = f'[UNUSED_TOKEN_146]user\n{q}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n'
-    with torch.cuda.amp.autocast():
+    if device == "cuda":
+        with torch.cuda.amp.autocast():
+            response = model_gen(model, query, None)
+    else:
         response = model_gen(model, query, None)
     return response
 
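The same four-line `if device == "cuda": ... else: ...` block is now repeated around every model_gen call in the three generator functions above. A more compact variant (an alternative sketch, not what the commit does) moves the branch into a helper so each call site stays a single statement:

    import contextlib
    import torch

    def maybe_autocast(device):
        # Mixed-precision autocast on CUDA, a no-op context manager elsewhere.
        if device == "cuda":
            return torch.cuda.amp.autocast()
        return contextlib.nullcontext()

    # Usage mirroring generate_fastcaptioning:
    #   with maybe_autocast(device):
    #       response = model_gen(model, query, img, hd_num=16,
    #                            do_sample=False, beam=3)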
@@ -298,7 +301,6 @@ def save_video_to_local(video_path):
 
 
 with gr.Blocks(title='ShareCaptioner-Video', theme=gr.themes.Default(), css=block_css) as demo:
-    gr.Markdown(title_markdown)
     state = gr.State()
     state_ = gr.State()
     first_run = gr.State()
@@ -333,7 +335,6 @@ with gr.Blocks(title='ShareCaptioner-Video', theme=gr.themes.Default(), css=bloc
         textbox_out = gr.Textbox(
             show_label=False, placeholder="Output", container=False
         )
-    gr.Markdown(learn_more_markdown)
 
     submit_btn_sc.click(generate_slidingcaptioning, [video], [textbox_out])
     submit_btn_fc.click(generate_fastcaptioning, [video], [textbox_out])
requirements.txt CHANGED
@@ -1,5 +1,5 @@
-torch==2.1.2
-torchvision==0.16.2
+#torch==2.1.2
+#torchvision==0.16.2
 transformers==4.37.2
 tokenizers==0.15.1
 sentencepiece==0.1.99
@@ -13,12 +13,12 @@ scikit-learn==1.2.2
 gradio==4.16.0
 gradio_client==0.8.1
 openai
-spaces
+#spaces
 requests
 httpx==0.24.0
 uvicorn
 fastapi
-decord
+#decord
 einops==0.6.1
 einops-exts==0.0.4
 timm==0.6.13
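With torch, torchvision, spaces, and decord commented out, those packages are now expected to come from the runtime environment (presumably installed separately with a platform-appropriate torch build) rather than from pip. A small, hypothetical startup check along these lines would make a missing dependency fail fast; it is not part of the commit:

    import importlib.util

    # Packages no longer pinned in requirements.txt but still imported by app.py.
    for pkg in ("torch", "torchvision", "decord"):
        if importlib.util.find_spec(pkg) is None:
            raise RuntimeError(f"{pkg} is not installed; install it before launching app.py")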