Spaces:
Update app.py
app.py
CHANGED
@@ -1,7 +1,14 @@
-import spaces
 import gradio as gr
 import subprocess
 from deep_translator import GoogleTranslator
+import torch
+from llava.model.builder import load_pretrained_model
+from llava.mm_utils import tokenizer_image_token
+from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
+from llava.conversation import conv_templates
+from decord import VideoReader, cpu
+import numpy as np
+import copy
 
 # Install the required libraries
 subprocess.run(
@@ -11,64 +18,42 @@ subprocess.run(
 )
 subprocess.run("pip install deep_translator", shell=True)
 
-import torch
-from llava.model.builder import load_pretrained_model
-from llava.mm_utils import get_model_name_from_path, process_images, tokenizer_image_token
-from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, IGNORE_INDEX
-from llava.conversation import conv_templates, SeparatorStyle
-import copy
-import warnings
-from decord import VideoReader, cpu
-import numpy as np
-
 # Create the translator objects
 translator = GoogleTranslator(source='tr', target='en')
 translator_reverse = GoogleTranslator(source='en', target='tr')
 
 title = "# 🙋🏻♂️🌟Welcome to Tonic's 🌋📹LLaVA-Video!"
-description1 = """**🌋📹LLaVA-Video-7B-Qwen2**,
-This model uses the **SO400M vision backbone** for visual input and Qwen2 for language processing, which makes it highly efficient at multimodal reasoning, including visual and video-based tasks.
-🌋📹LLaVA-Video also comes in larger [32B](https://huggingface.co/lmms-lab/LLaVA-NeXT-Video-32B-Qwen) and [72B](https://huggingface.co/lmms-lab/LLaVA-Video-72B-Qwen2) variants, as well as [a variant trained only on the new synthetic data](https://huggingface.co/lmms-lab/LLaVA-Video-7B-Qwen2-Video-Only).
-For more details, please visit the [Project Page](https://github.com/LLaVA-VL/LLaVA-NeXT) or check out the accompanying [research paper](https://arxiv.org/abs/2410.02713).
-- **Architecture**: `LlavaQwenForCausalLM`
-- **Attention Heads**: 28
-- **Hidden Layers**: 28
-- **Hidden Size**: 3584
+description1 = """**🌋📹LLaVA-Video-7B-Qwen2**, ...
 """
 description2 = """
-
-- **Maximum Number of Frames Supported**: 64
-- **Supported Languages**: English, Chinese
-- **Image Aspect Ratio**: `anyres_max_9`
-- **Image Resolution**: Various grid resolutions
-- **Maximum Position Embeddings**: 32,768
-- **Vocabulary Size**: 152,064
-- **Model Precision**: bfloat16
-- **Hardware Used for Training**: 256 * Nvidia Tesla A100 GPUs
+...
 """
 
 join_us = """
 ## Join Us:
-
+...
 """
 
 def load_video(video_path, max_frames_num, fps=1, force_sample=False):
     if max_frames_num == 0:
         return np.zeros((1, 336, 336, 3))
+
     vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
     total_frame_num = len(vr)
-    video_time = total_frame_num / vr.get_avg_fps()
     fps = round(vr.get_avg_fps()/fps)
     frame_idx = [i for i in range(0, len(vr), fps)]
-    frame_time = [i/fps for i in frame_idx]
+    frame_time = [i/vr.get_avg_fps() for i in frame_idx]
+
     if len(frame_idx) > max_frames_num or force_sample:
         sample_fps = max_frames_num
         uniform_sampled_frames = np.linspace(0, total_frame_num - 1, sample_fps, dtype=int)
         frame_idx = uniform_sampled_frames.tolist()
         frame_time = [i/vr.get_avg_fps() for i in frame_idx]
+
     frame_time = ",".join([f"{i:.2f}s" for i in frame_time])
     spare_frames = vr.get_batch(frame_idx).asnumpy()
-    return spare_frames, frame_time, video_time
+
+    return spare_frames, frame_time, total_frame_num / vr.get_avg_fps()
 
 # Load the model
 pretrained = "lmms-lab/LLaVA-Video-7B-Qwen2"
@@ -81,7 +66,6 @@ tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained
 model.eval()
 print("Model loaded successfully!")
 
-@spaces.GPU
 def process_video(video_path, question):
     try:
         max_frames_num = 64
@@ -101,8 +85,8 @@ def process_video(video_path, question):
         conv.append_message(conv.roles[1], None)
         prompt_question = conv.get_prompt()
 
-        input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").
-
+        input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").to(device)
+
         with torch.no_grad():
             output = model.generate(
                 input_ids,
@@ -150,4 +134,4 @@ with gr.Blocks() as demo:
     )
 
 if __name__ == "__main__":
-    demo.launch(show_error=True)
+    demo.launch(show_error=True)
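For reference, the heart of this change is `load_video`'s sampling fallback: index one frame per second, then resample uniformly across the clip whenever that yields more than `max_frames_num` frames. Below is a minimal standalone sketch of that logic, assuming only `decord` and `numpy`; the `sample_frames` name and the example path are illustrative, not part of the app.

```python
import numpy as np
from decord import VideoReader, cpu

def sample_frames(video_path, max_frames_num=64, fps=1):
    # Open the video on CPU, as the diff does.
    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    total_frame_num = len(vr)
    # Step through the video at roughly `fps` frames per second.
    step = round(vr.get_avg_fps() / fps)
    frame_idx = list(range(0, total_frame_num, step))
    # The fallback this commit repairs: if that still yields too many frames,
    # pick max_frames_num indices spread uniformly over the whole clip.
    if len(frame_idx) > max_frames_num:
        frame_idx = np.linspace(0, total_frame_num - 1, max_frames_num, dtype=int).tolist()
    # Timestamps in seconds for each sampled frame, e.g. "0.00s,1.00s,...".
    frame_time = ",".join(f"{i / vr.get_avg_fps():.2f}s" for i in frame_idx)
    return vr.get_batch(frame_idx).asnumpy(), frame_time

# frames, times = sample_frames("example.mp4")  # placeholder path
```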
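The other fix lands in `process_video`: the prompt's token ids must be moved to the model's device before `generate`, hence the added `.to(device)`. The middle of that function is elided in this diff, but the upstream LLaVA-Video quickstart suggests a flow like the sketch below; the `qwen_1_5` template name, the generation arguments, and the reuse of `sample_frames` from the sketch above are assumptions from that quickstart, not from the hidden lines.

```python
import copy
import torch
from llava.model.builder import load_pretrained_model
from llava.mm_utils import tokenizer_image_token
from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from llava.conversation import conv_templates

device = "cuda"
tokenizer, model, image_processor, max_length = load_pretrained_model(
    "lmms-lab/LLaVA-Video-7B-Qwen2", None, "llava_qwen",
    torch_dtype="bfloat16", device_map="auto")
model.eval()

frames, frame_time = sample_frames("example.mp4")  # placeholder path
video = image_processor.preprocess(frames, return_tensors="pt")["pixel_values"].to(device).bfloat16()

# The <image> placeholder marks where the video frames are spliced into the prompt.
question = DEFAULT_IMAGE_TOKEN + f"\nThe frames were sampled at {frame_time}. Describe this video."
conv = copy.deepcopy(conv_templates["qwen_1_5"])
conv.append_message(conv.roles[0], question)
conv.append_message(conv.roles[1], None)

# Counterpart of the patched line: tokenize and move the ids to the model's
# device (the quickstart also adds a batch dimension with unsqueeze(0)).
input_ids = tokenizer_image_token(conv.get_prompt(), tokenizer, IMAGE_TOKEN_INDEX,
                                  return_tensors="pt").unsqueeze(0).to(device)

with torch.no_grad():
    output = model.generate(input_ids, images=[video], modalities=["video"],
                            do_sample=False, max_new_tokens=512)
print(tokenizer.batch_decode(output, skip_special_tokens=True)[0].strip())
```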
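Finally, the `translator`/`translator_reverse` pair explains the deep_translator dependency: the UI is Turkish, while the model is prompted in English (the removed description listed English and Chinese as its supported languages). A small sketch of that round-trip, with placeholder strings:

```python
from deep_translator import GoogleTranslator

translator = GoogleTranslator(source='tr', target='en')          # Turkish question -> English prompt
translator_reverse = GoogleTranslator(source='en', target='tr')  # English answer -> Turkish reply

question_en = translator.translate("Bu videoda neler oluyor?")
# ... question_en is what goes into the conversation template / model.generate ...
answer_tr = translator_reverse.translate("A cat is chasing a ball across the floor.")
print(question_en)
print(answer_tr)
```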