Spaces:
Sleeping
Sleeping
momergul
committed on
Commit
•
554adbb
1
Parent(s):
5f8e458
Update
Browse files- app.py +7 -6
- joint_inference.py +7 -0
- requirements.txt +1 -1
app.py
CHANGED
@@ -23,7 +23,7 @@ css="""
|
|
23 |
def initialize_game() -> List[List[str]]:
|
24 |
context_dicts = [generate_complete_game() for _ in range(2)]
|
25 |
|
26 |
-
roles = ["
|
27 |
speaker_images = []
|
28 |
listener_images = []
|
29 |
targets = []
|
@@ -40,7 +40,6 @@ def get_model_response(
|
|
40 |
model, adapter_name, processor, index_to_token, role: str,
|
41 |
image_paths: List[str], user_message: str = "", target_image: str = ""
|
42 |
) -> str:
|
43 |
-
model.model.set_adapter(adapter_name)
|
44 |
if role == "speaker":
|
45 |
img_dir = "tangram_pngs"
|
46 |
print("Starting processing")
|
@@ -50,7 +49,7 @@ def get_model_response(
|
|
50 |
image_paths = [image_paths]
|
51 |
print("Starting inference")
|
52 |
captions = get_speaker_response(model, images, input_tokens, attn_mask, image_attn_mask, label, image_paths,
|
53 |
-
processor, img_dir, index_to_token)
|
54 |
print("Done")
|
55 |
response = captions[0]
|
56 |
else: # listener
|
@@ -63,14 +62,15 @@ def get_model_response(
|
|
63 |
print("Starting inference")
|
64 |
response = get_listener_response(
|
65 |
model, images, l_input_tokens, l_attn_mask, l_image_attn_mask, index_to_token,
|
66 |
-
s_input_tokens, s_attn_mask, s_image_attn_mask, s_target_mask, s_target_label, image_paths
|
67 |
)
|
68 |
print("Done")
|
69 |
|
70 |
return response
|
71 |
|
72 |
@spaces.GPU(duration=20)
|
73 |
-
def get_speaker_response(model, images, input_tokens, attn_mask, image_attn_mask, label, image_paths, processor, img_dir, index_to_token):
|
|
|
74 |
model = model.cuda()
|
75 |
with torch.no_grad():
|
76 |
captions, _, _, _, _ = model.generate(
|
@@ -83,7 +83,8 @@ def get_speaker_response(model, images, input_tokens, attn_mask, image_attn_mask
|
|
83 |
|
84 |
@spaces.GPU(duration=20)
|
85 |
def get_listener_response(model, images, l_input_tokens, l_attn_mask, l_image_attn_mask, index_to_token,
|
86 |
-
s_input_tokens, s_attn_mask, s_image_attn_mask, s_target_mask, s_target_label, image_paths):
|
|
|
87 |
model = model.cuda()
|
88 |
with torch.no_grad():
|
89 |
_, _, joint_log_probs = model.comprehension_side([
|
|
|
23 |
def initialize_game() -> List[List[str]]:
|
24 |
context_dicts = [generate_complete_game() for _ in range(2)]
|
25 |
|
26 |
+
roles = ["speaker"] * 3 + ["listener"] * 3
|
27 |
speaker_images = []
|
28 |
listener_images = []
|
29 |
targets = []
|
|
|
40 |
model, adapter_name, processor, index_to_token, role: str,
|
41 |
image_paths: List[str], user_message: str = "", target_image: str = ""
|
42 |
) -> str:
|
|
|
43 |
if role == "speaker":
|
44 |
img_dir = "tangram_pngs"
|
45 |
print("Starting processing")
|
|
|
49 |
image_paths = [image_paths]
|
50 |
print("Starting inference")
|
51 |
captions = get_speaker_response(model, images, input_tokens, attn_mask, image_attn_mask, label, image_paths,
|
52 |
+
processor, img_dir, index_to_token, adapter_name)
|
53 |
print("Done")
|
54 |
response = captions[0]
|
55 |
else: # listener
|
|
|
62 |
print("Starting inference")
|
63 |
response = get_listener_response(
|
64 |
model, images, l_input_tokens, l_attn_mask, l_image_attn_mask, index_to_token,
|
65 |
+
s_input_tokens, s_attn_mask, s_image_attn_mask, s_target_mask, s_target_label, image_paths, adapter_name
|
66 |
)
|
67 |
print("Done")
|
68 |
|
69 |
return response
|
70 |
|
71 |
@spaces.GPU(duration=20)
|
72 |
+
def get_speaker_response(model, images, input_tokens, attn_mask, image_attn_mask, label, image_paths, processor, img_dir, index_to_token, adapter_name):
|
73 |
+
model.model.set_adapter(adapter_name)
|
74 |
model = model.cuda()
|
75 |
with torch.no_grad():
|
76 |
captions, _, _, _, _ = model.generate(
|
|
|
83 |
|
84 |
@spaces.GPU(duration=20)
|
85 |
def get_listener_response(model, images, l_input_tokens, l_attn_mask, l_image_attn_mask, index_to_token,
|
86 |
+
s_input_tokens, s_attn_mask, s_image_attn_mask, s_target_mask, s_target_label, image_paths, adapter_name):
|
87 |
+
model.model.set_adapter(adapter_name)
|
88 |
model = model.cuda()
|
89 |
with torch.no_grad():
|
90 |
_, _, joint_log_probs = model.comprehension_side([
|
joint_inference.py
CHANGED
@@ -346,6 +346,7 @@ class IdeficsJointInferenceModel(nn.Module):
|
|
346 |
speaker = self.get_speaker()
|
347 |
generation_config = GenerationConfig(
|
348 |
max_new_tokens=max_steps,
|
|
|
349 |
do_sample=True,
|
350 |
temperature=temperature,
|
351 |
top_k=top_k, top_p=top_p,
|
@@ -436,6 +437,12 @@ class IdeficsJointInferenceModel(nn.Module):
|
|
436 |
output_hidden_states=True,
|
437 |
return_dict_in_generate=True
|
438 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
439 |
outputs = speaker.generate(
|
440 |
input_ids=s_input_tokens,
|
441 |
attention_mask=s_attn_mask,
|
|
|
346 |
speaker = self.get_speaker()
|
347 |
generation_config = GenerationConfig(
|
348 |
max_new_tokens=max_steps,
|
349 |
+
min_new_tokens=1,
|
350 |
do_sample=True,
|
351 |
temperature=temperature,
|
352 |
top_k=top_k, top_p=top_p,
|
|
|
437 |
output_hidden_states=True,
|
438 |
return_dict_in_generate=True
|
439 |
)
|
440 |
+
|
441 |
+
print(torch.any(torch.isnan(s_input_tokens)))
|
442 |
+
print(torch.any(torch.isnan(s_attn_mask)))
|
443 |
+
print(torch.any(torch.isnan(images)))
|
444 |
+
print(torch.any(torch.isnan(s_image_attn_mask)))
|
445 |
+
|
446 |
outputs = speaker.generate(
|
447 |
input_ids=s_input_tokens,
|
448 |
attention_mask=s_attn_mask,
|
requirements.txt
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
torch==2.2.
|
2 |
datasets==2.18.0
|
3 |
transformers==4.40.0
|
4 |
sentencepiece==0.2.0
|
|
|
1 |
+
torch==2.2.0
|
2 |
datasets==2.18.0
|
3 |
transformers==4.40.0
|
4 |
sentencepiece==0.2.0
|