Clyine1 committed on
Commit a8dc4dc
Parent: 3f662bc

Upload 2 files

Files changed (2)
  1. config.json +51 -48
  2. vqapair.py +65 -0
config.json CHANGED
@@ -1,48 +1,51 @@
 {
   "_name_or_path": "/content/drive/MyDrive/model/iter_39620_hf",
   "architectures": [
     "LlavaForConditionalGeneration"
   ],
+  "auto_map": {
+    "AutoModelForCausalLM": "vqapair.VQApair"
+  },
   "ignore_index": -100,
   "image_token_index": 32011,
   "model_type": "llava",
   "pad_token_id": 32012,
   "projector_hidden_act": "gelu",
   "text_config": {
     "_name_or_path": "../iter_32_xtuner_llama_llm",
     "architectures": [
       "LlamaForCausalLM"
     ],
     "eos_token_id": 32000,
     "hidden_size": 3072,
     "intermediate_size": 8192,
     "max_position_embeddings": 4096,
     "model_type": "llama",
     "original_max_position_embeddings": 4096,
     "pad_token_id": 32000,
     "rms_norm_eps": 1e-05,
     "sliding_window": 2047,
     "torch_dtype": "float16",
     "vocab_size": 32064
   },
   "torch_dtype": "float16",
   "transformers_version": "4.41.2",
   "vision_config": {
     "_name_or_path": "./visual_encoder",
     "architectures": [
       "CLIPVisionModel"
     ],
     "dropout": 0.0,
     "hidden_size": 1024,
     "image_size": 336,
     "intermediate_size": 4096,
     "model_type": "clip_vision_model",
     "num_attention_heads": 16,
     "num_hidden_layers": 24,
     "patch_size": 14,
     "projection_dim": 768,
     "torch_dtype": "float32"
   },
   "vision_feature_layer": -2,
   "vision_feature_select_strategy": "default"
 }
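
The only functional change to config.json is the new "auto_map" entry, which points AutoModelForCausalLM at the custom VQApair class uploaded in vqapair.py. Below is a minimal loading sketch, not part of the commit: the repo id is a placeholder, the repository is assumed to also ship the processor files, and trust_remote_code=True is required so that transformers imports the custom class.

import torch
from transformers import AutoModelForCausalLM, AutoProcessor

repo_id = "Clyine1/your-repo"  # placeholder; replace with the actual Hub repo id

# Assumes the repository also contains the processor/tokenizer files.
processor = AutoProcessor.from_pretrained(repo_id)

# trust_remote_code=True lets transformers resolve AutoModelForCausalLM to
# vqapair.VQApair via the auto_map entry added above.
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    trust_remote_code=True,
    torch_dtype=torch.float16,
).to(0)

# The class expects a processor; attach it explicitly in case the `proc`
# keyword is not forwarded to __init__ by from_pretrained.
model.processor = processor
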
vqapair.py ADDED
@@ -0,0 +1,65 @@
import torch
from transformers import LlavaForConditionalGeneration, LlavaConfig

from PIL import Image
from random import randint


class VQApair(LlavaForConditionalGeneration):
    config_class = LlavaConfig

    def __init__(self, config, **kwargs):
        super().__init__(config)
        # Processor used to build multimodal prompts; pass it as `proc=...`
        # or attach it to the instance after loading.
        self.processor = kwargs.pop("proc", None)

    def genChoice(self, question, base_prompt, img_obj):
        # Ask the model for the correct answer to the generated question.
        base_prompt += "{}<|end|>\n<|user|> Suggest 1 correct answer<|end|><|assistant|> ".format(question)
        inputs = self.processor(base_prompt, img_obj, return_tensors='pt').to(0)
        output = self.generate(**inputs, eos_token_id=32007, max_new_tokens=500)
        index = torch.where(output[0] == 32001)[0][-1].item()
        answer = self.processor.decode(output[0][index:], skip_special_tokens=True)

        # Ask for three distractors for the same question.
        base_prompt += "{}<|end|>\n<|user|> Suggest 3 incorrect answers<|end|><|assistant|> ".format(answer)
        inputs = self.processor(base_prompt, img_obj, return_tensors='pt').to(0)
        output = self.generate(**inputs, eos_token_id=32007, max_new_tokens=500)
        index = torch.where(output[0] == 32001)[0][-1].item()
        choices = self.processor.decode(output[0][index:], skip_special_tokens=True)

        # Strip the leading "1) ", "2) ", ... numbering from each distractor
        # and drop empty lines.
        a = choices.split("\n")
        a = [x[3:].strip() for x in a]
        a = [x for x in a if x]
        # randint is inclusive, so the correct answer may also be appended
        # at the end of the choice list.
        correct_answer = randint(0, len(a))
        a.insert(correct_answer, answer)

        a = ["{}) {}".format(i + 1, a[i]) for i in range(len(a))]
        ans = "Correct Answer: {}".format(a[correct_answer])
        return {"Choices": a, "Answers": ans}

    def generateQn(self, img_path, n):
        prompt = '''
        <|user|>\n<image>\nDescribe this image in a passage<|end|><|assistant|>
        '''
        artifacts = []
        img_obj = Image.open(img_path)

        # Generate a description of the image.
        inputs = self.processor(prompt, img_obj, return_tensors='pt').to(0)
        output = self.generate(**inputs, eos_token_id=32007, max_new_tokens=500)
        index = torch.where(output[0] == 32001)[0][-1].item()
        desc = self.processor.decode(output[0][index:], skip_special_tokens=True)

        # Extend the prompt with the description and ask for a question.
        prompt += "{}<|end|>\n<|user|> {}<|end|><|assistant|> ".format(desc, "Generate a simple question")
        inputs = self.processor(prompt, img_obj, return_tensors='pt').to(0)

        # Generate n diverse questions with diverse (group) beam search.
        output = self.generate(**inputs, eos_token_id=32007, max_new_tokens=500,
                               do_sample=False, num_beams=3, num_beam_groups=3,
                               diversity_penalty=10.0, num_return_sequences=n)
        for out in output:
            entry = {}
            index = torch.where(out == 32001)[0][-1].item()
            text = self.processor.decode(out[index:], skip_special_tokens=True)
            entry.update({"desc": desc})
            entry.update({"question": text})
            entry.update(self.genChoice(text, prompt, img_obj))
            artifacts.append(entry)

        return artifacts
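
For completeness, a hypothetical usage sketch for the class above, assuming model and processor were obtained as in the earlier loading snippet and that "sample.jpg" is a local image; since generateQn uses beam search with num_beams=3, n should not exceed 3.

# Hypothetical call; "sample.jpg" is a placeholder image path.
qa_items = model.generateQn("sample.jpg", n=3)

for item in qa_items:
    print("Question:", item["question"])
    for choice in item["Choices"]:
        print("  " + choice)
    print(item["Answers"])
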