Visual Question Answering · Transformers · TensorBoard · Safetensors · internvl_chat · feature-extraction · custom_code
czczup committed
Commit 5d9647a
1 Parent(s): a807c4a

Upload folder using huggingface_hub

Files changed (3):
  1. config.json +4 -3
  2. modeling_internlm2.py +182 -0
  3. modeling_internvl_chat.py +2 -3
config.json CHANGED
@@ -1,6 +1,6 @@
 {
   "_commit_hash": null,
-  "_name_or_path": "OpenGVLab/InternVL-Chat-V1-5",
+  "_name_or_path": "./work_dirs/internvl_chat_internlm2_20b_448_dynamic_chinese_pretrain3/checkpoint-1600_replace_llm",
   "architectures": [
     "InternVLChatModel"
   ],
@@ -11,6 +11,7 @@
   "downsample_ratio": 0.5,
   "dynamic_image_size": true,
   "force_image_size": 448,
+  "image_fold": null,
   "llm_config": {
     "_name_or_path": "pretrained/internlm2-chat-20b/",
     "add_cross_attention": false,
@@ -100,7 +101,7 @@
     "use_cache": false,
     "vocab_size": 92553
   },
-  "max_dynamic_patch": 12,
+  "max_dynamic_patch": 6,
   "min_dynamic_patch": 1,
   "model_type": "internvl_chat",
   "pad2square": false,
@@ -113,7 +114,7 @@
   "use_llm_lora": 0,
   "use_thumbnail": true,
   "vision_config": {
-    "_name_or_path": "OpenGVLab/InternViT-6B-448px-V1-5",
+    "_name_or_path": "work_dirs/internvl_chat_internlm2_20b_448_dynamic_chinese_pretrain/checkpoint-5200-vit",
     "add_cross_attention": false,
     "architectures": [
       "InternVisionModel"
modeling_internlm2.py CHANGED
@@ -39,6 +39,20 @@ try:
     from transformers.generation.streamers import BaseStreamer
 except: # noqa # pylint: disable=bare-except
     BaseStreamer = None
+from typing import Any, List, Optional, Tuple, Union
+import torch.distributed as dist
+import torch.utils.checkpoint
+from peft import LoraConfig, get_peft_model
+from torch import nn
+from torch.nn import CrossEntropyLoss
+from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer
+from transformers.generation.logits_process import LogitsProcessorList
+from transformers.generation.stopping_criteria import StoppingCriteriaList
+from transformers.generation.streamers import BaseStreamer
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import ModelOutput, logging
+from transformers.generation.utils import GreedySearchOutput, validate_stopping_criteria, GreedySearchDecoderOnlyOutput, GreedySearchEncoderDecoderOutput
 
 from .configuration_internlm2 import InternLM2Config
 
@@ -1272,6 +1286,174 @@ class InternLM2ForCausalLM(InternLM2PreTrainedModel):
 
         return consumer()
 
+    def greedy_search(
+        self,
+        input_ids: torch.LongTensor,
+        logits_processor: Optional[LogitsProcessorList] = None,
+        stopping_criteria: Optional[StoppingCriteriaList] = None,
+        max_length: Optional[int] = None,
+        pad_token_id: Optional[int] = None,
+        eos_token_id: Optional[Union[int, List[int]]] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_scores: Optional[bool] = None,
+        return_dict_in_generate: Optional[bool] = None,
+        synced_gpus: bool = False,
+        streamer: Optional["BaseStreamer"] = None,
+        **model_kwargs,
+    ) -> Union[GreedySearchOutput, torch.LongTensor]:
+        # init values
+        logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
+        stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
+        if max_length is not None:
+            warnings.warn(
+                "`max_length` is deprecated in this function, use"
+                " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.",
+                UserWarning,
+            )
+            stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
+        pad_token_id = pad_token_id if pad_token_id is not None else self.generation_config.pad_token_id
+        eos_token_id = eos_token_id if eos_token_id is not None else self.generation_config.eos_token_id
+        if isinstance(eos_token_id, int):
+            eos_token_id = [eos_token_id]
+        eos_token_id_tensor = torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None
+        output_scores = output_scores if output_scores is not None else self.generation_config.output_scores
+        output_attentions = (
+            output_attentions if output_attentions is not None else self.generation_config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.generation_config.output_hidden_states
+        )
+        return_dict_in_generate = (
+            return_dict_in_generate
+            if return_dict_in_generate is not None
+            else self.generation_config.return_dict_in_generate
+        )
+
+        # init attention / hidden states / scores tuples
+        scores = () if (return_dict_in_generate and output_scores) else None
+        decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
+        cross_attentions = () if (return_dict_in_generate and output_attentions) else None
+        decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
+
+        # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
+        if return_dict_in_generate and self.config.is_encoder_decoder:
+            encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
+            encoder_hidden_states = (
+                model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
+            )
+
+        # keep track of which sequences are already finished
+        unfinished_sequences = torch.ones(input_ids.shape[0], dtype=torch.long, device=input_ids.device)
+
+        this_peer_finished = False  # used by synced_gpus only
+        while True:
+            if synced_gpus:
+                # Under synced_gpus the `forward` call must continue until all gpus complete their sequence.
+                # The following logic allows an early break if all peers finished generating their sequence
+                this_peer_finished_flag = torch.tensor(0.0 if this_peer_finished else 1.0).to(input_ids.device)
+                # send 0.0 if we finished, 1.0 otherwise
+                dist.all_reduce(this_peer_finished_flag, op=dist.ReduceOp.SUM)
+                # did all peers finish? the reduced sum will be 0.0 then
+                if this_peer_finished_flag.item() == 0.0:
+                    break
+
+            # prepare model inputs
+            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
+
+            # forward pass to get next token
+            outputs = self(
+                **model_inputs,
+                return_dict=True,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+            )
+
+            if synced_gpus and this_peer_finished:
+                continue  # don't waste resources running the code we don't need
+
+            next_token_logits = outputs.logits[:, -1, :]
+
+            # pre-process distribution
+            next_tokens_scores = logits_processor(input_ids, next_token_logits)
+
+            # Store scores, attentions and hidden_states when required
+            if return_dict_in_generate:
+                if output_scores:
+                    scores += (next_tokens_scores,)
+                if output_attentions:
+                    decoder_attentions += (
+                        (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
+                    )
+                    if self.config.is_encoder_decoder:
+                        cross_attentions += (outputs.cross_attentions,)
+
+                if output_hidden_states:
+                    decoder_hidden_states += (
+                        (outputs.decoder_hidden_states,)
+                        if self.config.is_encoder_decoder
+                        else (outputs.hidden_states,)
+                    )
+
+            # argmax
+            next_tokens = torch.argmax(next_tokens_scores, dim=-1).to(device=input_ids.device)
+            # finished sentences should have their next token be a padding token
+            if eos_token_id is not None:
+                if pad_token_id is None:
+                    raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
+                next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
+
+            # update generated ids, model inputs, and length for next step
+            input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
+            if streamer is not None:
+                streamer.put(next_tokens.cpu())
+            model_kwargs = self._update_model_kwargs_for_generation(
+                outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
+            )
+
+            # if eos_token was found in one sentence, set sentence to finished
+            if eos_token_id_tensor is not None:
+                unfinished_sequences = unfinished_sequences.mul(
+                    next_tokens.tile(eos_token_id_tensor.shape[0], 1).ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0)
+                )
+
+                # stop when each sentence is finished
+                if unfinished_sequences.max() == 0:
+                    this_peer_finished = True
+
+            # stop if we exceed the maximum length
+            if stopping_criteria(input_ids, scores):
+                this_peer_finished = True
+
+            if this_peer_finished and not synced_gpus:
+                break
+
+        if streamer is not None:
+            streamer.end()
+
+        if return_dict_in_generate:
+            if self.config.is_encoder_decoder:
+                return GreedySearchEncoderDecoderOutput(
+                    sequences=input_ids,
+                    scores=scores,
+                    encoder_attentions=encoder_attentions,
+                    encoder_hidden_states=encoder_hidden_states,
+                    decoder_attentions=decoder_attentions,
+                    cross_attentions=cross_attentions,
+                    decoder_hidden_states=decoder_hidden_states,
+                    past_key_values=model_kwargs.get("past_key_values"),
+                )
+            else:
+                return GreedySearchDecoderOnlyOutput(
+                    sequences=input_ids,
+                    scores=scores,
+                    attentions=decoder_attentions,
+                    hidden_states=decoder_hidden_states,
+                    past_key_values=model_kwargs.get("past_key_values"),
+                )
+        else:
+            return input_ids
+
 
 # Copied from transformers.model.llama.modeling_llama.LlamaForSequenceClassification with Llama->InternLM2
 @add_start_docstrings(
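
The added `greedy_search` method is a copy of the greedy decoding loop from Transformers' generation utilities (plus the imports it needs), presumably so that greedy decoding and token streaming behave consistently for this custom-code model regardless of the installed library version. Below is a usage sketch, not part of the commit: the repo id and prompt are illustrative, and it relies on the fact that a `generate()` call with `num_beams=1` and `do_sample=False` dispatches to greedy search, while passing a streamer exercises the `streamer.put(...)` / `streamer.end()` hooks seen above.

    # Text-only greedy generation through the chat model's InternLM2 language model.
    import torch
    from transformers import AutoModel, AutoTokenizer, TextStreamer

    path = "OpenGVLab/InternVL-Chat-V1-5"  # illustrative repo id
    tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
    model = AutoModel.from_pretrained(path, torch_dtype=torch.bfloat16, trust_remote_code=True).eval()

    inputs = tokenizer("Briefly explain dynamic image tiling.", return_tensors="pt")
    streamer = TextStreamer(tokenizer, skip_prompt=True)
    model.language_model.generate(
        **inputs,
        max_new_tokens=64,
        do_sample=False,  # greedy decoding routes through greedy_search
        num_beams=1,
        streamer=streamer,
    )
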
modeling_internvl_chat.py CHANGED
@@ -26,7 +26,7 @@ logger = logging.get_logger(__name__)
 class InternVLChatModel(PreTrainedModel):
     config_class = InternVLChatConfig
     main_input_name = 'pixel_values'
-    _no_split_modules = ['InternVisionEncoderLayer', 'LlamaDecoderLayer', 'LlamaForCausalLM']
+    _no_split_modules = ['InternVisionEncoderLayer', 'LlamaDecoderLayer']
 
     def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None):
         super().__init__(config)
@@ -337,7 +337,6 @@ class InternVLChatModel(PreTrainedModel):
                 vit_embeds = visual_features
             else:
                 vit_embeds = self.extract_feature(pixel_values)
-
             input_embeds = self.language_model.get_input_embeddings()(input_ids)
             B, N, C = input_embeds.shape
             input_embeds = input_embeds.reshape(B * N, C)
@@ -345,7 +344,7 @@ class InternVLChatModel(PreTrainedModel):
             input_ids = input_ids.reshape(B * N)
             selected = (input_ids == self.img_context_token_id)
             assert selected.sum() != 0
-            input_embeds[selected] = vit_embeds.reshape(-1, C)
+            input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)
 
             input_embeds = input_embeds.reshape(B, N, C)
         else:
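
Both edits here relate to multi-GPU sharding: removing `LlamaForCausalLM` from `_no_split_modules` allows the language model to be placed layer by layer across devices instead of as one block, and the explicit `.to(input_embeds.device)` moves the vision features onto whichever device holds the token embeddings before they are scattered into the sequence. A minimal sketch of the sharded loading this supports follows; it is not part of the commit, the repo id is illustrative, and it assumes `accelerate` is installed for `device_map="auto"`.

    # Sharded loading sketch under the assumptions above.
    import torch
    from transformers import AutoModel

    model = AutoModel.from_pretrained(
        "OpenGVLab/InternVL-Chat-V1-5",
        torch_dtype=torch.bfloat16,
        device_map="auto",  # split the ViT and LLM blocks across available GPUs
        trust_remote_code=True,
    ).eval()
    # With the checkpoint split this way, vit_embeds can sit on a different GPU than the
    # input embeddings, which is the case the .to(input_embeds.device) cast handles.
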