Upload 5 files
- README (1).md +82 -0
- config.json +66 -0
- generation_config.json +10 -0
- preprocessor_config.json +14 -0
- tokenizer.json +0 -0
README (1).md
ADDED
@@ -0,0 +1,82 @@
---
inference: false
language:
- en
tags:
- LLaMA
- MultiModal
---

*This is a Hugging Face friendly model; the original can be found at https://huggingface.co/liuhaotian/llava-llama-2-7b-chat-lightning-lora-preview*
<br>

# LLaVA Model Card

## Model details

**Model type:**
LLaVA is an open-source chatbot trained by fine-tuning LLaMA/Vicuna on GPT-generated multimodal instruction-following data.
It is an auto-regressive language model based on the transformer architecture.

**Model date:**
LLaVA-LLaMA-2-7B-Chat-LoRA-Preview was trained in July 2023.

**Paper or resources for more information:**
https://llava-vl.github.io/

## License
Llama 2 is licensed under the LLAMA 2 Community License,
Copyright (c) Meta Platforms, Inc. All Rights Reserved.

**Where to send questions or comments about the model:**
https://github.com/haotian-liu/LLaVA/issues

## Intended use
**Primary intended uses:**
The primary use of LLaVA is research on large multimodal models and chatbots.

**Primary intended users:**
The primary intended users of the model are researchers and hobbyists in computer vision, natural language processing, machine learning, and artificial intelligence.

## Training dataset
- 558K filtered image-text pairs from LAION/CC/SBU, captioned by BLIP.
- 80K GPT-generated multimodal instruction-following examples.

## Evaluation dataset
A preliminary evaluation of model quality is conducted on a set of 90 visual reasoning questions built from 30 unique images randomly sampled from COCO val 2014; each image is paired with three question types: conversational, detailed description, and complex reasoning. GPT-4 is used to judge the model outputs.
We also evaluate the model on the ScienceQA dataset; its synergy with GPT-4 sets a new state of the art on that benchmark.
See https://llava-vl.github.io/ for more details.
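As a rough illustration of the GPT-4 judging setup described above (a hedged sketch, not code from the LLaVA repository), the snippet below assembles a judge prompt for one question and parses a leading 1-10 score; `ask_gpt4` is a hypothetical callable standing in for whatever GPT-4 client is used.

```python
# Hypothetical sketch of the GPT-4-as-judge protocol described in the text above.
# `ask_gpt4` is a placeholder callable, not a helper shipped with this repository.
def build_judge_prompt(question: str, reference: str, answer: str) -> str:
    return (
        "You are a precise assistant for checking the quality of an answer.\n\n"
        f"[Question]\n{question}\n\n"
        f"[Reference Answer]\n{reference}\n\n"
        f"[Assistant Answer]\n{answer}\n\n"
        "Rate the assistant answer from 1 to 10. Reply with the number first."
    )

def judge(question: str, reference: str, answer: str, ask_gpt4) -> int:
    reply = ask_gpt4(build_judge_prompt(question, reference, answer))
    return int(reply.split()[0])  # the score is expected as the first token
```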
## Usage
Usage is as follows:

```python
from transformers import LlavaProcessor, LlavaForCausalLM
from PIL import Image
import requests
import torch

PATH_TO_CONVERTED_WEIGHTS = "shauray/Llava-Llama-2-7B-hf"

# Load the model in half precision on the GPU, along with its processor.
model = LlavaForCausalLM.from_pretrained(
    PATH_TO_CONVERTED_WEIGHTS, torch_dtype=torch.float16
).to("cuda")
processor = LlavaProcessor.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)

# Fetch an example image and pair it with a text prompt.
url = "https://llava-vl.github.io/static/images/view.jpg"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
prompt = "How can you best describe this image?"

inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda", torch.float16)

# Generate a response and decode only the newly generated tokens.
generate_ids = model.generate(
    **inputs,
    do_sample=True,
    max_length=1024,
    temperature=0.1,
    top_p=0.9,
)
out = processor.decode(
    generate_ids[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True
).strip()

print(out)

"""The photograph shows a wooden dock floating on the water, with mountains in the background. It is an idyllic scene that captures both
nature and human-made structures at their finest moments of beauty or tranquility depending upon one's perspective as they gaze into it"""
```
config.json
ADDED
@@ -0,0 +1,66 @@
{
  "_commit_hash": null,
  "_name_or_path": "shauray/Llava-Llama-2-7b-hf",
  "model_type": "llava",
  "architectures": [
    "LlavaForCausalLM"
  ],
  "llama_config": {
    "_name_or_path": "",
    "bos_token_id": 1,
    "eos_token_id": 2,
    "hidden_act": "silu",
    "hidden_size": 4096,
    "initializer_range": 0.02,
    "intermediate_size": 11008,
    "max_position_embeddings": 4096,
    "model_type": "llama",
    "num_attention_heads": 32,
    "num_hidden_layers": 32,
    "num_key_value_heads": 32,
    "pretraining_tp": 1,
    "rms_norm_eps": 1e-06,
    "rope_scaling": null,
    "tie_word_embeddings": false,
    "torch_dtype": "float16",
    "transformers_version": "4.32.0.dev0",
    "use_cache": true,
    "vocab_size": 32000
  },
  "llava_vision_config": {
    "_name_or_path": "",
    "bos_token_id": 1,
    "eos_token_id": 2,
    "freeze_mm_mlp_adapter": true,
    "hidden_act": "silu",
    "hidden_size": 4096,
    "image_aspect_ratio": "square",
    "image_grid_pinpoints": null,
    "initializer_range": 0.02,
    "intermediate_size": 11008,
    "max_position_embeddings": 2048,
    "mm_hidden_size": 1024,
    "mm_resampler_type": null,
    "mm_use_im_patch_token": false,
    "mm_use_im_start_end": false,
    "mm_vision_select_feature": "patch",
    "mm_vision_select_layer": -2,
    "mm_vision_tower": "openai/clip-vit-large-patch14",
    "model_type": "llava",
    "num_attention_heads": 32,
    "num_hidden_layers": 32,
    "num_key_value_heads": 32,
    "pad_token_id": 0,
    "pretraining_tp": 1,
    "rms_norm_eps": 1e-06,
    "rope_scaling": null,
    "tie_word_embeddings": false,
    "torch_dtype": "float16",
    "transformers_version": "4.31.0",
    "tune_mm_mlp_adapter": false,
    "tune_mm_vision_resampler": false,
    "use_cache": false,
    "use_mm_proj": true,
    "vocab_size": 32000
  }
}
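The two nested blocks above (`llama_config` for the language model, `llava_vision_config` for the CLIP vision tower and multimodal projector) can be inspected straight from the Hub. A minimal sketch, assuming the repo id from the model card and that `huggingface_hub` is installed:

```python
import json
from huggingface_hub import hf_hub_download

# Assumed repo id from the model card; downloading config.json requires Hub access.
config_path = hf_hub_download("shauray/Llava-Llama-2-7B-hf", "config.json")
with open(config_path) as f:
    config = json.load(f)

# The language-model and vision-projection settings live in separate sub-configs.
print(config["architectures"])                                  # ['LlavaForCausalLM']
print(config["llama_config"]["hidden_size"])                    # 4096
print(config["llava_vision_config"]["mm_vision_tower"])         # openai/clip-vit-large-patch14
print(config["llava_vision_config"]["mm_vision_select_layer"])  # -2
```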
generation_config.json
ADDED
@@ -0,0 +1,10 @@
{
  "bos_token_id": 1,
  "do_sample": true,
  "eos_token_id": 2,
  "max_length": 4096,
  "pad_token_id": 0,
  "temperature": 0.6,
  "top_p": 0.9,
  "transformers_version": "4.32.0.dev0"
}
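These are the sampling defaults that `model.generate()` falls back to when no overrides are passed; the README example above overrides them with `temperature=0.1` and `max_length=1024`. A minimal sketch of reading them back, again assuming the repo id from the model card:

```python
from transformers import GenerationConfig

# Assumed repo id from the model card; mirrors generation_config.json above.
gen_config = GenerationConfig.from_pretrained("shauray/Llava-Llama-2-7B-hf")
print(gen_config.do_sample, gen_config.temperature, gen_config.top_p, gen_config.max_length)
# True 0.6 0.9 4096
```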
preprocessor_config.json
ADDED
@@ -0,0 +1,14 @@
{
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [0.48145466, 0.4578275, 0.40821073],
  "image_processor_type": "CLIPImageProcessor",
  "tokenizer_class": "LlamaTokenizer",
  "image_std": [0.26862954, 0.26130258, 0.27577711],
  "processor_class": "LlavaProcessor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": 224
}
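In words: convert to RGB, bicubically resize (`resample: 3` is PIL's bicubic filter) to 224, rescale pixel values by 1/255, and normalize with the CLIP mean/std. The sketch below is a simplified manual equivalent of what these fields mean, not `CLIPImageProcessor`'s exact code path (which also handles aspect-ratio resizing and cropping):

```python
import numpy as np
from PIL import Image

# CLIP normalization constants, copied from preprocessor_config.json above.
CLIP_MEAN = np.array([0.48145466, 0.4578275, 0.40821073], dtype=np.float32)
CLIP_STD = np.array([0.26862954, 0.26130258, 0.27577711], dtype=np.float32)

def preprocess(image: Image.Image) -> np.ndarray:
    image = image.convert("RGB").resize((224, 224), resample=Image.BICUBIC)  # resample=3
    pixels = np.asarray(image).astype(np.float32) * 0.00392156862745098      # rescale by 1/255
    pixels = (pixels - CLIP_MEAN) / CLIP_STD                                 # normalize
    return pixels.transpose(2, 0, 1)                                         # HWC -> CHW
```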
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff