Update README.md

README.md (CHANGED)
@@ -3,7 +3,7 @@ license: bsd-3-clause
 tags:
 - image-captioning
 datasets:
-- unography/laion-
+- unography/laion-14k-GPT4V-LIVIS-Captions
 pipeline_tag: image-to-text
 languages:
 - en
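For orientation, the front matter after this change should read roughly as follows. This is reconstructed only from the context lines shown in the hunk above (the `license` field comes from the hunk header); the `---` delimiters and any fields outside the shown context are omitted:

```yaml
license: bsd-3-clause
tags:
- image-captioning
datasets:
- unography/laion-14k-GPT4V-LIVIS-Captions
pipeline_tag: image-to-text
languages:
- en
```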
@@ -50,7 +50,7 @@ inputs = processor(raw_image, return_tensors="pt")
 pixel_values = inputs.pixel_values
 out = model.generate(pixel_values=pixel_values, max_length=250, num_beams=3, repetition_penalty=2.5)
 print(processor.decode(out[0], skip_special_tokens=True))
->>> a woman sitting on
+>>> a woman sitting on the sand, interacting with a dog wearing a blue and white checkered collar. the dog is positioned to the left of the woman, who is holding something in their hand. the background features a serene beach setting with waves crashing onto the shore. there are no other animals or people visible in the image. the time of day appears to be either early morning or late afternoon, based on the lighting and shadows.

 ```
 </details>
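To see the changed line in context, here is the CPU snippet assembled end to end from this hunk's context lines and the load lines in the hunks below. This is a sketch: the checkpoint name is taken from the updated `+` lines, and the CPU variant is assumed to load the model without a device move (that line is not shown in this diff):

```python
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# Load the processor and the long-caption checkpoint on CPU
# (checkpoint name per the updated README; no .to("cuda") here).
processor = BlipProcessor.from_pretrained("unography/blip-long-cap")
model = BlipForConditionalGeneration.from_pretrained("unography/blip-long-cap")

img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')

inputs = processor(raw_image, return_tensors="pt")
pixel_values = inputs.pixel_values
# Beam search with a repetition penalty; max_length=250 allows long captions.
out = model.generate(pixel_values=pixel_values, max_length=250, num_beams=3, repetition_penalty=2.5)
print(processor.decode(out[0], skip_special_tokens=True))
```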
@@ -67,8 +67,8 @@ import requests
 from PIL import Image
 from transformers import BlipProcessor, BlipForConditionalGeneration

-processor = BlipProcessor.from_pretrained("unography/blip-
-model = BlipForConditionalGeneration.from_pretrained("unography/blip-
+processor = BlipProcessor.from_pretrained("unography/blip-long-cap")
+model = BlipForConditionalGeneration.from_pretrained("unography/blip-long-cap").to("cuda")

 img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
 raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
@@ -77,7 +77,7 @@ inputs = processor(raw_image, return_tensors="pt").to("cuda")
 pixel_values = inputs.pixel_values
 out = model.generate(pixel_values=pixel_values, max_length=250, num_beams=3, repetition_penalty=2.5)
 print(processor.decode(out[0], skip_special_tokens=True))
->>> a woman sitting on
+>>> a woman sitting on the sand, interacting with a dog wearing a blue and white checkered collar. the dog is positioned to the left of the woman, who is holding something in their hand. the background features a serene beach setting with waves crashing onto the shore. there are no other animals or people visible in the image. the time of day appears to be either early morning or late afternoon, based on the lighting and shadows.
 ```
 </details>

@@ -92,8 +92,8 @@ import requests
 from PIL import Image
 from transformers import BlipProcessor, BlipForConditionalGeneration

-processor = BlipProcessor.from_pretrained("unography/blip-
-model = BlipForConditionalGeneration.from_pretrained("unography/blip-
+processor = BlipProcessor.from_pretrained("unography/blip-long-cap")
+model = BlipForConditionalGeneration.from_pretrained("unography/blip-long-cap", torch_dtype=torch.float16).to("cuda")

 img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
 raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
@@ -102,6 +102,6 @@ inputs = processor(raw_image, return_tensors="pt").to("cuda", torch.float16)
 pixel_values = inputs.pixel_values
 out = model.generate(pixel_values=pixel_values, max_length=250, num_beams=3, repetition_penalty=2.5)
 print(processor.decode(out[0], skip_special_tokens=True))
->>> a woman sitting on
+>>> a woman sitting on the sand, interacting with a dog wearing a blue and white checkered collar. the dog is positioned to the left of the woman, who is holding something in their hand. the background features a serene beach setting with waves crashing onto the shore. there are no other animals or people visible in the image. the time of day appears to be either early morning or late afternoon, based on the lighting and shadows.
 ```
 </details>
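Likewise, the half-precision GPU variant assembled end to end. This is a sketch based on the context lines above; `import torch` is assumed to appear in the unchanged part of the snippet, since the context references `torch.float16`:

```python
import torch
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

processor = BlipProcessor.from_pretrained("unography/blip-long-cap")
# Load the weights in float16 and move them to the GPU,
# matching the updated "+" lines in the hunk above.
model = BlipForConditionalGeneration.from_pretrained("unography/blip-long-cap", torch_dtype=torch.float16).to("cuda")

img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg'
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')

# Cast the image tensors to float16 on the GPU as well.
inputs = processor(raw_image, return_tensors="pt").to("cuda", torch.float16)
pixel_values = inputs.pixel_values
out = model.generate(pixel_values=pixel_values, max_length=250, num_beams=3, repetition_penalty=2.5)
print(processor.decode(out[0], skip_special_tokens=True))
```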
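As a side note, the checkpoint's `pipeline_tag: image-to-text` metadata suggests it can also be driven through the high-level `pipeline` API. This is a minimal sketch, not part of the README being changed; `device=0` assumes a single CUDA GPU is available:

```python
from transformers import pipeline

# image-to-text pipeline around the same checkpoint.
captioner = pipeline("image-to-text", model="unography/blip-long-cap", device=0)

# Pass the generation settings from the README snippets through generate_kwargs.
result = captioner(
    "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg",
    generate_kwargs={"max_length": 250, "num_beams": 3, "repetition_penalty": 2.5},
)
print(result[0]["generated_text"])
```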