ProfRom committed on
Commit
4d6c27c
·
verified ·
1 Parent(s): 235a4e1

Harper - Final Assignment submission

Browse files
.gitattributes CHANGED
@@ -56,3 +56,18 @@ images/6.png filter=lfs diff=lfs merge=lfs -text
56
  images/7.png filter=lfs diff=lfs merge=lfs -text
57
  images/8.png filter=lfs diff=lfs merge=lfs -text
58
  images/9.png filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  images/7.png filter=lfs diff=lfs merge=lfs -text
57
  images/8.png filter=lfs diff=lfs merge=lfs -text
58
  images/9.png filter=lfs diff=lfs merge=lfs -text
59
+ Photos/20240325_133025.jpg filter=lfs diff=lfs merge=lfs -text
60
+ Photos/20250106_075208.jpg filter=lfs diff=lfs merge=lfs -text
61
+ Photos/20250805_161445.jpg filter=lfs diff=lfs merge=lfs -text
62
+ Photos/20250808_131957.jpg filter=lfs diff=lfs merge=lfs -text
63
+ Photos/20250808_164442.jpg filter=lfs diff=lfs merge=lfs -text
64
+ Photos/20250808_183114.jpg filter=lfs diff=lfs merge=lfs -text
65
+ Photos/20250813_113228.jpg filter=lfs diff=lfs merge=lfs -text
66
+ Photos/20250918_170635.jpg filter=lfs diff=lfs merge=lfs -text
67
+ Photos/20250920_114728.jpg filter=lfs diff=lfs merge=lfs -text
68
+ Photos/20251101_155708.jpg filter=lfs diff=lfs merge=lfs -text
69
+ Photos/20251106_170359.jpg filter=lfs diff=lfs merge=lfs -text
70
+ Photos/20251106_192036.jpg filter=lfs diff=lfs merge=lfs -text
71
+ Photos/20251107_100830.jpg filter=lfs diff=lfs merge=lfs -text
72
+ Photos/20251107_101822.jpg filter=lfs diff=lfs merge=lfs -text
73
+ Photos/20251107_150015.jpg filter=lfs diff=lfs merge=lfs -text
Photos/20240325_133025.jpg ADDED

Git LFS Details

  • SHA256: 43473e22b5101c66dc443854db3d918236eecb5dbab43408fc3293f1bb1d2a35
  • Pointer size: 132 Bytes
  • Size of remote file: 3.04 MB
Photos/20250106_075208.jpg ADDED

Git LFS Details

  • SHA256: aed82d8ec88d514b97b54c984ffaabbb731644e419986c618f9de5ac94b099c2
  • Pointer size: 132 Bytes
  • Size of remote file: 2.15 MB
Photos/20250805_161445.jpg ADDED

Git LFS Details

  • SHA256: 2824930ff93957a55f173e9cdf3a49029fbb03f54c5469fc602ee257be3b31e8
  • Pointer size: 132 Bytes
  • Size of remote file: 7.87 MB
Photos/20250808_131957.jpg ADDED

Git LFS Details

  • SHA256: 6472060645368c66a91d391ee9fe11f2a8b2fc2c80894d096c18d050e0cf9775
  • Pointer size: 132 Bytes
  • Size of remote file: 8.04 MB
Photos/20250808_164442.jpg ADDED

Git LFS Details

  • SHA256: e59baaf4fbcfbe4b54c8b419fcfe0c09ebf079dfe02093b03efa380846a3967a
  • Pointer size: 133 Bytes
  • Size of remote file: 10.1 MB
Photos/20250808_183114.jpg ADDED

Git LFS Details

  • SHA256: 8889daa1da199cfd0381551f7297f82e787141c527a0d8f8a5beaadb712cd945
  • Pointer size: 132 Bytes
  • Size of remote file: 6.06 MB
Photos/20250813_113228.jpg ADDED

Git LFS Details

  • SHA256: aa96808ab506c3b165c433ac96ecd4adbdfa32f9457ac7a3d6ecc522cd6b3c48
  • Pointer size: 132 Bytes
  • Size of remote file: 4.94 MB
Photos/20250918_170635.jpg ADDED

Git LFS Details

  • SHA256: 32da19879f8ddaa46bb993f1dfa610aa87712876d43fd5ab1863ade567a35440
  • Pointer size: 132 Bytes
  • Size of remote file: 3.59 MB
Photos/20250920_114728.jpg ADDED

Git LFS Details

  • SHA256: c3922a218aa9d53a2d2bf9e16b1d84e7c2843a3e1226427078f43c5736c0678b
  • Pointer size: 132 Bytes
  • Size of remote file: 8.2 MB
Photos/20251101_155708.jpg ADDED

Git LFS Details

  • SHA256: 301a997e6b34879e7e94dfb460c17afb62b57b1280837cd588e0f7c76017600f
  • Pointer size: 133 Bytes
  • Size of remote file: 11.9 MB
Photos/20251106_170359.jpg ADDED

Git LFS Details

  • SHA256: fd7638dfd63984249b985960917a2c99a4d00fe5ba37b2dd61c04376e962d5ec
  • Pointer size: 132 Bytes
  • Size of remote file: 6.29 MB
Photos/20251106_192036.jpg ADDED

Git LFS Details

  • SHA256: e70e7ee6d3cc39f04220acd49af2e4918476759e198432aa4fa6ebc060d3bb67
  • Pointer size: 132 Bytes
  • Size of remote file: 5.14 MB
Photos/20251107_100830.jpg ADDED

Git LFS Details

  • SHA256: 5baaeedc84ab3d301755467771bd0992bcb4fa9228f3af40b1d48ab462924dc4
  • Pointer size: 133 Bytes
  • Size of remote file: 10 MB
Photos/20251107_101822.jpg ADDED

Git LFS Details

  • SHA256: 6c0a44626bca6e76c56d4ebe0737fefc6f22b3f1f5d6d99dc289e66b2803367c
  • Pointer size: 132 Bytes
  • Size of remote file: 7.39 MB
Photos/20251107_150015.jpg ADDED

Git LFS Details

  • SHA256: 68e661c821415197be6a4685d00f4841ea69f6abc155d5cfbc77e5427dd870d4
  • Pointer size: 132 Bytes
  • Size of remote file: 5.17 MB
app.py CHANGED
@@ -1,41 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
- from transformers import Blip2Processor, Blip2ForConditionalGeneration
3
- import torch
4
 
5
- # Load pre-trained BLIP-2 model and processor
6
- processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
7
- model = Blip2ForConditionalGeneration.from_pretrained(
8
- "Salesforce/blip2-opt-2.7b",
9
- torch_dtype=torch.float16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  )
11
 
12
- def predict(image, question=None):
13
- # If no question provided, generate a caption
14
- if question is None or question.strip() == "":
15
- inputs = processor(image, return_tensors="pt")
16
- else:
17
- inputs = processor(image, question, return_tensors="pt")
18
-
19
- # Move to GPU if available
20
- device = "cuda" if torch.cuda.is_available() else "cpu"
21
- inputs = inputs.to(device)
22
- model.to(device)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
- # Generate output
25
- out = model.generate(**inputs, max_new_tokens=50)
26
- result = processor.decode(out[0], skip_special_tokens=True)
27
- return result
28
-
29
- # Gradio interface
30
- iface = gr.Interface(
31
- fn=predict,
32
- inputs=[
33
- gr.Image(type="pil", label="Upload Image"),
34
- gr.Textbox(label="Optional Question", placeholder="Ask something about the image...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  ],
36
- outputs=gr.Textbox(label="Result"),
37
- title="BLIP-2 Multimodal Assistant",
38
- description="Upload an image and get a caption. Optionally, ask a question about the image."
39
  )
40
 
41
- iface.launch()
 
 
1
+ # Caption Generator w/English-to-Spanish Translation
2
+ # A. Harper | ARIN 460 | December 2025
3
+
4
+ # Load into Hugging Face Space (using the Gradio Framework)
5
+ # Include requirements.txt file (list: gradio, pandas, torch, sentencepiece, tensorflow, Image, transformers)
6
+
7
+ # To run, navigate to the App tab. Click the red Generate button.
8
+ # The app will randomly select an image, generate an (English) caption,
9
+ # then generate Spanish translation.
10
+
11
+
12
+ # Import gradio - app framework
13
  import gradio as gr
 
 
14
 
15
+
16
+ # Two image datasources are available.
17
+ # To switch between datasources, make minor adjustments (add/remove # to deactivate/activate the relevant lines).
18
+ # AA comments refer to images in the DataFrame (loaded from the COCO dataset)
19
+ # BB comments refer to images stored in local Gradio app folder
20
+
21
+
22
+ # Import os and random to support random selection of image (from folder)
23
+ import os
24
+ import random
25
+
26
+
27
+ # Import pandas, datasets, transformers, torch
28
+ import pandas as pd
29
+
30
+ from datasets import load_dataset
31
+
32
+ from transformers import (
33
+ BlipProcessor,
34
+ BlipForConditionalGeneration,
35
+ AutoTokenizer,
36
+ AutoModelForSeq2SeqLM,
37
+ MarianMTModel,
38
+ MarianTokenizer
39
  )
40
 
41
+ from PIL import Image
42
+ import torch
43
+
44
+
45
+ # AA: Load dataset. Initial image source.
46
+ #Load dataset (henryscheible/coco_val2014_tiny)
47
+ dataset = load_dataset("henryscheible/coco_val2014_tiny", split="validation")
48
+
49
+
50
+ # Reduce dataset to 20 rows, i.e., get sample
51
+ samples = dataset.select(range(20))
52
+
53
+
54
+ #Convert to dataframe
55
+ df = pd.DataFrame(samples)
56
+
57
+
58
+ # BB: Direct to Photos folder
59
+ IMAGE_FOLDER = "Photos"
60
+
61
+ image_paths = [
62
+ os.path.join(IMAGE_FOLDER, f)
63
+ for f in os.listdir(IMAGE_FOLDER)
64
+ if f.lower().endswith((".jpg", ".jpeg", ".png"))
65
+ ]
66
+
67
+ #Load the image captioning model (Salesforce/blip-image-captioning-large)
68
+ processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
69
+ model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
70
+
71
+
72
+ #Load transformer for translating captions from English to Spanish
73
+ model_name = "Helsinki-NLP/opus-mt-en-es"
74
+ trans_tokenizer = MarianTokenizer.from_pretrained(model_name)
75
+ trans_model = MarianMTModel.from_pretrained(model_name)
76
+
77
+
78
+ #Configure captioning function
79
+
80
+ def caption_random_image():
81
+
82
+
83
+ # AA: pick random row - from DF
84
+ ##sample = df.sample(1).iloc[0]
85
+
86
+
87
+ # BB: Pick a random image path - image from folder
88
+ img_path = random.choice(image_paths)
89
+
90
+
91
+ # BB: Load into PIL - image from folder
92
+ image = Image.open(img_path).convert("RGB")
93
+
94
 
95
+ # AA: Image - for DF
96
+ ##image = sample["image"]
97
+
98
+
99
+ # Unconditional image captioning
100
+ inputs = processor(image, return_tensors="pt")
101
+
102
+
103
+ out = model.generate(**inputs)
104
+ caption_eng = processor.decode(out[0], skip_special_tokens=True)
105
+
106
+
107
+ # Translate caption from English to Spanish
108
+ trans_inputs = trans_tokenizer.encode(caption_eng, return_tensors="pt")
109
+ trans_out = trans_model.generate(trans_inputs)
110
+ caption_es = trans_tokenizer.decode(trans_out[0], skip_special_tokens=True)
111
+
112
+
113
+ return image, caption_eng, caption_es
114
+
115
+
116
+
117
+
118
+ demo = gr.Interface(
119
+ fn=caption_random_image,
120
+ inputs=None,
121
+ outputs=[
122
+ gr.Image(type="pil", label="Random Image"),
123
+ gr.Textbox(label="Caption (English)"),
124
+ gr.Textbox(label="Caption (Spanish)")
125
  ],
126
+ title="Image Captioning (with English to Spanish translation)",
127
+ description="Selects a random image (from either the local folder or henryscheible/coco data subset); generates a BLIP caption; then translates the (English) caption to Spanish."
 
128
  )
129
 
130
+
131
+ demo.launch()
requirements.txt CHANGED
@@ -1,4 +1,8 @@
1
- gradio>=4.0
2
- transformers>=4.30
3
  torch
4
- pillow
 
 
 
 
 
1
+ gradio
2
+ pandas
3
  torch
4
+ sentencepiece
5
+ tensorflow
6
+ Image
7
+ transformers
8
+