Sudeep s committed on
Commit
62dc913
·
1 Parent(s): 273ecc2

changes to code

Browse files
Files changed (3) hide show
  1. app.py +82 -0
  2. examples/text-image-1.jpg +0 -0
  3. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ import torch
4
+ from PIL import Image
5
+ import spaces
6
+ from transformers import MllamaForConditionalGeneration, AutoProcessor
7
+ import os
8
+ from huggingface_hub import login
9
+
10
+ #huggingface_token = os.getenv("SECRET_ENV_VARIABLE")
11
+ #login(huggingface_token)
12
+
13
+ # Load the Llama 3.2 Vision Model
14
# Process-wide cache for the loaded (model, processor) pair, so repeated calls
# to load_llama_model() do not reload the checkpoint from scratch.
_LLAMA_CACHE = {}


def load_llama_model():
    """Return the Llama 3.2 Vision model and its processor, loading them once.

    The first call loads ``meta-llama/Llama-3.2-11B-Vision`` in bfloat16 with
    ``device_map="auto"`` (offloading to the ``offload`` folder when needed),
    ties the model weights, and loads the matching processor. Later calls in
    the same process return the cached objects instead of loading again.

    Returns:
        tuple: ``(model, processor)``.
    """
    if "bundle" not in _LLAMA_CACHE:
        model_id = "meta-llama/Llama-3.2-11B-Vision"

        # Load model and processor
        model = MllamaForConditionalGeneration.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            offload_folder="offload",
        )
        model.tie_weights()
        processor = AutoProcessor.from_pretrained(model_id)

        # Only store once both loads succeeded, so a failed load is retried
        # on the next call rather than caching a partial result.
        _LLAMA_CACHE["bundle"] = (model, processor)

    return _LLAMA_CACHE["bundle"]
28
+
29
+ # Function to generate predictions for text and image
30
@spaces.GPU
def process_input(text, image=None):
    """Generate a model continuation for a text prompt, optionally with an image.

    Args:
        text: Prompt text. ``None`` is treated as an empty string.
        image: Optional PIL image. When given, it is converted to RGB and
            resized to 224x224 before being passed to the processor together
            with the prompt.

    Returns:
        str: The first generated sequence decoded with special tokens
        skipped (at most 50 new tokens are generated).
    """
    model, processor = load_llama_model()

    # Avoid rendering a missing text value as the literal string "None".
    text = text or ""

    if image is not None:
        # If an image is uploaded, process it as a PIL Image object.
        # NOTE(review): the processor presumably does its own resizing; the
        # fixed 224x224 resize is kept from the original - confirm it is wanted.
        vision_input = image.convert("RGB").resize((224, 224))
        prompt = f"<|image|><|begin_of_text|>{text}"

        # Process image and text together (keywords keep the two unambiguous).
        inputs = processor(images=vision_input, text=prompt, return_tensors="pt")
    else:
        # If no image is uploaded, just process the text. It must be passed
        # by keyword: the processor's first positional parameter is the
        # image input, so a positional prompt would be treated as an image.
        prompt = f"<|begin_of_text|>{text}"
        inputs = processor(text=prompt, return_tensors="pt")

    inputs = inputs.to(model.device)

    # Generate output from the model
    outputs = model.generate(**inputs, max_new_tokens=50)

    # Decode the output to return a readable text
    return processor.decode(outputs[0], skip_special_tokens=True)
54
+
55
def demo():
    """Build the Gradio interface around ``process_input`` and launch it.

    The interface takes a text box and a PIL image upload and shows the
    model output in a text box. Bundled examples are offered only when their
    image file is present on disk, so a missing example image does not get
    handed to Gradio.
    """
    # Define Gradio input and output components
    text_input = gr.Textbox(label="Text Input", placeholder="Enter text here", lines=5)
    image_input = gr.Image(label="Upload an Image", type="pil")
    output = gr.Textbox(label="Model Output", lines=3)

    # Candidate examples for multimodal analysis: [prompt, image path]
    candidate_examples = [
        ["The llama is ", "./examples/llama.png"],
        ["The cute hampster is wearing ", "./examples/hampster.png"],
    ]
    # Keep only the examples whose image actually exists.
    examples = [
        example for example in candidate_examples if os.path.isfile(example[1])
    ]

    # Define the interface layout
    interface = gr.Interface(
        fn=process_input,
        inputs=[text_input, image_input],
        outputs=output,
        # Pass None rather than an empty list when no example is available.
        examples=examples or None,
        title="Llama 3.2 Multimodal Text-Image Analyzer",
        description="Upload an image and/or provide text for analysis using the Llama 3.2 Vision Model. You can also try out the provided examples.",
    )

    # Launch the demo
    interface.launch()


# Run the demo
if __name__ == "__main__":
    demo()
examples/text-image-1.jpg ADDED
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ torch
2
+ Pillow
3
+ spaces
4
+ git+https://github.com/huggingface/transformers.git
5
+ accelerate>=0.26.0