arjunanand13 committed on
Commit
8c1ff5e
1 Parent(s): 985ebc1

Upload 7 files

.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+american_football.png filter=lfs diff=lfs merge=lfs -text
+bike.png filter=lfs diff=lfs merge=lfs -text
+finance.png filter=lfs diff=lfs merge=lfs -text
+science.png filter=lfs diff=lfs merge=lfs -text
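These patterns are what `git lfs track` appends to .gitattributes; reproducing the four new entries locally would look something like the following (a sketch, assuming the git-lfs extension is installed):

git lfs track "american_football.png" "bike.png" "finance.png" "science.png"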
american_football.png ADDED

Git LFS Details

  • SHA256: dc604236a1bac1e11a0712add4f4ed00f2d3ab3cd6fe6beebd5ad9862c22e7e9
  • Pointer size: 132 Bytes
  • Size of remote file: 1.28 MB
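The pointer size above is the byte length of the small text stub Git commits in place of the image itself. For this file the stub would look roughly like the sketch below: the version and oid lines follow from the details above, while the size value (the exact byte count of the remote file) is illustrative, approximating the reported 1.28 MB:

version https://git-lfs.github.com/spec/v1
oid sha256:dc604236a1bac1e11a0712add4f4ed00f2d3ab3cd6fe6beebd5ad9862c22e7e9
size 1342177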
bike.png ADDED

Git LFS Details

  • SHA256: a346b2e0d280cbd561bf3bf5c1ee30965f6eaffff9899fa58fe9fbdeb3d11325
  • Pointer size: 132 Bytes
  • Size of remote file: 1.38 MB
finance.png ADDED

Git LFS Details

  • SHA256: b5012040fc8a6cb84d696dbe4b2883f39f87729824a4932624f70c909e9de2c1
  • Pointer size: 132 Bytes
  • Size of remote file: 1.55 MB
gradio_main.py ADDED
@@ -0,0 +1,80 @@
+import os
+import subprocess
+
+import gradio as gr
+import torch
+from peft import LoraConfig
+from transformers import AutoProcessor, BitsAndBytesConfig, Idefics2ForConditionalGeneration
+
+# Install flash-attn at startup; the env flag skips its CUDA build step.
+# Merging os.environ keeps PATH and friends visible to the subprocess.
+subprocess.run('pip install flash-attn --no-build-isolation',
+               env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
+
+DEVICE = "cuda:0"
+USE_LORA = False
+USE_QLORA = True
+
+processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b", do_image_splitting=False)
+
+if USE_QLORA or USE_LORA:
+    # LoRA adapters over the text model, modality projection, and perceiver resampler.
+    lora_config = LoraConfig(
+        r=8,
+        lora_alpha=8,
+        lora_dropout=0.1,
+        target_modules='.*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*',
+        use_dora=not USE_QLORA,  # DoRA only when the base weights are not quantized
+        init_lora_weights="gaussian",
+    )
+    # 4-bit NF4 quantization for QLoRA; bitsandbytes places the model on the GPU.
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_compute_dtype=torch.float16,
+    ) if USE_QLORA else None
+    model = Idefics2ForConditionalGeneration.from_pretrained(
+        "HuggingFaceM4/idefics2-8b",
+        torch_dtype=torch.float16,
+        quantization_config=bnb_config,
+    )
+    model.add_adapter(lora_config)
+    model.enable_adapters()
+else:
+    model = Idefics2ForConditionalGeneration.from_pretrained(
+        "HuggingFaceM4/idefics2-8b",
+        torch_dtype=torch.float16,
+        _attn_implementation="flash_attention_2",
+    ).to(DEVICE)
+
+
+def model_inference(image, text):
+    resulting_messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
+    prompt = processor.apply_chat_template(resulting_messages, add_generation_prompt=True)
+    inputs = processor(text=prompt, images=[image], return_tensors="pt")
+    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
+
+    generated_ids = model.generate(
+        **inputs,
+        max_new_tokens=1024,     # room for extended answers
+        temperature=0.3,
+        do_sample=True,          # sample for slightly more varied output
+        top_p=0.7,               # nucleus sampling: focused yet diverse
+        # num_beams=5,           # alternative: beam search with 5 beams
+        num_return_sequences=1,  # return a single sequence
+    )
+    # Decode only the newly generated tokens, dropping the echoed prompt.
+    generated_text = processor.batch_decode(generated_ids[:, inputs["input_ids"].size(1):], skip_special_tokens=True)
+    return generated_text[0]
+
+
+with gr.Blocks() as demo:
+    gr.Markdown("## Enhanced IDEFICS2 Demo")
+    image_input = gr.Image(label="Upload Image", type="pil", height=480, width=640)
+    query_input = gr.Textbox(label="Enter Prompt")
+    submit_btn = gr.Button("Generate")
+    output = gr.Textbox(label="Model Output")
+    submit_btn.click(model_inference, inputs=[image_input, query_input], outputs=output)
+
+    examples = [
+        ["example_images/american_football.png", "Explain in detail what is depicted in the picture"],
+        ["example_images/bike.png", "Explore the image closely and describe in detail what you discover."],
+        ["example_images/finance.png", "Provide a detailed description of everything you see in the image."],
+        ["example_images/science.png", "Please perform optical character recognition (OCR) on the uploaded image. Extract all text visible in the image accurately. Ensure to capture the text in its entirety and maintain the formatting as closely as possible to how it appears in the image. After extracting the text, display it in a clear and readable format, making sure that any special characters or symbols are also accurately represented. Provide the extracted text as output."],
+        ["example_images/spirituality.png", "Please perform optical character recognition (OCR) on the uploaded image. Extract all text visible in the image accurately. Ensure to capture the text in its entirety and maintain the formatting as closely as possible to how it appears in the image. After extracting the text, display it in a clear and readable format, making sure that any special characters or symbols are also accurately represented. Provide the extracted text as output."],
+    ]
+    gr.Examples(examples=examples, inputs=[image_input, query_input], outputs=output)
+
+demo.launch(debug=True)
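Once the app is running, it can be exercised programmatically with the gradio_client package (not part of this commit, and its calling conventions vary across gradio versions; the endpoint name below is the auto-generated default and is worth confirming with client.view_api()):

from gradio_client import Client, handle_file

client = Client("http://127.0.0.1:7860")  # demo.launch() prints the local URL
result = client.predict(
    handle_file("example_images/american_football.png"),  # image_input
    "Explain in detail what is depicted in the picture",  # query_input
    api_name="/model_inference",  # assumed auto-generated endpoint name
)
print(result)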
requirements.txt ADDED
@@ -0,0 +1,6 @@
+git+https://github.com/huggingface/transformers.git
+gradio
+pillow
+torch
+peft
+bitsandbytes
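Two packages the script relies on do not appear here: flash-attn is installed at runtime by gradio_main.py itself, and accelerate, which transformers requires for the 4-bit quantization_config path, is assumed to ship with the runtime image. A minimal sanity check, as a sketch:

import importlib.util

# Confirm the 4-bit loading path has everything it needs; `accelerate` is
# assumed to come from the runtime image rather than requirements.txt.
for pkg in ("transformers", "peft", "bitsandbytes", "accelerate"):
    status = "ok" if importlib.util.find_spec(pkg) else "MISSING"
    print(f"{pkg}: {status}")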
science.png ADDED

Git LFS Details

  • SHA256: 92680e6889b511642342a8debe059f2470950ad1807710bb9ca78bdee62180df
  • Pointer size: 132 Bytes
  • Size of remote file: 1.26 MB
spirituality.png ADDED