arjunanand13 committed on
Commit
8c1ff5e
1 Parent(s): 985ebc1

Upload 7 files

.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+american_football.png filter=lfs diff=lfs merge=lfs -text
+bike.png filter=lfs diff=lfs merge=lfs -text
+finance.png filter=lfs diff=lfs merge=lfs -text
+science.png filter=lfs diff=lfs merge=lfs -text
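These patterns are what `git lfs track` appends to .gitattributes; reproducing the four new entries locally would look something like the following (a sketch, assuming the git-lfs extension is installed):

git lfs track "american_football.png" "bike.png" "finance.png" "science.png"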
american_football.png ADDED

Git LFS Details

  • SHA256: dc604236a1bac1e11a0712add4f4ed00f2d3ab3cd6fe6beebd5ad9862c22e7e9
  • Pointer size: 132 Bytes
  • Size of remote file: 1.28 MB
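The pointer size above is the byte length of the small text stub Git commits in place of the image itself. For this file the stub would look roughly like the sketch below: the version and oid lines follow from the details above, while the size value (the exact byte count of the remote file) is illustrative, approximating the reported 1.28 MB:

version https://git-lfs.github.com/spec/v1
oid sha256:dc604236a1bac1e11a0712add4f4ed00f2d3ab3cd6fe6beebd5ad9862c22e7e9
size 1342177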
bike.png ADDED

Git LFS Details

  • SHA256: a346b2e0d280cbd561bf3bf5c1ee30965f6eaffff9899fa58fe9fbdeb3d11325
  • Pointer size: 132 Bytes
  • Size of remote file: 1.38 MB
finance.png ADDED

Git LFS Details

  • SHA256: b5012040fc8a6cb84d696dbe4b2883f39f87729824a4932624f70c909e9de2c1
  • Pointer size: 132 Bytes
  • Size of remote file: 1.55 MB
gradio_main.py ADDED
@@ -0,0 +1,80 @@
+import os
+import subprocess
+
+import gradio as gr
+import torch
+from peft import LoraConfig
+from transformers import AutoProcessor, BitsAndBytesConfig, Idefics2ForConditionalGeneration
+
+# Install flash-attn at startup; the env flag skips its CUDA build step.
+# Merging os.environ keeps PATH and friends visible to the subprocess.
+subprocess.run('pip install flash-attn --no-build-isolation',
+               env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
+
+DEVICE = "cuda:0"
+USE_LORA = False
+USE_QLORA = True
+
+processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b", do_image_splitting=False)
+
+if USE_QLORA or USE_LORA:
+    # LoRA adapters over the text model, modality projection, and perceiver resampler.
+    lora_config = LoraConfig(
+        r=8,
+        lora_alpha=8,
+        lora_dropout=0.1,
+        target_modules='.*(text_model|modality_projection|perceiver_resampler).*(down_proj|gate_proj|up_proj|k_proj|q_proj|v_proj|o_proj).*',
+        use_dora=not USE_QLORA,  # DoRA only when the base weights are not quantized
+        init_lora_weights="gaussian",
+    )
+    # 4-bit NF4 quantization for QLoRA; bitsandbytes places the model on the GPU.
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_compute_dtype=torch.float16,
+    ) if USE_QLORA else None
+    model = Idefics2ForConditionalGeneration.from_pretrained(
+        "HuggingFaceM4/idefics2-8b",
+        torch_dtype=torch.float16,
+        quantization_config=bnb_config,
+    )
+    model.add_adapter(lora_config)
+    model.enable_adapters()
+else:
+    model = Idefics2ForConditionalGeneration.from_pretrained(
+        "HuggingFaceM4/idefics2-8b",
+        torch_dtype=torch.float16,
+        _attn_implementation="flash_attention_2",
+    ).to(DEVICE)
+
+
+def model_inference(image, text):
+    resulting_messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
+    prompt = processor.apply_chat_template(resulting_messages, add_generation_prompt=True)
+    inputs = processor(text=prompt, images=[image], return_tensors="pt")
+    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
+
+    generated_ids = model.generate(
+        **inputs,
+        max_new_tokens=1024,     # room for extended answers
+        temperature=0.3,
+        do_sample=True,          # sample for slightly more varied output
+        top_p=0.7,               # nucleus sampling: focused yet diverse
+        # num_beams=5,           # alternative: beam search with 5 beams
+        num_return_sequences=1,  # return a single sequence
+    )
+    # Decode only the newly generated tokens, dropping the echoed prompt.
+    generated_text = processor.batch_decode(generated_ids[:, inputs["input_ids"].size(1):], skip_special_tokens=True)
+    return generated_text[0]
+
+
+with gr.Blocks() as demo:
+    gr.Markdown("## Enhanced IDEFICS2 Demo")
+    image_input = gr.Image(label="Upload Image", type="pil", height=480, width=640)
+    query_input = gr.Textbox(label="Enter Prompt")
+    submit_btn = gr.Button("Generate")
+    output = gr.Textbox(label="Model Output")
+    submit_btn.click(model_inference, inputs=[image_input, query_input], outputs=output)
+
+    examples = [
+        ["example_images/american_football.png", "Explain in detail what is depicted in the picture"],
+        ["example_images/bike.png", "Explore the image closely and describe in detail what you discover."],
+        ["example_images/finance.png", "Provide a detailed description of everything you see in the image."],
+        ["example_images/science.png", "Please perform optical character recognition (OCR) on the uploaded image. Extract all text visible in the image accurately. Ensure to capture the text in its entirety and maintain the formatting as closely as possible to how it appears in the image. After extracting the text, display it in a clear and readable format, making sure that any special characters or symbols are also accurately represented. Provide the extracted text as output."],
+        ["example_images/spirituality.png", "Please perform optical character recognition (OCR) on the uploaded image. Extract all text visible in the image accurately. Ensure to capture the text in its entirety and maintain the formatting as closely as possible to how it appears in the image. After extracting the text, display it in a clear and readable format, making sure that any special characters or symbols are also accurately represented. Provide the extracted text as output."],
+    ]
+    gr.Examples(examples=examples, inputs=[image_input, query_input], outputs=output)
+
+demo.launch(debug=True)
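Once the app is running, it can be exercised programmatically with the gradio_client package (not part of this commit, and its calling conventions vary across gradio versions; the endpoint name below is the auto-generated default and is worth confirming with client.view_api()):

from gradio_client import Client, handle_file

client = Client("http://127.0.0.1:7860")  # demo.launch() prints the local URL
result = client.predict(
    handle_file("example_images/american_football.png"),  # image_input
    "Explain in detail what is depicted in the picture",  # query_input
    api_name="/model_inference",  # assumed auto-generated endpoint name
)
print(result)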
requirements.txt ADDED
@@ -0,0 +1,6 @@
+git+https://github.com/huggingface/transformers.git
+gradio
+pillow
+torch
+peft
+bitsandbytes
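Two packages the script relies on do not appear here: flash-attn is installed at runtime by gradio_main.py itself, and accelerate, which transformers requires for the 4-bit quantization_config path, is assumed to ship with the runtime image. A minimal sanity check, as a sketch:

import importlib.util

# Confirm the 4-bit loading path has everything it needs; `accelerate` is
# assumed to come from the runtime image rather than requirements.txt.
for pkg in ("transformers", "peft", "bitsandbytes", "accelerate"):
    status = "ok" if importlib.util.find_spec(pkg) else "MISSING"
    print(f"{pkg}: {status}")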
science.png ADDED

Git LFS Details

  • SHA256: 92680e6889b511642342a8debe059f2470950ad1807710bb9ca78bdee62180df
  • Pointer size: 132 Bytes
  • Size of remote file: 1.26 MB
spirituality.png ADDED