# Hugging Face Space: adil9858/dalton_vision (status: Sleeping)
# File size: 4,291 Bytes

import base64
import io
import os
from datetime import datetime

import gradio as gr
from openai import OpenAI
from PIL import Image

# OpenAI-compatible client pointed at the OpenRouter endpoint.
#
# The API key is read from the OPENROUTER_API_KEY environment variable
# instead of being embedded in the source, so the secret never ends up in
# version control or in a publicly visible file.
# NOTE(review): a key was previously committed in this file; it should be
# treated as leaked and revoked/rotated in the OpenRouter dashboard.
_api_key = os.environ.get("OPENROUTER_API_KEY")
if not _api_key:
    # Fail fast with an actionable message rather than letting every request
    # fail later with an opaque authentication error.
    raise RuntimeError(
        "OPENROUTER_API_KEY is not set. Define it as an environment variable "
        "(or as a secret in the hosting platform) before starting the app."
    )

client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=_api_key,
)

def _encode_image_as_jpeg_base64(image):
    """Return *image* serialized as a base64-encoded JPEG string.

    Args:
        image: A PIL-style image object exposing ``mode``, ``convert`` and
            ``save``.

    Returns:
        The JPEG bytes of the image, base64-encoded and decoded as UTF-8 text.
    """
    # JPEG cannot store alpha or palette data; Pillow refuses to write modes
    # such as RGBA, LA or P as JPEG. Normalizing to RGB first lets PNG/WebP
    # uploads with transparency go through (the alpha channel is dropped).
    if image.mode not in ("RGB", "L"):
        image = image.convert("RGB")

    buffered = io.BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")


def analyze_image(image, prompt: str) -> str:
    """Send an image and a text prompt to the vision model and return its reply.

    Args:
        image: The PIL image supplied by the UI, or ``None`` when nothing was
            uploaded or captured.
        prompt: The user's question or instruction about the image.

    Returns:
        The model's answer as text. When no image is given, when the model
        returns no content, or when encoding/the API call fails, a
        human-readable message is returned instead of raising, so the UI
        always has something to display.
    """
    if image is None:
        return "Please upload or capture an image first."

    try:
        # Encoding happens inside the try so that image problems are reported
        # to the user the same way as API failures.
        img_str = _encode_image_as_jpeg_base64(image)

        response = client.chat.completions.create(
            model="opengvlab/internvl3-14b:free",
            messages=[
                {
                    "role": "system", 
                    "content": """You are Dalton, an expert AI assistant specialized in image understanding. 
                    Your tasks include:
                    - Extracting and structuring text from images
                    - Answering questions about image content
                    - Providing detailed descriptions
                    - Analyzing receipts, documents, and other visual content
                    Be thorough, accurate, and helpful in your responses."""
                },
                {
                    "role": "user", 
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            # The image travels inline as a data URL.
                            "type": "image_url", 
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{img_str}"
                            }
                        }
                    ]
                }
            ],
            max_tokens=2048
        )

        # A response can come back without choices or with empty content;
        # report that explicitly instead of failing on the subscript or
        # handing None to the UI.
        choices = response.choices
        if not choices or not choices[0].message.content:
            return "The model returned an empty response. Please try again."
        return choices[0].message.content

    except Exception as e:
        # UI boundary: surface any failure as text rather than crashing the
        # event handler.
        return f"An error occurred: {str(e)}"

# Custom CSS for better mobile experience.
# Passed to gr.Blocks(css=...) below. The selectors correspond to identifiers
# used in this file: #mobile-camera is the elem_id of the image input,
# #prompt-textbox is the elem_id of the prompt textbox, .result-box is the
# elem_classes value of the result Markdown, and .footer is the class of the
# <div> in the footer Markdown.
css = """
#mobile-camera { width: 100% !important; }
#prompt-textbox { min-height: 100px !important; }
.result-box { 
    max-height: 500px; 
    overflow-y: auto; 
    padding: 15px; 
    border: 1px solid #e0e0e0;
    border-radius: 8px;
}
.footer { 
    margin-top: 20px; 
    font-size: 12px; 
    color: #666; 
    text-align: center; 
}
"""

# Build the page: a header, a two-column row (inputs on the left, the model's
# answer on the right), a footer, and the click wiring for the button.
with gr.Blocks(title="DaltonVision - Koshur AI", css=css) as demo:
    # Page header
    gr.Markdown("""
    # 🧾 DaltonVision - InternVL3-14B
    ### Advanced Image Understanding • Powered by OpenRouter • Developed by [Koshur AI](https://koshurai.com)
    """)

    with gr.Row():
        # Left column: everything the user provides.
        with gr.Column():
            # Picture to analyze, from disk or from the device camera;
            # delivered to the handler as a PIL image.
            image_input = gr.Image(
                label="Upload or Capture Image",
                type="pil",
                sources=["upload", "webcam"],
                elem_id="mobile-camera",
            )

            # Free-form instruction, pre-filled with a default.
            prompt_input = gr.Textbox(
                value="Extract all content structurally",
                label="📝 Enter your question or instruction",
                lines=3,
                elem_id="prompt-textbox",
            )

            submit_btn = gr.Button("🔍 Analyze Image", variant="primary")

            # Clickable sample prompts that fill the prompt textbox.
            gr.Examples(
                label="💡 Try these example prompts:",
                inputs=[prompt_input],
                examples=[
                    ["What is the total amount on this receipt?"],
                    ["List all items and their prices"],
                    ["Who is the vendor and what is the date?"],
                    ["Describe this image in detail"],
                ],
            )

        # Right column: where the answer is rendered.
        with gr.Column():
            result_output = gr.Markdown(
                elem_classes="result-box",
                label="✅ Analysis Result",
            )

    # Page footer
    gr.Markdown("""
    <div class="footer">
    © 2025 Koshur AI. All rights reserved.<br>
    Note: Images are processed in real-time and not stored.
    </div>
    """)

    # Clicking the button runs analyze_image on the current image and prompt
    # and shows the returned text in the result panel.
    submit_btn.click(
        analyze_image,
        inputs=[image_input, prompt_input],
        outputs=result_output,
    )

# Launch the app.
# The guard means the server starts only when this file is executed as a
# script; importing the module builds `demo` without launching it.
if __name__ == "__main__":
    demo.launch()