# Hugging Face Spaces: Running on Zero
| import os | |
| import torch | |
| import gradio as gr | |
| import spaces | |
| import random | |
| import numpy as np | |
| from safetensors.torch import load_file | |
| from huggingface_hub import hf_hub_download | |
| from diffusers.utils import logging | |
| from PIL import Image | |
| from ovis_image.model.tokenizer import build_ovis_tokenizer | |
| from ovis_image.model.autoencoder import load_ae | |
| from ovis_image.model.hf_embedder import OvisEmbedder | |
| from ovis_image.model.model import OvisImageModel | |
| from ovis_image.sampling import generate_image | |
| from ovis_image import ovis_image_configs | |
# Only surface errors from the diffusers logger.
logging.set_verbosity_error()

# Upper bound for seeds, both for the UI slider and for random draws.
MAX_SEED = np.iinfo(np.int32).max

# NOTE: the device is fixed to CUDA; an automatic
# `"cuda" if torch.cuda.is_available() else "cpu"` fallback existed in the
# original file only as a disabled comment.
device = "cuda"
_dtype = torch.bfloat16
hf_token = os.getenv("HF_TOKEN")

# Hub repositories the weights are pulled from.
_CHECKPOINT_REPO = "AIDC-AI/Ovis-Image-7B"
_TEXT_ENCODER_REPO = "AIDC-AI/Ovis2.5-2B"


def _fetch_checkpoint(filename):
    """Download ``filename`` from the checkpoint repo and return its local path."""
    return hf_hub_download(
        repo_id=_CHECKPOINT_REPO,
        filename=filename,
        token=hf_token,
    )


# --- Image model ------------------------------------------------------------
print("init ovis_image")
model_config = ovis_image_configs["ovis-image-7b"]
ovis_image = OvisImageModel(model_config)
ovis_image_path = _fetch_checkpoint("ovis_image.safetensors")
model_state_dict = load_file(ovis_image_path)
missing_keys, unexpected_keys = ovis_image.load_state_dict(model_state_dict)
# Report any mismatch between the checkpoint and the model definition.
print(f"Load Missing Keys {missing_keys}")
print(f"Load Unexpected Keys {unexpected_keys}")
ovis_image = ovis_image.to(device=device, dtype=_dtype)
ovis_image.eval()

# --- Autoencoder ------------------------------------------------------------
print("init vae")
vae_path = _fetch_checkpoint("ae.safetensors")
autoencoder = load_ae(
    vae_path,
    model_config.autoencoder_params,
    device=device,
    dtype=_dtype,
    random_init=False,
)
autoencoder.eval()

# --- Text tokenizer / encoder -----------------------------------------------
# Both are loaded by repo id from the standalone Ovis2.5-2B repository; the
# original file kept a disabled alternative that downloaded the "Ovis2.5-2B"
# subfolder of the checkpoint repo instead.
print("init ovis")
ovis_tokenizer = build_ovis_tokenizer(
    _TEXT_ENCODER_REPO,
)
ovis_encoder = OvisEmbedder(
    model_path=_TEXT_ENCODER_REPO,
    random_init=False,
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
).to(device=device, dtype=_dtype)

# Sample prompts; the first one pre-fills the prompt box in the UI.
examples = [
    "Five shimmering goldfish weave through crevices between stones; four are red-and-white, while one is silver-white. By the pond's edge, a golden shaded British Shorthair cat watches them intently, counting on blind luck. Watercolor style.",
    "Solar punk vehicle in a bustling city",
    "An anthropomorphic cat riding a Harley Davidson in Arizona with sunglasses and a leather jacket",
    "An elderly woman poses for a high fashion photoshoot in colorful, patterned clothes with a cyberpunk 2077 vibe",
]
def get_image_size(aspect_ratio):
    """Extract ``(width, height)`` from a label such as ``"4:3 (1024x768)"``.

    The resolution is read from the text between the first ``(`` and the
    following ``)``, split on ``x``.

    Args:
        aspect_ratio: Dropdown label containing ``(<width>x<height>)``.

    Returns:
        A ``(width, height)`` tuple of positive ints. Falls back to
        ``(1024, 1024)`` when the label is not a string, has no parenthesised
        ``<width>x<height>`` part, or the dimensions are not positive integers.
    """
    default_size = (1024, 1024)

    # A cleared dropdown can hand us None; treat anything non-textual as "unset".
    if not isinstance(aspect_ratio, str):
        return default_size

    _, open_paren, tail = aspect_ratio.partition("(")
    if not open_paren:
        return default_size

    resolution = tail.partition(")")[0]
    width_text, separator, height_text = resolution.partition("x")
    if not separator:
        return default_size

    try:
        width, height = int(width_text), int(height_text)
    except ValueError:
        # Only malformed numbers are expected here; anything else should surface.
        return default_size

    # Zero or negative dimensions cannot describe an image.
    if width <= 0 or height <= 0:
        return default_size
    return width, height
# Custom stylesheet passed to gr.Blocks(css=...).
# The light-mode `.input-section .main-title` rule used to set the title to
# white, which is unreadable on the white input card; it now uses the regular
# light-mode title colour, and the `.dark` variant keeps the light-on-dark text.
apple_css = """
/* Global Styles */
.gradio-container {
    max-width: 85vw !important;
    margin: 0 auto !important;
    padding: 48px 20px !important;
    font-family: -apple-system, BlinkMacSystemFont, 'Inter', 'Segoe UI', 'Roboto', sans-serif !important;
}
/* Disable all transitions globally to prevent layout shifts */
* {
    transition: none !important;
    animation: none !important;
}
/* Header */
.header-container {
    text-align: left;
    margin-bottom: 24px;
}
.main-title {
    font-size: 32px !important;
    font-weight: 600 !important;
    letter-spacing: -0.02em !important;
    line-height: 1.07 !important;
    color: #1d1d1f !important;
    margin: 0 0 16px 0 !important;
}
.subtitle {
    font-size: 21px !important;
    font-weight: 400 !important;
    line-height: 1.38 !important;
    color: #6e6e73 !important;
    margin: 0 0 24px 0 !important;
}
.attribution-link {
    display: inline-block;
    font-size: 14px !important;
    color: #0071e3 !important;
    text-decoration: none !important;
    font-weight: 400 !important;
    transition: color 0.2s ease !important;
}
.attribution-link:hover {
    color: #0077ed !important;
    text-decoration: underline !important;
}
/* Input Section */
.input-section {
    background: #ffffff;
    border-radius: 18px;
    padding: 32px;
    box-shadow: 0 2px 12px rgba(0, 0, 0, 0.08);
}
/* Textbox */
textarea {
    font-size: 17px !important;
    line-height: 1.47 !important;
    border-radius: 12px !important;
    border: 1px solid #d2d2d7 !important;
    padding: 12px 16px !important;
    background: #ffffff !important;
    font-family: -apple-system, BlinkMacSystemFont, 'Inter', sans-serif !important;
    min-height: 200px !important;
    max-height: 400px !important;
    height: 200px !important;
    resize: vertical !important;
    overflow-y: auto !important;
    margin-bottom: 16px !important;
}
textarea:focus {
    border-color: #0071e3 !important;
    box-shadow: 0 0 0 4px rgba(0, 113, 227, 0.15) !important;
    outline: none !important;
}
textarea::placeholder {
    color: #86868b !important;
}
/* Button */
button.primary {
    font-size: 17px !important;
    font-weight: 400 !important;
    padding: 12px 32px !important;
    border-radius: 980px !important;
    background: #0071e3 !important;
    border: none !important;
    color: #ffffff !important;
    min-height: 44px !important;
    letter-spacing: -0.01em !important;
    cursor: pointer !important;
}
button.primary:hover {
    background: #0077ed !important;
}
button.primary:active {
    opacity: 0.9 !important;
}
/* Output Section */
div.output-section {
    background: #ffffff;
    border-radius: 18px;
    padding: 32px;
    box-shadow: 0 2px 12px rgba(0, 0, 0, 0.08);
    overflow: hidden;
    display: flex;
    align-items: center;
    justify-content: center;
    min-height: 80vh;
    max-height: 90vh;
    will-change: auto;
    position: relative;
}
.output-section * {
    transform: none !important;
    transition: none !important;
    animation: none !important;
}
.output-section img {
    border-radius: 12px !important;
    max-width: 100% !important;
    max-height: 85vh !important;
    width: auto !important;
    height: auto !important;
    object-fit: contain !important;
    transform: none !important;
    transition: none !important;
    animation: none !important;
    backface-visibility: hidden;
    -webkit-backface-visibility: hidden;
}
/* Make progress/generation area fill more space */
.output-section > div {
    width: 100% !important;
    min-height: 75vh !important;
    max-height: 85vh !important;
    display: flex !important;
    align-items: center !important;
    justify-content: center !important;
}
.output-section > div > div {
    min-height: 75vh !important;
    max-height: 85vh !important;
    width: 100% !important;
    display: flex !important;
    align-items: center !important;
    justify-content: center !important;
}
.output-section * {
    max-width: 100% !important;
}
/* Footer */
.footer-text {
    text-align: center;
    margin-top: 48px;
    font-size: 14px !important;
    color: #86868b !important;
    line-height: 1.43 !important;
}
/* Progress */
.progress-bar {
    background: #0071e3 !important;
    border-radius: 4px !important;
}
/* Dark Mode */
.dark .main-title {
    color: #ffffff !important;
}
.dark .subtitle {
    color: #a1a1a6 !important;
}
/* Title inside the input card: dark text on the white card in light mode */
.input-section .main-title {
    color: #1d1d1f !important;
}
.dark .input-section .main-title {
    color: #f5f5f7 !important;
}
.dark .input-section,
.dark .output-section {
    background: #1d1d1f;
    box-shadow: 0 2px 12px rgba(0, 0, 0, 0.4);
}
.dark textarea {
    background: #1d1d1f !important;
    border-color: #424245 !important;
    color: #f5f5f7 !important;
}
.dark textarea::placeholder {
    color: #86868b !important;
}
/* Inline labels */
label.inline-label {
    display: flex !important;
    align-items: center !important;
    min-width: 120px !important;
    margin: 0 !important;
    padding: 0 12px 0 0 !important;
    font-weight: 400 !important;
    font-size: 14px !important;
    color: #1d1d1f !important;
}
/* Fix column width to prevent shrinking - target Gradio's generated structure */
.input-section {
    min-width: 550px !important;
    max-width: 550px !important;
    width: 550px !important;
    flex-shrink: 0 !important;
    flex-grow: 0 !important;
}
/* Lock the output section to fill remaining space */
.output-section {
    flex-grow: 1 !important;
    flex-shrink: 0 !important;
    flex-basis: auto !important;
}
/* Prevent Gradio columns from flexing */
.gradio-column {
    flex-shrink: 0 !important;
}
/* Stabilize row layout - force horizontal layout with maximum specificity */
.gradio-row,
div.gradio-row,
.gradio-container .gradio-row,
.gradio-container > .gradio-row,
.gradio-container div.gradio-row {
    align-items: flex-start !important;
    flex-direction: row !important;
    display: flex !important;
    flex-wrap: nowrap !important;
    width: 100% !important;
}
/* Force columns to stay inline */
.gradio-row > .gradio-column,
.gradio-row > div {
    display: inline-flex !important;
    vertical-align: top !important;
}
/* First column - input section */
.gradio-row > .gradio-column:first-child,
.gradio-row > div:first-child {
    width: 550px !important;
    min-width: 550px !important;
    max-width: 550px !important;
    flex: 0 0 550px !important;
}
/* Second column - output section */
.gradio-row > .gradio-column:last-child,
.gradio-row > div:last-child {
    flex: 1 1 auto !important;
    min-width: 0 !important;
}
/* Lock textbox container size */
.input-section .gr-textbox,
.input-section label[for] {
    width: 100% !important;
}
/* Prevent form from expanding */
.input-section form {
    width: 100% !important;
    max-width: 100% !important;
}
/* Ensure seed input always visible */
.input-section input[type="number"] {
    display: block !important;
    visibility: visible !important;
}
/* Hide progress indicator in input section - target specific progress elements */
.input-section .progress-container,
.input-section [class*="progress-bar"],
.input-section [class*="progress-text"],
.input-section [class*="progress-level"],
.input-section .progress,
.input-section .eta-bar {
    display: none !important;
    visibility: hidden !important;
    height: 0 !important;
    overflow: hidden !important;
}
/* Override ALL responsive behavior - force horizontal layout at ALL viewport sizes */
@media (max-width: 2000px) {
    .gradio-row,
    div.gradio-row,
    .gradio-container .gradio-row,
    .gradio-container > .gradio-row {
        flex-direction: row !important;
        flex-wrap: nowrap !important;
        display: flex !important;
    }
    .gradio-row > .gradio-column,
    .gradio-row > div {
        display: inline-flex !important;
    }
    .gradio-row > .gradio-column:first-child,
    .gradio-row > div:first-child {
        width: 550px !important;
        min-width: 550px !important;
        max-width: 550px !important;
        flex: 0 0 550px !important;
    }
    .gradio-row > .gradio-column:last-child,
    .gradio-row > div:last-child {
        flex: 1 1 auto !important;
        min-width: 0 !important;
    }
}
/* Responsive text sizing only */
@media (max-width: 734px) {
    .main-title {
        font-size: 40px !important;
    }
    .subtitle {
        font-size: 19px !important;
    }
    .gradio-container {
        padding: 32px 16px !important;
    }
    .input-section,
    .output-section {
        padding: 24px !important;
    }
    /* FORCE horizontal layout even on mobile */
    .gradio-row,
    div.gradio-row {
        flex-direction: row !important;
        flex-wrap: nowrap !important;
    }
}
/* Remove default Gradio styling */
.contain {
    padding: 0 !important;
}
/* Hide Gradio footer */
footer {
    display: none !important;
}
.footer {
    display: none !important;
}
/* Target main app container */
#root, #app {
    width: 100% !important;
    max-width: none !important;
}
"""
# JavaScript passed to gr.Blocks(js=...): forces the two-column horizontal
# layout by writing inline styles on the container, rows and columns.
#
# The MutationObserver watches `style`/`class` attributes on the whole body
# subtree, while the layout function itself writes inline styles on those same
# nodes. To keep the observer from reacting to its own writes (and from running
# once per mutation), re-application is coalesced to one run per animation
# frame and the observer is disconnected while styles are being applied.
js_code = """
function() {
    function forceHorizontalLayout() {
        // Set container width
        const container = document.querySelector('.gradio-container');
        if (container) {
            container.style.maxWidth = '85vw';
            container.style.width = '85vw';
        }
        // Target the main row specifically
        const mainRow = document.getElementById('main-row');
        if (mainRow) {
            mainRow.style.flexDirection = 'row';
            mainRow.style.flexWrap = 'nowrap';
            mainRow.style.display = 'flex';
            mainRow.style.width = '100%';
        }
        // Force ALL rows to stay horizontal
        const rows = document.querySelectorAll('.gradio-row');
        rows.forEach(row => {
            row.style.flexDirection = 'row';
            row.style.flexWrap = 'nowrap';
            row.style.display = 'flex';
        });
        // Target specific columns
        const inputCol = document.getElementById('input-column');
        if (inputCol) {
            inputCol.style.width = '550px';
            inputCol.style.minWidth = '550px';
            inputCol.style.maxWidth = '550px';
            inputCol.style.flex = '0 0 550px';
            inputCol.style.display = 'inline-flex';
            inputCol.style.flexDirection = 'column';
        }
        const outputCol = document.getElementById('output-column');
        if (outputCol) {
            outputCol.style.flex = '1 1 auto';
            outputCol.style.minWidth = '0';
            outputCol.style.display = 'inline-flex';
            outputCol.style.flexDirection = 'column';
        }
        // Fallback: force all column children of rows
        const columns = document.querySelectorAll('.gradio-row > .gradio-column, .gradio-row > div');
        columns.forEach((col, index) => {
            if (index === 0) {
                col.style.width = '550px';
                col.style.minWidth = '550px';
                col.style.maxWidth = '550px';
                col.style.flex = '0 0 550px';
            } else if (index === 1) {
                col.style.flex = '1 1 auto';
                col.style.minWidth = '0';
            }
            col.style.display = 'inline-flex';
        });
    }
    const observerOptions = { childList: true, subtree: true, attributes: true, attributeFilter: ['style', 'class'] };
    let reapplyScheduled = false;
    // Reapply on DOM changes, at most once per frame, without observing our own style writes
    const observer = new MutationObserver(() => {
        if (reapplyScheduled) {
            return;
        }
        reapplyScheduled = true;
        requestAnimationFrame(() => {
            reapplyScheduled = false;
            observer.disconnect();
            forceHorizontalLayout();
            observer.observe(document.body, observerOptions);
        });
    });
    // Run immediately
    forceHorizontalLayout();
    // Run again after delays to override Gradio's dynamic changes
    setTimeout(forceHorizontalLayout, 100);
    setTimeout(forceHorizontalLayout, 500);
    setTimeout(forceHorizontalLayout, 1000);
    setTimeout(forceHorizontalLayout, 2000);
    // Start watching for DOM changes
    observer.observe(document.body, observerOptions);
}
"""
# The `spaces` package is imported by this file and the Space runs on ZeroGPU
# hardware, where the GPU is only attached while a `@spaces.GPU`-decorated
# function executes. The duration is a generous per-call budget in seconds;
# tune it to the observed generation time.
@spaces.GPU(duration=120)
def infer(
    prompt,
    seed=42,
    randomize_seed=False,
    aspect_ratio="1:1 (1024x1024)",
    guidance_scale=5.0,
    num_inference_steps=50,
    progress=gr.Progress(track_tqdm=True),
):
    """Generate one image for ``prompt`` with the Ovis-Image pipeline.

    Args:
        prompt: Text description of the image.
        seed: Seed passed to the sampler; ignored when ``randomize_seed`` is set.
        randomize_seed: When true, draw a fresh seed in ``[0, MAX_SEED]``.
        aspect_ratio: Dropdown label such as ``"4:3 (1024x768)"``; parsed by
            ``get_image_size``.
        guidance_scale: Passed to the sampler as ``cfg_scale``.
        num_inference_steps: Passed to the sampler as ``denoising_steps``.
        progress: Gradio progress tracker (tqdm-aware); not used directly.

    Returns:
        ``(image, seed)``: the first image of the batch as a uint8 NumPy array
        in height x width x channel order, and the seed that was actually used.
    """
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)

    width, height = get_image_size(aspect_ratio)
    print(f'inference with prompt: {prompt}, size: {height}x{width}, seed: {seed}, steps: {num_inference_steps}, cfg: {guidance_scale}')

    image = generate_image(
        # Run on whatever device the model weights currently live on.
        device=next(ovis_image.parameters()).device,
        dtype=_dtype,
        model=ovis_image,
        prompt=prompt,
        autoencoder=autoencoder,
        ovis_tokenizer=ovis_tokenizer,
        ovis_encoder=ovis_encoder,
        img_height=height,
        img_width=width,
        denoising_steps=num_inference_steps,
        cfg_scale=guidance_scale,
        seed=seed,
    )

    # Convert the batch from channels-first to channels-last uint8 for display.
    # NOTE(review): values are clamped to [-1, 1] but scaled by 255 without an
    # offset, so any negative value would wrap around in uint8. Presumably
    # generate_image already returns values in [0, 1] - confirm.
    image = image.clamp(-1, 1)
    image = image.cpu().permute(0, 2, 3, 1).float().numpy()
    image = (image * 255).round().astype("uint8")
    return image[0], seed
# Soft theme with the colour overrides used throughout the app.
_apple_theme = gr.themes.Soft(
    primary_hue=gr.themes.colors.blue,
    secondary_hue=gr.themes.colors.slate,
    neutral_hue=gr.themes.colors.gray,
    spacing_size=gr.themes.sizes.spacing_lg,
    radius_size=gr.themes.sizes.radius_lg,
    text_size=gr.themes.sizes.text_md,
    font=[gr.themes.GoogleFont("Inter"), "SF Pro Display", "-apple-system", "BlinkMacSystemFont", "system-ui", "sans-serif"],
    font_mono=[gr.themes.GoogleFont("JetBrains Mono"), "SF Mono", "ui-monospace", "monospace"],
).set(
    body_background_fill='#f5f5f7',
    body_background_fill_dark='#000000',
    button_primary_background_fill='#0071e3',
    button_primary_background_fill_hover='#0077ed',
    button_primary_text_color='#ffffff',
    block_background_fill='#ffffff',
    block_background_fill_dark='#1d1d1f',
    block_border_width='0px',
    block_shadow='0 2px 12px rgba(0, 0, 0, 0.08)',
    block_shadow_dark='0 2px 12px rgba(0, 0, 0, 0.4)',
    input_background_fill='#ffffff',
    input_background_fill_dark='#1d1d1f',
    input_border_width='1px',
    input_border_color='#d2d2d7',
    input_border_color_dark='#424245',
    input_shadow='none',
    input_shadow_focus='0 0 0 4px rgba(0, 113, 227, 0.15)',
)

# Dropdown labels; the "(WxH)" part is what get_image_size() parses.
_ASPECT_RATIO_CHOICES = [
    "1:1 (1024x1024)",
    "4:3 (1024x768)",
    "3:4 (768x1024)",
    "16:9 (1024x576)",
    "9:16 (576x1024)",
]

with gr.Blocks(
    title="Ovis-Image",
    fill_height=False,
    theme=_apple_theme,
    css=apple_css,
    js=js_code,
) as demo:
    # Two columns side by side. Per the original author's note, variant="panel"
    # is meant to keep the row from stacking responsively.
    with gr.Row(equal_height=False, variant="panel", elem_id="main-row"):
        # Left: fixed-width input controls.
        with gr.Column(scale=0, min_width=550, elem_classes="input-section", elem_id="input-column"):
            # Title shown above the prompt box.
            gr.HTML("""
            <div class="header-container">
            <h1 class="main-title">Ovis-Image</h1>
            </div>
            """)
            prompt = gr.Textbox(
                label="Prompt",
                show_label=True,
                placeholder="Describe the image you want to create...",
                value=examples[0],
                lines=7,
                max_lines=7,
                container=True,
                autoscroll=False,
            )
            aspect_ratio = gr.Dropdown(
                label="Aspect Ratio",
                show_label=True,
                choices=_ASPECT_RATIO_CHOICES,
                value="1:1 (1024x1024)",
                container=True,
            )
            run_button = gr.Button(
                "Generate",
                variant="primary",
                size="lg",
                elem_classes="primary",
            )
            # Advanced settings: wired into infer() but hidden from the page.
            seed = gr.Slider(
                label="Seed",
                minimum=0,
                maximum=MAX_SEED,
                step=1,
                value=0,
                visible=False,
            )
            randomize_seed = gr.Checkbox(label="Randomize seed", value=True, visible=False)
            guidance_scale = gr.Slider(
                label="Guidance scale",
                minimum=0.0,
                maximum=14.0,
                step=0.1,
                value=5.0,
                visible=False,
            )
            num_inference_steps = gr.Slider(
                label="Number of inference steps",
                minimum=1,
                maximum=100,
                step=1,
                value=50,
                visible=False,
            )
        # Right: generated image.
        with gr.Column(scale=2, elem_classes="output-section", elem_id="output-column"):
            result = gr.Image(
                label="Result",
                show_label=False,
                type="numpy",
                format="png",
            )

    # One handler for both the button click and Enter in the prompt box; the
    # seed actually used is written back to the (hidden) seed slider.
    gr.on(
        triggers=[run_button.click, prompt.submit],
        fn=infer,
        inputs=[
            prompt,
            seed,
            randomize_seed,
            aspect_ratio,
            guidance_scale,
            num_inference_steps,
        ],
        outputs=[result, seed],
    )

if __name__ == '__main__':
    demo.launch()