# Ovis-Image / app_old.py
# Source: Hugging Face Space file view, uploaded by tchung1970.
# Commit d41a998: "Consolidate to single app.py entry point" (file size 20.4 kB).
import os
import torch
import gradio as gr
import spaces
import random
import numpy as np
from safetensors.torch import load_file
from huggingface_hub import hf_hub_download
from diffusers.utils import logging
from PIL import Image
from ovis_image.model.tokenizer import build_ovis_tokenizer
from ovis_image.model.autoencoder import load_ae
from ovis_image.model.hf_embedder import OvisEmbedder
from ovis_image.model.model import OvisImageModel
from ovis_image.sampling import generate_image
from ovis_image import ovis_image_configs
# ---------------------------------------------------------------------------
# Module-level initialisation: everything below runs once at import time and
# leaves the loaded components in module globals that infer() reads later
# (ovis_image, autoencoder, ovis_tokenizer, ovis_encoder, _dtype, MAX_SEED).
# ---------------------------------------------------------------------------
# `logging` here is diffusers.utils.logging (see the imports), not the stdlib
# module; this restricts its output to errors.
logging.set_verbosity_error()
# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Upper bound for seeds: the largest signed 32-bit integer.
MAX_SEED = np.iinfo(np.int32).max
# NOTE(review): the device is hard-coded; the availability check above is
# commented out, so there is no CPU fallback in this file.
device = "cuda"
# Single dtype used for the diffusion model, the autoencoder and the encoder.
_dtype = torch.bfloat16
# Optional Hugging Face access token; os.getenv returns None when HF_TOKEN is unset.
hf_token = os.getenv("HF_TOKEN")
print("init ovis_image")
# Build the model from the "ovis-image-7b" config entry, then fetch and load
# its weights from the AIDC-AI/Ovis-Image-7B repository.
model_config = ovis_image_configs["ovis-image-7b"]
ovis_image = OvisImageModel(model_config)
ovis_image_path = hf_hub_download(
repo_id="AIDC-AI/Ovis-Image-7B",
filename="ovis_image.safetensors",
token=hf_token,
)
model_state_dict = load_file(ovis_image_path)
# The key lists are only printed, never acted upon.
# NOTE(review): presumably OvisImageModel is a torch.nn.Module using the default
# strict loading, in which case a mismatch raises before these prints — confirm.
missing_keys, unexpected_keys = ovis_image.load_state_dict(model_state_dict)
print(f"Load Missing Keys {missing_keys}")
print(f"Load Unexpected Keys {unexpected_keys}")
# Move to the target device/dtype and switch to eval mode.
ovis_image = ovis_image.to(device=device, dtype=_dtype)
ovis_image.eval()
print("init vae")
# The autoencoder weights live in the same repository as the main model.
vae_path = hf_hub_download(
repo_id="AIDC-AI/Ovis-Image-7B",
filename="ae.safetensors",
token=hf_token,
)
autoencoder = load_ae(
vae_path,
model_config.autoencoder_params,
device=device,
dtype=_dtype,
random_init=False,
)
autoencoder.eval()
print("init ovis")
# Disabled alternative: download the "Ovis2.5-2B" subfolder of the
# Ovis-Image-7B repository instead of using the standalone repo id below.
# ovis_path = hf_hub_download(
# repo_id="AIDC-AI/Ovis-Image-7B",
# subfolder="Ovis2.5-2B",
# token=hf_token,
# )
# Tokenizer and text encoder are both built from the "AIDC-AI/Ovis2.5-2B" id.
# NOTE(review): hf_token is not passed to either call — confirm that repo is
# accessible without authentication.
ovis_tokenizer = build_ovis_tokenizer(
"AIDC-AI/Ovis2.5-2B",
)
# NOTE(review): unlike ovis_image and autoencoder, .eval() is not called on the
# encoder here — confirm OvisEmbedder handles that internally.
ovis_encoder = OvisEmbedder(
model_path="AIDC-AI/Ovis2.5-2B",
random_init=False,
low_cpu_mem_usage=True,
torch_dtype=torch.bfloat16,
).to(device=device, dtype=_dtype)
# Sample prompts. In the code visible in this file only examples[0] is used,
# as the initial value of the prompt textbox; the remaining entries are not
# referenced.
examples = [
"Five shimmering goldfish weave through crevices between stones; four are red-and-white, while one is silver-white. By the pond's edge, a golden shaded British Shorthair cat watches them intently, counting on blind luck. Watercolor style.",
"Solar punk vehicle in a bustling city",
"An anthropomorphic cat riding a Harley Davidson in Arizona with sunglasses and a leather jacket",
"An elderly woman poses for a high fashion photoshoot in colorful, patterned clothes with a cyberpunk 2077 vibe",
]
def get_image_size(aspect_ratio):
    """Convert an aspect-ratio label to a ``(width, height)`` tuple.

    The label is expected to carry the resolution in parentheses, e.g.
    ``"4:3 (1024x768)"``; the text between the first ``(`` and the following
    ``)`` is split on ``x`` and both parts are converted with ``int``.

    Args:
        aspect_ratio: Label such as ``"16:9 (1024x576)"``.

    Returns:
        ``(width, height)`` as integers, or ``(1024, 1024)`` when the value
        is not a string or does not contain a parseable ``(WxH)`` part.
    """
    default_size = (1024, 1024)

    # A non-string (e.g. None from a cleared dropdown) cannot be parsed;
    # fall back instead of raising TypeError on the ``in`` checks below.
    if not isinstance(aspect_ratio, str):
        return default_size

    if "(" in aspect_ratio and "x" in aspect_ratio:
        try:
            res_part = aspect_ratio.split("(")[1].split(")")[0]
            width, height = res_part.split("x")
            return int(width), int(height)
        except ValueError:
            # Wrong number of "x"-separated fields or non-integer text.
            # Only ValueError is caught so KeyboardInterrupt/SystemExit
            # and genuine programming errors are not swallowed.
            pass
    return default_size
# Custom stylesheet handed to gr.Blocks(css=...). It styles the elements that
# the UI tags with the classes "input-section", "output-section", "main-title",
# "header-container" and "primary", and tries to pin the two-column layout.
#
# Fix applied here: the ".input-section .main-title" rule in the dark-mode
# group has no ".dark" prefix, so it also applies in light mode, where it
# outranks ".main-title" (two classes vs. one, both !important). It used to
# set the title to #ffffff, i.e. white text on the white input card; it now
# uses the same #1d1d1f as ".main-title". Dark mode is unaffected because
# ".dark .input-section .main-title" is more specific still.
#
# NOTE(review): the 734px media query raises the title to 40px although the
# default is 32px — confirm whether a smaller size was intended.
# NOTE(review): the ".gradio-row" / ".gradio-column" selectors assume class
# names that Gradio's generated markup may not use — verify in the browser.
apple_css = """
/* Global Styles */
.gradio-container {
max-width: 85vw !important;
margin: 0 auto !important;
padding: 48px 20px !important;
font-family: -apple-system, BlinkMacSystemFont, 'Inter', 'Segoe UI', 'Roboto', sans-serif !important;
}
/* Disable all transitions globally to prevent layout shifts */
* {
transition: none !important;
animation: none !important;
}
/* Header */
.header-container {
text-align: left;
margin-bottom: 24px;
}
.main-title {
font-size: 32px !important;
font-weight: 600 !important;
letter-spacing: -0.02em !important;
line-height: 1.07 !important;
color: #1d1d1f !important;
margin: 0 0 16px 0 !important;
}
.subtitle {
font-size: 21px !important;
font-weight: 400 !important;
line-height: 1.38 !important;
color: #6e6e73 !important;
margin: 0 0 24px 0 !important;
}
.attribution-link {
display: inline-block;
font-size: 14px !important;
color: #0071e3 !important;
text-decoration: none !important;
font-weight: 400 !important;
transition: color 0.2s ease !important;
}
.attribution-link:hover {
color: #0077ed !important;
text-decoration: underline !important;
}
/* Input Section */
.input-section {
background: #ffffff;
border-radius: 18px;
padding: 32px;
box-shadow: 0 2px 12px rgba(0, 0, 0, 0.08);
}
/* Textbox */
textarea {
font-size: 17px !important;
line-height: 1.47 !important;
border-radius: 12px !important;
border: 1px solid #d2d2d7 !important;
padding: 12px 16px !important;
background: #ffffff !important;
font-family: -apple-system, BlinkMacSystemFont, 'Inter', sans-serif !important;
min-height: 200px !important;
max-height: 400px !important;
height: 200px !important;
resize: vertical !important;
overflow-y: auto !important;
margin-bottom: 16px !important;
}
textarea:focus {
border-color: #0071e3 !important;
box-shadow: 0 0 0 4px rgba(0, 113, 227, 0.15) !important;
outline: none !important;
}
textarea::placeholder {
color: #86868b !important;
}
/* Button */
button.primary {
font-size: 17px !important;
font-weight: 400 !important;
padding: 12px 32px !important;
border-radius: 980px !important;
background: #0071e3 !important;
border: none !important;
color: #ffffff !important;
min-height: 44px !important;
letter-spacing: -0.01em !important;
cursor: pointer !important;
}
button.primary:hover {
background: #0077ed !important;
}
button.primary:active {
opacity: 0.9 !important;
}
/* Output Section */
div.output-section {
background: #ffffff;
border-radius: 18px;
padding: 32px;
box-shadow: 0 2px 12px rgba(0, 0, 0, 0.08);
overflow: hidden;
display: flex;
align-items: center;
justify-content: center;
min-height: 80vh;
max-height: 90vh;
will-change: auto;
position: relative;
}
.output-section * {
transform: none !important;
transition: none !important;
animation: none !important;
}
.output-section img {
border-radius: 12px !important;
max-width: 100% !important;
max-height: 85vh !important;
width: auto !important;
height: auto !important;
object-fit: contain !important;
transform: none !important;
transition: none !important;
animation: none !important;
backface-visibility: hidden;
-webkit-backface-visibility: hidden;
}
/* Make progress/generation area fill more space */
.output-section > div {
width: 100% !important;
min-height: 75vh !important;
max-height: 85vh !important;
display: flex !important;
align-items: center !important;
justify-content: center !important;
}
.output-section > div > div {
min-height: 75vh !important;
max-height: 85vh !important;
width: 100% !important;
display: flex !important;
align-items: center !important;
justify-content: center !important;
}
.output-section * {
max-width: 100% !important;
}
/* Footer */
.footer-text {
text-align: center;
margin-top: 48px;
font-size: 14px !important;
color: #86868b !important;
line-height: 1.43 !important;
}
/* Progress */
.progress-bar {
background: #0071e3 !important;
border-radius: 4px !important;
}
/* Dark Mode */
.dark .main-title {
color: #ffffff !important;
}
.dark .subtitle {
color: #a1a1a6 !important;
}
.input-section .main-title {
color: #1d1d1f !important;
}
.dark .input-section .main-title {
color: #f5f5f7 !important;
}
.dark .input-section,
.dark .output-section {
background: #1d1d1f;
box-shadow: 0 2px 12px rgba(0, 0, 0, 0.4);
}
.dark textarea {
background: #1d1d1f !important;
border-color: #424245 !important;
color: #f5f5f7 !important;
}
.dark textarea::placeholder {
color: #86868b !important;
}
/* Inline labels */
label.inline-label {
display: flex !important;
align-items: center !important;
min-width: 120px !important;
margin: 0 !important;
padding: 0 12px 0 0 !important;
font-weight: 400 !important;
font-size: 14px !important;
color: #1d1d1f !important;
}
/* Fix column width to prevent shrinking - target Gradio's generated structure */
.input-section {
min-width: 550px !important;
max-width: 550px !important;
width: 550px !important;
flex-shrink: 0 !important;
flex-grow: 0 !important;
}
/* Lock the output section to fill remaining space */
.output-section {
flex-grow: 1 !important;
flex-shrink: 0 !important;
flex-basis: auto !important;
}
/* Prevent Gradio columns from flexing */
.gradio-column {
flex-shrink: 0 !important;
}
/* Stabilize row layout - force horizontal layout with maximum specificity */
.gradio-row,
div.gradio-row,
.gradio-container .gradio-row,
.gradio-container > .gradio-row,
.gradio-container div.gradio-row {
align-items: flex-start !important;
flex-direction: row !important;
display: flex !important;
flex-wrap: nowrap !important;
width: 100% !important;
}
/* Force columns to stay inline */
.gradio-row > .gradio-column,
.gradio-row > div {
display: inline-flex !important;
vertical-align: top !important;
}
/* First column - input section */
.gradio-row > .gradio-column:first-child,
.gradio-row > div:first-child {
width: 550px !important;
min-width: 550px !important;
max-width: 550px !important;
flex: 0 0 550px !important;
}
/* Second column - output section */
.gradio-row > .gradio-column:last-child,
.gradio-row > div:last-child {
flex: 1 1 auto !important;
min-width: 0 !important;
}
/* Lock textbox container size */
.input-section .gr-textbox,
.input-section label[for] {
width: 100% !important;
}
/* Prevent form from expanding */
.input-section form {
width: 100% !important;
max-width: 100% !important;
}
/* Ensure seed input always visible */
.input-section input[type="number"] {
display: block !important;
visibility: visible !important;
}
/* Hide progress indicator in input section - target specific progress elements */
.input-section .progress-container,
.input-section [class*="progress-bar"],
.input-section [class*="progress-text"],
.input-section [class*="progress-level"],
.input-section .progress,
.input-section .eta-bar {
display: none !important;
visibility: hidden !important;
height: 0 !important;
overflow: hidden !important;
}
/* Override ALL responsive behavior - force horizontal layout at ALL viewport sizes */
@media (max-width: 2000px) {
.gradio-row,
div.gradio-row,
.gradio-container .gradio-row,
.gradio-container > .gradio-row {
flex-direction: row !important;
flex-wrap: nowrap !important;
display: flex !important;
}
.gradio-row > .gradio-column,
.gradio-row > div {
display: inline-flex !important;
}
.gradio-row > .gradio-column:first-child,
.gradio-row > div:first-child {
width: 550px !important;
min-width: 550px !important;
max-width: 550px !important;
flex: 0 0 550px !important;
}
.gradio-row > .gradio-column:last-child,
.gradio-row > div:last-child {
flex: 1 1 auto !important;
min-width: 0 !important;
}
}
/* Responsive text sizing only */
@media (max-width: 734px) {
.main-title {
font-size: 40px !important;
}
.subtitle {
font-size: 19px !important;
}
.gradio-container {
padding: 32px 16px !important;
}
.input-section,
.output-section {
padding: 24px !important;
}
/* FORCE horizontal layout even on mobile */
.gradio-row,
div.gradio-row {
flex-direction: row !important;
flex-wrap: nowrap !important;
}
}
/* Remove default Gradio styling */
.contain {
padding: 0 !important;
}
/* Hide Gradio footer */
footer {
display: none !important;
}
.footer {
display: none !important;
}
/* Target main app container */
#root, #app {
width: 100% !important;
max-width: none !important;
}
"""
# JavaScript to force horizontal layout; passed to gr.Blocks(js=...).
# What the snippet does:
#   * pins .gradio-container to 85vw;
#   * forces #main-row and every .gradio-row into a non-wrapping flex row;
#   * fixes #input-column at 550px and lets #output-column take the rest;
#   * as a fallback, applies the same widths to the first two elements matched
#     by '.gradio-row > .gradio-column, .gradio-row > div'.
# It runs once immediately, again after 100/500/1000/2000 ms, and on every
# mutation observed on document.body (child-list changes plus style/class
# attribute changes anywhere in the subtree).
# NOTE(review): in the fallback loop `index` counts across all matched elements
# in the document, not per row, so only the first two matches get the width
# rules — confirm this is intended.
# NOTE(review): the observer callback writes inline styles while the observer
# watches `style` attributes document-wide; presumably harmless when the values
# do not change, but confirm it causes no feedback loop or performance issue.
js_code = """
function() {
function forceHorizontalLayout() {
// Set container width
const container = document.querySelector('.gradio-container');
if (container) {
container.style.maxWidth = '85vw';
container.style.width = '85vw';
}
// Target the main row specifically
const mainRow = document.getElementById('main-row');
if (mainRow) {
mainRow.style.flexDirection = 'row';
mainRow.style.flexWrap = 'nowrap';
mainRow.style.display = 'flex';
mainRow.style.width = '100%';
}
// Force ALL rows to stay horizontal
const rows = document.querySelectorAll('.gradio-row');
rows.forEach(row => {
row.style.flexDirection = 'row';
row.style.flexWrap = 'nowrap';
row.style.display = 'flex';
});
// Target specific columns
const inputCol = document.getElementById('input-column');
if (inputCol) {
inputCol.style.width = '550px';
inputCol.style.minWidth = '550px';
inputCol.style.maxWidth = '550px';
inputCol.style.flex = '0 0 550px';
inputCol.style.display = 'inline-flex';
inputCol.style.flexDirection = 'column';
}
const outputCol = document.getElementById('output-column');
if (outputCol) {
outputCol.style.flex = '1 1 auto';
outputCol.style.minWidth = '0';
outputCol.style.display = 'inline-flex';
outputCol.style.flexDirection = 'column';
}
// Fallback: force all column children of rows
const columns = document.querySelectorAll('.gradio-row > .gradio-column, .gradio-row > div');
columns.forEach((col, index) => {
if (index === 0) {
col.style.width = '550px';
col.style.minWidth = '550px';
col.style.maxWidth = '550px';
col.style.flex = '0 0 550px';
} else if (index === 1) {
col.style.flex = '1 1 auto';
col.style.minWidth = '0';
}
col.style.display = 'inline-flex';
});
}
// Run immediately
forceHorizontalLayout();
// Run again after delays to override Gradio's dynamic changes
setTimeout(forceHorizontalLayout, 100);
setTimeout(forceHorizontalLayout, 500);
setTimeout(forceHorizontalLayout, 1000);
setTimeout(forceHorizontalLayout, 2000);
// Set up mutation observer to reapply on DOM changes
const observer = new MutationObserver(forceHorizontalLayout);
observer.observe(document.body, { childList: true, subtree: true, attributes: true, attributeFilter: ['style', 'class'] });
}
"""
@spaces.GPU(duration=75)
def infer(
    prompt,
    seed=42,
    randomize_seed=False,
    aspect_ratio="1:1 (1024x1024)",
    guidance_scale=5.0,
    num_inference_steps=50,
    progress=gr.Progress(track_tqdm=True),
):
    """Run one text-to-image generation and return the image with its seed.

    Args:
        prompt: Text description passed to ``generate_image``.
        seed: Seed to use; replaced by a random value in ``[0, MAX_SEED]``
            when ``randomize_seed`` is true.
        randomize_seed: Whether to ignore ``seed`` and draw a fresh one.
        aspect_ratio: Label parsed by ``get_image_size`` into width/height.
        guidance_scale: Forwarded to ``generate_image`` as ``cfg_scale``.
        num_inference_steps: Forwarded as ``denoising_steps``.
        progress: Not read in the body; presumably present so Gradio can
            mirror tqdm progress in the UI — confirm against Gradio docs.

    Returns:
        ``(image, seed)``: the first item of the generated batch as a
        ``uint8`` NumPy array, and the seed that was actually used.
    """
    width, height = get_image_size(aspect_ratio)
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)

    print(f'inference with prompt: {prompt}, size: {height}x{width}, seed: {seed}, steps: {num_inference_steps}, cfg: {guidance_scale}')

    # Run on whatever device the diffusion model's parameters live on.
    generation_kwargs = dict(
        device=next(ovis_image.parameters()).device,
        dtype=_dtype,
        model=ovis_image,
        prompt=prompt,
        autoencoder=autoencoder,
        ovis_tokenizer=ovis_tokenizer,
        ovis_encoder=ovis_encoder,
        img_height=height,
        img_width=width,
        denoising_steps=num_inference_steps,
        cfg_scale=guidance_scale,
        seed=seed,
    )
    batch = generate_image(**generation_kwargs)

    # Clamp, move to CPU, reorder dims (0, 2, 3, 1) and convert to float NumPy.
    # NOTE(review): values are clamped to [-1, 1] but then scaled by 255
    # directly; negative values would wrap when cast to uint8. Confirm that
    # generate_image already returns data in [0, 1].
    batch = batch.clamp(-1, 1).cpu().permute(0, 2, 3, 1).float().numpy()
    pixels = (batch * 255).round().astype("uint8")
    return pixels[0], seed
# Header markup rendered above the prompt box inside the input column.
_header_html = """
<div class="header-container">
<h1 class="main-title">Ovis-Image</h1>
</div>
"""

# Dropdown labels; the "(WxH)" suffix is what get_image_size() parses.
_aspect_ratio_choices = [
    "1:1 (1024x1024)",
    "4:3 (1024x768)",
    "3:4 (768x1024)",
    "16:9 (1024x576)",
    "9:16 (576x1024)",
]

# Soft base theme with colour/shadow overrides matching the values in apple_css.
_theme = gr.themes.Soft(
    primary_hue=gr.themes.colors.blue,
    secondary_hue=gr.themes.colors.slate,
    neutral_hue=gr.themes.colors.gray,
    spacing_size=gr.themes.sizes.spacing_lg,
    radius_size=gr.themes.sizes.radius_lg,
    text_size=gr.themes.sizes.text_md,
    font=[gr.themes.GoogleFont("Inter"), "SF Pro Display", "-apple-system", "BlinkMacSystemFont", "system-ui", "sans-serif"],
    font_mono=[gr.themes.GoogleFont("JetBrains Mono"), "SF Mono", "ui-monospace", "monospace"],
).set(
    body_background_fill='#f5f5f7',
    body_background_fill_dark='#000000',
    button_primary_background_fill='#0071e3',
    button_primary_background_fill_hover='#0077ed',
    button_primary_text_color='#ffffff',
    block_background_fill='#ffffff',
    block_background_fill_dark='#1d1d1f',
    block_border_width='0px',
    block_shadow='0 2px 12px rgba(0, 0, 0, 0.08)',
    block_shadow_dark='0 2px 12px rgba(0, 0, 0, 0.4)',
    input_background_fill='#ffffff',
    input_background_fill_dark='#1d1d1f',
    input_border_width='1px',
    input_border_color='#d2d2d7',
    input_border_color_dark='#424245',
    input_shadow='none',
    input_shadow_focus='0 0 0 4px rgba(0, 113, 227, 0.15)',
)

with gr.Blocks(
    title="Ovis-Image",
    fill_height=False,
    theme=_theme,
    css=apple_css,
    js=js_code,
) as demo:
    # Two-column layout. Per the original author's note, variant="panel" is
    # meant to keep the columns from stacking responsively.
    with gr.Row(equal_height=False, variant="panel", elem_id="main-row"):
        # Left column: input controls at a fixed 550px width.
        with gr.Column(
            scale=0,
            min_width=550,
            elem_classes="input-section",
            elem_id="input-column",
        ):
            gr.HTML(_header_html)

            prompt = gr.Textbox(
                placeholder="Describe the image you want to create...",
                value=examples[0],
                lines=7,
                max_lines=7,
                label="Prompt",
                show_label=True,
                container=True,
                autoscroll=False,
            )
            aspect_ratio = gr.Dropdown(
                choices=_aspect_ratio_choices,
                value=_aspect_ratio_choices[0],
                label="Aspect Ratio",
                show_label=True,
                container=True,
            )
            run_button = gr.Button(
                "Generate",
                variant="primary",
                size="lg",
                elem_classes="primary",
            )

            # Advanced settings: created with visible=False so they are not
            # shown, but they still feed their default values into infer().
            seed = gr.Slider(
                label="Seed",
                minimum=0,
                maximum=MAX_SEED,
                step=1,
                value=0,
                visible=False,
            )
            randomize_seed = gr.Checkbox(
                label="Randomize seed",
                value=True,
                visible=False,
            )
            guidance_scale = gr.Slider(
                label="Guidance scale",
                minimum=0.0,
                maximum=14.0,
                step=0.1,
                value=5.0,
                visible=False,
            )
            num_inference_steps = gr.Slider(
                label="Number of inference steps",
                minimum=1,
                maximum=100,
                step=1,
                value=50,
                visible=False,
            )

        # Right column: generated image.
        with gr.Column(scale=2, elem_classes="output-section", elem_id="output-column"):
            result = gr.Image(
                label="Result",
                show_label=False,
                type="numpy",
                format="png",
            )

    # Both the button click and the textbox submit event run infer(); the
    # seed that was used is written back into the hidden seed slider.
    gr.on(
        triggers=[run_button.click, prompt.submit],
        fn=infer,
        inputs=[
            prompt,
            seed,
            randomize_seed,
            aspect_ratio,
            guidance_scale,
            num_inference_steps,
        ],
        outputs=[result, seed],
    )

# Launch the app only when this file is executed as a script.
if __name__ == '__main__':
    demo.launch()