benibraz committed
Commit dcc21df • 1 Parent(s): a040b19

change seed and analytics

Files changed (1): app.py +160 -44
app.py CHANGED
@@ -1,7 +1,9 @@
+from functools import lru_cache
 import gradio as gr
 from gradio_toggle import Toggle
 import torch
 from huggingface_hub import snapshot_download
+from transformers import CLIPProcessor, CLIPModel
 
 from xora.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder
 from xora.models.transformers.transformer3d import Transformer3DModel
@@ -20,6 +22,9 @@ import tempfile
 import os
 import gc
 from openai import OpenAI
+import csv
+from datetime import datetime
+
 
 # Load Hugging Face token if needed
 hf_token = os.getenv("HF_TOKEN")
@@ -36,9 +41,7 @@ with open(system_prompt_i2v_path, "r") as f:
 # Set model download directory within Hugging Face Spaces
 model_path = "asset"
 if not os.path.exists(model_path):
-    snapshot_download(
-        "Lightricks/LTX-Video", local_dir=model_path, repo_type="model", token=hf_token
-    )
+    snapshot_download("Lightricks/LTX-Video", local_dir=model_path, repo_type="model", token=hf_token)
 
 # Global variables to load components
 vae_dir = Path(model_path) / "vae"
@@ -47,6 +50,94 @@ scheduler_dir = Path(model_path) / "scheduler"
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
+DATA_DIR = "/data"
+os.makedirs(DATA_DIR, exist_ok=True)
+LOG_FILE_PATH = os.path.join("/data", "user_requests.csv")
+
+clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32", cache_dir=model_path)
+clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32", cache_dir=model_path)
+
+
+if not os.path.exists(LOG_FILE_PATH):
+    with open(LOG_FILE_PATH, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(
+            [
+                "timestamp",
+                "request_type",
+                "prompt",
+                "negative_prompt",
+                "height",
+                "width",
+                "num_frames",
+                "frame_rate",
+                "seed",
+                "num_inference_steps",
+                "guidance_scale",
+                "is_enhanced",
+                "clip_embedding",
+                "original_resolution",
+            ]
+        )
+
+
+@lru_cache(maxsize=128)
+def log_request(
+    request_type,
+    prompt,
+    negative_prompt,
+    height,
+    width,
+    num_frames,
+    frame_rate,
+    seed,
+    num_inference_steps,
+    guidance_scale,
+    is_enhanced,
+    clip_embedding=None,
+    original_resolution=None,
+):
+    """Log the user's request to a CSV file."""
+    timestamp = datetime.now().isoformat()
+    with open(LOG_FILE_PATH, "a", newline="") as f:
+        try:
+            writer = csv.writer(f)
+            writer.writerow(
+                [
+                    timestamp,
+                    request_type,
+                    prompt,
+                    negative_prompt,
+                    height,
+                    width,
+                    num_frames,
+                    frame_rate,
+                    seed,
+                    num_inference_steps,
+                    guidance_scale,
+                    is_enhanced,
+                    clip_embedding,
+                    original_resolution,
+                ]
+            )
+        except Exception as e:
+            print(f"Error logging request: {e}")
+
+
+def compute_clip_embedding(text=None, image=None):
+    """
+    Compute CLIP embedding for a given text or image.
+    Args:
+        text (str): Input text prompt.
+        image (PIL.Image): Input image.
+    Returns:
+        list: CLIP embedding as a list of floats.
+    """
+    inputs = clip_processor(text=text, images=image, return_tensors="pt", padding=True)
+    outputs = clip_model.get_text_features(**inputs) if text else clip_model.get_image_features(**inputs)
+    embedding = outputs.detach().cpu().numpy().flatten().tolist()
+    return embedding
+
 
 def load_vae(vae_dir):
     vae_ckpt_path = vae_dir / "vae_diffusion_pytorch_model.safetensors"
@@ -185,12 +276,8 @@ vae = load_vae(vae_dir)
 unet = load_unet(unet_dir)
 scheduler = load_scheduler(scheduler_dir)
 patchifier = SymmetricPatchifier(patch_size=1)
-text_encoder = T5EncoderModel.from_pretrained(
-    "PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="text_encoder"
-).to(device)
-tokenizer = T5Tokenizer.from_pretrained(
-    "PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="tokenizer"
-)
+text_encoder = T5EncoderModel.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="text_encoder").to(device)
+tokenizer = T5Tokenizer.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="tokenizer")
 
 pipeline = XoraVideoPipeline(
     transformer=unet,
@@ -205,9 +292,10 @@ pipeline = XoraVideoPipeline(
 def generate_video_from_text(
     prompt="",
     enhance_prompt_toggle=False,
+    txt2vid_analytics_toggle=True,
     negative_prompt="",
     frame_rate=25,
-    seed=171198,
+    seed=646373,
     num_inference_steps=30,
     guidance_scale=3,
     height=512,
@@ -221,6 +309,21 @@ def generate_video_from_text(
             duration=5,
         )
 
+    if txt2vid_analytics_toggle:
+        log_request(
+            "txt2vid",
+            prompt,
+            negative_prompt,
+            height,
+            width,
+            num_frames,
+            frame_rate,
+            seed,
+            num_inference_steps,
+            guidance_scale,
+            enhance_prompt_toggle,
+        )
+
     prompt = enhance_prompt_if_enabled(prompt, enhance_prompt_toggle, type="t2v")
 
     sample = {
@@ -269,9 +372,7 @@ def generate_video_from_text(
     video_np = images.squeeze(0).permute(1, 2, 3, 0).cpu().float().numpy()
     video_np = (video_np * 255).astype(np.uint8)
     height, width = video_np.shape[1:3]
-    out = cv2.VideoWriter(
-        output_path, cv2.VideoWriter_fourcc(*"mp4v"), frame_rate, (width, height)
-    )
+    out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*"mp4v"), frame_rate, (width, height))
     for frame in video_np[..., ::-1]:
         out.write(frame)
     out.release()
@@ -286,9 +387,10 @@ def generate_video_from_image(
     image_path,
     prompt="",
    enhance_prompt_toggle=False,
+    img2vid_analytics_toggle=True,
     negative_prompt="",
     frame_rate=25,
-    seed=171198,
+    seed=646373,
     num_inference_steps=30,
     guidance_scale=3,
     height=512,
@@ -310,9 +412,28 @@ def generate_video_from_image(
     if not image_path:
         raise gr.Error("Please provide an input image.", duration=5)
 
-    media_items = (
-        load_image_to_tensor_with_resize(image_path, height, width).to(device).detach()
-    )
+    if img2vid_analytics_toggle:
+        with Image.open(image_path) as img:
+            original_resolution = f"{img.width}x{img.height}"  # Format as "widthxheight"
+            clip_embedding = compute_clip_embedding(image=img)
+
+        log_request(
+            "img2vid",
+            prompt,
+            negative_prompt,
+            height,
+            width,
+            num_frames,
+            frame_rate,
+            seed,
+            num_inference_steps,
+            guidance_scale,
+            enhance_prompt_toggle,
+            json.dumps(clip_embedding),
+            original_resolution,
+        )
+
+    media_items = load_image_to_tensor_with_resize(image_path, height, width).to(device).detach()
 
     prompt = enhance_prompt_if_enabled(prompt, enhance_prompt_toggle, type="i2v")
 
@@ -353,9 +474,7 @@ def generate_video_from_image(
     video_np = images.squeeze(0).permute(1, 2, 3, 0).cpu().float().numpy()
     video_np = (video_np * 255).astype(np.uint8)
     height, width = video_np.shape[1:3]
-    out = cv2.VideoWriter(
-        output_path, cv2.VideoWriter_fourcc(*"mp4v"), frame_rate, (width, height)
-    )
+    out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*"mp4v"), frame_rate, (width, height))
     for frame in video_np[..., ::-1]:
         out.write(frame)
     out.release()
@@ -374,15 +493,9 @@ def generate_video_from_image(
 
 def create_advanced_options():
     with gr.Accordion("Step 4: Advanced Options (Optional)", open=False):
-        seed = gr.Slider(
-            label="4.1 Seed", minimum=0, maximum=1000000, step=1, value=171198
-        )
-        inference_steps = gr.Slider(
-            label="4.2 Inference Steps", minimum=1, maximum=50, step=1, value=30
-        )
-        guidance_scale = gr.Slider(
-            label="4.3 Guidance Scale", minimum=1.0, maximum=5.0, step=0.1, value=3.0
-        )
+        seed = gr.Slider(label="4.1 Seed", minimum=0, maximum=1000000, step=1, value=646373)
+        inference_steps = gr.Slider(label="4.2 Inference Steps", minimum=1, maximum=50, step=1, value=30)
+        guidance_scale = gr.Slider(label="4.3 Guidance Scale", minimum=1.0, maximum=5.0, step=0.1, value=3.0)
 
         height_slider = gr.Slider(
            label="4.4 Height",
@@ -451,9 +564,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
             </div>
             """
         )
-    with gr.Accordion(
-        " 📖 Tips for Best Results", open=False, elem_id="instructions-accordion"
-    ):
+    with gr.Accordion(" 📖 Tips for Best Results", open=False, elem_id="instructions-accordion"):
         gr.Markdown(
             """
             📝 Prompt Engineering
@@ -491,6 +602,12 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
                     value="A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage.",
                     lines=5,
                 )
+                txt2vid_analytics_toggle = Toggle(
+                    label="I agree to share my usage data anonymously to help improve the model features.",
+                    value=True,
+                    interactive=True,
+                )
+
                 txt2vid_enhance_toggle = Toggle(
                     label="Enhance Prompt",
                     value=False,
@@ -566,6 +683,11 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
                     value="A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage.",
                     lines=5,
                 )
+                img2vid_analytics_toggle = Toggle(
+                    label="I agree to share my usage data anonymously to help improve the model features.",
+                    value=True,
+                    interactive=True,
+                )
                 img2vid_enhance_toggle = Toggle(
                     label="Enhance Prompt",
                     value=False,
@@ -593,9 +715,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
                 )
 
                 img2vid_advanced = create_advanced_options()
-                img2vid_generate = gr.Button(
-                    "Step 6: Generate Video", variant="primary", size="lg"
-                )
+                img2vid_generate = gr.Button("Step 6: Generate Video", variant="primary", size="lg")
 
             with gr.Column():
                 img2vid_output = gr.Video(label="Generated Output")
@@ -632,15 +752,14 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
         )
 
     # [Previous event handlers remain the same]
-    txt2vid_preset.change(
-        fn=preset_changed, inputs=[txt2vid_preset], outputs=txt2vid_advanced[3:]
-    )
+    txt2vid_preset.change(fn=preset_changed, inputs=[txt2vid_preset], outputs=txt2vid_advanced[3:])
 
     txt2vid_generate.click(
         fn=generate_video_from_text,
         inputs=[
             txt2vid_prompt,
             txt2vid_enhance_toggle,
+            txt2vid_analytics_toggle,
             txt2vid_negative_prompt,
             txt2vid_frame_rate,
             *txt2vid_advanced,
@@ -651,9 +770,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
         queue=True,
     )
 
-    img2vid_preset.change(
-        fn=preset_changed, inputs=[img2vid_preset], outputs=img2vid_advanced[3:]
-    )
+    img2vid_preset.change(fn=preset_changed, inputs=[img2vid_preset], outputs=img2vid_advanced[3:])
 
     img2vid_generate.click(
         fn=generate_video_from_image,
@@ -661,6 +778,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
             img2vid_image,
             img2vid_prompt,
             img2vid_enhance_toggle,
+            img2vid_analytics_toggle,
             img2vid_negative_prompt,
             img2vid_frame_rate,
             *img2vid_advanced,
@@ -672,6 +790,4 @@ with gr.Blocks(theme=gr.themes.Soft()) as iface:
     )
 
 if __name__ == "__main__":
-    iface.queue(max_size=64, default_concurrency_limit=1, api_open=False).launch(
-        share=True, show_api=False
-    )
+    iface.queue(max_size=64, default_concurrency_limit=1, api_open=False).launch(share=True, show_api=False)