Files changed (4)
  1. README.md +1 -1
  2. app.py +36 -98
  3. requirements.txt +5 -8
  4. style.css +1 -189
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🚀
 colorFrom: pink
 colorTo: pink
 sdk: gradio
-sdk_version: 3.23.0
+sdk_version: 3.22.1
 app_file: app.py
 pinned: false
 ---
app.py CHANGED
@@ -3,75 +3,57 @@
 from __future__ import annotations
 
 import os
+import pathlib
 import random
-import tempfile
+import shlex
+import subprocess
 
 import gradio as gr
-import imageio
-import numpy as np
 import torch
-from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
+from huggingface_hub import snapshot_download
 
-DESCRIPTION = '# [ModelScope Text to Video Synthesis](https://modelscope.cn/models/damo/text-to-video-synthesis/summary)'
-DESCRIPTION += '\n<p>For Colab usage, you can view <a href="https://colab.research.google.com/drive/1uW1ZqswkQ9Z9bp5Nbo5z59cAn7I0hE6R?usp=sharing" style="text-decoration: underline;" target="_blank">this webpage</a>.(the latest update on 2023.03.21)</p>'
-DESCRIPTION += '\n<p>This model can only be used for non-commercial purposes. To learn more about the model, take a look at the <a href="https://huggingface.co/damo-vilab/modelscope-damo-text-to-video-synthesis" style="text-decoration: underline;" target="_blank">model card</a>.</p>'
-if (SPACE_ID := os.getenv('SPACE_ID')) is not None:
-    DESCRIPTION += f'\n<p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings. <a href="https://huggingface.co/spaces/{SPACE_ID}?duplicate=true"><img style="display: inline; margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space" /></a></p>'
+if os.getenv('SYSTEM') == 'spaces':
+    subprocess.run(shlex.split('pip uninstall -y modelscope'))
+    subprocess.run(
+        shlex.split(
+            'pip install git+https://github.com/modelscope/modelscope.git@refs/pull/207/head'
+        ))
 
-MAX_NUM_FRAMES = int(os.getenv('MAX_NUM_FRAMES', '200'))
-DEFAULT_NUM_FRAMES = min(MAX_NUM_FRAMES,
-                         int(os.getenv('DEFAULT_NUM_FRAMES', '16')))
+from modelscope.outputs import OutputKeys
+from modelscope.pipelines import pipeline
 
-pipe = DiffusionPipeline.from_pretrained('damo-vilab/text-to-video-ms-1.7b',
-                                         torch_dtype=torch.float16,
-                                         variant='fp16')
-pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
-pipe.enable_model_cpu_offload()
-pipe.enable_vae_slicing()
+model_dir = pathlib.Path('weights')
+if not model_dir.exists():
+    model_dir.mkdir()
+    snapshot_download('damo-vilab/modelscope-damo-text-to-video-synthesis',
+                      repo_type='model',
+                      local_dir=model_dir)
 
+DESCRIPTION = '# [ModelScope Text to Video Synthesis](https://modelscope.cn/models/damo/text-to-video-synthesis/summary)'
+if (SPACE_ID := os.getenv('SPACE_ID')) is not None:
+    DESCRIPTION += f'\n<p>For faster inference without waiting in queue, you may duplicate the space and upgrade to GPU in settings. <a href="https://huggingface.co/spaces/{SPACE_ID}?duplicate=true"><img style="display: inline; margin-top: 0em; margin-bottom: 0em" src="https://bit.ly/3gLdBN6" alt="Duplicate Space" /></a></p>'
 
-def to_video(frames: list[np.ndarray], fps: int) -> str:
-    out_file = tempfile.NamedTemporaryFile(suffix='.mp4', delete=False)
-    writer = imageio.get_writer(out_file.name, format='FFMPEG', fps=fps)
-    for frame in frames:
-        writer.append_data(frame)
-    writer.close()
-    return out_file.name
+pipe = pipeline('text-to-video-synthesis', model_dir.as_posix())
 
 
-def generate(prompt: str, seed: int, num_frames: int,
-             num_inference_steps: int) -> str:
+def generate(prompt: str, seed: int) -> str:
     if seed == -1:
         seed = random.randint(0, 1000000)
-    generator = torch.Generator().manual_seed(seed)
-    frames = pipe(prompt,
-                  num_inference_steps=num_inference_steps,
-                  num_frames=num_frames,
-                  generator=generator).frames
-    return to_video(frames, 8)
+    torch.manual_seed(seed)
+    return pipe({'text': prompt})[OutputKeys.OUTPUT_VIDEO]
 
 
 examples = [
-    ['An astronaut riding a horse.', 0, 16, 25],
-    ['A panda eating bamboo on a rock.', 0, 16, 25],
-    ['Spiderman is surfing.', 0, 16, 25],
+    ['An astronaut riding a horse.', 0],
+    ['A panda eating bamboo on a rock.', 0],
+    ['Spiderman is surfing.', 0],
 ]
 
 with gr.Blocks(css='style.css') as demo:
     gr.Markdown(DESCRIPTION)
-    with gr.Group():
-        with gr.Box():
-            with gr.Row(elem_id='prompt-container').style(equal_height=True):
-                prompt = gr.Text(
-                    label='Prompt',
-                    show_label=False,
-                    max_lines=1,
-                    placeholder='Enter your prompt',
-                    elem_id='prompt-text-input').style(container=False)
-                run_button = gr.Button('Generate video').style(
-                    full_width=False)
-    result = gr.Video(label='Result', show_label=False, elem_id='gallery')
-    with gr.Accordion('Advanced options', open=False):
+    with gr.Row():
+        with gr.Column():
+            prompt = gr.Text(label='Prompt', max_lines=1)
             seed = gr.Slider(
                 label='Seed',
                 minimum=-1,
@@ -79,27 +61,11 @@ with gr.Blocks(css='style.css') as demo:
                 step=1,
                 value=-1,
                 info='If set to -1, a different seed will be used each time.')
-        num_frames = gr.Slider(
-            label='Number of frames',
-            minimum=16,
-            maximum=MAX_NUM_FRAMES,
-            step=1,
-            value=16,
-            info=
-            'Note that the content of the video also changes when you change the number of frames.'
-        )
-        num_inference_steps = gr.Slider(label='Number of inference steps',
-                                        minimum=10,
-                                        maximum=50,
-                                        step=1,
-                                        value=25)
-
-    inputs = [
-        prompt,
-        seed,
-        num_frames,
-        num_inference_steps,
-    ]
+            run_button = gr.Button('Run')
+        with gr.Column():
+            result = gr.Video(label='Result')
+
+    inputs = [prompt, seed]
     gr.Examples(examples=examples,
                 inputs=inputs,
                 outputs=result,
@@ -109,32 +75,4 @@ with gr.Blocks(css='style.css') as demo:
     prompt.submit(fn=generate, inputs=inputs, outputs=result)
     run_button.click(fn=generate, inputs=inputs, outputs=result)
 
-
-    with gr.Accordion(label='We are hiring(Based in Beijing / Hangzhou, China.)', open=False):
-        gr.HTML("""<div class="acknowledgments">
-            <p>
-            If you're looking for an exciting challenge and the opportunity to work with cutting-edge technologies in AIGC and large-scale pretraining, then we are the place for you. We are looking for talented, motivated and creative individuals to join our team. If you are interested, please send your CV to us.
-            </p>
-            <p>
-            <b>EMAIL: yingya.zyy@alibaba-inc.com</b>.
-            </p>
-        </div>
-        """)
-
-    with gr.Accordion(label='Biases and content acknowledgment', open=False):
-        gr.HTML("""<div class="acknowledgments">
-            <h4>Biases and content acknowledgment</h4>
-            <p>
-            Despite how impressive being able to turn text into video is, beware to the fact that this model may output content that reinforces or exacerbates societal biases. The training data includes LAION5B, ImageNet, Webvid and other public datasets. The model was not trained to realistically represent people or events, so using it to generate such content is beyond the model's capabilities.
-            </p>
-            <p>
-            It is not intended to generate content that is demeaning or harmful to people or their environment, culture, religion, etc. Similarly, it is not allowed to generate pornographic, violent and bloody content generation. <b>The model is meant for research purposes</b>.
-            </p>
-            <p>
-            To learn more about the model, head to its <a href="https://huggingface.co/damo-vilab/modelscope-damo-text-to-video-synthesis" style="text-decoration: underline;" target="_blank">model card</a>.
-            </p>
-        </div>
-        """)
-
-
 demo.queue(api_open=False, max_size=15).launch()
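
For reference, the new inference path this diff introduces can be exercised outside Gradio. The following is a minimal sketch assembled only from the code shown in the hunks above (modelscope pipeline, OutputKeys, weights downloaded into ./weights); it assumes the pinned modelscope[multi-modal]==1.4.1 from the new requirements.txt.

# Minimal standalone sketch of the new modelscope-based path, mirroring app.py.
import pathlib

import torch
from huggingface_hub import snapshot_download
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline

# Download the weights once, exactly as the new app.py does.
model_dir = pathlib.Path('weights')
if not model_dir.exists():
    model_dir.mkdir()
    snapshot_download('damo-vilab/modelscope-damo-text-to-video-synthesis',
                      repo_type='model',
                      local_dir=model_dir)

pipe = pipeline('text-to-video-synthesis', model_dir.as_posix())
torch.manual_seed(0)  # global seeding, as in the new generate()
result = pipe({'text': 'An astronaut riding a horse.'})
print(result[OutputKeys.OUTPUT_VIDEO])  # path to the generated video file

Note that the new code seeds globally with torch.manual_seed rather than passing a per-call torch.Generator as the removed diffusers version did, so reproducibility for a given seed depends on nothing else consuming the global RNG between calls.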
requirements.txt CHANGED
@@ -1,8 +1,5 @@
-accelerate==0.17.1
-git+https://github.com/huggingface/diffusers@9dc8444
-gradio==3.23.0
-huggingface-hub==0.13.3
-imageio[ffmpeg]==2.26.1
-torch==2.0.0
-torchvision==0.15.1
-transformers==4.27.2
+decord==0.6.0
+fairseq==0.12.2
+gradio==3.22.1
+modelscope[multi-modal]==1.4.1
+open_clip_torch==2.16.0
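
Since the dependency set is swapped wholesale, a quick sanity check that the pins resolved can help when debugging the Space. This is a hypothetical helper, not part of the PR; it uses the stdlib importlib.metadata (Python 3.8+) and assumes the distribution names match the pins as written.

# Hypothetical sanity check: confirm the pinned distributions from
# requirements.txt are installed at their pinned versions.
from importlib.metadata import PackageNotFoundError, version

PINS = {
    'decord': '0.6.0',
    'fairseq': '0.12.2',
    'gradio': '3.22.1',
    'modelscope': '1.4.1',
    'open_clip_torch': '2.16.0',
}

for name, pinned in PINS.items():
    try:
        installed = version(name)
    except PackageNotFoundError:
        installed = 'MISSING'
    status = 'ok' if installed == pinned else 'MISMATCH'
    print(f'{name}: installed={installed} pinned={pinned} [{status}]')

A modelscope mismatch is expected on Spaces itself, since the new app.py reinstalls modelscope from a git pull-request branch at startup.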
 
 
 
style.css CHANGED
@@ -1,191 +1,3 @@
-/*
-This CSS file is copied from here:
-https://huggingface.co/spaces/stabilityai/stable-diffusion/blob/2794a3c3ba66115c307075098e713f572b08bf80/app.py
-*/
-
 h1 {
-  text-align: center;
-}
-
-.gradio-container {
-  font-family: 'IBM Plex Sans', sans-serif;
-}
-
-.gr-button {
-  color: white;
-  border-color: black;
-  background: black;
-}
-
-input[type='range'] {
-  accent-color: black;
-}
-
-.dark input[type='range'] {
-  accent-color: #dfdfdf;
-}
-
-.container {
-  max-width: 730px;
-  margin: auto;
-  padding-top: 1.5rem;
-}
-
-#gallery {
-  min-height: 22rem;
-  margin-bottom: 15px;
-  margin-left: auto;
-  margin-right: auto;
-  border-bottom-right-radius: .5rem !important;
-  border-bottom-left-radius: .5rem !important;
-}
-
-#gallery>div>.h-full {
-  min-height: 20rem;
-}
-
-.details:hover {
-  text-decoration: underline;
-}
-
-.gr-button {
-  white-space: nowrap;
-}
-
-.gr-button:focus {
-  border-color: rgb(147 197 253 / var(--tw-border-opacity));
-  outline: none;
-  box-shadow: var(--tw-ring-offset-shadow), var(--tw-ring-shadow), var(--tw-shadow, 0 0 #0000);
-  --tw-border-opacity: 1;
-  --tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);
-  --tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(3px var(--tw-ring-offset-width)) var(--tw-ring-color);
-  --tw-ring-color: rgb(191 219 254 / var(--tw-ring-opacity));
-  --tw-ring-opacity: .5;
-}
-
-#advanced-btn {
-  font-size: .7rem !important;
-  line-height: 19px;
-  margin-top: 12px;
-  margin-bottom: 12px;
-  padding: 2px 8px;
-  border-radius: 14px !important;
-}
-
-#advanced-options {
-  display: none;
-  margin-bottom: 20px;
-}
-
-.footer {
-  margin-bottom: 45px;
-  margin-top: 35px;
-  text-align: center;
-  border-bottom: 1px solid #e5e5e5;
-}
-
-.footer>p {
-  font-size: .8rem;
-  display: inline-block;
-  padding: 0 10px;
-  transform: translateY(10px);
-  background: white;
-}
-
-.dark .footer {
-  border-color: #303030;
-}
-
-.dark .footer>p {
-  background: #0b0f19;
-}
-
-.acknowledgments h4 {
-  margin: 1.25em 0 .25em 0;
-  font-weight: bold;
-  font-size: 115%;
-}
-
-.animate-spin {
-  animation: spin 1s linear infinite;
-}
-
-@keyframes spin {
-  from {
-    transform: rotate(0deg);
-  }
-
-  to {
-    transform: rotate(360deg);
-  }
-}
-
-#share-btn-container {
-  display: flex;
-  padding-left: 0.5rem !important;
-  padding-right: 0.5rem !important;
-  background-color: #000000;
-  justify-content: center;
-  align-items: center;
-  border-radius: 9999px !important;
-  width: 13rem;
-  margin-top: 10px;
-  margin-left: auto;
-}
-
-#share-btn {
-  all: initial;
-  color: #ffffff;
-  font-weight: 600;
-  cursor: pointer;
-  font-family: 'IBM Plex Sans', sans-serif;
-  margin-left: 0.5rem !important;
-  padding-top: 0.25rem !important;
-  padding-bottom: 0.25rem !important;
-  right: 0;
-}
-
-#share-btn * {
-  all: unset;
-}
-
-#share-btn-container div:nth-child(-n+2) {
-  width: auto !important;
-  min-height: 0px !important;
-}
-
-#share-btn-container .wrap {
-  display: none !important;
-}
-
-.gr-form {
-  flex: 1 1 50%;
-  border-top-right-radius: 0;
-  border-bottom-right-radius: 0;
-}
-
-#prompt-container {
-  gap: 0;
-}
-
-#prompt-text-input,
-#negative-prompt-text-input {
-  padding: .45rem 0.625rem
-}
-
-#component-16 {
-  border-top-width: 1px !important;
-  margin-top: 1em
-}
-
-.image_duplication {
-  position: absolute;
-  width: 100px;
-  left: 50px
-}
-
-#component-0 {
-  max-width: 730px;
-  margin: auto;
-  padding-top: 1.5rem;
+  text-align: center;
 }