import gradio as gr
import numpy as np
import json
import os
import tempfile
from typing import Optional, Tuple, Dict, Any
import time
from pathlib import Path

# Mock implementation for demonstration
# In a real deployment, this would integrate with the actual HY-WorldPlay model
class HYWorldPlayDemo:
    def __init__(self):
        self.model_loaded = False
        self.generation_history = []
        
    def load_model(self, model_type: str = "bidirectional"):
        """Mock model loading"""
        time.sleep(2)  # Simulate loading time
        self.model_loaded = True
        return f"โœ… {model_type.capitalize()} model loaded successfully!"
    
    def generate_video(self, 
                      prompt: str,
                      image_path: Optional[str] = None,
                      resolution: str = "480p",
                      aspect_ratio: str = "16:9",
                      num_frames: int = 125,
                      seed: int = 1,
                      model_type: str = "bidirectional",
                      action_type: str = "forward"):
        """
        Mock video generation (generator).
        Yields (progress_text, metadata) while processing, then (video_path, metadata) when finished.
        """
        if not self.model_loaded:
            raise gr.Error("Please load the model first!")
        
        # Simulate generation time
        progress_steps = [
            "Initializing generation pipeline...",
            "Processing prompt and image...",
            "Generating video frames...",
            "Applying temporal consistency...",
            "Rendering final video..."
        ]
        
        for step in progress_steps:
            yield step, {"status": "processing", "step": step}
            time.sleep(1)
        
        # Create a mock video file (in real implementation, this would be actual video generation)
        temp_dir = tempfile.mkdtemp()
        video_path = os.path.join(temp_dir, "generated_video.mp4")
        
        # Create a simple placeholder text file to simulate video
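        # NOTE: gr.Video cannot actually play this text placeholder; a real pipeline would write encoded video frames here.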
        with open(video_path, 'w') as f:
            f.write(f"Generated video for prompt: {prompt}\n")
            f.write(f"Resolution: {resolution}\n")
            f.write(f"Aspect Ratio: {aspect_ratio}\n")
            f.write(f"Frames: {num_frames}\n")
            f.write(f"Model: {model_type}\n")
            f.write(f"Action: {action_type}\n")
        
        # Save generation metadata
        metadata = {
            "prompt": prompt,
            "resolution": resolution,
            "aspect_ratio": aspect_ratio,
            "num_frames": num_frames,
            "seed": seed,
            "model_type": model_type,
            "action_type": action_type,
            "generation_time": time.strftime("%Y-%m-%d %H:%M:%S"),
            "status": "completed"
        }
        
        self.generation_history.append(metadata)
        
        yield video_path, metadata

# Initialize demo class
demo_instance = HYWorldPlayDemo()

def load_model_wrapper(model_type):
    """Wrapper for model loading with status updates"""
    return demo_instance.load_model(model_type)

def generate_video_wrapper(*args):
    """Wrapper for video generation: maps each yielded (value, metadata) pair onto the three UI outputs."""
    for value, meta in demo_instance.generate_video(*args):
        if meta.get("status") == "processing":
            yield f'<div class="generation-progress">⏳ {value}</div>', gr.Video(visible=False), gr.JSON(visible=False)
        else:
            yield f'<div class="generation-progress">✅ Generation completed</div>', gr.Video(value=value, visible=True), gr.JSON(value=meta, visible=True)

def create_pose_json(trajectory_type: str, num_keyframes: int = 32) -> str:
    """Create a mock pose JSON file for camera trajectory"""
    poses = []
    
    if trajectory_type == "forward":
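        # Forward: step the camera along +x in 0.1-unit increments with a fixed 60° FOV.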
        for i in range(num_keyframes):
            poses.append({
                "frame": i,
                "position": [i * 0.1, 0, 0],
                "rotation": [0, 0, 0],
                "fov": 60
            })
    elif trajectory_type == "circular":
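        # Circular: orbit the origin on a radius-2 circle in the x-z plane, yawing to track the angle.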
        for i in range(num_keyframes):
            angle = (i / num_keyframes) * 2 * np.pi
            poses.append({
                "frame": i,
                "position": [np.cos(angle) * 2, 0, np.sin(angle) * 2],
                "rotation": [0, np.degrees(angle), 0],
                "fov": 60
            })
    elif trajectory_type == "zoom":
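        # Zoom: keep the camera fixed and narrow the FOV from 60° toward 20° to simulate zooming in.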
        for i in range(num_keyframes):
            zoom = 1 + (i / num_keyframes) * 2
            poses.append({
                "frame": i,
                "position": [0, 0, 0],
                "rotation": [0, 0, 0],
                "fov": 60 / zoom
            })
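    # The "custom" option has no dedicated branch above, so it currently writes an empty pose list.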
    
    temp_dir = tempfile.mkdtemp()
    json_path = os.path.join(temp_dir, "pose_trajectory.json")
    
    with open(json_path, 'w') as f:
        json.dump({"poses": poses}, f, indent=2)
    
    return json_path
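
# For reference, the trajectory JSON written by create_pose_json looks like this
# (abbreviated "forward" example):
# {
#   "poses": [
#     {"frame": 0, "position": [0.0, 0, 0], "rotation": [0, 0, 0], "fov": 60},
#     {"frame": 1, "position": [0.1, 0, 0], "rotation": [0, 0, 0], "fov": 60},
#     ...
#   ]
# }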

def get_generation_history():
    """Return generation history as formatted text"""
    if not demo_instance.generation_history:
        return "No generations yet."
    
    history_text = ""
    for i, gen in enumerate(demo_instance.generation_history[-5:], 1):
        history_text += f"**Generation {i}**\n"
        history_text += f"- Prompt: {gen['prompt'][:50]}...\n"
        history_text += f"- Model: {gen['model_type']}\n"
        history_text += f"- Frames: {gen['num_frames']}\n"
        history_text += f"- Time: {gen['generation_time']}\n\n"
    
    return history_text

# Custom CSS for enhanced UI
custom_css = """
.main-container {
    max-width: 1400px;
    margin: 0 auto;
}

.model-card {
    border: 2px solid #e5e7eb;
    border-radius: 12px;
    padding: 20px;
    margin: 10px 0;
    transition: all 0.3s ease;
}

.model-card:hover {
    border-color: #3b82f6;
    box-shadow: 0 4px 12px rgba(59, 130, 246, 0.1);
}

.status-indicator {
    display: inline-block;
    width: 12px;
    height: 12px;
    border-radius: 50%;
    margin-right: 8px;
}

.status-ready { background-color: #10b981; }
.status-loading { background-color: #f59e0b; }
.status-error { background-color: #ef4444; }

.feature-highlight {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 20px;
    border-radius: 12px;
    margin: 20px 0;
}

.generation-progress {
    font-family: 'Courier New', monospace;
    background: #1f2937;
    color: #10b981;
    padding: 15px;
    border-radius: 8px;
    margin: 10px 0;
}
"""

# Build the UI. The theme must be passed to gr.Blocks; Blocks.launch() does not accept a `theme` argument.
with gr.Blocks(
    css=custom_css,
    title="HY-WorldPlay: Interactive World Modeling",
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="indigo",
        neutral_hue="slate",
        font=gr.themes.GoogleFont("Inter"),
        text_size="lg",
        spacing_size="lg",
        radius_size="md"
    )
) as demo:
    # Header
    gr.HTML("""
    <div style="text-align: center; margin-bottom: 30px;">
        <h1 style="font-size: 2.5em; margin-bottom: 10px;">🎮 HY-WorldPlay</h1>
        <p style="font-size: 1.2em; color: #6b7280;">Real-Time Interactive World Modeling with Geometric Consistency</p>
        <p style="margin-top: 10px;">
            <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: #3b82f6; text-decoration: none;">
                Built with anycoder
            </a>
        </p>
    </div>
    """)
    
    # Feature highlights
    with gr.Row():
        with gr.Column():
            gr.HTML("""
            <div class="feature-highlight">
                <h3>🚀 Key Features</h3>
                <ul style="margin: 10px 0;">
                    <li>Real-time video generation at 24 FPS</li>
                    <li>Long-term geometric consistency</li>
                    <li>Dual Action Representation for control</li>
                    <li>Reconstituted Context Memory</li>
                    <li>WorldCompass RL post-training</li>
                    <li>Context Forcing distillation</li>
                </ul>
            </div>
            """)
    
    # Main interface tabs
    with gr.Tabs() as main_tabs:
        # Tab 1: Video Generation
        with gr.TabItem("🎬 Video Generation", id="gen_tab"):
            with gr.Row():
                with gr.Column(scale=2):
                    gr.Markdown("### Configuration")
                    
                    # Model selection
                    model_type = gr.Radio(
                        choices=["bidirectional", "autoregressive", "autoregressive_distilled"],
                        value="bidirectional",
                        label="Model Type",
                        info="Choose the model variant for generation"
                    )
                    
                    load_model_btn = gr.Button("🔄 Load Model", variant="primary", size="lg")
                    model_status = gr.HTML('<div><span class="status-indicator status-error"></span>Model not loaded</div>')
                    
                    # Input controls
                    with gr.Accordion("📝 Input Settings", open=True):
                        prompt_input = gr.Textbox(
                            label="Prompt",
                            placeholder="Describe the world you want to generate...",
                            lines=3,
                            value="A peaceful landscape with a stone bridge spanning a calm body of water, surrounded by lush green trees and a traditional pavilion."
                        )
                        
                        # gr.Image does not accept an `info` argument, so the upload hint is folded into the label.
                        image_input = gr.Image(
                            label="Input Image (Optional, guides the generation)",
                            type="filepath",
                            sources=["upload", "clipboard"]
                        )
                    
                    with gr.Accordion("⚙️ Generation Settings", open=False):
                        with gr.Row():
                            resolution = gr.Dropdown(
                                choices=["480p", "720p", "1080p"],
                                value="480p",
                                label="Resolution"
                            )
                            aspect_ratio = gr.Dropdown(
                                choices=["16:9", "9:16", "1:1", "4:3"],
                                value="16:9",
                                label="Aspect Ratio"
                            )
                        
                        with gr.Row():
                            num_frames = gr.Slider(
                                minimum=16,
                                maximum=250,
                                value=125,
                                step=1,
                                label="Number of Frames"
                            )
                            seed = gr.Number(
                                value=1,
                                label="Seed",
                                precision=0
                            )
                    
                    # Camera trajectory
                    with gr.Accordion("🎥 Camera Trajectory", open=False):
                        trajectory_type = gr.Radio(
                            choices=["forward", "circular", "zoom", "custom"],
                            value="forward",
                            label="Trajectory Type"
                        )
                        create_pose_btn = gr.Button("Generate Trajectory JSON")
                        pose_status = gr.Textbox(label="Trajectory Status", interactive=False)
                    
                    # Generation button
                    generate_btn = gr.Button("🚀 Generate Video", variant="primary", size="lg", visible=False)
                    
                with gr.Column(scale=1):
                    gr.Markdown("### Output & Progress")
                    
                    # Progress display
                    progress_display = gr.HTML('<div class="generation-progress">Ready to generate...</div>')
                    
                    # Output video
                    video_output = gr.Video(
                        label="Generated Video",
                        visible=False
                    )
                    
                    # Metadata
                    metadata_output = gr.JSON(
                        label="Generation Metadata",
                        visible=False
                    )
        
        # Tab 2: Model Comparison
        with gr.TabItem("📊 Model Comparison", id="compare_tab"):
            gr.Markdown("### Performance Comparison")
            
            # Performance metrics table
            gr.DataFrame(
                headers=["Model", "Real-time", "PSNR ↑", "SSIM ↑", "LPIPS ↓", "R_dist ↓", "T_dist ↓"],
                datatype=["str", "str", "number", "number", "number", "number", "number"],
                value=[
                    ["CameraCtrl", "❌", 17.93, 0.569, 0.298, 0.037, 0.341],
                    ["SEVA", "❌", 19.84, 0.598, 0.313, 0.047, 0.223],
                    ["ViewCrafter", "❌", 19.91, 0.617, 0.327, 0.029, 0.543],
                    ["Gen3C", "❌", 21.68, 0.635, 0.278, 0.024, 0.477],
                    ["VMem", "❌", 19.97, 0.587, 0.316, 0.048, 0.219],
                    ["Matrix-Game-2.0", "✅", 17.26, 0.505, 0.383, 0.287, 0.843],
                    ["GameCraft", "❌", 21.05, 0.639, 0.341, 0.151, 0.617],
                    ["Ours (w/o Context Forcing)", "❌", 21.27, 0.669, 0.261, 0.033, 0.157],
                    ["Ours (full)", "✅", 21.92, 0.702, 0.247, 0.031, 0.121]
                ],
                label="Quantitative Evaluation Results",
                interactive=False
            )
            
            with gr.Row():
                with gr.Column():
                    gr.Markdown("""
                    ### 🔬 Key Improvements
                    - **Real-time Performance**: Achieves 24 FPS streaming generation
                    - **Superior Consistency**: Best-in-class long-term geometric consistency
                    - **Memory Efficiency**: Reconstituted Context Memory prevents error drift
                    - **Action Control**: Precise keyboard and mouse input response
                    """)
                
                with gr.Column():
                    gr.Markdown("""
                    ### 📈 Technical Innovations
                    - **Dual Action Representation**: Robust action control
                    - **Context Forcing**: Memory-aware model distillation
                    - **WorldCompass**: RL-based post-training
                    - **Temporal Reframing**: Long-past frame accessibility
                    """)
        
        # Tab 3: Examples
        with gr.TabItem("🎨 Examples", id="examples_tab"):
            gr.Markdown("### Sample Generations")
            
            with gr.Row():
                with gr.Column():
                    gr.Markdown("""
                    #### Example 1: Bridge Scene
                    **Prompt**: A paved pathway leads towards a stone arch bridge spanning a calm body of water...
                    **Action**: Forward movement
                    **Frames**: 125
                    """)
                    gr.Image("https://via.placeholder.com/400x225/3b82f6/ffffff?text=Bridge+Scene+Example", label="Example 1")
                
                with gr.Column():
                    gr.Markdown("""
                    #### Example 2: Forest Path
                    **Prompt**: A winding path through an enchanted forest with ancient trees...
                    **Action**: Circular trajectory
                    **Frames**: 125
                    """)
                    gr.Image("https://via.placeholder.com/400x225/10b981/ffffff?text=Forest+Path+Example", label="Example 2")
            
            with gr.Row():
                with gr.Column():
                    gr.Markdown("""
                    #### Example 3: Urban Scene
                    **Prompt**: A futuristic cityscape with flying vehicles and neon lights...
                    **Action**: Zoom in
                    **Frames**: 125
                    """)
                    gr.Image("https://via.placeholder.com/400x225/8b5cf6/ffffff?text=Urban+Scene+Example", label="Example 3")
                
                with gr.Column():
                    gr.Markdown("""
                    #### Example 4: Interior Scene
                    **Prompt**: A cozy library with bookshelves and warm lighting...
                    **Action**: Custom trajectory
                    **Frames**: 125
                    """)
                    gr.Image("https://via.placeholder.com/400x225/f59e0b/ffffff?text=Interior+Scene+Example", label="Example 4")
        
        # Tab 4: History
        with gr.TabItem("📜 History", id="history_tab"):
            gr.Markdown("### Generation History")
            
            history_display = gr.Markdown(get_generation_history())
            refresh_history_btn = gr.Button("🔄 Refresh History")
    
    # Footer
    gr.HTML("""
    <div style="text-align: center; margin-top: 40px; padding: 20px; border-top: 1px solid #e5e7eb;">
        <p style="color: #6b7280;">
            HY-WorldPlay: A Systematic Framework for Interactive World Modeling<br>
            <a href="https://arxiv.org/abs/2512.14614" target="_blank" style="color: #3b82f6;">Paper</a> | 
            <a href="https://github.com/Tencent/HY-WorldPlay" target="_blank" style="color: #3b82f6;">GitHub</a> | 
            <a href="https://huggingface.co/tencent/HY-WorldPlay" target="_blank" style="color: #3b82f6;">Model Card</a>
        </p>
    </div>
    """)
    
    # Event handlers
    def update_model_status(is_loaded, model_type):
        if is_loaded:
            return f'<div><span class="status-indicator status-ready"></span>{model_type.capitalize()} model loaded</div>', gr.Button(visible=True)
        else:
            return f'<div><span class="status-indicator status-loading"></span>Loading {model_type} model...</div>', gr.Button(visible=False)
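    # Load the model first, then (via .then) flip the status indicator and reveal the Generate button.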
    
    load_model_btn.click(
        fn=lambda x: load_model_wrapper(x),
        inputs=[model_type],
        outputs=[model_status]
    ).then(
        fn=lambda x: update_model_status(True, x),
        inputs=[model_type],
        outputs=[model_status, generate_btn]
    )
    
    def update_progress(progress_text, show_video=False):
        if "completed" in progress_text.lower():
            return f'<div class="generation-progress">✅ {progress_text}</div>', gr.Video(visible=True), gr.JSON(visible=True)
        else:
            return f'<div class="generation-progress">⏳ {progress_text}</div>', gr.Video(visible=False), gr.JSON(visible=False)
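    # The click below streams generator output: each yield from generate_video_wrapper refreshes all three outputs.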
    
    generate_btn.click(
        fn=generate_video_wrapper,
        inputs=[
            prompt_input,
            image_input,
            resolution,
            aspect_ratio,
            num_frames,
            seed,
            model_type,
            trajectory_type
        ],
        outputs=[progress_display, video_output, metadata_output]
    )
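    # Write the trajectory JSON to a temp file, then replace the shown path with a short confirmation message.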
    
    create_pose_btn.click(
        fn=create_pose_json,
        inputs=[trajectory_type],
        outputs=[pose_status]
    ).then(
        fn=lambda x: f"✅ Trajectory JSON created for {x} motion",
        inputs=[trajectory_type],
        outputs=[pose_status]
    )
    
    refresh_history_btn.click(
        fn=get_generation_history,
        outputs=[history_display]
    )

# Launch the app. Blocks.launch() accepts neither `theme` (set on gr.Blocks above)
# nor `footer_links` (the footer is already rendered with gr.HTML), so no extra
# arguments are needed here.
demo.launch()