import gradio as gr
import numpy as np
import json
import os
import tempfile
from typing import Optional, Tuple, Dict, Any
import time
from pathlib import Path

# Mock implementation for demonstration
# In a real deployment, this would integrate with the actual HY-WorldPlay model
class HYWorldPlayDemo:
    def __init__(self):
        self.model_loaded = False
        self.generation_history = []
        
    def load_model(self, model_type: str = "bidirectional"):
        """Mock model loading"""
        time.sleep(2)  # Simulate loading time
        self.model_loaded = True
        return f"โœ… {model_type.capitalize()} model loaded successfully!"
    
    def generate_video(self, 
                      prompt: str,
                      image_path: Optional[str] = None,
                      resolution: str = "480p",
                      aspect_ratio: str = "16:9",
                      num_frames: int = 125,
                      seed: int = 1,
                      model_type: str = "bidirectional",
                      action_type: str = "forward"):
        """
        Mock video generation (generator).
        Yields (progress_text, metadata) while processing, then (video_path, metadata) when finished.
        """
        if not self.model_loaded:
            raise gr.Error("Please load the model first!")
        
        # Simulate generation time
        progress_steps = [
            "Initializing generation pipeline...",
            "Processing prompt and image...",
            "Generating video frames...",
            "Applying temporal consistency...",
            "Rendering final video..."
        ]
        
        for step in progress_steps:
            yield step, {"status": "processing", "step": step}
            time.sleep(1)
        
        # Create a mock video file (in real implementation, this would be actual video generation)
        temp_dir = tempfile.mkdtemp()
        video_path = os.path.join(temp_dir, "generated_video.mp4")
        
        # Create a simple placeholder text file to simulate video
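        # NOTE: gr.Video cannot actually play this text placeholder; a real pipeline would write encoded video frames here.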
        with open(video_path, 'w') as f:
            f.write(f"Generated video for prompt: {prompt}\n")
            f.write(f"Resolution: {resolution}\n")
            f.write(f"Aspect Ratio: {aspect_ratio}\n")
            f.write(f"Frames: {num_frames}\n")
            f.write(f"Model: {model_type}\n")
            f.write(f"Action: {action_type}\n")
        
        # Save generation metadata
        metadata = {
            "prompt": prompt,
            "resolution": resolution,
            "aspect_ratio": aspect_ratio,
            "num_frames": num_frames,
            "seed": seed,
            "model_type": model_type,
            "action_type": action_type,
            "generation_time": time.strftime("%Y-%m-%d %H:%M:%S"),
            "status": "completed"
        }
        
        self.generation_history.append(metadata)
        
        yield video_path, metadata

# Initialize demo class
demo_instance = HYWorldPlayDemo()

def load_model_wrapper(model_type):
    """Wrapper for model loading with status updates"""
    return demo_instance.load_model(model_type)

def generate_video_wrapper(*args):
    """Wrapper for video generation: maps each yielded (value, metadata) pair onto the three UI outputs."""
    for value, meta in demo_instance.generate_video(*args):
        if meta.get("status") == "processing":
            yield f'<div class="generation-progress">⏳ {value}</div>', gr.Video(visible=False), gr.JSON(visible=False)
        else:
            yield f'<div class="generation-progress">✅ Generation completed</div>', gr.Video(value=value, visible=True), gr.JSON(value=meta, visible=True)

def create_pose_json(trajectory_type: str, num_keyframes: int = 32) -> str:
    """Create a mock pose JSON file for camera trajectory"""
    poses = []
    
    if trajectory_type == "forward":
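        # Forward: step the camera along +x in 0.1-unit increments with a fixed 60° FOV.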
        for i in range(num_keyframes):
            poses.append({
                "frame": i,
                "position": [i * 0.1, 0, 0],
                "rotation": [0, 0, 0],
                "fov": 60
            })
    elif trajectory_type == "circular":
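        # Circular: orbit the origin on a radius-2 circle in the x-z plane, yawing to track the angle.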
        for i in range(num_keyframes):
            angle = (i / num_keyframes) * 2 * np.pi
            poses.append({
                "frame": i,
                "position": [np.cos(angle) * 2, 0, np.sin(angle) * 2],
                "rotation": [0, np.degrees(angle), 0],
                "fov": 60
            })
    elif trajectory_type == "zoom":
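        # Zoom: keep the camera fixed and narrow the FOV from 60° toward 20° to simulate zooming in.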
        for i in range(num_keyframes):
            zoom = 1 + (i / num_keyframes) * 2
            poses.append({
                "frame": i,
                "position": [0, 0, 0],
                "rotation": [0, 0, 0],
                "fov": 60 / zoom
            })
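    # The "custom" option has no dedicated branch above, so it currently writes an empty pose list.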
    
    temp_dir = tempfile.mkdtemp()
    json_path = os.path.join(temp_dir, "pose_trajectory.json")
    
    with open(json_path, 'w') as f:
        json.dump({"poses": poses}, f, indent=2)
    
    return json_path
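
# For reference, the trajectory JSON written by create_pose_json looks like this
# (abbreviated "forward" example):
# {
#   "poses": [
#     {"frame": 0, "position": [0.0, 0, 0], "rotation": [0, 0, 0], "fov": 60},
#     {"frame": 1, "position": [0.1, 0, 0], "rotation": [0, 0, 0], "fov": 60},
#     ...
#   ]
# }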

def get_generation_history():
    """Return generation history as formatted text"""
    if not demo_instance.generation_history:
        return "No generations yet."
    
    history_text = ""
    for i, gen in enumerate(demo_instance.generation_history[-5:], 1):
        history_text += f"**Generation {i}**\n"
        history_text += f"- Prompt: {gen['prompt'][:50]}...\n"
        history_text += f"- Model: {gen['model_type']}\n"
        history_text += f"- Frames: {gen['num_frames']}\n"
        history_text += f"- Time: {gen['generation_time']}\n\n"
    
    return history_text

# Custom CSS for enhanced UI
custom_css = """
.main-container {
    max-width: 1400px;
    margin: 0 auto;
}

.model-card {
    border: 2px solid #e5e7eb;
    border-radius: 12px;
    padding: 20px;
    margin: 10px 0;
    transition: all 0.3s ease;
}

.model-card:hover {
    border-color: #3b82f6;
    box-shadow: 0 4px 12px rgba(59, 130, 246, 0.1);
}

.status-indicator {
    display: inline-block;
    width: 12px;
    height: 12px;
    border-radius: 50%;
    margin-right: 8px;
}

.status-ready { background-color: #10b981; }
.status-loading { background-color: #f59e0b; }
.status-error { background-color: #ef4444; }

.feature-highlight {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 20px;
    border-radius: 12px;
    margin: 20px 0;
}

.generation-progress {
    font-family: 'Courier New', monospace;
    background: #1f2937;
    color: #10b981;
    padding: 15px;
    border-radius: 8px;
    margin: 10px 0;
}
"""

# Build the UI. The theme must be passed to gr.Blocks; Blocks.launch() does not accept a `theme` argument.
with gr.Blocks(
    css=custom_css,
    title="HY-WorldPlay: Interactive World Modeling",
    theme=gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="indigo",
        neutral_hue="slate",
        font=gr.themes.GoogleFont("Inter"),
        text_size="lg",
        spacing_size="lg",
        radius_size="md"
    )
) as demo:
    # Header
    gr.HTML("""
    <div style="text-align: center; margin-bottom: 30px;">
        <h1 style="font-size: 2.5em; margin-bottom: 10px;">🎮 HY-WorldPlay</h1>
        <p style="font-size: 1.2em; color: #6b7280;">Real-Time Interactive World Modeling with Geometric Consistency</p>
        <p style="margin-top: 10px;">
            <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="color: #3b82f6; text-decoration: none;">
                Built with anycoder
            </a>
        </p>
    </div>
    """)
    
    # Feature highlights
    with gr.Row():
        with gr.Column():
            gr.HTML("""
            <div class="feature-highlight">
                <h3>🚀 Key Features</h3>
                <ul style="margin: 10px 0;">
                    <li>Real-time video generation at 24 FPS</li>
                    <li>Long-term geometric consistency</li>
                    <li>Dual Action Representation for control</li>
                    <li>Reconstituted Context Memory</li>
                    <li>WorldCompass RL post-training</li>
                    <li>Context Forcing distillation</li>
                </ul>
            </div>
            """)
    
    # Main interface tabs
    with gr.Tabs() as main_tabs:
        # Tab 1: Video Generation
        with gr.TabItem("🎬 Video Generation", id="gen_tab"):
            with gr.Row():
                with gr.Column(scale=2):
                    gr.Markdown("### Configuration")
                    
                    # Model selection
                    model_type = gr.Radio(
                        choices=["bidirectional", "autoregressive", "autoregressive_distilled"],
                        value="bidirectional",
                        label="Model Type",
                        info="Choose the model variant for generation"
                    )
                    
                    load_model_btn = gr.Button("🔄 Load Model", variant="primary", size="lg")
                    model_status = gr.HTML('<div><span class="status-indicator status-error"></span>Model not loaded</div>')
                    
                    # Input controls
                    with gr.Accordion("📝 Input Settings", open=True):
                        prompt_input = gr.Textbox(
                            label="Prompt",
                            placeholder="Describe the world you want to generate...",
                            lines=3,
                            value="A peaceful landscape with a stone bridge spanning a calm body of water, surrounded by lush green trees and a traditional pavilion."
                        )
                        
                        # gr.Image does not accept an `info` argument, so the upload hint is folded into the label.
                        image_input = gr.Image(
                            label="Input Image (Optional, guides the generation)",
                            type="filepath",
                            sources=["upload", "clipboard"]
                        )
                    
                    with gr.Accordion("⚙️ Generation Settings", open=False):
                        with gr.Row():
                            resolution = gr.Dropdown(
                                choices=["480p", "720p", "1080p"],
                                value="480p",
                                label="Resolution"
                            )
                            aspect_ratio = gr.Dropdown(
                                choices=["16:9", "9:16", "1:1", "4:3"],
                                value="16:9",
                                label="Aspect Ratio"
                            )
                        
                        with gr.Row():
                            num_frames = gr.Slider(
                                minimum=16,
                                maximum=250,
                                value=125,
                                step=1,
                                label="Number of Frames"
                            )
                            seed = gr.Number(
                                value=1,
                                label="Seed",
                                precision=0
                            )
                    
                    # Camera trajectory
                    with gr.Accordion("🎥 Camera Trajectory", open=False):
                        trajectory_type = gr.Radio(
                            choices=["forward", "circular", "zoom", "custom"],
                            value="forward",
                            label="Trajectory Type"
                        )
                        create_pose_btn = gr.Button("Generate Trajectory JSON")
                        pose_status = gr.Textbox(label="Trajectory Status", interactive=False)
                    
                    # Generation button
                    generate_btn = gr.Button("🚀 Generate Video", variant="primary", size="lg", visible=False)
                    
                with gr.Column(scale=1):
                    gr.Markdown("### Output & Progress")
                    
                    # Progress display
                    progress_display = gr.HTML('<div class="generation-progress">Ready to generate...</div>')
                    
                    # Output video
                    video_output = gr.Video(
                        label="Generated Video",
                        visible=False
                    )
                    
                    # Metadata
                    metadata_output = gr.JSON(
                        label="Generation Metadata",
                        visible=False
                    )
        
        # Tab 2: Model Comparison
        with gr.TabItem("📊 Model Comparison", id="compare_tab"):
            gr.Markdown("### Performance Comparison")
            
            # Performance metrics table
            gr.DataFrame(
                headers=["Model", "Real-time", "PSNR ↑", "SSIM ↑", "LPIPS ↓", "R_dist ↓", "T_dist ↓"],
                datatype=["str", "str", "number", "number", "number", "number", "number"],
                value=[
                    ["CameraCtrl", "❌", 17.93, 0.569, 0.298, 0.037, 0.341],
                    ["SEVA", "❌", 19.84, 0.598, 0.313, 0.047, 0.223],
                    ["ViewCrafter", "❌", 19.91, 0.617, 0.327, 0.029, 0.543],
                    ["Gen3C", "❌", 21.68, 0.635, 0.278, 0.024, 0.477],
                    ["VMem", "❌", 19.97, 0.587, 0.316, 0.048, 0.219],
                    ["Matrix-Game-2.0", "✅", 17.26, 0.505, 0.383, 0.287, 0.843],
                    ["GameCraft", "❌", 21.05, 0.639, 0.341, 0.151, 0.617],
                    ["Ours (w/o Context Forcing)", "❌", 21.27, 0.669, 0.261, 0.033, 0.157],
                    ["Ours (full)", "✅", 21.92, 0.702, 0.247, 0.031, 0.121]
                ],
                label="Quantitative Evaluation Results",
                interactive=False
            )
            
            with gr.Row():
                with gr.Column():
                    gr.Markdown("""
                    ### 🔬 Key Improvements
                    - **Real-time Performance**: Achieves 24 FPS streaming generation
                    - **Superior Consistency**: Best-in-class long-term geometric consistency
                    - **Memory Efficiency**: Reconstituted Context Memory prevents error drift
                    - **Action Control**: Precise keyboard and mouse input response
                    """)
                
                with gr.Column():
                    gr.Markdown("""
                    ### 📈 Technical Innovations
                    - **Dual Action Representation**: Robust action control
                    - **Context Forcing**: Memory-aware model distillation
                    - **WorldCompass**: RL-based post-training
                    - **Temporal Reframing**: Long-past frame accessibility
                    """)
        
        # Tab 3: Examples
        with gr.TabItem("🎨 Examples", id="examples_tab"):
            gr.Markdown("### Sample Generations")
            
            with gr.Row():
                with gr.Column():
                    gr.Markdown("""
                    #### Example 1: Bridge Scene
                    **Prompt**: A paved pathway leads towards a stone arch bridge spanning a calm body of water...
                    **Action**: Forward movement
                    **Frames**: 125
                    """)
                    gr.Image("https://via.placeholder.com/400x225/3b82f6/ffffff?text=Bridge+Scene+Example", label="Example 1")
                
                with gr.Column():
                    gr.Markdown("""
                    #### Example 2: Forest Path
                    **Prompt**: A winding path through an enchanted forest with ancient trees...
                    **Action**: Circular trajectory
                    **Frames**: 125
                    """)
                    gr.Image("https://via.placeholder.com/400x225/10b981/ffffff?text=Forest+Path+Example", label="Example 2")
            
            with gr.Row():
                with gr.Column():
                    gr.Markdown("""
                    #### Example 3: Urban Scene
                    **Prompt**: A futuristic cityscape with flying vehicles and neon lights...
                    **Action**: Zoom in
                    **Frames**: 125
                    """)
                    gr.Image("https://via.placeholder.com/400x225/8b5cf6/ffffff?text=Urban+Scene+Example", label="Example 3")
                
                with gr.Column():
                    gr.Markdown("""
                    #### Example 4: Interior Scene
                    **Prompt**: A cozy library with bookshelves and warm lighting...
                    **Action**: Custom trajectory
                    **Frames**: 125
                    """)
                    gr.Image("https://via.placeholder.com/400x225/f59e0b/ffffff?text=Interior+Scene+Example", label="Example 4")
        
        # Tab 4: History
        with gr.TabItem("📜 History", id="history_tab"):
            gr.Markdown("### Generation History")
            
            history_display = gr.Markdown(get_generation_history())
            refresh_history_btn = gr.Button("🔄 Refresh History")
    
    # Footer
    gr.HTML("""
    <div style="text-align: center; margin-top: 40px; padding: 20px; border-top: 1px solid #e5e7eb;">
        <p style="color: #6b7280;">
            HY-WorldPlay: A Systematic Framework for Interactive World Modeling<br>
            <a href="https://arxiv.org/abs/2512.14614" target="_blank" style="color: #3b82f6;">Paper</a> | 
            <a href="https://github.com/Tencent/HY-WorldPlay" target="_blank" style="color: #3b82f6;">GitHub</a> | 
            <a href="https://huggingface.co/tencent/HY-WorldPlay" target="_blank" style="color: #3b82f6;">Model Card</a>
        </p>
    </div>
    """)
    
    # Event handlers
    def update_model_status(is_loaded, model_type):
        if is_loaded:
            return f'<div><span class="status-indicator status-ready"></span>{model_type.capitalize()} model loaded</div>', gr.Button(visible=True)
        else:
            return f'<div><span class="status-indicator status-loading"></span>Loading {model_type} model...</div>', gr.Button(visible=False)
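    # Load the model first, then (via .then) flip the status indicator and reveal the Generate button.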
    
    load_model_btn.click(
        fn=lambda x: load_model_wrapper(x),
        inputs=[model_type],
        outputs=[model_status]
    ).then(
        fn=lambda x: update_model_status(True, x),
        inputs=[model_type],
        outputs=[model_status, generate_btn]
    )
    
    def update_progress(progress_text, show_video=False):
        if "completed" in progress_text.lower():
            return f'<div class="generation-progress">✅ {progress_text}</div>', gr.Video(visible=True), gr.JSON(visible=True)
        else:
            return f'<div class="generation-progress">⏳ {progress_text}</div>', gr.Video(visible=False), gr.JSON(visible=False)
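    # The click below streams generator output: each yield from generate_video_wrapper refreshes all three outputs.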
    
    generate_btn.click(
        fn=generate_video_wrapper,
        inputs=[
            prompt_input,
            image_input,
            resolution,
            aspect_ratio,
            num_frames,
            seed,
            model_type,
            trajectory_type
        ],
        outputs=[progress_display, video_output, metadata_output]
    )
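    # Write the trajectory JSON to a temp file, then replace the shown path with a short confirmation message.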
    
    create_pose_btn.click(
        fn=create_pose_json,
        inputs=[trajectory_type],
        outputs=[pose_status]
    ).then(
        fn=lambda x: f"✅ Trajectory JSON created for {x} motion",
        inputs=[trajectory_type],
        outputs=[pose_status]
    )
    
    refresh_history_btn.click(
        fn=get_generation_history,
        outputs=[history_display]
    )

# Launch the app. Blocks.launch() accepts neither `theme` (set on gr.Blocks above)
# nor `footer_links` (the footer is already rendered with gr.HTML), so no extra
# arguments are needed here.
demo.launch()