Spaces:
Running
Running
| import { | |
| AutoProcessor, | |
| AutoModelForImageTextToText, | |
| load_image, | |
| TextStreamer, | |
| } from "https://cdn.jsdelivr.net/npm/@huggingface/[email protected]"; | |
/** Largest frame size handed to the model; bigger frames are scaled down to fit. */
const MAX_FRAME_WIDTH = 640;
const MAX_FRAME_HEIGHT = 360;
/** Delay between caption attempts while the video is playing. */
const CAPTION_INTERVAL_MS = 500;
/** Number of caption entries kept in the on-page list. */
const MAX_VISIBLE_CAPTIONS = 20;

/**
 * Browser UI controller that captions frames of a user-supplied video with a
 * vision-language model while the video plays.
 *
 * The constructor looks up the page elements by id, wires the event
 * listeners and probes WebGPU availability.
 */
class VideoCaptionApp {
  constructor() {
    this.videoFile = null;
    // Object URL backing the <video> element; tracked so it can be revoked.
    this.videoURL = null;
    this.model = null;
    this.processor = null;
    // Pending model load (including warm-up), shared by concurrent callers.
    this.modelLoadPromise = null;
    // True while one caption is being generated; also blocks upload gestures.
    this.isProcessing = false;
    this.captionInterval = null;
    this.isLiveCaptioning = false;
    this.captionHistory = [];
    this.initializeElements();
    this.attachEventListeners();
    this.checkWebGPUSupport();
  }

  /** Caches references to the page elements the app reads and updates. */
  initializeElements() {
    this.elements = {
      videoPlayer: document.getElementById('videoPlayer'),
      videoInput: document.getElementById('videoInput'),
      uploadArea: document.getElementById('uploadArea'),
      processBtn: document.getElementById('processBtn'),
      deviceSelect: document.getElementById('deviceSelect'),
      results: document.getElementById('results'),
      frameCaptions: document.getElementById('frameCaptions'),
      controls: document.getElementById('controls'),
      controlsContent: document.getElementById('controlsContent'),
      controlsToggle: document.getElementById('controlsToggle'),
      copyBtn: document.getElementById('copyBtn'),
      downloadBtn: document.getElementById('downloadBtn'),
      changeVideoBtn: document.getElementById('changeVideoBtn'),
      liveCaption: document.getElementById('liveCaption'),
      captionText: document.getElementById('captionText'),
    };
  }

  /** Wires upload, playback and toolbar events to the corresponding methods. */
  attachEventListeners() {
    const {
      uploadArea,
      videoInput,
      processBtn,
      videoPlayer,
      copyBtn,
      downloadBtn,
      changeVideoBtn,
      controlsToggle,
    } = this.elements;

    uploadArea.addEventListener('click', () => {
      if (!this.isProcessing) {
        videoInput.click();
      }
    });
    uploadArea.addEventListener('dragover', (e) => {
      e.preventDefault();
      if (!this.isProcessing) {
        uploadArea.classList.add('drag-over');
      }
    });
    uploadArea.addEventListener('dragleave', () => {
      uploadArea.classList.remove('drag-over');
    });
    uploadArea.addEventListener('drop', (e) => {
      e.preventDefault();
      uploadArea.classList.remove('drag-over');
      if (!this.isProcessing && e.dataTransfer.files.length > 0) {
        const file = e.dataTransfer.files[0];
        if (file.type.startsWith('video/')) {
          this.handleVideoUpload(file);
        }
      }
    });
    videoInput.addEventListener('change', (e) => {
      if (e.target.files.length > 0) {
        const file = e.target.files[0];
        // Reset the input so choosing the same file again still fires 'change'.
        e.target.value = '';
        this.handleVideoUpload(file);
      }
    });

    processBtn.addEventListener('click', () => {
      if (!this.videoFile) return;
      if (this.isLiveCaptioning) {
        this.stopLiveCaptions();
      } else {
        this.startLiveCaptions();
      }
    });

    videoPlayer.addEventListener('ended', () => {
      if (this.isLiveCaptioning) {
        this.stopLiveCaptions();
      }
    });
    // Pausing suspends the caption timer; resuming playback restarts it.
    videoPlayer.addEventListener('pause', () => {
      if (this.isLiveCaptioning) {
        this.clearCaptionInterval();
      }
    });
    videoPlayer.addEventListener('play', () => {
      if (this.isLiveCaptioning && !this.captionInterval) {
        this.startCaptionInterval();
      }
    });

    copyBtn.addEventListener('click', () => {
      this.copyResults();
    });
    downloadBtn.addEventListener('click', () => {
      this.downloadResults();
    });
    changeVideoBtn.addEventListener('click', () => {
      this.changeVideo();
    });
    controlsToggle.addEventListener('click', () => {
      this.toggleControls();
    });
  }

  /**
   * Selects the default inference device.
   *
   * `navigator.gpu` existing does not guarantee a usable adapter, so an
   * adapter is requested before WebGPU is preselected. Without one the
   * WebGPU option (if present) is disabled and 'cpu' is selected.
   *
   * @returns {Promise<void>}
   */
  async checkWebGPUSupport() {
    const { deviceSelect } = this.elements;
    let hasAdapter = false;
    if (navigator.gpu) {
      try {
        hasAdapter = (await navigator.gpu.requestAdapter()) !== null;
      } catch (error) {
        console.warn('WebGPU adapter request failed:', error);
      }
    }
    if (hasAdapter) {
      // Default to WebGPU if available
      deviceSelect.value = 'webgpu';
      return;
    }
    const webgpuOption = deviceSelect.querySelector('option[value="webgpu"]');
    if (webgpuOption) {
      webgpuOption.disabled = true;
    }
    deviceSelect.value = 'cpu';
  }

  /**
   * Loads a video file into the player and resets the caption UI.
   *
   * @param {File} file - Video file chosen or dropped by the user.
   */
  handleVideoUpload(file) {
    const previousURL = this.videoURL;
    this.videoFile = file;
    this.videoURL = URL.createObjectURL(file);
    this.elements.videoPlayer.src = this.videoURL;
    // Revoke only after the player points at the new URL.
    if (previousURL) {
      URL.revokeObjectURL(previousURL);
    }
    this.elements.uploadArea.style.display = 'none';
    this.elements.controls.style.display = 'block';
    this.elements.results.style.display = 'none';
    this.elements.liveCaption.style.display = 'none';
    this.captionHistory = [];
    this.elements.frameCaptions.textContent = '';
  }

  /**
   * Computes the capture size for a video: the native size scaled down (never
   * up) to fit within MAX_FRAME_WIDTH x MAX_FRAME_HEIGHT, keeping the aspect
   * ratio.
   *
   * @param {number} videoWidth - Intrinsic video width in pixels.
   * @param {number} videoHeight - Intrinsic video height in pixels.
   * @returns {{width: number, height: number} | null} Target size, or null
   *   when the dimensions are not positive (e.g. metadata not loaded yet).
   */
  getFrameSize(videoWidth, videoHeight) {
    if (!(videoWidth > 0) || !(videoHeight > 0)) {
      return null;
    }
    const scale = Math.min(
      1,
      MAX_FRAME_WIDTH / videoWidth,
      MAX_FRAME_HEIGHT / videoHeight,
    );
    return {
      width: Math.max(1, Math.round(videoWidth * scale)),
      height: Math.max(1, Math.round(videoHeight * scale)),
    };
  }

  /**
   * Draws the current video frame onto a canvas and encodes it as JPEG.
   *
   * @returns {Promise<{blob: Blob, timestamp: number}>} Encoded frame and the
   *   playback position (seconds) at which it was drawn.
   * @throws {Error} If the video has no dimensions yet or encoding fails.
   */
  async captureCurrentFrame() {
    const video = this.elements.videoPlayer;
    const size = this.getFrameSize(video.videoWidth, video.videoHeight);
    if (!size) {
      throw new Error('Video frame is not available yet');
    }
    const canvas = document.createElement('canvas');
    canvas.width = size.width;
    canvas.height = size.height;
    const ctx = canvas.getContext('2d');
    // Read the position at draw time so it matches the captured frame.
    const timestamp = video.currentTime;
    ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
    const blob = await new Promise((resolve) => {
      canvas.toBlob(resolve, 'image/jpeg', 0.5);
    });
    if (!blob) {
      throw new Error('Failed to encode video frame');
    }
    return { blob, timestamp };
  }

  /**
   * Ensures the processor and model are loaded and warmed up.
   *
   * Concurrent calls share a single in-flight load instead of starting a
   * second one.
   *
   * @returns {Promise<void>}
   * @throws Rethrows any error raised while loading the processor or model.
   */
  async initializeModel() {
    if (this.model) return;
    if (!this.modelLoadPromise) {
      this.modelLoadPromise = this.loadModel().finally(() => {
        this.modelLoadPromise = null;
      });
    }
    await this.modelLoadPromise;
  }

  /** Loads the processor and model for the selected device, then warms up. */
  async loadModel() {
    const device = this.elements.deviceSelect.value;
    const model_id = "onnx-community/FastVLM-0.5B-ONNX";
    this.updateStatus('Loading AI model...');
    try {
      this.processor = await AutoProcessor.from_pretrained(model_id);
      this.updateStatus('Initializing model...');
      const modelOptions = {
        dtype: {
          embed_tokens: "fp16",
          vision_encoder: "q4",
          decoder_model_merged: "q4",
        },
      };
      // Only WebGPU is requested explicitly; otherwise the library default is used.
      if (device === 'webgpu') {
        modelOptions.device = 'webgpu';
      }
      this.model = await AutoModelForImageTextToText.from_pretrained(model_id, modelOptions);
      // Pre-warm the model with a dummy run
      this.updateStatus('Warming up model...');
      await this.warmupModel();
      this.updateStatus('Model ready');
    } catch (error) {
      console.error('Model initialization error:', error);
      throw error;
    }
  }

  /**
   * Shows a dimmed status message in the live caption area.
   *
   * @param {string} status - Message to display.
   */
  updateStatus(status) {
    this.elements.captionText.textContent = status;
    this.elements.captionText.style.opacity = '0.6';
  }

  /**
   * Runs one short generation on a black 224x224 image so the first real
   * caption does not pay the start-up cost. Best effort: failures are logged
   * and not rethrown.
   *
   * @returns {Promise<void>}
   */
  async warmupModel() {
    let frameUrl = null;
    try {
      // Create a small dummy image
      const canvas = document.createElement('canvas');
      canvas.width = 224;
      canvas.height = 224;
      const ctx = canvas.getContext('2d');
      ctx.fillStyle = 'black';
      ctx.fillRect(0, 0, 224, 224);
      const blob = await new Promise((resolve) => {
        canvas.toBlob(resolve, 'image/jpeg', 0.5);
      });
      if (!blob) {
        throw new Error('Failed to encode warmup image');
      }
      frameUrl = URL.createObjectURL(blob);
      const image = await load_image(frameUrl);
      const messages = [
        {
          role: "user",
          content: `<image>Describe what you see in one sentence.`,
        },
      ];
      const prompt = this.processor.apply_chat_template(messages, {
        add_generation_prompt: true,
      });
      const inputs = await this.processor(image, prompt, {
        add_special_tokens: false,
      });
      // Run a quick generation to warm up the model
      await this.model.generate({
        ...inputs,
        max_new_tokens: 5,
        do_sample: false,
      });
    } catch (error) {
      console.error('Warmup error:', error);
    } finally {
      if (frameUrl) {
        URL.revokeObjectURL(frameUrl);
      }
    }
  }

  /**
   * Switches the UI into live-caption mode, loads the model if needed, then
   * starts playback and the caption timer.
   *
   * @returns {Promise<void>}
   */
  async startLiveCaptions() {
    this.isLiveCaptioning = true;
    this.elements.processBtn.classList.add('loading');
    this.elements.processBtn.querySelector('.btn-text').textContent = 'Stop Captions';
    this.elements.controls.classList.add('collapsed');
    this.elements.liveCaption.style.display = 'block';
    this.elements.results.style.display = 'block';
    try {
      await this.initializeModel();
      // The user may have pressed Stop while the model was loading.
      if (!this.isLiveCaptioning) return;
      // Start playing the video. A rejected play() (e.g. blocked autoplay) is
      // only logged: the user can still press play on the player itself.
      Promise.resolve(this.elements.videoPlayer.play()).catch((error) => {
        console.warn('Video playback did not start:', error);
      });
      // Start the caption interval
      this.startCaptionInterval();
    } catch (error) {
      console.error('Error starting live captions:', error);
      this.stopLiveCaptions();
      alert('Failed to start live captions. Please try again.');
    }
  }

  /**
   * Generates a caption immediately and then every CAPTION_INTERVAL_MS while
   * the video is playing. Any previously running timer is replaced.
   */
  startCaptionInterval() {
    // Never overwrite a live timer id, or that timer could not be cleared.
    this.clearCaptionInterval();
    // Generate initial caption
    this.generateLiveCaption();
    // Set up interval for continuous captions
    this.captionInterval = setInterval(() => {
      const { videoPlayer } = this.elements;
      if (!videoPlayer.paused && !videoPlayer.ended) {
        this.generateLiveCaption();
      }
    }, CAPTION_INTERVAL_MS);
  }

  /** Stops the caption timer if one is running. */
  clearCaptionInterval() {
    if (this.captionInterval) {
      clearInterval(this.captionInterval);
      this.captionInterval = null;
    }
  }

  /**
   * Captions the current frame, streaming text into the live caption element
   * and appending the finished caption to the history.
   *
   * Does nothing when a caption is already being generated or the model is
   * not ready. Errors are logged, not thrown.
   *
   * @returns {Promise<void>}
   */
  async generateLiveCaption() {
    if (this.isProcessing || this.modelLoadPromise || !this.model || !this.processor) {
      return;
    }
    this.isProcessing = true;
    // Remember which video this caption belongs to; it may change mid-run.
    const sourceVideo = this.videoFile;
    const captionElement = this.elements.captionText;
    let frameUrl = null;
    try {
      const frame = await this.captureCurrentFrame();
      frameUrl = URL.createObjectURL(frame.blob);
      const image = await load_image(frameUrl);
      const messages = [
        {
          role: "user",
          content: `<image>What's happening?`,
        },
      ];
      const prompt = this.processor.apply_chat_template(messages, {
        add_generation_prompt: true,
      });
      const inputs = await this.processor(image, prompt, {
        add_special_tokens: false,
      });
      let captionText = '';
      // Clear previous caption and show streaming indicator
      captionElement.style.opacity = '1';
      captionElement.textContent = '';
      const streamer = new TextStreamer(this.processor.tokenizer, {
        skip_prompt: true,
        skip_special_tokens: true,
        callback_function: (text) => {
          captionText += text;
          // Stream the text to the live caption
          if (this.videoFile === sourceVideo) {
            captionElement.textContent = captionText;
          }
        },
      });
      // Greedy decoding (do_sample: false), so no sampling temperature is set.
      await this.model.generate({
        ...inputs,
        max_new_tokens: 15, // Keep at 15 for fast generation
        do_sample: false,
        streamer,
        repetition_penalty: 1.0,
      });
      // Drop captions that finished after the video was changed or removed.
      if (this.videoFile !== sourceVideo) return;
      const caption = captionText.trim();
      if (!caption) return;
      // Add to history
      const captionData = {
        timestamp: frame.timestamp,
        caption,
      };
      this.captionHistory.push(captionData);
      this.displayFrameCaption(captionData);
    } catch (error) {
      console.error('Error generating caption:', error);
    } finally {
      if (frameUrl) {
        URL.revokeObjectURL(frameUrl);
      }
      this.isProcessing = false;
    }
  }

  /** Leaves live-caption mode: stops the timer, pauses playback, restores the UI. */
  stopLiveCaptions() {
    this.isLiveCaptioning = false;
    this.elements.processBtn.classList.remove('loading');
    this.elements.processBtn.querySelector('.btn-text').textContent = 'Start Live Captions';
    this.clearCaptionInterval();
    this.elements.videoPlayer.pause();
    this.elements.liveCaption.style.display = 'none';
    this.elements.controls.classList.remove('collapsed');
  }

  /** Collapses or expands the controls panel. */
  toggleControls() {
    this.elements.controls.classList.toggle('collapsed');
  }

  /**
   * Prepends a caption entry to the on-page list and trims the list to
   * MAX_VISIBLE_CAPTIONS entries.
   *
   * The caption is model output, so it is inserted as text, never as HTML.
   *
   * @param {{timestamp: number, caption: string}} captionData - Entry to show.
   */
  displayFrameCaption(captionData) {
    const list = this.elements.frameCaptions;

    const timeLabel = document.createElement('span');
    timeLabel.className = 'frame-time';
    timeLabel.textContent = this.formatTime(captionData.timestamp);

    const header = document.createElement('div');
    header.className = 'frame-header';
    header.appendChild(timeLabel);

    const text = document.createElement('p');
    text.className = 'frame-text';
    text.textContent = captionData.caption;

    const item = document.createElement('div');
    item.className = 'frame-caption-item';
    item.appendChild(header);
    item.appendChild(text);

    list.insertBefore(item, list.firstChild);
    // Keep only the most recent captions in view for rapid updates
    while (list.children.length > MAX_VISIBLE_CAPTIONS) {
      list.removeChild(list.lastElementChild);
    }
  }

  /**
   * Formats a playback position as `m:ss`.
   *
   * @param {number} seconds - Position in seconds.
   * @returns {string} Formatted time; non-finite or negative input yields `0:00`.
   */
  formatTime(seconds) {
    const total = Number.isFinite(seconds) && seconds > 0 ? Math.floor(seconds) : 0;
    const mins = Math.floor(total / 60);
    const secs = total % 60;
    return `${mins}:${String(secs).padStart(2, '0')}`;
  }

  /**
   * Renders the caption history as one `[m:ss] caption` line per entry.
   *
   * @returns {string}
   */
  formatCaptionHistory() {
    return this.captionHistory
      .map((c) => `[${this.formatTime(c.timestamp)}] ${c.caption}`)
      .join('\n');
  }

  /**
   * Copies the caption history to the clipboard and briefly marks the button.
   * Clipboard failures are logged.
   *
   * @returns {Promise<void>}
   */
  async copyResults() {
    const captions = this.formatCaptionHistory();
    try {
      await navigator.clipboard.writeText(captions);
      this.elements.copyBtn.classList.add('copied');
      setTimeout(() => {
        this.elements.copyBtn.classList.remove('copied');
      }, 2000);
    } catch (err) {
      console.error('Failed to copy:', err);
    }
  }

  /** Downloads the caption history as a timestamped `.txt` file. */
  downloadResults() {
    const captions = this.formatCaptionHistory();
    const blob = new Blob([captions], { type: 'text/plain' });
    const url = URL.createObjectURL(blob);
    const a = document.createElement('a');
    a.href = url;
    a.download = `captions_${new Date().toISOString().slice(0, 19).replace(/:/g, '-')}.txt`;
    document.body.appendChild(a);
    a.click();
    document.body.removeChild(a);
    URL.revokeObjectURL(url);
    this.elements.downloadBtn.classList.add('copied');
    setTimeout(() => {
      this.elements.downloadBtn.classList.remove('copied');
    }, 2000);
  }

  /**
   * Unloads the current video, resets the UI and caption history, and opens
   * the file picker.
   */
  changeVideo() {
    // Stop any ongoing captioning
    if (this.isLiveCaptioning) {
      this.stopLiveCaptions();
    }
    // Unload the media. Removing `src` avoids the load attempt and error that
    // assigning an empty string would trigger.
    const { videoPlayer } = this.elements;
    videoPlayer.pause();
    videoPlayer.removeAttribute('src');
    videoPlayer.load();
    if (this.videoURL) {
      URL.revokeObjectURL(this.videoURL);
      this.videoURL = null;
    }
    this.videoFile = null;
    // Show upload area, hide controls
    this.elements.uploadArea.style.display = 'block';
    this.elements.controls.style.display = 'none';
    this.elements.results.style.display = 'none';
    this.elements.liveCaption.style.display = 'none';
    // Clear caption history
    this.captionHistory = [];
    this.elements.frameCaptions.textContent = '';
    // Trigger file input
    this.elements.videoInput.click();
  }
}
/** Creates the app, which looks up its page elements in its constructor. */
function startVideoCaptionApp() {
  new VideoCaptionApp();
}

// If this module is evaluated after the document has been parsed (async or
// dynamically imported script), DOMContentLoaded has already fired and a
// listener would never run, so start immediately in that case.
if (document.readyState === 'loading') {
  document.addEventListener('DOMContentLoaded', startVideoCaptionApp, { once: true });
} else {
  startVideoCaptionApp();
}