import os
import gc
import time
import platform
import ctypes
from ctypes import wintypes
import torch
import torch.nn.functional as F
import comfy.model_management as model_management
import comfy.sample as _sample
import comfy.utils as _utils

# psutil is optional; RAM statistics degrade gracefully without it.
try:
    import psutil
except Exception:
    psutil = None

def _get_ram_mb() -> float:
    """Return this process's resident (or private, when available) memory in MB."""
    try:
        if psutil is not None:
            p = psutil.Process(os.getpid())
            rss = float(p.memory_info().rss)
            try:
                # Prefer the private working set when the platform exposes it;
                # it tracks per-process usage more closely than RSS.
                private = getattr(p.memory_full_info(), "private", None)
                if isinstance(private, (int, float)) and private > 0:
                    rss = float(private)
            except Exception:
                pass
            return rss / (1024.0 * 1024.0)
    except Exception:
        pass
    return 0.0

def _get_vram_mb_per_device() -> list[tuple[int, float, float]]:
    """Return (device_index, reserved_mb, allocated_mb) for every visible CUDA device."""
    out = []
    try:
        if torch.cuda.is_available():
            for d in range(torch.cuda.device_count()):
                try:
                    reserved = float(torch.cuda.memory_reserved(d)) / (1024.0 * 1024.0)
                    allocated = float(torch.cuda.memory_allocated(d)) / (1024.0 * 1024.0)
                except Exception:
                    reserved = 0.0
                    allocated = 0.0
                out.append((d, reserved, allocated))
    except Exception:
        pass
    return out

def _trim_working_set_windows():
    """Ask Windows to trim this process's working set (no-op elsewhere)."""
    try:
        if platform.system().lower().startswith("win"):
            kernel32 = ctypes.windll.kernel32
            proc = kernel32.GetCurrentProcess()
            # Passing (SIZE_T)-1 for both the minimum and maximum sizes asks
            # the OS to remove as many pages as possible from the working set.
            kernel32.SetProcessWorkingSetSize(proc, ctypes.c_size_t(-1), ctypes.c_size_t(-1))
    except Exception:
        pass

def _enable_win_privileges(names):
    """Best-effort enable a set of Windows privileges for the current process."""
    try:
        if not platform.system().lower().startswith('win'):
            return False
        advapi32 = ctypes.windll.advapi32
        kernel32 = ctypes.windll.kernel32
        token = wintypes.HANDLE()
        TOKEN_ADJUST_PRIVILEGES = 0x20
        TOKEN_QUERY = 0x8
        if not advapi32.OpenProcessToken(kernel32.GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, ctypes.byref(token)):
            return False

        class LUID(ctypes.Structure):
            _fields_ = [("LowPart", wintypes.DWORD), ("HighPart", wintypes.LONG)]

        class LUID_AND_ATTRIBUTES(ctypes.Structure):
            _fields_ = [("Luid", LUID), ("Attributes", wintypes.DWORD)]

        class TOKEN_PRIVILEGES(ctypes.Structure):
            _fields_ = [("PrivilegeCount", wintypes.DWORD), ("Privileges", LUID_AND_ATTRIBUTES * 1)]

        SE_PRIVILEGE_ENABLED = 0x2
        success = False
        try:
            for name in names:
                luid = LUID()
                if not advapi32.LookupPrivilegeValueW(None, ctypes.c_wchar_p(name), ctypes.byref(luid)):
                    continue
                tp = TOKEN_PRIVILEGES()
                tp.PrivilegeCount = 1
                tp.Privileges[0].Luid = luid
                tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED
                # AdjustTokenPrivileges can return nonzero even when the
                # privilege was not actually assigned, so this stays best-effort.
                if advapi32.AdjustTokenPrivileges(token, False, ctypes.byref(tp), 0, None, None):
                    success = True
        finally:
            # Close the token handle so it does not leak on repeated calls.
            kernel32.CloseHandle(token)
        return success
    except Exception:
        return False

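# A minimal usage sketch (illustration only; nothing new is called here):
# _system_cache_trim_windows() below enables these names before touching the
# system caches, and the same call can be used standalone to probe whether
# the process is sufficiently elevated.
#
#   if _enable_win_privileges(['SeProfileSingleProcessPrivilege']):
#       _system_cache_trim_windows()
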
def _system_cache_trim_windows():
    """Attempt to purge standby/file caches on Windows (requires privileges)."""
    try:
        if not platform.system().lower().startswith('win'):
            return False
        # These calls only succeed when the process holds the matching
        # privileges (typically when running elevated); enable them first.
        _enable_win_privileges([
            'SeIncreaseQuotaPrivilege',
            'SeProfileSingleProcessPrivilege',
            'SeDebugPrivilege',
        ])
        try:
            # Flush the system file cache: (SIZE_T)-1 for both limits resets it.
            kernel32 = ctypes.windll.kernel32
            SIZE_T = ctypes.c_size_t
            kernel32.SetSystemFileCacheSize(SIZE_T(-1), SIZE_T(-1), wintypes.DWORD(0))
        except Exception:
            pass
        try:
            # Undocumented but widely used: purge the standby page list via
            # NtSetSystemInformation(SystemMemoryListInformation).
            ntdll = ctypes.windll.ntdll
            SystemMemoryListInformation = 0x50
            MemoryPurgeStandbyList = ctypes.c_ulong(4)
            ntdll.NtSetSystemInformation(wintypes.ULONG(SystemMemoryListInformation), ctypes.byref(MemoryPurgeStandbyList), ctypes.sizeof(MemoryPurgeStandbyList))
        except Exception:
            pass
        return True
    except Exception:
        return False

def cleanup_memory(sync_cuda: bool = True, hard_trim: bool = True) -> dict:
    """Run a best-effort cleanup of RAM/VRAM. Returns a stats dict with before/after deltas."""
    stats: dict = {"ram_before_mb": 0.0, "ram_after_mb": 0.0, "ram_freed_mb": 0.0, "gpu": []}
    stats["ram_before_mb"] = _get_ram_mb()
    gpu_before = _get_vram_mb_per_device()
    try:
        # Flush pending kernels so the allocator counters are accurate.
        if sync_cuda and torch.cuda.is_available():
            torch.cuda.synchronize()
    except Exception:
        pass
    try:
        if hasattr(model_management, 'soft_empty_cache'):
            model_management.soft_empty_cache()
    except Exception:
        pass
    try:
        gc.collect()
    except Exception:
        pass
    try:
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
    except Exception:
        pass
    # Yield the GIL briefly so background threads can drop references.
    try:
        time.sleep(0)
    except Exception:
        pass
    if hard_trim:
        try:
            if hasattr(model_management, 'unload_all_models'):
                model_management.unload_all_models()
        except Exception:
            pass
        try:
            for _ in range(2):
                time.sleep(0)
                gc.collect()
        except Exception:
            pass
        try:
            if hasattr(_utils, 'cleanup_lru_caches'):
                _utils.cleanup_lru_caches()
        except Exception:
            pass
        try:
            if platform.system().lower().startswith('win'):
                _trim_working_set_windows()
                psapi = ctypes.windll.psapi
                kernel32 = ctypes.windll.kernel32
                psapi.EmptyWorkingSet(kernel32.GetCurrentProcess())
        except Exception:
            pass
        try:
            # glibc keeps freed arenas around; malloc_trim returns them to the OS.
            if platform.system().lower().startswith('linux'):
                libc = ctypes.CDLL('libc.so.6')
                libc.malloc_trim(0)
        except Exception:
            pass
        try:
            _system_cache_trim_windows()
        except Exception:
            pass
    stats["ram_after_mb"] = _get_ram_mb()
    stats["ram_freed_mb"] = max(0.0, stats["ram_before_mb"] - stats["ram_after_mb"])
    gpu_after = _get_vram_mb_per_device()
    device_map = {d: (r, a) for d, r, a in gpu_before}
    gpu_stats = []
    for d, r_after, a_after in gpu_after:
        r_before, a_before = device_map.get(d, (0.0, 0.0))
        gpu_stats.append({
            "device": d,
            "reserved_before_mb": r_before,
            "reserved_after_mb": r_after,
            "reserved_freed_mb": max(0.0, r_before - r_after),
            "allocated_before_mb": a_before,
            "allocated_after_mb": a_after,
            "allocated_freed_mb": max(0.0, a_before - a_after),
        })
    stats["gpu"] = gpu_stats
    return stats

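# A minimal usage sketch for cleanup_memory(), assuming it is called between
# heavy pipeline stages (the keys below mirror the stats dict it returns):
#
#   stats = cleanup_memory(sync_cuda=True, hard_trim=True)
#   print(f"RAM freed: {stats['ram_freed_mb']:.1f} MB")
#   for g in stats["gpu"]:
#       print(f"GPU{g['device']}: reserved freed {g['reserved_freed_mb']:.1f} MB")
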
class MG_CleanUp:
    @classmethod
    def INPUT_TYPES(cls):
        return {
            "required": {
                "samples": ("LATENT", {}),
            },
            "optional": {
                "hard_trim": ("BOOLEAN", {"default": True, "tooltip": "Aggressively free RAM/VRAM and ask the OS to return pages to the system."}),
                "sync_cuda": ("BOOLEAN", {"default": True, "tooltip": "Synchronize CUDA before cleanup to flush pending kernels."}),
                "hires_only_threshold": ("INT", {"default": 0, "min": 0, "max": 16384, "step": 64, "tooltip": "Apply only when the latent's longest side >= threshold (0 == always)."}),
                # apply() already accepts these; declaring them here lets the
                # preview branch actually receive them from the graph.
                "model": ("MODEL", {"tooltip": "Optional; together with positive/negative/vae, renders a tiny preview."}),
                "positive": ("CONDITIONING", {}),
                "negative": ("CONDITIONING", {}),
                "vae": ("VAE", {}),
            }
        }

    RETURN_TYPES = ("LATENT", "IMAGE")
    RETURN_NAMES = ("samples", "Preview")
    FUNCTION = "apply"
    CATEGORY = "MagicNodes"

    def apply(self, samples, hard_trim=True, sync_cuda=True, hires_only_threshold=0,
              model=None, positive=None, negative=None, vae=None):
        img_prev = None
        # When a model, conditioning, and VAE are wired in, render a tiny preview
        # from a 32x32 downsampled latent with a single low-denoise ddim step;
        # any failure falls through to the blank placeholder below.
        try:
            if (model is not None) and (positive is not None) and (negative is not None) and (vae is not None):
                lat = samples.get("samples", None)
                if lat is not None and isinstance(lat, torch.Tensor) and lat.ndim == 4:
                    z = lat
                    _, _, H, W = z.shape
                    target = 32
                    z_ds = z if (H == target and W == target) else F.interpolate(z, size=(target, target), mode='bilinear', align_corners=False)
                    lat_img = _sample.fix_empty_latent_channels(model, z_ds) if hasattr(_sample, 'fix_empty_latent_channels') else z_ds
                    batch_inds = samples.get("batch_index", None)
                    noise = _sample.prepare_noise(lat_img, 0, batch_inds)
                    out = _sample.sample(
                        model, noise, 1, 1.0, "ddim", "normal",
                        positive, negative, lat_img,
                        denoise=0.10, disable_noise=False, start_step=None, last_step=None,
                        force_full_denoise=False, noise_mask=None, callback=None,
                        disable_pbar=not _utils.PROGRESS_BAR_ENABLED, seed=0
                    )
                    try:
                        img_prev = vae.decode(out)
                        # Video VAEs return (B, T, H, W, C); fold time into the batch axis.
                        if len(img_prev.shape) == 5:
                            img_prev = img_prev.reshape(-1, img_prev.shape[-3], img_prev.shape[-2], img_prev.shape[-1])
                    except Exception:
                        img_prev = None
        except Exception:
            img_prev = None

        try:
            do_cleanup = True
            try:
                # Optionally skip cleanup for small latents so low-res passes stay fast.
                if int(hires_only_threshold) > 0:
                    z = samples.get("samples", None)
                    if z is not None and hasattr(z, "shape") and len(z.shape) >= 4:
                        _, _, H, W = z.shape
                        if max(int(H), int(W)) < int(hires_only_threshold):
                            do_cleanup = False
            except Exception:
                pass
            if do_cleanup:
                print("=== Cleaning up RAM and GPU ===")
                stats = cleanup_memory(sync_cuda=bool(sync_cuda), hard_trim=bool(hard_trim))
                try:
                    print(f"RAM freed: {stats['ram_freed_mb']:.1f} MB (before {stats['ram_before_mb']:.1f} -> after {stats['ram_after_mb']:.1f})")
                except Exception:
                    pass
                try:
                    for g in stats.get("gpu", []):
                        print(
                            f"GPU{g['device']}: reserved freed {g['reserved_freed_mb']:.1f} MB ( {g['reserved_before_mb']:.1f} -> {g['reserved_after_mb']:.1f} ), "
                            f"allocated freed {g['allocated_freed_mb']:.1f} MB ( {g['allocated_before_mb']:.1f} -> {g['allocated_after_mb']:.1f} )"
                        )
                except Exception:
                    pass

                # A short pause lets deferred destructors run; a second pass
                # then often reclaims a little more.
                try:
                    time.sleep(0.150)
                    stats2 = cleanup_memory(sync_cuda=False, hard_trim=bool(hard_trim))
                    if stats2 and float(stats2.get('ram_freed_mb', 0.0)) > 0.0:
                        print(f"2nd pass: RAM freed +{stats2['ram_freed_mb']:.1f} MB")
                        try:
                            for g in stats2.get('gpu', []):
                                if float(g.get('reserved_freed_mb', 0.0)) > 0.0 or float(g.get('allocated_freed_mb', 0.0)) > 0.0:
                                    print(f"2nd pass GPU{g['device']}: reserved +{g['reserved_freed_mb']:.1f} MB, allocated +{g['allocated_freed_mb']:.1f} MB")
                        except Exception:
                            pass
                except Exception:
                    pass
                print("done.")
        except Exception:
            pass

        if img_prev is None:
            # No preview was produced; return a small black placeholder image.
            try:
                device = model_management.intermediate_device() if hasattr(model_management, 'intermediate_device') else 'cpu'
                img_prev = torch.zeros((1, 32, 32, 3), dtype=torch.float32, device=device)
            except Exception:
                img_prev = torch.zeros((1, 32, 32, 3))
        return (samples, img_prev)

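# A hedged registration sketch: ComfyUI discovers custom nodes through a
# NODE_CLASS_MAPPINGS dict. If this package already registers MG_CleanUp in
# its __init__.py, the mapping below is redundant and can stay commented out.
#
#   NODE_CLASS_MAPPINGS = {"MG_CleanUp": MG_CleanUp}
#   NODE_DISPLAY_NAME_MAPPINGS = {"MG_CleanUp": "MG CleanUp"}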