5675 lines
220 KiB
Python
5675 lines
220 KiB
Python
# Blender add-on metadata (read by Blender's add-on manager).
bl_info = {
    "name": "Multi-Headless Instance Renderer | HoloMARI Platform",
    "author": "HP Park",
    "version": (1, 6, 5),
    # Minimum supported Blender version.
    "blender": (2, 93, 0),
    # Fixed mojibake: "â–¸" was a cp1252 mis-decode of the UTF-8 "▸" arrow.
    "location": "Render Properties ▸ Multi-Instance Frames",
    "description": "Spawns one or more headless Blender workers per GPU to maximize available compute resources.",
    "category": "Render",
}
|
|
|
|
import bpy
|
|
import os
|
|
import sys
|
|
import re
|
|
import shutil
|
|
import tempfile
|
|
import time
|
|
import threading
|
|
import queue
|
|
import subprocess
|
|
import signal
|
|
import socket
|
|
import json
|
|
import random
|
|
import addon_utils
|
|
import string
|
|
import shlex
|
|
import hashlib
|
|
from pathlib import Path
|
|
from collections import Counter, deque
|
|
|
|
# Key used to namespace this add-on's registered state.
ADDON_KEY = "multi_instance_render"
# Keymap items registered by this add-on (kept so they can be removed on unregister).
_KM_ITEMS = []
# Module-level singleton: the active worker manager instance, or None.
_MANAGER = None

# Platform flags used to pick OS-specific code paths.
IS_WIN = (os.name == "nt")
IS_MAC = (sys.platform == "darwin")

# File formats that produce a single movie file instead of an image sequence.
VIDEO_FORMATS = {"FFMPEG", "AVI_JPEG", "AVI_RAW", "FRAME_SERVER"}
|
|
|
|
|
|
|
|
# ----------------------- helpers -----------------------
|
|
|
|
def _log(msg):
|
|
print(f"[MGPU] {msg}")
|
|
|
|
def _manager_has_active_workers(manager):
|
|
if not manager:
|
|
return False
|
|
try:
|
|
for w in getattr(manager, "workers", []):
|
|
try:
|
|
if w.running:
|
|
return True
|
|
except Exception:
|
|
continue
|
|
except Exception:
|
|
pass
|
|
return False
|
|
|
|
def _cleanup_stale_manager():
    """Stop and drop the module-level manager once none of its workers are alive."""
    global _MANAGER
    manager = _MANAGER
    if not manager or _manager_has_active_workers(manager):
        return
    try:
        manager.stop()
    except Exception:
        pass
    _MANAGER = None
|
|
|
|
def _cycles_prefs():
|
|
try:
|
|
return bpy.context.preferences.addons["cycles"].preferences
|
|
except Exception:
|
|
return None
|
|
|
|
def _current_compute_type():
    """Best-effort name of the active Cycles compute backend; defaults to "CUDA"."""
    prefs = _cycles_prefs()
    if not prefs:
        return "CUDA"
    backend = getattr(prefs, "compute_device_type", "CUDA")
    return backend or "CUDA"
|
|
|
|
def _cycles_cpu_device_selected():
    """True when the Cycles device list contains a CPU entry with 'use' checked."""
    prefs = _cycles_prefs()
    if not prefs:
        return False
    try:
        prefs.refresh_devices()
    except Exception:
        pass
    for dev in getattr(prefs, "devices", []):
        try:
            dev_type = str(getattr(dev, "type", "") or "").upper()
            if dev_type == "CPU" and bool(getattr(dev, "use", False)):
                return True
        except Exception:
            continue
    return False
|
|
|
|
def _fmt_bytes(n):
|
|
try:
|
|
for unit in ["B","KiB","MiB","GiB","TiB"]:
|
|
if n < 1024: return f"{n:.1f}{unit}"
|
|
n /= 1024.0
|
|
except Exception:
|
|
pass
|
|
return "?"
|
|
|
|
def _median(values):
|
|
vals = []
|
|
for v in (values or []):
|
|
try:
|
|
fv = float(v)
|
|
if fv > 0:
|
|
vals.append(fv)
|
|
except Exception:
|
|
pass
|
|
if not vals:
|
|
return None
|
|
vals.sort()
|
|
n = len(vals)
|
|
m = n // 2
|
|
if n % 2 == 1:
|
|
return vals[m]
|
|
return (vals[m - 1] + vals[m]) * 0.5
|
|
|
|
# Tuning tables for the render-time watchdog, keyed by aggressiveness tier.
# NOTE(review): key meanings below are inferred from their names and the
# soft/hard/hedge/restart vocabulary -- confirm against the guard logic
# elsewhere in this file.
#   enabled                    -- master switch for the guard
#   warmup_completed_jobs      -- jobs to finish (total) before timing applies
#   warmup_per_worker_jobs     -- jobs each worker must finish before judging it
#   min_samples_soft           -- samples required before soft limits engage
#   soft_mult / soft_min_s     -- soft-timeout multiplier and floor (seconds)
#   hard_mult / hard_min_s     -- hard-timeout multiplier and floor (seconds)
#   progress_stall_s           -- seconds of no progress considered a stall
#   hedge_grace_s / hedge_max_per_job   -- duplicate-job (hedge) controls
#   restart_max_per_job        -- max restarts attributed to one job
#   worker_restart_*           -- per-worker restart rate limiting
#   global_restart_*           -- fleet-wide restart rate limiting
#   single_worker_min_stall_s  -- stall floor when only one worker is running
#   min_baseline_s             -- floor for the baseline frame-time estimate
#   periodic_recycle_*         -- optional planned worker recycling at progress points
_RENDERTIME_GUARD_PROFILES = {
    # Guard fully disabled; only the minimal warm-up bookkeeping keys exist.
    "OFF": {
        "enabled": False,
        "warmup_completed_jobs": 1,
        "warmup_per_worker_jobs": 1,
        "periodic_recycle_enabled": False,
        "periodic_recycle_points": [],
    },
    # Long timeouts, tight restart budgets: intervene rarely.
    "CONSERVATIVE": {
        "enabled": True,
        "warmup_completed_jobs": 1,
        "warmup_per_worker_jobs": 1,
        "min_samples_soft": 6,
        "soft_mult": 3.8,
        "soft_min_s": 150.0,
        "hard_mult": 7.0,
        "hard_min_s": 420.0,
        "progress_stall_s": 120.0,
        "hedge_grace_s": 90.0,
        "hedge_max_per_job": 1,
        "restart_max_per_job": 1,
        "worker_restart_cooldown_s": 300.0,
        "worker_restart_budget": 1,
        "worker_restart_window_frames": 20,
        "global_restart_limit": 2,
        "global_restart_window_s": 180.0,
        "single_worker_min_stall_s": 240.0,
        "min_baseline_s": 20.0,
        "periodic_recycle_enabled": False,
        "periodic_recycle_points": [0.25, 0.50, 0.75],
        "periodic_recycle_min_completed_jobs": 32,
    },
    # Middle ground between CONSERVATIVE and AGGRESSIVE.
    "BALANCED": {
        "enabled": True,
        "warmup_completed_jobs": 1,
        "warmup_per_worker_jobs": 1,
        "min_samples_soft": 4,
        "soft_mult": 3.0,
        "soft_min_s": 90.0,
        "hard_mult": 5.5,
        "hard_min_s": 300.0,
        "progress_stall_s": 90.0,
        "hedge_grace_s": 60.0,
        "hedge_max_per_job": 1,
        "restart_max_per_job": 1,
        "worker_restart_cooldown_s": 180.0,
        "worker_restart_budget": 2,
        "worker_restart_window_frames": 20,
        "global_restart_limit": 3,
        "global_restart_window_s": 150.0,
        "single_worker_min_stall_s": 180.0,
        "min_baseline_s": 20.0,
        "periodic_recycle_enabled": False,
        "periodic_recycle_points": [0.25, 0.50, 0.75],
        "periodic_recycle_min_completed_jobs": 32,
    },
    # Short timeouts, generous restart budgets, periodic recycling enabled.
    "AGGRESSIVE": {
        "enabled": True,
        "warmup_completed_jobs": 1,
        "warmup_per_worker_jobs": 1,
        "min_samples_soft": 2,
        "soft_mult": 1.6,
        "soft_min_s": 35.0,
        "hard_mult": 2.1,
        "hard_min_s": 95.0,
        "progress_stall_s": 35.0,
        "hedge_grace_s": 20.0,
        "hedge_max_per_job": 1,
        "restart_max_per_job": 1,
        "worker_restart_cooldown_s": 90.0,
        "worker_restart_budget": 3,
        "worker_restart_window_frames": 20,
        "global_restart_limit": 8,
        "global_restart_window_s": 180.0,
        "single_worker_min_stall_s": 100.0,
        "min_baseline_s": 20.0,
        "periodic_recycle_enabled": True,
        "periodic_recycle_points": [0.25, 0.50, 0.75],
        "periodic_recycle_min_completed_jobs": 32,
    },
}
|
|
|
|
def _rendertime_guard_profile(tier: str):
    """Return a copy of the guard profile for *tier* (case-insensitive).

    Unknown or empty tiers fall back to the AGGRESSIVE profile.  The returned
    dict is a fresh copy tagged with a "tier" key so callers may mutate it.
    """
    key = str(tier or "AGGRESSIVE").upper()
    profile = _RENDERTIME_GUARD_PROFILES.get(key) or _RENDERTIME_GUARD_PROFILES["AGGRESSIVE"]
    result = dict(profile)
    result["tier"] = key
    return result
|
|
|
|
def _classify_launch_exception(exc: Exception):
|
|
s = str(exc or "")
|
|
low = s.lower()
|
|
if ("1455" in low) or ("paging file" in low) or ("not enough memory" in low):
|
|
return "SYSTEM_RAM_OR_COMMIT_EXHAUSTED"
|
|
if ("access is denied" in low) or ("permission denied" in low):
|
|
return "ACCESS_DENIED"
|
|
if ("file not found" in low) or ("no such file" in low):
|
|
return "BINARY_OR_PATH_NOT_FOUND"
|
|
if ("is not recognized" in low):
|
|
return "BINARY_NOT_EXECUTABLE"
|
|
return "PROCESS_START_EXCEPTION"
|
|
|
|
def _classify_runtime_exit_reason(last_line: str, returncode):
|
|
rc = "" if returncode is None else str(returncode)
|
|
ll = (last_line or "").lower()
|
|
if ("out of memory" in ll) or ("not enough memory" in ll):
|
|
return "EXIT_OUT_OF_MEMORY"
|
|
if ("cuda error out of memory" in ll) or ("optix error out of memory" in ll):
|
|
return "EXIT_GPU_VRAM_OOM"
|
|
if ("failed to create" in ll and "context" in ll):
|
|
return "EXIT_GPU_CONTEXT_INIT_FAILED"
|
|
if rc == "-1073741819":
|
|
return "EXIT_ACCESS_VIOLATION"
|
|
if rc == "-1073740791":
|
|
return "EXIT_STACK_BUFFER_OVERRUN"
|
|
return "EXIT_BEFORE_HANDSHAKE"
|
|
|
|
def _mgpu_is_video(scene):
|
|
try:
|
|
img = scene.render.image_settings
|
|
fmt = str(getattr(img, "file_format", "") or "").upper()
|
|
media = str(getattr(img, "media_type", "") or "").upper()
|
|
return (fmt in VIDEO_FORMATS or media == "VIDEO")
|
|
except Exception:
|
|
return False
|
|
|
|
def _mgpu_video_output_path(scene):
|
|
try:
|
|
return bpy.path.abspath(scene.render.filepath)
|
|
except Exception:
|
|
return ""
|
|
|
|
def _mgpu_first_frame_path(scene):
|
|
try:
|
|
return bpy.path.abspath(scene.render.frame_path(frame=scene.frame_start))
|
|
except Exception:
|
|
try:
|
|
return bpy.path.abspath(scene.render.filepath)
|
|
except Exception:
|
|
return ""
|
|
|
|
def _mgpu_dir_has_entries(path):
|
|
try:
|
|
if not os.path.isdir(path):
|
|
return False
|
|
with os.scandir(path) as it:
|
|
for _entry in it:
|
|
return True
|
|
except Exception:
|
|
return False
|
|
return False
|
|
|
|
def _mgpu_sequence_exists(first_frame_path):
|
|
try:
|
|
dir_path = os.path.dirname(first_frame_path)
|
|
if not os.path.isdir(dir_path):
|
|
return False
|
|
base = os.path.basename(first_frame_path)
|
|
m = re.search(r"(\\d+)(\\.[^.]+)?$", base)
|
|
if not m:
|
|
return False
|
|
prefix = base[:m.start(1)]
|
|
suffix = m.group(2) or ""
|
|
for name in os.listdir(dir_path):
|
|
if not name.startswith(prefix):
|
|
continue
|
|
if suffix and not name.endswith(suffix):
|
|
continue
|
|
return True
|
|
except Exception:
|
|
return False
|
|
return False
|
|
|
|
def _mgpu_video_temp_dir_for(scene, use_target_dir=True):
    """Directory where per-frame images are staged before video encoding.

    Prefers a ``<name>_TEMP`` folder next to the final output file; falls back
    to a name-plus-hash folder under the system temp directory.
    """
    out_path = _mgpu_video_output_path(scene) or "mgpu_video"
    stem = os.path.splitext(os.path.basename(out_path))[0] or "render"
    safe = re.sub(r"[^A-Za-z0-9._-]+", "_", stem)
    digest = hashlib.md5(out_path.encode("utf-8", "ignore")).hexdigest()[:8]
    if use_target_dir:
        target_dir = os.path.dirname(out_path)
        if target_dir:
            return os.path.join(target_dir, f"{safe}_TEMP")
    return os.path.join(tempfile.gettempdir(), f"mgpu_frames_{safe}_{digest}")
|
|
|
|
def _mgpu_overwrite_warnings(scene, is_video, temp_dir=None):
    """Collect human-readable warnings about outputs that would be overwritten."""
    found = []
    if is_video:
        target = _mgpu_video_output_path(scene)
        if target and os.path.exists(target):
            found.append(f"Output file exists: {target}")
    else:
        first = _mgpu_first_frame_path(scene)
        if first:
            if os.path.exists(first):
                found.append(f"Output frame exists: {first}")
            if _mgpu_sequence_exists(first):
                found.append(f"Existing frame files detected in: {os.path.dirname(first)}")
    if temp_dir and _mgpu_dir_has_entries(temp_dir):
        found.append(f"Temp frame folder has files: {temp_dir}")
    return found
|
|
|
|
def _mgpu_has_mari_addon():
|
|
mari_mod = None
|
|
try:
|
|
for mod in addon_utils.modules():
|
|
if getattr(mod, "addon_prefix", None) == "mari":
|
|
mari_mod = mod
|
|
break
|
|
bi = getattr(mod, "bl_info", {}) or {}
|
|
if (bi.get("name") or "").strip().lower() == "mari advanced":
|
|
mari_mod = mod
|
|
break
|
|
except Exception:
|
|
pass
|
|
if not mari_mod:
|
|
return False
|
|
try:
|
|
name = getattr(mari_mod, "__name__", None)
|
|
if name:
|
|
_loaded, enabled = addon_utils.check(name)
|
|
return bool(enabled)
|
|
except Exception:
|
|
pass
|
|
return False
|
|
|
|
def _mgpu_enabled_addons_snapshot():
|
|
"""Capture add-ons currently enabled in this Blender session."""
|
|
records = []
|
|
names = []
|
|
try:
|
|
prefs_addons = getattr(bpy.context.preferences, "addons", None)
|
|
if prefs_addons is not None:
|
|
names.extend(list(prefs_addons.keys()))
|
|
except Exception:
|
|
pass
|
|
|
|
try:
|
|
for meta in addon_utils.modules():
|
|
mod_name = getattr(meta, "__name__", None)
|
|
if not mod_name:
|
|
continue
|
|
enabled = False
|
|
try:
|
|
state = addon_utils.check(mod_name)
|
|
if isinstance(state, tuple):
|
|
enabled = any(bool(v) for v in state)
|
|
else:
|
|
enabled = bool(state)
|
|
except Exception:
|
|
enabled = False
|
|
if enabled:
|
|
names.append(mod_name)
|
|
except Exception:
|
|
pass
|
|
|
|
dedup = []
|
|
seen = set()
|
|
for n in names:
|
|
if not n or n in seen:
|
|
continue
|
|
seen.add(n)
|
|
src = ""
|
|
is_pkg = False
|
|
try:
|
|
mod = sys.modules.get(n)
|
|
if mod is None:
|
|
for meta in addon_utils.modules():
|
|
if getattr(meta, "__name__", None) == n:
|
|
mod = meta
|
|
break
|
|
src = str(getattr(mod, "__file__", "") or "")
|
|
if src:
|
|
src = os.path.abspath(src)
|
|
is_pkg = os.path.basename(src).lower() == "__init__.py"
|
|
except Exception:
|
|
src = ""
|
|
is_pkg = False
|
|
rec = {"module": n}
|
|
if src:
|
|
rec["file"] = src
|
|
rec["is_package"] = bool(is_pkg)
|
|
records.append(rec)
|
|
return records
|
|
|
|
def _mgpu_enabled_addon_module_names(records):
|
|
names = []
|
|
seen = set()
|
|
for entry in (records or []):
|
|
if isinstance(entry, str):
|
|
mod_name = str(entry or "")
|
|
elif isinstance(entry, dict):
|
|
mod_name = str(entry.get("module") or "")
|
|
else:
|
|
mod_name = ""
|
|
if not mod_name or mod_name in seen:
|
|
continue
|
|
seen.add(mod_name)
|
|
names.append(mod_name)
|
|
return names
|
|
|
|
def _proc_rss_bytes():
    # Best-effort memory footprint of the current process in bytes, or None.
    if IS_WIN:
        try:
            import ctypes, ctypes.wintypes as wt

            class PROCESS_MEMORY_COUNTERS(ctypes.Structure):
                # Mirrors the Win32 PROCESS_MEMORY_COUNTERS struct (psapi.h);
                # field order/types must match the ABI exactly.
                _fields_ = [
                    ("cb", wt.DWORD), ("PageFaultCount", wt.DWORD),
                    ("PeakWorkingSetSize", ctypes.c_size_t), ("WorkingSetSize", ctypes.c_size_t),
                    ("QuotaPeakPagedPoolUsage", ctypes.c_size_t), ("QuotaPagedPoolUsage", ctypes.c_size_t),
                    ("QuotaPeakNonPagedPoolUsage", ctypes.c_size_t), ("QuotaNonPagedPoolUsage", ctypes.c_size_t),
                    ("PagefileUsage", ctypes.c_size_t), ("PeakPagefileUsage", ctypes.c_size_t),
                ]
            GetProcessMemoryInfo = ctypes.windll.psapi.GetProcessMemoryInfo
            GetCurrentProcess = ctypes.windll.kernel32.GetCurrentProcess
            h = GetCurrentProcess()
            counters = PROCESS_MEMORY_COUNTERS()
            counters.cb = ctypes.sizeof(PROCESS_MEMORY_COUNTERS)
            if GetProcessMemoryInfo(h, ctypes.byref(counters), counters.cb):
                # Current working-set size in bytes.
                return int(counters.WorkingSetSize)
            # API reported failure: fall through and implicitly return None.
        except Exception:
            return None
    else:
        try:
            import resource
            # NOTE(review): ru_maxrss is the PEAK RSS of the process, not the
            # current RSS -- callers treating this as "current usage" will
            # over-report after a memory spike.
            r = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
            if IS_MAC:
                return int(r)  # macOS reports ru_maxrss in bytes
            return int(r * 1024)  # Linux reports ru_maxrss in kilobytes
        except Exception:
            return None
|
|
|
|
def _sys_mem_available_bytes():
    # Available physical system memory in bytes, or None when undeterminable.
    if IS_WIN:
        try:
            import ctypes, ctypes.wintypes as wt

            class MEMORYSTATUSEX(ctypes.Structure):
                # Mirrors the Win32 MEMORYSTATUSEX struct (sysinfoapi.h).
                _fields_ = [
                    ("dwLength", wt.DWORD),
                    ("dwMemoryLoad", wt.DWORD),
                    ("ullTotalPhys", ctypes.c_ulonglong),
                    ("ullAvailPhys", ctypes.c_ulonglong),
                    ("ullTotalPageFile", ctypes.c_ulonglong),
                    ("ullAvailPageFile", ctypes.c_ulonglong),
                    ("ullTotalVirtual", ctypes.c_ulonglong),
                    ("ullAvailVirtual", ctypes.c_ulonglong),
                    ("ullAvailExtendedVirtual", ctypes.c_ulonglong),
                ]
            stat = MEMORYSTATUSEX()
            # dwLength must be initialized before the call per the Win32 API.
            stat.dwLength = ctypes.sizeof(MEMORYSTATUSEX)
            ctypes.windll.kernel32.GlobalMemoryStatusEx(ctypes.byref(stat))
            return int(stat.ullAvailPhys)
        except Exception:
            return None
    # Non-Windows: rely on psutil when it is installed; otherwise give up.
    try:
        import psutil
        return int(psutil.virtual_memory().available)
    except Exception:
        return None
|
|
|
|
# ----------------------- GPU detection -----------------------
|
|
|
|
def _normalize_gpu_name(n: str) -> str:
|
|
n = (n or "")
|
|
n = re.sub(r"\s*\(Display.*?\)", "", n)
|
|
n = n.replace("NVIDIA", "").replace("GeForce", "").strip()
|
|
return re.sub(r"\s+", " ", n)
|
|
|
|
def _normalize_pci_bus_id(raw: str) -> str:
|
|
s = (str(raw or "").strip().lower())
|
|
if not s:
|
|
return ""
|
|
m = re.search(r"([0-9a-f]{4,8})?:?([0-9a-f]{1,2}):([0-9a-f]{1,2})(?:\.([0-7]))?", s)
|
|
if not m:
|
|
return ""
|
|
dom = (m.group(1) or "00000000")
|
|
if len(dom) == 4:
|
|
dom = "0000" + dom
|
|
elif len(dom) < 8:
|
|
dom = dom.rjust(8, "0")
|
|
bus = m.group(2).rjust(2, "0")
|
|
dev = m.group(3).rjust(2, "0")
|
|
fn = m.group(4) or "0"
|
|
return f"{dom}:{bus}:{dev}.{fn}"
|
|
|
|
def _extract_pci_bus_id_from_dev(dev) -> str | None:
    """Pull a normalized PCI bus id out of a Cycles device's id or name field.

    Returns None when neither field embeds a recognizable PCI address.
    """
    pattern = re.compile(r"(?:^|[_\s:])([0-9A-Fa-f]{4,8}:[0-9A-Fa-f]{1,2}:[0-9A-Fa-f]{1,2}(?:\.[0-7])?)")
    for field in (getattr(dev, "id", ""), getattr(dev, "name", "")):
        match = pattern.search(str(field) or "")
        if not match:
            continue
        normalized = _normalize_pci_bus_id(match.group(1))
        if normalized:
            return normalized
    return None
|
|
|
|
def _win_query_nvidia_smi_detailed():
    """Query nvidia-smi for the physical GPU inventory (Windows only).

    Returns a list of {'index', 'uuid', 'bus', 'name'} dicts, or None when
    not on Windows, when nvidia-smi is unavailable, or when its output
    cannot be parsed.
    """
    if not IS_WIN:
        return None
    try:
        raw = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=index,uuid,pci.bus_id,name", "--format=csv,noheader"],
            encoding="utf-8", errors="ignore"
        )
        inventory = []
        for line in raw.strip().splitlines():
            fields = [part.strip() for part in line.split(",")]
            if len(fields) < 4:
                continue
            inventory.append({
                "index": int(fields[0]),
                "uuid": fields[1],
                "bus": _normalize_pci_bus_id(fields[2]) or fields[2].lower(),
                # The name itself may contain commas; re-join the tail.
                "name": ",".join(fields[3:]).strip(),
            })
        return inventory
    except Exception:
        return None
|
|
|
|
def _dev_key(name: str, bus: str) -> tuple:
    """Stable identity key for a GPU: the PCI bus id when known, otherwise
    the normalized device name."""
    normalized_bus = _normalize_pci_bus_id(bus)
    if normalized_bus:
        return ("bus", normalized_bus)
    return ("name", _normalize_gpu_name(name))
|
|
|
|
# LEGACY: broad scan (often contains 'ghost' GPU)
def _detect_gpu_devices_legacy(selected_only=False):
    """Enumerate Cycles devices of the active backend, mapped to physical GPUs.

    Returns (physical_index, cycles_name, backend, selected, bus_id) tuples.
    With nvidia-smi inventory, devices are matched first by PCI bus id, then
    by normalized name; unmatched devices get sentinel index 999 and sort
    last.  Without inventory, Cycles enumeration order is used as the index.
    """
    cp = _cycles_prefs()
    if not cp:
        return []
    backend = getattr(cp, "compute_device_type", None)
    try:
        cp.refresh_devices()
    except Exception:
        pass

    # Collect devices of the active backend (optionally only 'use'-checked).
    devs = []
    for d in getattr(cp, "devices", []):
        if getattr(d, "type", "") != backend:
            continue
        sel = bool(getattr(d, "use", False))
        if selected_only and not sel:
            continue
        devs.append(d)

    if not devs:
        return []

    phys = _win_query_nvidia_smi_detailed()
    out = []
    if phys:
        bus_to_idx = {g["bus"]: g["index"] for g in phys}
        used = set()
        matched_dev_ids = set()
        # Pass 1: bus
        for d in devs:
            bus = _extract_pci_bus_id_from_dev(d)
            if bus and bus in bus_to_idx:
                idx = bus_to_idx[bus]; used.add(idx)
                out.append((idx, getattr(d,"name","?"), backend, bool(getattr(d,"use",False)), bus))
                matched_dev_ids.add(id(d))
        # Pass 2: name fallback
        for d in devs:
            if id(d) in matched_dev_ids:
                continue
            dn = _normalize_gpu_name(getattr(d,"name",""))
            cand = None
            for g in phys:
                # Each physical index is consumed at most once.
                if g["index"] in used: continue
                if _normalize_gpu_name(g["name"]) == dn:
                    cand = g; break
            if cand:
                out.append((cand["index"], getattr(d,"name","?"), backend, bool(getattr(d,"use",False)), cand["bus"]))
                used.add(cand["index"])
            else:
                # No physical match: sentinel 999 keeps the device visible.
                out.append((999, getattr(d,"name","?"), backend, bool(getattr(d,"use",False)), _extract_pci_bus_id_from_dev(d) or ""))
        # Sort by index, pushing unmatched (999) entries to the end.
        out.sort(key=lambda t: (t[0]==999, t[0]))
        return out

    # No nvidia-smi inventory: fall back to Cycles enumeration order.
    return [(i, getattr(d,"name","?"), backend, bool(getattr(d,"use",False)), _extract_pci_bus_id_from_dev(d) or "") for i,d in enumerate(devs)]
|
|
|
|
def _detect_gpu_devices_strict(selected_only=True):
    """Strict GPU scan: like the legacy scan, but defaults to selected-only
    devices and always reports the 'selected' field as True.

    Returns (physical_index, cycles_name, backend, True, bus_id) tuples.
    Matching strategy mirrors _detect_gpu_devices_legacy: PCI bus id first,
    normalized-name fallback second, sentinel index 999 for unmatched.
    """
    cp = _cycles_prefs()
    if not cp:
        return []
    backend = getattr(cp, "compute_device_type", None)
    try:
        cp.refresh_devices()
    except Exception:
        pass

    # Collect devices of the active backend (by default only 'use'-checked).
    devs = []
    for d in getattr(cp, "devices", []):
        if getattr(d, "type", "") != backend:
            continue
        sel = bool(getattr(d, "use", False))
        if selected_only and not sel:
            continue
        devs.append(d)

    if not devs:
        return []

    phys = _win_query_nvidia_smi_detailed()
    out = []
    if phys:
        bus_to_idx = {g["bus"]: g["index"] for g in phys}
        used = set()
        for d in devs:
            # Prefer an exact PCI bus match.
            bus = _extract_pci_bus_id_from_dev(d)
            if bus and bus in bus_to_idx:
                idx = bus_to_idx[bus]; used.add(idx)
                out.append((idx, getattr(d,"name","?"), backend, True, bus))
            else:
                # Fallback: first unused physical GPU with the same name.
                dn = _normalize_gpu_name(getattr(d,"name",""))
                cand = None
                for g in phys:
                    if g["index"] in used: continue
                    if _normalize_gpu_name(g["name"]) == dn:
                        cand = g; break
                if cand:
                    used.add(cand["index"])
                    out.append((cand["index"], getattr(d,"name","?"), backend, True, cand["bus"]))
                else:
                    # No physical match: sentinel 999 keeps the device visible.
                    out.append((999, getattr(d,"name","?"), backend, True, _extract_pci_bus_id_from_dev(d) or ""))
        # Sort by index, pushing unmatched (999) entries to the end.
        out.sort(key=lambda t: (t[0]==999, t[0]))
        return out

    # No nvidia-smi inventory: fall back to Cycles enumeration order.
    return [(i, getattr(d,"name","?"), backend, True, _extract_pci_bus_id_from_dev(d) or "") for i,d in enumerate(devs)]
|
|
|
|
def _multiset_subtract(primary_list, subtract_list):
    """Multiset difference of GPU tuples: drop from *primary_list* one
    occurrence per matching entry in *subtract_list*.

    Matching uses _dev_key (PCI bus when available, normalized name
    otherwise), so multiplicity is respected per identity key.
    """
    remaining = Counter(_dev_key(name, bus) for (_i, name, _t, _sel, bus) in subtract_list)
    kept = []
    for entry in primary_list:
        (_i, name, _t, _sel, bus) = entry
        key = _dev_key(name, bus)
        if remaining[key] > 0:
            remaining[key] -= 1
        else:
            kept.append(entry)
    return kept
|
|
|
|
def _dedupe_selection_by_bus(rows):
|
|
rows = list(rows or [])
|
|
if not rows:
|
|
return rows
|
|
phys = _win_query_nvidia_smi_detailed() or []
|
|
phys_name_by_bus = {
|
|
_normalize_pci_bus_id(g.get("bus")): _normalize_gpu_name(g.get("name"))
|
|
for g in phys
|
|
if _normalize_pci_bus_id(g.get("bus"))
|
|
}
|
|
out = []
|
|
bus_pos = {}
|
|
for row in rows:
|
|
try:
|
|
idx, name, backend, selected, bus = row
|
|
except Exception:
|
|
out.append(row)
|
|
continue
|
|
nbus = _normalize_pci_bus_id(bus)
|
|
if not nbus:
|
|
out.append(row)
|
|
continue
|
|
phys_norm = phys_name_by_bus.get(nbus, "")
|
|
cur_norm = _normalize_gpu_name(name)
|
|
|
|
def _score(_row, _norm):
|
|
try:
|
|
_idx, _name, _backend, _selected, _bus = _row
|
|
except Exception:
|
|
return -999
|
|
s = 0
|
|
if bool(_selected):
|
|
s += 2
|
|
if _idx != 999:
|
|
s += 1
|
|
if _norm and _normalize_gpu_name(_name) == _norm:
|
|
s += 4
|
|
return s
|
|
|
|
if nbus not in bus_pos:
|
|
bus_pos[nbus] = len(out)
|
|
out.append((idx, name, backend, selected, nbus))
|
|
continue
|
|
|
|
pos = bus_pos[nbus]
|
|
prev = out[pos]
|
|
if _score((idx, name, backend, selected, nbus), phys_norm) > _score(prev, phys_norm):
|
|
out[pos] = (idx, name, backend, selected, nbus)
|
|
return out
|
|
|
|
def _detect_gpu_devices_final_from_lists(mode: str, legacy, strict):
    """Combine the legacy and strict GPU scans according to *mode*.

    Results are always deduplicated by PCI bus.  Any unrecognized mode takes
    the default STRICT_MINUS_LEGACY path, which additionally refuses to drop
    explicitly selected strict GPUs.
    """
    legacy = list(legacy or [])
    strict = list(strict or [])
    simple_modes = {
        "LEGACY_ONLY": lambda: legacy,
        "STRICT_ONLY": lambda: strict,
        "LEGACY_MINUS_STRICT": lambda: _multiset_subtract(legacy, strict),
    }
    producer = simple_modes.get(mode)
    if producer is not None:
        return _dedupe_selection_by_bus(producer())

    # Default path: STRICT_MINUS_LEGACY.
    final = _multiset_subtract(strict, legacy)
    # Safety: never silently drop explicitly selected strict GPUs.
    if strict and len(final) < len(strict):
        return _dedupe_selection_by_bus(strict)
    return _dedupe_selection_by_bus(final)
|
|
|
|
def _detect_gpu_devices_final(mode: str = "STRICT_MINUS_LEGACY"):
    """Run both GPU scans and merge them according to *mode*.

    mode:
      - 'STRICT_MINUS_LEGACY' -> strict(all) minus legacy(ghost) [default]
      - 'LEGACY_MINUS_STRICT' -> legacy minus strict
      - 'STRICT_ONLY'         -> just strict
      - 'LEGACY_ONLY'         -> just legacy
    """
    strict_rows = _detect_gpu_devices_strict(selected_only=True)
    legacy_rows = _detect_gpu_devices_legacy(selected_only=False)
    return _detect_gpu_devices_final_from_lists(mode, legacy_rows, strict_rows)
|
|
|
|
def _cycles_device_snapshot():
    """Snapshot the current Cycles backend and device rows for diagnostics."""
    cp = _cycles_prefs()
    if not cp:
        return {"backend": None, "rows": []}
    backend = getattr(cp, "compute_device_type", None)
    try:
        cp.refresh_devices()
    except Exception:
        pass
    rows = [
        {
            "name": getattr(dev, "name", "?"),
            "type": getattr(dev, "type", "?"),
            "use": bool(getattr(dev, "use", False)),
            "bus": _extract_pci_bus_id_from_dev(dev) or "",
            "id": str(getattr(dev, "id", "") or ""),
        }
        for dev in getattr(cp, "devices", [])
    ]
    return {"backend": backend, "rows": rows}
|
|
|
|
# -------- map selection to physical UUIDs --------
|
|
|
|
def _map_selection_to_uuids(sel_tuples):
    """Resolve selected GPU tuples to physical GPUs (uuid/bus/index).

    Input tuples are (index, name, type, selected, bus).  When nvidia-smi
    inventory is available, each entry is matched in order by: exact PCI bus
    id, then physical index, then normalized name; each physical GPU (uuid)
    is consumed at most once.  Unmatched entries keep their Cycles data with
    uuid/phys_index set to None.  Without inventory, entries pass through
    with uuid=None.
    """
    phys = _win_query_nvidia_smi_detailed()
    if not phys:
        # No physical inventory: echo the selection, flagging the 999
        # sentinel index as unresolved (phys_index=None).
        out = []
        for (idx, name, _t, _sel, bus) in sel_tuples:
            out.append({
                "index": idx, "name": name, "cycles_name": name,
                "bus": _normalize_pci_bus_id(bus) or bus or "", "uuid": None,
                "phys_index": idx if idx != 999 else None
            })
        return out

    # Lookup tables: bus -> GPU record, normalized name -> candidate records.
    bus_map = {g["bus"]: g for g in phys}
    name_buckets = {}
    for g in phys:
        name_buckets.setdefault(_normalize_gpu_name(g["name"]), []).append(g)

    used_ids = set()
    out = []

    for (idx, name, _t, _sel, bus) in sel_tuples:
        bus = _normalize_pci_bus_id(bus) or (bus or "")
        g = None
        # Match 1: exact PCI bus id (if that GPU's uuid is still unclaimed).
        if bus and bus in bus_map and bus_map[bus]["uuid"] not in used_ids:
            g = bus_map[bus]
        # Match 2: physical index.
        if (g is None) and (idx is not None) and isinstance(idx, int):
            for cand in phys:
                if cand["index"] == idx and cand["uuid"] not in used_ids:
                    g = cand; break
        # Match 3: first unclaimed GPU with the same normalized name.
        if g is None:
            nb = name_buckets.get(_normalize_gpu_name(name), [])
            for cand in nb:
                if cand["uuid"] not in used_ids:
                    g = cand; break

        if g:
            used_ids.add(g["uuid"])
            out.append({
                "index": g["index"],
                # Prefer physical inventory naming from nvidia-smi.
                "name": g.get("name") or name,
                "cycles_name": name,
                "bus": g["bus"],
                "uuid": g["uuid"],
                "phys_index": g["index"],
            })
        else:
            # Unresolvable entry: keep Cycles data, mark as unmatched.
            out.append({
                "index": idx, "name": name, "cycles_name": name,
                "bus": bus or "", "uuid": None, "phys_index": None
            })

    return out
|
|
|
|
def _filter_known_mapped_gpus(mapped):
|
|
"""Hide unresolved mapped entries (phys_index=None => shown as '?')."""
|
|
keep = []
|
|
dropped = []
|
|
for m in list(mapped or []):
|
|
if m.get("phys_index") is None:
|
|
dropped.append(m)
|
|
else:
|
|
keep.append(m)
|
|
return keep, dropped
|
|
|
|
BANNER_ASCII = r"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
▒▒▒▒▒▒▒▒▒▒▒ ░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░░
|
|
▒▌▄ ███ ▄▐▒ ░█░█░███░█░░███░█████░███░███░█░
|
|
▒▌█▌███▐█▐▒ ░█░█░█░█░█░░█░█░█░█░█░█░█░█░█░█░
|
|
▒▌█▌███▐█▐▒ ░███░█░█░█░░█░█░█░█░█░███░██░░█░
|
|
▒▌█▌███▐█▐▒ ░█░█░█░█░█░░█░█░█░█░█░█░█░█░█░█░
|
|
▒▌█▌███▐█▐▒ ░█░█░███░██░███░█░█░█░█░█░█░█░█░
|
|
▒▌▀ ███ ▀▐▒ ░▒░▒░▒▒▒░▒▒░▒▒▒░▒░▒░▒░▒░▒░▒░▒░▒░
|
|
▒▒▒▒▒▒▒▒▒▒▒ BLENDER MULTI-INSTANCE RENDERER
|
|
|
|
[[Provided By HoloMARI - Holographic Media Creators Platform]]
|
|
|
|
Check our Holographic image rendering/sharing at: holomari.com
|
|
|
|
|
|
"""
|
|
|
|
# Manager-side diagnostics console banner (line 3 labels this as manager console).
BANNER_MANAGER_ASCII = "\n\nRENDER MANAGER CONSOLE\n\n" + BANNER_ASCII.lstrip("\n")

# Re-print the worker banner after this many console lines so it stays
# visible in long, scrolling worker logs.
_WORKER_BANNER_REPEAT_EVERY_LINES = 30

# Banner text reused for periodic re-prints: leading blank lines stripped,
# normalized to exactly one trailing newline.
_WORKER_BANNER_REPEAT_TEXT = BANNER_ASCII.lstrip("\n").rstrip("\n") + "\n"
|
|
|
|
# ----------------------- child script -----------------------
|
|
|
|
_CHILD_SCRIPT_SRC = r"""
|
|
import bpy, sys, json, socket, os, re, time, traceback
|
|
try:
|
|
import addon_utils
|
|
except Exception:
|
|
addon_utils = None
|
|
|
|
# Make all prints flush immediately so the parent can time frames
|
|
try:
|
|
sys.stdout.reconfigure(line_buffering=True, write_through=True)
|
|
except Exception:
|
|
pass
|
|
|
|
HOST = "127.0.0.1"
|
|
args = sys.argv[sys.argv.index("--")+1:] if "--" in sys.argv else []
|
|
def _argval(flag, default=None):
|
|
if flag in args:
|
|
i = args.index(flag)
|
|
return args[i+1] if i+1 < len(args) else default
|
|
return default
|
|
|
|
PORT = int(_argval("--mgpu-port", "0"))
|
|
TOKEN = _argval("--mgpu-token", "")
|
|
TAG = _argval("--mgpu-tag", "worker")
|
|
DEVICE = _argval("--mgpu-device", None)
|
|
FALLBACK_DEVICE = _argval("--mgpu-fallback-device", "")
|
|
TARGET_GPU_BUS = _argval("--mgpu-gpu-bus", "") or ""
|
|
TARGET_GPU_NAME = _argval("--mgpu-gpu-name", "") or ""
|
|
THREADS = int(_argval("--mgpu-threads", "0") or "0")
|
|
USECPU = int(_argval("--mgpu-usecpu", "0") or "0")
|
|
DENOISE_GPU = int(_argval("--mgpu-denoise-gpu", "1") or "1")
|
|
PERSIST = int(_argval("--mgpu-persistent", "1") or "1")
|
|
MODE = _argval("--mgpu-mode", "FRAMES")
|
|
SRC_DIR = _argval("--src-dir", "") or ""
|
|
SEQ_DIR = _argval("--mgpu-seq-dir", "") or ""
|
|
SEQ_FMT = (_argval("--mgpu-seq-format", "PNG") or "PNG").upper()
|
|
SEQ_EXT = (_argval("--mgpu-seq-ext", ".png") or ".png").strip() or ".png"
|
|
if not SEQ_EXT.startswith("."):
|
|
SEQ_EXT = "." + SEQ_EXT
|
|
PRECHECKED_EXISTING = int(_argval("--mgpu-prechecked-existing", "0") or "0")
|
|
ADDONS_FILE = _argval("--mgpu-enabled-addons-file", "") or ""
|
|
|
|
|
|
MARI_PATH = _argval("--mari-path", "")
|
|
|
|
def _normalize_gpu_name(n):
|
|
n = str(n or "")
|
|
n = n.replace("NVIDIA", "").replace("GeForce", "").strip()
|
|
n = re.sub(r"\s+", " ", n)
|
|
return n.lower()
|
|
|
|
def _normalize_pci_bus_id(raw):
|
|
s = (str(raw or "").strip().lower())
|
|
if not s:
|
|
return ""
|
|
m = re.search(r"([0-9a-f]{4,8})?:?([0-9a-f]{1,2}):([0-9a-f]{1,2})(?:\.([0-7]))?", s)
|
|
if not m:
|
|
return ""
|
|
dom = (m.group(1) or "00000000")
|
|
if len(dom) == 4:
|
|
dom = "0000" + dom
|
|
elif len(dom) < 8:
|
|
dom = dom.rjust(8, "0")
|
|
bus = m.group(2).rjust(2, "0")
|
|
dev = m.group(3).rjust(2, "0")
|
|
fn = m.group(4) or "0"
|
|
return f"{dom}:{bus}:{dev}.{fn}"
|
|
|
|
def _extract_pci_bus_id_from_dev(dev):
|
|
pat = re.compile(r"([0-9A-Fa-f]{4,8}:[0-9A-Fa-f]{1,2}:[0-9A-Fa-f]{1,2}(?:\.[0-7])?)")
|
|
for field in (getattr(dev, "id", ""), getattr(dev, "name", "")):
|
|
m = pat.search(str(field) or "")
|
|
if m:
|
|
norm = _normalize_pci_bus_id(m.group(1))
|
|
if norm:
|
|
return norm
|
|
return ""
|
|
|
|
def _addon_record_fields(entry):
|
|
if isinstance(entry, str):
|
|
return entry, "", False
|
|
if isinstance(entry, dict):
|
|
return (
|
|
str(entry.get("module") or ""),
|
|
str(entry.get("file") or ""),
|
|
bool(entry.get("is_package", False)),
|
|
)
|
|
return "", "", False
|
|
|
|
def _load_addon_from_source(mod_name, src_path, is_package=False):
|
|
try:
|
|
import importlib.util
|
|
if not mod_name or not src_path:
|
|
return False
|
|
src_path = os.path.abspath(src_path)
|
|
if os.path.isdir(src_path):
|
|
is_package = True
|
|
init_path = os.path.join(src_path, "__init__.py")
|
|
if not os.path.isfile(init_path):
|
|
return False
|
|
spec = importlib.util.spec_from_file_location(
|
|
mod_name, init_path, submodule_search_locations=[src_path]
|
|
)
|
|
elif is_package or os.path.basename(src_path).lower() == "__init__.py":
|
|
pkg_dir = os.path.dirname(src_path)
|
|
spec = importlib.util.spec_from_file_location(
|
|
mod_name, src_path, submodule_search_locations=[pkg_dir]
|
|
)
|
|
else:
|
|
spec = importlib.util.spec_from_file_location(mod_name, src_path)
|
|
if not spec or not spec.loader:
|
|
return False
|
|
mod = sys.modules.get(mod_name)
|
|
if mod is None:
|
|
mod = importlib.util.module_from_spec(spec)
|
|
sys.modules[mod_name] = mod
|
|
spec.loader.exec_module(mod)
|
|
if hasattr(mod, "register"):
|
|
try:
|
|
mod.register()
|
|
except Exception:
|
|
pass
|
|
return True
|
|
except Exception:
|
|
return False
|
|
|
|
def _enable_parent_addons():
|
|
activated = []
|
|
if not ADDONS_FILE or not addon_utils:
|
|
return activated
|
|
try:
|
|
with open(ADDONS_FILE, "r", encoding="utf-8") as fp:
|
|
payload = json.load(fp)
|
|
mods = payload.get("addons", []) if isinstance(payload, dict) else payload
|
|
if not isinstance(mods, (list, tuple)):
|
|
mods = []
|
|
req = len(mods)
|
|
ok = 0
|
|
fail = 0
|
|
loaded_from_source = 0
|
|
already_enabled = 0
|
|
for entry in mods:
|
|
mod_name, src_path, is_package = _addon_record_fields(entry)
|
|
if not mod_name:
|
|
continue
|
|
try:
|
|
st = addon_utils.check(mod_name)
|
|
if isinstance(st, tuple) and any(bool(v) for v in st):
|
|
ok += 1
|
|
already_enabled += 1
|
|
activated.append(mod_name)
|
|
continue
|
|
except Exception:
|
|
pass
|
|
try:
|
|
addon_utils.enable(mod_name, default_set=False, persistent=False)
|
|
ok += 1
|
|
activated.append(mod_name)
|
|
except Exception:
|
|
if _load_addon_from_source(mod_name, src_path, is_package=is_package):
|
|
ok += 1
|
|
loaded_from_source += 1
|
|
activated.append(mod_name)
|
|
else:
|
|
fail += 1
|
|
print(f"[MGPU-CHILD] Add-on sync: requested={req} enabled={ok} already_enabled={already_enabled} loaded_from_source={loaded_from_source} failed={fail}")
|
|
sys.stdout.flush()
|
|
except Exception as e:
|
|
print(f"[MGPU-CHILD] WARNING: addon sync failed: {e}")
|
|
sys.stdout.flush()
|
|
return activated
|
|
|
|
_SYNCED_ADDON_MODULES = _enable_parent_addons()
|
|
|
|
try:
|
|
ops = dir(bpy.ops.mari)
|
|
print("[MGPU-CHILD] bpy.ops.mari ->", ", ".join(ops))
|
|
except Exception:
|
|
print("[MGPU-CHILD] bpy.ops.mari namespace missing")
|
|
|
|
if MARI_PATH:
|
|
try:
|
|
if MARI_PATH not in sys.path:
|
|
sys.path.insert(0, MARI_PATH)
|
|
try:
|
|
import holo_mari_addon as _hma
|
|
except Exception:
|
|
import importlib.util
|
|
p = os.path.join(MARI_PATH, "__init__.py")
|
|
spec = importlib.util.spec_from_file_location("holo_mari_addon", p)
|
|
_hma = importlib.util.module_from_spec(spec)
|
|
sys.modules["holo_mari_addon"] = _hma
|
|
spec.loader.exec_module(_hma)
|
|
if hasattr(_hma, "register"):
|
|
_hma.register()
|
|
_SYNCED_ADDON_MODULES.append("holo_mari_addon")
|
|
print("[MGPU-CHILD] Loaded MARI addon from path:", MARI_PATH); sys.stdout.flush()
|
|
except Exception as e:
|
|
print("[MGPU-CHILD] ERROR loading MARI addon path:", e); sys.stdout.flush()
|
|
|
|
|
|
|
|
|
|
def _ensure_threads():
|
|
try:
|
|
if THREADS and THREADS > 0:
|
|
try:
|
|
bpy.context.preferences.system.threads = THREADS
|
|
except Exception:
|
|
pass
|
|
except Exception:
|
|
pass
|
|
|
|
_ensure_threads()
|
|
|
|
def _scene_override_kwargs(scn):
    """Build keyword arguments for ``bpy.context.temp_override`` targeting *scn*.

    Returns a dict with ``scene`` and, when resolvable, a ``view_layer`` entry:
    the currently active view layer's namesake in *scn* if one exists, else the
    scene's first view layer. Returns an empty dict when *scn* is None.
    """
    kw = {}
    if scn is None:
        return kw
    kw["scene"] = scn
    try:
        vls = getattr(scn, "view_layers", None)
        if vls:
            active_name = ""
            try:
                # Prefer the view layer matching the current context's active one.
                active_name = str(getattr(bpy.context.view_layer, "name", "") or "")
            except Exception:
                active_name = ""
            if active_name and active_name in vls:
                kw["view_layer"] = vls[active_name]
            else:
                kw["view_layer"] = vls[0]
    except Exception:
        # View-layer resolution is optional; the scene override alone is enough.
        pass
    return kw
|
|
|
|
def _run_with_scene_override(scn, fn):
    """Run *fn* with *scn* as the active scene and return its result.

    Strategy, in order:
    1. ``bpy.context.temp_override(scene=..., view_layer=...)`` (modern API).
    2. If that is unavailable, temporarily assign ``window.scene`` and restore
       it in a ``finally`` block (legacy fallback; also used when *scn* is None,
       in which case *fn* simply runs against the current context).
    """
    kw = _scene_override_kwargs(scn)
    if kw:
        try:
            ctx = bpy.context.temp_override(**kw)
        except Exception:
            # temp_override missing (older Blender) or rejected the kwargs.
            ctx = None
        if ctx is not None:
            with ctx:
                return fn()
    # Legacy path: swap the window's scene around the call.
    win = getattr(bpy.context, "window", None)
    prev_scene = None
    try:
        if win and scn is not None:
            prev_scene = win.scene
            win.scene = scn
    except Exception:
        prev_scene = None
    try:
        return fn()
    finally:
        try:
            if win and prev_scene is not None:
                win.scene = prev_scene
        except Exception:
            pass
|
|
|
|
def _handler_matches_modules(handler, modules):
|
|
hmod = str(getattr(handler, "__module__", "") or "")
|
|
if not hmod:
|
|
return False
|
|
for mod_name in (modules or []):
|
|
mod_name = str(mod_name or "")
|
|
if mod_name and (hmod == mod_name or hmod.startswith(mod_name + ".")):
|
|
return True
|
|
return False
|
|
|
|
def _replay_addon_load_post_handlers(modules):
    """Manually invoke ``load_post`` handlers owned by the synced add-ons.

    Add-ons enabled after the .blend file was opened never saw the load event,
    so their ``load_post`` handlers are called here by hand. Handlers are first
    called with the current filepath; a ``TypeError`` triggers a zero-argument
    retry. Failures are logged per handler and counted, never raised.
    """
    mods = [str(m or "") for m in (modules or []) if str(m or "")]
    if not mods:
        return
    try:
        handlers = list(getattr(bpy.app.handlers, "load_post", []) or [])
    except Exception:
        handlers = []
    if not handlers:
        return

    filepath = bpy.data.filepath or ""
    called = 0
    failed = 0
    for handler in handlers:
        # Only replay handlers that belong to modules we enabled ourselves.
        if not callable(handler) or not _handler_matches_modules(handler, mods):
            continue
        try:
            handler(filepath)
            called += 1
        except TypeError:
            # Some handlers take no arguments; retry without the filepath.
            try:
                handler()
                called += 1
            except Exception as e:
                failed += 1
                print(f"[MGPU-CHILD] WARNING: load_post replay failed for {getattr(handler, '__name__', '<handler>')}: {e}")
                sys.stdout.flush()
        except Exception as e:
            failed += 1
            print(f"[MGPU-CHILD] WARNING: load_post replay failed for {getattr(handler, '__name__', '<handler>')}: {e}")
            sys.stdout.flush()
    if called or failed:
        print(f"[MGPU-CHILD] load_post replay: called={called} failed={failed} modules={len(mods)}")
        sys.stdout.flush()
|
|
|
|
def _replay_addon_scene_handlers(handler_name, scn, modules):
    """Invoke the named ``bpy.app.handlers`` list for *scn*, add-on-filtered.

    *handler_name* is e.g. ``"frame_change_post"``. Only handlers owned by
    *modules* are called, each under a scene override so ``bpy.context.scene``
    resolves to *scn*. Handlers are called with the scene; on ``TypeError``
    they are retried without arguments. Failures are logged, never raised.
    """
    mods = [str(m or "") for m in (modules or []) if str(m or "")]
    if not scn or not mods:
        return
    try:
        handlers = list(getattr(bpy.app.handlers, handler_name, []) or [])
    except Exception:
        handlers = []
    for handler in handlers:
        if not callable(handler) or not _handler_matches_modules(handler, mods):
            continue
        try:
            # Bind the handler as a default arg so the lambda captures it now.
            _run_with_scene_override(scn, lambda _h=handler: _h(scn))
        except TypeError:
            try:
                _run_with_scene_override(scn, lambda _h=handler: _h())
            except Exception as e:
                print(f"[MGPU-CHILD] WARNING: {handler_name} replay failed for {getattr(handler, '__name__', '<handler>')}: {e}")
                sys.stdout.flush()
        except Exception as e:
            print(f"[MGPU-CHILD] WARNING: {handler_name} replay failed for {getattr(handler, '__name__', '<handler>')}: {e}")
            sys.stdout.flush()
|
|
|
|
def _force_scene_refresh(scn, frame=None):
    """Force a full re-evaluation of *scn* at *frame* (default: current frame).

    If the scene is already on the target frame, it is first bounced to an
    adjacent in-range frame so that ``frame_set(target)`` actually triggers
    dependency-graph and driver updates rather than being a no-op. Afterwards
    the view layer and depsgraph are updated and the synced add-ons'
    ``frame_change_post`` handlers are replayed. All steps are best effort.
    """
    if scn is None:
        return
    try:
        target = int(scn.frame_current if frame is None else frame)
    except Exception:
        try:
            target = int(scn.frame_current)
        except Exception:
            target = 0

    try:
        cur = int(scn.frame_current)
    except Exception:
        cur = target

    def _apply():
        # Runs under a scene override so bpy.context points at *scn*.
        try:
            if cur == target:
                # Already on target: detour to a neighbor frame first so the
                # subsequent frame_set(target) forces a real update.
                alt = target - 1
                try:
                    start = int(getattr(scn, "frame_start", target))
                    end = int(getattr(scn, "frame_end", target))
                    if alt < start and (target + 1) <= end:
                        alt = target + 1
                    if alt != target and start <= alt <= end:
                        try:
                            scn.frame_set(alt, subframe=0.0)
                        except TypeError:
                            # Older API without the subframe keyword.
                            scn.frame_set(alt)
                except Exception:
                    pass
            try:
                scn.frame_set(target, subframe=0.0)
            except TypeError:
                scn.frame_set(target)
        except Exception:
            pass

        try:
            bpy.context.view_layer.update()
        except Exception:
            pass
        try:
            deps = bpy.context.evaluated_depsgraph_get()
            upd = getattr(deps, "update", None)
            if callable(upd):
                upd()
        except Exception:
            pass
        try:
            # Give synced add-ons their frame-change notification as well.
            _replay_addon_scene_handlers("frame_change_post", scn, _SYNCED_ADDON_MODULES)
        except Exception:
            pass

    _run_with_scene_override(scn, _apply)
|
|
|
|
# After all add-ons are in place, simulate the file-load event they missed and
# force one refresh of the current scene so their state is consistent.
_replay_addon_load_post_handlers(_SYNCED_ADDON_MODULES)

try:
    scn0 = bpy.context.scene
    if scn0:
        _force_scene_refresh(scn0, scn0.frame_current)
except Exception:
    pass
|
|
|
|
# Cycles device setup:
# - GPU workers must stay on GPU (no silent CPU fallback).
# - If OptiX cannot enable devices, try CUDA as a fallback backend.
def _configure_cycles_devices():
    """Configure Cycles compute devices for this worker; return True on success.

    CPU workers (``USECPU``): enable only CPU devices. GPU workers: try the
    requested backend (``DEVICE``), then ``FALLBACK_DEVICE``, defaulting to
    CUDA; within a backend, select the target GPU by PCI bus id first, then by
    normalized name, then by being the single candidate. Without any selector,
    all devices of the backend are enabled. Returns False when no device could
    be enabled (GPU workers must then fail rather than render on CPU).
    """
    try:
        prefs = bpy.context.preferences.addons['cycles'].preferences
    except Exception as e:
        print(f"[MGPU-CHILD] WARNING: Cycles preferences unavailable: {e}")
        sys.stdout.flush()
        # For GPU workers, missing Cycles prefs must be treated as failure to
        # prevent silent CPU rendering.
        return bool(USECPU)

    scn = bpy.context.scene
    if USECPU:
        # CPU worker: force the scene to CPU and enable only CPU devices.
        try:
            scn.cycles.device = 'CPU'
        except Exception:
            pass
        try:
            prefs.refresh_devices()
        except Exception:
            pass
        cpu_enabled = 0
        for d in getattr(prefs, "devices", []):
            try:
                dtype = str(getattr(d, "type", "") or "").upper()
                if dtype == "CPU":
                    d.use = True
                    cpu_enabled += 1
                else:
                    d.use = False
            except Exception:
                pass
        print(f"[MGPU-CHILD] Cycles device setup: mode=CPU cpu_enabled={cpu_enabled}")
        sys.stdout.flush()
        return (cpu_enabled > 0)

    # GPU worker path
    try:
        scn.cycles.device = 'GPU'
    except Exception:
        pass

    wanted = str(DEVICE or "").upper()
    fallback = str(FALLBACK_DEVICE or "").upper()
    target_bus = _normalize_pci_bus_id(TARGET_GPU_BUS)
    target_name_norm = _normalize_gpu_name(TARGET_GPU_NAME)
    attempts = []
    if wanted:
        attempts.append(wanted)
    if fallback and fallback not in attempts:
        attempts.append(fallback)
    if not attempts:
        attempts = ["CUDA"]

    for backend in attempts:
        try:
            prefs.compute_device_type = backend
        except Exception as e:
            print(f"[MGPU-CHILD] Cycles backend set failed for {backend}: {e}")
            sys.stdout.flush()
            continue
        try:
            prefs.refresh_devices()
        except Exception as e:
            print(f"[MGPU-CHILD] Cycles refresh failed for {backend}: {e}")
            sys.stdout.flush()
            continue

        # Collect this backend's GPU devices; disable CPUs and other backends.
        candidates = []
        for d in getattr(prefs, "devices", []):
            try:
                dtype = str(getattr(d, "type", "") or "").upper()
                if dtype == "CPU":
                    d.use = False
                    continue
                if dtype != backend:
                    d.use = False
                    continue
                bus = _extract_pci_bus_id_from_dev(d)
                name = str(getattr(d, "name", dtype))
                candidates.append((d, bus, name, dtype))
            except Exception:
                pass

        selected = []
        # Primary selector: PCI bus id (stable and unique per physical GPU).
        if target_bus:
            for (d, bus, name, _dtype) in candidates:
                use_this = bool(bus and bus == target_bus)
                d.use = use_this
                if use_this:
                    selected.append((bus, name))

        # Secondary selector: normalized device name (only if bus match failed).
        if (not selected) and target_name_norm:
            picked = False
            for (d, _bus, name, _dtype) in candidates:
                # Only the first name match is enabled (names may repeat).
                use_this = (not picked) and (_normalize_gpu_name(name) == target_name_norm)
                d.use = use_this
                if use_this:
                    selected.append((_bus, name))
                    picked = True

        # Last resort: if only one candidate exists for this backend, use it.
        if (not selected) and len(candidates) == 1:
            d, bus, name, _dtype = candidates[0]
            d.use = True
            selected.append((bus, name))
            print(f"[MGPU-CHILD] Cycles selector fallback: single-candidate backend={backend} bus={bus or '-'} name={name}")
            sys.stdout.flush()

        # If no explicit selector was passed, keep previous behavior (all backend devices).
        if (not target_bus) and (not target_name_norm):
            selected = []
            for (d, bus, name, _dtype) in candidates:
                d.use = True
                selected.append((bus, name))

        enabled = len(selected)
        names = [n for (_b, n) in selected]
        print(
            f"[MGPU-CHILD] Cycles device setup: backend={backend} enabled_gpu={enabled} names={names} "
            f"target_bus={target_bus or '-'} target_name={TARGET_GPU_NAME or '-'}"
        )
        sys.stdout.flush()
        if enabled > 0:
            if wanted and backend != wanted:
                print(f"[MGPU-CHILD] Cycles backend fallback: requested={wanted} active={backend}")
                sys.stdout.flush()
            return True
        else:
            cands = [f"{b or '-'}:{n}" for (_d, b, n, _t) in candidates]
            print(f"[MGPU-CHILD] Cycles selector no-match for backend={backend}; candidates={cands}")
            sys.stdout.flush()

    print(f"[MGPU-CHILD] ERROR: No GPU devices enabled for backends={attempts}; CPU fallback disabled for GPU worker.")
    sys.stdout.flush()
    return False
|
|
|
|
# Outcome of the initial device configuration; render_frame() refuses Cycles
# frames when this is False rather than risk a silent CPU render.
_CYCLES_READY = _configure_cycles_devices()
|
|
|
|
def _enforce_cycles_scene_device(scn):
|
|
try:
|
|
if not scn or scn.render.engine != "CYCLES":
|
|
return
|
|
scn.cycles.device = 'CPU' if USECPU else 'GPU'
|
|
except Exception:
|
|
pass
|
|
|
|
# Apply the device policy to every scene in the file, not just the active one,
# in case the job later switches scenes.
try:
    for _scn in list(getattr(bpy.data, "scenes", []) or []):
        _enforce_cycles_scene_device(_scn)
except Exception:
    pass
|
|
|
|
def _cycles_runtime_report():
    """Snapshot the live Cycles device state for logging and policy checks.

    Returns a dict with: ``scene_device`` (scene's CPU/GPU choice),
    ``compute_device_type`` (active backend), ``cpu_enabled`` (bool),
    ``gpu_enabled`` (count) and ``enabled`` (list of "TYPE:name" strings).
    An ``error`` key is added if preference inspection fails.
    """
    rep = {
        "scene_device": None,
        "compute_device_type": None,
        "cpu_enabled": None,
        "gpu_enabled": 0,
        "enabled": [],
    }
    try:
        scn = bpy.context.scene
        rep["scene_device"] = str(getattr(getattr(scn, "cycles", None), "device", "") or "").upper()
    except Exception:
        pass
    try:
        prefs = bpy.context.preferences.addons['cycles'].preferences
        rep["compute_device_type"] = str(getattr(prefs, "compute_device_type", "") or "").upper()
        try:
            # Refresh so the report reflects current hardware enumeration.
            prefs.refresh_devices()
        except Exception:
            pass
        cpu_enabled = False
        gpu_enabled = 0
        enabled = []
        for d in getattr(prefs, "devices", []):
            try:
                use = bool(getattr(d, "use", False))
                dtype = str(getattr(d, "type", "") or "").upper()
                name = str(getattr(d, "name", dtype))
                if not use:
                    continue
                enabled.append(f"{dtype}:{name}")
                if dtype == "CPU":
                    cpu_enabled = True
                else:
                    gpu_enabled += 1
            except Exception:
                pass
        rep["cpu_enabled"] = cpu_enabled
        rep["gpu_enabled"] = gpu_enabled
        rep["enabled"] = enabled
    except Exception as e:
        rep["error"] = str(e)
    return rep
|
|
|
|
def _cycles_policy_ok(require_gpu):
    """Validate the live Cycles state against this worker's device policy.

    GPU policy: scene device is GPU, at least one GPU device is enabled, and
    no CPU device is enabled. CPU policy: scene device is CPU and a CPU device
    is enabled. Returns ``(ok, report)`` where *report* is the runtime snapshot
    the decision was made from.
    """
    rep = _cycles_runtime_report()
    scene_dev = rep.get("scene_device")
    if require_gpu:
        gpu_count = int(rep.get("gpu_enabled", 0) or 0)
        cpu_on = bool(rep.get("cpu_enabled"))
        ok = (scene_dev == "GPU") and (gpu_count > 0) and (not cpu_on)
    else:
        ok = (scene_dev == "CPU") and bool(rep.get("cpu_enabled"))
    return ok, rep
|
|
|
|
def _ensure_cycles_policy(require_gpu, phase):
    """Check the device policy, attempting one reconfiguration on failure.

    *phase* is a label used in log lines (e.g. "pre-frame-12"). On first
    failure the devices are reconfigured and the scene device re-pinned, then
    the policy is re-checked. Returns ``(ok, report)``; a final failure is
    logged as ``GPU_POLICY_VIOLATION`` but not raised.
    """
    ok, rep = _cycles_policy_ok(require_gpu)
    if ok:
        return True, rep
    # One recovery attempt: rebuild device selection and scene device.
    try:
        _configure_cycles_devices()
    except Exception:
        pass
    try:
        _enforce_cycles_scene_device(bpy.context.scene)
    except Exception:
        pass
    ok2, rep2 = _cycles_policy_ok(require_gpu)
    if ok2:
        print(f"[MGPU-CHILD] Cycles policy recovered at {phase}: {rep2}")
        sys.stdout.flush()
        return True, rep2
    print(f"[MGPU-CHILD] GPU_POLICY_VIOLATION at {phase}: {rep2}")
    sys.stdout.flush()
    return False, rep2
|
|
|
|
def _set_enum_if_valid(owner, prop_name, value):
|
|
try:
|
|
prop = owner.bl_rna.properties.get(prop_name)
|
|
if not prop:
|
|
return False
|
|
items = [e.identifier for e in prop.enum_items]
|
|
if value in items:
|
|
setattr(owner, prop_name, value)
|
|
return True
|
|
except Exception:
|
|
pass
|
|
return False
|
|
|
|
def _configure_cycles_denoiser():
    """Switch active Cycles denoisers to OptiX on GPU workers, when allowed.

    No-ops when the scene is not Cycles, when the add-on setting
    ``DENOISE_GPU`` is off, or on CPU workers. Only denoisers that are already
    enabled (``use_denoising`` / ``use_preview_denoising``) are switched, and
    only when "OPTIX" is a valid enum item for the running Blender build.
    Changes are logged; nothing is raised.
    """
    try:
        scn = bpy.context.scene
    except Exception:
        return
    if not scn or scn.render.engine != "CYCLES":
        return
    if not bool(DENOISE_GPU):
        print("[MGPU-CHILD] Cycles denoise policy: disabled by add-on setting.")
        sys.stdout.flush()
        return
    if bool(USECPU):
        print("[MGPU-CHILD] Cycles denoise policy: CPU worker, leaving denoiser unchanged.")
        sys.stdout.flush()
        return

    changed = []
    try:
        c = scn.cycles
    except Exception:
        c = None
    if c is not None:
        # Scene-level final and preview denoisers.
        try:
            if bool(getattr(c, "use_denoising", False)):
                if _set_enum_if_valid(c, "denoiser", "OPTIX"):
                    changed.append("scene.cycles.denoiser=OPTIX")
        except Exception:
            pass
        try:
            if bool(getattr(c, "use_preview_denoising", False)):
                if _set_enum_if_valid(c, "preview_denoiser", "OPTIX"):
                    changed.append("scene.cycles.preview_denoiser=OPTIX")
        except Exception:
            pass

    # Per-view-layer denoisers (compositor denoise passes).
    try:
        for vl in list(getattr(scn, "view_layers", []) or []):
            vc = getattr(vl, "cycles", None)
            if not vc:
                continue
            if bool(getattr(vc, "use_denoising", False)):
                if _set_enum_if_valid(vc, "denoiser", "OPTIX"):
                    changed.append(f"view_layer[{vl.name}].cycles.denoiser=OPTIX")
    except Exception:
        pass

    if changed:
        print(f"[MGPU-CHILD] Cycles denoise policy: GPU denoiser configured ({'; '.join(changed)}).")
    else:
        print("[MGPU-CHILD] Cycles denoise policy: no active denoiser properties changed.")
    sys.stdout.flush()
|
|
|
|
# One-time init: configure the denoiser and log the device state, then apply
# the persistent-data flag requested by the parent (PERSIST global).
try:
    if bpy.context.scene and bpy.context.scene.render.engine == "CYCLES":
        _configure_cycles_denoiser()
        print(f"[MGPU-CHILD] Cycles runtime report(init): {_cycles_runtime_report()}"); sys.stdout.flush()
except Exception:
    pass

try:
    bpy.context.scene.render.use_persistent_data = bool(PERSIST)
except Exception:
    pass
|
|
|
|
# If we are building a video in the parent, workers render a temp image sequence.
# SEQ_DIR / MODE are injected globals; frames land as "frame_<n>" under SEQ_DIR.
try:
    if MODE == "FRAMES" and SEQ_DIR:
        os.makedirs(SEQ_DIR, exist_ok=True)
        scn = bpy.context.scene
        try:
            scn.render.image_settings.color_mode = "RGBA"
        except Exception:
            pass
        try:
            # Newer Blender splits image/video via media_type; force image output.
            if hasattr(scn.render.image_settings, "media_type"):
                scn.render.image_settings.media_type = "IMAGE"
        except Exception:
            pass
        scn.render.filepath = os.path.join(SEQ_DIR, "frame_")
        try:
            scn.render.use_file_extension = True
        except Exception:
            pass
except Exception as e:
    print(f"[MGPU-CHILD] WARNING: sequence bootstrap setup failed: {e}"); sys.stdout.flush()
|
|
|
|
# --- Rebase MARI output folder to original .blend directory ---
# The child opens a copied .blend, so Blender-relative "//" paths would resolve
# against the copy's directory; rebase them onto SRC_DIR (the original's dir).
try:
    scn = bpy.context.scene
    prop = getattr(scn, "mari_props", None)
    if prop:
        raw = getattr(prop, "render_settings_filepath", "") or ""
        name = getattr(prop, "render_settings_name", "") or ""
        rebased = raw
        # If Blender-style relative path ("//..."), rebase against SRC_DIR
        if raw.startswith("//") and SRC_DIR:
            rebased = os.path.normpath(os.path.join(SRC_DIR, raw[2:]))
        else:
            # Resolve any other path using Blender's abspath (will be absolute already)
            rebased = bpy.path.abspath(raw)
        # Ensure a trailing separator
        if not rebased.endswith(os.sep):
            rebased += os.sep
        # Persist back so ALL operators (including bpy.ops.mari.render_one) use the corrected absolute path
        prop.render_settings_filepath = rebased
        print(f"[MGPU-CHILD] Rebased MARI output dir to: {rebased} (name='{name}')"); sys.stdout.flush()
except Exception as e:
    print("[MGPU-CHILD] WARNING: Could not rebase MARI output path:", e); sys.stdout.flush()
|
|
|
|
def _mari_ext_from_settings(scn):
|
|
# Match MARI add-on's extension mapping so filenames match everywhere
|
|
ff = scn.render.image_settings.file_format.lower()
|
|
if ff == "ffmpeg":
|
|
# MARI uses ffmpeg.format to decide container (e.g. mkv)
|
|
fmt = scn.render.ffmpeg.format
|
|
return {"MPEG1":"mpeg1","MPEG2":"mpeg2","MPEG4":"mp4","AVI":"avi","QUICKTIME":"mov",
|
|
"DV":"dv","OGG":"ogg","MKV":"mkv","FLASH":"flv","WEBM":"webm"}.get(fmt, "mkv")
|
|
return {
|
|
"jpeg":"jpeg","jpeg_2000":"jpeg","iris":"rgb",
|
|
"targa":"tga","targa_raw":"tga","cineon":"cin",
|
|
"open_exr":"exr","open_exr_multilayer":"exr",
|
|
"tiff":"tif","avi_jpeg":"avi","avi_raw":"avi",
|
|
"png":"png","bmp":"bmp"
|
|
}.get(ff, ff)
|
|
|
|
def _fix_mari_still_output_name(scn, H, V):
    """Normalize a MARI still's filename to ``<name>_H<H>_V<V>.<ext>``.

    Blender may append frame numbers to still outputs (e.g. ``..._0001``);
    this renames the newest such numbered sibling onto the canonical name and
    deletes the rest. If the canonical file already exists and is non-empty,
    the numbered duplicates are simply removed. Entirely best effort: any
    failure leaves the directory as-is.
    """
    try:
        prop = getattr(scn, "mari_props", None)
        if not prop:
            return

        base_dir = bpy.path.abspath(getattr(prop, "render_settings_filepath", ""))
        name = (getattr(prop, "render_settings_name", "") or "").strip()
        if not (base_dir and name):
            return
        if not base_dir.endswith(os.sep):
            base_dir += os.sep

        # Output root is "<base_dir>/<name>/".
        root = os.path.join(base_dir, name)
        if not os.path.isdir(root):
            return

        ext = (_mari_ext_from_settings(scn) or "").lower().lstrip(".")
        if not ext:
            return

        stem = f"{name}_H{int(H)}_V{int(V)}"
        final_path = os.path.join(root, f"{stem}.{ext}")

        # Collect frame-numbered variants: "<stem><digits>.<ext>" with >= 3 digits.
        candidates = []
        for fname in os.listdir(root):
            full = os.path.join(root, fname)
            if not os.path.isfile(full):
                continue
            fstem, fext = os.path.splitext(fname)
            if fext.lower().lstrip(".") != ext:
                continue
            if not fstem.startswith(stem):
                continue
            suffix = fstem[len(stem):]
            if suffix and len(suffix) >= 3 and suffix.isdigit():
                candidates.append(full)

        # Canonical file already good: just clean up the numbered duplicates.
        if os.path.isfile(final_path) and os.path.getsize(final_path) > 0:
            for extra in candidates:
                try:
                    os.remove(extra)
                except Exception:
                    pass
            return

        if not candidates:
            return

        # Promote the most recent candidate to the canonical name.
        candidates.sort(key=lambda p: os.path.getmtime(p), reverse=True)
        os.replace(candidates[0], final_path)
        for extra in candidates[1:]:
            try:
                os.remove(extra)
            except Exception:
                pass
    except Exception:
        pass
|
|
|
|
def _prime_mari_output_for_frame(scn, H, V, action):
    """Point ``scn.render.filepath`` at the MARI output location for (H, V).

    Layout under ``<render_settings_filepath>/<name>/``:
    - ANIM image sequence: per-camera folder ``<name>_H<H>_V<V>/<name>_`` so
      Blender appends frame numbers itself.
    - STILL, or ANIM video: ``<name>_H<H>_V<V>`` directly in the root.
    Directories are created as needed; a missing ``mari_props`` is a no-op.
    """
    prop = getattr(scn, "mari_props", None)
    if not prop:
        return

    base_dir = bpy.path.abspath(prop.render_settings_filepath)
    if not base_dir.endswith(os.sep):
        base_dir += os.sep
    name = prop.render_settings_name
    # NOTE(review): computed but never used below — presumably a leftover; confirm.
    ext = _mari_ext_from_settings(scn)

    # Root "<base>\<name>\"
    root = os.path.join(base_dir, name)
    os.makedirs(root, exist_ok=True)

    # For image-sequence ANIM, make the per-camera folder and let Blender append frame numbers (NAME_0001, NAME_0002 ...)
    if action == "ANIM" and scn.render.image_settings.file_format.lower() != "ffmpeg":
        cam_dir = os.path.join(root, f"{name}_H{int(H)}_V{int(V)}")
        os.makedirs(cam_dir, exist_ok=True)
        scn.render.filepath = os.path.join(cam_dir, f"{name}_")
    elif action == "STILL":
        scn.render.filepath = os.path.join(root, f"{name}_H{int(H)}_V{int(V)}")
    else:
        # Video ANIM: point to final video base (per camera)
        scn.render.filepath = os.path.join(root, f"{name}_H{int(H)}_V{int(V)}")
    try:
        scn.render.use_file_extension = True
    except Exception:
        pass
|
|
|
|
def jsend(sock, obj):
    """Serialize *obj* as a single newline-terminated JSON line onto *sock*."""
    line = json.dumps(obj) + "\n"
    sock.sendall(line.encode("utf-8", "ignore"))
|
|
|
|
def jrecv(sock):
    """Receive one newline-terminated JSON message from *sock* and decode it.

    Blocks until a full line arrives; raises ``ConnectionError`` if the peer
    closes first. NOTE(review): bytes buffered after the first newline are
    discarded — presumably the protocol is strictly one request/reply line per
    exchange; confirm before reusing elsewhere.
    """
    data = b""
    while b"\n" not in data:
        chunk = sock.recv(4096)
        if not chunk:
            raise ConnectionError("server closed")
        data += chunk
    head, _trailing = data.split(b"\n", 1)
    return json.loads(head.decode("utf-8", "ignore"))
|
|
|
|
def _proj_bar(done, total, width=30):
|
|
try:
|
|
total = int(total)
|
|
done = max(0, int(done))
|
|
if total <= 0:
|
|
return "[------------------------------]", 0.0
|
|
ratio = min(1.0, done / float(total))
|
|
filled = int(round(width * ratio))
|
|
return "[" + ("#" * filled) + ("-" * (width - filled)) + "]", ratio * 100.0
|
|
except Exception:
|
|
return "[------------------------------]", 0.0
|
|
|
|
def _proj_print(H, V, elapsed, glb):
    """Emit one parent-parseable projection-progress line for camera (H, V).

    *glb* carries per-job globals from the parent (``proj_total`` /
    ``proj_done``); the printed count is the prior done-count plus this
    completion, clamped to the total when a total is known.
    """
    total = int(glb.get("proj_total") or 0)
    done_now = int(glb.get("proj_done") or 0) + 1
    if total > 0:
        done_now = min(total, done_now)
    bar, pct = _proj_bar(done_now, total)

    def _coerce_index(value):
        # Indices are usually ints, but pass unparsable values through as-is.
        try:
            return int(value)
        except Exception:
            return value

    h = _coerce_index(H)
    v = _coerce_index(V)
    print(f"[MGPU-PROJ] H{h}_V{v} | {float(elapsed):.2f}s | {bar} {pct:.1f}% ({done_now}/{total})")
    sys.stdout.flush()
|
|
|
|
def _safe_out_path(scn, n):
    """Return the absolute output file path for frame *n* of *scn*.

    Uses Blender's native ``frame_path`` when available; otherwise falls back
    to expanding ``#`` placeholders in the render filepath, or appending a
    zero-padded 4-digit frame number plus the scene's file extension.
    """
    # Native Blender path for frame n
    try:
        p = scn.render.frame_path(frame=n)
        return bpy.path.abspath(p)
    except Exception:
        base = bpy.path.abspath(scn.render.filepath)
        if "#" in base:
            # Replace the "#" run with the frame number padded to its length.
            hashes = len(re.search(r"(#+)", base).group(1))
            return re.sub(r"(#+)", str(n).zfill(hashes), base)
        else:
            root, ext = os.path.splitext(base)
            if not ext:
                ext = "." + (scn.render.file_extension or "png")
            return f"{root}{str(n).zfill(4)}{ext}"
|
|
|
|
def _scene_expected_image_size(scn):
|
|
try:
|
|
pct = float(getattr(scn.render, "resolution_percentage", 100) or 100.0)
|
|
w = int(round(float(scn.render.resolution_x) * pct / 100.0))
|
|
h = int(round(float(scn.render.resolution_y) * pct / 100.0))
|
|
if w > 0 and h > 0:
|
|
return (w, h)
|
|
except Exception:
|
|
pass
|
|
return None
|
|
|
|
def _is_valid_render_output(path, expected_size=None):
    """Check that *path* is a readable, non-empty image of the expected size.

    The file is loaded through ``bpy.data.images`` (and removed again in the
    ``finally`` block) to verify it decodes with positive dimensions. When
    *expected_size* is given, the dimensions must match exactly. Any failure
    returns False.
    """
    try:
        if os.path.getsize(path) <= 0:
            return False
    except Exception:
        # Missing file / stat error counts as invalid.
        return False

    img = None
    try:
        # check_existing=False forces a fresh load rather than reusing a cache.
        img = bpy.data.images.load(path, check_existing=False)
        size = getattr(img, "size", None)
        if not (size and size[0] > 0 and size[1] > 0):
            return False
        if expected_size:
            try:
                exp_w = int(expected_size[0]); exp_h = int(expected_size[1])
                return int(size[0]) == exp_w and int(size[1]) == exp_h
            except Exception:
                return False
        return True
    except Exception:
        return False
    finally:
        # Always release the temporary datablock to avoid accumulating images.
        if img is not None:
            try:
                bpy.data.images.remove(img)
            except Exception:
                pass
|
|
|
|
def _render_meta(rendered=False, skipped=False, elapsed=0.0):
|
|
try:
|
|
elapsed = float(elapsed or 0.0)
|
|
except Exception:
|
|
elapsed = 0.0
|
|
return {
|
|
"rendered": bool(rendered),
|
|
"skipped": bool(skipped),
|
|
"elapsed": elapsed,
|
|
}
|
|
|
|
_MARI_RENDER_ONE_STATUS_KEY = "_mari_render_one_status"
|
|
|
|
def _mari_read_render_status(scn):
|
|
raw = ""
|
|
try:
|
|
raw = str(scn.get(_MARI_RENDER_ONE_STATUS_KEY, "") or "").upper()
|
|
except Exception:
|
|
raw = ""
|
|
return {
|
|
"status": raw,
|
|
"rendered": raw == "RENDERED",
|
|
"skipped": raw == "SKIPPED",
|
|
}
|
|
|
|
# Fallback state for the temp-sequence save path: once the scene's image
# settings reject SEQ_FMT, switch permanently to saving the Render Result
# datablock directly, logging the warning only once.
_SEQ_DIRECT_SAVE = False
_SEQ_DIRECT_SAVE_LOGGED = False
|
|
|
|
def _render_result_image():
    """Find the internal "Render Result" image datablock, or None.

    Prefers the conventional datablock name; if that lookup fails (e.g. a
    localized or renamed datablock), scans all images for one whose ``type``
    is ``RENDER_RESULT``.
    """
    by_name = bpy.data.images.get("Render Result")
    if by_name is not None:
        return by_name
    for candidate in bpy.data.images:
        try:
            if getattr(candidate, "type", "") == "RENDER_RESULT":
                return candidate
        except Exception:
            continue
    return None
|
|
|
|
def _save_render_result_to_file(out_path, file_format="PNG"):
    """Save the current Render Result datablock to *out_path*.

    Temporarily retargets the datablock's filepath and format, saves, then
    restores the previous values. Raises ``RuntimeError`` when no Render
    Result image exists (i.e. nothing has been rendered yet).
    """
    img = _render_result_image()
    if img is None:
        raise RuntimeError("Render Result image not found after frame render.")
    # Remember previous settings so the datablock is left untouched.
    prev_raw = getattr(img, "filepath_raw", "")
    prev_fmt = getattr(img, "file_format", "PNG")
    try:
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
    except Exception:
        pass
    try:
        img.filepath_raw = out_path
    except Exception:
        # Older API without filepath_raw.
        img.filepath = out_path
    try:
        img.file_format = str(file_format or "PNG").upper()
    except Exception:
        img.file_format = "PNG"
    img.save()
    # Best-effort restore of the datablock's previous state.
    try:
        img.filepath_raw = prev_raw
    except Exception:
        pass
    try:
        img.file_format = prev_fmt
    except Exception:
        pass
|
|
|
|
def _render_frame_via_sandbox(main_scene, out_path, frame):
    """Render *frame* of *main_scene* into *out_path* as a temp-sequence image.

    The real scene is used (not a copy) so scene-bound add-ons keep their
    state; its output settings are swapped to the temp-sequence format
    (``SEQ_FMT``, RGBA, 16-bit, no compression) for the duration of the call
    and restored afterwards in the ``finally`` block. If the scene's image
    settings reject ``SEQ_FMT``, a process-wide fallback saves the Render
    Result datablock directly instead (flagged once via the module globals).
    Returns the operator result set, or ``{'FINISHED'}`` on the direct-save
    path.
    """
    global _SEQ_DIRECT_SAVE, _SEQ_DIRECT_SAVE_LOGGED
    # Snapshot every render/image setting we are about to touch.
    prev_fp = main_scene.render.filepath
    prev_use_ext = getattr(main_scene.render, "use_file_extension", True)
    img = main_scene.render.image_settings
    prev_fmt = getattr(img, "file_format", "PNG")
    prev_mode = getattr(img, "color_mode", "RGBA")
    prev_depth = getattr(img, "color_depth", "8")
    prev_comp = getattr(img, "compression", 15)
    prev_media = getattr(img, "media_type", None) if hasattr(img, "media_type") else None
    try:
        # Render from the real scene so scene-bound add-ons keep their state.
        _force_scene_refresh(main_scene, frame)

        use_direct_save = bool(_SEQ_DIRECT_SAVE)
        if not use_direct_save:
            try:
                # High-fidelity intermediate frames: SEQ_FMT, RGBA, 16-bit, raw.
                img.file_format = SEQ_FMT
                img.color_mode = "RGBA"
                img.color_depth = "16"
                img.compression = 0
                try:
                    if hasattr(img, "media_type"):
                        img.media_type = "IMAGE"
                except Exception:
                    pass
            except Exception as fmt_err:
                # Scene rejects the temp format: permanently switch to saving
                # the Render Result datablock ourselves (log only once).
                use_direct_save = True
                _SEQ_DIRECT_SAVE = True
                if not _SEQ_DIRECT_SAVE_LOGGED:
                    print(
                        f"[MGPU-CHILD] {TAG} WARN: temp frame format {SEQ_FMT} unavailable on scene render settings; "
                        f"using Render Result direct-save fallback ({fmt_err})"
                    )
                    sys.stdout.flush()
                    _SEQ_DIRECT_SAVE_LOGGED = True

        # Let Blender append the extension to the bare stem.
        main_scene.render.filepath = os.path.splitext(out_path)[0]
        try:
            main_scene.render.use_file_extension = True
        except Exception:
            pass

        if use_direct_save:
            # Render without writing, then save the Render Result manually.
            _run_with_scene_override(
                main_scene,
                lambda: bpy.ops.render.render(write_still=False, animation=False, use_viewport=False),
            )
            _save_render_result_to_file(out_path, file_format=SEQ_FMT)
            return {'FINISHED'}

        return _run_with_scene_override(
            main_scene,
            lambda: bpy.ops.render.render(write_still=True, animation=False, use_viewport=False),
        )
    finally:
        # Restore every touched setting, each independently best effort.
        main_scene.render.filepath = prev_fp
        try:
            main_scene.render.use_file_extension = prev_use_ext
        except Exception:
            pass
        try:
            img.file_format = prev_fmt
        except Exception:
            pass
        try:
            img.color_mode = prev_mode
        except Exception:
            pass
        try:
            img.color_depth = prev_depth
        except Exception:
            pass
        try:
            img.compression = prev_comp
        except Exception:
            pass
        try:
            if hasattr(img, "media_type") and prev_media is not None:
                img.media_type = prev_media
        except Exception:
            pass
|
|
|
|
def render_frame(n):
    """Render frame *n* in this worker and report the outcome.

    Returns ``(ok, message, meta)`` where *meta* comes from ``_render_meta``.
    For Cycles scenes the device policy is verified before and after the
    render (GPU workers must never fall back to CPU). Two output paths exist:

    - ``SEQ_DIR`` set: temp-sequence mode; the frame is written as
      ``frame_<n><SEQ_EXT>`` via ``_render_frame_via_sandbox``.
    - otherwise: the scene's own output path for frame *n* is used directly.

    Both paths honor ``use_overwrite`` (skipping frames that already exist and
    validate, unless ``PRECHECKED_EXISTING`` says the parent pre-filtered) and
    ``use_placeholder`` (touching an empty file before rendering).
    """
    scn = bpy.context.scene
    expected_size = _scene_expected_image_size(scn)
    if scn.render.engine == "CYCLES":
        _enforce_cycles_scene_device(scn)
        if not _CYCLES_READY:
            return False, "Cycles device setup failed (no eligible GPU/CPU device configured for this worker).", _render_meta()
        ok_policy, rep = _ensure_cycles_policy(require_gpu=(not bool(USECPU)), phase=f"pre-frame-{n}")
        if not ok_policy:
            return False, f"GPU_POLICY_VIOLATION pre-frame-{n}: {rep}", _render_meta()
    _force_scene_refresh(scn, n)

    if SEQ_DIR:
        # Temp-sequence path (parent will assemble the video afterwards).
        out_path = os.path.join(SEQ_DIR, f"frame_{n:04d}{SEQ_EXT}")
        if (not PRECHECKED_EXISTING) and (not getattr(scn.render, "use_overwrite", True)):
            try:
                # Skip frames that already exist and pass validation.
                if os.path.exists(out_path) and _is_valid_render_output(out_path, expected_size=expected_size):
                    start = time.time()
                    print(f"[MGPU-CHILD] {TAG} start frame {n} -> {out_path}"); sys.stdout.flush()
                    elapsed = time.time() - start
                    print(f"[MGPU-CHILD] {TAG} finished frame {n} ({elapsed:.2f}s) -> {out_path}"); sys.stdout.flush()
                    return True, "Skipped existing frame (overwrite disabled)", _render_meta(rendered=False, skipped=True, elapsed=elapsed)
            except Exception:
                pass
        try:
            os.makedirs(os.path.dirname(out_path), exist_ok=True)
        except Exception:
            pass
        if getattr(scn.render, "use_placeholder", False):
            # Claim the frame with an empty placeholder file.
            try:
                if not os.path.exists(out_path):
                    with open(out_path, "wb"):
                        pass
            except Exception:
                pass
        try:
            start = time.time()
            print(f"[MGPU-CHILD] {TAG} start frame {n} -> {out_path}"); sys.stdout.flush()
            _render_frame_via_sandbox(scn, out_path, n)
            ok = os.path.exists(out_path) and os.path.getsize(out_path) > 0
            elapsed = time.time() - start
            if ok and scn.render.engine == "CYCLES":
                # Re-verify the device policy didn't drift during the render.
                ok_policy, rep = _ensure_cycles_policy(require_gpu=(not bool(USECPU)), phase=f"post-frame-{n}")
                if not ok_policy:
                    return False, f"GPU_POLICY_VIOLATION post-frame-{n}: {rep}", _render_meta(rendered=True, elapsed=elapsed)
            if ok:
                print(f"[MGPU-CHILD] {TAG} finished frame {n} ({elapsed:.2f}s) -> {out_path}"); sys.stdout.flush()
                return True, "", _render_meta(rendered=True, elapsed=elapsed)
            else:
                print(f"[MGPU-CHILD] {TAG} MISSING frame {n} ({elapsed:.2f}s) -> {out_path}"); sys.stdout.flush()
                return False, f"Rendered file missing or empty: {out_path}", _render_meta(elapsed=elapsed)
        except Exception as e:
            print(f"[MGPU-CHILD] {TAG} ERROR frame {n}: {e}"); sys.stdout.flush()
            return False, str(e), _render_meta()

    # Direct-output path: render into the scene's own output location.
    prev_fp = scn.render.filepath
    prev_use_ext = getattr(scn.render, "use_file_extension", True)
    out_path = _safe_out_path(scn, n)
    if (not PRECHECKED_EXISTING) and (not getattr(scn.render, "use_overwrite", True)):
        try:
            if os.path.exists(out_path) and _is_valid_render_output(out_path, expected_size=expected_size):
                start = time.time()
                print(f"[MGPU-CHILD] {TAG} start frame {n} -> {out_path}"); sys.stdout.flush()
                elapsed = time.time() - start
                print(f"[MGPU-CHILD] {TAG} finished frame {n} ({elapsed:.2f}s) -> {out_path}"); sys.stdout.flush()
                return True, "Skipped existing frame (overwrite disabled)", _render_meta(rendered=False, skipped=True, elapsed=elapsed)
        except Exception:
            pass
    try:
        os.makedirs(os.path.dirname(out_path), exist_ok=True)
    except Exception:
        pass
    if getattr(scn.render, "use_placeholder", False):
        try:
            if not os.path.exists(out_path):
                with open(out_path, "wb"):
                    pass
        except Exception:
            pass

    try:
        scn.render.filepath = out_path
        try:
            scn.render.use_file_extension = False  # out_path already has extension
        except Exception:
            pass

        start = time.time()
        print(f"[MGPU-CHILD] {TAG} start frame {n} -> {out_path}"); sys.stdout.flush()
        _run_with_scene_override(
            scn,
            lambda: bpy.ops.render.render(animation=False, write_still=True, use_viewport=False),
        )
        ok = os.path.exists(out_path) and os.path.getsize(out_path) > 0
        elapsed = time.time() - start
        if ok and scn.render.engine == "CYCLES":
            ok_policy, rep = _ensure_cycles_policy(require_gpu=(not bool(USECPU)), phase=f"post-frame-{n}")
            if not ok_policy:
                return False, f"GPU_POLICY_VIOLATION post-frame-{n}: {rep}", _render_meta(rendered=True, elapsed=elapsed)
        if ok:
            print(f"[MGPU-CHILD] {TAG} finished frame {n} ({elapsed:.2f}s) -> {out_path}"); sys.stdout.flush()
            return True, "", _render_meta(rendered=True, elapsed=elapsed)
        else:
            print(f"[MGPU-CHILD] {TAG} MISSING frame {n} ({elapsed:.2f}s) -> {out_path}"); sys.stdout.flush()
            return False, f"Rendered file missing or empty: {out_path}", _render_meta(elapsed=elapsed)
    except Exception as e:
        print(f"[MGPU-CHILD] {TAG} ERROR frame {n}: {e}"); sys.stdout.flush()
        return False, str(e), _render_meta()
    finally:
        # Always restore the scene's original output settings.
        scn.render.filepath = prev_fp
        try:
            scn.render.use_file_extension = prev_use_ext
        except Exception:
            pass
|
|
|
|
def _ensure_mari_enabled():
    """Ensure the MARI add-on is available; return True when usable.

    If the parent injected ``MARI_PATH``, the add-on was already imported and
    registered during bootstrap. Otherwise scan installed add-on modules for
    one whose ``bl_info`` name contains "mari" and enable it. Returns False
    when nothing matching is found or enabling fails.
    """
    # If we injected a path, we already imported & registered it
    if MARI_PATH:
        return True
    try:
        import addon_utils
        for m in addon_utils.modules():
            bi = getattr(m, "bl_info", {}) or {}
            nm = (bi.get("name") or "").lower()
            if "mari" in nm:
                addon_utils.enable(m.__name__, default_set=True, persistent=True)
                return True
        return False
    except Exception:
        return False
|
|
|
|
|
|
def _mari_prop_mode_id(value):
|
|
key = str(value or "").upper()
|
|
if key == "FRAME":
|
|
return "FRAME"
|
|
return "CRICLE"
|
|
|
|
|
|
def _apply_mari_scene_settings(scn, glb):
|
|
prop = getattr(scn, "mari_props", None)
|
|
settings = dict(glb.get("mari_settings") or {})
|
|
|
|
try:
|
|
if "render_resolution_x" in glb:
|
|
scn.render.resolution_x = int(glb.get("render_resolution_x"))
|
|
if "render_resolution_y" in glb:
|
|
scn.render.resolution_y = int(glb.get("render_resolution_y"))
|
|
if "render_resolution_percentage" in glb:
|
|
scn.render.resolution_percentage = int(glb.get("render_resolution_percentage"))
|
|
except Exception:
|
|
pass
|
|
|
|
if not prop:
|
|
return
|
|
|
|
vector_props = ("frame_ratio", "frame_dimensions", "frame_center", "frame_rotation")
|
|
for name in vector_props:
|
|
if name not in settings:
|
|
continue
|
|
value = settings.get(name)
|
|
try:
|
|
setattr(prop, name, tuple(value))
|
|
continue
|
|
except Exception:
|
|
pass
|
|
try:
|
|
seq = tuple(value)
|
|
cur = getattr(prop, name)
|
|
for idx in range(min(len(cur), len(seq))):
|
|
cur[idx] = seq[idx]
|
|
except Exception:
|
|
pass
|
|
|
|
scalar_props = ("render_settings_filepath", "render_settings_name", "render_settings_normalize")
|
|
for name in scalar_props:
|
|
if name not in settings:
|
|
continue
|
|
try:
|
|
setattr(prop, name, settings.get(name))
|
|
except Exception:
|
|
pass
|
|
|
|
|
|
def _render_mari_job(job, glb):
|
|
'''
|
|
job: {"cam_name": str, "H": int, "V": int, ["frame": int]}
|
|
glb: {"mode": "FRAME"/"CIRCLE", "action": "STILL"/"ANIM", "is_video": bool}
|
|
'''
|
|
try:
|
|
if not _ensure_mari_enabled():
|
|
return False, "MARI add-on not enabled in child", _render_meta()
|
|
|
|
if not (hasattr(bpy.ops, "mari") and hasattr(bpy.ops.mari, "render_one")):
|
|
return False, "bpy.ops.mari.render_one unavailable", _render_meta()
|
|
|
|
scn = bpy.context.scene
|
|
if scn.render.engine == "CYCLES":
|
|
_enforce_cycles_scene_device(scn)
|
|
if not _CYCLES_READY:
|
|
return False, "Cycles device setup failed (no eligible GPU/CPU device configured for this worker).", _render_meta()
|
|
ok_policy, rep = _ensure_cycles_policy(require_gpu=(not bool(USECPU)), phase=f"pre-mari-{job.get('cam_name','?')}")
|
|
if not ok_policy:
|
|
return False, f"GPU_POLICY_VIOLATION pre-mari: {rep}", _render_meta()
|
|
prop = getattr(scn, "mari_props", None)
|
|
try:
|
|
scn.render.use_overwrite = bool(glb.get("use_overwrite", scn.render.use_overwrite))
|
|
if hasattr(scn.render, "use_placeholder"):
|
|
scn.render.use_placeholder = bool(glb.get("use_placeholder", scn.render.use_placeholder))
|
|
except Exception:
|
|
pass
|
|
try:
|
|
_apply_mari_scene_settings(scn, glb)
|
|
except Exception:
|
|
pass
|
|
|
|
cam_name = job.get("cam_name")
|
|
cam_obj = bpy.data.objects.get(cam_name) if cam_name else None
|
|
if cam_obj:
|
|
scn.camera = cam_obj
|
|
else:
|
|
return False, f"Camera '{cam_name}' not found", _render_meta()
|
|
try:
|
|
bpy.context.view_layer.update()
|
|
except Exception:
|
|
pass
|
|
|
|
mode_target = _mari_prop_mode_id(glb.get("mode"))
|
|
if prop and _mari_prop_mode_id(getattr(prop, "frame", None)) != mode_target:
|
|
try:
|
|
prop.frame = mode_target
|
|
print(f"[MGPU-CHILD] Adjusted MARI mode to {prop.frame}")
|
|
except Exception:
|
|
pass
|
|
|
|
try:
|
|
obj = bpy.context.object
|
|
if obj and obj.mode != 'OBJECT':
|
|
bpy.ops.object.mode_set(mode='OBJECT', toggle=False)
|
|
except Exception:
|
|
pass
|
|
|
|
action = glb.get("action", "STILL")
|
|
try:
|
|
frame = int(job.get("frame", -1))
|
|
except Exception:
|
|
frame = -1
|
|
try:
|
|
_force_scene_refresh(scn, (frame if frame >= 0 else scn.frame_current))
|
|
except Exception:
|
|
pass
|
|
|
|
H = job.get("H")
|
|
V = job.get("V")
|
|
if prop:
|
|
# Ensure per-job paths exist and set filepaths to avoid any spillover across cameras.
|
|
_prime_mari_output_for_frame(scn, H, V, action)
|
|
try:
|
|
scn.render.use_file_extension = True
|
|
except Exception:
|
|
pass
|
|
tag = TAG
|
|
start_msg = f"[MGPU-CHILD] {tag} start MARI {action} H{H} V{V}"
|
|
if frame >= 0:
|
|
start_msg += f" f{frame}"
|
|
print(start_msg + f" -> {cam_name}")
|
|
st = time.time()
|
|
try:
|
|
scn[_MARI_RENDER_ONE_STATUS_KEY] = ""
|
|
except Exception:
|
|
pass
|
|
try:
|
|
res = _run_with_scene_override(
|
|
scn,
|
|
lambda: bpy.ops.mari.render_one(camera_name=cam_name, action=action, frame=frame),
|
|
)
|
|
except Exception as call_err:
|
|
return False, str(call_err), _render_meta()
|
|
|
|
ok = (res == {'FINISHED'})
|
|
if ok and scn.render.engine == "CYCLES":
|
|
ok_policy, rep = _ensure_cycles_policy(
|
|
require_gpu=(not bool(USECPU)),
|
|
phase=f"post-mari-{cam_name}-f{frame}"
|
|
)
|
|
if not ok_policy:
|
|
return False, f"GPU_POLICY_VIOLATION post-mari: {rep}", _render_meta(rendered=True, elapsed=(time.time() - st))
|
|
if ok and action == "STILL":
|
|
_fix_mari_still_output_name(scn, H, V)
|
|
elapsed = time.time() - st
|
|
status = _mari_read_render_status(scn)
|
|
fin_msg = f"[MGPU-CHILD] {tag} finished MARI {action} H{H} V{V}"
|
|
if frame >= 0:
|
|
fin_msg += f" f{frame}"
|
|
if ok:
|
|
print(fin_msg + f" ({elapsed:.2f}s) -> {cam_name}")
|
|
else:
|
|
print(fin_msg.replace("finished", "MISSING") + f" ({elapsed:.2f}s) -> {cam_name}")
|
|
sys.stdout.flush()
|
|
_proj_print(H, V, elapsed, glb)
|
|
meta = _render_meta(rendered=status.get("rendered"), skipped=status.get("skipped"), elapsed=elapsed)
|
|
return ok, "" if ok else "mari.render_one returned CANCELLED", meta
|
|
except Exception as e:
|
|
traceback.print_exc()
|
|
return False, str(e), _render_meta()
|
|
|
|
|
|
# connect to parent scheduler
|
|
import socket
|
|
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
|
_CONNECT_RETRIES = 40
|
|
for attempt in range(_CONNECT_RETRIES):
|
|
try:
|
|
sock.connect((HOST, PORT))
|
|
break
|
|
except Exception as e:
|
|
if attempt + 1 == _CONNECT_RETRIES:
|
|
raise
|
|
print(f"[MGPU-CHILD] {TAG} waiting for scheduler ({attempt+1}/{_CONNECT_RETRIES}): {e}")
|
|
time.sleep(0.25)
|
|
jsend(sock, {"hello": TAG, "token": TOKEN})
|
|
|
|
while True:
|
|
jsend(sock, {"get": True})
|
|
msg = jrecv(sock)
|
|
if msg.get("exit"):
|
|
print(f"[MGPU-CHILD] {TAG} exit signal received"); sys.stdout.flush()
|
|
break
|
|
|
|
if MODE == "MARI" and msg.get("mari_job"):
|
|
job = msg.get("mari_job") or {}
|
|
glb = msg.get("globals") or {}
|
|
ok, err, meta = _render_mari_job(job, glb)
|
|
if not ok:
|
|
print(f"[MGPU-CHILD] {TAG} ERROR MARI job {job.get('cam_name')} f{job.get('frame','-')}: {err}"); sys.stdout.flush()
|
|
jsend(sock, {"done": job.get("cam_name"), "ok": bool(ok), "job": job, "err": err, "meta": meta})
|
|
continue
|
|
|
|
|
|
# default legacy: frames
|
|
if "frame" in msg:
|
|
n = int(msg["frame"])
|
|
ok, err, meta = render_frame(n)
|
|
jsend(sock, {"done": n, "ok": bool(ok), "err": err, "meta": meta})
|
|
continue
|
|
|
|
try:
|
|
sock.close()
|
|
except Exception:
|
|
pass
|
|
"""
|
|
|
|
def _write_child_script(dirpath):
    """Materialize the embedded worker script into *dirpath*.

    Writes the module-level ``_CHILD_SCRIPT_SRC`` template to
    ``mgpu_worker.py`` inside the given directory and returns the full path.
    """
    script_path = os.path.join(dirpath, "mgpu_worker.py")
    with open(script_path, "w", encoding="utf-8") as fh:
        fh.write(_CHILD_SCRIPT_SRC)
    return script_path
|
|
|
|
def _mgpu_scene_expected_image_size(scene):
|
|
try:
|
|
pct = float(getattr(scene.render, "resolution_percentage", 100) or 100.0)
|
|
w = int(round(float(scene.render.resolution_x) * pct / 100.0))
|
|
h = int(round(float(scene.render.resolution_y) * pct / 100.0))
|
|
if w > 0 and h > 0:
|
|
return (w, h)
|
|
except Exception:
|
|
pass
|
|
return None
|
|
|
|
def _mgpu_existing_file_nonempty(path):
|
|
try:
|
|
return os.path.isfile(path) and os.path.getsize(path) > 0
|
|
except Exception:
|
|
return False
|
|
|
|
def _mgpu_validate_existing_image(path, expected_size=None, cache=None):
|
|
if cache is not None and path in cache:
|
|
return bool(cache[path])
|
|
|
|
ok = False
|
|
img = None
|
|
try:
|
|
if os.path.getsize(path) <= 0:
|
|
ok = False
|
|
else:
|
|
img = bpy.data.images.load(path, check_existing=False)
|
|
size = getattr(img, "size", None)
|
|
ok = bool(size and size[0] > 0 and size[1] > 0)
|
|
if ok and expected_size:
|
|
try:
|
|
ok = (int(size[0]) == int(expected_size[0]) and int(size[1]) == int(expected_size[1]))
|
|
except Exception:
|
|
ok = False
|
|
except Exception:
|
|
ok = False
|
|
finally:
|
|
if img is not None:
|
|
try:
|
|
bpy.data.images.remove(img)
|
|
except Exception:
|
|
pass
|
|
if cache is not None:
|
|
cache[path] = bool(ok)
|
|
return bool(ok)
|
|
|
|
def _mgpu_scan_dir_files(dir_path):
|
|
index = {}
|
|
try:
|
|
if not os.path.isdir(dir_path):
|
|
return index
|
|
with os.scandir(dir_path) as it:
|
|
for entry in it:
|
|
try:
|
|
if not entry.is_file():
|
|
continue
|
|
except Exception:
|
|
continue
|
|
try:
|
|
size = int(entry.stat().st_size)
|
|
except Exception:
|
|
size = -1
|
|
index[entry.name.lower()] = {"path": entry.path, "size": size}
|
|
except Exception:
|
|
return {}
|
|
return index
|
|
|
|
def _mgpu_scene_frame_output_path(scene, frame):
|
|
try:
|
|
return bpy.path.abspath(scene.render.frame_path(frame=int(frame)))
|
|
except Exception:
|
|
try:
|
|
return bpy.path.abspath(scene.render.filepath)
|
|
except Exception:
|
|
return ""
|
|
|
|
def _mgpu_video_seq_frame_path(seq_dir, frame, ext=".png"):
|
|
ext = str(ext or ".png").strip() or ".png"
|
|
if not ext.startswith("."):
|
|
ext = "." + ext
|
|
try:
|
|
frame_num = int(frame)
|
|
except Exception:
|
|
frame_num = frame
|
|
return os.path.join(seq_dir, f"frame_{frame_num:04d}{ext}")
|
|
|
|
def _mgpu_mari_ext_from_scene(scene):
|
|
try:
|
|
ff = str(scene.render.image_settings.file_format or "").lower()
|
|
except Exception:
|
|
ff = ""
|
|
if ff == "ffmpeg":
|
|
try:
|
|
fmt = str(scene.render.ffmpeg.format or "")
|
|
except Exception:
|
|
fmt = ""
|
|
return {
|
|
"MPEG1": "mpeg1",
|
|
"MPEG2": "mpeg2",
|
|
"MPEG4": "mp4",
|
|
"AVI": "avi",
|
|
"QUICKTIME": "mov",
|
|
"DV": "dv",
|
|
"OGG": "ogg",
|
|
"MKV": "mkv",
|
|
"FLASH": "flv",
|
|
"WEBM": "webm",
|
|
}.get(fmt, "mkv")
|
|
return {
|
|
"jpeg": "jpeg",
|
|
"jpeg_2000": "jpeg",
|
|
"iris": "rgb",
|
|
"targa": "tga",
|
|
"targa_raw": "tga",
|
|
"cineon": "cin",
|
|
"open_exr": "exr",
|
|
"open_exr_multilayer": "exr",
|
|
"tiff": "tif",
|
|
"avi_jpeg": "avi",
|
|
"avi_raw": "avi",
|
|
"png": "png",
|
|
"bmp": "bmp",
|
|
}.get(ff, ff)
|
|
|
|
def _mgpu_mari_output_root(scene):
|
|
try:
|
|
prop = getattr(scene, "mari_props", None)
|
|
if not prop:
|
|
return "", ""
|
|
base = bpy.path.abspath(getattr(prop, "render_settings_filepath", ""))
|
|
name = str(getattr(prop, "render_settings_name", "") or "").strip()
|
|
if not (base and name):
|
|
return "", ""
|
|
return os.path.join(base, name), name
|
|
except Exception:
|
|
return "", ""
|
|
|
|
def _mgpu_format_hv_label(value):
|
|
try:
|
|
return str(int(value))
|
|
except Exception:
|
|
return str(value)
|
|
|
|
def _mgpu_get_vse_strip_collection(se):
|
|
if hasattr(se, "strips"):
|
|
return se.strips
|
|
if hasattr(se, "strips_all"):
|
|
return se.strips_all
|
|
if hasattr(se, "sequences"):
|
|
return se.sequences
|
|
if hasattr(se, "sequences_all"):
|
|
return se.sequences_all
|
|
return None
|
|
|
|
def _mgpu_build_video_from_sequence(scene, seq_dir, frames, final_path):
    """Encode the rendered image sequence in *seq_dir* into a single video.

    Builds a throwaway VSE image strip from *frames* (file names relative to
    *seq_dir*, sorted here), renders it as an animation with the scene's
    existing FFMPEG/video output settings, and restores all mutated scene
    state afterwards.

    frames:     iterable of frame file names (not full paths).
    final_path: target video path; its extension is stripped because Blender
                appends the container extension itself.

    Raises RuntimeError when no frames are given, the first frame file is
    missing, or no strip collection can be found on the sequence editor.
    """
    if not frames:
        raise RuntimeError("No frames provided to build video.")

    seq_dir = os.path.normpath(seq_dir)
    frames = sorted(frames)

    first_name = frames[0]
    first_path = os.path.join(seq_dir, first_name)
    if not os.path.isfile(first_path):
        raise RuntimeError(f"First frame not found: {first_path!r}")

    # Prefer a copied scene so the user's scene is untouched; fall back to
    # mutating (and later restoring) the original if copy() fails.
    work_scene = scene
    created_scene = False
    try:
        work_scene = scene.copy()
        work_scene.name = "MGPU_TEMP_VSE"
        created_scene = True
    except Exception:
        work_scene = scene

    # Snapshot everything we are about to mutate, for the finally block.
    old_se = work_scene.sequence_editor
    old_frame_start = work_scene.frame_start
    old_frame_end = work_scene.frame_end
    old_filepath = work_scene.render.filepath
    old_use_seq = work_scene.render.use_sequencer
    old_use_cmp = work_scene.render.use_compositing

    se = old_se if old_se is not None else work_scene.sequence_editor_create()
    strip_coll = _mgpu_get_vse_strip_collection(se)
    if strip_coll is None:
        raise RuntimeError("SequenceEditor has no strips/sequences collection.")

    # Clear any pre-existing strips so only our sequence is rendered
    # (best-effort; the copied scene may carry the user's strips).
    try:
        for s in list(strip_coll):
            strip_coll.remove(s)
    except Exception:
        pass

    strip = None
    frame_count = len(frames)

    try:
        # new_image() keyword signature differs between Blender versions;
        # retry positionally on TypeError.
        try:
            strip = strip_coll.new_image(
                name="MGPU_TEMP_SEQ",
                filepath=first_path,
                channel=1,
                frame_start=old_frame_start,
            )
        except TypeError:
            strip = strip_coll.new_image("MGPU_TEMP_SEQ", first_path, 1, old_frame_start)

        # The strip stores a directory plus per-frame element file names.
        directory = seq_dir + os.sep if not seq_dir.endswith(os.sep) else seq_dir
        strip.directory = directory
        if strip.elements:
            strip.elements[0].filename = first_name
        else:
            strip.elements.append(first_name)

        for name in frames[1:]:
            strip.elements.append(name)

        strip.frame_start = old_frame_start
        strip.frame_final_duration = frame_count

        # Render exactly the strip's span, through the sequencer only.
        work_scene.frame_start = old_frame_start
        work_scene.frame_end = old_frame_start + frame_count - 1
        work_scene.render.use_sequencer = True
        work_scene.render.use_compositing = False

        # Blender appends the container extension; drop ours.
        work_scene.render.filepath = os.path.splitext(final_path)[0]
        try:
            # Preferred: render under a context override (Blender 3.2+).
            with bpy.context.temp_override(scene=work_scene, view_layer=work_scene.view_layers[0]):
                bpy.ops.render.render(animation=True)
        except Exception:
            # Fallback for older APIs: temporarily swap the window's scene.
            win = bpy.context.window
            prev_scene = win.scene if win else None
            try:
                if win:
                    win.scene = work_scene
                bpy.ops.render.render(animation=True)
            finally:
                if win and prev_scene:
                    win.scene = prev_scene
    finally:
        # Restore mutated settings (matters when we fell back to the user's
        # scene), drop the temp strip, and delete the copied scene.
        try:
            work_scene.render.filepath = old_filepath
            work_scene.frame_start = old_frame_start
            work_scene.frame_end = old_frame_end
            work_scene.render.use_sequencer = old_use_seq
            work_scene.render.use_compositing = old_use_cmp
        except Exception:
            pass
        try:
            if strip is not None and strip_coll is not None and hasattr(strip_coll, "remove"):
                strip_coll.remove(strip)
        except Exception:
            pass
        if created_scene:
            try:
                bpy.data.scenes.remove(work_scene)
            except Exception:
                pass
|
|
|
|
# ----------------------- progress parsing (parent side) -----------------------
|
|
|
|
# Patterns for parsing child-worker stdout on the parent side.
# Cycles progress counters, e.g. "Sample 128/4096" (also plural "Samples").
_SAMPLE_RE = re.compile(r"[Ss]amples?\s+(\d+)\s*/\s*(\d+)")
# Tile counters, e.g. "Tiles 3/16"; _TILE2_RE covers the singular "Tile 3/16".
_TILE_RE = re.compile(r"[Tt]iles?\s+(\d+)\s*/\s*(\d+)")
_TILE2_RE = re.compile(r"[Tt]ile\s+(\d+)\s*/\s*(\d+)")
# Frame-mode lifecycle lines emitted by the child script:
#   "[MGPU-CHILD] <tag> start frame <n> -> <path>"
_CHILD_START_RE = re.compile(r"^\[MGPU-CHILD\]\s+(.+?)\s+start\s+frame\s+(\d+)\s+->\s+(.+)$")
#   "[MGPU-CHILD] <tag> finished frame <n> (<secs>s) -> <path>"
_CHILD_FIN_RE = re.compile(r"^\[MGPU-CHILD\]\s+(.+?)\s+finished\s+frame\s+(\d+)\s+\(([\d.]+)s\)\s+->\s+(.+)$")
#   "[MGPU-CHILD] <tag> MISSING frame <n> ..." (output file absent or empty)
_CHILD_MISS_RE = re.compile(r"^\[MGPU-CHILD\]\s+(.+?)\s+MISSING\s+frame\s+(\d+)\s+\(([\d.]+)s\)\s+->\s+(.+)$")
# MARI-mode lifecycle lines: action, H/V indices, optional " f<frame>" for
# animation jobs, then the camera name after "->".
_CHILD_MARI_START_RE = re.compile(r"^\[MGPU-CHILD\]\s+(.+?)\s+start\s+MARI\s+(\S+)\s+H(-?\d+)\s+V(-?\d+)(?:\s+f(-?\d+))?\s+->\s+(.+)$")
_CHILD_MARI_FIN_RE = re.compile(r"^\[MGPU-CHILD\]\s+(.+?)\s+finished\s+MARI\s+(\S+)\s+H(-?\d+)\s+V(-?\d+)(?:\s+f(-?\d+))?\s+\(([\d.]+)s\)\s+->\s+(.+)$")
_CHILD_MARI_MISS_RE = re.compile(r"^\[MGPU-CHILD\]\s+(.+?)\s+MISSING\s+MARI\s+(\S+)\s+H(-?\d+)\s+V(-?\d+)(?:\s+f(-?\d+))?\s+\(([\d.]+)s\)\s+->\s+(.+)$")
|
|
|
|
def _parse_progress_fields(line: str):
    """Extract render-progress counters from one line of child stdout.

    Returns ``(sample_cur, sample_total, tile_cur, tile_total)``; each pair
    is ``(None, None)`` when the corresponding counter is absent.
    """
    s_cur = s_tot = t_cur = t_tot = None
    sample_match = _SAMPLE_RE.search(line)
    if sample_match:
        try:
            s_cur, s_tot = int(sample_match.group(1)), int(sample_match.group(2))
        except Exception:
            pass
    tile_match = _TILE_RE.search(line) or _TILE2_RE.search(line)
    if tile_match:
        try:
            t_cur, t_tot = int(tile_match.group(1)), int(tile_match.group(2))
        except Exception:
            pass
    return s_cur, s_tot, t_cur, t_tot
|
|
|
|
def _progress_percent(s_cur, s_tot, t_cur, t_tot):
|
|
if s_cur is not None and s_tot and s_tot > 0:
|
|
return max(0.0, min(100.0, (s_cur / s_tot) * 100.0))
|
|
if t_cur is not None and t_tot and t_tot > 0:
|
|
return max(0.0, min(100.0, (t_cur / t_tot) * 100.0))
|
|
return None
|
|
|
|
def _progress_bar(pct, width=20):
|
|
if pct is None: return "-" * width
|
|
filled = max(0, min(width, int(round((pct / 100.0) * width))))
|
|
return "#" * filled + "-" * (width - filled)
|
|
|
|
# ----------------------- Windows Job Object (kill children on Blender exit) -----------------------
|
|
|
|
_WS_JOB = None

def _win_job_init():
    """Create a Windows Job Object that kills all assigned processes when
    the job handle is closed, i.e. when this Blender process exits.

    Returns the job HANDLE on success, or None on non-Windows platforms or
    on any API failure (callers treat None as "no job control", best-effort).
    """
    if not IS_WIN: return None
    try:
        import ctypes
        from ctypes import wintypes as wt
        kernel32 = ctypes.windll.kernel32
        CreateJobObjectW = kernel32.CreateJobObjectW
        SetInformationJobObject = kernel32.SetInformationJobObject
        # Fix: without explicit prototypes ctypes defaults every return type
        # to C int, which truncates 64-bit HANDLE values on Win64 and can
        # yield an invalid handle. Declare proper signatures.
        CreateJobObjectW.restype = wt.HANDLE
        CreateJobObjectW.argtypes = (wt.LPVOID, wt.LPCWSTR)
        SetInformationJobObject.restype = wt.BOOL
        SetInformationJobObject.argtypes = (wt.HANDLE, ctypes.c_int, ctypes.c_void_p, wt.DWORD)

        class JOBOBJECT_BASIC_LIMIT_INFORMATION(ctypes.Structure):
            _fields_ = [
                ("PerProcessUserTimeLimit", ctypes.c_longlong),
                ("PerJobUserTimeLimit", ctypes.c_longlong),
                ("LimitFlags", wt.DWORD),
                ("MinimumWorkingSetSize", ctypes.c_size_t),
                ("MaximumWorkingSetSize", ctypes.c_size_t),
                ("ActiveProcessLimit", wt.DWORD),
                ("Affinity", wt.LPVOID),
                ("PriorityClass", wt.DWORD),
                ("SchedulingClass", wt.DWORD),
            ]

        class IO_COUNTERS(ctypes.Structure):
            _fields_ = [
                ("ReadOperationCount", ctypes.c_ulonglong),
                ("WriteOperationCount", ctypes.c_ulonglong),
                ("OtherOperationCount", ctypes.c_ulonglong),
                ("ReadTransferCount", ctypes.c_ulonglong),
                ("WriteTransferCount", ctypes.c_ulonglong),
                ("OtherTransferCount", ctypes.c_ulonglong),
            ]

        class JOBOBJECT_EXTENDED_LIMIT_INFORMATION(ctypes.Structure):
            _fields_ = [
                ("BasicLimitInformation", JOBOBJECT_BASIC_LIMIT_INFORMATION),
                ("IoInfo", IO_COUNTERS),
                ("ProcessMemoryLimit", ctypes.c_size_t),
                ("JobMemoryLimit", ctypes.c_size_t),
                ("PeakProcessMemoryUsed", ctypes.c_size_t),
                ("PeakJobMemoryUsed", ctypes.c_size_t),
            ]

        # JOBOBJECTINFOCLASS value for JobObjectExtendedLimitInformation.
        JOB_OBJECT_EXTENDED_LIMIT_INFORMATION = 9
        JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE = 0x00002000

        hJob = CreateJobObjectW(None, None)
        if not hJob:
            return None

        info = JOBOBJECT_EXTENDED_LIMIT_INFORMATION()
        info.BasicLimitInformation.LimitFlags = JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE
        if not SetInformationJobObject(hJob, JOB_OBJECT_EXTENDED_LIMIT_INFORMATION,
                                       ctypes.byref(info), ctypes.sizeof(info)):
            return None
        return hJob
    except Exception:
        return None
|
|
|
|
def _win_job_assign(proc):
    """Assign *proc* (a subprocess.Popen) to the shared kill-on-close Job
    Object so the child dies when Blender exits.

    No-op off Windows or for a falsy *proc*; lazily creates the job object
    on first use. All failures are swallowed (best-effort cleanup helper).
    """
    if not IS_WIN or not proc: return
    global _WS_JOB
    if _WS_JOB is None:
        _WS_JOB = _win_job_init()
    if _WS_JOB:
        try:
            import ctypes
            from ctypes import wintypes as wt
            AssignProcessToJobObject = ctypes.windll.kernel32.AssignProcessToJobObject
            # Fix: declare HANDLE parameters so 64-bit handle values are not
            # truncated to C int by ctypes' default argument conversion.
            AssignProcessToJobObject.argtypes = (wt.HANDLE, wt.HANDLE)
            AssignProcessToJobObject.restype = wt.BOOL
            AssignProcessToJobObject(_WS_JOB, int(proc._handle))
        except Exception:
            pass
|
|
|
|
# ----------------------- manager / workers -----------------------
|
|
|
|
class Worker:
    """Per-child-process state for one headless render worker.

    A worker corresponds to one spawned Blender instance: either one of
    several instances pinned to a single GPU, the dedicated CPU worker, or
    the single Eevee worker. Holds the subprocess handle, log file, progress
    throttling state, launch diagnostics, and render-guard bookkeeping.
    """

    def __init__(self, tag_label, gpu_uuid, phys_index, instance_index, total_instances, is_cpu=False, gpu_bus="", gpu_name=""):
        # --- device identity ---
        self.is_cpu = bool(is_cpu)
        self.gpu_uuid = gpu_uuid          # 'GPU-xxxx...' or None for CPU
        self.phys_index = phys_index      # physical index for display (int or None)
        self.gpu_bus = str(gpu_bus or "")
        self.gpu_name = str(gpu_name or "")
        self.instance_index = instance_index
        self.total_instances = total_instances

        # --- process / logging handles ---
        self.proc = None
        self.stdout_thread = None
        self.last_line = ""
        self.log_path = None
        self._log_fp = None
        self.term_proc = None             # terminal tail process (PowerShell/xterm)

        # --- identity tags ---
        self.tag = f"{tag_label}-#{instance_index}"
        # Filesystem-safe variant of the tag, used for the log file name.
        sanitized = re.sub(r"[^A-Za-z0-9._-]+", "_", self.tag)
        self._file_tag = sanitized if sanitized else f"worker_{instance_index}"

        # --- progress display / throttling state ---
        self._live_line_active = False
        self._last_samples = (None, None)  # remember for the final 100% line
        self.local_frames = []
        self._last_emit_time = 0.0
        self._last_pct = -1.0
        self._last_s_pair = (None, None)
        self._last_t_pair = (None, None)

        # --- per-frame state (for richer messages) ---
        self.cur_frame = None
        self.cur_path = None
        self.frame_start_time = 0.0

        # --- launch diagnostics ---
        self.launch_state = "PLANNED"
        self.launch_reason = "planned"
        self.launch_detail = ""
        self.launch_attempted = False
        self.launch_ok = False
        self.launch_ts = 0.0
        self.launch_pid = None
        self.hello_received = False
        self.hello_ts = 0.0
        self.hello_timeout_reported = False
        self.exit_before_hello_reported = False

        # --- render-guard / restart bookkeeping ---
        self.guard_last_progress_ts = 0.0
        self.guard_last_progress_sig = None
        self.guard_epoch = 0
        self.guard_restart_ts = deque(maxlen=16)
        self.guard_restart_marks = []
        self.guard_restarts_total = 0

        # --- Cycles backend policy state ---
        self.cycles_backend_override = None
        self.cycles_policy_failures = 0
        self.cycles_cpu_hint_ts = 0.0
        self.cycles_cpu_hint_line = ""
        self._banner_lines_since_repeat = 0

    @property
    def running(self):
        # Same predicate as alive(); kept as a property for call sites.
        if self.proc is None:
            return False
        return self.proc.poll() is None

    def open_log(self, path):
        """Open this worker's per-instance log file in the directory of *path*.

        The file name is derived from the sanitized tag. UTF-8 with BOM so
        PS5/PS7 detect UTF-8 and render block glyphs correctly.
        """
        log_dir = os.path.dirname(path)
        os.makedirs(log_dir, exist_ok=True)
        self.log_path = os.path.join(log_dir, f"{self._file_tag}.log")
        self._log_fp = open(self.log_path, "w", encoding="utf-8-sig", newline="")

    def alive(self) -> bool:
        """True while the child process exists and has not exited."""
        child = getattr(self, "proc", None)
        if child is None:
            return False
        try:
            return child.poll() is None
        except Exception:
            return False

    def close_log(self):
        """Close the log file handle, swallowing any error."""
        fp = self._log_fp
        if fp:
            try:
                fp.close()
            except Exception:
                pass
|
|
|
|
class MultiGPUManager:
|
|
    def __init__(self, scene, threads=0, instances_per_gpu=1, dispatch_mode="DYNAMIC", max_retries=2,
                 open_terms=True, ghost_mode="STRICT_MINUS_LEGACY", use_persistent_data=True,
                 job_mode="FRAMES", mari_jobs=None, mari_globals=None, render_guard_tier="AGGRESSIVE",
                 denoise_on_gpu=True):
        """Plan a multi-instance render for *scene*.

        Captures render/device configuration, detects and maps the eligible
        GPUs, builds the Worker plan (N instances per GPU, plus an optional
        CPU or Eevee worker), builds the pending job list (frame numbers for
        FRAMES mode, the given job dicts for MARI mode), and configures the
        render-time guard. Raises RuntimeError when Cycles is active but no
        worker could be planned.

        job_mode:          "FRAMES" or "MARI".
        render_guard_tier: profile name passed to _rendertime_guard_profile()
                           (defined elsewhere in this file).
        """
        # --- basic configuration snapshot ---
        self.scene = scene
        self.device_mode = _current_compute_type()
        # OPTIX is the only backend with a defined fallback (to CUDA).
        self.fallback_device_mode = "CUDA" if str(self.device_mode or "").upper() == "OPTIX" else ""
        self.cpu_selected = bool(_cycles_cpu_device_selected()) if scene.render.engine == "CYCLES" else False
        self.threads = int(threads)
        self.instances_per_gpu = max(1, int(instances_per_gpu))
        self.dispatch_mode = dispatch_mode
        self.max_retries = max(0, int(max_retries))
        self.open_terms = bool(open_terms)
        self.ghost_mode = ghost_mode
        self.use_persistent_data = bool(use_persistent_data)
        self.denoise_on_gpu = bool(denoise_on_gpu)

        self.job_mode = job_mode  # "FRAMES" or "MARI"
        self.mari_jobs = list(mari_jobs or [])
        self.mari_globals = dict(mari_globals or {})

        # --- render-time guard configuration ---
        self.render_guard_tier = str(render_guard_tier or "AGGRESSIVE").upper()
        self.rt_guard_cfg = _rendertime_guard_profile(self.render_guard_tier)
        self.rt_guard_enabled = bool(self.rt_guard_cfg.get("enabled", False))
        self.rt_guard_pause_until = 0.0
        self.rt_guard_job_state = {}
        self.rt_guard_restart_ts = deque(maxlen=64)
        self._worker_hist = {}
        self._global_hist = deque(maxlen=80)
        self._rt_guard_last_log = {}
        self.rt_periodic_recycle_enabled = bool(self.rt_guard_cfg.get("periodic_recycle_enabled", False))
        # Keep only valid fractional progress points in (0, 1), deduped/sorted.
        raw_points = list(self.rt_guard_cfg.get("periodic_recycle_points", []) or [])
        cleaned_points = []
        for p in raw_points:
            try:
                fp = float(p)
                if 0.0 < fp < 1.0:
                    cleaned_points.append(fp)
            except Exception:
                pass
        self.rt_periodic_recycle_points = sorted(set(cleaned_points))
        self.rt_periodic_recycle_seen = set()
        self.rt_periodic_recycle_pending = {}

        # --- video-output detection (FRAMES mode only) ---
        # When the scene writes a video container, frames are rendered to a
        # temp image sequence and encoded afterwards.
        self.video_mode = False
        self.video_seq_dir = None
        self.video_seq_format = "PNG"
        self.video_seq_ext = ".png"
        self.video_output_path = None
        self._forced_temp_dir = None
        self._preflight_existing_check_done = False
        self._skip_video_encode = False
        try:
            if self.job_mode == "FRAMES":
                img = scene.render.image_settings
                fmt = str(getattr(img, "file_format", "") or "").upper()
                media = str(getattr(img, "media_type", "") or "").upper()
                if fmt in VIDEO_FORMATS or media == "VIDEO":
                    self.video_mode = True
                    self.video_output_path = bpy.path.abspath(scene.render.filepath)
        except Exception:
            pass

        # --- build the pending work-unit list ---
        if self.job_mode == "FRAMES":
            fstart, fend, fstep = scene.frame_start, scene.frame_end, max(1, scene.frame_step)
            self.frames = list(range(fstart, fend + 1, fstep))
            self.total_frames = len(self.frames)
            self.pending = list(self.frames)
        else:
            # MARI jobs can be expanded later (per-frame for ANIM) before start()
            self.frames = []
            self.total_frames = len(self.mari_jobs)
            self.pending = list(self.mari_jobs)

        self.finished = []

        self.retries = {}
        self.finished_set = set()
        self.inflight = {}
        self.worker_stats = {}
        self.total_render_time = 0.0
        self.total_render_count = 0
        self.rt_last_real_completion_ts = 0.0

        # --- scheduler / IPC state ---
        self.temp_dir = None
        self.temp_blend = None
        self.logs_dir = None
        self.cancelled = False
        self._lock = threading.Lock()
        self._server_sock = None
        self._server_thread = None
        self._clients = {}
        # Shared secret so only our own child processes can register.
        self._token = ''.join(random.choice(string.ascii_letters+string.digits) for _ in range(24))
        self._child_script = None
        self._hello_timeout_s = 20.0
        self._launch_events = []
        self._ram_cap_estimate = None
        self._ram_cap_note = ""
        self._enabled_addon_modules_csv = ""
        self._diag_log_path = None
        self._diag_log_fp = None
        self._diag_term_proc = None
        self._diag_term_opened = False
        self._diag_buffer = []
        self._selection_warning = False

        # --- GPU detection and selection ---
        # Two detection strategies are combined per ghost_mode; helpers are
        # defined elsewhere in this file.
        self._legacy_detect = _detect_gpu_devices_legacy(False)
        self._strict_detect = _detect_gpu_devices_strict(True)

        sel = _detect_gpu_devices_final_from_lists(
            self.ghost_mode, self._legacy_detect, self._strict_detect
        )
        mapped_all = _map_selection_to_uuids(sel)
        # Entries whose physical index could not be resolved are never launched.
        mapped, dropped_unknown = _filter_known_mapped_gpus(mapped_all)
        if dropped_unknown:
            msg = (
                f"[MGPU-GPUSEL] INFO: Hidden {len(dropped_unknown)} unresolved GPU entry(s) "
                f"(index='?'). They will not be launched."
            )
            _log(msg)
            self._diag_write(msg)
            for d in dropped_unknown:
                self._diag_write(
                    f"[MGPU-GPUSEL] dropped idx={d.get('index')} bus={d.get('bus') or '-'} "
                    f"uuid={d.get('uuid') or 'NONE'} name={d.get('name')}"
                )

        # --- worker plan: instances_per_gpu workers per mapped GPU ---
        self.workers = []
        if mapped:
            for m in mapped:
                tag_label = f"GPU{m['phys_index'] if m['phys_index'] is not None else '??'}"
                for i in range(1, self.instances_per_gpu + 1):
                    self.workers.append(
                        Worker(
                            tag_label, m["uuid"], m["phys_index"], i, self.instances_per_gpu, is_cpu=False,
                            gpu_bus=(m.get("bus") or ""), gpu_name=(m.get("name") or "")
                        )
                    )

        # Cycles may add one dedicated CPU worker; Eevee without an explicit
        # GPU list runs a single worker.
        if self.scene.render.engine == "CYCLES":
            if self.cpu_selected:
                self.workers.append(Worker("CPU", None, None, 1, 1, is_cpu=True))
                _log("[MGPU-LAUNCH] Cycles CPU device is enabled; adding one dedicated CPU worker.")
            elif not mapped:
                _log("[MGPU-LAUNCH] No mapped GPU workers and Cycles CPU is disabled; CPU fallback is disabled.")
        else:
            if not mapped:
                self.workers.append(Worker("EEVEE", None, 0, 1, 1, is_cpu=False))
                _log("No explicit GPU list for Eevee - running one worker.")

        if self.scene.render.engine == "CYCLES" and not self.workers:
            raise RuntimeError(
                "No Cycles workers planned. Enable at least one GPU device, or enable CPU in Cycles render devices."
            )

        # --- plan logging / diagnostics ---
        if self.scene.render.engine == "CYCLES":
            _log(
                f"[MGPU-LAUNCH] Cycles device policy: requested_backend={self.device_mode} "
                f"fallback_backend={self.fallback_device_mode or 'none'} "
                f"cpu_selected={'YES' if self.cpu_selected else 'NO'}"
            )

        _log(f"[MGPU-LAUNCH] Worker plan: mapped_gpus={len(mapped)} instances_per_gpu={self.instances_per_gpu} planned_workers={len(self.workers)}")
        for w in self.workers:
            dev_txt = "CPU" if w.is_cpu else f"GPU idx={w.phys_index if w.phys_index is not None else '?'} uuid={(w.gpu_uuid or 'none')[:24]}"
            self._record_launch_event(w, "PLANNED", "WORKER_PLANNED", dev_txt)
        self._log_gpu_selection_breakdown(sel, mapped)

        self._rebuild_dispatch_queues()

        if self.rt_guard_enabled:
            _log(
                f"[MGPU-GUARD] Render-time guard active: tier={self.render_guard_tier} "
                f"(soft={self.rt_guard_cfg.get('soft_mult')}x/{int(self.rt_guard_cfg.get('soft_min_s', 0))}s, "
                f"hard={self.rt_guard_cfg.get('hard_mult')}x/{int(self.rt_guard_cfg.get('hard_min_s', 0))}s, "
                f"warmup_worker={int(self.rt_guard_cfg.get('warmup_per_worker_jobs', 0) or 0)}, "
                f"warmup_global={int(self.rt_guard_cfg.get('warmup_completed_jobs', 0) or 0)})"
            )
            if self.rt_periodic_recycle_enabled and self.rt_periodic_recycle_points:
                pts = ",".join(str(int(round(p * 100.0))) for p in self.rt_periodic_recycle_points)
                _log(f"[MGPU-GUARD] Periodic VRAM hygiene restarts enabled at progress points: {pts}%")
        else:
            _log("[MGPU-GUARD] Render-time guard is OFF.")
|
|
|
|
    def prepare_blend_copy(self):
        """Stage everything the child workers need inside a temp directory.

        Creates (or reuses) the temp dir, the logs dir and, in video mode,
        the frame-sequence dir; saves a copy of the current .blend; writes
        the worker script; snapshots the enabled add-ons to JSON; and, in
        MARI mode, bundles the MARI add-on so children load the same version.
        Sets self.temp_dir / temp_blend / logs_dir / _child_script /
        _enabled_addons_file / _mari_dir as side effects.
        """
        if self._forced_temp_dir:
            self.temp_dir = self._forced_temp_dir
            try:
                # With overwrite enabled, wipe any previous run's output first.
                if os.path.isdir(self.temp_dir) and self.scene.render.use_overwrite:
                    shutil.rmtree(self.temp_dir, ignore_errors=True)
            except Exception:
                pass
            os.makedirs(self.temp_dir, exist_ok=True)
        else:
            self.temp_dir = tempfile.mkdtemp(prefix="mgpu_frames_")
        self.logs_dir = os.path.join(self.temp_dir, "logs")
        os.makedirs(self.logs_dir, exist_ok=True)
        self._init_diag_log()
        if self.video_mode:
            # Video outputs render to an image sequence here, encoded later.
            self.video_seq_dir = os.path.join(self.temp_dir, "frames")
            os.makedirs(self.video_seq_dir, exist_ok=True)
        base = os.path.basename(bpy.data.filepath) or "untitled.blend"
        temp_path = os.path.join(self.temp_dir, base)
        # copy=True keeps the user's session pointing at the original file.
        bpy.ops.wm.save_as_mainfile(filepath=temp_path, copy=True)
        self.temp_blend = temp_path
        self.src_blend_dir = os.path.dirname(bpy.data.filepath)
        self._child_script = _write_child_script(self.temp_dir)
        self._enabled_addons_file = None
        try:
            # Snapshot enabled add-ons so children can mirror this session.
            enabled_addons = _mgpu_enabled_addons_snapshot()
            addons_file = os.path.join(self.temp_dir, "enabled_addons.json")
            with open(addons_file, "w", encoding="utf-8") as fp:
                json.dump({"addons": enabled_addons}, fp)
            self._enabled_addons_file = addons_file
            self._enabled_addon_modules_csv = ",".join(_mgpu_enabled_addon_module_names(enabled_addons))
            _log(f"Captured {len(enabled_addons)} enabled add-ons for workers.")
        except Exception as e:
            _log(f"WARN: Failed to capture enabled add-ons: {e}")
        _log(f"Prepared temp blend: {self.temp_blend}")

        # Bundle the MARI add-on so child workers load the same version
        self._mari_dir = None
        if self.job_mode == "MARI":
            try:
                import importlib, inspect
                mari_mod = None

                # 1) Search installed add-ons; import each real module, then look for addon_prefix == "mari"
                for meta in addon_utils.modules():
                    name = getattr(meta, "__name__", None)
                    if not name:
                        continue
                    try:
                        mod = importlib.import_module(name)
                    except Exception:
                        continue

                    # Primary signal: addon declares addon_prefix = "mari"
                    if getattr(mod, "addon_prefix", None) == "mari":
                        mari_mod = mod
                        break

                    # Fallback heuristic: any registered classes with bl_idname starting with "mari."
                    try:
                        if any(
                            isinstance(obj, type) and getattr(obj, "bl_idname", "").startswith("mari.")
                            for obj in mod.__dict__.values()
                        ):
                            mari_mod = mod
                            break
                    except Exception:
                        pass

                if mari_mod:
                    # Copy the whole package dir when available, otherwise
                    # wrap a single-file add-on as a package (__init__.py).
                    src = os.path.dirname(mari_mod.__file__)
                    dst = os.path.join(self.temp_dir, "holo_mari_addon")
                    if os.path.isdir(src):
                        shutil.copytree(src, dst, ignore=shutil.ignore_patterns("__pycache__", "*.pyc"))
                    else:
                        os.makedirs(dst, exist_ok=True)
                        shutil.copy2(mari_mod.__file__, os.path.join(dst, "__init__.py"))
                    self._mari_dir = dst
                    _log(f"Copied MARI addon to: {self._mari_dir}")
                else:
                    _log("WARN: Could not import/locate the MARI add-on; child will not have bpy.ops.mari.*")

            except Exception as e:
                _log(f"WARN: Failed to copy MARI add-on: {e}")
|
|
|
|
def _start_server(self):
|
|
self._server_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
|
self._server_sock.bind(("127.0.0.1", 0))
|
|
self._server_sock.listen(16)
|
|
self._server_sock.settimeout(1.0)
|
|
self._server_port = self._server_sock.getsockname()[1]
|
|
self._server_thread = threading.Thread(target=self._accept_loop, daemon=True)
|
|
self._server_thread.start()
|
|
_log(f"Scheduler server on port {self._server_port}")
|
|
|
|
    def _accept_loop(self):
        """Accept worker connections until cancelled; one client thread each.

        Runs on a daemon thread. The 1s accept timeout exists purely so the
        `cancelled` flag is re-checked periodically; any other socket error
        (e.g. the listener being closed during shutdown) ends the loop.
        """
        while not self.cancelled:
            try:
                conn, _addr = self._server_sock.accept()
            except socket.timeout:
                # settimeout(1.0) wakeup: loop around and re-check cancelled.
                continue
            except Exception:
                # Listener closed/broken: stop accepting.
                break
            threading.Thread(target=self._client_loop, args=(conn,), daemon=True).start()
|
|
|
|
    def _client_loop(self, conn):
        """Serve one worker connection: hand out jobs and record completions.

        Runs on its own daemon thread per connection. The protocol is one JSON
        object per line over the socket:
          - worker -> {"hello": tag, "token": ...}   handshake (token checked)
          - worker -> {"get": true}                   requests the next unit
          - manager -> {"frame": n} or {"mari_job": ..., "globals": ...}
                       or {"exit": true} when the queue is drained
          - worker -> {"done": ...}                   reports a finished unit
        All shared scheduler state is touched only while holding self._lock.
        """
        f = conn.makefile("rwb", buffering=0)
        tag = None
        def jrecv():
            # Read one newline-delimited JSON message; EOF means the worker left.
            line = f.readline()
            if not line:
                raise ConnectionError("client closed")
            return json.loads(line.decode("utf-8", "ignore"))
        def jsend(obj):
            f.write((json.dumps(obj)+"\n").encode("utf-8", "ignore")); f.flush()

        try:
            hello = jrecv()
            tag = hello.get("hello"); token = hello.get("token")
            if token != self._token or not tag:
                # Bad/missing credentials: tell the client to exit and drop it.
                try: jsend({"exit": True})
                except Exception: pass
                f.close(); conn.close(); return

            with self._lock:
                self._clients[tag] = (conn, f, jsend, jrecv)
                self._mark_worker_connected(tag)

            while not self.cancelled:
                msg = jrecv()
                if msg.get("get"):
                    unit = None
                    send_exit = False
                    g = None
                    frame = None
                    # Pick the next unit under the lock; the actual socket send
                    # happens outside it so a slow client cannot stall others.
                    with self._lock:
                        if self.job_mode == "MARI":
                            unit = self._next_mari_for_tag(tag)
                        else:
                            unit = self._next_frame_for_tag(tag)
                        if unit is None:
                            send_exit = True
                        else:
                            self._record_inflight(tag, unit)
                            if self.job_mode == "MARI":
                                # Snapshot the shared globals plus live progress
                                # counters for the worker's progress display.
                                g = dict(self.mari_globals)
                                g["proj_total"] = getattr(self, "total_frames", 0)
                                g["proj_done"] = len(self.finished_set)
                            else:
                                frame = int(unit)
                    if send_exit:
                        # No work left for this worker: ask it to shut down.
                        try: jsend({"exit": True})
                        except Exception: pass
                        break
                    if self.job_mode == "MARI":
                        jsend({"mari_job": unit, "globals": g})
                    else:
                        jsend({"frame": frame})
                    continue

                if "done" in msg:
                    with self._lock:
                        self._handle_job_done(tag, msg)
                    continue

        except Exception as e:
            _log(f"[client] connection end: {e}")
        finally:
            if tag:
                try:
                    with self._lock:
                        # Deregister only if this socket is still the current
                        # one for the tag (it may have reconnected meanwhile).
                        cur = self._clients.get(tag)
                        if cur and cur[0] is conn:
                            self._clients.pop(tag, None)
                except Exception:
                    pass
            try: f.close()
            except Exception: pass
            try: conn.close()
            except Exception: pass
|
|
|
|
def _worker_by_tag(self, tag):
|
|
for w in self.workers:
|
|
if w.tag == tag:
|
|
return w
|
|
return None
|
|
|
|
def _cycles_backend_for_worker(self, w: Worker):
|
|
if not w:
|
|
return str(self.device_mode or "CUDA").upper()
|
|
if w.is_cpu or self.scene.render.engine != "CYCLES":
|
|
return str(self.device_mode or "CUDA").upper()
|
|
override = str(getattr(w, "cycles_backend_override", "") or "").upper()
|
|
if override:
|
|
return override
|
|
return str(self.device_mode or "CUDA").upper()
|
|
|
|
def _cycles_fallback_for_worker(self, w: Worker, primary_backend: str = ""):
|
|
if (not w) or w.is_cpu or self.scene.render.engine != "CYCLES":
|
|
return ""
|
|
primary = str(primary_backend or self._cycles_backend_for_worker(w) or "").upper()
|
|
if primary == "OPTIX":
|
|
return "CUDA"
|
|
return ""
|
|
|
|
    def _handle_cycles_gpu_policy_failure(self, w: Worker, err_text: str, inflight=None):
        """React to a Cycles GPU policy/device-setup failure reported by a worker.

        Triggered by "GPU_POLICY_VIOLATION" or "Cycles device setup failed"
        markers in the worker's output (matched case-insensitively). When the
        worker is currently on OPTIX its backend is demoted to CUDA before the
        worker is restarted on the same GPU; otherwise the worker is simply
        restarted. Returns True when the restart succeeded, False when the
        text did not match, the worker is CPU/non-Cycles, or the restart failed.
        """
        if (not w) or w.is_cpu or self.scene.render.engine != "CYCLES":
            return False
        txt = str(err_text or "")
        txt_up = txt.upper()
        trigger = ("GPU_POLICY_VIOLATION" in txt_up) or ("CYCLES DEVICE SETUP FAILED" in txt_up)
        if not trigger:
            return False

        # Count failures per worker so the restart reason carries an ordinal.
        w.cycles_policy_failures = int(getattr(w, "cycles_policy_failures", 0) or 0) + 1
        current_backend = self._cycles_backend_for_worker(w)
        switched = False
        if current_backend == "OPTIX":
            # OPTIX setup failures are recoverable by falling back to CUDA.
            w.cycles_backend_override = "CUDA"
            switched = True

        reason = f"cycles-gpu-policy-failure#{w.cycles_policy_failures}"
        if switched:
            reason += f" backend={current_backend}->CUDA"
            _log(f"[MGPU-LAUNCH] {w.tag}: GPU policy violation; switching backend {current_backend} -> CUDA and restarting worker.")
        else:
            reason += f" backend={self._cycles_backend_for_worker(w)}"
            _log(f"[MGPU-LAUNCH] {w.tag}: GPU policy violation persisted on backend={self._cycles_backend_for_worker(w)}; restarting worker.")

        self._diag_write(f"[MGPU-LAUNCH] {w.tag}: err='{txt[:220]}'")
        ok = self._restart_worker_same_gpu(w, reason, info=inflight)
        if ok:
            self._open_diag_terminal_if_needed()
        else:
            _log(f"[MGPU-LAUNCH] WARNING: restart failed after GPU policy violation for {w.tag}.")
        return ok
|
|
|
|
    def _init_diag_log(self):
        """Create the diagnostics log file and flush everything buffered so far.

        No-op while logs_dir is unset. On success the in-memory _diag_buffer
        lines and any launch events queued before the log existed are replayed
        into the file; on any failure the handle stays None so _diag_write
        keeps buffering instead of crashing.
        """
        if not self.logs_dir:
            return
        try:
            path = os.path.join(self.logs_dir, "_launch_diagnostics.log")
            self._diag_log_path = path
            # utf-8-sig (BOM) so Windows viewers decode the tail correctly.
            self._diag_log_fp = open(path, "w", encoding="utf-8-sig", newline="")
            self._diag_log_fp.write(BANNER_MANAGER_ASCII.rstrip("\n") + "\n")
            self._diag_log_fp.write("[MGPU-LAUNCH] Diagnostics log initialized.\n")
            # Replay lines buffered by _diag_write before the file existed.
            for line in self._diag_buffer:
                self._diag_log_fp.write((line or "").rstrip("\n") + "\n")
            self._diag_buffer = []
            # Flush queued events captured before logs_dir existed
            for evt in self._launch_events:
                msg = f"[MGPU-LAUNCH] {evt.get('tag','?')} {evt.get('state','')}:{evt.get('reason','')}"
                det = evt.get("detail") or ""
                if det:
                    msg += f" | {det}"
                self._diag_log_fp.write(msg + "\n")
            self._diag_log_fp.flush()
        except Exception:
            self._diag_log_fp = None
|
|
|
|
def _diag_write(self, text):
|
|
try:
|
|
line = (text or "").rstrip("\n")
|
|
if self._diag_log_fp:
|
|
self._diag_log_fp.write(line + "\n")
|
|
self._diag_log_fp.flush()
|
|
else:
|
|
self._diag_buffer.append(line)
|
|
if len(self._diag_buffer) > 300:
|
|
self._diag_buffer = self._diag_buffer[-300:]
|
|
except Exception:
|
|
pass
|
|
|
|
    def _spawn_tail_terminal(self, log_path: str, enable_vt=False):
        """Open a platform-appropriate terminal window that live-tails *log_path*.

        Returns the spawned Popen handle, or None when no terminal could be
        started (empty path, no emulator found, or spawn failure). On Windows,
        enable_vt additionally switches the new console into VT mode so ANSI
        escape sequences in the log render as colors.
        """
        if not log_path:
            return None
        if IS_WIN:
            # PowerShell single-quoted string: escape quotes by doubling them.
            path_ps = str(log_path).replace("'", "''")
            vt_block = ""
            if enable_vt:
                # Inline C# P/Invoke that ORs ENABLE_VIRTUAL_TERMINAL_PROCESSING
                # (0x4) into the console mode of the stdout handle (-11).
                vt_block = (
                    "$c='using System; using System.Runtime.InteropServices; "
                    "public static class VT{"
                    "[DllImport(\"kernel32.dll\")] public static extern System.IntPtr GetStdHandle(int n); "
                    "[DllImport(\"kernel32.dll\")] public static extern bool GetConsoleMode(System.IntPtr h, out int m); "
                    "[DllImport(\"kernel32.dll\")] public static extern bool SetConsoleMode(System.IntPtr h, int m);"
                    "}'; "
                    "Add-Type -TypeDefinition $c -ErrorAction SilentlyContinue; "
                    "$h=[VT]::GetStdHandle(-11); $m=0; [VT]::GetConsoleMode($h,[ref]$m)|Out-Null; "
                    "[VT]::SetConsoleMode($h, ($m -bor 4)) | Out-Null; "
                )
            # Force UTF-8 output, red-on-black colors, then follow the file.
            cmd = (
                "$ErrorActionPreference='SilentlyContinue'; "
                + vt_block +
                "try{[Console]::OutputEncoding=[Text.UTF8Encoding]::new($true)}catch{}; "
                "chcp 65001 | Out-Null; "
                "try{$raw=$Host.UI.RawUI; $raw.BackgroundColor='Black'; $raw.ForegroundColor='Red'; Clear-Host}catch{}; "
                f"$p='{path_ps}'; Get-Content -LiteralPath $p -Wait"
            )
            try:
                proc = subprocess.Popen(
                    ["powershell", "-NoLogo", "-NoProfile", "-Command", cmd],
                    creationflags=subprocess.CREATE_NEW_CONSOLE
                )
                # Tie the console to the manager's job object so it dies with us.
                _win_job_assign(proc)
                return proc
            except Exception:
                return None

        if IS_MAC:
            # AppleScript string: escape backslashes first, then double quotes.
            path_applescript = str(log_path).replace("\\", "\\\\").replace('"', '\\"')
            cmd_prefix = "printf '\\\\033[0;31;40m'; clear; tail -f "
            script = (
                'tell application "Terminal"\n'
                f' do script "{cmd_prefix}" & quoted form of POSIX path of "{path_applescript}"\n'
                'end tell'
            )
            try:
                return subprocess.Popen(["osascript", "-e", script])
            except Exception:
                return None

        # Linux / BSD / other POSIX
        quoted = shlex.quote(str(log_path))
        tail_cmd = f"printf '\\033[0;31;40m'; clear; tail -f {quoted}"
        # Probe common terminal emulators; each candidate is a full argv that
        # runs tail_cmd inside a shell. The first one that spawns wins.
        candidates = []
        if shutil.which("xterm"):
            candidates.append(["xterm", "-hold", "-bg", "black", "-fg", "red", "-e", "sh", "-lc", tail_cmd])
        if shutil.which("x-terminal-emulator"):
            candidates.append(["x-terminal-emulator", "-e", "sh", "-lc", tail_cmd])
        if shutil.which("gnome-terminal"):
            candidates.append(["gnome-terminal", "--", "sh", "-lc", tail_cmd])
        if shutil.which("konsole"):
            candidates.append(["konsole", "--hold", "-e", "sh", "-lc", tail_cmd])
        if shutil.which("xfce4-terminal"):
            candidates.append(["xfce4-terminal", "--hold", "--command", f"sh -lc {shlex.quote(tail_cmd)}"])
        if shutil.which("mate-terminal"):
            candidates.append(["mate-terminal", "--", "sh", "-lc", tail_cmd])
        if shutil.which("lxterminal"):
            candidates.append(["lxterminal", "-e", f"sh -lc {shlex.quote(tail_cmd)}"])
        if shutil.which("kitty"):
            candidates.append(["kitty", "sh", "-lc", tail_cmd])
        if shutil.which("alacritty"):
            candidates.append(["alacritty", "-e", "sh", "-lc", tail_cmd])

        for argv in candidates:
            try:
                return subprocess.Popen(argv)
            except Exception:
                continue
        return None
|
|
|
|
def _open_diag_terminal_if_needed(self):
|
|
if self._diag_term_opened:
|
|
return
|
|
if (not self.open_terms) or (not self._diag_log_path):
|
|
return
|
|
self._diag_term_opened = True
|
|
self._diag_term_proc = self._spawn_tail_terminal(self._diag_log_path, enable_vt=False)
|
|
if self._diag_term_proc is None:
|
|
self._diag_term_opened = False
|
|
|
|
    def _log_gpu_selection_breakdown(self, final_sel, mapped):
        """Dump the whole GPU-selection pipeline to the logs and flag anomalies.

        Logs the raw Cycles device snapshot, the legacy/strict/final selection
        counts, every final selection tuple and every mapped GPU record, then
        cross-checks the mapping against nvidia-smi's physical GPU list. Any
        inconsistency (final vs mapped count mismatch, ghost-filter drops
        below the strict selection, missing UUIDs, unmapped physical GPUs)
        sets self._selection_warning so the UI can surface it.
        """
        mode = self.ghost_mode
        snap = _cycles_device_snapshot()
        backend = snap.get("backend")
        rows = list(snap.get("rows") or [])
        if rows:
            # Summarize the raw Cycles device rows grouped by device type.
            type_counts = Counter(str(r.get("type") or "?") for r in rows)
            cnt_text = ", ".join(f"{k}:{type_counts[k]}" for k in sorted(type_counts.keys()))
            sum_msg = f"[MGPU-GPUSEL] cycles backend={backend} rows={len(rows)} by_type=[{cnt_text}]"
            _log(sum_msg)
            self._diag_write(sum_msg)
            for i, r in enumerate(rows):
                self._diag_write(
                    f"[MGPU-GPUSEL] cycles[{i}] type={r.get('type')} use={r.get('use')} "
                    f"bus={r.get('bus') or '-'} id={r.get('id') or '-'} name={r.get('name')}"
                )
        _log(
            f"[MGPU-GPUSEL] mode={mode} legacy={len(self._legacy_detect)} strict={len(self._strict_detect)} "
            f"final={len(final_sel)} mapped={len(mapped)}"
        )
        # final_sel items are (index, name, backend, selected, bus) tuples.
        for i, item in enumerate(final_sel):
            idx, name, backend, selected, bus = item
            msg = f"[MGPU-GPUSEL] final[{i}] idx={idx} sel={selected} backend={backend} bus={bus or '-'} name={name}"
            _log(msg)
            self._diag_write(msg)
        for i, m in enumerate(mapped):
            msg = (
                f"[MGPU-GPUSEL] mapped[{i}] phys_index={m.get('phys_index')} idx={m.get('index')} "
                f"bus={m.get('bus') or '-'} uuid={(m.get('uuid') or 'NONE')} "
                f"name={m.get('name') or '-'}"
                f"{(' cycles_name=' + str(m.get('cycles_name'))) if m.get('cycles_name') else ''}"
            )
            _log(msg)
            self._diag_write(msg)
        # Consistency check 1: every final selection should have a mapping.
        if len(final_sel) != len(mapped):
            msg = f"[MGPU-GPUSEL] WARNING: final({len(final_sel)}) != mapped({len(mapped)})"
            _log(msg)
            self._diag_write(msg)
            self._selection_warning = True
        # Consistency check 2: the ghost filter must not shrink the strict set.
        if len(final_sel) < len(self._strict_detect):
            msg = (
                f"[MGPU-GPUSEL] WARNING: strict selected GPUs ({len(self._strict_detect)}) reduced to final ({len(final_sel)}) "
                f"by ghost filter mode '{mode}'. STRICT safeguard should prevent launch drops."
            )
            _log(msg)
            self._diag_write(msg)
            self._selection_warning = True
        # Consistency check 3: UUIDs are what worker launch pinning relies on.
        missing_uuid = sum(1 for m in mapped if not m.get("uuid"))
        if missing_uuid > 0:
            msg = f"[MGPU-GPUSEL] WARNING: {missing_uuid} mapped GPU(s) have no UUID match; launch pinning may be unreliable."
            _log(msg)
            self._diag_write(msg)
            self._selection_warning = True
        # Cross-check against nvidia-smi's view of the physical GPUs.
        phys = _win_query_nvidia_smi_detailed() or []
        if phys:
            mapped_phys = {m.get("phys_index") for m in mapped if m.get("phys_index") is not None}
            backend_rows = [r for r in rows if str(r.get("type")) == str(backend)]
            if backend and len(backend_rows) < len(phys):
                msg = (
                    f"[MGPU-GPUSEL] WARNING: Cycles backend '{backend}' exposes {len(backend_rows)} "
                    f"device row(s), but nvidia-smi sees {len(phys)} GPU(s)."
                )
                _log(msg)
                self._diag_write(msg)
                self._selection_warning = True
            if len(mapped_phys) < len(phys):
                phys_idx = {g.get("index") for g in phys if g.get("index") is not None}
                missing_idx = sorted([i for i in phys_idx if i not in mapped_phys])
                msg = (
                    f"[MGPU-GPUSEL] WARNING: NVIDIA physical GPUs={len(phys)} but mapped GPUs={len(mapped_phys)}. "
                    f"A GPU may be filtered out by backend/type mismatch or unresolved bus-id mapping."
                )
                _log(msg)
                self._diag_write(msg)
                if missing_idx:
                    miss_msg = f"[MGPU-GPUSEL] WARNING: unmapped NVIDIA index(es): {','.join(str(i) for i in missing_idx)}"
                    _log(miss_msg)
                    self._diag_write(miss_msg)
                self._selection_warning = True
            for g in phys:
                msg = (
                    f"[MGPU-GPUSEL] phys idx={g.get('index')} bus={g.get('bus')} "
                    f"uuid={g.get('uuid')} name={g.get('name')}"
                )
                self._diag_write(msg)
|
|
|
|
    def _record_launch_event(self, w: Worker, state: str, reason: str, detail: str = ""):
        """Record one launch-lifecycle transition for a worker and broadcast it.

        Updates the worker's launch_state/launch_reason/launch_detail fields,
        appends the event to _launch_events, mirrors the message to the
        console log, the diagnostics log and the worker's own output stream,
        and opens the diagnostics terminal for failure/stall states.
        """
        w.launch_state = str(state or "")
        w.launch_reason = str(reason or "")
        w.launch_detail = str(detail or "")
        evt = {"tag": w.tag, "state": w.launch_state, "reason": w.launch_reason, "detail": w.launch_detail, "t": time.time()}
        self._launch_events.append(evt)
        msg = f"[MGPU-LAUNCH] {w.tag} {w.launch_state}: {w.launch_reason}"
        if w.launch_detail:
            msg += f" | {w.launch_detail}"
        _log(msg)
        self._diag_write(msg)
        try:
            # Best-effort echo into the worker's own log stream.
            self._emit(w, msg + "\n")
        except Exception:
            pass
        if w.launch_state in {"FAILED_TO_LAUNCH", "FAILED_RUNTIME", "STALLING"}:
            self._open_diag_terminal_if_needed()
|
|
|
|
    def _update_ram_capacity_note(self):
        """Estimate how many workers fit in free RAM; warn when over-planned.

        Rough heuristic: each headless child is assumed to cost 80% of the
        manager's current RSS, floored at 512 MiB. When the RSS/free-memory
        probes are unavailable (platform-dependent), this is a silent no-op.
        Stores the capacity in _ram_cap_estimate and, if the planned worker
        count exceeds it, a human-readable note in _ram_cap_note.
        """
        rss = _proc_rss_bytes()
        avail = _sys_mem_available_bytes()
        if rss is None or avail is None:
            return
        per_child = max(int(rss * 0.8), 512 * 1024 * 1024)
        if per_child <= 0:
            return
        cap = max(1, int(avail // per_child))
        self._ram_cap_estimate = cap
        planned = len(self.workers)
        if planned > cap:
            self._ram_cap_note = (
                f"Planned workers={planned} exceeds rough RAM capacity={cap} "
                f"(RSS={_fmt_bytes(rss)}, free={_fmt_bytes(avail)}, per-worker~{_fmt_bytes(per_child)})."
            )
            _log(f"[MGPU-LAUNCH] WARN: {self._ram_cap_note}")
|
|
|
|
def _mark_worker_connected(self, tag):
|
|
w = self._worker_by_tag(tag)
|
|
if not w:
|
|
return
|
|
if not w.hello_received:
|
|
w.hello_received = True
|
|
w.hello_ts = time.time()
|
|
self._record_launch_event(w, "CONNECTED", "WORKER_HELLO_OK", f"pid={w.launch_pid if w.launch_pid is not None else '?'}")
|
|
|
|
    def _check_launch_health(self):
        """Scan launched workers for handshake timeouts and pre-hello exits.

        Two one-shot conditions per worker (each reported at most once via
        the per-worker *_reported flags):
          - process alive but no hello within _hello_timeout_s -> STALLING
          - process exited before ever saying hello -> FAILED_RUNTIME with a
            reason classified from its last output line and return code.
        """
        now = time.time()
        for w in self.workers:
            if not w.launch_attempted:
                continue
            p = w.proc
            if not p:
                continue
            try:
                rc = p.poll()  # None while the child is still running
            except Exception:
                rc = None

            if (rc is None) and (not w.hello_received) and w.launch_ts and (now - w.launch_ts > self._hello_timeout_s) and (not w.hello_timeout_reported):
                w.hello_timeout_reported = True  # one-shot
                self._record_launch_event(w, "STALLING", "NO_HANDSHAKE_TIMEOUT", f"waited>{int(self._hello_timeout_s)}s")

            if (rc is not None) and (not w.hello_received) and (not w.exit_before_hello_reported):
                w.exit_before_hello_reported = True  # one-shot
                reason = _classify_runtime_exit_reason(getattr(w, "last_line", ""), rc)
                self._record_launch_event(w, "FAILED_RUNTIME", reason, f"returncode={rc} last='{(w.last_line or '').strip()[:180]}'")
|
|
|
|
def _rebuild_dispatch_queues(self):
|
|
if self.dispatch_mode != "STRIDE":
|
|
for w in self.workers:
|
|
w.local_frames = []
|
|
return
|
|
worker_count = max(1, len(self.workers))
|
|
units = list(self.pending)
|
|
for idx, w in enumerate(self.workers):
|
|
w.local_frames = units[idx::worker_count]
|
|
|
|
    def _output_spec_for_unit(self, unit):
        """Describe the output file a work unit is expected to produce.

        Returns a dict {"path", "kind" ("image"|"video"), "expected_size"} or
        None when the output location cannot be determined. Used by the
        overwrite-off preflight to decide whether an existing file can be
        reused instead of re-rendering the unit.
        """
        expected_size = _mgpu_scene_expected_image_size(self.scene)

        if self.job_mode == "MARI":
            if not isinstance(unit, dict):
                return None
            root, name = _mgpu_mari_output_root(self.scene)
            ext = (_mgpu_mari_ext_from_scene(self.scene) or "").lower().lstrip(".")
            if not (root and name and ext):
                return None
            action = str(self.mari_globals.get("action", "STILL") or "STILL").upper()
            is_video = bool(self.mari_globals.get("is_video", False))
            # H/V camera-grid coordinates become part of the file stem.
            h_txt = _mgpu_format_hv_label(unit.get("H"))
            v_txt = _mgpu_format_hv_label(unit.get("V"))
            stem = f"{name}_H{h_txt}_V{v_txt}"
            try:
                frame = int(unit.get("frame", -1))
            except Exception:
                frame = -1
            if action == "ANIM" and (not is_video) and frame >= 0:
                # Animation as image sequence: one numbered file per frame,
                # nested in a per-view subdirectory.
                return {
                    "path": os.path.join(root, stem, f"{name}_{frame:04d}.{ext}"),
                    "kind": "image",
                    "expected_size": expected_size,
                }
            if action == "ANIM" and is_video:
                # Animation as a single container: frame range in the name.
                # Videos skip resolution validation (expected_size None).
                start_f = int(getattr(self.scene, "frame_start", 0))
                end_f = int(getattr(self.scene, "frame_end", 0))
                video_stem = f"{name}_{start_f:04d}-{end_f:04d}_H{h_txt}_V{v_txt}"
                return {
                    "path": os.path.join(root, f"{video_stem}.{ext}"),
                    "kind": "video",
                    "expected_size": None,
                }
            # STILL (or any other action): one image per H/V view.
            return {
                "path": os.path.join(root, f"{stem}.{ext}"),
                "kind": "image",
                "expected_size": expected_size,
            }

        # FRAMES mode: the unit is normally an int frame number.
        try:
            frame = int(unit)
        except Exception:
            frame = unit
        if self.video_mode and self.video_seq_dir:
            # Video pipeline: workers render a temp image sequence first.
            return {
                "path": _mgpu_video_seq_frame_path(self.video_seq_dir, frame, self.video_seq_ext),
                "kind": "image",
                "expected_size": expected_size,
            }
        return {
            "path": _mgpu_scene_frame_output_path(self.scene, frame),
            "kind": "image",
            "expected_size": expected_size,
        }
|
|
|
|
    def _preflight_existing_outputs(self):
        """When overwrite is OFF, reuse valid existing outputs before rendering.

        Walks every pending unit, checks whether its expected output file
        already exists, and validates images (non-empty, and matching the
        expected resolution where known); valid units are marked finished
        up-front so workers never re-render them. In FRAMES+video mode an
        existing non-empty final video short-circuits the entire job and the
        encode step. STRIDE queues are rebuilt afterwards so the reduced
        pending list is redistributed.
        """
        self._preflight_existing_check_done = False
        self._skip_video_encode = False
        self._rebuild_dispatch_queues()

        # Overwrite ON: Blender will re-render everything regardless.
        if getattr(self.scene.render, "use_overwrite", True):
            return

        pending_before = list(self.pending)
        total_checked = len(pending_before)
        if total_checked <= 0:
            return

        # Fast path: the final video already exists -> mark everything done.
        if self.job_mode == "FRAMES" and self.video_mode and _mgpu_existing_file_nonempty(self.video_output_path):
            reused = 0
            for unit in pending_before:
                if self._mark_finished(self._job_key(unit, "FRAMES")):
                    reused += 1
            self.pending = []
            self._skip_video_encode = True
            self._preflight_existing_check_done = True
            self._rebuild_dispatch_queues()
            msg = (
                f"[MGPU-PREFLIGHT] overwrite=OFF reused existing final video; "
                f"checked={total_checked} reusable={reused} pending=0 final={self.video_output_path}"
            )
            _log(msg)
            self._diag_write(msg)
            return

        dir_cache = {}    # directory -> scan result, so each dir is listed once
        image_cache = {}  # per-path image validation cache
        pending_after = []
        reused = 0
        missing = 0
        invalid = 0

        for unit in pending_before:
            spec = self._output_spec_for_unit(unit)
            path = os.path.normpath(str((spec or {}).get("path", "") or ""))
            if not path:
                # Output location unknown: keep the unit pending.
                pending_after.append(unit)
                missing += 1
                continue
            dir_path = os.path.dirname(path)
            base = os.path.basename(path).lower()
            if dir_path not in dir_cache:
                dir_cache[dir_path] = _mgpu_scan_dir_files(dir_path)
            entry = dir_cache[dir_path].get(base)
            if not entry:
                pending_after.append(unit)
                missing += 1
                continue
            if int(entry.get("size", -1) or -1) <= 0:
                # Zero-byte leftovers (e.g. from a crashed run) are re-rendered.
                pending_after.append(unit)
                invalid += 1
                continue

            kind = str(spec.get("kind", "image") or "image").lower()
            if kind == "image":
                ok = _mgpu_validate_existing_image(
                    path,
                    expected_size=spec.get("expected_size"),
                    cache=image_cache,
                )
            else:
                # Videos: existence + non-zero size is the only check.
                ok = True

            if ok:
                if self._mark_finished(self._job_key(unit)):
                    reused += 1
            else:
                pending_after.append(unit)
                invalid += 1

        self.pending = pending_after
        self._preflight_existing_check_done = True
        # All temp frames valid and a final video already exists -> skip encode.
        if self.job_mode == "FRAMES" and self.video_mode and (not self.pending) and _mgpu_existing_file_nonempty(self.video_output_path):
            self._skip_video_encode = True
        self._rebuild_dispatch_queues()

        msg = (
            f"[MGPU-PREFLIGHT] overwrite=OFF checked={total_checked} reusable={reused} "
            f"pending={len(self.pending)} missing={missing} invalid={invalid} dirs={len(dir_cache)}"
        )
        _log(msg)
        self._diag_write(msg)
        if self.job_mode == "FRAMES" and self.video_mode and (not self.pending) and (not self._skip_video_encode):
            enc_msg = (
                f"[MGPU-PREFLIGHT] All temp frames are already valid; final video will be encoded -> "
                f"{self.video_output_path}"
            )
            _log(enc_msg)
            self._diag_write(enc_msg)
|
|
|
|
def _next_frame_for_tag(self, tag):
|
|
if self.dispatch_mode == "STRIDE":
|
|
for w in self.workers:
|
|
if w.tag == tag and getattr(w, "local_frames", None) is not None:
|
|
while w.local_frames:
|
|
frame = w.local_frames.pop(0)
|
|
if self._job_key(frame, "FRAMES") in self.finished_set:
|
|
continue
|
|
return frame
|
|
return None
|
|
while self.pending:
|
|
frame = self.pending.pop(0)
|
|
if self._job_key(frame, "FRAMES") in self.finished_set:
|
|
continue
|
|
return frame
|
|
return None
|
|
|
|
def _requeue_frame_for_tag(self, tag, frame, prefer_other=False):
|
|
if self.dispatch_mode == "STRIDE":
|
|
target = None
|
|
if prefer_other:
|
|
others = [w for w in self.workers if w.tag != tag]
|
|
if others:
|
|
target = min(others, key=lambda w: len(getattr(w, "local_frames", []) or []))
|
|
if target is None:
|
|
for w in self.workers:
|
|
if w.tag == tag:
|
|
target = w
|
|
break
|
|
if target:
|
|
if getattr(target, "local_frames", None) is None:
|
|
target.local_frames = []
|
|
target.local_frames.append(frame)
|
|
return
|
|
else:
|
|
self.pending.insert(0, frame)
|
|
|
|
def _next_mari_for_tag(self, tag):
|
|
if self.dispatch_mode == "STRIDE":
|
|
for w in self.workers:
|
|
if w.tag == tag and getattr(w, "local_frames", None) is not None:
|
|
# For MARI stride we reuse local_frames to store jobs
|
|
while w.local_frames:
|
|
job = w.local_frames.pop(0)
|
|
if self._job_key(job, "MARI") in self.finished_set:
|
|
continue
|
|
return job
|
|
return None
|
|
while self.pending:
|
|
job = self.pending.pop(0)
|
|
if self._job_key(job, "MARI") in self.finished_set:
|
|
continue
|
|
return job
|
|
return None
|
|
|
|
def _requeue_mari_for_tag(self, tag, job, prefer_other=False):
|
|
if self.dispatch_mode == "STRIDE":
|
|
target = None
|
|
if prefer_other:
|
|
others = [w for w in self.workers if w.tag != tag]
|
|
if others:
|
|
target = min(others, key=lambda w: len(getattr(w, "local_frames", []) or []))
|
|
if target is None:
|
|
for w in self.workers:
|
|
if w.tag == tag:
|
|
target = w
|
|
break
|
|
if target:
|
|
if getattr(target, "local_frames", None) is None:
|
|
target.local_frames = []
|
|
target.local_frames.append(job)
|
|
return
|
|
else:
|
|
self.pending.insert(0, job)
|
|
|
|
def _job_key(self, job, mode=None):
|
|
mode = mode or self.job_mode
|
|
if mode == "MARI":
|
|
if isinstance(job, dict):
|
|
cam = job.get("cam_name") or job.get("camera") or job.get("name") or "?"
|
|
h = job.get("H")
|
|
v = job.get("V")
|
|
frame = job.get("frame", None)
|
|
try:
|
|
frame_val = int(frame) if frame is not None else -1
|
|
except Exception:
|
|
frame_val = frame if frame is not None else -1
|
|
return f"mari:{cam}:{h}:{v}:{frame_val}"
|
|
return f"mari:{job}"
|
|
try:
|
|
n = int(job)
|
|
except Exception:
|
|
n = job
|
|
return f"frame:{n}"
|
|
|
|
def _job_label(self, job, mode=None):
|
|
mode = mode or self.job_mode
|
|
if mode == "MARI":
|
|
if isinstance(job, dict):
|
|
cam = job.get("cam_name") or "?"
|
|
frame = job.get("frame", None)
|
|
try:
|
|
frame_val = int(frame) if frame is not None else -1
|
|
except Exception:
|
|
frame_val = frame if frame is not None else -1
|
|
if frame_val is None or frame_val == -1:
|
|
return cam
|
|
return f"{cam} f{frame_val}"
|
|
return str(job)
|
|
return f"frame {job}"
|
|
|
|
def _record_inflight(self, tag, job):
|
|
key = self._job_key(job)
|
|
now = time.time()
|
|
w = self._worker_by_tag(tag)
|
|
progress_ts = now
|
|
if w:
|
|
w.cycles_cpu_hint_ts = 0.0
|
|
w.cycles_cpu_hint_line = ""
|
|
w.guard_last_progress_ts = now
|
|
w.guard_last_progress_sig = ("ASSIGN", str(key))
|
|
self.inflight[tag] = {
|
|
"job": job,
|
|
"key": key,
|
|
"start": now,
|
|
"stolen": False,
|
|
"guard_hedged": False,
|
|
"guard_hedge_ts": 0.0,
|
|
"guard_last_progress_ts": progress_ts,
|
|
"guard_progress_seen": False,
|
|
"guard_epoch": int(getattr(w, "guard_epoch", 0) or 0),
|
|
}
|
|
return key
|
|
|
|
def _mark_finished(self, job_key):
|
|
if job_key in self.finished_set:
|
|
return False
|
|
self.finished_set.add(job_key)
|
|
self.finished.append(job_key)
|
|
self.rt_guard_job_state.pop(job_key, None)
|
|
return True
|
|
|
|
def _update_worker_avg(self, tag, elapsed):
|
|
try:
|
|
elapsed = float(elapsed)
|
|
except Exception:
|
|
return
|
|
if elapsed <= 0.0:
|
|
return
|
|
st = self.worker_stats.get(tag)
|
|
if not st:
|
|
st = {"count": 0, "avg": 0.0}
|
|
count = st["count"] + 1
|
|
avg = (st["avg"] * st["count"] + elapsed) / count
|
|
st["count"] = count
|
|
st["avg"] = avg
|
|
self.worker_stats[tag] = st
|
|
self.total_render_time += elapsed
|
|
self.total_render_count += 1
|
|
self.rt_last_real_completion_ts = time.time()
|
|
try:
|
|
self._worker_hist.setdefault(tag, deque(maxlen=12)).append(float(elapsed))
|
|
self._global_hist.append(float(elapsed))
|
|
except Exception:
|
|
pass
|
|
|
|
def _reset_worker_timing_baseline(self, tag):
|
|
self.worker_stats.pop(tag, None)
|
|
try:
|
|
self._worker_hist[tag] = deque(maxlen=12)
|
|
except Exception:
|
|
self._worker_hist[tag] = deque(maxlen=12)
|
|
|
|
def _avg_for_tag(self, tag):
|
|
st = self.worker_stats.get(tag)
|
|
if st and st.get("count", 0) > 0:
|
|
return st.get("avg", 0.0)
|
|
if self.total_render_count > 0:
|
|
return self.total_render_time / float(self.total_render_count)
|
|
return None
|
|
|
|
    def _rt_guard_log(self, key: str, msg: str, every_s: float = 20.0, force: bool = False):
        """Rate-limited guard logging: at most one message per *key* every
        *every_s* seconds; force=True bypasses the limit.

        Messages go to both the console log and the diagnostics log, and the
        last-emit timestamp for the key is refreshed on every actual emit.
        """
        now = time.time()
        if not force:
            last = self._rt_guard_last_log.get(key, 0.0)
            if (now - last) < max(0.0, float(every_s)):
                return  # suppressed: same key logged too recently
        self._rt_guard_last_log[key] = now
        _log(msg)
        self._diag_write(msg)
|
|
|
|
def _alive_worker_count(self):
|
|
n = 0
|
|
for w in self.workers:
|
|
try:
|
|
if w.alive():
|
|
n += 1
|
|
except Exception:
|
|
continue
|
|
return n
|
|
|
|
def _has_other_alive_worker(self, tag):
|
|
for w in self.workers:
|
|
if w.tag == tag:
|
|
continue
|
|
try:
|
|
if w.alive():
|
|
return True
|
|
except Exception:
|
|
continue
|
|
return False
|
|
|
|
def _rt_pending_count(self):
|
|
count = 0
|
|
if self.dispatch_mode == "STRIDE":
|
|
for w in self.workers:
|
|
for unit in list(getattr(w, "local_frames", []) or []):
|
|
try:
|
|
if self._job_key(unit) in self.finished_set:
|
|
continue
|
|
except Exception:
|
|
pass
|
|
count += 1
|
|
return count
|
|
for unit in list(self.pending):
|
|
try:
|
|
if self._job_key(unit) in self.finished_set:
|
|
continue
|
|
except Exception:
|
|
pass
|
|
count += 1
|
|
return count
|
|
|
|
    def _rt_tail_straggler_ready(self, tag):
        # Tail-straggler means this worker is the only remaining inflight job
        # and there is no queued work left for anyone else.
        """Decide whether worker *tag* qualifies as the tail straggler.

        True only when *tag* holds the single remaining inflight job, nothing
        is queued anywhere, and at least one *other* worker looks capable of
        taking the job over: either it has completed work before (count > 0)
        or it is idle, alive and has completed its handshake.
        """
        if tag not in self.inflight:
            return False
        if len(self.inflight) != 1:
            return False
        if self._rt_pending_count() > 0:
            return False
        others = [w for w in self.workers if w.tag != tag]
        if not others:
            return False
        for ow in others:
            try:
                st = self.worker_stats.get(ow.tag) or {}
                if int(st.get("count", 0) or 0) > 0:
                    # Proven capable: has completed at least one job.
                    return True
            except Exception:
                pass
            try:
                if (ow.tag not in self.inflight) and ow.alive() and bool(getattr(ow, "hello_received", False)):
                    # Idle, alive and connected: can hedge the last job.
                    return True
            except Exception:
                pass
        return False
|
|
|
|
def _rt_warmup_state(self, tag):
|
|
need_global = int(self.rt_guard_cfg.get("warmup_completed_jobs", 0) or 0)
|
|
need_worker = int(self.rt_guard_cfg.get("warmup_per_worker_jobs", 0) or 0)
|
|
done_global = int(self.total_render_count or 0)
|
|
done_worker = int((self.worker_stats.get(tag) or {}).get("count", 0) or 0)
|
|
block_global = (need_global > 0 and done_global < need_global)
|
|
block_worker = (need_worker > 0 and done_worker < need_worker)
|
|
block = bool(block_global or block_worker)
|
|
return {
|
|
"block": block,
|
|
"done_global": done_global,
|
|
"done_worker": done_worker,
|
|
"need_global": need_global,
|
|
"need_worker": need_worker,
|
|
}
|
|
|
|
    def _rt_activate_periodic_recycle_stages(self):
        """Arm VRAM-hygiene recycle stages once progress crosses their marks.

        Each configured progress point (a fraction of total units) becomes a
        stage keyed by its integer percent. When rendered progress crosses a
        point, every current GPU worker's tag is queued in
        rt_periodic_recycle_pending so it gets restarted at the next safe
        opportunity (see _rt_try_periodic_recycle_after_job). Each stage fires
        at most once per run (tracked via rt_periodic_recycle_seen).
        """
        if (not self.rt_guard_enabled) or (not self.rt_periodic_recycle_enabled):
            return
        if not self.rt_periodic_recycle_points:
            return
        total = int(getattr(self, "total_frames", 0) or 0)
        if total <= 0:
            return
        rendered_done = int(self.total_render_count or 0)
        min_done = int(self.rt_guard_cfg.get("periodic_recycle_min_completed_jobs", 0) or 0)
        if rendered_done < max(0, min_done):
            # Too early: avoid churning workers during warm-up.
            return
        progress = float(rendered_done) / float(max(1, total))
        # Only GPU workers hold VRAM; CPU workers are never recycled here.
        gpu_tags = [w.tag for w in self.workers if not getattr(w, "is_cpu", False)]
        if not gpu_tags:
            return
        for p in self.rt_periodic_recycle_points:
            mark = int(round(float(p) * 100.0))
            if mark in self.rt_periodic_recycle_seen:
                continue  # stage already armed earlier this run
            if progress < float(p):
                continue
            self.rt_periodic_recycle_seen.add(mark)
            self.rt_periodic_recycle_pending[mark] = set(gpu_tags)
            self._rt_guard_log(
                f"rt-periodic-activate-{mark}",
                f"[MGPU-GUARD] Activated VRAM hygiene recycle stage {mark}% "
                f"(rendered={rendered_done}/{total}, completed={len(self.finished_set)}/{total}).",
                force=True
            )
|
|
|
|
    def _rt_try_periodic_recycle_after_job(self, tag):
        """After a job completes, run any armed VRAM-hygiene restart for *tag*.

        Restarting only between jobs keeps the recycle safe (the worker holds
        no inflight work). At most one stage is handled per call; when a
        restart is not currently allowed the stage stays pending and is
        retried after this worker's next completed job.
        """
        if (not self.rt_guard_enabled) or (not self.rt_periodic_recycle_enabled):
            return
        self._rt_activate_periodic_recycle_stages()
        if not self.rt_periodic_recycle_pending:
            return
        try:
            stages = sorted(self.rt_periodic_recycle_pending.keys())
        except Exception:
            stages = list(self.rt_periodic_recycle_pending.keys())
        for stage in stages:
            pending = self.rt_periodic_recycle_pending.get(stage)
            if not pending:
                # Stage fully drained: remove the empty entry.
                self.rt_periodic_recycle_pending.pop(stage, None)
                continue
            if tag not in pending:
                continue
            w = self._worker_by_tag(tag)
            if (not w) or w.is_cpu:
                # CPU or vanished workers hold no VRAM: nothing to recycle.
                pending.discard(tag)
                if not pending:
                    self.rt_periodic_recycle_pending.pop(stage, None)
                continue
            now = time.time()
            can_restart, why = self._rt_can_restart_worker(w, now)
            if not can_restart:
                # Keep the stage pending; retry after this worker's next job.
                self._rt_guard_log(
                    f"rt-periodic-skip-{stage}-{tag}",
                    f"[MGPU-GUARD] {tag}: periodic VRAM recycle {stage}% delayed ({why}).",
                    every_s=20.0
                )
                return
            reason = f"periodic-vram-hygiene-{stage}%"
            ok = self._restart_worker_same_gpu(w, reason, info=None)
            if ok:
                pending.discard(tag)
                self._rt_guard_log(
                    f"rt-periodic-restarted-{stage}-{tag}",
                    f"[MGPU-GUARD] {tag}: periodic VRAM hygiene restart at {stage}% complete.",
                    force=True
                )
                self._open_diag_terminal_if_needed()
                if not pending:
                    self.rt_periodic_recycle_pending.pop(stage, None)
                    self._rt_guard_log(
                        f"rt-periodic-stage-done-{stage}",
                        f"[MGPU-GUARD] Periodic VRAM hygiene stage {stage}% completed for all GPU workers.",
                        force=True
                    )
            else:
                self._rt_guard_log(
                    f"rt-periodic-fail-{stage}-{tag}",
                    f"[MGPU-GUARD] {tag}: periodic VRAM hygiene restart at {stage}% failed.",
                    force=True
                )
                self._open_diag_terminal_if_needed()
            # At most one stage handled per completed job.
            return
|
|
|
|
def _rt_baseline_for(self, tag):
    """Estimate a baseline job duration (seconds) for worker *tag*.

    Once the worker has completed at least one job, its own median and
    running average are preferred; the global median is always included
    as a fallback candidate.  The result is never below the configured
    "min_baseline_s" floor.
    """
    min_base = float(self.rt_guard_cfg.get("min_baseline_s", 20.0) or 20.0)
    worker_med = _median(self._worker_hist.get(tag, []))
    global_med = _median(self._global_hist)
    tag_avg = self._avg_for_tag(tag)
    worker_count = int((self.worker_stats.get(tag) or {}).get("count", 0) or 0)
    # Fixed: the old code built `fallback` with a conditional whose two
    # branches were identical; the candidate list simplifies to this.
    candidates = (worker_med, tag_avg, global_med) if worker_count > 0 else (global_med,)
    cands = []
    for v in candidates:
        try:
            fv = float(v)
        except Exception:
            # None / non-numeric candidates are simply skipped.
            continue
        if fv > 0:
            cands.append(fv)
    if not cands:
        return min_base
    return max(max(cands), min_base)
|
|
|
|
def _rt_stall_restart_threshold(self, baseline, soft_th, progress_stall_s):
|
|
try:
|
|
baseline = float(baseline)
|
|
except Exception:
|
|
baseline = 0.0
|
|
try:
|
|
soft_th = float(soft_th)
|
|
except Exception:
|
|
soft_th = 0.0
|
|
try:
|
|
progress_stall_s = float(progress_stall_s)
|
|
except Exception:
|
|
progress_stall_s = 0.0
|
|
return max(progress_stall_s * 2.0, min(soft_th if soft_th > 0.0 else progress_stall_s, baseline * 2.5 if baseline > 0.0 else progress_stall_s))
|
|
|
|
def _rt_clean_worker_restart_marks(self, w: Worker):
    """Prune guard restart marks older than the configured window.

    Marks are stored as total-render-count snapshots; any mark whose age
    (in completed renders) has reached the window is discarded, and the
    worker's list is rewritten in place.
    """
    window_frames = int(self.rt_guard_cfg.get("worker_restart_window_frames", 20) or 20)
    recent = []
    for mark in list(getattr(w, "guard_restart_marks", []) or []):
        try:
            age = int(self.total_render_count) - int(mark)
        except Exception:
            # Unparseable mark: drop it.
            continue
        if age < window_frames:
            recent.append(int(mark))
    w.guard_restart_marks = recent
|
|
|
|
def _rt_can_restart_worker(self, w: Worker, now: float):
    """Return (allowed, reason) deciding whether worker *w* may be
    restarted at time *now*.

    Enforces, in order: worker liveness, a global pause window, a global
    sliding-window restart limit (circuit breaker), a per-worker
    cooldown, and a per-worker restart budget over a render-count window.
    """
    if (not w) or (not w.alive()):
        return (False, "worker-not-alive")
    if now < float(getattr(self, "rt_guard_pause_until", 0.0) or 0.0):
        return (False, "global-pause")

    # --- global circuit breaker over a sliding time window ---
    global_window = float(self.rt_guard_cfg.get("global_restart_window_s", 150.0) or 150.0)
    global_limit = int(self.rt_guard_cfg.get("global_restart_limit", 3) or 3)
    # Expire restart timestamps that fell out of the window (deque is
    # ordered oldest-first).
    while self.rt_guard_restart_ts and ((now - self.rt_guard_restart_ts[0]) > global_window):
        self.rt_guard_restart_ts.popleft()
    if global_limit > 0 and len(self.rt_guard_restart_ts) >= global_limit:
        # Too many restarts recently: open the circuit and pause all
        # restarts for one full window.
        self.rt_guard_pause_until = now + global_window
        self._rt_guard_log(
            "rt-global-circuit",
            f"[MGPU-GUARD] Global restart circuit open for {int(global_window)}s (limit={global_limit}).",
            force=True
        )
        self._open_diag_terminal_if_needed()
        return (False, "global-circuit")

    # --- per-worker cooldown since its most recent restart ---
    cooldown = float(self.rt_guard_cfg.get("worker_restart_cooldown_s", 120.0) or 120.0)
    if w.guard_restart_ts and ((now - w.guard_restart_ts[-1]) < cooldown):
        return (False, "worker-cooldown")

    # --- per-worker budget over a render-count window ---
    self._rt_clean_worker_restart_marks(w)
    budget = int(self.rt_guard_cfg.get("worker_restart_budget", 2) or 2)
    if budget > 0 and len(w.guard_restart_marks) >= budget:
        return (False, "worker-budget")

    return (True, "ok")
|
|
|
|
def _duplicate_job_for_hedge(self, tag, job):
    """Requeue a duplicate of *job*, preferring a worker other than *tag*.

    In MARI mode the job payload is forwarded as-is; in frame mode the
    job is coerced to int where possible before requeueing.
    """
    if job is None:
        return
    if self.job_mode == "MARI":
        self._requeue_mari_for_tag(tag, job, prefer_other=True)
        return
    frame = job
    try:
        frame = int(job)
    except Exception:
        pass
    self._requeue_frame_for_tag(tag, frame, prefer_other=True)
|
|
|
|
def _restart_worker_same_gpu(self, w: Worker, reason: str, info=None):
    """Kill worker *w*'s child process and relaunch it on the same GPU.

    Any unfinished in-flight job is requeued first (hedged to another
    worker when possible).  Returns True when the relaunch succeeded;
    restart bookkeeping (timestamps, marks, counters) is only recorded
    on success.
    """
    now = time.time()
    if info is None:
        info = self.inflight.get(w.tag)
    # The slot is cleared regardless; the job is re-added below if needed.
    self.inflight.pop(w.tag, None)

    if info:
        key = info.get("key")
        job = info.get("job")
        # Requeue only if the job has not already finished elsewhere.
        if job is not None and (not key or key not in self.finished_set):
            self._duplicate_job_for_hedge(w.tag, job)
            self._rt_guard_log(
                f"rt-requeue-{w.tag}",
                f"[MGPU-GUARD] {w.tag}: requeued {self._job_label(job)} before restart.",
                force=True
            )

    # Graceful stop first (CTRL_BREAK on Windows, SIGTERM elsewhere),
    # then a hard kill after a short grace period.
    p = getattr(w, "proc", None)
    if p and (p.poll() is None):
        try:
            if IS_WIN:
                p.send_signal(signal.CTRL_BREAK_EVENT)
            else:
                p.terminate()
        except Exception:
            pass
        deadline = time.time() + 2.5
        while (p.poll() is None) and (time.time() < deadline):
            time.sleep(0.05)
        if p.poll() is None:
            try:
                p.kill()
            except Exception:
                pass

    # Reset per-process worker state so the relaunch starts clean.
    w.proc = None
    w.stdout_thread = None
    w.launch_ok = False
    w.launch_pid = None
    w.hello_received = False
    w.hello_ts = 0.0
    w.hello_timeout_reported = False
    w.exit_before_hello_reported = False
    # Bump the epoch so stale callbacks from the old process are ignored.
    w.guard_epoch = int(getattr(w, "guard_epoch", 0) or 0) + 1
    w.guard_last_progress_ts = now
    w.guard_last_progress_sig = ("RESTART", int(now))
    self._reset_worker_timing_baseline(w.tag)
    self._record_launch_event(w, "RESTARTING", "RENDERTIME_GUARD", reason)

    ok = self._launch_worker_process(w)
    if ok:
        # Record the restart for cooldown / budget / circuit accounting.
        w.guard_restart_ts.append(now)
        w.guard_restart_marks.append(int(self.total_render_count))
        w.guard_restarts_total += 1
        self.rt_guard_restart_ts.append(now)
    return ok
|
|
|
|
def _check_render_time_guard(self):
    """Periodic watchdog over all in-flight jobs.

    Builds a per-worker snapshot (elapsed, baseline, soft/hard
    thresholds, no-progress duration), detects a global stall wave,
    then for each job may (a) hedge a duplicate copy to another worker
    once the soft threshold is exceeded, or (b) restart the worker on
    the same GPU for one of several explicit reasons.  Restarts are
    capped per job and rate-limited by _rt_can_restart_worker.
    """
    if not self.rt_guard_enabled:
        return
    cfg = self.rt_guard_cfg
    now = time.time()
    # Tunables (all with defensive defaults).
    min_samples_soft = int(cfg.get("min_samples_soft", 3) or 3)
    soft_mult = float(cfg.get("soft_mult", 2.5) or 2.5)
    soft_min = float(cfg.get("soft_min_s", 60.0) or 60.0)
    hard_mult = float(cfg.get("hard_mult", 4.5) or 4.5)
    hard_min = float(cfg.get("hard_min_s", 180.0) or 180.0)
    progress_stall_s = float(cfg.get("progress_stall_s", 60.0) or 60.0)
    hedge_grace_s = float(cfg.get("hedge_grace_s", 45.0) or 45.0)
    hedge_max_per_job = int(cfg.get("hedge_max_per_job", 1) or 1)
    restart_max_per_job = int(cfg.get("restart_max_per_job", 1) or 1)
    single_worker_min_stall = float(cfg.get("single_worker_min_stall_s", 180.0) or 180.0)

    # --- pass 1: snapshot every live in-flight worker ---
    snapshots = {}
    for tag, info in list(self.inflight.items()):
        w = self._worker_by_tag(tag)
        if (not w) or (not w.alive()):
            continue
        key = info.get("key")
        if not key:
            continue
        start = float(info.get("start", now) or now)
        elapsed = max(0.0, now - start)
        baseline = self._rt_baseline_for(tag)
        soft_th = max(soft_min, baseline * soft_mult)
        hard_th = max(hard_min, baseline * hard_mult)
        # Last observed progress: worker attribute first, then the
        # per-job record; never earlier than the job start.
        progress_ts = float(getattr(w, "guard_last_progress_ts", 0.0) or 0.0)
        if progress_ts <= 0.0:
            progress_ts = float(info.get("guard_last_progress_ts", start) or start)
        progress_ts = max(progress_ts, start)
        info["guard_last_progress_ts"] = progress_ts
        no_progress_for = max(0.0, now - progress_ts)
        stall_restart_s = self._rt_stall_restart_threshold(baseline, soft_th, progress_stall_s)
        snapshots[tag] = {
            "worker": w,
            "key": key,
            "start": start,
            "elapsed": elapsed,
            "baseline": baseline,
            "soft_th": soft_th,
            "hard_th": hard_th,
            "progress_ts": progress_ts,
            "no_progress_for": no_progress_for,
            "stall_restart_s": stall_restart_s,
        }

    # --- detect a "global wave": every active worker stalled at once ---
    global_wave_tags = set()
    active_tags = [tag for tag, snap in snapshots.items() if snap["key"] not in self.finished_set]
    if len(active_tags) > 1 and self.total_render_count >= max(1, min_samples_soft):
        all_stalled = True
        for tag in active_tags:
            snap = snapshots[tag]
            info = self.inflight.get(tag) or {}
            if (not bool(info.get("guard_progress_seen"))) or snap["elapsed"] < snap["stall_restart_s"] or snap["no_progress_for"] < snap["stall_restart_s"]:
                all_stalled = False
                break
        if all_stalled:
            global_wave_tags = set(active_tags)
            wave_gap = min(snapshots[tag]["no_progress_for"] for tag in active_tags)
            self._rt_guard_log(
                "rt-global-stall-wave",
                f"[MGPU-GUARD] Global no-progress wave detected across {len(active_tags)} workers "
                f"(stall={wave_gap:.1f}s).",
                every_s=15.0
            )

    # --- pass 2: per-job hedging / restart decisions ---
    for tag, info in list(self.inflight.items()):
        snap = snapshots.get(tag)
        if not snap:
            continue
        w = snap["worker"]
        key = snap["key"]
        job = info.get("job")
        elapsed = snap["elapsed"]
        baseline = snap["baseline"]
        soft_th = snap["soft_th"]
        hard_th = snap["hard_th"]
        no_progress_for = snap["no_progress_for"]
        stall_restart_s = snap["stall_restart_s"]
        progress_seen = bool(info.get("guard_progress_seen"))

        state = self.rt_guard_job_state.setdefault(key, {"hedges": 0, "restarts": 0})
        # Warmup: suppress guard actions until enough samples exist,
        # unless this is a tail straggler or part of a global wave.
        warm = self._rt_warmup_state(tag)
        if warm.get("block") and tag not in global_wave_tags:
            tail_ready = self._rt_tail_straggler_ready(tag)
            if not tail_ready:
                needs = []
                if int(warm.get("need_worker", 0) or 0) > 0:
                    needs.append(f"worker={int(warm.get('done_worker', 0))}/{int(warm.get('need_worker', 0))}")
                if int(warm.get("need_global", 0) or 0) > 0:
                    needs.append(f"global={int(warm.get('done_global', 0))}/{int(warm.get('need_global', 0))}")
                detail = ", ".join(needs) if needs else "warmup"
                self._rt_guard_log(
                    f"rt-warmup-skip-{tag}",
                    f"[MGPU-GUARD] {tag}: warmup skip for {self._job_label(job)} ({detail}).",
                    every_s=20.0
                )
                continue
            self._rt_guard_log(
                f"rt-warmup-tail-{tag}",
                f"[MGPU-GUARD] {tag}: warmup override (tail-straggler) for {self._job_label(job)}.",
                every_s=20.0
            )

        restart_reason = None
        # CPU-device hint seen since this job started: force a restart
        # (and demote OPTIX -> CUDA for the next launch).
        if self.scene.render.engine == "CYCLES" and (not w.is_cpu):
            hint_ts = float(getattr(w, "cycles_cpu_hint_ts", 0.0) or 0.0)
            if hint_ts > 0.0 and hint_ts >= snap["start"]:
                line_hint = str(getattr(w, "cycles_cpu_hint_line", "") or "").strip()
                if self._cycles_backend_for_worker(w) == "OPTIX":
                    w.cycles_backend_override = "CUDA"
                    self._rt_guard_log(
                        f"rt-cpu-hint-switch-{tag}",
                        f"[MGPU-GUARD] {tag}: CPU hint detected; switching backend OPTIX -> CUDA.",
                        every_s=10.0
                    )
                restart_reason = f"cpu-device-hint ({line_hint[:96]})"

        # Hedge: duplicate the job onto another worker once past soft_th.
        if (key not in self.finished_set) and (restart_reason is None):
            can_hedge = (
                tag not in global_wave_tags and
                progress_seen and
                elapsed >= soft_th and
                self.total_render_count >= min_samples_soft and
                (not info.get("guard_hedged")) and
                state.get("hedges", 0) < hedge_max_per_job and
                self._has_other_alive_worker(tag)
            )
            if can_hedge:
                info["guard_hedged"] = True
                info["guard_hedge_ts"] = now
                state["hedges"] = int(state.get("hedges", 0)) + 1
                self._duplicate_job_for_hedge(tag, job)
                self._rt_guard_log(
                    f"rt-hedge-{key}",
                    f"[MGPU-GUARD] {tag}: hedge duplicate for {self._job_label(job)} "
                    f"(elapsed={elapsed:.1f}s, baseline={baseline:.1f}s, soft={soft_th:.1f}s).",
                    force=True
                )
                continue

        # Restart reasons, evaluated in priority order:
        # 1) the hedged copy already finished elsewhere,
        if (key in self.finished_set) and info.get("guard_hedged"):
            hedge_for = max(0.0, now - float(info.get("guard_hedge_ts", now) or now))
            if hedge_for >= hedge_grace_s:
                restart_reason = f"hedged-copy-finished-elsewhere ({hedge_for:.0f}s)"

        # 2) hedged but still making no progress after the grace window,
        if restart_reason is None and info.get("guard_hedged"):
            hedge_for = max(0.0, now - float(info.get("guard_hedge_ts", now) or now))
            if progress_seen and hedge_for >= hedge_grace_s and no_progress_for >= stall_restart_s:
                restart_reason = f"post-hedge no-progress {no_progress_for:.0f}s"

        # 3) part of a global stall wave,
        if restart_reason is None and tag in global_wave_tags:
            restart_reason = f"global-no-progress-wave {no_progress_for:.0f}s"

        # 4) past the hard threshold.
        if restart_reason is None and elapsed >= hard_th:
            if progress_seen and no_progress_for >= progress_stall_s:
                restart_reason = f"no-progress {no_progress_for:.0f}s"
            elif info.get("guard_hedged"):
                hedge_for = max(0.0, now - float(info.get("guard_hedge_ts", now) or now))
                if hedge_for >= hedge_grace_s:
                    restart_reason = f"post-hedge slow ({hedge_for:.0f}s)"
            elif elapsed >= (hard_th * 1.35):
                restart_reason = "hard-timeout"

        if restart_reason is None:
            continue

        # Per-job restart cap.
        if int(state.get("restarts", 0)) >= restart_max_per_job:
            self._rt_guard_log(
                f"rt-restart-cap-{key}",
                f"[MGPU-GUARD] {tag}: restart cap reached for {self._job_label(job)}; continuing without restart.",
                every_s=45.0
            )
            continue

        # With only one live worker, be much more patient before killing it.
        if self._alive_worker_count() <= 1 and no_progress_for < single_worker_min_stall:
            self._rt_guard_log(
                f"rt-single-skip-{tag}",
                f"[MGPU-GUARD] {tag}: single-worker mode, delaying restart until stall>{int(single_worker_min_stall)}s.",
                every_s=45.0
            )
            continue

        # Global / per-worker rate limits.
        can_restart, why = self._rt_can_restart_worker(w, now)
        if not can_restart:
            self._rt_guard_log(
                f"rt-restart-skip-{tag}-{why}",
                f"[MGPU-GUARD] {tag}: restart skipped ({why}).",
                every_s=30.0
            )
            continue

        state["restarts"] = int(state.get("restarts", 0)) + 1
        reason = (
            f"{restart_reason}; elapsed={elapsed:.1f}s baseline={baseline:.1f}s "
            f"stall={stall_restart_s:.1f}s soft={soft_th:.1f}s hard={hard_th:.1f}s"
        )
        ok = self._restart_worker_same_gpu(w, reason, info=info)
        if ok:
            self._rt_guard_log(
                f"rt-restarted-{tag}",
                f"[MGPU-GUARD] {tag}: restarted on same GPU ({reason}).",
                force=True
            )
            self._open_diag_terminal_if_needed()
        else:
            self._rt_guard_log(
                f"rt-restart-fail-{tag}",
                f"[MGPU-GUARD] {tag}: restart failed ({reason}).",
                force=True
            )
            self._open_diag_terminal_if_needed()
|
|
|
|
def _handle_job_done(self, tag, msg):
    """Process a job-completion message *msg* from worker *tag*.

    Resolves the job/key (preferring the in-flight record over the
    message payload), routes failures to retry handling, updates the
    worker's timing average for genuinely rendered jobs, marks the job
    finished, and triggers periodic VRAM recycling when appropriate.
    """
    ok = bool(msg.get("ok", False))
    meta = dict(msg.get("meta") or {})
    inflight = self.inflight.pop(tag, None)
    job = inflight.get("job") if inflight else None
    key = inflight.get("key") if inflight else None
    # Fall back to the message payload when the in-flight record is gone
    # (e.g. after a restart already cleared it).
    if job is None:
        job = msg.get("job")
    if job is None:
        job = msg.get("done")
    if job is None:
        _log(f"WARNING: Missing job payload from {tag}")
        return
    if key is None:
        mode = "MARI" if isinstance(job, dict) else "FRAMES"
        key = self._job_key(job, mode)
    if not ok:
        w = self._worker_by_tag(tag)
        err_text = str(msg.get("err", "") or "")
        # GPU-policy failures are handled specially (backend demotion etc.).
        if self._handle_cycles_gpu_policy_failure(w, err_text, inflight=inflight):
            return
        self._handle_retry(tag, job, key=key, reason="failed")
        return
    if key in self.finished_set:
        # A hedged duplicate already completed this job; just clean up.
        self.rt_guard_job_state.pop(key, None)
        return
    w = self._worker_by_tag(tag)
    if w:
        try:
            # A completed job counts as progress for the stall guard.
            w.guard_last_progress_ts = time.time()
            w.guard_last_progress_sig = ("SOCKET_DONE", str(key))
        except Exception:
            pass
    # Even an "ok" result is rejected if a CPU-device hint was seen
    # during this job (the GPU policy was violated).
    if self.scene.render.engine == "CYCLES" and w and (not w.is_cpu):
        hint_ts = float(getattr(w, "cycles_cpu_hint_ts", 0.0) or 0.0)
        start_ts = float((inflight or {}).get("start", 0.0) or 0.0)
        if hint_ts > 0.0 and start_ts > 0.0 and hint_ts >= start_ts:
            err_text = f"GPU_POLICY_VIOLATION cpu-hint: {getattr(w, 'cycles_cpu_hint_line', '')}"
            if self._handle_cycles_gpu_policy_failure(w, err_text, inflight=inflight):
                return
            self._handle_retry(tag, job, key=key, reason="cpu-hint")
            return
    rendered = meta.get("rendered")
    skipped = meta.get("skipped")
    if rendered is None:
        # Older children don't report rendered/skipped; infer from err text.
        err_text = str(msg.get("err", "") or "")
        if "skip existing" in err_text.lower():
            rendered = False
            skipped = True
        else:
            rendered = ok
    rendered = bool(rendered)
    skipped = bool(skipped)
    if rendered:
        # Feed the timing baseline only with real render durations.
        elapsed = 0.0
        try:
            elapsed = float(meta.get("elapsed", 0.0) or 0.0)
        except Exception:
            elapsed = 0.0
        if elapsed <= 0.0 and inflight and inflight.get("start"):
            elapsed = max(0.0, time.time() - inflight["start"])
        self._update_worker_avg(tag, elapsed)
    elif skipped:
        self._rt_guard_log(
            f"rt-skip-{tag}-{key}",
            f"[MGPU-GUARD] {tag}: reused existing output for {self._job_label(job)}; not counted in timing baseline.",
            every_s=5.0
        )
    marked = self._mark_finished(key)
    if marked and rendered:
        # Only first-time, actually-rendered completions advance the
        # periodic VRAM hygiene schedule.
        self._rt_try_periodic_recycle_after_job(tag)
|
|
|
|
def _handle_retry(self, tag, job, key=None, reason="failed", prefer_other=False):
    """Requeue *job* after a failure, up to self.max_retries attempts.

    Once the retry budget is exhausted the job is marked finished so the
    run can complete.  With prefer_other=True the job is steered toward
    a different worker than *tag*.
    """
    if job is None:
        return
    if key is None:
        mode = "MARI" if isinstance(job, dict) else "FRAMES"
        key = self._job_key(job, mode)
    if key in self.finished_set:
        return
    self.inflight.pop(tag, None)
    tries = self.retries.get(key, 0) + 1
    self.retries[key] = tries
    if tries > self.max_retries:
        # Budget exhausted: give up but mark finished so we don't hang.
        _log(f"Giving up on {self._job_label(job)} after {tries - 1} retries ({reason}).")
        self._mark_finished(key)
        return
    _log(f"Retrying {self._job_label(job)} ({reason}) attempt {tries}/{self.max_retries}")
    if self.job_mode == "MARI":
        self._requeue_mari_for_tag(tag, job, prefer_other=prefer_other)
        return
    frame = job
    try:
        frame = int(job)
    except Exception:
        pass
    self._requeue_frame_for_tag(tag, frame, prefer_other=prefer_other)
|
|
|
|
def _check_inflight_timeouts(self):
    """Legacy slow-job work stealing, used only when the render-time
    guard is disabled and at least two workers exist.

    A job running longer than max(2.5 * worker average, 30s) is marked
    "stolen" and requeued, preferring another worker.
    """
    if self.rt_guard_enabled:
        return
    # Stealing is pointless with fewer than two workers.
    if len(self.workers) < 2:
        return
    now = time.time()
    for tag, info in list(self.inflight.items()):
        if info.get("stolen"):
            continue
        started = info.get("start", 0)
        if not started:
            continue
        avg = self._avg_for_tag(tag)
        if avg is None or avg <= 0.0:
            # No timing baseline yet; can't judge slowness.
            continue
        elapsed = now - started
        threshold = max(avg * 2.5, 30.0)
        if elapsed < threshold:
            continue
        info["stolen"] = True
        job = info.get("job")
        _log(f"Slow job on {tag} ({elapsed:.1f}s > {threshold:.1f}s). Requeueing {self._job_label(job)}.")
        self._handle_retry(tag, job, reason="slow", prefer_other=True)
|
|
|
|
# ---------- terminal tail ----------
|
|
|
|
def _write_header(self, w: Worker):
    """Write the banner + run-info header into worker *w*'s log.

    Best-effort: any failure is swallowed so a header problem can never
    block launching.  The header is emitted with count_for_banner=False
    so it does not advance the periodic banner-repeat counter.
    """
    try:
        scene = self.scene
        engine = scene.render.engine
        backend = self.device_mode
        if engine == "CYCLES" and not w.is_cpu:
            backend = self._cycles_backend_for_worker(w)
        dev = "CPU" if w.is_cpu else (
            f"GPU {w.phys_index if w.phys_index is not None else '?'} {(w.gpu_uuid or '')[:12]}"
            f"{(' bus=' + w.gpu_bus) if w.gpu_bus else ''}"
        )
        blend_name = os.path.basename(bpy.data.filepath or "untitled.blend")
        fstart, fend, fstep = scene.frame_start, scene.frame_end, scene.frame_step
        try:
            # Derive the real per-frame output dir; fall back to the raw
            # render filepath if frame_path() fails.
            out_dir = os.path.dirname(bpy.path.abspath(scene.render.frame_path(frame=fstart)))
        except Exception:
            out_dir = bpy.path.abspath(scene.render.filepath)

        header = (
            BANNER_ASCII + "\n"
            f"[MGPU-INFO] Tag: {w.tag}\n"
            f"[MGPU-INFO] Engine: {engine} | Backend: {backend} | Device: {dev} | Threads/child: {self.threads} | Guard: {self.render_guard_tier} | DenoiseGPU: {'ON' if self.denoise_on_gpu else 'OFF'}\n"
            f"[MGPU-INFO] .blend: {blend_name} | Frames: {fstart}–{fend} step {fstep}\n"
            f"[MGPU-INFO] Output dir: {out_dir}\n"
        )
        w._banner_lines_since_repeat = 0
        self._emit(w, header, count_for_banner=False)
    except Exception:
        pass
|
|
|
|
def _open_terminal_tail(self, w: Worker):
    """Open a live tail terminal on the worker's log, if enabled."""
    if not self.open_terms:
        return
    term = self._spawn_tail_terminal(w.log_path, enable_vt=True)
    if term:
        w.term_proc = term
|
|
|
|
|
|
|
|
def _launch_worker_process(self, w: Worker):
    """Spawn the headless Blender child process for worker *w*.

    Builds the full command line (engine/device flags, control-socket
    parameters, optional MARI/video/sequence options), pins the child to
    its exact GPU via CUDA_VISIBLE_DEVICES (UUID), starts the stdout
    pump thread, and records a launch event.  Returns True on success,
    False when Popen fails (the failure is recorded, not raised).
    """
    # Touch the log file so the tail terminal has something to open.
    with open(w.log_path, "a", encoding="utf-8"): pass
    w.launch_attempted = True
    w.launch_ts = time.time()
    w.guard_last_progress_ts = w.launch_ts
    w.guard_last_progress_sig = ("LAUNCH", int(w.launch_ts))
    # Clear any CPU-device hint left over from a previous process.
    w.cycles_cpu_hint_ts = 0.0
    w.cycles_cpu_hint_line = ""

    blender_bin = bpy.app.binary_path
    launch_backend = str(self.device_mode or "").upper()
    launch_fallback = ""
    if self.scene.render.engine == "CYCLES" and not w.is_cpu:
        launch_backend = self._cycles_backend_for_worker(w)
        launch_fallback = self._cycles_fallback_for_worker(w, launch_backend)
    cmd = [
        blender_bin, "--enable-autoexec",
    ]
    if getattr(self, "_enabled_addon_modules_csv", ""):
        cmd += ["--addons", self._enabled_addon_modules_csv]
    cmd += [
        "-b", self.temp_blend,
        "-P", self._child_script,
        "--",
    ]
    if self.scene.render.engine == "CYCLES":
        cli_dev = "CPU" if w.is_cpu else (launch_backend or self.device_mode or "CUDA")
        cmd += ["--cycles-device", str(cli_dev).upper()]
    # Control-channel and behavior flags consumed by the child script.
    cmd += [
        "--mgpu-port", str(self._server_port),
        "--mgpu-token", self._token,
        "--mgpu-tag", w.tag,
        "--mgpu-device", launch_backend or self.device_mode,
        "--mgpu-threads", str(self.threads),
        "--mgpu-usecpu", "1" if w.is_cpu else "0",
        "--mgpu-denoise-gpu", "1" if self.denoise_on_gpu else "0",
        "--mgpu-persistent", "1" if self.use_persistent_data else "0",
        "--mgpu-mode", self.job_mode,
    ]
    if (self.scene.render.engine == "CYCLES") and (not w.is_cpu):
        if w.gpu_bus:
            cmd += ["--mgpu-gpu-bus", w.gpu_bus]
        if w.gpu_name:
            cmd += ["--mgpu-gpu-name", w.gpu_name]
    if (self.scene.render.engine == "CYCLES") and (not w.is_cpu) and launch_fallback:
        cmd += ["--mgpu-fallback-device", launch_fallback]
    if getattr(self, "src_blend_dir", None):
        cmd += ["--src-dir", self.src_blend_dir]
    if getattr(self, "_enabled_addons_file", None):
        cmd += ["--mgpu-enabled-addons-file", self._enabled_addons_file]
    if self.video_mode and self.job_mode == "FRAMES" and self.video_seq_dir:
        # Video mode renders still frames to a temp sequence dir first.
        cmd += [
            "--mgpu-seq-dir", self.video_seq_dir,
            "--mgpu-seq-format", self.video_seq_format,
            "--mgpu-seq-ext", self.video_seq_ext,
        ]
    if self._preflight_existing_check_done:
        cmd += ["--mgpu-prechecked-existing", "1"]
    # Pass MARI add-on path (so child imports & registers it)
    if self.job_mode == "MARI" and getattr(self, "_mari_dir", None):
        cmd += ["--mari-path", self._mari_dir]

    env = os.environ.copy()
    # Bind EXACT GPU via UUID
    if not w.is_cpu and w.gpu_uuid:
        env["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        env["CUDA_VISIBLE_DEVICES"] = w.gpu_uuid
    else:
        env.pop("CUDA_VISIBLE_DEVICES", None)

    # New process group on Windows so CTRL_BREAK can be delivered later.
    creationflags = subprocess.CREATE_NEW_PROCESS_GROUP if IS_WIN else 0
    try:
        proc = subprocess.Popen(
            cmd, env=env,
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
            universal_newlines=True, bufsize=1,
            creationflags=creationflags
        )
        w.proc = proc
        w.launch_ok = True
        w.launch_pid = int(getattr(proc, "pid", -1))
        w.guard_last_progress_ts = time.time()
        w.guard_last_progress_sig = ("PROCESS_STARTED", w.launch_pid)
        _win_job_assign(proc)
        # Daemon thread forwards/parses the child's stdout into the log.
        w.stdout_thread = threading.Thread(target=self._pump_stdout, args=(w,), daemon=True)
        w.stdout_thread.start()
        self._record_launch_event(
            w, "LAUNCHED", "PROCESS_STARTED",
            f"pid={w.launch_pid} dev={'CPU' if w.is_cpu else (w.gpu_uuid or 'no-uuid')} "
            f"{(' bus=' + w.gpu_bus) if ((not w.is_cpu) and w.gpu_bus) else ''} "
            f"{(' cli_cycles_device=' + ('CPU' if w.is_cpu else (launch_backend or self.device_mode or 'CUDA'))) if (self.scene.render.engine == 'CYCLES') else ''} "
            f"backend={(launch_backend or self.device_mode)}{(' fallback=' + launch_fallback) if ((not w.is_cpu) and launch_fallback) else ''} "
            f"denoise_gpu={'ON' if self.denoise_on_gpu else 'OFF'}"
        )
        return True
    except Exception as e:
        w.launch_ok = False
        w.launch_pid = None
        reason = _classify_launch_exception(e)
        self._record_launch_event(w, "FAILED_TO_LAUNCH", reason, str(e))
        return False
|
|
|
|
def _emit(self, w: Worker, text: str, count_for_banner: bool = True):
    """Append *text* to the worker's log and maintain the periodic
    banner repeat.

    When count_for_banner is True, emitted lines advance a counter and
    every _WORKER_BANNER_REPEAT_EVERY_LINES lines the banner text is
    re-inserted into the log.  All failures are swallowed (logging must
    never disturb rendering).
    """
    try:
        if w._log_fp:
            w._log_fp.write(text)
            w._log_fp.flush()
            # NOTE(review): last_line appears to track the most recently
            # emitted text for status display — placement relative to the
            # _log_fp check reconstructed from mangled source; confirm.
            w.last_line = text.rstrip()
    except Exception:
        return

    if not count_for_banner:
        return
    try:
        step = int(_WORKER_BANNER_REPEAT_EVERY_LINES or 0)
        if step <= 0:
            # Banner repeat disabled.
            return
        added = int(str(text).count("\n"))
        if added <= 0 and str(text):
            # Non-empty text without a newline still counts as one line.
            added = 1
        w._banner_lines_since_repeat = int(getattr(w, "_banner_lines_since_repeat", 0) or 0) + max(0, added)
        if w._banner_lines_since_repeat < step:
            return
        w._banner_lines_since_repeat = 0
        banner = "\n" + _WORKER_BANNER_REPEAT_TEXT
        if w._log_fp:
            w._log_fp.write(banner)
            w._log_fp.flush()
            w.last_line = banner.rstrip()
    except Exception:
        pass
|
|
|
|
def _pump_stdout(self, w: Worker):
    """Background thread: read the child's stdout line by line.

    Detects Cycles CPU-device hints, translates child lifecycle markers
    (frame/MARI start, finish, miss) into dashboard lines, passes through
    errors/warnings, and throttles live progress updates (rewriting the
    previous line with ANSI escapes when one is active).  Ends silently
    when the pipe closes or any exception occurs.
    """
    PERCENT_STEP = 5.0  # update when percentage jumps by ≥5
    TIME_STEP = 2.0  # every ≥2s
    try:
        for raw in w.proc.stdout:
            line = raw.strip()
            if not line:
                continue

            # --- CPU-device hint detection (GPU workers only) ---
            if (self.scene.render.engine == "CYCLES") and (not w.is_cpu):
                ll = line.lower()
                cpu_hint = False
                if ("'cpu_enabled': true" in ll) or ('"cpu_enabled": true' in ll):
                    cpu_hint = True
                if ("'scene_device': 'cpu'" in ll) or ('"scene_device": "cpu"' in ll):
                    cpu_hint = True
                if not cpu_hint and re.search(r"\b(using|use|rendering on|fallback(?:ing)? to)\s+cpu\b", ll):
                    cpu_hint = True
                if (
                    (not cpu_hint) and
                    re.search(r"\bdevice\b.{0,24}\bcpu\b", ll) and
                    ("scene_device" not in ll) and
                    ("cpu_enabled" not in ll)
                ):
                    cpu_hint = True
                if cpu_hint:
                    # Stamp the hint so the guard can reject/restart.
                    w.cycles_cpu_hint_ts = time.time()
                    w.cycles_cpu_hint_line = line[:220]
                    self._emit(w, f"[MGPU-GUARD] {w.tag} CPU device hint: {w.cycles_cpu_hint_line}\n")

            # Frame lifecycle from child markers
            ms = _CHILD_START_RE.match(line)
            if ms:
                w.cur_frame = int(ms.group(2))
                w.cur_path = ms.group(3)
                w.frame_start_time = time.time()
                w.guard_last_progress_ts = time.time()
                w.guard_last_progress_sig = ("START", w.cur_frame)
                w._live_line_active = False
                w._last_pct = -1.0
                w._last_emit_time = 0.0
                self._emit(w, f"[MGPU-DASH] frame {w.cur_frame} preparing -> {w.cur_path}\n")
                continue

            # MARI job start marker.
            msm = _CHILD_MARI_START_RE.match(line)
            if msm:
                action = str(msm.group(2) or "")
                h = msm.group(3)
                v = msm.group(4)
                frame_txt = msm.group(5)
                target = msm.group(6)
                label = f"{action} H{h} V{v}" + (f" f{frame_txt}" if frame_txt is not None else "")
                w.cur_frame = None
                w.cur_path = target
                w.frame_start_time = time.time()
                w.guard_last_progress_ts = time.time()
                w.guard_last_progress_sig = ("MARI_START", label)
                w._live_line_active = False
                w._last_pct = -1.0
                w._last_emit_time = 0.0
                self._emit(w, f"[MGPU-DASH] {label} preparing -> {target}\n")
                continue

            # Frame finished: emit a final 100% line, overwriting the
            # live progress line if one is active.
            mf = _CHILD_FIN_RE.match(line)
            if mf:
                frame = int(mf.group(2))
                elapsed = float(mf.group(3))
                w.guard_last_progress_ts = time.time()
                w.guard_last_progress_sig = ("FIN", frame)
                s_cur, s_tot = w._last_samples
                bar = _progress_bar(100.0, 20)
                samples_txt = f" samples {s_tot}/{s_tot}" if (s_tot is not None) else ""
                prefix = "\x1b[1F\x1b[2K" if w._live_line_active else ""
                final_line = f"{prefix}[MGPU-PROG] {w.tag} f{frame:>4} 100.0% [{bar}]{samples_txt} {elapsed:.1f}s\n"
                self._emit(w, final_line)
                # reset
                w._live_line_active = False
                w.cur_frame = None
                w.cur_path = None
                w.frame_start_time = 0.0
                continue

            # MARI job finished.
            mfm = _CHILD_MARI_FIN_RE.match(line)
            if mfm:
                action = str(mfm.group(2) or "")
                h = mfm.group(3)
                v = mfm.group(4)
                frame_txt = mfm.group(5)
                elapsed = float(mfm.group(6))
                target = mfm.group(7)
                label = f"{action} H{h} V{v}" + (f" f{frame_txt}" if frame_txt is not None else "")
                w.guard_last_progress_ts = time.time()
                w.guard_last_progress_sig = ("MARI_FIN", label)
                prefix = "\x1b[1F\x1b[2K" if w._live_line_active else ""
                self._emit(w, f"{prefix}[MGPU-PROG] {w.tag} {label} 100.0% {elapsed:.1f}s -> {target}\n")
                w._live_line_active = False
                w.cur_frame = None
                w.cur_path = None
                w.frame_start_time = 0.0
                continue

            # Frame reported missing/failed by the child.
            mm = _CHILD_MISS_RE.match(line)
            if mm:
                frame = int(mm.group(2))
                elapsed = float(mm.group(3))
                w.guard_last_progress_ts = time.time()
                w.guard_last_progress_sig = ("MISS", frame)
                prefix = "\x1b[1F\x1b[2K" if w._live_line_active else ""
                self._emit(w, f"{prefix}[MGPU-FAIL] {w.tag} f{frame:>4} ({elapsed:.1f}s)\n")
                w._live_line_active = False
                w.cur_frame = None
                w.cur_path = None
                w.frame_start_time = 0.0
                continue

            # MARI job reported missing/failed.
            mmm = _CHILD_MARI_MISS_RE.match(line)
            if mmm:
                action = str(mmm.group(2) or "")
                h = mmm.group(3)
                v = mmm.group(4)
                frame_txt = mmm.group(5)
                elapsed = float(mmm.group(6))
                target = mmm.group(7)
                label = f"{action} H{h} V{v}" + (f" f{frame_txt}" if frame_txt is not None else "")
                w.guard_last_progress_ts = time.time()
                w.guard_last_progress_sig = ("MARI_MISS", label)
                prefix = "\x1b[1F\x1b[2K" if w._live_line_active else ""
                self._emit(w, f"{prefix}[MGPU-FAIL] {w.tag} {label} ({elapsed:.1f}s) -> {target}\n")
                w._live_line_active = False
                w.cur_frame = None
                w.cur_path = None
                w.frame_start_time = 0.0
                continue

            # Pass through child messages and anything alarming verbatim.
            if line.startswith("[MGPU-CHILD]") or "ERROR" in line or "WARNING" in line or "Traceback" in line:
                self._emit(w, line + "\n")
                continue
            if line.startswith("[MGPU-PROJ]"):
                self._emit(w, line + "\n")
                continue

            # --- generic render progress (samples / tiles / percent) ---
            s_cur, s_tot, t_cur, t_tot = _parse_progress_fields(line)
            pct = _progress_percent(s_cur, s_tot, t_cur, t_tot)
            has_progress = any(v is not None for v in (s_cur, s_tot, t_cur, t_tot, pct))
            if has_progress:
                try:
                    # Any measurable progress clears the "never progressed"
                    # state used by the hedging guard.
                    info = self.inflight.get(w.tag)
                    if info is not None:
                        info["guard_progress_seen"] = True
                except Exception:
                    pass
                now = time.time()
                progress_sig = (s_cur, s_tot, t_cur, t_tot, (None if pct is None else int(pct)))
                if progress_sig != w.guard_last_progress_sig:
                    w.guard_last_progress_sig = progress_sig
                    w.guard_last_progress_ts = now

                if s_cur is not None or s_tot is not None:
                    w._last_samples = (s_cur, s_tot)

                # Throttle: emit on ≥PERCENT_STEP jumps or every TIME_STEP s.
                should_emit = False
                if pct is not None:
                    if (pct - w._last_pct) >= PERCENT_STEP or (now - w._last_emit_time) >= TIME_STEP:
                        should_emit = True

                if should_emit and pct is not None:
                    bar = _progress_bar(pct, 20)
                    samples_txt = f" samples {s_cur}/{s_tot}" if (s_cur is not None and s_tot) else ""
                    elapsed = (now - w.frame_start_time) if w.frame_start_time else 0.0
                    fr = f"f{w.cur_frame:>4}" if (w.cur_frame is not None) else "f --"
                    prefix = "\x1b[1F\x1b[2K" if w._live_line_active else ""
                    out = f"{prefix}[MGPU-PROG] {w.tag} {fr} {pct:5.1f}% [{bar}]{samples_txt} {elapsed:.1f}s\n"
                    self._emit(w, out)
                    w._live_line_active = True
                    w._last_pct = pct
                    w._last_emit_time = now
    except Exception:
        # Pipe closed or parse failure: the pump thread just ends.
        pass
|
|
|
|
|
|
def prepare_and_spawn(self):
    """Open logs, write headers, and launch one child per worker.

    Skips launching entirely when preflight already resolved every
    pending output.  Raises RuntimeError when not a single worker could
    be launched; individual launch failures are recorded per worker.
    """
    self._update_ram_capacity_note()
    if self._selection_warning:
        self._open_diag_terminal_if_needed()
    if not self.pending:
        # Nothing left to render after the preflight existing-file check.
        msg = "[MGPU-LAUNCH] No worker launch needed; all pending outputs were resolved during preflight."
        _log(msg)
        self._diag_write(msg)
        return
    launched = 0
    for w in self.workers:
        try:
            log_file = os.path.join(self.logs_dir, f"{w.tag}.log")
            w.open_log(log_file)
            self._write_header(w)
            if self.open_terms:
                self._open_terminal_tail(w)
            ok = self._launch_worker_process(w)
            if ok:
                launched += 1
        except Exception as e:
            # A per-worker failure must not stop the remaining launches.
            self._record_launch_event(w, "FAILED_TO_LAUNCH", "PREPARE_OR_OPEN_LOG_FAILED", str(e))

    failed = len(self.workers) - launched
    _log(f"[MGPU-LAUNCH] Spawn summary: planned={len(self.workers)} launched={launched} failed_to_launch={failed}")
    self._diag_write(f"[MGPU-LAUNCH] Spawn summary: planned={len(self.workers)} launched={launched} failed_to_launch={failed}")
    if self._ram_cap_note:
        _log(f"[MGPU-LAUNCH] Note: {self._ram_cap_note}")
        self._diag_write(f"[MGPU-LAUNCH] Note: {self._ram_cap_note}")
    self._open_diag_terminal_if_needed()
    if failed > 0:
        self._open_diag_terminal_if_needed()
    if launched <= 0:
        raise RuntimeError("No workers launched. Check [MGPU-LAUNCH] lines for reasons.")
|
|
|
|
def _print_launch_status_summary(self, title="Status"):
    """Log one launch-state line per worker to the console and diag log."""
    heading = f"[MGPU-LAUNCH] {title} summary:"
    _log(heading)
    self._diag_write(heading)
    for w in self.workers:
        detail = f" | {w.launch_detail}" if w.launch_detail else ""
        conn = " hello=yes" if w.hello_received else " hello=no"
        pid = f" pid={w.launch_pid}" if w.launch_pid is not None else ""
        line = f"[MGPU-LAUNCH] {w.tag}: {w.launch_state}/{w.launch_reason}{pid}{conn}{detail}"
        _log(line)
        self._diag_write(line)
|
|
|
|
def finish(self):
    """Normal-completion teardown: summarize, kill children, post-process outputs, clean up.

    Order: print final launch summary, terminate workers and terminal tails,
    close per-worker logs, then (FRAMES mode) encode the final video or
    (MARI mode) package the ZIP and remove *_TEMP folders, and finally delete
    the manager's temp directory. Every post-processing step is best-effort.
    """
    self._print_launch_status_summary("Final")
    self._kill_all()
    # NOTE(review): _kill_all() already signals term_proc for each worker;
    # this second pass looks redundant — confirm before removing.
    for w in self.workers:
        try:
            if getattr(w, "term_proc", None) and (w.term_proc.poll() is None):
                if IS_WIN: w.term_proc.send_signal(signal.CTRL_BREAK_EVENT)
                else: w.term_proc.terminate()
        except Exception:
            pass
    for w in self.workers:
        w.close_log()

    # Build final video from temp frames (non-MARI mode only).
    try:
        if self.job_mode == "FRAMES" and self.video_mode and (not self._skip_video_encode):
            self._encode_video_from_sequence()
        elif self.job_mode == "FRAMES" and self.video_mode and self._skip_video_encode:
            # Preflight decided the existing final video is reusable.
            _log(f"[MGPU] Reused existing final video: {self.video_output_path}")
    except Exception as _e:
        print(f"[MGPU] Video encode failed: {_e}")

    # --- NEW: package MARI media if requested ---
    try:
        if self.job_mode == "MARI":
            self._package_mari_zip()
    except Exception as _e:
        print(f"[MGPU] ZIP packaging skipped/failed: {_e}")
    # -------------------------------------------

    # Remove leftover *_TEMP folders under the MARI output root.
    try:
        if self.job_mode == "MARI":
            self._cleanup_mari_temp_dirs()
    except Exception as _e:
        print(f"[MGPU] TEMP cleanup skipped/failed: {_e}")

    self._cleanup_temp()
def _package_mari_zip(self):
    """Create <blend-dir>/<render_settings_name>.zip that contains the entire MARI output folder."""
    try:
        prop = self.scene.mari_props
    except Exception:
        return
    try:
        save_zip = bool(getattr(prop, "mari_save_media", False))
    except Exception:
        save_zip = False
    if not save_zip:
        return

    try:
        base = bpy.path.abspath(getattr(prop, "render_settings_filepath", ""))
        name = getattr(prop, "render_settings_name", "").strip()
        src_dir = os.path.join(base, name)
        if not (name and os.path.isdir(src_dir)):
            print(f"[MGPU] ZIP: source folder missing or invalid: {src_dir}")
            return

        # Zip lives one level above the render folder, alongside it.
        zip_path = os.path.join(base, f"{name}.zip")
        zip_norm = os.path.normpath(zip_path)

        from zipfile import ZipFile, ZIP_DEFLATED
        with ZipFile(zip_path, 'w', ZIP_DEFLATED) as archive:
            for root, _dirs, filenames in os.walk(src_dir):
                for fname in filenames:
                    full = os.path.join(root, fname)
                    # Never add the archive into itself if a stale copy exists.
                    if os.path.normpath(full) == zip_norm:
                        continue
                    rel = os.path.relpath(full, start=src_dir)
                    # Keep the folder name as the top-level entry inside the archive.
                    archive.write(full, arcname=os.path.join(name, rel))
        print(f"[MGPU] Wrote MARI media ZIP: {zip_path}")
    except Exception as e:
        print(f"[MGPU] ZIP packaging failed: {e}")
def _collect_video_frames(self):
|
|
if not self.video_seq_dir:
|
|
return []
|
|
try:
|
|
ext = (self.video_seq_ext or "").lower()
|
|
files = [f for f in os.listdir(self.video_seq_dir) if f.lower().endswith(ext)]
|
|
files.sort()
|
|
return files
|
|
except Exception:
|
|
return []
|
|
|
|
def _encode_video_from_sequence(self):
    """Validate the rendered temp-frame set, then hand it to the video encoder.

    Raises RuntimeError when no frames exist or when fewer than the expected
    number of frames were rendered.
    """
    if not (self.video_seq_dir and self.video_output_path):
        return
    frames = self._collect_video_frames()
    if not frames:
        raise RuntimeError("No rendered frames found for video encode.")
    expected = int(getattr(self, "total_frames", 0) or 0)
    have = len(frames)
    if expected and have < expected:
        raise RuntimeError(f"Missing frames ({have}/{expected}) for video encode.")
    _log(f"Encoding video from {have} frames -> {self.video_output_path}")
    _mgpu_build_video_from_sequence(self.scene, self.video_seq_dir, frames, self.video_output_path)
def start(self):
    """Validate the job, copy the .blend, expand MARI ANIM jobs, preflight, and spawn workers.

    Raises:
        RuntimeError: if Cycles is selected but its add-on is disabled, or if
            the frame list / MARI job list is empty.
    """
    if (self.scene.render.engine == 'CYCLES') and (not _cycles_prefs()):
        raise RuntimeError("Cycles add-on is not enabled. Enable it in Preferences > Add-ons, or switch render engine to Eevee.")
    if self.job_mode == "FRAMES":
        if not self.frames:
            raise RuntimeError("No frames to render (check frame start/end).")
    else:
        if not self.pending:
            raise RuntimeError("No MARI camera jobs to render (job list empty).")

    self.prepare_blend_copy()

    # If MARI ANIM, expand camera jobs into per-frame jobs unless we're rendering video containers.
    # Explicit per-frame jobs supplied by MARI are preserved as-is so partial resumes can start immediately.
    expand_frames = (
        self.job_mode == "MARI" and
        self.mari_globals.get("action") == "ANIM" and
        not self.mari_globals.get("is_video", False)
    )
    if expand_frames:
        fstart, fend, fstep = self.scene.frame_start, self.scene.frame_end, max(1, self.scene.frame_step)
        expanded = []
        for j in self.mari_jobs:
            try:
                existing_frame = int(j.get("frame", -1))
            except Exception:
                existing_frame = -1
            if existing_frame >= 0:
                # Job already pinned to a specific frame: keep as-is (resume path).
                expanded.append(dict(j))
                continue
            # Fan one camera job out into one job per scene frame.
            for f in range(fstart, fend + 1, fstep):
                jj = dict(j)
                jj["frame"] = int(f)
                expanded.append(jj)
        self.pending = expanded
        self.total_frames = len(expanded)

    self._preflight_existing_outputs()
    self._start_server()
    self.prepare_and_spawn()
def stop(self):
    """Cancel the job: log state, flag cancellation, kill children, close logs, remove temp files."""
    self._print_launch_status_summary("Stop")
    self.cancelled = True
    self._kill_all()
    for w in self.workers:
        term = getattr(w, "term_proc", None)
        if not term:
            continue
        try:
            if term.poll() is None:
                if IS_WIN:
                    term.send_signal(signal.CTRL_BREAK_EVENT)
                else:
                    term.terminate()
        except Exception:
            pass
    for w in self.workers:
        w.close_log()
    self._cleanup_temp()
def poll(self) -> bool:
    """Drive one scheduler tick (called from the UI modal timer).

    Runs the watchdog checks under the lock, re-queues any in-flight job whose
    worker process has exited, and returns True once the whole job is finished
    (finish() is called inside _shutdown_if_done()).
    """
    with self._lock:
        self._check_render_time_guard()
        self._check_inflight_timeouts()
        self._check_launch_health()
    # NOTE(review): the worker loop runs outside the lock and re-acquires it
    # per dead worker — presumably because self._lock is a non-reentrant Lock
    # and _handle_retry runs while it is held. Confirm before restructuring.
    for w in self.workers:
        # proc.poll() returning non-None means the child process has exited.
        if w.proc and (w.proc.poll() is not None):
            try:
                if w.stdout_thread:
                    # Give the stdout reader thread a brief chance to drain output.
                    w.stdout_thread.join(timeout=0.1)
            except Exception:
                pass
            with self._lock:
                info = self.inflight.get(w.tag)
                if info:
                    # The worker died mid-job: retry, preferring a different worker.
                    self._handle_retry(w.tag, info.get("job"), key=info.get("key"),
                                       reason="worker-exit", prefer_other=True)
    return self._shutdown_if_done()
def _shutdown_if_done(self) -> bool:
|
|
all_frames_done = len(self.finished_set) >= getattr(self, "total_frames", 0)
|
|
procs_alive = any(getattr(w, "alive", lambda: False)() if callable(getattr(w, "alive", None)) else w.alive
|
|
for w in self.workers)
|
|
if all_frames_done and not procs_alive:
|
|
self.finish()
|
|
return True
|
|
return False
|
|
|
|
def _kill_all(self):
    """Best-effort termination of all worker, terminal-tail, and diag-terminal processes.

    On Windows a CTRL_BREAK_EVENT is sent (console process groups can handle
    it and shut down cleanly); elsewhere terminate() (SIGTERM) is used.
    Every signal attempt is wrapped so one dead handle can't abort the sweep.
    """
    for w in self.workers:
        if w.proc and (w.proc.poll() is None):
            try:
                if IS_WIN: w.proc.send_signal(signal.CTRL_BREAK_EVENT)
                else: w.proc.terminate()
            except Exception:
                pass
    for w in self.workers:
        if w.term_proc and (w.term_proc.poll() is None):
            try:
                if IS_WIN: w.term_proc.send_signal(signal.CTRL_BREAK_EVENT)
                else: w.term_proc.terminate()
            except Exception:
                pass
    try:
        if self._diag_term_proc and (self._diag_term_proc.poll() is None):
            if IS_WIN: self._diag_term_proc.send_signal(signal.CTRL_BREAK_EVENT)
            else: self._diag_term_proc.terminate()
    except Exception:
        pass
def _cleanup_mari_temp_dirs(self):
    """Delete any *_TEMP subfolders left under the MARI output root (best-effort, case-insensitive)."""
    try:
        prop = self.scene.mari_props
    except Exception:
        return
    try:
        base = bpy.path.abspath(getattr(prop, "render_settings_filepath", ""))
        name = getattr(prop, "render_settings_name", "").strip()
        root = os.path.join(base, name)
        if not name or not os.path.isdir(root):
            return
        for child in os.scandir(root):
            if child.is_dir() and child.name.upper().endswith("_TEMP"):
                shutil.rmtree(child.path, ignore_errors=True)
    except Exception:
        pass
def _cleanup_temp(self):
|
|
try:
|
|
if self._server_sock: self._server_sock.close()
|
|
except Exception:
|
|
pass
|
|
try:
|
|
if self._diag_log_fp:
|
|
self._diag_log_fp.close()
|
|
except Exception:
|
|
pass
|
|
try:
|
|
if self.temp_dir and os.path.isdir(self.temp_dir):
|
|
shutil.rmtree(self.temp_dir, ignore_errors=True)
|
|
except Exception:
|
|
pass
|
|
|
|
def _shorten_path(p, maxlen=96):
|
|
try:
|
|
p = os.path.normpath(p)
|
|
except Exception:
|
|
p = str(p)
|
|
if len(p) <= maxlen: return p
|
|
keep = max(12, maxlen // 2 - 3)
|
|
return p[:keep] + "..." + p[-(maxlen - keep - 3):]
|
|
|
|
# ----------------------- UI / operators -----------------------
|
|
|
|
class MGPU_RuntimePrefs(bpy.types.PropertyGroup):
    """Runtime scheduler options, stored on the WindowManager (not saved with the .blend).

    Read by the render operators when constructing MultiGPUManager and by the
    MGPU_PT_panel UI.
    """
    threads_per_process: bpy.props.IntProperty(
        name="Threads per Process", min=0, max=256, default=0,
        description="CPU threads per child Blender (0 = Blender decides)"
    )
    instances_per_gpu: bpy.props.IntProperty(
        name="Instances per GPU", min=1, max=8, default=1,
        description="Workers launched per physical GPU"
    )
    dispatch_mode: bpy.props.EnumProperty(
        name="Dispatch",
        items=[("DYNAMIC","Dynamic Queue (recommended)",""),
               ("STRIDE","Stride (round-robin)","")],
        default="DYNAMIC"
    )
    max_retries: bpy.props.IntProperty(
        name="Max Retries per Frame", min=0, max=10, default=2
    )
    render_time_guard_tier: bpy.props.EnumProperty(
        name="Render-Time Guard",
        description="Adaptive slow-frame guard behavior: duplicate, then restart unhealthy worker on same GPU",
        items=[
            ("AGGRESSIVE", "Aggressive (default)", "Fast intervention; highest chance to restart slow workers"),
            ("BALANCED", "Balanced", "Moderate intervention and restart cadence"),
            ("CONSERVATIVE", "Conservative", "Rare intervention; safer for naturally long frames"),
            ("OFF", "Off", "Disable adaptive render-time guard"),
        ],
        default="AGGRESSIVE"
    )
    # Default differs by platform: terminal tails are on by default on Windows only.
    open_terminals: bpy.props.BoolProperty(
        name="Open terminal windows for logs", default=IS_WIN
    )
    use_persistent_data: bpy.props.BoolProperty(
        name="Use Persistent Data", default=True,
        description="Keep render caches between frames in workers (uses more RAM)"
    )
    denoise_on_gpu: bpy.props.BoolProperty(
        name="Denoise on GPU", default=True,
        description="When enabled, workers try to use Blender's GPU denoiser (OptiX) where applicable"
    )
    use_target_dir_for_video_temp: bpy.props.BoolProperty(
        name="Temp Frames In Target Dir", default=True,
        description="Store video temp frames beside the final output file. Disable to use the system temp/AppData folder instead"
    )
    ghost_filter_mode: bpy.props.EnumProperty(
        name="Ghost filter mode",
        description="How to build the GPU list (your system works best with STRICT − LEGACY).",
        items=[
            ("STRICT_MINUS_LEGACY", "Strict − Legacy (default)", "Use strict(full) minus legacy(ghost)"),
            ("LEGACY_MINUS_STRICT", "Legacy − Strict", "Use broad scan then subtract strict"),
            ("STRICT_ONLY", "Strict only", "Use strict set only"),
            ("LEGACY_ONLY", "Legacy only", "Use legacy set only"),
        ],
        default="STRICT_MINUS_LEGACY"
    )
|
|
# Operator: render the scene frame range across multiple headless workers.
# (No class docstring on purpose — Blender would surface it as the tooltip.)
class MGPU_OT_render_frames(bpy.types.Operator):
    bl_idname = "render.multi_gpu_frames"
    bl_label = "Render (Multi-GPU Frames)"
    bl_options = {'REGISTER', 'INTERNAL'}

    _timer = None
    # Multi-line message shown in the overwrite confirmation dialog.
    confirm_message: bpy.props.StringProperty(default="")
    # Temp-frame directory resolved during invoke(), reused by execute().
    forced_temp_dir: bpy.props.StringProperty(default="")

    def _resolve_video_temp_dir(self, context):
        """Resolve the temp-frame directory for video output, honoring the runtime pref."""
        prefs = getattr(context.window_manager, "mgpu_runtime_prefs", None)
        use_target_dir = True
        if prefs is not None:
            use_target_dir = bool(getattr(prefs, "use_target_dir_for_video_temp", True))
        return _mgpu_video_temp_dir_for(context.scene, use_target_dir=use_target_dir)

    def draw(self, context):
        """Render the confirmation dialog body (one label per non-blank message line)."""
        layout = self.layout
        lines = [l for l in (self.confirm_message or "").split("\n") if l.strip()]
        if not lines:
            layout.label(text="Overwrite existing output?")
            return
        for line in lines:
            layout.label(text=line)

    def invoke(self, context, event=None):
        """Pre-resolve the video temp dir and confirm overwrites before running."""
        is_video = _mgpu_is_video(context.scene)
        temp_dir = self._resolve_video_temp_dir(context) if is_video else None
        if is_video:
            self.forced_temp_dir = temp_dir or ""
        if context.scene.render.use_overwrite:
            warnings = _mgpu_overwrite_warnings(context.scene, is_video, temp_dir=temp_dir)
            if warnings:
                self.confirm_message = "Overwrite existing output?\n" + "\n".join(warnings)
                return context.window_manager.invoke_confirm(self, event)
        return self.execute(context)

    def execute(self, context):
        """Build the manager from runtime prefs, start it, and install the modal timer."""
        global _MANAGER
        _cleanup_stale_manager()
        if _MANAGER is not None:
            self.report({'ERROR'}, "Multi-GPU job already running.")
            return {'CANCELLED'}

        is_video = _mgpu_is_video(context.scene)
        if is_video:
            # invoke() normally set this; re-resolve when execute() is called directly.
            if not self.forced_temp_dir:
                self.forced_temp_dir = self._resolve_video_temp_dir(context) or ""
            self.report({'INFO'}, "Video output detected. Rendering to a temp image sequence, then encoding.")

        p = context.window_manager.mgpu_runtime_prefs
        try:
            mgr = MultiGPUManager(
                context.scene,
                threads=p.threads_per_process,
                instances_per_gpu=p.instances_per_gpu,
                dispatch_mode=p.dispatch_mode,
                max_retries=p.max_retries,
                open_terms=p.open_terminals,
                ghost_mode=p.ghost_filter_mode,
                use_persistent_data=p.use_persistent_data,
                render_guard_tier=p.render_time_guard_tier,
                denoise_on_gpu=p.denoise_on_gpu
            )
            if is_video:
                mgr.video_mode = True
                mgr.video_output_path = bpy.path.abspath(context.scene.render.filepath)
                if self.forced_temp_dir:
                    mgr._forced_temp_dir = self.forced_temp_dir
            mgr.start()
        except Exception as e:
            self.report({'ERROR'}, str(e)); return {'CANCELLED'}

        _MANAGER = mgr
        wm = context.window_manager
        self._timer = wm.event_timer_add(0.25, window=context.window)
        wm.modal_handler_add(self)
        _log("Multi-GPU frames started.")
        return {'RUNNING_MODAL'}

    def modal(self, context, event):
        """Poll the manager on timer ticks; ESC cancels the whole job."""
        global _MANAGER
        if _MANAGER is None:
            return {'CANCELLED'}

        if event and event.type == 'ESC':
            _MANAGER.stop(); _MANAGER = None
            try: context.window_manager.event_timer_remove(self._timer)
            except Exception: pass
            self.report({'INFO'}, "Multi-GPU cancelled.")
            return {'CANCELLED'}

        if event.type == 'TIMER':
            try:
                done = _MANAGER.poll()
                if done:
                    try:
                        context.window_manager.event_timer_remove(self._timer)
                    except Exception:
                        pass
                    _MANAGER = None
                    # Force a redraw so the UI reflects completion immediately.
                    try:
                        bpy.ops.wm.redraw_timer(type='DRAW_WIN_SWAP', iterations=1)
                    except Exception:
                        pass
                    self.report({'INFO'}, "Multi-GPU render finished.")
                    return {'FINISHED'}
            except Exception as e:
                # Manager failure: tear everything down and surface the error.
                _log(f"Manager error: {e}")
                _MANAGER.stop(); _MANAGER = None
                try: context.window_manager.event_timer_remove(self._timer)
                except Exception: pass
                self.report({'ERROR'}, str(e))
                return {'CANCELLED'}
        return {'RUNNING_MODAL'}
|
|
# Operator: render a MARI-described batch of camera jobs across all workers.
# (No class docstring on purpose — Blender would surface it as the tooltip.)
class MGPU_OT_render_mari(bpy.types.Operator):
    bl_idname = "render.multi_gpu_mari"
    bl_label = "Render (Multi-Instance MARI)"
    bl_options = {'REGISTER', 'INTERNAL'}

    job_json: bpy.props.StringProperty(name="Jobs JSON")  # {"jobs":[{"cam_name":..., "H":..,"V":..},...]}
    mode: bpy.props.EnumProperty(items=[("FRAME","FRAME",""),("CIRCLE","CIRCLE","")], default="FRAME")
    action: bpy.props.EnumProperty(items=[("STILL","STILL",""),("ANIM","ANIM","")], default="STILL")

    _timer = None

    def invoke(self, context, event=None):
        """Validate jobs, export the .mari3d, spawn the worker manager, and go modal.

        Returns {'RUNNING_MODAL'} on success; {'CANCELLED'} with a report on any
        validation, export, or launch failure.
        """
        global _MANAGER
        _cleanup_stale_manager()
        if _MANAGER is not None:
            self.report({'ERROR'}, "Multi-Instance job already running.")
            return {'CANCELLED'}

        try:
            payload = json.loads(self.job_json or "{}")
            jobs = payload.get("jobs") or []
        except Exception as e:
            self.report({'ERROR'}, f"Bad job_json: {e}")
            return {'CANCELLED'}

        if not jobs:
            self.report({'ERROR'}, "No MARI jobs provided.")
            return {'CANCELLED'}

        # Prepare globals sent to children
        is_video = _mgpu_is_video(context.scene)
        if self.action == "STILL" and is_video:
            self.report({'ERROR'}, "Cannot render STILL directly to video. Switch to an image format or use ANIM.")
            return {'CANCELLED'}

        mari_prop = getattr(context.scene, "mari_props", None)
        mari_settings = {}
        if mari_prop:
            try:
                mari_settings = {
                    "frame_ratio": [float(mari_prop.frame_ratio[0]), float(mari_prop.frame_ratio[1])],
                    "frame_dimensions": [float(mari_prop.frame_dimensions[0]), float(mari_prop.frame_dimensions[1])],
                    "frame_center": [float(mari_prop.frame_center[0]), float(mari_prop.frame_center[1]), float(mari_prop.frame_center[2])],
                    "frame_rotation": [float(mari_prop.frame_rotation[0]), float(mari_prop.frame_rotation[1]), float(mari_prop.frame_rotation[2])],
                    "render_settings_filepath": bpy.path.abspath(getattr(mari_prop, "render_settings_filepath", "")),
                    "render_settings_name": str(getattr(mari_prop, "render_settings_name", "") or ""),
                    "render_settings_normalize": bool(getattr(mari_prop, "render_settings_normalize", False)),
                }
            except Exception:
                mari_settings = {}

        mari_globals = {"mode": "FRAME" if self.mode == "FRAME" else "CIRCLE",
                        "action": self.action,
                        "is_video": is_video,
                        "use_overwrite": bool(getattr(context.scene.render, "use_overwrite", True)),
                        "use_placeholder": bool(getattr(context.scene.render, "use_placeholder", False)),
                        "render_resolution_x": int(getattr(context.scene.render, "resolution_x", 0)),
                        "render_resolution_y": int(getattr(context.scene.render, "resolution_y", 0)),
                        "render_resolution_percentage": int(getattr(context.scene.render, "resolution_percentage", 100)),
                        "mari_settings": mari_settings}

        # Pre-export .mari3d and ensure the output folder is prepared exactly like MARI does
        # BUGFIX: pre-bind target_dir so the error report below cannot raise
        # NameError when the failure happens before the path is computed.
        target_dir = ""
        try:
            export_type = "FRAME" if self.mode == "FRAME" else "CIRCLE"
            # --- ensure the MARI output directory exists (mirrors MARI add-on) ---
            try:
                mari_prop = context.scene.mari_props
                base = bpy.path.abspath(getattr(mari_prop, "render_settings_filepath", ""))
                name = getattr(mari_prop, "render_settings_name", "").strip()
                target_dir = os.path.join(base, name)
                if context.scene.render.use_overwrite and os.path.isdir(target_dir):
                    shutil.rmtree(target_dir)
                os.makedirs(target_dir, exist_ok=True)
                print(f"[MGPU-PARENT] Ensured MARI output folder exists: {target_dir}")
            except Exception as _e:
                self.report({'ERROR'}, f"Could not prepare MARI folder: {target_dir} ({_e})")
                return {'CANCELLED'}
            # --------------------------------------------------------------------

            bpy.ops.mari.export_mari(action="RENDER", type=export_type, format=self.action)
            print(f"[MGPU-PARENT] Exported MARI .mari3d (type={export_type}, format={self.action})")
        except Exception as e:
            self.report({'ERROR'}, f"MARI export failed: {e}")
            return {'CANCELLED'}

        # Use same runtime prefs as frames operator
        p = context.window_manager.mgpu_runtime_prefs
        mgr = MultiGPUManager(context.scene, threads=p.threads_per_process,
                              instances_per_gpu=p.instances_per_gpu,
                              dispatch_mode=p.dispatch_mode, max_retries=p.max_retries,
                              open_terms=p.open_terminals,
                              use_persistent_data=p.use_persistent_data,
                              job_mode="MARI", mari_jobs=jobs, mari_globals=mari_globals,
                              render_guard_tier=p.render_time_guard_tier,
                              denoise_on_gpu=p.denoise_on_gpu)
        # BUGFIX: start() can raise (e.g. empty job list, no devices). Previously
        # the exception escaped with _MANAGER already assigned, leaving a stale
        # global manager behind; now we clean up and cancel like the frames op.
        try:
            mgr.start()  # will prepare blend copy, write child script, start server
        except Exception as e:
            try:
                mgr.stop()
            except Exception:
                pass
            self.report({'ERROR'}, str(e))
            return {'CANCELLED'}
        _MANAGER = mgr

        # Install modal timer for UI progress like frames op
        self._timer = context.window_manager.event_timer_add(0.3, window=context.window)
        context.window_manager.modal_handler_add(self)
        return {'RUNNING_MODAL'}

    def modal(self, context, event):
        """Poll the manager on timer ticks; ESC cancels the whole batch."""
        global _MANAGER
        # BUGFIX: guard against a vanished manager before dereferencing it.
        # Previously pressing ESC with _MANAGER already None raised AttributeError.
        if _MANAGER is None:
            try:
                context.window_manager.event_timer_remove(self._timer)
            except Exception:
                pass
            return {'CANCELLED'}

        if event.type == 'ESC' and getattr(event, "value", 'PRESS') == 'PRESS':
            _MANAGER.stop(); _MANAGER = None
            try:
                context.window_manager.event_timer_remove(self._timer)
            except Exception:
                pass
            self.report({'INFO'}, "Cancelled.")
            return {'CANCELLED'}

        if event.type == 'TIMER':
            if _MANAGER.poll():
                # finished
                try:
                    context.window_manager.event_timer_remove(self._timer)
                except Exception:
                    pass
                _MANAGER = None

                # Export .mari3d to the same folder the single-instance flow would use
                try:
                    bpy.ops.mari.export_mari('EXEC_DEFAULT')
                    self.report({'INFO'}, "Completed (exported .mari3d).")
                except Exception as e:
                    self.report({'WARNING'}, f"Completed (but .mari3d export failed: {e})")

                return {'FINISHED'}

        return {'RUNNING_MODAL'}
|
|
|
# Operator: stop the active multi-GPU job, if one is running.
class MGPU_OT_cancel(bpy.types.Operator):
    bl_idname = "render.multi_gpu_frames_cancel"
    bl_label = "Cancel Multi-GPU Frames"
    bl_options = {'INTERNAL'}

    def execute(self, context):
        global _MANAGER
        if not _MANAGER:
            self.report({'INFO'}, "No Multi-GPU job running.")
            return {'CANCELLED'}
        _MANAGER.stop()
        _MANAGER = None
        self.report({'INFO'}, "Multi-GPU job cancelled.")
        return {'FINISHED'}
|
# Operator: reveal the current job's log directory in the platform file browser.
class MGPU_OT_open_logs(bpy.types.Operator):
    bl_idname = "render.multi_gpu_frames_open_logs"
    bl_label = "Open Logs Folder"
    bl_options = {'INTERNAL'}

    def execute(self, context):
        global _MANAGER
        if not (_MANAGER and _MANAGER.logs_dir):
            self.report({'ERROR'}, "No job/logs available.")
            return {'CANCELLED'}
        path = _MANAGER.logs_dir
        if IS_WIN:
            os.startfile(path)  # noqa
        elif IS_MAC:
            subprocess.call(["open", path])
        else:
            subprocess.call(["xdg-open", path])
        return {'FINISHED'}
|
|
# Panel in Properties > Render: scheduler settings, GPU detection, and RAM estimate.
class MGPU_PT_panel(bpy.types.Panel):
    bl_label = "Multi-Instance Render"
    bl_space_type = 'PROPERTIES'
    bl_region_type = 'WINDOW'
    bl_context = "render"

    def draw(self, context):
        """Draw the big render button, scheduler prefs, GPU diagnostics, and RAM estimate."""
        layout = self.layout
        p = context.window_manager.mgpu_runtime_prefs

        # Primary action button.
        col = layout.column(align=True)
        row = col.row(align=True)
        row.scale_y = 1.4
        row.alert = True
        row.operator("render.multi_gpu_frames", icon='RENDER_STILL', text="Render (Multi-GPU Frames)")

        # Scheduler settings box (mirrors MGPU_RuntimePrefs properties).
        box = layout.box()
        box.label(text="Scheduler Settings")
        row = box.row(align=True)
        row.prop(p, "dispatch_mode")
        row = box.row(align=True)
        row.prop(p, "threads_per_process")
        row.prop(p, "max_retries")
        row = box.row(align=True)
        row.prop(p, "render_time_guard_tier")
        row = box.row(align=True)
        row.prop(p, "instances_per_gpu")
        row = box.row(align=True)
        row.prop(p, "use_persistent_data")
        row.prop(p, "open_terminals")
        row = box.row(align=True)
        row.prop(p, "denoise_on_gpu")
        row = box.row(align=True)
        row.prop(p, "use_target_dir_for_video_temp")
        box.label(text="Launch diagnostics: check console/logs for [MGPU-LAUNCH] reason codes.")
        if _mgpu_is_video(context.scene):
            # Show where the temp frame sequence will be written.
            temp_dir = _mgpu_video_temp_dir_for(
                context.scene,
                use_target_dir=bool(getattr(p, "use_target_dir_for_video_temp", True))
            )
            box.label(text=f"Video temp folder: {_shorten_path(temp_dir)}")

        # GPU detection box: compare legacy/strict scans and the resulting final set.
        box2 = layout.box()
        box2.label(text="GPU Detection")
        row = box2.row(align=True)
        row.alert = True
        row.label(text="Please Find your Correct number/listing of GPUs")
        row = box2.row(align=True)
        row.prop(p, "ghost_filter_mode", expand=True)

        backend = _current_compute_type()
        legacy = _detect_gpu_devices_legacy(False)
        strict = _detect_gpu_devices_strict(True)
        final = _detect_gpu_devices_final_from_lists(p.ghost_filter_mode, legacy, strict)
        box2.label(text=f"Compute backend: {backend}")
        box2.label(text="Note: GPU index order follows NVIDIA/nvidia-smi and may differ from Windows Task Manager numbering.")
        final_note = ""
        if (not final) and (bpy.context.scene.render.engine == "CYCLES"):
            final_note = " (no mapped GPU; CPU worker only if Cycles CPU device is enabled)"
        box2.label(text=f"Legacy:{len(legacy)} Strict:{len(strict)} Final:{len(final)}{final_note}")
        if len(final) < len(strict):
            box2.label(text="Note: Final < Strict; ghost filter may be excluding one or more GPUs.")
        # Map the selected devices to UUIDs and hide entries that could not be resolved.
        mapped_all = _map_selection_to_uuids(final)
        mapped, dropped_unknown = _filter_known_mapped_gpus(mapped_all)
        if dropped_unknown:
            box2.label(text=f"Hidden unresolved GPU entries: {len(dropped_unknown)} (index '?').")
        if len(mapped) < len(final):
            box2.label(text="Note: Mapping lost devices; check [MGPU-GPUSEL] and [MGPU-LAUNCH] logs.")
        # Cross-check against nvidia-smi's physical GPU list, when available.
        phys = _win_query_nvidia_smi_detailed() or []
        if phys:
            mapped_phys = {m.get("phys_index") for m in mapped if m.get("phys_index") is not None}
            if len(mapped_phys) < len(phys):
                phys_idx = {g.get("index") for g in phys if g.get("index") is not None}
                missing_idx = sorted([i for i in phys_idx if i not in mapped_phys])
                box2.label(text=f"Warning: NVIDIA physical GPUs={len(phys)} but mapped={len(mapped_phys)}.")
                if missing_idx:
                    box2.label(text=f"Unmapped NVIDIA index(es): {', '.join(str(i) for i in missing_idx)}")
                box2.label(text="Missing GPUs can be backend/type filtered; check [MGPU-GPUSEL] logs.")
        for m in mapped:
            label = f" [GPU {m['phys_index'] if m['phys_index'] is not None else '?'}] {m.get('name') or '?'}"
            c_nm = str(m.get("cycles_name") or "")
            p_nm = str(m.get("name") or "")
            # Show the Cycles device row name when it differs from the physical name.
            if c_nm and _normalize_gpu_name(c_nm) != _normalize_gpu_name(p_nm):
                label += f" (Cycles row: {c_nm})"
            box2.label(text=label)

        # RAM estimate box: rough child-memory projection from the parent RSS.
        est = layout.box()
        rss = _proc_rss_bytes()
        avail = _sys_mem_available_bytes()
        n_workers = (len(mapped) if mapped else 1) * (p.instances_per_gpu if mapped else 1)
        # Heuristic: each child is assumed to need ~80% of the parent RSS, min 512 MiB.
        per_child = None if rss is None else max(int(rss * 0.8), 512 * 1024 * 1024)
        total_need = None if (per_child is None) else per_child * n_workers
        msg = f"Instances planned: {n_workers} | Blender RSS: {_fmt_bytes(rss) if rss else '?'}"
        est.label(text=msg)
        if total_need is not None and avail is not None:
            risk = " (high risk of OOM)" if total_need > avail*0.8 else ""
            est.label(text=f"Estimated RAM needed: {_fmt_bytes(total_need)} | Free: {_fmt_bytes(avail)}{risk}")
        else:
            est.label(text="RAM estimate not available on this platform (ok to ignore).")

        # Promo box, shown only when the MARI add-on is not installed.
        if not _mgpu_has_mari_addon():
            ad = layout.box()
            ad.label(text="Render & Share Holographic 3D Images!!")
            row = ad.row(align=True)
            row.scale_y = 1.3
            row.alert = True
            row.operator("wm.url_open", text="holomari.com", icon='URL').url = "https://holomari.com/info/index"
|
# ----------------------- registration -----------------------
|
|
|
|
# Minimal AddonPreferences stub: all runtime options live on the WindowManager panel.
class MGPU_RuntimePrefsReg(bpy.types.AddonPreferences):
    bl_idname = ADDON_KEY

    def draw(self, context):
        layout = self.layout
        layout.label(text="Use the panel in Render Properties â–¸ Multi-GPU Frames.")
|
def _add_keymap():
    """Bind our operator to Ctrl+F12 (animation), and remove any old F12 binding we created."""
    kc = bpy.context.window_manager.keyconfigs.addon
    if not kc:
        return
    # Drop any stale plain-F12 bindings left behind by earlier versions.
    for km_name in ("Screen", "Window"):
        km = kc.keymaps.get(km_name)
        if not km:
            continue
        stale = [kmi for kmi in km.keymap_items
                 if kmi.idname == "render.multi_gpu_frames"
                 and kmi.type == 'F12' and not kmi.ctrl]
        for kmi in stale:
            try:
                km.keymap_items.remove(kmi)
            except Exception:
                pass

    km = kc.keymaps.new(name="Screen", space_type="EMPTY", region_type='WINDOW')
    kmi = km.keymap_items.new("render.multi_gpu_frames", 'F12', 'PRESS', ctrl=True)
    _KM_ITEMS.append((km, kmi))
|
|
|
def _remove_keymap():
    """Remove only the keymap items we added during this session."""
    while _KM_ITEMS:
        km, kmi = _KM_ITEMS.pop()
        try:
            km.keymap_items.remove(kmi)
        except Exception:
            pass
|
|
|
def register():
    """Register all add-on classes, the runtime prefs pointer, and the keymap."""
    # MGPU_RuntimePrefs must be registered before the PointerProperty that uses it.
    classes = (
        MGPU_RuntimePrefs,
        MGPU_RuntimePrefsReg,
        MGPU_OT_render_frames,
        MGPU_OT_cancel,
        MGPU_OT_open_logs,
        MGPU_PT_panel,
        MGPU_OT_render_mari,
    )
    for cls in classes:
        bpy.utils.register_class(cls)
    bpy.types.WindowManager.mgpu_runtime_prefs = bpy.props.PointerProperty(type=MGPU_RuntimePrefs)
    _add_keymap()
|
def unregister():
    """Tear down the keymap, the prefs pointer, and all registered classes (best-effort)."""
    _remove_keymap()
    try:
        del bpy.types.WindowManager.mgpu_runtime_prefs
    except Exception:
        pass
    for cls in (
        MGPU_PT_panel,
        MGPU_OT_open_logs,
        MGPU_OT_cancel,
        MGPU_OT_render_frames,
        MGPU_OT_render_mari,
        MGPU_RuntimePrefsReg,
        MGPU_RuntimePrefs,
    ):
        try:
            bpy.utils.unregister_class(cls)
        except Exception:
            pass