1 bulan lalu · 4490c6492a
--- a/services/runtime-service/app/api/routes.py
+++ b/services/runtime-service/app/api/routes.py
@@ -3,7 +3,11 @@ from sqlalchemy import text
 
				 from sqlalchemy.orm import Session
			
 
				 
			
 
				 from core_domain import ServiceHealth
			
 
				-from app.application.services import RuntimeApplicationService, build_runtime_application_service
			
 
				+from app.application.services import (
			
 
				+    RuntimeApplicationService,
			
 
				+    RuntimeDebugSnapshot,
			
 
				+    build_runtime_application_service,
			
 
				+)
			
 
				 from app.bootstrap.settings import RuntimeServiceSettings
			
 
				 from app.db.session import get_db
			
 
				 from app.infrastructure.code_runner_client import CodeRunnerClientError
			
@@ -23,6 +27,9 @@ from app.schemas.run import (
 
				     RunCreateRequest,
			
 
				     RunExecuteRequest,
			
 
				     RunExecuteResponse,
			
 
				+    RuntimeDebugContinueRequest,
			
 
				+    RuntimeDebugSnapshotResponse,
			
 
				+    RuntimeDebugStepResponse,
			
 
				     TraceSpanResponse,
			
 
				     WorkerExecuteNextRequest,
			
 
				     WorkerExecuteNextResponse,
			
@@ -33,6 +40,35 @@ from app.schemas.run import (
 
				 router = APIRouter()
			
 
				 
			
 
				 
			
 
				+def build_runtime_debug_snapshot_response(snapshot: RuntimeDebugSnapshot) -> RuntimeDebugSnapshotResponse:
			
 
				+    return RuntimeDebugSnapshotResponse(
			
 
				+        run=WorkflowRunResponse.from_entity(snapshot.run),
			
 
				+        node_runs=[
			
 
				+            NodeRunResponse.from_entity(item)
			
 
				+            for item in snapshot.node_runs
			
 
				+        ],
			
 
				+        run_state_json=snapshot.run_state_json,
			
 
				+        node_output_json_by_node_id=snapshot.node_output_json_by_node_id,
			
 
				+        node_output_text_by_node_id=snapshot.node_output_text_by_node_id,
			
 
				+        queued_node_ids=snapshot.queued_node_ids,
			
 
				+        running_node_ids=snapshot.running_node_ids,
			
 
				+        completed_node_ids=snapshot.completed_node_ids,
			
 
				+        failed_node_ids=snapshot.failed_node_ids,
			
 
				+        execution_logs=[
			
 
				+            ExecutionLogResponse.from_entity(item)
			
 
				+            for item in snapshot.execution_logs
			
 
				+        ],
			
 
				+        node_artifacts=[
			
 
				+            NodeArtifactResponse.from_entity(item)
			
 
				+            for item in snapshot.node_artifacts
			
 
				+        ],
			
 
				+        trace_spans=[
			
 
				+            TraceSpanResponse.from_entity(item)
			
 
				+            for item in snapshot.trace_spans
			
 
				+        ],
			
 
				+    )
			
 
				+
			
 
				+
			
 
				 def get_runtime_settings() -> RuntimeServiceSettings:
			
 
				     return RuntimeServiceSettings()
			
 
				 
			
@@ -268,6 +304,89 @@ def execute_run(
 
				     )
			
 
				 
			
 
				 
			
 
				+@router.get("/runs/{run_id}/debug/snapshot", response_model=RuntimeDebugSnapshotResponse)
			
 
				+def get_runtime_debug_snapshot(
			
 
				+    run_id: str,
			
 
				+    tenant_id: str = Query(...),
			
 
				+    service: RuntimeApplicationService = Depends(get_runtime_application_service),
			
 
				+) -> RuntimeDebugSnapshotResponse:
			
 
				+    snapshot = service.get_debug_snapshot(tenant_id=tenant_id, run_id=run_id)
			
 
				+    if snapshot is None:
			
 
				+        raise HTTPException(status_code=404, detail=f"workflow_run not found: {run_id}")
			
 
				+    return build_runtime_debug_snapshot_response(snapshot)
			
 
				+
			
 
				+
			
 
				+@router.post("/runs/{run_id}/debug/pause", response_model=RuntimeDebugSnapshotResponse)
			
 
				+def pause_runtime_debug_run(
			
 
				+    run_id: str,
			
 
				+    tenant_id: str = Query(...),
			
 
				+    service: RuntimeApplicationService = Depends(get_runtime_application_service),
			
 
				+) -> RuntimeDebugSnapshotResponse:
			
 
				+    snapshot = service.pause_run(tenant_id=tenant_id, run_id=run_id)
			
 
				+    if snapshot is None:
			
 
				+        raise HTTPException(status_code=404, detail=f"workflow_run not found: {run_id}")
			
 
				+    return build_runtime_debug_snapshot_response(snapshot)
			
 
				+
			
 
				+
			
 
				+@router.post("/runs/{run_id}/debug/resume", response_model=RuntimeDebugSnapshotResponse)
			
 
				+def resume_runtime_debug_run(
			
 
				+    run_id: str,
			
 
				+    tenant_id: str = Query(...),
			
 
				+    service: RuntimeApplicationService = Depends(get_runtime_application_service),
			
 
				+) -> RuntimeDebugSnapshotResponse:
			
 
				+    snapshot = service.resume_run(tenant_id=tenant_id, run_id=run_id)
			
 
				+    if snapshot is None:
			
 
				+        raise HTTPException(status_code=404, detail=f"workflow_run not found: {run_id}")
			
 
				+    return build_runtime_debug_snapshot_response(snapshot)
			
 
				+
			
 
				+
			
 
				+@router.post("/runs/{run_id}/debug/step", response_model=RuntimeDebugStepResponse)
			
 
				+def step_runtime_debug_run(
			
 
				+    run_id: str,
			
 
				+    payload: NodeRunExecuteRequest,
			
 
				+    tenant_id: str = Query(...),
			
 
				+    service: RuntimeApplicationService = Depends(get_runtime_application_service),
			
 
				+) -> RuntimeDebugStepResponse:
			
 
				+    result = service.step_debug_run(
			
 
				+        tenant_id=tenant_id,
			
 
				+        run_id=run_id,
			
 
				+        worker_key=payload.worker_key,
			
 
				+    )
			
 
				+    if result is None:
			
 
				+        raise HTTPException(status_code=404, detail=f"workflow_run not found: {run_id}")
			
 
				+    snapshot, executed_node_runs, executor_names, reason = result
			
 
				+    return RuntimeDebugStepResponse(
			
 
				+        snapshot=build_runtime_debug_snapshot_response(snapshot),
			
 
				+        executed_node_runs=[NodeRunResponse.from_entity(item) for item in executed_node_runs],
			
 
				+        executor_names=executor_names,
			
 
				+        reason=reason,
			
 
				+    )
			
 
				+
			
 
				+
			
 
				+@router.post("/runs/{run_id}/debug/continue", response_model=RuntimeDebugStepResponse)
			
 
				+def continue_runtime_debug_run(
			
 
				+    run_id: str,
			
 
				+    payload: RuntimeDebugContinueRequest,
			
 
				+    tenant_id: str = Query(...),
			
 
				+    service: RuntimeApplicationService = Depends(get_runtime_application_service),
			
 
				+) -> RuntimeDebugStepResponse:
			
 
				+    result = service.continue_debug_run(
			
 
				+        tenant_id=tenant_id,
			
 
				+        run_id=run_id,
			
 
				+        payload=payload,
			
 
				+    )
			
 
				+    if result is None:
			
 
				+        raise HTTPException(status_code=404, detail=f"workflow_run not found: {run_id}")
			
 
				+    snapshot, executed_node_runs, executor_names, paused_before_node_id, reason = result
			
 
				+    return RuntimeDebugStepResponse(
			
 
				+        snapshot=build_runtime_debug_snapshot_response(snapshot),
			
 
				+        executed_node_runs=[NodeRunResponse.from_entity(item) for item in executed_node_runs],
			
 
				+        executor_names=executor_names,
			
 
				+        paused_before_node_id=paused_before_node_id,
			
 
				+        reason=reason,
			
 
				+    )
			
 
				+
			
 
				+
			
 
				 @router.post("/node-runs/{node_run_id}/resume-human", response_model=NodeRunExecuteResponse)
			
 
				 def resume_human_node_run(
			
 
				     node_run_id: str,
			
--- a/services/runtime-service/app/application/services.py
+++ b/services/runtime-service/app/application/services.py
@@ -1,3 +1,4 @@
 
				+from dataclasses import dataclass
			
 
				 from datetime import datetime, timedelta
			
 
				 
			
 
				 from sqlalchemy.orm import Session
			
@@ -13,7 +14,7 @@ from core_domain import (
 
				     WorkflowRunStatus,
			
 
				 )
			
 
				 
			
 
				-from app.db.models import NodeRun, WorkflowRun
			
 
				+from app.db.models import ExecutionLog, NodeArtifact, NodeRun, TraceSpan, WorkflowRun
			
 
				 from app.domain.repositories import (
			
 
				     ExecutionLogRepository,
			
 
				     NodeArtifactRepository,
			
@@ -43,12 +44,29 @@ from app.schemas.run import (
 
				     NodeRunStatusUpdateRequest,
			
 
				     RunCreateRequest,
			
 
				     RunExecuteRequest,
			
 
				+    RuntimeDebugContinueRequest,
			
 
				     WorkflowRunStatusUpdateRequest,
			
 
				 )
			
 
				 from core_shared import JSONValue, try_build_redis_client
			
 
				 from core_shared.task_queue import TaskQueuePublisher
			
 
				 
			
 
				 
			
 
@dataclass(frozen=True)
class RuntimeDebugSnapshot:
    """Immutable bundle of the state the runtime debugger exposes for one run.

    Instances are assembled by ``RuntimeApplicationService._build_debug_snapshot``.
    """

    # The workflow run the snapshot describes.
    run: WorkflowRun
    # Node runs listed for that run (tenant + run scoped).
    node_runs: list[NodeRun]
    # The three maps below are the values returned by ``_build_run_state_maps``.
    run_state_json: dict[str, JSONValue]
    node_output_json_by_node_id: dict[str, dict[str, JSONValue]]
    node_output_text_by_node_id: dict[str, str]
    # Node ids whose node run status is "pending" or "queued".
    queued_node_ids: list[str]
    # Node ids whose node run status is "running".
    running_node_ids: list[str]
    # Node ids whose node run status is "completed" or "skipped".
    completed_node_ids: list[str]
    # Node ids whose node run status is "failed".
    failed_node_ids: list[str]
    # Records listed from the respective repositories for the same tenant/run.
    execution_logs: list[ExecutionLog]
    node_artifacts: list[NodeArtifact]
    trace_spans: list[TraceSpan]
			
 
				+
			
 
				+
			
 
				 class RuntimeApplicationService:
			
 
				     def __init__(
			
 
				         self,
			
@@ -197,6 +215,17 @@ class RuntimeApplicationService:
 
				             span_type=span_type,
			
 
				         )
			
 
				 
			
 
				+    def get_debug_snapshot(
			
 
				+        self,
			
 
				+        *,
			
 
				+        tenant_id: str,
			
 
				+        run_id: str,
			
 
				+    ) -> RuntimeDebugSnapshot | None:
			
 
				+        workflow_run = self.workflow_run_repository.get_by_id(run_id)
			
 
				+        if workflow_run is None or workflow_run.tenant_id != tenant_id:
			
 
				+            return None
			
 
				+        return self._build_debug_snapshot(workflow_run)
			
 
				+
			
 
				     def update_run_status(
			
 
				         self,
			
 
				         run_id: str,
			
@@ -295,6 +324,9 @@ class RuntimeApplicationService:
 
				         if workflow_run is None:
			
 
				             return None
			
 
				 
			
 
				+        if workflow_run.status == "paused":
			
 
				+            return workflow_run, node_run, "debug_paused"
			
 
				+
			
 
				         if node_run.status in {"completed", "failed", "skipped"}:
			
 
				             executor_name = self.execution_dispatcher.resolve_executor(
			
 
				                 node_run.node_type
			
@@ -536,6 +568,200 @@ class RuntimeApplicationService:
 
				             return None
			
 
				         return final_run, executed_node_runs, executor_names
			
 
				 
			
 
				+    def pause_run(
			
 
				+        self,
			
 
				+        *,
			
 
				+        tenant_id: str,
			
 
				+        run_id: str,
			
 
				+        reason: str = "debug_pause",
			
 
				+    ) -> RuntimeDebugSnapshot | None:
			
 
				+        workflow_run = self.workflow_run_repository.get_by_id(run_id)
			
 
				+        if workflow_run is None or workflow_run.tenant_id != tenant_id:
			
 
				+            return None
			
 
				+        self._sync_workflow_run_status_from_nodes(tenant_id=tenant_id, run_id=run_id)
			
 
				+        latest_run = self.workflow_run_repository.get_by_id(run_id)
			
 
				+        if latest_run is None:
			
 
				+            return None
			
 
				+        if latest_run.status in {"completed", "failed", "cancelled"}:
			
 
				+            paused_run = latest_run
			
 
				+        else:
			
 
				+            paused_run = self.workflow_run_repository.update_status(
			
 
				+                run_id=run_id,
			
 
				+                status="paused",
			
 
				+            )
			
 
				+            if paused_run is None:
			
 
				+                return None
			
 
				+        self._log_event(
			
 
				+            tenant_id=tenant_id,
			
 
				+            run_id=run_id,
			
 
				+            node_run_id=None,
			
 
				+            event_type="debug_run_paused",
			
 
				+            message=f"workflow run paused: {reason}",
			
 
				+            detail_json={"reason": reason},
			
 
				+        )
			
 
				+        return self._build_debug_snapshot(paused_run)
			
 
				+
			
 
				+    def resume_run(
			
 
				+        self,
			
 
				+        *,
			
 
				+        tenant_id: str,
			
 
				+        run_id: str,
			
 
				+        reason: str = "debug_resume",
			
 
				+    ) -> RuntimeDebugSnapshot | None:
			
 
				+        workflow_run = self.workflow_run_repository.get_by_id(run_id)
			
 
				+        if workflow_run is None or workflow_run.tenant_id != tenant_id:
			
 
				+            return None
			
 
				+        resumed_run = self.workflow_run_repository.update_status(
			
 
				+            run_id=run_id,
			
 
				+            status="running",
			
 
				+        )
			
 
				+        if resumed_run is None:
			
 
				+            return None
			
 
				+        self._log_event(
			
 
				+            tenant_id=tenant_id,
			
 
				+            run_id=run_id,
			
 
				+            node_run_id=None,
			
 
				+            event_type="debug_run_resumed",
			
 
				+            message=f"workflow run resumed: {reason}",
			
 
				+            detail_json={"reason": reason},
			
 
				+        )
			
 
				+        return self._build_debug_snapshot(resumed_run)
			
 
				+
			
 
    def step_debug_run(
        self,
        *,
        tenant_id: str,
        run_id: str,
        worker_key: str | None = None,
    ) -> tuple[RuntimeDebugSnapshot, list[NodeRun], list[str], str] | None:
        """Execute at most one node of a run, then leave the run paused.

        A paused run is first switched back to ``"running"``, then
        ``execute_next_node_run`` is invoked once. Afterwards the stored status
        is refreshed and, unless the run is ``"completed"``, ``"failed"`` or
        ``"cancelled"``, it is set to ``"paused"`` again. A
        ``debug_step_finished`` event is logged before returning.

        Returns:
            ``(snapshot, executed_node_runs, executor_names, reason)`` where
            *reason* is ``"step_completed"`` when a node run was returned by the
            executor call and ``"no_queued_node"`` otherwise; ``None`` when the
            run is missing, belongs to another tenant, or the repository
            returns no run afterwards.
        """
        run_repo = self.workflow_run_repository
        current_run = run_repo.get_by_id(run_id)
        if current_run is None or current_run.tenant_id != tenant_id:
            return None
        if current_run.status == "paused":
            run_repo.update_status(run_id=run_id, status="running")

        step_outcome = self.execute_next_node_run(
            tenant_id=tenant_id,
            run_id=run_id,
            payload=NodeRunExecuteRequest(worker_key=worker_key),
        )
        if step_outcome is None:
            stepped_node_runs: list[NodeRun] = []
            stepped_executor_names: list[str] = []
            stop_reason = "no_queued_node"
        else:
            _run, stepped_node_run, stepped_executor_name = step_outcome
            stepped_node_runs = [stepped_node_run]
            stepped_executor_names = [stepped_executor_name]
            stop_reason = "step_completed"

        self._sync_workflow_run_status_from_nodes(tenant_id=tenant_id, run_id=run_id)
        snapshot_run = run_repo.get_by_id(run_id)
        if snapshot_run is None:
            return None
        if snapshot_run.status not in {"completed", "failed", "cancelled"}:
            # Non-terminal runs go back to "paused" after the single step.
            snapshot_run = run_repo.update_status(run_id=run_id, status="paused")
            if snapshot_run is None:
                return None

        last_node_run_id = stepped_node_runs[-1].id if stepped_node_runs else None
        self._log_event(
            tenant_id=tenant_id,
            run_id=run_id,
            node_run_id=last_node_run_id,
            event_type="debug_step_finished",
            message=f"debug step finished: {stop_reason}",
            detail_json={
                "reason": stop_reason,
                "executed_node_ids": [node_run.node_id for node_run in stepped_node_runs],
            },
        )
        debug_snapshot = self._build_debug_snapshot(snapshot_run)
        return debug_snapshot, stepped_node_runs, stepped_executor_names, stop_reason
			
 
				+
			
 
				+    def continue_debug_run(
			
 
				+        self,
			
 
				+        *,
			
 
				+        tenant_id: str,
			
 
				+        run_id: str,
			
 
				+        payload: RuntimeDebugContinueRequest,
			
 
				+    ) -> tuple[RuntimeDebugSnapshot, list[NodeRun], list[str], str | None, str] | None:
			
 
				+        workflow_run = self.workflow_run_repository.get_by_id(run_id)
			
 
				+        if workflow_run is None or workflow_run.tenant_id != tenant_id:
			
 
				+            return None
			
 
				+        if workflow_run.status == "paused":
			
 
				+            self.workflow_run_repository.update_status(run_id=run_id, status="running")
			
 
				+
			
 
				+        breakpoint_node_ids = set(payload.breakpoint_node_ids)
			
 
				+        executed_node_runs: list[NodeRun] = []
			
 
				+        executor_names: list[str] = []
			
 
				+        paused_before_node_id: str | None = None
			
 
				+        reason = "completed"
			
 
				+
			
 
				+        for _ in range(payload.max_steps):
			
 
				+            next_node_run = self.node_run_repository.get_next_queued_by_run(
			
 
				+                tenant_id=tenant_id,
			
 
				+                run_id=run_id,
			
 
				+            )
			
 
				+            if next_node_run is None:
			
 
				+                reason = "no_queued_node"
			
 
				+                break
			
 
				+            if next_node_run.node_id in breakpoint_node_ids:
			
 
				+                paused_before_node_id = next_node_run.node_id
			
 
				+                reason = "breakpoint_hit"
			
 
				+                break
			
 
				+
			
 
				+            step_result = self.execute_node_run(
			
 
				+                node_run_id=next_node_run.id,
			
 
				+                payload=NodeRunExecuteRequest(worker_key=payload.worker_key),
			
 
				+            )
			
 
				+            if step_result is None:
			
 
				+                reason = "node_not_found"
			
 
				+                break
			
 
				+            _, node_run, executor_name = step_result
			
 
				+            executed_node_runs.append(node_run)
			
 
				+            executor_names.append(executor_name)
			
 
				+            if node_run.status != "completed":
			
 
				+                reason = f"node_{node_run.status}"
			
 
				+                break
			
 
				+        else:
			
 
				+            reason = "max_steps_reached"
			
 
				+
			
 
				+        self._sync_workflow_run_status_from_nodes(tenant_id=tenant_id, run_id=run_id)
			
 
				+        latest_run = self.workflow_run_repository.get_by_id(run_id)
			
 
				+        if latest_run is None:
			
 
				+            return None
			
 
				+        if latest_run.status in {"completed", "failed", "cancelled"}:
			
 
				+            paused_run = latest_run
			
 
				+        else:
			
 
				+            paused_run = self.workflow_run_repository.update_status(
			
 
				+                run_id=run_id,
			
 
				+                status="paused",
			
 
				+            )
			
 
				+            if paused_run is None:
			
 
				+                return None
			
 
				+        self._log_event(
			
 
				+            tenant_id=tenant_id,
			
 
				+            run_id=run_id,
			
 
				+            node_run_id=executed_node_runs[-1].id if executed_node_runs else None,
			
 
				+            event_type="debug_continue_paused",
			
 
				+            message=f"debug continue paused: {reason}",
			
 
				+            detail_json={
			
 
				+                "reason": reason,
			
 
				+                "paused_before_node_id": paused_before_node_id,
			
 
				+                "executed_node_ids": [item.node_id for item in executed_node_runs],
			
 
				+                "breakpoint_node_ids": list(breakpoint_node_ids),
			
 
				+            },
			
 
				+        )
			
 
				+        return (
			
 
				+            self._build_debug_snapshot(paused_run),
			
 
				+            executed_node_runs,
			
 
				+            executor_names,
			
 
				+            paused_before_node_id,
			
 
				+            reason,
			
 
				+        )
			
 
				+
			
 
				     def resume_human_node_run(
			
 
				         self,
			
 
				         *,
			
@@ -827,6 +1053,45 @@ class RuntimeApplicationService:
 
				 
			
 
				         return run_state_json, node_output_json_by_node_id, node_output_text_by_node_id
			
 
				 
			
 
    def _build_debug_snapshot(self, workflow_run: WorkflowRun) -> RuntimeDebugSnapshot:
        """Collect the debugger view of *workflow_run* into one snapshot.

        Node runs, run-state maps, execution logs, node artifacts and trace
        spans are all fetched for the run's own tenant id and run id. Node ids
        are grouped by node-run status: pending/queued, running,
        completed/skipped and failed; other statuses appear in no group.
        """
        scope = {"tenant_id": workflow_run.tenant_id, "run_id": workflow_run.id}
        node_runs = self.node_run_repository.list_by_run(**scope)
        state_json, output_json_by_node, output_text_by_node = self._build_run_state_maps(**scope)

        queued_ids: list[str] = []
        running_ids: list[str] = []
        completed_ids: list[str] = []
        failed_ids: list[str] = []
        # Several statuses may feed the same bucket.
        bucket_by_status = {
            "pending": queued_ids,
            "queued": queued_ids,
            "running": running_ids,
            "completed": completed_ids,
            "skipped": completed_ids,
            "failed": failed_ids,
        }
        for node_run in node_runs:
            bucket = bucket_by_status.get(node_run.status)
            if bucket is not None:
                bucket.append(node_run.node_id)

        return RuntimeDebugSnapshot(
            run=workflow_run,
            node_runs=node_runs,
            run_state_json=state_json,
            node_output_json_by_node_id=output_json_by_node,
            node_output_text_by_node_id=output_text_by_node,
            queued_node_ids=queued_ids,
            running_node_ids=running_ids,
            completed_node_ids=completed_ids,
            failed_node_ids=failed_ids,
            execution_logs=self.execution_log_repository.list_by_scope(**scope),
            node_artifacts=self.node_artifact_repository.list_by_scope(**scope),
            trace_spans=self.trace_span_repository.list_by_scope(**scope),
        )
			
 
				+
			
 
				     def _build_node_timing(
			
 
				         self,
			
 
				         node_config_json: dict[str, JSONValue],
			
--- a/services/runtime-service/app/schemas/run.py
+++ b/services/runtime-service/app/schemas/run.py
@@ -75,6 +75,12 @@ class RunExecuteResponse(BaseModel):
 
				     executor_names: list[str]
			
 
				 
			
 
				 
			
 
# Request body of the debug "continue" endpoint.
class RuntimeDebugContinueRequest(BaseModel):
    # Passed on as the worker key of every node execution request that
    # continue_debug_run issues; optional.
    worker_key: str | None = None
    # Upper bound on continue-loop iterations; validated to 1..500, default 32.
    max_steps: int = Field(default=32, ge=1, le=500)
    # Node ids acting as breakpoints: the loop stops before executing a queued
    # node whose id is listed here. Empty by default.
    breakpoint_node_ids: list[str] = Field(default_factory=list)
			
 
				+
			
 
				+
			
 
				 class WorkerExecuteNextRequest(BaseModel):
			
 
				     worker_key: str
			
 
				     lease_seconds: int | None = Field(default=None, gt=0)
			
@@ -149,3 +155,26 @@ class TraceSpanResponse(BaseModel):
 
				     @classmethod
			
 
				     def from_entity(cls, entity: "TraceSpan") -> "TraceSpanResponse":
			
 
				         return cls.model_validate(entity, from_attributes=True)
			
 
				+
			
 
				+
			
 
# API representation of a runtime debug snapshot; its fields mirror the
# service-layer RuntimeDebugSnapshot one to one.
class RuntimeDebugSnapshotResponse(BaseModel):
    # The workflow run and its node runs, as response models.
    run: WorkflowRunResponse
    node_runs: list[NodeRunResponse]
    # Run-level state and per-node outputs (JSON and text), keyed by node id.
    run_state_json: dict[str, JSONValue]
    node_output_json_by_node_id: dict[str, dict[str, JSONValue]]
    node_output_text_by_node_id: dict[str, str]
    # Node ids grouped by node-run status.
    queued_node_ids: list[str]
    running_node_ids: list[str]
    completed_node_ids: list[str]
    failed_node_ids: list[str]
    # Logs, artifacts and trace spans attached to the run.
    execution_logs: list[ExecutionLogResponse]
    node_artifacts: list[NodeArtifactResponse]
    trace_spans: list[TraceSpanResponse]
			
 
				+
			
 
				+
			
 
# Response body shared by the debug "step" and "continue" endpoints.
class RuntimeDebugStepResponse(BaseModel):
    # Snapshot of the run taken after the step / continue finished.
    snapshot: RuntimeDebugSnapshotResponse
    # Node runs handled by this call and the executor name reported for each.
    executed_node_runs: list[NodeRunResponse]
    executor_names: list[str]
    # Only supplied by the "continue" endpoint; the "step" endpoint leaves the
    # default None.
    paused_before_node_id: str | None = None
    # Short machine-readable stop reason, e.g. "step_completed" or "breakpoint_hit".
    reason: str
			
--- a/tests/test_runtime_debugger.py
+++ b/tests/test_runtime_debugger.py
@@ -0,0 +1,151 @@
 
				+from __future__ import annotations
			
 
				+
			
 
				+from collections.abc import Generator
			
 
				+from datetime import datetime
			
 
				+from pathlib import Path
			
 
				+
			
 
				+from sqlalchemy.orm import Session
			
 
				+
			
 
				+from tests.conftest import build_fastapi_test_client, prepare_service_import
			
 
				+
			
 
				+
			
 
				+def test_runtime_debugger_pause_step_and_breakpoint_continue(tmp_path: Path) -> None:
			
 
				+    prepare_service_import(
			
 
				+        "runtime-service",
			
 
				+        libs=("core-domain", "core-shared", "core-db", "core-events", "core-dsl"),
			
 
				+    )
			
 
				+
			
 
				+    from app.api.routes import get_runtime_application_service
			
 
				+    from app.application.services import RuntimeApplicationService
			
 
				+    from app.bootstrap.app import create_app
			
 
				+    from app.bootstrap.settings import RuntimeServiceSettings
			
 
				+    from app.db.session import build_session_factory
			
 
				+    from app.domain.repositories import (
			
 
				+        ExecutionLogRepository,
			
 
				+        NodeArtifactRepository,
			
 
				+        NodeRunRepository,
			
 
				+        TraceSpanRepository,
			
 
				+        WorkflowRunRepository,
			
 
				+    )
			
 
				+    from app.infrastructure.executors import build_node_execution_dispatcher
			
 
				+    from core_db import Base
			
 
				+    from core_domain import WorkflowVersionContract
			
 
				+
			
 
				+    class FakeWorkflowClient:
			
 
				+        def get_workflow_version(
			
 
				+            self,
			
 
				+            *,
			
 
				+            tenant_id: str,
			
 
				+            workflow_version_id: str,
			
 
				+        ) -> WorkflowVersionContract:
			
 
				+            assert tenant_id == "t1"
			
 
				+            assert workflow_version_id == "wv1"
			
 
				+            return WorkflowVersionContract(
			
 
				+                id="wv1",
			
 
				+                tenant_id="t1",
			
 
				+                workflow_id="wf1",
			
 
				+                version_no=1,
			
 
				+                status="published",
			
 
				+                created_time=datetime.utcnow(),
			
 
				+                dsl_json={
			
 
				+                    "code": "debug_flow",
			
 
				+                    "nodes": [
			
 
				+                        {"id": "start", "type": "template", "config": {"template": "hello"}},
			
 
				+                        {"id": "answer", "type": "answer", "config": {"text": "done"}},
			
 
				+                    ],
			
 
				+                    "edges": [{"source": "start", "target": "answer"}],
			
 
				+                },
			
 
				+            )
			
 
				+
			
 
				+    settings = RuntimeServiceSettings(database_url=f"sqlite:///{tmp_path / 'runtime.db'}")
			
 
				+    session_factory = build_session_factory(settings)
			
 
				+    engine = session_factory.kw["bind"]
			
 
				+    Base.metadata.create_all(bind=engine)
			
 
				+
			
 
				+    def override_service() -> Generator[RuntimeApplicationService, None, None]:
			
 
				+        db: Session = session_factory()
			
 
				+        try:
			
 
				+            yield RuntimeApplicationService(
			
 
				+                workflow_run_repository=WorkflowRunRepository(db),
			
 
				+                node_run_repository=NodeRunRepository(db),
			
 
				+                execution_log_repository=ExecutionLogRepository(db),
			
 
				+                node_artifact_repository=NodeArtifactRepository(db),
			
 
				+                trace_span_repository=TraceSpanRepository(db),
			
 
				+                execution_dispatcher=build_node_execution_dispatcher(),
			
 
				+                workflow_client=FakeWorkflowClient(),
			
 
				+            )
			
 
				+        finally:
			
 
				+            db.close()
			
 
				+
			
 
				+    app = create_app()
			
 
				+    app.state.session_factory = session_factory
			
 
				+    app.dependency_overrides[get_runtime_application_service] = override_service
			
 
				+    client = build_fastapi_test_client(app)
			
 
				+
			
 
				+    create_response = client.post(
			
 
				+        "/runtime/runs",
			
 
				+        json={
			
 
				+            "tenant_id": "t1",
			
 
				+            "app_id": "app1",
			
 
				+            "app_version_id": "av1",
			
 
				+            "workflow_id": "wf1",
			
 
				+            "workflow_version_id": "wv1",
			
 
				+        },
			
 
				+    )
			
 
				+    assert create_response.status_code == 200
			
 
				+    run_id = create_response.json()["run"]["id"]
			
 
				+
			
 
				+    pause_response = client.post(
			
 
				+        f"/runtime/runs/{run_id}/debug/pause",
			
 
				+        params={"tenant_id": "t1"},
			
 
				+    )
			
 
				+    assert pause_response.status_code == 200
			
 
				+    assert pause_response.json()["run"]["status"] == "paused"
			
 
				+    assert pause_response.json()["queued_node_ids"] == ["start"]
			
 
				+
			
 
				+    protected_execute_response = client.post(
			
 
				+        f"/runtime/runs/{run_id}/execute-next",
			
 
				+        params={"tenant_id": "t1"},
			
 
				+        json={"worker_key": "debugger"},
			
 
				+    )
			
 
				+    assert protected_execute_response.status_code == 200
			
 
				+    assert protected_execute_response.json()["executor_name"] == "debug_paused"
			
 
				+    assert protected_execute_response.json()["node_run"]["status"] == "queued"
			
 
				+
			
 
				+    step_response = client.post(
			
 
				+        f"/runtime/runs/{run_id}/debug/step",
			
 
				+        params={"tenant_id": "t1"},
			
 
				+        json={"worker_key": "debugger"},
			
 
				+    )
			
 
				+    assert step_response.status_code == 200
			
 
				+    step_payload = step_response.json()
			
 
				+    assert step_payload["reason"] == "step_completed"
			
 
				+    assert [item["node_id"] for item in step_payload["executed_node_runs"]] == ["start"]
			
 
				+    assert step_payload["snapshot"]["run"]["status"] == "paused"
			
 
				+    assert step_payload["snapshot"]["completed_node_ids"] == ["start"]
			
 
				+    assert step_payload["snapshot"]["queued_node_ids"] == ["answer"]
			
 
				+
			
 
				+    breakpoint_response = client.post(
			
 
				+        f"/runtime/runs/{run_id}/debug/continue",
			
 
				+        params={"tenant_id": "t1"},
			
 
				+        json={"worker_key": "debugger", "breakpoint_node_ids": ["answer"], "max_steps": 5},
			
 
				+    )
			
 
				+    assert breakpoint_response.status_code == 200
			
 
				+    breakpoint_payload = breakpoint_response.json()
			
 
				+    assert breakpoint_payload["reason"] == "breakpoint_hit"
			
 
				+    assert breakpoint_payload["paused_before_node_id"] == "answer"
			
 
				+    assert breakpoint_payload["executed_node_runs"] == []
			
 
				+    assert breakpoint_payload["snapshot"]["queued_node_ids"] == ["answer"]
			
 
				+
			
 
				+    finish_response = client.post(
			
 
				+        f"/runtime/runs/{run_id}/debug/continue",
			
 
				+        params={"tenant_id": "t1"},
			
 
				+        json={"worker_key": "debugger", "max_steps": 5},
			
 
				+    )
			
 
				+    assert finish_response.status_code == 200
			
 
				+    finish_payload = finish_response.json()
			
 
				+    assert finish_payload["snapshot"]["run"]["status"] == "completed"
			
 
				+    assert finish_payload["snapshot"]["completed_node_ids"] == ["start", "answer"]
			
 
				+    assert finish_payload["snapshot"]["queued_node_ids"] == []
			
 
				+
			
 
				+    engine.dispose()