Browse Source

feat: add production infra and react loop

Jax Docker 1 month ago
parent
commit
a6528d715b

+ 26 - 1
README.md

@@ -258,6 +258,22 @@ Invoke-RestMethod -Method Post `
   -Body '{"tenant_id":"t1","agent_id":"agent-id","status":"published","role":"sales_assistant","goal":"Help qualify leads","system_prompt":"You are a careful sales assistant."}'
 ```
 
+Enable multi-step ReAct planning for an agent version:
+
+```json
+{
+  "model_config": {
+    "react_enabled": true,
+    "react_max_steps": 5
+  }
+}
+```
+
+When ReAct is enabled, the model can emit JSON tool actions such as
+`{"action":"tool","tool_code":"lookup_order","input_json":{"order_id":"123"}}`
+and then finish with `{"action":"finish","answer":"..."}`. Each tool call is
+persisted in `agent_tool_invocation`.
+
 Create an agent run. If `agent_version_id` is omitted, the latest published version is used:
 
 ```powershell
@@ -1206,6 +1222,13 @@ Start in detached mode:
 docker compose -f .\deployments\docker\docker-compose.yml up --build -d
 ```
 
+Production-like infrastructure:
+
+- Compose now starts `postgres` with the `pgvector` image and runs `CREATE EXTENSION IF NOT EXISTS vector`.
+- Compose now starts durable `redis` with append-only persistence.
+- Copy `deployments/docker/.env.example` to `.env` to use per-service PostgreSQL databases such as `workflow_service`, `agent_service`, and `knowledge_service`.
+- Set `AGENT_PLATFORM_REDIS_URL=redis://redis:6379/0` to enable shared Redis-backed locks, idempotency keys, and queues.
+
 Scale runtime workers:
 
 ```powershell
@@ -1238,7 +1261,9 @@ docker compose -f .\deployments\docker\docker-compose.yml down
 
 Important notes:
 
-- `workflow-service`, `session-service`, `runtime-service`, `tool-service`, and `api-gateway` use SQLite files mounted under `/data`
+- Services still fall back to SQLite files under `/data` if `AGENT_PLATFORM_DATABASE_URL` is not set.
+- For scaled workers, use PostgreSQL plus Redis rather than SQLite.
+- `core-shared.redis_primitives` provides `DistributedLock`, `IdempotencyStore`, and `RedisQueue` for services that need cross-process coordination.
 - `agent-service` stores agent definitions, prompt/config versions, and agent run records under `/data`
 - `memory-service` stores scoped memories under `/data`; move it to PostgreSQL before enabling high-volume memory writes
 - `team-service` stores multi-agent team definitions, team versions, and team run records under `/data`

+ 18 - 0
deployments/docker/.env.example

@@ -1,6 +1,24 @@
 AGENT_PLATFORM_PROVIDER_BASE_URL=https://api.openai.com/v1
 AGENT_PLATFORM_PROVIDER_API_KEY=replace-me
 AGENT_PLATFORM_DEFAULT_MODEL=gpt-4o-mini
+AGENT_PLATFORM_POSTGRES_USER=agent_platform
+AGENT_PLATFORM_POSTGRES_PASSWORD=agent_platform
+AGENT_PLATFORM_POSTGRES_DB=agent_platform
+AGENT_PLATFORM_WORKFLOW_DATABASE_URL=postgresql+psycopg://agent_platform:agent_platform@postgres:5432/workflow_service
+AGENT_PLATFORM_SESSION_DATABASE_URL=postgresql+psycopg://agent_platform:agent_platform@postgres:5432/session_service
+AGENT_PLATFORM_RUNTIME_DATABASE_URL=postgresql+psycopg://agent_platform:agent_platform@postgres:5432/runtime_service
+AGENT_PLATFORM_TOOL_DATABASE_URL=postgresql+psycopg://agent_platform:agent_platform@postgres:5432/tool_service
+AGENT_PLATFORM_AGENT_DATABASE_URL=postgresql+psycopg://agent_platform:agent_platform@postgres:5432/agent_service
+AGENT_PLATFORM_MEMORY_DATABASE_URL=postgresql+psycopg://agent_platform:agent_platform@postgres:5432/memory_service
+AGENT_PLATFORM_TEAM_DATABASE_URL=postgresql+psycopg://agent_platform:agent_platform@postgres:5432/team_service
+AGENT_PLATFORM_SKILL_DATABASE_URL=postgresql+psycopg://agent_platform:agent_platform@postgres:5432/skill_service
+AGENT_PLATFORM_HUMAN_DATABASE_URL=postgresql+psycopg://agent_platform:agent_platform@postgres:5432/human_service
+AGENT_PLATFORM_KNOWLEDGE_DATABASE_URL=postgresql+psycopg://agent_platform:agent_platform@postgres:5432/knowledge_service
+AGENT_PLATFORM_EVENT_DATABASE_URL=postgresql+psycopg://agent_platform:agent_platform@postgres:5432/event_service
+AGENT_PLATFORM_AUTH_DATABASE_URL=postgresql+psycopg://agent_platform:agent_platform@postgres:5432/auth_service
+AGENT_PLATFORM_SCHEDULER_DATABASE_URL=postgresql+psycopg://agent_platform:agent_platform@postgres:5432/scheduler_service
+AGENT_PLATFORM_API_GATEWAY_DATABASE_URL=postgresql+psycopg://agent_platform:agent_platform@postgres:5432/api_gateway
+AGENT_PLATFORM_REDIS_URL=redis://redis:6379/0
 AGENT_PLATFORM_EMBEDDING_PROVIDER=local
 AGENT_PLATFORM_EMBEDDING_BASE_URL=
 AGENT_PLATFORM_EMBEDDING_API_KEY=

+ 70 - 18
deployments/docker/docker-compose.yml

@@ -1,4 +1,36 @@
 services:
+  postgres:
+    image: pgvector/pgvector:pg16
+    container_name: agent-platform-postgres
+    environment:
+      POSTGRES_USER: ${AGENT_PLATFORM_POSTGRES_USER:-agent_platform}
+      POSTGRES_PASSWORD: ${AGENT_PLATFORM_POSTGRES_PASSWORD:-agent_platform}
+      POSTGRES_DB: ${AGENT_PLATFORM_POSTGRES_DB:-agent_platform}
+    ports:
+      - "5432:5432"
+    volumes:
+      - postgres_data:/var/lib/postgresql/data
+      - ./postgres-init:/docker-entrypoint-initdb.d:ro
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U ${AGENT_PLATFORM_POSTGRES_USER:-agent_platform} -d ${AGENT_PLATFORM_POSTGRES_DB:-agent_platform}"]
+      interval: 10s
+      timeout: 5s
+      retries: 10
+
+  redis:
+    image: redis:7-alpine
+    container_name: agent-platform-redis
+    command: ["redis-server", "--appendonly", "yes"]
+    ports:
+      - "6379:6379"
+    volumes:
+      - redis_data:/data
+    healthcheck:
+      test: ["CMD", "redis-cli", "ping"]
+      interval: 10s
+      timeout: 5s
+      retries: 10
+
   workflow-service:
     build:
       context: ../..
@@ -8,7 +40,8 @@ services:
     container_name: agent-platform-workflow-service
     command: ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8002"]
     environment:
-      AGENT_PLATFORM_DATABASE_URL: sqlite:////data/workflow_service.db
+      AGENT_PLATFORM_DATABASE_URL: ${AGENT_PLATFORM_WORKFLOW_DATABASE_URL:-sqlite:////data/workflow_service.db}
+      AGENT_PLATFORM_REDIS_URL: ${AGENT_PLATFORM_REDIS_URL:-redis://redis:6379/0}
     ports:
       - "8002:8002"
     volumes:
@@ -28,7 +61,8 @@ services:
     container_name: agent-platform-session-service
     command: ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8001"]
     environment:
-      AGENT_PLATFORM_DATABASE_URL: sqlite:////data/session_service.db
+      AGENT_PLATFORM_DATABASE_URL: ${AGENT_PLATFORM_SESSION_DATABASE_URL:-sqlite:////data/session_service.db}
+      AGENT_PLATFORM_REDIS_URL: ${AGENT_PLATFORM_REDIS_URL:-redis://redis:6379/0}
       AGENT_PLATFORM_RUNTIME_SERVICE_URL: http://runtime-service:8003
     ports:
       - "8001:8001"
@@ -52,7 +86,8 @@ services:
     container_name: agent-platform-tool-service
     command: ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8004"]
     environment:
-      AGENT_PLATFORM_DATABASE_URL: sqlite:////data/tool_service.db
+      AGENT_PLATFORM_DATABASE_URL: ${AGENT_PLATFORM_TOOL_DATABASE_URL:-sqlite:////data/tool_service.db}
+      AGENT_PLATFORM_REDIS_URL: ${AGENT_PLATFORM_REDIS_URL:-redis://redis:6379/0}
     ports:
       - "8004:8004"
     volumes:
@@ -111,7 +146,8 @@ services:
     container_name: agent-platform-agent-service
     command: ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8007"]
     environment:
-      AGENT_PLATFORM_DATABASE_URL: sqlite:////data/agent_service.db
+      AGENT_PLATFORM_DATABASE_URL: ${AGENT_PLATFORM_AGENT_DATABASE_URL:-sqlite:////data/agent_service.db}
+      AGENT_PLATFORM_REDIS_URL: ${AGENT_PLATFORM_REDIS_URL:-redis://redis:6379/0}
       AGENT_PLATFORM_MODEL_GATEWAY_SERVICE_URL: http://model-gateway-service:8005
       AGENT_PLATFORM_MEMORY_SERVICE_URL: http://memory-service:8008
       AGENT_PLATFORM_TOOL_SERVICE_URL: http://tool-service:8004
@@ -146,7 +182,8 @@ services:
         SERVICE_PATH: services/agent-service
     command: ["python", "-m", "app.worker"]
     environment:
-      AGENT_PLATFORM_DATABASE_URL: sqlite:////data/agent_service.db
+      AGENT_PLATFORM_DATABASE_URL: ${AGENT_PLATFORM_AGENT_DATABASE_URL:-sqlite:////data/agent_service.db}
+      AGENT_PLATFORM_REDIS_URL: ${AGENT_PLATFORM_REDIS_URL:-redis://redis:6379/0}
       AGENT_PLATFORM_MODEL_GATEWAY_SERVICE_URL: http://model-gateway-service:8005
       AGENT_PLATFORM_MEMORY_SERVICE_URL: http://memory-service:8008
       AGENT_PLATFORM_TOOL_SERVICE_URL: http://tool-service:8004
@@ -178,7 +215,8 @@ services:
     container_name: agent-platform-memory-service
     command: ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8008"]
     environment:
-      AGENT_PLATFORM_DATABASE_URL: sqlite:////data/memory_service.db
+      AGENT_PLATFORM_DATABASE_URL: ${AGENT_PLATFORM_MEMORY_DATABASE_URL:-sqlite:////data/memory_service.db}
+      AGENT_PLATFORM_REDIS_URL: ${AGENT_PLATFORM_REDIS_URL:-redis://redis:6379/0}
     ports:
       - "8008:8008"
     volumes:
@@ -198,7 +236,8 @@ services:
     container_name: agent-platform-team-service
     command: ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8009"]
     environment:
-      AGENT_PLATFORM_DATABASE_URL: sqlite:////data/team_service.db
+      AGENT_PLATFORM_DATABASE_URL: ${AGENT_PLATFORM_TEAM_DATABASE_URL:-sqlite:////data/team_service.db}
+      AGENT_PLATFORM_REDIS_URL: ${AGENT_PLATFORM_REDIS_URL:-redis://redis:6379/0}
       AGENT_PLATFORM_AGENT_SERVICE_URL: http://agent-service:8007
       AGENT_PLATFORM_EVENT_SERVICE_URL: http://event-service:8013
     ports:
@@ -219,7 +258,8 @@ services:
         SERVICE_PATH: services/team-service
     command: ["python", "-m", "app.worker"]
     environment:
-      AGENT_PLATFORM_DATABASE_URL: sqlite:////data/team_service.db
+      AGENT_PLATFORM_DATABASE_URL: ${AGENT_PLATFORM_TEAM_DATABASE_URL:-sqlite:////data/team_service.db}
+      AGENT_PLATFORM_REDIS_URL: ${AGENT_PLATFORM_REDIS_URL:-redis://redis:6379/0}
       AGENT_PLATFORM_AGENT_SERVICE_URL: http://agent-service:8007
       AGENT_PLATFORM_EVENT_SERVICE_URL: http://event-service:8013
       AGENT_PLATFORM_WORKER_POLL_INTERVAL_SECONDS: ${AGENT_PLATFORM_WORKER_POLL_INTERVAL_SECONDS:-1}
@@ -242,7 +282,8 @@ services:
     container_name: agent-platform-skill-service
     command: ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8010"]
     environment:
-      AGENT_PLATFORM_DATABASE_URL: sqlite:////data/skill_service.db
+      AGENT_PLATFORM_DATABASE_URL: ${AGENT_PLATFORM_SKILL_DATABASE_URL:-sqlite:////data/skill_service.db}
+      AGENT_PLATFORM_REDIS_URL: ${AGENT_PLATFORM_REDIS_URL:-redis://redis:6379/0}
     ports:
       - "8010:8010"
     volumes:
@@ -262,7 +303,8 @@ services:
     container_name: agent-platform-human-service
     command: ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8011"]
     environment:
-      AGENT_PLATFORM_DATABASE_URL: sqlite:////data/human_service.db
+      AGENT_PLATFORM_DATABASE_URL: ${AGENT_PLATFORM_HUMAN_DATABASE_URL:-sqlite:////data/human_service.db}
+      AGENT_PLATFORM_REDIS_URL: ${AGENT_PLATFORM_REDIS_URL:-redis://redis:6379/0}
     ports:
       - "8011:8011"
     volumes:
@@ -282,7 +324,8 @@ services:
     container_name: agent-platform-knowledge-service
     command: ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8012"]
     environment:
-      AGENT_PLATFORM_DATABASE_URL: sqlite:////data/knowledge_service.db
+      AGENT_PLATFORM_DATABASE_URL: ${AGENT_PLATFORM_KNOWLEDGE_DATABASE_URL:-sqlite:////data/knowledge_service.db}
+      AGENT_PLATFORM_REDIS_URL: ${AGENT_PLATFORM_REDIS_URL:-redis://redis:6379/0}
       AGENT_PLATFORM_EMBEDDING_PROVIDER: ${AGENT_PLATFORM_EMBEDDING_PROVIDER:-local}
       AGENT_PLATFORM_EMBEDDING_BASE_URL: ${AGENT_PLATFORM_EMBEDDING_BASE_URL:-}
       AGENT_PLATFORM_EMBEDDING_API_KEY: ${AGENT_PLATFORM_EMBEDDING_API_KEY:-}
@@ -306,7 +349,8 @@ services:
     container_name: agent-platform-event-service
     command: ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8013"]
     environment:
-      AGENT_PLATFORM_DATABASE_URL: sqlite:////data/event_service.db
+      AGENT_PLATFORM_DATABASE_URL: ${AGENT_PLATFORM_EVENT_DATABASE_URL:-sqlite:////data/event_service.db}
+      AGENT_PLATFORM_REDIS_URL: ${AGENT_PLATFORM_REDIS_URL:-redis://redis:6379/0}
     ports:
       - "8013:8013"
     volumes:
@@ -326,7 +370,8 @@ services:
     container_name: agent-platform-auth-service
     command: ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8014"]
     environment:
-      AGENT_PLATFORM_DATABASE_URL: sqlite:////data/auth_service.db
+      AGENT_PLATFORM_DATABASE_URL: ${AGENT_PLATFORM_AUTH_DATABASE_URL:-sqlite:////data/auth_service.db}
+      AGENT_PLATFORM_REDIS_URL: ${AGENT_PLATFORM_REDIS_URL:-redis://redis:6379/0}
     ports:
       - "8014:8014"
     volumes:
@@ -346,7 +391,8 @@ services:
     container_name: agent-platform-scheduler-service
     command: ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8015"]
     environment:
-      AGENT_PLATFORM_DATABASE_URL: sqlite:////data/scheduler_service.db
+      AGENT_PLATFORM_DATABASE_URL: ${AGENT_PLATFORM_SCHEDULER_DATABASE_URL:-sqlite:////data/scheduler_service.db}
+      AGENT_PLATFORM_REDIS_URL: ${AGENT_PLATFORM_REDIS_URL:-redis://redis:6379/0}
     ports:
       - "8015:8015"
     volumes:
@@ -365,7 +411,8 @@ services:
         SERVICE_PATH: services/scheduler-service
     command: ["python", "-m", "app.worker"]
     environment:
-      AGENT_PLATFORM_DATABASE_URL: sqlite:////data/scheduler_service.db
+      AGENT_PLATFORM_DATABASE_URL: ${AGENT_PLATFORM_SCHEDULER_DATABASE_URL:-sqlite:////data/scheduler_service.db}
+      AGENT_PLATFORM_REDIS_URL: ${AGENT_PLATFORM_REDIS_URL:-redis://redis:6379/0}
       AGENT_PLATFORM_EVENT_SERVICE_URL: http://event-service:8013
       AGENT_PLATFORM_WORKER_POLL_INTERVAL_SECONDS: ${AGENT_PLATFORM_WORKER_POLL_INTERVAL_SECONDS:-1}
       AGENT_PLATFORM_WORKER_LEASE_SECONDS: ${AGENT_PLATFORM_WORKER_LEASE_SECONDS:-300}
@@ -387,7 +434,8 @@ services:
     container_name: agent-platform-runtime-service
     command: ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8003"]
     environment:
-      AGENT_PLATFORM_DATABASE_URL: sqlite:////data/runtime_service.db
+      AGENT_PLATFORM_DATABASE_URL: ${AGENT_PLATFORM_RUNTIME_DATABASE_URL:-sqlite:////data/runtime_service.db}
+      AGENT_PLATFORM_REDIS_URL: ${AGENT_PLATFORM_REDIS_URL:-redis://redis:6379/0}
       AGENT_PLATFORM_WORKFLOW_SERVICE_URL: http://workflow-service:8002
       AGENT_PLATFORM_TOOL_SERVICE_URL: http://tool-service:8004
       AGENT_PLATFORM_MODEL_GATEWAY_SERVICE_URL: http://model-gateway-service:8005
@@ -443,7 +491,8 @@ services:
         SERVICE_PATH: services/runtime-service
     command: ["python", "-m", "app.worker"]
     environment:
-      AGENT_PLATFORM_DATABASE_URL: sqlite:////data/runtime_service.db
+      AGENT_PLATFORM_DATABASE_URL: ${AGENT_PLATFORM_RUNTIME_DATABASE_URL:-sqlite:////data/runtime_service.db}
+      AGENT_PLATFORM_REDIS_URL: ${AGENT_PLATFORM_REDIS_URL:-redis://redis:6379/0}
       AGENT_PLATFORM_WORKFLOW_SERVICE_URL: http://workflow-service:8002
       AGENT_PLATFORM_TOOL_SERVICE_URL: http://tool-service:8004
       AGENT_PLATFORM_MODEL_GATEWAY_SERVICE_URL: http://model-gateway-service:8005
@@ -480,7 +529,8 @@ services:
     container_name: agent-platform-api-gateway
     command: ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
     environment:
-      AGENT_PLATFORM_DATABASE_URL: sqlite:////data/api_gateway.db
+      AGENT_PLATFORM_DATABASE_URL: ${AGENT_PLATFORM_API_GATEWAY_DATABASE_URL:-sqlite:////data/api_gateway.db}
+      AGENT_PLATFORM_REDIS_URL: ${AGENT_PLATFORM_REDIS_URL:-redis://redis:6379/0}
       AGENT_PLATFORM_WORKFLOW_SERVICE_URL: http://workflow-service:8002
       AGENT_PLATFORM_SESSION_SERVICE_URL: http://session-service:8001
       AGENT_PLATFORM_RUNTIME_SERVICE_URL: http://runtime-service:8003
@@ -540,6 +590,8 @@ services:
       retries: 5
 
 volumes:
+  postgres_data:
+  redis_data:
   api_gateway_data:
   agent_service_data:
   memory_service_data:

+ 1 - 0
deployments/docker/postgres-init/001_pgvector.sql

@@ -0,0 +1 @@
+CREATE EXTENSION IF NOT EXISTS vector;

+ 28 - 0
deployments/docker/postgres-init/002_service_databases.sql

@@ -0,0 +1,28 @@
+SELECT 'CREATE DATABASE workflow_service'
+WHERE NOT EXISTS (SELECT FROM pg_database WHERE datname = 'workflow_service')\gexec
+SELECT 'CREATE DATABASE session_service'
+WHERE NOT EXISTS (SELECT FROM pg_database WHERE datname = 'session_service')\gexec
+SELECT 'CREATE DATABASE runtime_service'
+WHERE NOT EXISTS (SELECT FROM pg_database WHERE datname = 'runtime_service')\gexec
+SELECT 'CREATE DATABASE tool_service'
+WHERE NOT EXISTS (SELECT FROM pg_database WHERE datname = 'tool_service')\gexec
+SELECT 'CREATE DATABASE agent_service'
+WHERE NOT EXISTS (SELECT FROM pg_database WHERE datname = 'agent_service')\gexec
+SELECT 'CREATE DATABASE memory_service'
+WHERE NOT EXISTS (SELECT FROM pg_database WHERE datname = 'memory_service')\gexec
+SELECT 'CREATE DATABASE team_service'
+WHERE NOT EXISTS (SELECT FROM pg_database WHERE datname = 'team_service')\gexec
+SELECT 'CREATE DATABASE skill_service'
+WHERE NOT EXISTS (SELECT FROM pg_database WHERE datname = 'skill_service')\gexec
+SELECT 'CREATE DATABASE human_service'
+WHERE NOT EXISTS (SELECT FROM pg_database WHERE datname = 'human_service')\gexec
+SELECT 'CREATE DATABASE knowledge_service'
+WHERE NOT EXISTS (SELECT FROM pg_database WHERE datname = 'knowledge_service')\gexec
+SELECT 'CREATE DATABASE event_service'
+WHERE NOT EXISTS (SELECT FROM pg_database WHERE datname = 'event_service')\gexec
+SELECT 'CREATE DATABASE auth_service'
+WHERE NOT EXISTS (SELECT FROM pg_database WHERE datname = 'auth_service')\gexec
+SELECT 'CREATE DATABASE scheduler_service'
+WHERE NOT EXISTS (SELECT FROM pg_database WHERE datname = 'scheduler_service')\gexec
+SELECT 'CREATE DATABASE api_gateway'
+WHERE NOT EXISTS (SELECT FROM pg_database WHERE datname = 'api_gateway')\gexec

+ 4 - 0
deployments/docker/postgres-init/003_service_extensions.sql

@@ -0,0 +1,4 @@
+\connect knowledge_service
+CREATE EXTENSION IF NOT EXISTS vector;
+\connect memory_service
+CREATE EXTENSION IF NOT EXISTS vector;

+ 1 - 0
libs/core-db/pyproject.toml

@@ -10,6 +10,7 @@ requires-python = ">=3.11"
 dependencies = [
   "sqlalchemy>=2.0,<3.0",
   "pydantic>=2.7,<3.0",
+  "psycopg[binary]>=3.1,<4.0",
 ]
 
 [tool.setuptools]

+ 7 - 1
libs/core-db/src/core_db/__init__.py

@@ -1,6 +1,11 @@
 from .base import Base
 from .mixins import AuditMixin, TenantMixin, VersionMixin
-from .session import DatabaseSettings, create_engine_from_settings, create_session_factory
+from .session import (
+    DatabaseSettings,
+    create_engine_from_settings,
+    create_session_factory,
+    transaction_scope,
+)
 
 __all__ = [
     "AuditMixin",
@@ -10,4 +15,5 @@ __all__ = [
     "VersionMixin",
     "create_engine_from_settings",
     "create_session_factory",
+    "transaction_scope",
 ]

+ 15 - 0
libs/core-db/src/core_db/session.py

@@ -1,4 +1,5 @@
 from collections.abc import Generator
+from contextlib import contextmanager
 
 from pydantic import BaseModel, Field
 from sqlalchemy import Engine, create_engine
@@ -41,3 +42,17 @@ def session_scope(session_factory: sessionmaker[Session]) -> Generator[Session,
     finally:
         session.close()
 
+
+@contextmanager
+def transaction_scope(
+    session_factory: sessionmaker[Session],
+) -> Generator[Session, None, None]:
+    session = session_factory()
+    try:
+        yield session
+        session.commit()
+    except Exception:
+        session.rollback()
+        raise
+    finally:
+        session.close()

+ 2 - 0
libs/core-domain/src/core_domain/agent_contracts.py

@@ -16,6 +16,8 @@ class AgentModelConfigContract(BaseModel):
     model: str | None = None
     temperature: float | None = None
     max_tokens: int | None = None
+    react_enabled: bool = False
+    react_max_steps: int | None = None
     extra_json: dict[str, JSONValue] = Field(default_factory=dict)
 
 

+ 1 - 1
libs/core-shared/pyproject.toml

@@ -10,6 +10,7 @@ requires-python = ">=3.11"
 dependencies = [
   "pydantic>=2.7,<3.0",
   "pydantic-settings>=2.2,<3.0",
+  "redis>=5.0,<6.0",
 ]
 
 [tool.setuptools]
@@ -17,4 +18,3 @@ package-dir = {"" = "src"}
 
 [tool.setuptools.packages.find]
 where = ["src"]
-

+ 5 - 1
libs/core-shared/src/core_shared/__init__.py

@@ -1,4 +1,8 @@
 from .config import ServiceSettings
 from .types import JSONPrimitive, JSONValue
 
-__all__ = ["JSONPrimitive", "JSONValue", "ServiceSettings"]
+__all__ = [
+    "JSONPrimitive",
+    "JSONValue",
+    "ServiceSettings",
+]

+ 129 - 0
libs/core-shared/src/core_shared/redis_primitives.py

@@ -0,0 +1,129 @@
+import json
+import time
+from dataclasses import dataclass
+from uuid import uuid4
+
+from redis import Redis
+
+from core_shared.types import JSONValue
+
+
+@dataclass(frozen=True)
+class RedisConnectionSettings:
+    redis_url: str = "redis://127.0.0.1:6379/0"
+
+
+class DistributedLock:
+    def __init__(
+        self,
+        *,
+        client: Redis,
+        name: str,
+        ttl_seconds: int = 30,
+    ) -> None:
+        self.client = client
+        self.name = name
+        self.ttl_seconds = ttl_seconds
+        self.token = uuid4().hex
+
+    def acquire(self) -> bool:
+        return bool(
+            self.client.set(
+                self.name,
+                self.token,
+                nx=True,
+                ex=self.ttl_seconds,
+            )
+        )
+
+    def release(self) -> bool:
+        script = """
+        if redis.call("get", KEYS[1]) == ARGV[1] then
+            return redis.call("del", KEYS[1])
+        end
+        return 0
+        """
+        return bool(self.client.eval(script, 1, self.name, self.token))
+
+    def __enter__(self) -> "DistributedLock":
+        if not self.acquire():
+            raise RuntimeError(f"failed to acquire distributed lock: {self.name}")
+        return self
+
+    def __exit__(self, exc_type: object, exc: object, traceback: object) -> None:
+        self.release()
+
+
+class IdempotencyStore:
+    def __init__(self, *, client: Redis, prefix: str = "idempotency") -> None:
+        self.client = client
+        self.prefix = prefix
+
+    def begin(self, *, key: str, ttl_seconds: int = 86400) -> bool:
+        return bool(self.client.set(self._key(key), "in_progress", nx=True, ex=ttl_seconds))
+
+    def complete(
+        self,
+        *,
+        key: str,
+        result: dict[str, JSONValue],
+        ttl_seconds: int = 86400,
+    ) -> None:
+        self.client.set(
+            self._key(key),
+            json.dumps({"status": "completed", "result": result}, ensure_ascii=False),
+            ex=ttl_seconds,
+        )
+
+    def get_result(self, *, key: str) -> dict[str, JSONValue] | None:
+        value = self.client.get(self._key(key))
+        if not isinstance(value, (bytes, str)):
+            return None
+        decoded = value.decode("utf-8") if isinstance(value, bytes) else value
+        try:
+            payload = json.loads(decoded)
+        except json.JSONDecodeError:
+            return None
+        if not isinstance(payload, dict):
+            return None
+        result = payload.get("result")
+        if isinstance(result, dict):
+            return {str(item_key): item_value for item_key, item_value in result.items()}
+        return None
+
+    def _key(self, key: str) -> str:
+        return f"{self.prefix}:{key}"
+
+
+class RedisQueue:
+    def __init__(self, *, client: Redis, name: str) -> None:
+        self.client = client
+        self.name = name
+
+    def enqueue(self, payload: dict[str, JSONValue]) -> None:
+        self.client.rpush(self.name, json.dumps(payload, ensure_ascii=False))
+
+    def dequeue(self, *, timeout_seconds: int = 1) -> dict[str, JSONValue] | None:
+        result = self.client.blpop([self.name], timeout=timeout_seconds)
+        if result is None:
+            return None
+        _, raw_value = result
+        decoded = raw_value.decode("utf-8") if isinstance(raw_value, bytes) else raw_value
+        payload = json.loads(decoded)
+        if not isinstance(payload, dict):
+            return None
+        return {str(item_key): item_value for item_key, item_value in payload.items()}
+
+
+def build_redis_client(settings: RedisConnectionSettings) -> Redis:
+    return Redis.from_url(settings.redis_url, decode_responses=False)
+
+
+def wait_for_redis(client: Redis, *, timeout_seconds: int = 30) -> bool:
+    deadline = time.monotonic() + timeout_seconds
+    while time.monotonic() < deadline:
+        try:
+            return bool(client.ping())
+        except Exception:
+            time.sleep(0.5)
+    return False

+ 227 - 0
services/agent-service/app/application/services.py

@@ -1,3 +1,4 @@
+import json
 from datetime import datetime, timedelta
 from typing import cast
 
@@ -51,6 +52,7 @@ class AgentApplicationService:
         tool_client: ToolServiceClient | None = None,
         skill_client: SkillServiceClient | None = None,
         event_client: EventServiceClient | None = None,
+        react_max_steps: int = 5,
     ) -> None:
         self.agent_repository = agent_repository
         self.agent_version_repository = agent_version_repository
@@ -61,6 +63,7 @@ class AgentApplicationService:
         self.tool_client = tool_client
         self.skill_client = skill_client
         self.event_client = event_client
+        self.react_max_steps = react_max_steps
 
     def create_agent(self, payload: AgentCreateRequest) -> AgentDefinition:
         return self.agent_repository.create(
@@ -264,6 +267,17 @@ class AgentApplicationService:
                 )
             return completed_run
 
+        if self._read_bool(agent_version.model_config_json, "react_enabled", default=False):
+            return self._execute_react_agent_run(
+                agent_run=agent_run,
+                agent_version=agent_version,
+                payload=payload,
+                memory_results=memory_results,
+                memory_metadata=memory_metadata,
+                selected_tools=selected_tools,
+                selected_skills=selected_skills,
+            )
+
         tool_invocations = self._invoke_selected_tools(
             agent_run=agent_run,
             agent_version=agent_version,
@@ -391,6 +405,154 @@ class AgentApplicationService:
         except EventServiceClientError:
             return
 
+    def _execute_react_agent_run(
+        self,
+        *,
+        agent_run: AgentRun,
+        agent_version: AgentVersion,
+        payload: AgentRunExecuteRequest,
+        memory_results: list[MemorySearchResultContract],
+        memory_metadata: dict[str, JSONValue],
+        selected_tools: list[AgentToolRefContract],
+        selected_skills: list[AgentSkillRefContract],
+    ) -> AgentRun | None:
+        if self.model_gateway_client is None:
+            return self.agent_run_repository.update_status(
+                agent_run_id=agent_run.id,
+                status="failed",
+                worker_key=payload.worker_key,
+                error_code="model_gateway_missing",
+                error_message="model gateway client is not configured",
+            )
+
+        skill_invocations = self._invoke_selected_skills(
+            agent_run=agent_run,
+            selected_skills=selected_skills,
+            worker_key=payload.worker_key,
+        )
+        messages = self._build_chat_messages(
+            agent_run=agent_run,
+            agent_version=agent_version,
+            memory_results=memory_results,
+            capability_context=self._format_react_instruction(
+                selected_tools=selected_tools,
+                skill_invocations=skill_invocations,
+            ),
+        )
+        react_steps: list[dict[str, JSONValue]] = []
+        tool_invocations: list[dict[str, JSONValue]] = []
+        final_answer: str | None = None
+
+        max_steps = self._read_int(
+            agent_version.model_config_json,
+            "react_max_steps",
+            default=self.react_max_steps,
+        )
+        for step_index in range(max(max_steps, 1)):
+            try:
+                response = self.model_gateway_client.create_chat_completion(
+                    self._build_chat_completion_request(
+                        agent_run=agent_run,
+                        agent_version=agent_version,
+                        messages=messages,
+                    )
+                )
+            except ModelGatewayClientError as exc:
+                return self.agent_run_repository.update_status(
+                    agent_run_id=agent_run.id,
+                    status="failed",
+                    worker_key=payload.worker_key,
+                    error_code="model_gateway_error",
+                    error_message=str(exc),
+                    output_json={
+                        "react_steps": react_steps,
+                        "tool_invocations": tool_invocations,
+                        "skill_invocations": skill_invocations,
+                        **memory_metadata,
+                    },
+                )
+
+            action = self._parse_react_action(response.content)
+            react_step: dict[str, JSONValue] = {
+                "step_index": step_index,
+                "model_content": response.content,
+                "action": action,
+            }
+            react_steps.append(react_step)
+            if action.get("action") == "finish":
+                answer_value = action.get("answer")
+                final_answer = answer_value if isinstance(answer_value, str) else response.content
+                break
+
+            if action.get("action") != "tool":
+                final_answer = response.content
+                break
+
+            tool_code = action.get("tool_code")
+            matching_tools = [
+                item for item in selected_tools if item.tool_code == tool_code
+            ]
+            if not matching_tools:
+                observation = f"tool not available: {tool_code}"
+                react_step["observation"] = observation
+                messages.append(ChatMessageContract(role="assistant", content=response.content))
+                messages.append(ChatMessageContract(role="user", content=observation))
+                continue
+
+            tool_input = action.get("input_json")
+            original_input_json = agent_run.input_json
+            if isinstance(tool_input, dict):
+                agent_run.input_json = {
+                    str(item_key): item_value for item_key, item_value in tool_input.items()
+                }
+            current_invocations = self._invoke_selected_tools(
+                agent_run=agent_run,
+                agent_version=agent_version,
+                selected_tools=matching_tools[:1],
+            )
+            agent_run.input_json = original_input_json
+            tool_invocations.extend(current_invocations)
+            observation = self._format_react_observation(current_invocations)
+            react_step["observation"] = observation
+            messages.append(ChatMessageContract(role="assistant", content=response.content))
+            messages.append(ChatMessageContract(role="user", content=observation))
+
+        if final_answer is None:
+            final_answer = "ReAct loop reached max steps without a final answer."
+
+        memory_write_metadata = self._write_interaction_memory(
+            agent_run=agent_run,
+            agent_version=agent_version,
+            output_text=final_answer,
+        )
+        completed_run = self.agent_run_repository.update_status(
+            agent_run_id=agent_run.id,
+            status="completed",
+            worker_key=payload.worker_key,
+            output_text=final_answer,
+            output_json={
+                "dry_run": False,
+                "agent_version_id": agent_version.id,
+                "react_enabled": True,
+                "react_steps": react_steps,
+                "tool_invocations": tool_invocations,
+                "skill_invocations": skill_invocations,
+                **memory_metadata,
+                **memory_write_metadata,
+            },
+        )
+        if completed_run is not None:
+            self._publish_event(
+                event_type="agent.run.completed",
+                agent_run=completed_run,
+                payload_json={
+                    "agent_run_id": completed_run.id,
+                    "react_enabled": True,
+                    "status": completed_run.status,
+                },
+            )
+        return completed_run
+
     def execute_next_claimed_agent_run(
         self,
         *,
@@ -735,6 +897,70 @@ class AgentApplicationService:
             f"Skills: {skill_invocations}"
         )
 
+    def _format_react_instruction(
+        self,
+        *,
+        selected_tools: list[AgentToolRefContract],
+        skill_invocations: list[dict[str, JSONValue]],
+    ) -> str:
+        return (
+            "Use ReAct JSON only. Respond with one JSON object per turn.\n"
+            "To call a tool: "
+            '{"action":"tool","tool_code":"code","input_json":{...}}\n'
+            "To finish: "
+            '{"action":"finish","answer":"final answer"}\n'
+            f"Available tools: {[item.model_dump(mode='json') for item in selected_tools]}\n"
+            f"Pre-run skill results: {skill_invocations}"
+        )
+
+    def _format_react_observation(
+        self,
+        tool_invocations: list[dict[str, JSONValue]],
+    ) -> str:
+        return f"Observation: {tool_invocations}"
+
+    def _parse_react_action(self, content: str) -> dict[str, JSONValue]:
+        try:
+            value = json.loads(content)
+        except json.JSONDecodeError:
+            start_index = content.find("{")
+            end_index = content.rfind("}")
+            if start_index < 0 or end_index <= start_index:
+                return {"action": "finish", "answer": content}
+            try:
+                value = json.loads(content[start_index : end_index + 1])
+            except json.JSONDecodeError:
+                return {"action": "finish", "answer": content}
+        if not isinstance(value, dict):
+            return {"action": "finish", "answer": content}
+        return {str(item_key): item_value for item_key, item_value in value.items()}
+
+    def _build_chat_completion_request(
+        self,
+        *,
+        agent_run: AgentRun,
+        agent_version: AgentVersion,
+        messages: list[ChatMessageContract],
+    ) -> ChatCompletionRequestContract:
+        return ChatCompletionRequestContract(
+            model=self._read_optional_string(agent_version.model_config_json, "model"),
+            temperature=self._read_optional_float(
+                agent_version.model_config_json,
+                "temperature",
+            ),
+            max_tokens=self._read_optional_int(
+                agent_version.model_config_json,
+                "max_tokens",
+            ),
+            messages=messages,
+            metadata_json={
+                "tenant_id": agent_run.tenant_id,
+                "agent_id": agent_run.agent_id,
+                "agent_version_id": agent_version.id,
+                "agent_run_id": agent_run.id,
+            },
+        )
+
     def _build_skill_input_json(
         self,
         *,
@@ -1025,4 +1251,5 @@ def build_agent_application_service(
             base_url=settings.event_service_url,
             timeout_seconds=settings.event_service_timeout_seconds,
         ),
+        react_max_steps=settings.react_max_steps,
     )

+ 1 - 0
services/agent-service/app/bootstrap/settings.py

@@ -19,3 +19,4 @@ class AgentServiceSettings(ServiceSettings):
     worker_lease_seconds: int = 300
     worker_max_idle_cycles: int | None = None
     worker_dry_run: bool = False
+    react_max_steps: int = 5