Bläddra i källkod

feat: add service observability metrics

Jax Docker 1 månad sedan
förälder
incheckning
85111423db

+ 46 - 0
deployments/docker/docker-compose.yml

@@ -31,6 +31,51 @@ services:
       timeout: 5s
       retries: 10
 
+  prometheus:  # Prometheus server for the platform; scrape targets live in ./prometheus.yml
+    image: prom/prometheus:v2.54.1
+    container_name: agent-platform-prometheus
+    command:
+      - "--config.file=/etc/prometheus/prometheus.yml"  # path of the read-only bind mount under volumes
+      - "--storage.tsdb.path=/prometheus"  # path of the prometheus_data volume mount under volumes
+    ports:
+      - "9090:9090"
+    volumes:
+      - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro
+      - prometheus_data:/prometheus
+    depends_on:  # service_started orders container startup only; it does not wait for a healthcheck
+      api-gateway:
+        condition: service_started
+      session-service:
+        condition: service_started
+      workflow-service:
+        condition: service_started
+      runtime-service:
+        condition: service_started
+      tool-service:
+        condition: service_started
+      model-gateway-service:
+        condition: service_started
+      code-runner-service:
+        condition: service_started
+      agent-service:
+        condition: service_started
+      memory-service:
+        condition: service_started
+      team-service:
+        condition: service_started
+      skill-service:
+        condition: service_started
+      human-service:
+        condition: service_started
+      knowledge-service:
+        condition: service_started
+      event-service:
+        condition: service_started
+      auth-service:
+        condition: service_started
+      scheduler-service:
+        condition: service_started
+
   workflow-service:
     build:
       context: ../..
@@ -592,6 +637,7 @@ services:
 volumes:
   postgres_data:
   redis_data:
+  prometheus_data:
   api_gateway_data:
   agent_service_data:
   memory_service_data:

+ 25 - 0
deployments/docker/prometheus.yml

@@ -0,0 +1,25 @@
+global:
+  scrape_interval: 15s  # default interval between scrapes of each target
+  evaluation_interval: 15s  # rule evaluation interval; this file configures no rule files
+
+scrape_configs:
+  - job_name: agent-platform-services  # one job covering every service listed below
+    metrics_path: /metrics
+    static_configs:
+      - targets:  # host:port pairs; hosts presumably resolve as compose service names, ports as each service's listen port (verify against docker-compose.yml)
+          - api-gateway:8000
+          - session-service:8001
+          - workflow-service:8002
+          - runtime-service:8003
+          - tool-service:8004
+          - model-gateway-service:8005
+          - code-runner-service:8006
+          - agent-service:8007
+          - memory-service:8008
+          - team-service:8009
+          - skill-service:8010
+          - human-service:8011
+          - knowledge-service:8012
+          - event-service:8013
+          - auth-service:8014
+          - scheduler-service:8015

+ 1 - 0
libs/core-shared/pyproject.toml

@@ -11,6 +11,7 @@ dependencies = [
   "pydantic>=2.7,<3.0",
   "pydantic-settings>=2.2,<3.0",
   "redis>=5.0,<6.0",
+  "starlette>=0.37,<1.0",
 ]
 
 [tool.setuptools]

+ 203 - 0
libs/core-shared/src/core_shared/observability.py

@@ -0,0 +1,203 @@
+from __future__ import annotations
+
+import json
+import logging
+from collections import defaultdict
+from dataclasses import dataclass
+from time import perf_counter
+from threading import Lock
+from typing import Any, Awaitable, Callable, Iterable, Protocol
+
+from starlette.datastructures import Headers
+from starlette.middleware.base import BaseHTTPMiddleware
+from starlette.requests import Request
+from starlette.responses import PlainTextResponse, Response
+from starlette.types import ASGIApp
+
+
+_METRICS_CONTENT_TYPE = "text/plain; version=0.0.4; charset=utf-8"
+_DURATION_BUCKETS = (0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0)
+RouteDecorator = Callable[[Callable[..., Awaitable[Response]]], Callable[..., Awaitable[Response]]]
+
+
+class ObservableApp(Protocol):
+    state: Any
+
+    def add_middleware(self, middleware_class: type[BaseHTTPMiddleware], **options: Any) -> None: ...
+
+    def get(
+        self,
+        path: str,
+        **options: Any,
+    ) -> RouteDecorator: ...
+
+
+@dataclass(frozen=True, slots=True)
+class HttpMetricLabels:
+    service: str
+    method: str
+    path: str
+    status_code: str
+
+    def as_pairs(self) -> tuple[tuple[str, str], ...]:
+        return (
+            ("service", self.service),
+            ("method", self.method),
+            ("path", self.path),
+            ("status_code", self.status_code),
+        )
+
+
+class MetricsRegistry:
+    """Small in-process Prometheus registry for HTTP service telemetry."""
+
+    def __init__(self, service_name: str) -> None:
+        self._service_name = service_name
+        self._lock = Lock()
+        self._request_counts: defaultdict[HttpMetricLabels, int] = defaultdict(int)
+        self._duration_sums: defaultdict[HttpMetricLabels, float] = defaultdict(float)
+        self._duration_buckets: defaultdict[tuple[HttpMetricLabels, str], int] = defaultdict(int)
+
+    def observe_http_request(self, labels: HttpMetricLabels, duration_seconds: float) -> None:
+        with self._lock:
+            self._request_counts[labels] += 1
+            self._duration_sums[labels] += duration_seconds
+            for bucket in _DURATION_BUCKETS:
+                if duration_seconds <= bucket:
+                    self._duration_buckets[(labels, _format_bucket(bucket))] += 1
+            self._duration_buckets[(labels, "+Inf")] += 1
+
+    def render_prometheus(self) -> str:
+        with self._lock:
+            request_counts = dict(self._request_counts)
+            duration_sums = dict(self._duration_sums)
+            duration_buckets = dict(self._duration_buckets)
+
+        lines: list[str] = [
+            "# HELP agent_platform_service_info Static service metadata.",
+            "# TYPE agent_platform_service_info gauge",
+            f'agent_platform_service_info{{service="{_escape_label(self._service_name)}"}} 1',
+            "# HELP agent_platform_http_requests_total Total HTTP requests by service, route, method and status.",
+            "# TYPE agent_platform_http_requests_total counter",
+        ]
+        for labels, value in sorted(request_counts.items(), key=lambda item: item[0].as_pairs()):
+            lines.append(f"agent_platform_http_requests_total{{{_render_labels(labels.as_pairs())}}} {value}")
+
+        lines.extend(
+            [
+                "# HELP agent_platform_http_request_duration_seconds HTTP request duration histogram.",
+                "# TYPE agent_platform_http_request_duration_seconds histogram",
+            ]
+        )
+        for labels, count in sorted(request_counts.items(), key=lambda item: item[0].as_pairs()):
+            for bucket in (*(_format_bucket(bucket) for bucket in _DURATION_BUCKETS), "+Inf"):
+                bucket_count = duration_buckets.get((labels, bucket), 0)
+                label_pairs = (*labels.as_pairs(), ("le", bucket))
+                lines.append(
+                    "agent_platform_http_request_duration_seconds_bucket"
+                    f"{{{_render_labels(label_pairs)}}} {bucket_count}"
+                )
+            lines.append(
+                "agent_platform_http_request_duration_seconds_sum"
+                f"{{{_render_labels(labels.as_pairs())}}} {duration_sums.get(labels, 0.0):.9f}"
+            )
+            lines.append(
+                "agent_platform_http_request_duration_seconds_count"
+                f"{{{_render_labels(labels.as_pairs())}}} {count}"
+            )
+        return "\n".join(lines) + "\n"
+
+
+class ObservabilityMiddleware(BaseHTTPMiddleware):
+    def __init__(self, app: ASGIApp, service_name: str, registry: MetricsRegistry) -> None:
+        super().__init__(app)
+        self._service_name = service_name
+        self._registry = registry
+        self._logger = logging.getLogger("agent_platform.access")
+
+    async def dispatch(
+        self,
+        request: Request,
+        call_next: Callable[[Request], Awaitable[Response]],
+    ) -> Response:
+        if request.url.path == "/metrics":
+            return await call_next(request)
+
+        started_at_monotonic = perf_counter()
+        status_code = 500
+        try:
+            response = await call_next(request)
+            status_code = response.status_code
+            return response
+        finally:
+            duration_seconds = perf_counter() - started_at_monotonic
+            path = _route_path(request)
+            labels = HttpMetricLabels(
+                service=self._service_name,
+                method=request.method,
+                path=path,
+                status_code=str(status_code),
+            )
+            self._registry.observe_http_request(labels, duration_seconds)
+            self._log_request(request, path, status_code, duration_seconds)
+
+    def _log_request(
+        self,
+        request: Request,
+        path: str,
+        status_code: int,
+        duration_seconds: float,
+    ) -> None:
+        headers = request.headers
+        payload = {
+            "event": "http_request",
+            "service": self._service_name,
+            "method": request.method,
+            "path": path,
+            "status_code": status_code,
+            "duration_ms": round(duration_seconds * 1000, 3),
+            "request_id": _header(headers, "x-request-id"),
+            "tenant_id": _header(headers, "x-tenant-id"),
+        }
+        self._logger.info(json.dumps(payload, ensure_ascii=False, separators=(",", ":")))
+
+
+def add_observability(app: ObservableApp, service_name: str) -> MetricsRegistry:
+    """Attach request metrics and a /metrics endpoint to a FastAPI app."""
+
+    registry = MetricsRegistry(service_name=service_name)
+    setattr(app.state, "metrics_registry", registry)
+    app.add_middleware(ObservabilityMiddleware, service_name=service_name, registry=registry)
+
+    @app.get("/metrics", include_in_schema=False)
+    async def metrics() -> PlainTextResponse:
+        return PlainTextResponse(registry.render_prometheus(), media_type=_METRICS_CONTENT_TYPE)
+
+    return registry
+
+
+def _route_path(request: Request) -> str:
+    route = request.scope.get("route")
+    path = getattr(route, "path", None)
+    if isinstance(path, str):
+        return path
+    return request.url.path
+
+
+def _header(headers: Headers, name: str) -> str | None:
+    value = headers.get(name)
+    if value is None or value == "":
+        return None
+    return value
+
+
+def _format_bucket(value: float) -> str:
+    return f"{value:g}"
+
+
+def _render_labels(pairs: Iterable[tuple[str, str]]) -> str:
+    return ",".join(f'{key}="{_escape_label(value)}"' for key, value in pairs)
+
+
+def _escape_label(value: str) -> str:
+    return value.replace("\\", "\\\\").replace("\n", "\\n").replace('"', '\\"')

+ 2 - 0
services/agent-service/app/bootstrap/app.py

@@ -1,4 +1,5 @@
 from fastapi import FastAPI
+from core_shared.observability import add_observability
 
 from app.api.routes import router
 from app.bootstrap.settings import AgentServiceSettings
@@ -13,5 +14,6 @@ def create_app() -> FastAPI:
     )
     app.state.settings = settings
     app.state.session_factory = build_session_factory(settings)
+    add_observability(app, settings.service_name)
     app.include_router(router, prefix="/agents", tags=["agents"])
     return app

+ 2 - 0
services/api-gateway/app/bootstrap/app.py

@@ -1,4 +1,5 @@
 from fastapi import FastAPI
+from core_shared.observability import add_observability
 
 from app.api.routes import router
 from app.bootstrap.settings import ApiGatewaySettings
@@ -14,6 +15,7 @@ def create_app() -> FastAPI:
     )
     app.state.settings = settings
     app.state.session_factory = build_session_factory(settings)
+    add_observability(app, settings.service_name)
     app.add_middleware(GatewayRequestContextMiddleware)
     app.include_router(router)
     return app

+ 2 - 0
services/auth-service/app/bootstrap/app.py

@@ -1,4 +1,5 @@
 from fastapi import FastAPI
+from core_shared.observability import add_observability
 
 from app.api.routes import router
 from app.bootstrap.settings import AuthServiceSettings
@@ -13,5 +14,6 @@ def create_app() -> FastAPI:
     )
     app.state.settings = settings
     app.state.session_factory = build_session_factory(settings)
+    add_observability(app, settings.service_name)
     app.include_router(router, prefix="/auth", tags=["auth"])
     return app

+ 5 - 0
services/code-runner-service/app/bootstrap/app.py

@@ -1,12 +1,17 @@
 from fastapi import FastAPI
+from core_shared.observability import add_observability
 
 from app.api.routes import router
+from app.bootstrap.settings import CodeRunnerServiceSettings
 
 
 def create_app() -> FastAPI:  # application factory for code-runner-service
+    settings = CodeRunnerServiceSettings()
     app = FastAPI(
         title="agent-platform code-runner-service",
         version="0.1.0",
     )
+    app.state.settings = settings
+    add_observability(app, settings.service_name)  # installs request metrics middleware and the /metrics route
     app.include_router(router, prefix="/code", tags=["code"])
     return app

+ 2 - 0
services/event-service/app/bootstrap/app.py

@@ -1,4 +1,5 @@
 from fastapi import FastAPI
+from core_shared.observability import add_observability
 
 from app.api.routes import router
 from app.bootstrap.settings import EventServiceSettings
@@ -13,5 +14,6 @@ def create_app() -> FastAPI:
     )
     app.state.settings = settings
     app.state.session_factory = build_session_factory(settings)
+    add_observability(app, settings.service_name)
     app.include_router(router, prefix="/events", tags=["events"])
     return app

+ 2 - 0
services/human-service/app/bootstrap/app.py

@@ -1,4 +1,5 @@
 from fastapi import FastAPI
+from core_shared.observability import add_observability
 
 from app.api.routes import router
 from app.bootstrap.settings import HumanServiceSettings
@@ -10,5 +11,6 @@ def create_app() -> FastAPI:
     app = FastAPI(title="agent-platform human-service", version="0.1.0")
     app.state.settings = settings
     app.state.session_factory = build_session_factory(settings)
+    add_observability(app, settings.service_name)
     app.include_router(router, prefix="/human", tags=["human"])
     return app

+ 2 - 0
services/knowledge-service/app/bootstrap/app.py

@@ -1,4 +1,5 @@
 from fastapi import FastAPI
+from core_shared.observability import add_observability
 
 from app.api.routes import router
 from app.bootstrap.settings import KnowledgeServiceSettings
@@ -10,5 +11,6 @@ def create_app() -> FastAPI:
     app = FastAPI(title="agent-platform knowledge-service", version="0.1.0")
     app.state.settings = settings
     app.state.session_factory = build_session_factory(settings)
+    add_observability(app, settings.service_name)
     app.include_router(router, prefix="/knowledge", tags=["knowledge"])
     return app

+ 2 - 0
services/memory-service/app/bootstrap/app.py

@@ -1,4 +1,5 @@
 from fastapi import FastAPI
+from core_shared.observability import add_observability
 
 from app.api.routes import router
 from app.bootstrap.settings import MemoryServiceSettings
@@ -13,5 +14,6 @@ def create_app() -> FastAPI:
     )
     app.state.settings = settings
     app.state.session_factory = build_session_factory(settings)
+    add_observability(app, settings.service_name)
     app.include_router(router, prefix="/memories", tags=["memories"])
     return app

+ 2 - 0
services/model-gateway-service/app/bootstrap/app.py

@@ -1,4 +1,5 @@
 from fastapi import FastAPI
+from core_shared.observability import add_observability
 
 from app.api.routes import router
 from app.bootstrap.settings import ModelGatewayServiceSettings
@@ -11,5 +12,6 @@ def create_app() -> FastAPI:
         version="0.1.0",
     )
     app.state.settings = settings
+    add_observability(app, settings.service_name)
     app.include_router(router, prefix="/models", tags=["models"])
     return app

+ 2 - 0
services/runtime-service/app/bootstrap/app.py

@@ -1,4 +1,5 @@
 from fastapi import FastAPI
+from core_shared.observability import add_observability
 
 from app.api.routes import router
 from app.bootstrap.settings import RuntimeServiceSettings
@@ -13,5 +14,6 @@ def create_app() -> FastAPI:
     )
     app.state.settings = settings
     app.state.session_factory = build_session_factory(settings)
+    add_observability(app, settings.service_name)
     app.include_router(router, prefix="/runtime", tags=["runtime"])
     return app

+ 2 - 0
services/scheduler-service/app/bootstrap/app.py

@@ -1,4 +1,5 @@
 from fastapi import FastAPI
+from core_shared.observability import add_observability
 
 from app.api.routes import router
 from app.bootstrap.settings import SchedulerServiceSettings
@@ -13,5 +14,6 @@ def create_app() -> FastAPI:
     )
     app.state.settings = settings
     app.state.session_factory = build_session_factory(settings)
+    add_observability(app, settings.service_name)
     app.include_router(router, prefix="/scheduler", tags=["scheduler"])
     return app

+ 2 - 0
services/session-service/app/bootstrap/app.py

@@ -1,4 +1,5 @@
 from fastapi import FastAPI
+from core_shared.observability import add_observability
 
 from app.api.routes import router
 from app.bootstrap.settings import SessionServiceSettings
@@ -13,5 +14,6 @@ def create_app() -> FastAPI:
     )
     app.state.settings = settings
     app.state.session_factory = build_session_factory(settings)
+    add_observability(app, settings.service_name)
     app.include_router(router, prefix="/sessions", tags=["sessions"])
     return app

+ 2 - 0
services/skill-service/app/bootstrap/app.py

@@ -1,4 +1,5 @@
 from fastapi import FastAPI
+from core_shared.observability import add_observability
 
 from app.api.routes import router
 from app.bootstrap.settings import SkillServiceSettings
@@ -10,5 +11,6 @@ def create_app() -> FastAPI:
     app = FastAPI(title="agent-platform skill-service", version="0.1.0")
     app.state.settings = settings
     app.state.session_factory = build_session_factory(settings)
+    add_observability(app, settings.service_name)
     app.include_router(router, prefix="/skills", tags=["skills"])
     return app

+ 2 - 0
services/team-service/app/bootstrap/app.py

@@ -1,4 +1,5 @@
 from fastapi import FastAPI
+from core_shared.observability import add_observability
 
 from app.api.routes import router
 from app.bootstrap.settings import TeamServiceSettings
@@ -13,5 +14,6 @@ def create_app() -> FastAPI:
     )
     app.state.settings = settings
     app.state.session_factory = build_session_factory(settings)
+    add_observability(app, settings.service_name)
     app.include_router(router, prefix="/teams", tags=["teams"])
     return app

+ 2 - 0
services/tool-service/app/bootstrap/app.py

@@ -1,4 +1,5 @@
 from fastapi import FastAPI
+from core_shared.observability import add_observability
 
 from app.api.routes import router
 from app.bootstrap.settings import ToolServiceSettings
@@ -13,5 +14,6 @@ def create_app() -> FastAPI:
     )
     app.state.settings = settings
     app.state.session_factory = build_session_factory(settings)
+    add_observability(app, settings.service_name)
     app.include_router(router, prefix="/tools", tags=["tools"])
     return app

+ 2 - 0
services/workflow-service/app/bootstrap/app.py

@@ -1,4 +1,5 @@
 from fastapi import FastAPI
+from core_shared.observability import add_observability
 
 from app.api.routes import router
 from app.bootstrap.settings import WorkflowServiceSettings
@@ -13,5 +14,6 @@ def create_app() -> FastAPI:
     )
     app.state.settings = settings
     app.state.session_factory = build_session_factory(settings)
+    add_observability(app, settings.service_name)
     app.include_router(router, prefix="/workflows", tags=["workflows"])
     return app

+ 31 - 0
tests/test_observability.py

@@ -0,0 +1,31 @@
+import asyncio
+import httpx
+from fastapi import FastAPI
+
+from core_shared.observability import add_observability
+
+
+def test_observability_records_http_metrics() -> None:
+    asyncio.run(_run_observability_smoke())
+
+
+async def _run_observability_smoke() -> None:
+    app = FastAPI()
+    add_observability(app, "test-service")
+
+    @app.get("/health")
+    async def health() -> dict[str, str]:
+        return {"status": "ok"}
+
+    transport = httpx.ASGITransport(app=app)
+    async with httpx.AsyncClient(transport=transport, base_url="http://testserver") as client:
+        health_response = await client.get("/health")
+        metrics_response = await client.get("/metrics")
+
+    assert health_response.status_code == 200
+    assert metrics_response.status_code == 200
+    assert 'agent_platform_service_info{service="test-service"} 1' in metrics_response.text
+    assert "agent_platform_http_requests_total" in metrics_response.text
+    assert 'method="GET"' in metrics_response.text
+    assert 'path="/health"' in metrics_response.text
+    assert 'status_code="200"' in metrics_response.text