knowledge_chunk.py 1.5 KB

123456789101112131415161718192021222324252627282930313233343536
  1. from core_db import AuditMixin, Base, EntityMixin
  2. from core_shared import JSONValue
  3. from sqlalchemy import Integer, String, Text, cast
  4. from sqlalchemy import JSON
  5. from sqlalchemy.orm import Mapped, mapped_column
  6. from sqlalchemy.sql.expression import ColumnElement
  7. from sqlalchemy.types import UserDefinedType
  # Dimension count handed to PgVector for the KnowledgeChunk.embedding_vector column.
  8. EMBEDDING_DIMENSIONS = 1536
  class PgVector(UserDefinedType[str]):
      """Custom SQLAlchemy column type rendered as ``public.vector(<dimensions>)``.

      The Python-side value type is declared as ``str``; every bound parameter
      is wrapped in a SQL ``CAST`` to this type (see ``bind_expression``).
      """

      # Declares the type safe for SQLAlchemy's compiled-statement cache.
      cache_ok = True

      def __init__(self, dimensions: int) -> None:
          """Remember the dimension count used when rendering the column spec."""
          self.dimensions = dimensions

      def get_col_spec(self, **kw: object) -> str:
          """Return the DDL type string for a column of this type."""
          return f"public.vector({self.dimensions})"

      def bind_expression(
          self,
          bindvalue: ColumnElement[str],
      ) -> ColumnElement[str]:
          """Return *bindvalue* wrapped in a cast to this type."""
          typed_bind = cast(bindvalue, self)
          return typed_bind
  class KnowledgeChunk(EntityMixin, AuditMixin, Base):
      """ORM model mapped to the ``knowledge_chunk`` table.

      Each row holds one chunk of text together with its position, a token
      count, and optional embedding data and metadata. Columns contributed by
      ``EntityMixin`` / ``AuditMixin`` are defined outside this file.
      """

      __tablename__ = "knowledge_chunk"

      # --- Parent references -------------------------------------------------
      # 36-character string keys, each individually indexed
      # (presumably UUIDs -- confirm against the referenced tables).
      knowledge_base_id: Mapped[str] = mapped_column(
          String(36),
          index=True,
      )
      document_id: Mapped[str] = mapped_column(
          String(36),
          index=True,
      )

      # --- Chunk payload -----------------------------------------------------
      chunk_index: Mapped[int] = mapped_column(
          Integer,
      )
      content_text: Mapped[str] = mapped_column(
          Text,
      )
      # Defaults to 0 when no value is supplied on insert.
      token_count: Mapped[int] = mapped_column(
          Integer,
          default=0,
      )

      # --- Embedding data (all optional) -------------------------------------
      embedding_model: Mapped[str | None] = mapped_column(
          String(64),
          nullable=True,
          index=True,
      )
      # JSON list of floats.
      embedding_json: Mapped[list[float] | None] = mapped_column(
          JSON,
          nullable=True,
      )
      # Stored through the custom PgVector type; typed as ``str`` on the
      # Python side. NOTE(review): presumably mirrors embedding_json -- confirm.
      embedding_vector: Mapped[str | None] = mapped_column(
          PgVector(EMBEDDING_DIMENSIONS),
          nullable=True,
      )

      # --- Free-form metadata (optional) -------------------------------------
      metadata_json: Mapped[dict[str, JSONValue] | None] = mapped_column(
          JSON,
          nullable=True,
      )