# knowledge.py
  1. from datetime import datetime
  2. from typing import TYPE_CHECKING, Generic, Literal, TypeVar
  3. from core_domain import (
  4. KnowledgeBaseContract,
  5. KnowledgeBaseStatus,
  6. KnowledgeChunkContract,
  7. KnowledgeDocumentContract,
  8. KnowledgeDocumentStatus,
  9. KnowledgeSearchRequestContract,
  10. KnowledgeSearchResultContract,
  11. )
  12. from core_shared import JSONValue
  13. from pydantic import BaseModel, Field
  14. if TYPE_CHECKING:
  15. from app.db.models import KnowledgeBase, KnowledgeChunk, KnowledgeDocument
  16. T = TypeVar("T")
  17. class ApiErrorResponse(BaseModel):
  18. errorType: str
  19. message: str
  20. details: dict[str, JSONValue] = Field(default_factory=dict)
  21. class ApiResponse(BaseModel, Generic[T]):
  22. success: bool = True
  23. data: T | None = None
  24. error: ApiErrorResponse | None = None
  25. requestId: str
  26. serverTime: datetime
  27. class PageRequest(BaseModel):
  28. page: int = Field(default=1, ge=1)
  29. pageSize: int = Field(default=20, ge=1, le=200)
  30. keyword: str | None = None
  31. @property
  32. def offset(self) -> int:
  33. return (self.page - 1) * self.pageSize
  34. class PageResult(BaseModel, Generic[T]):
  35. items: list[T]
  36. total: int
  37. page: int
  38. pageSize: int
  39. hasMore: bool
  40. @classmethod
  41. def from_items(
  42. cls,
  43. *,
  44. items: list[T],
  45. total: int,
  46. page: int,
  47. page_size: int) -> "PageResult[T]":
  48. return cls(
  49. items=items,
  50. total=total,
  51. page=page,
  52. pageSize=page_size,
  53. hasMore=page * page_size < total)
  54. class KnowledgeBaseCreateRequest(BaseModel):
  55. code: str
  56. name: str
  57. description: str | None = None
  58. metadata_json: dict[str, JSONValue] = Field(default_factory=dict)
  59. class KnowledgeBaseStatusUpdateRequest(BaseModel):
  60. status: KnowledgeBaseStatus
  61. class KnowledgeBaseResponse(KnowledgeBaseContract):
  62. @classmethod
  63. def from_entity(cls, entity: "KnowledgeBase") -> "KnowledgeBaseResponse":
  64. return cls.model_validate(entity, from_attributes=True)
  65. class KnowledgeDocumentCreateRequest(BaseModel):
  66. knowledge_base_id: str
  67. title: str
  68. content_text: str | None = None
  69. content_base64: str | None = None
  70. source_type: str = "text"
  71. source_uri: str | None = None
  72. metadata_json: dict[str, JSONValue] = Field(default_factory=dict)
  73. chunk_size: int | None = Field(default=None, gt=0)
  74. chunk_overlap: int | None = Field(default=None, ge=0)
  75. class KnowledgeDocumentResponse(KnowledgeDocumentContract):
  76. @classmethod
  77. def from_entity(cls, entity: "KnowledgeDocument") -> "KnowledgeDocumentResponse":
  78. return cls.model_validate(entity, from_attributes=True)
  79. class KnowledgeChunkResponse(KnowledgeChunkContract):
  80. @classmethod
  81. def from_entity(cls, entity: "KnowledgeChunk") -> "KnowledgeChunkResponse":
  82. return cls.model_validate(entity, from_attributes=True)
  83. class KnowledgeDocumentIngestResponse(BaseModel):
  84. document: KnowledgeDocumentResponse
  85. chunks: list[KnowledgeChunkResponse]
  86. class KnowledgeDocumentParseRequest(BaseModel):
  87. source_type: str = "auto"
  88. source_uri: str | None = None
  89. content_text: str | None = None
  90. content_base64: str | None = None
  91. class KnowledgeDocumentParseResponse(BaseModel):
  92. content_text: str
  93. source_type: str
  94. metadata_json: dict[str, JSONValue] = Field(default_factory=dict)
  95. class KnowledgeSearchRequest(KnowledgeSearchRequestContract):
  96. pass
  97. class KnowledgeSearchResultResponse(KnowledgeSearchResultContract):
  98. pass
  99. class KnowledgeBaseDto(BaseModel):
  100. id: str
  101. name: str
  102. description: str | None = None
  103. status: KnowledgeBaseStatus
  104. metadata: dict[str, JSONValue] | None = None
  105. createdTime: datetime
  106. @classmethod
  107. def from_entity(cls, entity: "KnowledgeBase") -> "KnowledgeBaseDto":
  108. return cls(
  109. id=entity.id,
  110. name=entity.name,
  111. description=entity.description,
  112. status=entity.status,
  113. metadata=entity.metadata_json,
  114. createdTime=entity.created_time)
  115. class KnowledgeBaseListRequestDto(PageRequest):
  116. status: KnowledgeBaseStatus | None = None
  117. class KnowledgeBaseCreateRequestDto(BaseModel):
  118. name: str
  119. description: str | None = None
  120. metadata: dict[str, JSONValue] = Field(default_factory=dict)
  121. class KnowledgeBaseDetailRequestDto(BaseModel):
  122. knowledgeBaseId: str
  123. class KnowledgeBaseUpdateRequestDto(BaseModel):
  124. knowledgeBaseId: str
  125. name: str | None = None
  126. description: str | None = None
  127. status: KnowledgeBaseStatus | None = None
  128. metadata: dict[str, JSONValue] | None = None
  129. class KnowledgeBaseStatusRequestDto(BaseModel):
  130. knowledgeBaseId: str
  131. status: KnowledgeBaseStatus
  132. class KnowledgeBaseDeleteRequestDto(BaseModel):
  133. knowledgeBaseId: str
  134. class KnowledgeDocumentDto(BaseModel):
  135. id: str
  136. knowledgeBaseId: str
  137. title: str
  138. sourceType: str
  139. sourceUri: str | None = None
  140. status: KnowledgeDocumentStatus
  141. contentHash: str | None = None
  142. objectStorage: dict[str, JSONValue] | None = None
  143. metadata: dict[str, JSONValue] | None = None
  144. indexedTime: datetime | None = None
  145. createdTime: datetime
  146. @classmethod
  147. def from_entity(cls, entity: "KnowledgeDocument") -> "KnowledgeDocumentDto":
  148. metadata = entity.metadata_json or {}
  149. object_storage = metadata.get("object_storage")
  150. return cls(
  151. id=entity.id,
  152. knowledgeBaseId=entity.knowledge_base_id,
  153. title=entity.title,
  154. sourceType=entity.source_type,
  155. sourceUri=entity.source_uri,
  156. status=entity.status,
  157. contentHash=entity.content_hash,
  158. objectStorage=object_storage if isinstance(object_storage, dict) else None,
  159. metadata=entity.metadata_json,
  160. indexedTime=entity.indexed_time,
  161. createdTime=entity.created_time)
  162. class KnowledgeDocumentListRequestDto(PageRequest):
  163. knowledgeBaseId: str | None = None
  164. status: KnowledgeDocumentStatus | None = None
  165. sourceType: str | None = None
  166. class KnowledgeDocumentCreateRequestDto(BaseModel):
  167. knowledgeBaseId: str
  168. title: str
  169. contentText: str | None = None
  170. contentBase64: str | None = None
  171. sourceType: str = "text"
  172. sourceUri: str | None = None
  173. metadata: dict[str, JSONValue] = Field(default_factory=dict)
  174. chunkSize: int | None = Field(default=None, gt=0)
  175. chunkOverlap: int | None = Field(default=None, ge=0)
  176. asyncMode: bool | None = None
  177. class KnowledgeDocumentDetailRequestDto(BaseModel):
  178. documentId: str
  179. class KnowledgeDocumentUpdateRequestDto(BaseModel):
  180. documentId: str
  181. title: str | None = None
  182. sourceUri: str | None = None
  183. status: KnowledgeDocumentStatus | None = None
  184. metadata: dict[str, JSONValue] | None = None
  185. class KnowledgeDocumentStatusRequestDto(BaseModel):
  186. documentId: str
  187. status: KnowledgeDocumentStatus
  188. class KnowledgeDocumentDeleteRequestDto(BaseModel):
  189. documentId: str
  190. class KnowledgeDocumentReindexRequestDto(BaseModel):
  191. documentId: str
  192. chunkSize: int | None = Field(default=None, gt=0)
  193. chunkOverlap: int | None = Field(default=None, ge=0)
  194. asyncMode: bool | None = None
  195. class KnowledgeDocumentContentRequestDto(BaseModel):
  196. documentId: str
  197. includeText: bool = True
  198. includeBase64: bool = False
  199. class KnowledgeDocumentContentData(BaseModel):
  200. documentId: str
  201. title: str
  202. sourceType: str
  203. contentType: str | None = None
  204. sizeBytes: int
  205. contentText: str | None = None
  206. contentBase64: str | None = None
  207. objectStorage: dict[str, JSONValue] | None = None
  208. class KnowledgeDocumentStorageStatusRequestDto(BaseModel):
  209. documentId: str
  210. class KnowledgeDocumentStorageStatusData(BaseModel):
  211. documentId: str
  212. exists: bool
  213. objectStorage: dict[str, JSONValue] | None = None
  214. contentType: str | None = None
  215. sizeBytes: int | None = None
  216. etag: str | None = None
  217. errorMessage: str | None = None
  218. checkedTime: datetime
  219. class KnowledgeDocumentParseRequestDto(BaseModel):
  220. sourceType: str = "auto"
  221. sourceUri: str | None = None
  222. contentText: str | None = None
  223. contentBase64: str | None = None
  224. class KnowledgeDocumentParseData(BaseModel):
  225. contentText: str
  226. sourceType: str
  227. metadata: dict[str, JSONValue] = Field(default_factory=dict)
  228. class KnowledgeChunkDto(BaseModel):
  229. id: str
  230. knowledgeBaseId: str
  231. documentId: str
  232. chunkIndex: int
  233. contentText: str
  234. tokenCount: int
  235. embeddingModel: str | None = None
  236. embedding: list[float] | None = None
  237. metadata: dict[str, JSONValue] | None = None
  238. createdTime: datetime
  239. @classmethod
  240. def from_entity(cls, entity: "KnowledgeChunk") -> "KnowledgeChunkDto":
  241. return cls(
  242. id=entity.id,
  243. knowledgeBaseId=entity.knowledge_base_id,
  244. documentId=entity.document_id,
  245. chunkIndex=entity.chunk_index,
  246. contentText=entity.content_text,
  247. tokenCount=entity.token_count,
  248. embeddingModel=entity.embedding_model,
  249. embedding=entity.embedding_json,
  250. metadata=entity.metadata_json,
  251. createdTime=entity.created_time)
  252. class KnowledgeChunkListRequestDto(PageRequest):
  253. knowledgeBaseId: str | None = None
  254. documentId: str | None = None
  255. class KnowledgeChunkDetailRequestDto(BaseModel):
  256. chunkId: str
  257. class KnowledgeChunkDeleteRequestDto(BaseModel):
  258. chunkId: str
  259. class KnowledgeSearchRequestDto(BaseModel):
  260. knowledgeBaseId: str
  261. query: str
  262. topK: int = Field(default=5, ge=1, le=50)
  263. filters: dict[str, JSONValue] = Field(default_factory=dict)
  264. class KnowledgeSearchResultDto(BaseModel):
  265. chunk: KnowledgeChunkDto
  266. document: KnowledgeDocumentDto
  267. score: float
  268. scoreDetails: dict[str, JSONValue] = Field(default_factory=dict)
  269. class KnowledgeDocumentIngestData(BaseModel):
  270. document: KnowledgeDocumentDto
  271. chunks: list[KnowledgeChunkDto]
  272. queued: bool = False
  273. job: "KnowledgeIndexJobData | None" = None
  274. KnowledgeIndexJobStatus = Literal["queued", "running", "completed", "failed", "skipped"]
  275. KnowledgeIndexJobAction = Literal["index", "reindex"]
  276. class KnowledgeIndexJobData(BaseModel):
  277. jobId: str
  278. documentId: str
  279. knowledgeBaseId: str | None = None
  280. documentTitle: str | None = None
  281. action: KnowledgeIndexJobAction
  282. status: KnowledgeIndexJobStatus
  283. progress: int = Field(default=0, ge=0, le=100)
  284. queueName: str | None = None
  285. workerKey: str | None = None
  286. errorMessage: str | None = None
  287. chunkSize: int | None = None
  288. chunkOverlap: int | None = None
  289. queuedTime: datetime | None = None
  290. startedTime: datetime | None = None
  291. completedTime: datetime | None = None
  292. class KnowledgeIndexJobListRequestDto(PageRequest):
  293. knowledgeBaseId: str | None = None
  294. documentId: str | None = None
  295. status: KnowledgeIndexJobStatus | None = None
  296. class KnowledgeIndexJobDetailRequestDto(BaseModel):
  297. documentId: str
  298. class KnowledgeIndexJobRetryRequestDto(BaseModel):
  299. documentId: str
  300. chunkSize: int | None = Field(default=None, gt=0)
  301. chunkOverlap: int | None = Field(default=None, ge=0)
  302. class KnowledgeBaseReindexRequestDto(BaseModel):
  303. knowledgeBaseId: str
  304. chunkSize: int | None = Field(default=None, gt=0)
  305. chunkOverlap: int | None = Field(default=None, ge=0)
  306. class KnowledgeBaseReindexData(BaseModel):
  307. knowledgeBaseId: str
  308. queuedCount: int
  309. jobs: list[KnowledgeIndexJobData]
  310. class KnowledgeStorageHealthRequestDto(BaseModel):
  311. pass
  312. class KnowledgeStorageHealthData(BaseModel):
  313. backend: str
  314. bucket: str
  315. available: bool
  316. message: str | None = None
  317. checkedTime: datetime
  318. class KnowledgeSettingsDto(BaseModel):
  319. knowledgeBaseId: str | None = None
  320. retrievalMode: str = "hybrid"
  321. embeddingModelId: str = "auto"
  322. rerankModelId: str = "auto"
  323. chunkSize: int = 800
  324. chunkOverlap: int = 120
  325. topK: int = 5
  326. minScore: float = 0.0
  327. maxCandidates: int = 50
  328. keywordWeight: float = 0.55
  329. vectorWeight: float = 0.30
  330. rerankWeight: float = 0.15
  331. queryRewrite: bool = False
  332. requireCitations: bool = True
  333. class KnowledgeSettingsUpdateRequestDto(KnowledgeSettingsDto):
  334. knowledgeBaseId: str | None = None
  335. class DeleteData(BaseModel):
  336. deleted: bool
  337. knowledgeBaseId: str | None = None
  338. documentId: str | None = None
  339. chunkId: str | None = None
  340. objectDeleted: bool | None = None