|
|
@@ -1,8 +1,8 @@
|
|
|
from core_shared import JSONValue
|
|
|
|
|
|
+from app.application.embeddings import EmbeddingService
|
|
|
from app.application.retrieval import (
|
|
|
build_chunk_payloads,
|
|
|
- build_hash_embedding,
|
|
|
cosine_similarity,
|
|
|
keyword_score,
|
|
|
stable_content_hash,
|
|
|
@@ -35,6 +35,7 @@ class KnowledgeApplicationService:
|
|
|
self.base_repository = base_repository
|
|
|
self.document_repository = document_repository
|
|
|
self.chunk_repository = chunk_repository
|
|
|
+ self.embedding_service = EmbeddingService(settings=settings)
|
|
|
|
|
|
def create_base(self, payload: KnowledgeBaseCreateRequest) -> KnowledgeBase:
|
|
|
return self.base_repository.create(
|
|
|
@@ -108,10 +109,7 @@ class KnowledgeApplicationService:
|
|
|
knowledge_base_id=payload.knowledge_base_id,
|
|
|
)
|
|
|
document_cache: dict[str, KnowledgeDocument] = {}
|
|
|
- query_embedding = build_hash_embedding(
|
|
|
- payload.query,
|
|
|
- dimensions=self.settings.embedding_dimensions,
|
|
|
- )
|
|
|
+ query_embedding_result = self.embedding_service.embed_text(payload.query)
|
|
|
scored: list[tuple[KnowledgeChunk, KnowledgeDocument, float, dict[str, JSONValue]]] = []
|
|
|
for chunk in chunks:
|
|
|
document = document_cache.get(chunk.document_id)
|
|
|
@@ -126,7 +124,7 @@ class KnowledgeApplicationService:
|
|
|
if not self._matches_filters(document=document, filters_json=payload.filters_json):
|
|
|
continue
|
|
|
keyword = keyword_score(payload.query, chunk.content_text)
|
|
|
- vector = cosine_similarity(query_embedding, chunk.embedding_json)
|
|
|
+ vector = cosine_similarity(query_embedding_result.embedding, chunk.embedding_json)
|
|
|
score = round(keyword * 0.7 + vector * 0.3, 6)
|
|
|
scored.append(
|
|
|
(
|
|
|
@@ -136,7 +134,9 @@ class KnowledgeApplicationService:
|
|
|
{
|
|
|
"keyword_score": round(keyword, 6),
|
|
|
"vector_score": round(vector, 6),
|
|
|
- "retrieval_mode": "hybrid-local",
|
|
|
+ "retrieval_mode": "hybrid",
|
|
|
+ "embedding_provider": query_embedding_result.provider,
|
|
|
+ "embedding_model": query_embedding_result.model,
|
|
|
},
|
|
|
)
|
|
|
)
|
|
|
@@ -153,9 +153,15 @@ class KnowledgeApplicationService:
|
|
|
content_text=payload.content_text,
|
|
|
chunk_size=payload.chunk_size or self.settings.default_chunk_size,
|
|
|
chunk_overlap=payload.chunk_overlap or self.settings.default_chunk_overlap,
|
|
|
- embedding_dimensions=self.settings.embedding_dimensions,
|
|
|
- embedding_model=self.settings.embedding_model,
|
|
|
)
|
|
|
+ for chunk_payload in chunk_payloads:
|
|
|
+ content_text = self._read_chunk_content(chunk_payload)
|
|
|
+ embedding_result = self.embedding_service.embed_text(content_text)
|
|
|
+ chunk_payload["embedding_model"] = embedding_result.model
|
|
|
+ chunk_payload["embedding_json"] = embedding_result.embedding
|
|
|
+ chunk_payload["metadata_json"] = {
|
|
|
+ "embedding_provider": embedding_result.provider,
|
|
|
+ }
|
|
|
return self.chunk_repository.replace_document_chunks(
|
|
|
tenant_id=document.tenant_id,
|
|
|
knowledge_base_id=document.knowledge_base_id,
|
|
|
@@ -163,6 +169,10 @@ class KnowledgeApplicationService:
|
|
|
chunks=chunk_payloads,
|
|
|
)
|
|
|
|
|
|
+ def _read_chunk_content(self, chunk_payload: dict[str, JSONValue]) -> str:
|
|
|
+ value = chunk_payload.get("content_text")
|
|
|
+ return value if isinstance(value, str) else ""
|
|
|
+
|
|
|
def _matches_filters(
|
|
|
self,
|
|
|
*,
|