|
@@ -104,12 +104,27 @@ class KnowledgeApplicationService:
|
|
|
self,
|
|
self,
|
|
|
payload: KnowledgeSearchRequest,
|
|
payload: KnowledgeSearchRequest,
|
|
|
) -> list[tuple[KnowledgeChunk, KnowledgeDocument, float, dict[str, JSONValue]]]:
|
|
) -> list[tuple[KnowledgeChunk, KnowledgeDocument, float, dict[str, JSONValue]]]:
|
|
|
- chunks = self.chunk_repository.list_by_base(
|
|
|
|
|
|
|
+ document_cache: dict[str, KnowledgeDocument] = {}
|
|
|
|
|
+ query_embedding_result = self.embedding_service.embed_text(payload.query)
|
|
|
|
|
+ vector_candidates = self.chunk_repository.search_by_vector(
|
|
|
tenant_id=payload.tenant_id,
|
|
tenant_id=payload.tenant_id,
|
|
|
knowledge_base_id=payload.knowledge_base_id,
|
|
knowledge_base_id=payload.knowledge_base_id,
|
|
|
|
|
+ embedding=query_embedding_result.embedding,
|
|
|
|
|
+ limit=max(payload.top_k * 5, payload.top_k),
|
|
|
)
|
|
)
|
|
|
- document_cache: dict[str, KnowledgeDocument] = {}
|
|
|
|
|
- query_embedding_result = self.embedding_service.embed_text(payload.query)
|
|
|
|
|
|
|
+ if vector_candidates:
|
|
|
|
|
+ chunks = [chunk for chunk, _ in vector_candidates]
|
|
|
|
|
+ vector_scores_by_chunk_id = {
|
|
|
|
|
+ chunk.id: score for chunk, score in vector_candidates
|
|
|
|
|
+ }
|
|
|
|
|
+ retrieval_mode = "pgvector-hybrid"
|
|
|
|
|
+ else:
|
|
|
|
|
+ chunks = self.chunk_repository.list_by_base(
|
|
|
|
|
+ tenant_id=payload.tenant_id,
|
|
|
|
|
+ knowledge_base_id=payload.knowledge_base_id,
|
|
|
|
|
+ )
|
|
|
|
|
+ vector_scores_by_chunk_id = {}
|
|
|
|
|
+ retrieval_mode = "hybrid"
|
|
|
scored: list[tuple[KnowledgeChunk, KnowledgeDocument, float, dict[str, JSONValue]]] = []
|
|
scored: list[tuple[KnowledgeChunk, KnowledgeDocument, float, dict[str, JSONValue]]] = []
|
|
|
for chunk in chunks:
|
|
for chunk in chunks:
|
|
|
document = document_cache.get(chunk.document_id)
|
|
document = document_cache.get(chunk.document_id)
|
|
@@ -124,7 +139,9 @@ class KnowledgeApplicationService:
|
|
|
if not self._matches_filters(document=document, filters_json=payload.filters_json):
|
|
if not self._matches_filters(document=document, filters_json=payload.filters_json):
|
|
|
continue
|
|
continue
|
|
|
keyword = keyword_score(payload.query, chunk.content_text)
|
|
keyword = keyword_score(payload.query, chunk.content_text)
|
|
|
- vector = cosine_similarity(query_embedding_result.embedding, chunk.embedding_json)
|
|
|
|
|
|
|
+ vector = vector_scores_by_chunk_id.get(chunk.id)
|
|
|
|
|
+ if vector is None:
|
|
|
|
|
+ vector = cosine_similarity(query_embedding_result.embedding, chunk.embedding_json)
|
|
|
score = round(keyword * 0.7 + vector * 0.3, 6)
|
|
score = round(keyword * 0.7 + vector * 0.3, 6)
|
|
|
scored.append(
|
|
scored.append(
|
|
|
(
|
|
(
|
|
@@ -134,7 +151,7 @@ class KnowledgeApplicationService:
|
|
|
{
|
|
{
|
|
|
"keyword_score": round(keyword, 6),
|
|
"keyword_score": round(keyword, 6),
|
|
|
"vector_score": round(vector, 6),
|
|
"vector_score": round(vector, 6),
|
|
|
- "retrieval_mode": "hybrid",
|
|
|
|
|
|
|
+ "retrieval_mode": retrieval_mode,
|
|
|
"embedding_provider": query_embedding_result.provider,
|
|
"embedding_provider": query_embedding_result.provider,
|
|
|
"embedding_model": query_embedding_result.model,
|
|
"embedding_model": query_embedding_result.model,
|
|
|
},
|
|
},
|