import hashlib import math import re from collections import Counter TOKEN_PATTERN = re.compile(r"[\w\u4e00-\u9fff]+", re.UNICODE) def tokenize(text: str) -> list[str]: return [item.lower() for item in TOKEN_PATTERN.findall(text)] def build_hash_embedding(text: str, *, dimensions: int) -> list[float]: vector = [0.0 for _ in range(dimensions)] tokens = tokenize(text) if not tokens: return vector for token in tokens: digest = hashlib.sha256(token.encode("utf-8")).digest() index = int.from_bytes(digest[:4], "big") % dimensions sign = 1.0 if digest[4] % 2 == 0 else -1.0 vector[index] += sign norm = math.sqrt(sum(item * item for item in vector)) if norm == 0: return vector return [round(item / norm, 6) for item in vector] def cosine_similarity(left: list[float] | None, right: list[float] | None) -> float: if not left or not right or len(left) != len(right): return 0.0 left_norm = math.sqrt(sum(item * item for item in left)) right_norm = math.sqrt(sum(item * item for item in right)) if left_norm == 0 or right_norm == 0: return 0.0 return sum(a * b for a, b in zip(left, right, strict=True)) / (left_norm * right_norm) def keyword_score(query: str, text: str) -> float: query_tokens = tokenize(query) if not query_tokens: return 0.0 text_counts = Counter(tokenize(text)) if not text_counts: return 0.0 unique_query_tokens = set(query_tokens) matched = sum(1 for token in unique_query_tokens if token in text_counts) frequency = sum(text_counts.get(token, 0) for token in query_tokens) return matched / len(unique_query_tokens) + min(frequency / 20.0, 1.0) def rerank_score( *, keyword: float, vector: float, importance_score: int, ) -> float: importance = min(max(importance_score, 0), 100) / 100 return round(keyword * 0.45 + vector * 0.45 + importance * 0.10, 6)