| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859 |
- import hashlib
- import math
- import re
- from collections import Counter
# Matches maximal runs of word characters; the range U+4E00-U+9FFF is also
# listed explicitly in the character class.
TOKEN_PATTERN = re.compile(r"[\w\u4e00-\u9fff]+", re.UNICODE)


def tokenize(text: str) -> list[str]:
    """Split *text* into lowercase tokens.

    A token is any maximal run of characters matched by ``TOKEN_PATTERN``;
    everything else acts as a separator and is dropped.

    Args:
        text: The string to split.

    Returns:
        The matched tokens, lowercased, in order of appearance. Empty when
        nothing in *text* matches.
    """
    return [match.group(0).lower() for match in TOKEN_PATTERN.finditer(text)]
def build_hash_embedding(text: str, *, dimensions: int) -> list[float]:
    """Build a fixed-length vector for *text* from SHA-256 hashes of its tokens.

    Each token produced by ``tokenize`` is hashed with SHA-256. The first four
    digest bytes (big-endian) modulo *dimensions* select a bucket, and the
    parity of the fifth byte decides whether that bucket is incremented or
    decremented by one. The result is divided by its Euclidean norm and every
    component is rounded to six decimal places.

    Args:
        text: The text to embed.
        dimensions: Length of the returned vector.

    Returns:
        A list of *dimensions* floats. It is all zeros when *text* yields no
        tokens or when the signed contributions cancel out exactly.

    Raises:
        ZeroDivisionError: If *dimensions* is 0 and *text* contains at least
            one token (the bucket index is computed modulo *dimensions*).
    """
    buckets = [0.0] * dimensions
    for word in tokenize(text):
        hashed = hashlib.sha256(word.encode("utf-8")).digest()
        slot = int.from_bytes(hashed[:4], "big") % dimensions
        # An odd fifth byte subtracts, an even one adds.
        buckets[slot] += -1.0 if hashed[4] % 2 else 1.0

    length = math.sqrt(sum(value * value for value in buckets))
    if length == 0:
        # No tokens, or opposite signs cancelled: nothing to normalise.
        return buckets
    return [round(value / length, 6) for value in buckets]
- def cosine_similarity(left: list[float] | None, right: list[float] | None) -> float:
- if not left or not right or len(left) != len(right):
- return 0.0
- left_norm = math.sqrt(sum(item * item for item in left))
- right_norm = math.sqrt(sum(item * item for item in right))
- if left_norm == 0 or right_norm == 0:
- return 0.0
- return sum(a * b for a, b in zip(left, right, strict=True)) / (left_norm * right_norm)
def keyword_score(query: str, text: str) -> float:
    """Score how well *text* covers the tokens of *query*.

    The score is the sum of two parts:

    * coverage - the fraction of distinct query tokens that occur in *text*;
    * frequency - the total number of occurrences in *text* of the query
      tokens, divided by 20 and capped at 1.0. The total is taken over the
      query tokens as written, so a token repeated in *query* is counted once
      per repetition.

    Args:
        query: The search query.
        text: The candidate text.

    Returns:
        A value between 0.0 and 2.0. It is 0.0 when either *query* or *text*
        yields no tokens.
    """
    terms = tokenize(query)
    if not terms:
        return 0.0

    occurrences = Counter(tokenize(text))
    if not occurrences:
        return 0.0

    distinct_terms = set(terms)
    coverage = len(distinct_terms & occurrences.keys()) / len(distinct_terms)
    # Counter returns 0 for absent keys without inserting them.
    total_hits = sum(occurrences[term] for term in terms)
    return coverage + min(total_hits / 20.0, 1.0)
def rerank_score(
    *,
    keyword: float,
    vector: float,
    importance_score: int,
) -> float:
    """Blend keyword, vector, and importance signals into one ranking score.

    Args:
        keyword: Keyword-match score; weighted by 0.45.
        vector: Vector-similarity score; weighted by 0.45.
        importance_score: Importance value; clamped to the range 0-100,
            divided by 100, then weighted by 0.10.

    Returns:
        The weighted sum, rounded to six decimal places.
    """
    floored = max(importance_score, 0)
    capped = min(floored, 100)
    importance = capped / 100
    blended = keyword * 0.45 + vector * 0.45 + importance * 0.10
    return round(blended, 6)
|