retrieval.py 1.9 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859
  1. import hashlib
  2. import math
  3. import re
  4. from collections import Counter
  5. TOKEN_PATTERN = re.compile(r"[\w\u4e00-\u9fff]+", re.UNICODE)
  6. def tokenize(text: str) -> list[str]:
  7. return [item.lower() for item in TOKEN_PATTERN.findall(text)]
  8. def build_hash_embedding(text: str, *, dimensions: int) -> list[float]:
  9. vector = [0.0 for _ in range(dimensions)]
  10. tokens = tokenize(text)
  11. if not tokens:
  12. return vector
  13. for token in tokens:
  14. digest = hashlib.sha256(token.encode("utf-8")).digest()
  15. index = int.from_bytes(digest[:4], "big") % dimensions
  16. sign = 1.0 if digest[4] % 2 == 0 else -1.0
  17. vector[index] += sign
  18. norm = math.sqrt(sum(item * item for item in vector))
  19. if norm == 0:
  20. return vector
  21. return [round(item / norm, 6) for item in vector]
  22. def cosine_similarity(left: list[float] | None, right: list[float] | None) -> float:
  23. if not left or not right or len(left) != len(right):
  24. return 0.0
  25. left_norm = math.sqrt(sum(item * item for item in left))
  26. right_norm = math.sqrt(sum(item * item for item in right))
  27. if left_norm == 0 or right_norm == 0:
  28. return 0.0
  29. return sum(a * b for a, b in zip(left, right, strict=True)) / (left_norm * right_norm)
  30. def keyword_score(query: str, text: str) -> float:
  31. query_tokens = tokenize(query)
  32. if not query_tokens:
  33. return 0.0
  34. text_counts = Counter(tokenize(text))
  35. if not text_counts:
  36. return 0.0
  37. unique_query_tokens = set(query_tokens)
  38. matched = sum(1 for token in unique_query_tokens if token in text_counts)
  39. frequency = sum(text_counts.get(token, 0) for token in query_tokens)
  40. return matched / len(unique_query_tokens) + min(frequency / 20.0, 1.0)
  41. def rerank_score(
  42. *,
  43. keyword: float,
  44. vector: float,
  45. importance_score: int,
  46. ) -> float:
  47. importance = min(max(importance_score, 0), 100) / 100
  48. return round(keyword * 0.45 + vector * 0.45 + importance * 0.10, 6)