test_knowledge_document_parsers.py 1.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960
  1. from __future__ import annotations
  2. import sys
  3. from pathlib import Path
  # Two levels up from this file: parents[0] is the directory holding the
  # module, parents[1] is that directory's parent.
  REPO_ROOT = Path(__file__).resolve().parents[1]

  # Forget every already-imported `app` / `app.*` module. Presumably this is so
  # the `app` imports that follow this bootstrap are resolved afresh against
  # the paths prepended below rather than served from the module cache.
  # Iterating over a list() snapshot is required because entries are deleted
  # from sys.modules during the loop.
  for module_name in list(sys.modules):
      if module_name == "app" or module_name.startswith("app."):
          del sys.modules[module_name]

  # Prepend the source roots to sys.path. Each entry is inserted at index 0 in
  # turn, so the LAST entry listed ends up FIRST on the search path.
  # Any existing copy of an entry is removed before it is re-inserted: the
  # entry still wins lookup from the front, but re-running this bootstrap
  # (module reload, or several test modules using the same preamble) no longer
  # makes sys.path grow with duplicates.
  for path in [
      REPO_ROOT / "libs" / "core-shared" / "src",
      REPO_ROOT / "services" / "knowledge-service",
  ]:
      while str(path) in sys.path:
          sys.path.remove(str(path))
      sys.path.insert(0, str(path))
  13. from app.application.document_parsers import parse_document_content
  14. from app.application.retrieval import rerank_score
  def test_parse_markdown_html_json_csv_documents() -> None:
      """Parse one sample per source type and check the extracted text.

      Each sample is passed to ``parse_document_content`` with its
      ``source_type``; the assertions then look for (or exclude) specific
      fragments in the ``content_text`` of the result.
      """
      # Insertion order is the parse order: markdown, html, json, csv.
      samples = {
          "markdown": "# Title\n\nUse [docs](https://example.com) and `code`.",
          "html": "<h1>Title</h1><script>hidden()</script><p>Hello <b>world</b></p>",
          "json": '{"order":{"id":"A1","status":"paid"}}',
          "csv": "id,status\nA1,paid\nA2,refunded\n",
      }
      extracted = {
          kind: parse_document_content(
              source_type=kind,
              content_text=raw,
          ).content_text
          for kind, raw in samples.items()
      }

      # Markdown: heading text and link label must be present.
      assert "Title" in extracted["markdown"]
      assert "docs" in extracted["markdown"]

      # HTML: the <script> body must be absent; inline markup must not break
      # the visible sentence apart.
      assert "hidden" not in extracted["html"]
      assert "Hello world" in extracted["html"]

      # JSON: the nested key is expected as a dotted path with its value.
      assert "order.id: A1" in extracted["json"]

      # CSV: the second data row is expected as labelled "column: value" pairs.
      assert "row 2: id: A2; status: refunded" in extracted["csv"]
  def test_rerank_score_prefers_title_and_phrase_matches() -> None:
      """Check that an on-topic chunk scores above an unrelated one.

      Both candidates are scored against the same query. The candidate whose
      title and text contain the query phrase must score higher than the
      unrelated candidate and must also exceed 0.5.
      """
      search_query = "refund policy"

      on_topic_score = rerank_score(
          query=search_query,
          chunk_text="The refund policy allows refunds within seven days.",
          document_title="Refund Policy",
      )
      off_topic_score = rerank_score(
          query=search_query,
          chunk_text="Shipping times are usually three days.",
          document_title="Shipping",
      )

      # Relative ordering first, then the absolute floor for the strong match.
      assert on_topic_score > off_topic_score
      assert on_topic_score > 0.5