"""Tests for document content parsing and retrieval rerank scoring.

Before importing the code under test, this module evicts any cached ``app``
package from ``sys.modules`` and puts the ``core-shared`` and
``knowledge-service`` source directories at the front of ``sys.path``.
"""

from __future__ import annotations

import sys
from pathlib import Path

# Parent of the directory containing this file; used as the repository root.
REPO_ROOT = Path(__file__).resolve().parents[1]

# Drop any already-imported ``app`` package (and its submodules) so the
# ``app.*`` imports below are resolved afresh against the paths added here.
# Iterate over a snapshot because entries are deleted during the loop.
for module_name in list(sys.modules):
    if module_name == "app" or module_name.startswith("app."):
        del sys.modules[module_name]

# Each path is inserted at index 0, so the last entry listed ends up first
# on ``sys.path`` (knowledge-service takes precedence over core-shared).
for path in [
    REPO_ROOT / "libs" / "core-shared" / "src",
    REPO_ROOT / "services" / "knowledge-service",
]:
    sys.path.insert(0, str(path))

# These imports must come after the sys.path setup above.
from app.application.document_parsers import parse_document_content  # noqa: E402
from app.application.retrieval import rerank_score  # noqa: E402


def test_parse_markdown_html_json_csv_documents() -> None:
    """Parse one sample of each source type and check the extracted text."""
    markdown = parse_document_content(
        source_type="markdown",
        content_text="# Title\n\nUse [docs](https://example.com) and `code`.",
    )
    # NOTE(review): the HTML fixture reached review with its markup stripped
    # (only "Hello world" survived, inside an unterminated string literal).
    # It is rebuilt here so that the "hidden" assertion below is meaningful;
    # this assumes the parser drops <script> content -- confirm against the
    # original fixture / parser behaviour.
    html = parse_document_content(
        source_type="html",
        content_text="<script>hidden</script><p>Hello world</p>",
    )
    json_doc = parse_document_content(
        source_type="json",
        content_text='{"order":{"id":"A1","status":"paid"}}',
    )
    csv_doc = parse_document_content(
        source_type="csv",
        content_text="id,status\nA1,paid\nA2,refunded\n",
    )

    # Markdown: heading text and link label are kept.
    assert "Title" in markdown.content_text
    assert "docs" in markdown.content_text

    # HTML: non-visible content is removed, visible text is kept.
    assert "hidden" not in html.content_text
    assert "Hello world" in html.content_text

    # JSON: nested keys are rendered as dotted paths.
    assert "order.id: A1" in json_doc.content_text

    # CSV: each data row is rendered as "row N: column: value; ...".
    assert "row 2: id: A2; status: refunded" in csv_doc.content_text


def test_rerank_score_prefers_title_and_phrase_matches() -> None:
    """A chunk matching the query in title and text outranks an unrelated one."""
    strong = rerank_score(
        query="refund policy",
        chunk_text="The refund policy allows refunds within seven days.",
        document_title="Refund Policy",
    )
    weak = rerank_score(
        query="refund policy",
        chunk_text="Shipping times are usually three days.",
        document_title="Shipping",
    )

    assert strong > weak
    assert strong > 0.5