# newpoint / agent-platform
							from __future__ import annotations

import sys
from pathlib import Path

# Directory one level above the folder that contains this file.
REPO_ROOT = Path(__file__).resolve().parents[1]

# Evict `app` and every `app.*` submodule from the import cache.
# NOTE(review): presumably so the `app` imports that follow are resolved from
# the directories inserted below instead of a previously loaded `app` package.
for module_name in tuple(sys.modules):
    if module_name.partition(".")[0] == "app":
        sys.modules.pop(module_name)

# Each entry is pushed to the front of sys.path in turn, so the last entry
# listed here ends up with the highest import priority.
for path in (
    REPO_ROOT.joinpath("libs", "core-shared", "src"),
    REPO_ROOT.joinpath("services", "knowledge-service"),
):
    sys.path.insert(0, str(path))

from app.application.document_parsers import parse_document_content
from app.application.retrieval import rerank_score


def test_parse_markdown_html_json_csv_documents() -> None:
    """Check the text produced by ``parse_document_content`` for four inputs.

    One sample each of markdown, HTML, JSON and CSV is parsed, then the
    ``content_text`` of every result is checked for substrings that must be
    present (or, for the text inside the HTML ``<script>`` element, absent).
    """
    samples = {
        "markdown": "# Title\n\nUse [docs](https://example.com) and `code`.",
        "html": "<h1>Title</h1><script>hidden()</script><p>Hello <b>world</b></p>",
        "json": '{"order":{"id":"A1","status":"paid"}}',
        "csv": "id,status\nA1,paid\nA2,refunded\n",
    }

    # Parse every sample up front, in declaration order, before asserting.
    parsed = {
        source_type: parse_document_content(
            source_type=source_type,
            content_text=raw_text,
        )
        for source_type, raw_text in samples.items()
    }

    # (source type, substring, whether the substring must appear)
    expectations = (
        ("markdown", "Title", True),
        ("markdown", "docs", True),
        ("html", "hidden", False),
        ("html", "Hello world", True),
        ("json", "order.id: A1", True),
        ("csv", "row 2: id: A2; status: refunded", True),
    )
    for source_type, needle, should_contain in expectations:
        assert (needle in parsed[source_type].content_text) is should_contain


def test_rerank_score_prefers_title_and_phrase_matches() -> None:
    """Compare ``rerank_score`` for a matching and a non-matching chunk.

    The same query is scored against a chunk/title pair that contains the
    query words and against a pair that does not. The first score must exceed
    both the second score and 0.5.
    """
    query = "refund policy"

    matching_score = rerank_score(
        query=query,
        chunk_text="The refund policy allows refunds within seven days.",
        document_title="Refund Policy",
    )
    unrelated_score = rerank_score(
        query=query,
        chunk_text="Shipping times are usually three days.",
        document_title="Shipping",
    )

    assert matching_score > unrelated_score
    assert matching_score > 0.5