| 12345678910111213141516171819202122232425262728293031323334353637383940414243 |
from __future__ import annotations

import sys
from pathlib import Path

# Parent of the directory that contains this file.
REPO_ROOT = Path(__file__).resolve().parents[1]

# Forget every already-imported ``app`` module (the package itself and any
# ``app.*`` submodule) so the import at the bottom of this section is resolved
# again against the sys.path entries added below.
# NOTE(review): presumably guards against a different ``app`` package having
# been imported earlier in the same test session -- confirm.
for module_name in tuple(sys.modules):
    if module_name.partition(".")[0] == "app":
        del sys.modules[module_name]

# Each entry is inserted at index 0, so the one inserted last
# (services/knowledge-service) ends up first on sys.path.
for path in (
    REPO_ROOT.joinpath("libs", "core-shared", "src"),
    REPO_ROOT.joinpath("services", "knowledge-service"),
):
    sys.path.insert(0, str(path))

# Deliberately placed after the sys.path setup above.
from app.application.document_parsers import parse_document_content
def test_parse_markdown_html_json_csv_documents() -> None:
    """Parse one markdown, HTML, JSON and CSV sample and check the text output.

    Every sample is passed to ``parse_document_content`` with its source type,
    and the ``content_text`` of each result is checked for the substrings the
    assertions below expect.
    """
    # Insertion order is the parse order: markdown, html, json, csv.
    samples = {
        "markdown": "# Title\n\nUse [docs](https://example.com) and `code`.",
        "html": "<h1>Title</h1><script>hidden()</script><p>Hello <b>world</b></p>",
        "json": '{"order":{"id":"A1","status":"paid"}}',
        "csv": "id,status\nA1,paid\nA2,refunded\n",
    }
    parsed = {
        kind: parse_document_content(source_type=kind, content_text=raw)
        for kind, raw in samples.items()
    }

    # Markdown: heading text and link label are both expected in the output.
    markdown_text = parsed["markdown"].content_text
    assert "Title" in markdown_text
    assert "docs" in markdown_text

    # HTML: the <script> body must be absent; the inline <b> must not split
    # "Hello world".
    html_text = parsed["html"].content_text
    assert "hidden" not in html_text
    assert "Hello world" in html_text

    # JSON: the nested key is expected as a dotted "path: value" entry.
    assert "order.id: A1" in parsed["json"].content_text

    # CSV: the second data row is expected as "row N: column: value; ...".
    assert "row 2: id: A2; status: refunded" in parsed["csv"].content_text
|