"""Unit tests for ProverkiClient using local HTTP server (no mocks)."""

# NOTE(review): this copy of the module appears to have been passed through a
# markup stripper. Most XML/HTML string literals below are empty ("" / f'' /
# b""), and two test bodies are visibly truncated (both are marked where they
# occur). The code is kept exactly as found; restore the literals and the
# missing tests from version control before relying on these tests.

from __future__ import annotations

import asyncio
import sys
import tempfile
import types
from asyncio import events as asyncio_events
from pathlib import Path
from xml.etree import ElementPath as element_path
from xml.etree import ElementTree as ET

from apps.parsers.clients.base import HTTPClientError
from apps.parsers.clients.proverki import ProverkiClient
from apps.parsers.clients.proverki.client import (
    OPEN_DATA_PORTAL_URL,
    ProverkiClientError,
)
from django.test import SimpleTestCase
from tests.utils import TestHTTPServer
from tests.utils.fixtures import build_zip, fake

# Cyrillic tag / attribute names, written as escapes so the file stays ASCII
# here: "КНМ", "ИНН" and "ОГРН" respectively.
_CYRILLIC_KNM = "\u041a\u041d\u041c"
_CYRILLIC_INN = "\u0418\u041d\u041d"
_CYRILLIC_OGRN = "\u041e\u0413\u0420\u041d"


def _digits(length: int) -> str:
    """Return a string of ``length`` random decimal digits drawn from ``fake``."""
    return "".join(str(fake.random_int(0, 9)) for _ in range(length))


def _attrs_string(attrs: dict[str, str]) -> str:
    """Render ``attrs`` as space-separated ``key="value"`` pairs.

    NOTE(review): values are interpolated verbatim, with no XML escaping. A
    value containing ``"``, ``&`` or ``<`` would yield malformed XML.
    """
    return " ".join(f'{key}="{value}"' for key, value in attrs.items())


def _inspection_attrs() -> dict[str, str]:
    """Build a randomized attribute mapping for a single inspection element."""
    return {
        "ERPID": _digits(12),
        "INN": _digits(10),
        "OGRN": _digits(13),
        "ORG_NAME": fake.company(),
        "FRGU_ORG_NAME": fake.company(),
        "ITYPE_NAME": fake.word(),
        "ICARRYOUT_TYPE_NAME": fake.word(),
        "START_DATE": str(fake.date()),
        "END_DATE": str(fake.date()),
        "STATUS": fake.word(),
        "FZ_NAME": fake.sentence(nb_words=3),
        "RESULT": fake.sentence(nb_words=2),
    }


def _xml_with_tag(tag: str, attrs: dict[str, str]) -> bytes:
    """Return UTF-8 bytes containing one self-closing ``tag`` element with ``attrs``."""
    body = f"<{tag} {_attrs_string(attrs)} />"
    # NOTE(review): the leading literal is empty and nothing wraps ``body``;
    # presumably an XML declaration / root element was stripped -- confirm.
    xml = "" f"{body}"
    return xml.encode("utf-8")


def _xml_with_namespace(tag: str, attrs: dict[str, str]) -> bytes:
    """Return UTF-8 bytes for the namespaced-element fixture.

    NOTE(review): as found, ``body`` is an empty f-string and ``tag``,
    ``attrs`` and ``ns`` are all unused, so this returns ``b""``. The element
    markup looks stripped -- restore from version control.
    """
    ns = "http://example.com/ns"
    body = f""
    xml = (
        ""
        f"{body}"
    )
    return xml.encode("utf-8")


def _xml_with_container(tag: str, attrs: dict[str, str]) -> bytes:
    """Return UTF-8 bytes for the container fixture holding one ``tag`` element."""
    body = f"<{tag} {_attrs_string(attrs)} />"
    # NOTE(review): as found this is identical to ``_xml_with_tag``; the
    # container markup around ``body`` appears to have been stripped.
    xml = (
        ""
        f"{body}"
    )
    return xml.encode("utf-8")


def _xml_with_children() -> bytes:
    """Return UTF-8 bytes for the child-element fixture.

    NOTE(review): every literal below is empty and ``inn``, ``ogrn`` and
    ``registration`` are never used, so this returns ``b""`` as found. The
    f-strings presumably embedded those values in child elements -- confirm.
    """
    inn = _digits(10)
    ogrn = _digits(13)
    registration = _digits(12)
    xml = (
        ""
        ""
        f''
        f''
        f''
        ""
        ""
    )
    return xml.encode("utf-8")


def _xml_with_cyrillic_tag() -> bytes:
    """Return UTF-8 bytes with one element whose tag and two attributes are Cyrillic."""
    attrs = {
        _CYRILLIC_INN: _digits(10),
        _CYRILLIC_OGRN: _digits(13),
        "I_NUMBER": _digits(12),
    }
    body = f"<{_CYRILLIC_KNM} {_attrs_string(attrs)} />"
    # NOTE(review): leading literal is empty; wrapper markup looks stripped.
    xml = "" f"{body}"
    return xml.encode("utf-8")


def _client_for(server: TestHTTPServer) -> ProverkiClient:
    """Create a ProverkiClient that uses ``server.adapter`` and has Playwright disabled."""
    return ProverkiClient(
        host="testserver",
        scheme="http",
        http_adapter=server.adapter,
        use_playwright=False,
    )


class ProverkiDiscoverFilesTest(SimpleTestCase):
    """Tests for ``ProverkiClient._discover_data_files``."""

    def test_discover_data_files_month(self):
        client = ProverkiClient()
        plans = client._discover_data_files(year=2025, month=2, is_federal_law_248=True)
        self.assertEqual(len(plans), 1)
        self.assertEqual(plans[0].month, 2)
        self.assertIn("fz248", plans[0].file_name)

    def test_discover_data_files_year_only(self):
        client = ProverkiClient()
        plans = client._discover_data_files(
            year=2024, month=None, is_federal_law_248=False
        )
        self.assertEqual(len(plans), 1)
        self.assertIsNone(plans[0].month)
        self.assertIn("fz294", plans[0].file_name)

    def test_discover_data_files_without_year(self):
        client = ProverkiClient()
        self.assertEqual(client._discover_data_files(year=None), [])


class ProverkiDownloadParseTest(SimpleTestCase):
    """Tests for downloading and parsing via ``fetch_inspections`` / ``_download_and_parse``.

    Responses are served by ``TestHTTPServer``; the Playwright paths are
    exercised by overriding the relevant client methods in local subclasses.
    """

    def test_download_and_parse_zip(self):
        xml = _xml_with_tag("INSPECTION", _inspection_attrs())
        archive = build_zip([("data.xml", xml)])
        with TestHTTPServer() as server:
            server.add_bytes(
                "/opendata/data.zip", archive, content_type="application/zip"
            )
            client = _client_for(server)
            inspections = client.fetch_inspections(
                file_url=f"{server.base_url}/opendata/data.zip"
            )
        self.assertEqual(len(inspections), 1)

    def test_download_and_parse_xml(self):
        xml = _xml_with_tag("INSPECTION", _inspection_attrs())
        with TestHTTPServer() as server:
            server.add_bytes("/opendata/data.xml", xml, content_type="application/xml")
            client = _client_for(server)
            inspections = client.fetch_inspections(
                file_url=f"{server.base_url}/opendata/data.xml"
            )
        self.assertEqual(len(inspections), 1)

    def test_download_and_parse_portal_without_playwright(self):
        xml = _xml_with_tag("INSPECTION", _inspection_attrs())
        archive = build_zip([("data.xml", xml)])
        with TestHTTPServer() as server:
            server.add_bytes(
                "/portal/public-open-data/check/2025/1",
                archive,
                content_type="application/zip",
            )
            client = _client_for(server)
            inspections = client._download_and_parse(
                f"{server.base_url}/portal/public-open-data/check/2025/1",
                file_format="portal",
            )
        self.assertEqual(len(inspections), 1)

    def test_download_and_parse_portal_without_playwright_with_progress(self):
        xml = _xml_with_tag("INSPECTION", _inspection_attrs())
        archive = build_zip([("data.xml", xml)])
        progress: list[tuple[int, str]] = []

        def on_progress(value: int, message: str) -> None:
            progress.append((value, message))

        with TestHTTPServer() as server:
            server.add_bytes(
                "/portal/public-open-data/check/2025/2",
                archive,
                content_type="application/zip",
            )
            client = _client_for(server)
            inspections = client._download_and_parse(
                f"{server.base_url}/portal/public-open-data/check/2025/2",
                progress_callback=on_progress,
                file_format="portal",
            )
        self.assertEqual(len(inspections), 1)
        # The callback must have been invoked at least once.
        self.assertTrue(progress)

    def test_download_and_parse_html_without_playwright_fails(self):
        # NOTE(review): the payload was presumably an HTML document before
        # the markup was stripped; only the text "blocked" remains.
        html = b"blocked"
        with TestHTTPServer() as server:
            server.add_bytes(
                "/portal/public-open-data/check/2025/1",
                html,
                content_type="text/html",
            )
            client = _client_for(server)
            with self.assertRaises(ProverkiClientError):
                client._download_and_parse(
                    f"{server.base_url}/portal/public-open-data/check/2025/1",
                    file_format="portal",
                )

    def test_download_and_parse_html_without_playwright_non_portal(self):
        html = b"blocked"
        with TestHTTPServer() as server:
            server.add_bytes("/opendata/data.html", html, content_type="text/html")
            client = _client_for(server)
            with self.assertRaises(ProverkiClientError):
                client._download_and_parse(f"{server.base_url}/opendata/data.html")

    def test_download_and_parse_unknown_format(self):
        with TestHTTPServer() as server:
            server.add_bytes("/opendata/data.bin", b"not-xml-or-zip")
            client = _client_for(server)
            with self.assertRaises(ProverkiClientError):
                client.fetch_inspections(
                    file_url=f"{server.base_url}/opendata/data.bin"
                )

    def test_parse_zip_archive_without_xml_files(self):
        archive = build_zip([("readme.txt", b"no xml here")])
        client = ProverkiClient()
        inspections = client._parse_zip_archive(archive)
        self.assertEqual(inspections, [])

    def test_fetch_inspections_with_progress_callback(self):
        xml = _xml_with_tag("INSPECTION", _inspection_attrs())
        archive = build_zip([("data.xml", xml)])
        progress: list[tuple[int, str]] = []

        def on_progress(value: int, message: str) -> None:
            progress.append((value, message))

        with TestHTTPServer() as server:
            server.add_bytes(
                "/opendata/data.zip", archive, content_type="application/zip"
            )
            client = _client_for(server)
            inspections = client.fetch_inspections(
                file_url=f"{server.base_url}/opendata/data.zip",
                progress_callback=on_progress,
            )
        self.assertEqual(len(inspections), 1)
        self.assertTrue(progress)

    def test_fetch_inspections_http_error_bubbles(self):
        # A 500 response is expected to surface as HTTPClientError, not as
        # ProverkiClientError.
        with TestHTTPServer() as server:
            server.add_bytes("/opendata/data.zip", b"", status=500)
            client = _client_for(server)
            with self.assertRaises(HTTPClientError):
                client.fetch_inspections(
                    file_url=f"{server.base_url}/opendata/data.zip"
                )

    def test_fetch_inspection_plans(self):
        client = ProverkiClient()
        plans = client.fetch_inspection_plans(2025)
        self.assertEqual(len(plans), 1)
        self.assertIn("plan-2025", plans[0].file_name)

    def test_fetch_inspections_wraps_generic_error(self):
        # A ValueError raised by the download step must come out of
        # fetch_inspections as ProverkiClientError.
        class _FailClient(ProverkiClient):
            def _download_and_parse(self, *args, **kwargs):  # type: ignore[override]
                raise ValueError("boom")

        client = _FailClient()
        with self.assertRaises(ProverkiClientError):
            client.fetch_inspections(file_url="http://example.com/data.zip")

    def test_download_and_parse_portal_with_playwright_branch(self):
        xml = _xml_with_tag("INSPECTION", _inspection_attrs())
        archive = build_zip([("data.xml", xml)])
        progress = []

        # Stub out the browser-backed methods so no real Playwright is needed.
        class _PortalClient(ProverkiClient):
            def _download_from_portal(self, *args, **kwargs):  # type: ignore[override]
                return archive

            def _close_playwright(self):  # type: ignore[override]
                return None

        def on_progress(value: int, _message: str) -> None:
            progress.append(value)

        client = _PortalClient(use_playwright=True)
        inspections = client._download_and_parse(
            "http://portal.example.com",
            progress_callback=on_progress,
            file_format="portal",
        )
        self.assertEqual(len(inspections), 1)
        self.assertTrue(progress)

    def test_download_and_parse_portal_with_playwright_no_progress(self):
        xml = _xml_with_tag("INSPECTION", _inspection_attrs())
        archive = build_zip([("data.xml", xml)])

        class _PortalClient(ProverkiClient):
            def _download_from_portal(self, *args, **kwargs):  # type: ignore[override]
                return archive

            def _close_playwright(self):  # type: ignore[override]
                return None

        client = _PortalClient(use_playwright=True)
        inspections = client._download_and_parse(
            "http://portal.example.com", file_format="portal"
        )
        self.assertEqual(len(inspections), 1)

    def test_download_and_parse_html_switches_to_playwright(self):
        xml = _xml_with_tag("INSPECTION", _inspection_attrs())

        # The HTTP server answers with text/html; the stubbed Playwright
        # download then supplies the XML payload.
        class _HtmlClient(ProverkiClient):
            def _download_with_playwright(self, *args, **kwargs):  # type: ignore[override]
                return xml

            def _close_playwright(self):  # type: ignore[override]
                return None

        with TestHTTPServer() as server:
            server.add_bytes(
                "/data.html", b"blocked", content_type="text/html"
            )
            client = _HtmlClient(
                host="testserver",
                scheme="http",
                http_adapter=server.adapter,
                use_playwright=True,
            )
            inspections = client._download_and_parse(f"{server.base_url}/data.html")
        self.assertEqual(len(inspections), 1)

    def test_download_and_parse_html_switches_to_playwright_with_progress(self):
        xml = _xml_with_tag("INSPECTION", _inspection_attrs())
        progress: list[tuple[int, str]] = []

        class _HtmlClient(ProverkiClient):
            def _download_with_playwright(self, *args, **kwargs):  # type: ignore[override]
                return xml

            def _close_playwright(self):  # type: ignore[override]
                return None

        def on_progress(value: int, message: str) -> None:
            progress.append((value, message))

        with TestHTTPServer() as server:
            server.add_bytes(
                "/data.html", b"blocked", content_type="text/html"
            )
            client = _HtmlClient(
                host="testserver",
                scheme="http",
                http_adapter=server.adapter,
                use_playwright=True,
            )
            inspections = client._download_and_parse(
                f"{server.base_url}/data.html", progress_callback=on_progress
            )
        self.assertEqual(len(inspections), 1)
        self.assertTrue(progress)

    def test_fetch_inspections_with_plans_and_progress(self):
        xml = _xml_with_tag("INSPECTION", _inspection_attrs())
        archive = build_zip([("data.xml", xml)])
        progress: list[int] = []

        class _TestClient(ProverkiClient):
            def _discover_data_files(self, **_kwargs):  # type: ignore[override]
                from apps.parsers.clients.proverki.schemas import InspectionPlan

                # ``server`` is resolved when this method runs, i.e. inside
                # the ``with TestHTTPServer()`` block further down.
                return [
                    InspectionPlan(
                        year=2025,
                        month=1,
                        file_url=f"{server.base_url}/opendata/data.zip",
                        file_name="data.zip",
                        file_format="auto",
                    )
                ]

        def on_progress(value: int, _message: str) -> None:
            progress.append(value)

        with TestHTTPServer() as server:
            server.add_bytes(
                "/opendata/data.zip", archive, content_type="application/zip"
            )
            client = _TestClient(
                host="testserver",
                scheme="http",
                http_adapter=server.adapter,
                use_playwright=False,
            )
            inspections = client.fetch_inspections(
                year=2025, month=1, progress_callback=on_progress
            )
        self.assertEqual(len(inspections), 1)
        self.assertTrue(progress)


class ProverkiParseXMLTest(SimpleTestCase):
    """Tests for the XML parsing helpers of ``ProverkiClient``.

    NOTE(review): many inline XML literals in this class are empty as found
    (markup apparently stripped), so several tests cannot pass in this state.
    """

    def test_parse_xml_with_namespace(self):
        xml = _xml_with_namespace("INSPECTION", _inspection_attrs())
        client = ProverkiClient()
        inspections = client._parse_xml_content(xml)
        self.assertEqual(len(inspections), 1)

    def test_parse_xml_with_container(self):
        xml = _xml_with_container("inspection", _inspection_attrs())
        client = ProverkiClient()
        inspections = client._parse_xml_content(xml)
        self.assertEqual(len(inspections), 1)

    def test_parse_xml_with_children(self):
        client = ProverkiClient()
        inspections = client._parse_xml_content(_xml_with_children())
        self.assertEqual(len(inspections), 1)

    def test_parse_xml_with_cyrillic_tag(self):
        client = ProverkiClient()
        inspections = client._parse_xml_content(_xml_with_cyrillic_tag())
        self.assertEqual(len(inspections), 1)

    def test_parse_xml_streaming_threshold(self):
        xml = _xml_with_tag("INSPECTION", _inspection_attrs())
        client = ProverkiClient()
        # Set the threshold to 1 byte on this instance; presumably this
        # forces the streaming parser for any payload -- confirm in client.
        client.STREAMING_THRESHOLD_BYTES = 1
        inspections = client._parse_xml_content(xml)
        self.assertEqual(len(inspections), 1)

    def test_parse_xml_record_missing_fields_returns_none(self):
        # NOTE(review): the XML literal is empty as found; ET.fromstring("")
        # raises ParseError, so the original element markup was stripped.
        element = ET.fromstring("")  # noqa: S314
        client = ProverkiClient()
        self.assertIsNone(client._parse_xml_record(element))

    def test_parse_xml_record_partial_fields(self):
        # NOTE(review): empty f-string as found; the element (which must
        # carry an INN attribute, see the assertion) was stripped.
        element = ET.fromstring(f"")  # noqa: S314
        client = ProverkiClient()
        inspection = client._parse_xml_record(element)
        self.assertIsNotNone(inspection)
        self.assertEqual(inspection.inn, element.attrib["INN"])

    def test_parse_xml_container_records(self):
        # NOTE(review): both byte literals are empty as found.
        xml = (
            b""
            b""
        )
        client = ProverkiClient()
        inspections = client._parse_xml_content(xml)
        self.assertEqual(inspections, [])

    def test_parse_xml_content_decode_fallback(self):
        # _BadBytes.decode raises unless errors="replace", in which case it
        # returns ``xml_str``.
        xml_str = ""
        content = _BadBytes(b"\xff\xfe", xml_str=xml_str)
        client = ProverkiClient()
        inspections = client._parse_xml_content(content)
        self.assertEqual(inspections, [])

    def test_parse_xml_streaming_decode_fallback(self):
        # NOTE(review): literals are empty as found, yet one record is
        # expected below; the XML was stripped.
        xml_str = (
            ""
            f''
        )
        content = _BadBytes(b"\xff\xfe", xml_str=xml_str)
        client = ProverkiClient()
        inspections = client._parse_xml_streaming(content)
        self.assertEqual(len(inspections), 1)

    def test_parse_xml_streaming_parse_error_returns_partial(self):
        inn = _digits(10)
        # NOTE(review): TRUNCATED SOURCE. The first parenthesis below is
        # never closed and ``record`` is never defined, so this is a syntax
        # error as found. It looks like the end of this test and the start of
        # a separate 10000-record test were lost together with the stripped
        # markup. Kept verbatim; restore from version control.
        xml = (
            ""
            f''
        xml = (
            ""
            f"{record * 10000}"
        ).encode()
        client = ProverkiClient()
        inspections = client._parse_xml_streaming(xml)
        self.assertEqual(len(inspections), 10000)

    def test_parse_xml_streaming_skips_invalid_record(self):
        # NOTE(review): both byte literals are empty as found.
        xml = b"" b""
        client = ProverkiClient()
        inspections = client._parse_xml_streaming(xml)
        self.assertEqual(inspections, [])

    def test_parse_xml_tag_search_handles_error(self):
        xml = _xml_with_tag("inspection", _inspection_attrs())
        client = ProverkiClient()
        original_findall = element_path.findall

        # Make the ".//inspection" lookup raise while delegating every other
        # path to the real implementation.
        def _raising_findall(elem, path, namespaces=None):
            if path == ".//inspection":
                raise SyntaxError("boom")
            return original_findall(elem, path, namespaces)

        element_path.findall = _raising_findall
        try:
            inspections = client._parse_xml_content(xml)
        finally:
            # Always restore the module-level function for other tests.
            element_path.findall = original_findall
        self.assertEqual(len(inspections), 1)

    def test_parse_xml_record_namespace_nested_fields(self):
        # NOTE(review): every literal in ``xml`` is empty as found, and
        # ``ns``, ``inn`` and ``ogrn`` are unused; the f-strings presumably
        # embedded them along with ``inspection_type`` and ``status``.
        ns = "http://example.com/ns"
        inn = _digits(10)
        ogrn = _digits(13)
        inspection_type = fake.word()
        status = fake.word()
        xml = (
            ""
            f""
            f''
            f''
            f''
            f''
            ""
            ""
        ).encode()
        client = ProverkiClient()
        inspections = client._parse_xml_content(xml)
        self.assertEqual(len(inspections), 1)
        self.assertEqual(inspections[0].inspection_type, inspection_type)
        self.assertEqual(inspections[0].status, status)

    def test_parse_xml_record_namespace_text_child_fallback(self):
        # NOTE(review): only the bare ``{inn}`` interpolation survives; the
        # surrounding namespaced markup was stripped and ``ns`` is unused.
        ns = "http://example.com/ns"
        inn = _digits(10)
        xml = (
            ""
            f"{inn}"
        )
        element = ET.fromstring(xml)  # noqa: S314
        client = ProverkiClient()
        inspection = client._parse_xml_record(element)
        self.assertIsNotNone(inspection)
        self.assertEqual(inspection.inn, inn)

    def test_parse_xml_record_bad_element_returns_none(self):
        # "{" is an unterminated namespace prefix used as the tag name.
        element = ET.Element("{")
        client = ProverkiClient()
        self.assertIsNone(client._parse_xml_record(element))


class _BadBytes(bytes):
    """``bytes`` subclass whose ``decode`` only succeeds with ``errors="replace"``.

    Used by the decode-fallback tests: a strict decode raises
    ``UnicodeDecodeError``; a decode with ``errors="replace"`` returns the
    ``xml_str`` given at construction instead of decoding the raw bytes.
    """

    def __new__(cls, data: bytes, *, xml_str: str):
        obj = super().__new__(cls, data)
        obj._xml_str = xml_str
        return obj

    def decode(self, encoding="utf-8", errors="strict"):
        if errors == "replace":
            return self._xml_str
        raise UnicodeDecodeError(encoding, b"", 0, 1, "bad bytes")


class _FakeResponse:
    """Minimal response stand-in exposing only ``headers``."""

    def __init__(self, headers: dict[str, str] | None = None):
        self.headers = headers or {}


class _FakeDownload:
    """Download stand-in whose ``path()`` returns the stored path as ``str`` or ``None``."""

    def __init__(self, path: Path | None):
        self._path = path

    def path(self):
        if self._path is None:
            return None
        return str(self._path)


class _FakeDownloadContext:
    """Context manager stand-in exposing a ``_FakeDownload`` as ``value``."""

    def __init__(self, path: Path | None):
        self.value = _FakeDownload(path)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        # Returning False lets any exception from the ``with`` body propagate.
        return False


class _FakeLink:
    """Element stand-in that reports an ``href`` attribute and ignores clicks."""

    def __init__(self, href: str | None = None):
        self._href = href

    def get_attribute(self, name: str):
        if name == "href":
            return self._href
        return None

    def click(self):
        return None


class _FakePage:
    """Configurable page stand-in implementing the methods these tests call.

    Keyword arguments select what each query returns: ``content_type`` is
    reported by ``goto``; ``content`` by ``content()``; ``download_path`` by
    ``expect_download``; the ``*_link`` / ``download_tab`` values by
    ``query_selector``; ``download_links`` / ``portal_links`` by
    ``query_selector_all``. With ``raise_on_wait=True``,
    ``wait_for_selector`` raises ``RuntimeError``.
    """

    def __init__(
        self,
        *,
        content_type: str,
        content: str,
        download_path: Path | None,
        download_links: list[_FakeLink] | None = None,
        portal_links: list[_FakeLink] | None = None,
        zip_link: _FakeLink | None = None,
        xml_link: _FakeLink | None = None,
        download_tab: _FakeLink | None = None,
        raise_on_wait: bool = False,
    ):
        self._content_type = content_type
        self._content = content
        self._download_path = download_path
        self._download_links = download_links or []
        self._portal_links = portal_links or []
        self._zip_link = zip_link
        self._xml_link = xml_link
        self._download_tab = download_tab
        # URL of the most recent goto(); read by query_selector_all.
        self._last_url = ""
        self._raise_on_wait = raise_on_wait

    def goto(self, url, wait_until=None, timeout=None):
        self._last_url = url
        return _FakeResponse({"content-type": self._content_type})

    def content(self):
        return self._content

    def title(self):
        return "Page"

    def wait_for_selector(self, *args, **kwargs):
        if self._raise_on_wait:
            raise RuntimeError("timeout")
        return None

    def wait_for_timeout(self, *args, **kwargs):
        return None

    def query_selector(self, selector: str):
        # Dispatch on a substring of the selector; the first configured
        # match wins, in the order: download tab, .zip link, .xml link.
        if "Скачать" in selector and self._download_tab:
            return self._download_tab
        if ".zip" in selector and self._zip_link:
            return self._zip_link
        if ".xml" in selector and self._xml_link:
            return self._xml_link
        return None

    def query_selector_all(self, selector: str):
        # The selector is ignored; the result depends only on which URL was
        # visited last.
        if self._last_url == OPEN_DATA_PORTAL_URL:
            return self._portal_links
        return self._download_links

    def expect_download(self, timeout=None):
        return _FakeDownloadContext(self._download_path)


class _FakeContext:
    """Browser-context stand-in that hands out one fixed page and records ``close()``."""

    def __init__(self, page: _FakePage):
        self._page = page
        self.closed = False

    def new_page(self):
        return self._page

    def close(self):
        self.closed = True


class _FakeBrowser:
    """Browser stand-in that creates ``_FakeContext`` objects around one fixed page."""

    def __init__(self, page: _FakePage):
        self._page = page
        self.closed = False

    def new_context(self, **_kwargs):
        return _FakeContext(self._page)

    def close(self):
        self.closed = True


def _temp_file(content: bytes) -> Path:
    """Write ``content`` to a new named temporary file and return its path.

    NOTE(review): the file is created with ``delete=False`` and no cleanup is
    visible in this module, so each call leaves a file behind. The handle is
    also not closed if ``write`` raises.
    """
    tmp = tempfile.NamedTemporaryFile(delete=False)
    tmp.write(content)
    tmp.flush()
    tmp.close()
    return Path(tmp.name)


class ProverkiPlaywrightStubTest(SimpleTestCase):
    """Tests for the Playwright-backed client paths, using the fakes above."""

    # NOTE(review): presumably lifts SimpleTestCase's database-access
    # restriction; no test visible here touches a database -- confirm it is
    # still needed.
    databases = "__all__"

    def tearDown(self):
        super().tearDown()
        try:
            # Raises RuntimeError when no loop is marked as running.
            asyncio.get_running_loop()
        except RuntimeError:
            return
        # A loop is still marked as running for this thread: clear the marker
        # via asyncio's private helper.
        asyncio_events._set_running_loop(None)

    def test_download_with_playwright_direct_response(self):
        # NOTE(review): TRUNCATED SOURCE. The literals are empty,
        # ``assertIn`` is called with a single argument, and the statements
        # after it configure a portal page and call ``_download_from_portal``.
        # The rest of this test and an unknown number of following tests
        # appear to have been lost with the stripped markup. Kept verbatim;
        # restore from version control.
        download_path = _temp_file(b"")
        page = _FakePage(
            content_type="application/xml",
            content="",
            download_path=download_path,
        )
        client = ProverkiClient()
        client._browser = _FakeBrowser(page)
        result = client._download_with_playwright("http://example.com")
        self.assertIn(b"")
        page = _FakePage(
            content_type="text/html",
            content="content",
            download_path=download_path,
            zip_link=None,
            xml_link=_FakeLink(href="file.xml"),
            download_tab=None,
        )
        client = ProverkiClient()
        client._browser = _FakeBrowser(page)
        result = client._download_from_portal("http://portal.example.com")
        self.assertEqual(result, b"")

    def test_download_from_portal_xml_link_without_download_path(self):
        page = _FakePage(
            content_type="text/html",
            content="no data",
            download_path=None,
            zip_link=None,
            xml_link=_FakeLink(href="file.xml"),
            download_tab=None,
        )
        client = ProverkiClient()
        client._browser = _FakeBrowser(page)
        with self.assertRaises(ProverkiClientError):
            client._download_from_portal("http://portal.example.com")

    def test_download_from_portal_no_links_not_found(self):
        download_path = _temp_file(b"")
        page = _FakePage(
            content_type="text/html",
            content="Not found",
            download_path=download_path,
            zip_link=None,
            xml_link=None,
            download_tab=None,
            raise_on_wait=True,
        )
        client = ProverkiClient()
        client._browser = _FakeBrowser(page)
        with self.assertRaises(ProverkiClientError):
            client._download_from_portal("http://portal.example.com")

    def test_download_from_portal_no_links_generic_error(self):
        page = _FakePage(
            content_type="text/html",
            content="no links here",
            download_path=None,
            zip_link=None,
            xml_link=None,
            download_tab=None,
        )
        client = ProverkiClient()
        client._browser = _FakeBrowser(page)
        with self.assertRaises(ProverkiClientError):
            client._download_from_portal("http://portal.example.com")

    def test_close_playwright_handles_errors(self):
        # Both close() and stop() raise; _close_playwright must still reset
        # the client's references to None.
        class _BrokenBrowser:
            def close(self):
                raise RuntimeError("boom")

        class _BrokenPlaywright:
            def stop(self):
                raise RuntimeError("boom")

        client = ProverkiClient()
        client._browser = _BrokenBrowser()
        client._playwright = _BrokenPlaywright()
        client._close_playwright()
        self.assertIsNone(client._browser)
        self.assertIsNone(client._playwright)

    def test_get_browser_import_error(self):
        client = ProverkiClient()
        original_playwright = sys.modules.get("playwright")
        original_sync_api = sys.modules.get("playwright.sync_api")
        # Install empty stand-in modules: "playwright.sync_api" exists but
        # defines no ``sync_playwright`` name.
        fake_playwright = types.ModuleType("playwright")
        fake_playwright.__path__ = []
        fake_sync_api = types.ModuleType("playwright.sync_api")
        sys.modules["playwright"] = fake_playwright
        sys.modules["playwright.sync_api"] = fake_sync_api
        try:
            with self.assertRaises(ProverkiClientError):
                client._get_browser()
        finally:
            # Put sys.modules back exactly as it was, including "absent".
            if original_playwright is None:
                sys.modules.pop("playwright", None)
            else:
                sys.modules["playwright"] = original_playwright
            if original_sync_api is None:
                sys.modules.pop("playwright.sync_api", None)
            else:
                sys.modules["playwright.sync_api"] = original_sync_api

    def test_get_browser_success(self):
        class _FakeChromium:
            def launch(self, **_kwargs):
                return object()

        class _FakePlaywright:
            chromium = _FakeChromium()

        class _FakeSyncPlaywright:
            def start(self):
                return _FakePlaywright()

        # Stand-in for the ``playwright.sync_api`` module.
        fake_module = types.SimpleNamespace(
            sync_playwright=lambda: _FakeSyncPlaywright()
        )
        client = ProverkiClient()
        original_module = sys.modules.get("playwright.sync_api")
        sys.modules["playwright.sync_api"] = fake_module
        try:
            browser = client._get_browser()
        finally:
            if original_module is None:
                sys.modules.pop("playwright.sync_api", None)
            else:
                sys.modules["playwright.sync_api"] = original_module
        self.assertIsNotNone(browser)

    def test_get_browser_start_error(self):
        # start() raising RuntimeError must surface as ProverkiClientError.
        class _BrokenPlaywright:
            def start(self):
                raise RuntimeError("startup failed")

        fake_module = types.SimpleNamespace(sync_playwright=lambda: _BrokenPlaywright())
        client = ProverkiClient()
        original_module = sys.modules.get("playwright.sync_api")
        sys.modules["playwright.sync_api"] = fake_module
        try:
            with self.assertRaises(ProverkiClientError):
                client._get_browser()
        finally:
            if original_module is None:
                sys.modules.pop("playwright.sync_api", None)
            else:
                sys.modules["playwright.sync_api"] = original_module