From 1c7c7238befd2bf5df1cff631c13e8f04b7bbfe9 Mon Sep 17 00:00:00 2001 From: Aleksandr Meshchriakov Date: Tue, 19 May 2026 10:28:14 +0200 Subject: [PATCH] fix(parsers): enrich EIS procurement identities from details --- src/apps/parsers/clients/common/structured.py | 152 +++++++++++++++++- .../parsers/test_structured_data_client.py | 106 ++++++++++++ 2 files changed, 254 insertions(+), 4 deletions(-) create mode 100644 tests/apps/parsers/test_structured_data_client.py diff --git a/src/apps/parsers/clients/common/structured.py b/src/apps/parsers/clients/common/structured.py index 5f597b4..f53f4ea 100644 --- a/src/apps/parsers/clients/common/structured.py +++ b/src/apps/parsers/clients/common/structured.py @@ -56,6 +56,18 @@ EIS_CARD_SOURCES = { "unfair_suppliers", } ZAKUPKI_BASE_URL = "https://zakupki.gov.ru" +ZAKUPKI_DETAIL_MAX_FILE_SIZE_BYTES = 2 * 1024 * 1024 +ZAKUPKI_DETAIL_URL_MARKERS = ( + "/view/common-info", + "/card/common-info", + "/contractCard/common-info", + "/notice/", + "/orderclause/card/", + "/contract/contractCard/", +) +INN_RE = re.compile(r"(? str: + """Выбрать URL detail page из карточки ЕИС.""" + urls = [] + for link in card.find_all("a", href=True): + href = str(link.get("href") or "").strip() + if not href or href.startswith(("#", "javascript:")): + continue + urls.append(urljoin(ZAKUPKI_BASE_URL, href)) + + for marker in ZAKUPKI_DETAIL_URL_MARKERS: + for url in urls: + if marker in url: + return url + return urls[0] if urls else "" + + def _enrich_zakupki_card_from_detail(self, row: dict[str, Any]) -> None: + """Дозагрузить ИНН/КПП/ОГРН заказчика со страницы карточки ЕИС.""" + if not self.enrich_eis_detail_pages: + return + if row.get("inn") and row.get("ogrn"): + return + + detail_url = str(row.get("detail_url") or row.get("url") or "").strip() + if not detail_url: + return + + try: + content = self.http_client.download_file( + detail_url, + max_size_bytes=ZAKUPKI_DETAIL_MAX_FILE_SIZE_BYTES, + ) + except HTTPClientError as exc: + logger.info("EIS detail enrichment skipped for %s: %s", detail_url, exc) + return + + identity = self._extract_zakupki_detail_identity(content) + for field_name in ("inn", "kpp", "ogrn", "organisation_name"): + value = identity.get(field_name) + if value and not row.get(field_name): + row[field_name] = value + + def _extract_zakupki_detail_identity(self, content: bytes) -> dict[str, str]: + """Извлечь реквизиты организации из HTML detail page ЕИС.""" + soup = BeautifulSoup(self._decode(content), "html.parser") + lines = self._extract_text_lines(soup) + return { + "inn": self._find_labeled_identifier( + lines, + labels=("инн", "инн заказчика", "инн / кпп", "инн/кпп"), + pattern=INN_RE, + ), + "kpp": self._find_labeled_identifier( + lines, + labels=("кпп", "инн / кпп", "инн/кпп"), + pattern=KPP_RE, + ), + "ogrn": self._find_labeled_identifier( + lines, + labels=("огрн", "огрн заказчика"), + pattern=OGRN_RE, + ), + "organisation_name": self._find_labeled_text( + lines, + labels=( + "полное наименование", + "наименование заказчика", + "заказчик", + ), + ), + } + + def _find_labeled_identifier( + self, + lines: list[str], + *, + labels: tuple[str, ...], + pattern: re.Pattern[str], + ) -> str: + """Найти идентификатор рядом с подписью на detail page.""" + for index, line in enumerate(lines): + if not self._line_contains_label(line, labels): + continue + for candidate in (line, *lines[index + 1 : index + 4]): + match = pattern.search(candidate) + if match: + return match.group(0) + return "" + + def _find_labeled_text( + self, + lines: list[str], + *, + labels: tuple[str, ...], + ) -> str: + """Найти текстовое значение рядом с подписью на detail page.""" + for index, line in enumerate(lines[:-1]): + if not self._line_contains_label(line, labels): + continue + value = lines[index + 1].strip() + if value and not self._looks_like_detail_label(value): + return value + return "" + + @staticmethod + def _line_contains_label(line: str, labels: tuple[str, ...]) -> bool: + normalized_line = StructuredDataClient._normalize_detail_label(line) + return any(label in normalized_line for label in labels) + + @staticmethod + def _looks_like_detail_label(value: str) -> bool: + normalized = StructuredDataClient._normalize_detail_label(value) + return normalized in { + "инн", + "кпп", + "инн кпп", + "огрн", + "полное наименование", + "наименование заказчика", + "заказчик", + } + + @staticmethod + def _normalize_detail_label(value: str) -> str: + normalized = value.lower().replace("ё", "е").replace("/", " ") + normalized = re.sub(r"[^0-9a-zа-я]+", " ", normalized) + return re.sub(r"\s+", " ", normalized).strip() + def _parse_html_table(self, table: Any) -> list[dict]: """Распарсить HTML-таблицу с th или строкой-заголовком в td.""" rows = table.find_all("tr") diff --git a/tests/apps/parsers/test_structured_data_client.py b/tests/apps/parsers/test_structured_data_client.py new file mode 100644 index 0000000..4f506de --- /dev/null +++ b/tests/apps/parsers/test_structured_data_client.py @@ -0,0 +1,106 @@ +from __future__ import annotations + +from apps.parsers.clients.base import HTTPClientError +from apps.parsers.clients.common.structured import StructuredDataClient +from django.test import SimpleTestCase + + +class _FakeHTTPClient: + def __init__(self, responses: dict[str, bytes | Exception]) -> None: + self.responses = responses + self.downloaded_urls: list[str] = [] + + def download_file(self, endpoint: str, **_kwargs) -> bytes: + self.downloaded_urls.append(endpoint) + response = self.responses[endpoint] + if isinstance(response, Exception): + raise response + return response + + +class StructuredDataClientEisCardEnrichmentTest(SimpleTestCase): + def test_procurement_card_enriches_customer_identity_from_detail_page(self): + detail_url = ( + "https://zakupki.gov.ru/epz/order/notice/ea20/view/" + "common-info.html?regNumber=0338100002026000022" + ) + client = StructuredDataClient(source="procurements_44fz") + client._http_client = _FakeHTTPClient( + { + detail_url: """ + + +

Сведения о заказчике

+
Полное наименование
+
ФЕДЕРАЛЬНОЕ ГБУ НАУКИ
+
ИНН / КПП
+
4101020011 / 410101001
+
ОГРН
+
1024101023456
+ + + """.encode() + } + ) + + records = client.fetch_records( + content=f""" + + +
+ № 0338100002026000022 +
Работа комиссии
+
Объект закупки
+
Поставка оборудования
+
Заказчик
+
ФЕДЕРАЛЬНОЕ ГБУ НАУКИ
+
Начальная цена
+
1 000,00 ₽
+
Размещено
+
19.05.2026
+
+ + + """.encode(), + file_name="results.html", + ) + + self.assertEqual(len(records), 1) + record = records[0] + self.assertEqual(record.inn, "4101020011") + self.assertEqual(record.ogrn, "1024101023456") + self.assertEqual(record.payload["kpp"], "410101001") + self.assertEqual(record.payload["detail_url"], detail_url) + self.assertEqual(client._http_client.downloaded_urls, [detail_url]) + + def test_procurement_card_keeps_record_when_detail_page_is_unavailable(self): + detail_url = ( + "https://zakupki.gov.ru/epz/order/notice/ea20/view/" + "common-info.html?regNumber=0338100002026000022" + ) + client = StructuredDataClient(source="procurements_44fz") + client._http_client = _FakeHTTPClient( + {detail_url: HTTPClientError("detail unavailable", url=detail_url)} + ) + + records = client.fetch_records( + content=f""" + + +
+ № 0338100002026000022 +
Объект закупки
+
Поставка оборудования
+
Заказчик
+
ФЕДЕРАЛЬНОЕ ГБУ НАУКИ
+
+ + + """.encode(), + file_name="results.html", + ) + + self.assertEqual(len(records), 1) + self.assertEqual(records[0].inn, "") + self.assertEqual(records[0].ogrn, "") + self.assertEqual(records[0].organisation_name, "ФЕДЕРАЛЬНОЕ ГБУ НАУКИ")