fix(parsers): enrich EIS procurement identities from details
This commit is contained in:
@@ -56,6 +56,18 @@ EIS_CARD_SOURCES = {
|
|||||||
"unfair_suppliers",
|
"unfair_suppliers",
|
||||||
}
|
}
|
||||||
ZAKUPKI_BASE_URL = "https://zakupki.gov.ru"
|
ZAKUPKI_BASE_URL = "https://zakupki.gov.ru"
|
||||||
|
ZAKUPKI_DETAIL_MAX_FILE_SIZE_BYTES = 2 * 1024 * 1024
|
||||||
|
ZAKUPKI_DETAIL_URL_MARKERS = (
|
||||||
|
"/view/common-info",
|
||||||
|
"/card/common-info",
|
||||||
|
"/contractCard/common-info",
|
||||||
|
"/notice/",
|
||||||
|
"/orderclause/card/",
|
||||||
|
"/contract/contractCard/",
|
||||||
|
)
|
||||||
|
INN_RE = re.compile(r"(?<!\d)(?:\d{12}|\d{10})(?!\d)")
|
||||||
|
KPP_RE = re.compile(r"(?<!\d)\d{9}(?!\d)")
|
||||||
|
OGRN_RE = re.compile(r"(?<!\d)(?:\d{15}|\d{13})(?!\d)")
|
||||||
GISP_PRODUCTS_DOWNLOAD_LABEL = "Скачать только действующие"
|
GISP_PRODUCTS_DOWNLOAD_LABEL = "Скачать только действующие"
|
||||||
GISP_PRODUCTS_API_PATH = "/pp719v2/pub/prod/b/"
|
GISP_PRODUCTS_API_PATH = "/pp719v2/pub/prod/b/"
|
||||||
GISP_PRODUCTS_PAGE_SIZE = 100
|
GISP_PRODUCTS_PAGE_SIZE = 100
|
||||||
@@ -85,6 +97,8 @@ class StructuredDataClient:
|
|||||||
max_zip_uncompressed_bytes: int = MAX_ZIP_UNCOMPRESSED_BYTES
|
max_zip_uncompressed_bytes: int = MAX_ZIP_UNCOMPRESSED_BYTES
|
||||||
max_records: int = MAX_RECORDS
|
max_records: int = MAX_RECORDS
|
||||||
verify_ssl: bool = True
|
verify_ssl: bool = True
|
||||||
|
enrich_eis_detail_pages: bool = True
|
||||||
|
max_eis_detail_pages: int = 200
|
||||||
_http_client: BaseHTTPClient | None = field(default=None, repr=False)
|
_http_client: BaseHTTPClient | None = field(default=None, repr=False)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
@@ -476,7 +490,7 @@ class StructuredDataClient:
|
|||||||
"Вступление в силу",
|
"Вступление в силу",
|
||||||
"Способы закупки",
|
"Способы закупки",
|
||||||
}
|
}
|
||||||
for card in soup.select(".search-registry-entry-block"):
|
for card_index, card in enumerate(soup.select(".search-registry-entry-block")):
|
||||||
lines = self._extract_text_lines(card)
|
lines = self._extract_text_lines(card)
|
||||||
if not lines:
|
if not lines:
|
||||||
continue
|
continue
|
||||||
@@ -486,16 +500,146 @@ class StructuredDataClient:
|
|||||||
self._fill_label_pairs(row, lines, labels)
|
self._fill_label_pairs(row, lines, labels)
|
||||||
self._fill_zakupki_status(row, lines, number_index, labels)
|
self._fill_zakupki_status(row, lines, number_index, labels)
|
||||||
|
|
||||||
link = card.find("a", href=True)
|
detail_url = self._select_zakupki_card_url(card)
|
||||||
if link:
|
if detail_url:
|
||||||
row["url"] = urljoin(ZAKUPKI_BASE_URL, link["href"])
|
row["url"] = detail_url
|
||||||
|
row["detail_url"] = detail_url
|
||||||
if lines[0].endswith("-ФЗ"):
|
if lines[0].endswith("-ФЗ"):
|
||||||
row["law"] = lines[0]
|
row["law"] = lines[0]
|
||||||
|
if card_index < self.max_eis_detail_pages:
|
||||||
|
self._enrich_zakupki_card_from_detail(row)
|
||||||
|
|
||||||
result.append(row)
|
result.append(row)
|
||||||
self._validate_record_count(len(result))
|
self._validate_record_count(len(result))
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
def _select_zakupki_card_url(self, card: Any) -> str:
    """Pick the most relevant detail-page URL from an EIS search card.

    Every usable ``<a href>`` inside ``card`` is resolved against
    ``ZAKUPKI_BASE_URL``. The resolved links are then checked against
    ``ZAKUPKI_DETAIL_URL_MARKERS`` in marker order, so an earlier marker
    wins over a later one regardless of where the link sits in the card.

    Args:
        card: Parsed card element exposing ``find_all("a", href=True)``.

    Returns:
        The absolute URL of the first link containing the highest-priority
        marker; otherwise the first usable link; otherwise ``""``.
    """
    # Pseudo-links that never lead to a fetchable page. URL schemes are
    # case-insensitive, so the prefix check runs on the lowercased href;
    # otherwise "JavaScript:void(0)" would slip through and could be
    # returned as the fallback URL.
    skipped_prefixes = ("#", "javascript:", "mailto:", "tel:")

    urls = []
    for link in card.find_all("a", href=True):
        href = str(link.get("href") or "").strip()
        if not href or href.lower().startswith(skipped_prefixes):
            continue
        urls.append(urljoin(ZAKUPKI_BASE_URL, href))

    # Marker-major iteration: marker priority beats link order.
    for marker in ZAKUPKI_DETAIL_URL_MARKERS:
        for url in urls:
            if marker in url:
                return url
    return urls[0] if urls else ""
|
||||||
|
|
||||||
|
def _enrich_zakupki_card_from_detail(self, row: dict[str, Any]) -> None:
    """Fill missing customer identifiers in ``row`` from its EIS detail page.

    Does nothing when enrichment is disabled on the client, when ``row``
    already has both ``inn`` and ``ogrn``, or when the row carries neither
    ``detail_url`` nor ``url``. The page is downloaded with a size cap of
    ``ZAKUPKI_DETAIL_MAX_FILE_SIZE_BYTES``; an ``HTTPClientError`` is logged
    at info level and the row is left untouched.

    Args:
        row: Parsed card record, mutated in place. Only empty/missing
            ``inn``, ``kpp``, ``ogrn`` and ``organisation_name`` entries
            are written; existing non-empty values are never overwritten.
    """
    already_identified = bool(row.get("inn") and row.get("ogrn"))
    if not self.enrich_eis_detail_pages or already_identified:
        return

    page_url = str(row.get("detail_url") or row.get("url") or "").strip()
    if not page_url:
        return

    try:
        page_bytes = self.http_client.download_file(
            page_url,
            max_size_bytes=ZAKUPKI_DETAIL_MAX_FILE_SIZE_BYTES,
        )
    except HTTPClientError as exc:
        # Enrichment is best-effort: the card record itself is still valid.
        logger.info("EIS detail enrichment skipped for %s: %s", page_url, exc)
        return

    identity = self._extract_zakupki_detail_identity(page_bytes)
    for key in ("inn", "kpp", "ogrn", "organisation_name"):
        if row.get(key):
            continue
        extracted = identity.get(key)
        if extracted:
            row[key] = extracted
|
||||||
|
|
||||||
|
def _extract_zakupki_detail_identity(self, content: bytes) -> dict[str, str]:
    """Extract organisation identifiers from an EIS detail-page HTML body.

    The page is decoded, parsed with ``html.parser`` and reduced to text
    lines via ``_extract_text_lines``; each identifier is then looked up
    next to its label.

    Args:
        content: Raw bytes of the detail page.

    Returns:
        A dict with keys ``inn``, ``kpp``, ``ogrn`` and
        ``organisation_name``; a value is ``""`` when nothing was found.
    """
    page_lines = self._extract_text_lines(
        BeautifulSoup(self._decode(content), "html.parser")
    )

    # (result key, label variants, pattern the value must match)
    identifier_specs = (
        ("inn", ("инн", "инн заказчика", "инн / кпп", "инн/кпп"), INN_RE),
        ("kpp", ("кпп", "инн / кпп", "инн/кпп"), KPP_RE),
        ("ogrn", ("огрн", "огрн заказчика"), OGRN_RE),
    )
    identity = {
        key: self._find_labeled_identifier(
            page_lines,
            labels=label_variants,
            pattern=value_pattern,
        )
        for key, label_variants, value_pattern in identifier_specs
    }
    identity["organisation_name"] = self._find_labeled_text(
        page_lines,
        labels=(
            "полное наименование",
            "наименование заказчика",
            "заказчик",
        ),
    )
    return identity
|
||||||
|
|
||||||
|
def _find_labeled_identifier(
    self,
    lines: list[str],
    *,
    labels: tuple[str, ...],
    pattern: re.Pattern[str],
) -> str:
    """Find an identifier located on or shortly after a labelled line.

    For every line that contains one of ``labels``, the label line itself
    and up to three lines after it are searched with ``pattern``.

    Args:
        lines: Text lines of the detail page, in document order.
        labels: Label variants that mark where the identifier is expected.
        pattern: Compiled regex the identifier must match.

    Returns:
        The first matched text, or ``""`` when no labelled line yields one.
    """
    for position, text in enumerate(lines):
        if not self._line_contains_label(text, labels):
            continue
        # Window = the label line plus the three lines that follow it.
        window = lines[position : position + 4]
        hits = (pattern.search(candidate) for candidate in window)
        first_hit = next((hit for hit in hits if hit), None)
        if first_hit is not None:
            return first_hit.group(0)
    return ""
|
||||||
|
|
||||||
|
def _find_labeled_text(
    self,
    lines: list[str],
    *,
    labels: tuple[str, ...],
) -> str:
    """Find the free-text value on the line right after a labelled line.

    Args:
        lines: Text lines of the detail page, in document order.
        labels: Label variants that mark where the value is expected.

    Returns:
        The stripped line following the first label whose successor is
        non-empty and is not itself a known label; ``""`` if none qualifies.
    """
    # Pair each line with its successor; the last line has none to offer.
    for label_line, following_line in zip(lines, lines[1:]):
        if not self._line_contains_label(label_line, labels):
            continue
        candidate = following_line.strip()
        if not candidate or self._looks_like_detail_label(candidate):
            continue
        return candidate
    return ""
|
||||||
|
|
||||||
|
@staticmethod
def _line_contains_label(line: str, labels: tuple[str, ...]) -> bool:
    """Return True when ``line`` contains any of ``labels`` as a substring.

    Both the line and every label are passed through
    ``_normalize_detail_label`` before comparison. Normalizing only the
    line would make labels that contain punctuation or uppercase letters
    (for example ``"инн / кпп"``) impossible to match, because the
    normalized line never contains ``"/"``.

    Args:
        line: A single text line from the detail page.
        labels: Label variants to look for.

    Returns:
        True if at least one non-empty normalized label occurs in the
        normalized line.
    """
    normalize = StructuredDataClient._normalize_detail_label
    normalized_line = normalize(line)
    for label in labels:
        normalized_label = normalize(label)
        # A label that normalizes to "" would match every line; ignore it.
        if normalized_label and normalized_label in normalized_line:
            return True
    return False
|
||||||
|
|
||||||
|
@staticmethod
def _looks_like_detail_label(value: str) -> bool:
    """Tell whether ``value`` is itself a known field label.

    The value is normalized with ``_normalize_detail_label`` and compared
    for exact equality against a fixed list of label captions.
    """
    known_labels = (
        "инн",
        "кпп",
        "инн кпп",
        "огрн",
        "полное наименование",
        "наименование заказчика",
        "заказчик",
    )
    return StructuredDataClient._normalize_detail_label(value) in known_labels
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _normalize_detail_label(value: str) -> str:
|
||||||
|
normalized = value.lower().replace("ё", "е").replace("/", " ")
|
||||||
|
normalized = re.sub(r"[^0-9a-zа-я]+", " ", normalized)
|
||||||
|
return re.sub(r"\s+", " ", normalized).strip()
|
||||||
|
|
||||||
def _parse_html_table(self, table: Any) -> list[dict]:
|
def _parse_html_table(self, table: Any) -> list[dict]:
|
||||||
"""Распарсить HTML-таблицу с th или строкой-заголовком в td."""
|
"""Распарсить HTML-таблицу с th или строкой-заголовком в td."""
|
||||||
rows = table.find_all("tr")
|
rows = table.find_all("tr")
|
||||||
|
|||||||
106
tests/apps/parsers/test_structured_data_client.py
Normal file
106
tests/apps/parsers/test_structured_data_client.py
Normal file
@@ -0,0 +1,106 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from apps.parsers.clients.base import HTTPClientError
|
||||||
|
from apps.parsers.clients.common.structured import StructuredDataClient
|
||||||
|
from django.test import SimpleTestCase
|
||||||
|
|
||||||
|
|
||||||
|
class _FakeHTTPClient:
|
||||||
|
def __init__(self, responses: dict[str, bytes | Exception]) -> None:
|
||||||
|
self.responses = responses
|
||||||
|
self.downloaded_urls: list[str] = []
|
||||||
|
|
||||||
|
def download_file(self, endpoint: str, **_kwargs) -> bytes:
|
||||||
|
self.downloaded_urls.append(endpoint)
|
||||||
|
response = self.responses[endpoint]
|
||||||
|
if isinstance(response, Exception):
|
||||||
|
raise response
|
||||||
|
return response
|
||||||
|
|
||||||
|
|
||||||
|
class StructuredDataClientEisCardEnrichmentTest(SimpleTestCase):
    """Tests for enriching EIS search cards with data from their detail pages."""

    def test_procurement_card_enriches_customer_identity_from_detail_page(self):
        """INN, KPP and OGRN missing from the card are taken from the detail page."""
        detail_url = (
            "https://zakupki.gov.ru/epz/order/notice/ea20/view/"
            "common-info.html?regNumber=0338100002026000022"
        )
        client = StructuredDataClient(source="procurements_44fz")
        # Serve the detail page from memory instead of hitting the network.
        client._http_client = _FakeHTTPClient(
            {
                detail_url: """
                <html>
                <body>
                <h2>Сведения о заказчике</h2>
                <div>Полное наименование</div>
                <div>ФЕДЕРАЛЬНОЕ ГБУ НАУКИ</div>
                <div>ИНН / КПП</div>
                <div>4101020011 / 410101001</div>
                <div>ОГРН</div>
                <div>1024101023456</div>
                </body>
                </html>
                """.encode()
            }
        )

        # The search-result card itself carries no INN/KPP/OGRN.
        records = client.fetch_records(
            content=f"""
            <html>
            <body>
            <div class="search-registry-entry-block">
            <a href="{detail_url}">№ 0338100002026000022</a>
            <div>Работа комиссии</div>
            <div>Объект закупки</div>
            <div>Поставка оборудования</div>
            <div>Заказчик</div>
            <div>ФЕДЕРАЛЬНОЕ ГБУ НАУКИ</div>
            <div>Начальная цена</div>
            <div>1 000,00 ₽</div>
            <div>Размещено</div>
            <div>19.05.2026</div>
            </div>
            </body>
            </html>
            """.encode(),
            file_name="results.html",
        )

        self.assertEqual(len(records), 1)
        record = records[0]
        self.assertEqual(record.inn, "4101020011")
        self.assertEqual(record.ogrn, "1024101023456")
        self.assertEqual(record.payload["kpp"], "410101001")
        self.assertEqual(record.payload["detail_url"], detail_url)
        # Exactly one detail page must have been requested.
        self.assertEqual(client._http_client.downloaded_urls, [detail_url])

    def test_procurement_card_keeps_record_when_detail_page_is_unavailable(self):
        """A failed detail download must not drop the card record."""
        detail_url = (
            "https://zakupki.gov.ru/epz/order/notice/ea20/view/"
            "common-info.html?regNumber=0338100002026000022"
        )
        client = StructuredDataClient(source="procurements_44fz")
        # The fake raises HTTPClientError for the detail URL.
        client._http_client = _FakeHTTPClient(
            {detail_url: HTTPClientError("detail unavailable", url=detail_url)}
        )

        records = client.fetch_records(
            content=f"""
            <html>
            <body>
            <div class="search-registry-entry-block">
            <a href="{detail_url}">№ 0338100002026000022</a>
            <div>Объект закупки</div>
            <div>Поставка оборудования</div>
            <div>Заказчик</div>
            <div>ФЕДЕРАЛЬНОЕ ГБУ НАУКИ</div>
            </div>
            </body>
            </html>
            """.encode(),
            file_name="results.html",
        )

        self.assertEqual(len(records), 1)
        # Identifiers stay empty, while the name parsed from the card is kept.
        self.assertEqual(records[0].inn, "")
        self.assertEqual(records[0].ogrn, "")
        self.assertEqual(records[0].organisation_name, "ФЕДЕРАЛЬНОЕ ГБУ НАУКИ")
|
||||||
Reference in New Issue
Block a user