fix(parsers): enrich EIS procurement identities from details

This commit is contained in:
2026-05-19 10:28:14 +02:00
parent 4ca2fa25d5
commit 1c7c7238be
2 changed files with 254 additions and 4 deletions

View File

@@ -56,6 +56,18 @@ EIS_CARD_SOURCES = {
"unfair_suppliers",
}
# Base origin against which relative links found in EIS cards are resolved.
ZAKUPKI_BASE_URL = "https://zakupki.gov.ru"
# Size cap (2 MiB) passed as ``max_size_bytes`` when a detail page is downloaded.
ZAKUPKI_DETAIL_MAX_FILE_SIZE_BYTES = 2 * 1024 * 1024
# URL substrings that identify a detail page. Order is significant: the
# card-URL selector tries the markers in this order and returns the first
# card link that contains one of them.
ZAKUPKI_DETAIL_URL_MARKERS = (
"/view/common-info",
"/card/common-info",
"/contractCard/common-info",
"/notice/",
"/orderclause/card/",
"/contract/contractCard/",
)
# The lookarounds in the three patterns below reject matches that are part of
# a longer digit run, so e.g. a 10-digit number is never reported as a
# 9-digit one.
# INN pattern: a standalone run of exactly 12 or exactly 10 digits.
INN_RE = re.compile(r"(?<!\d)(?:\d{12}|\d{10})(?!\d)")
# KPP pattern: a standalone run of exactly 9 digits.
KPP_RE = re.compile(r"(?<!\d)\d{9}(?!\d)")
# OGRN pattern: a standalone run of exactly 15 or exactly 13 digits.
OGRN_RE = re.compile(r"(?<!\d)(?:\d{15}|\d{13})(?!\d)")
GISP_PRODUCTS_DOWNLOAD_LABEL = "Скачать только действующие"
GISP_PRODUCTS_API_PATH = "/pp719v2/pub/prod/b/"
GISP_PRODUCTS_PAGE_SIZE = 100
@@ -85,6 +97,8 @@ class StructuredDataClient:
max_zip_uncompressed_bytes: int = MAX_ZIP_UNCOMPRESSED_BYTES
max_records: int = MAX_RECORDS
verify_ssl: bool = True
enrich_eis_detail_pages: bool = True
max_eis_detail_pages: int = 200
_http_client: BaseHTTPClient | None = field(default=None, repr=False)
@property
@@ -476,7 +490,7 @@ class StructuredDataClient:
"Вступление в силу",
"Способы закупки",
}
for card in soup.select(".search-registry-entry-block"):
for card_index, card in enumerate(soup.select(".search-registry-entry-block")):
lines = self._extract_text_lines(card)
if not lines:
continue
@@ -486,16 +500,146 @@ class StructuredDataClient:
self._fill_label_pairs(row, lines, labels)
self._fill_zakupki_status(row, lines, number_index, labels)
link = card.find("a", href=True)
if link:
row["url"] = urljoin(ZAKUPKI_BASE_URL, link["href"])
detail_url = self._select_zakupki_card_url(card)
if detail_url:
row["url"] = detail_url
row["detail_url"] = detail_url
if lines[0].endswith("-ФЗ"):
row["law"] = lines[0]
if card_index < self.max_eis_detail_pages:
self._enrich_zakupki_card_from_detail(row)
result.append(row)
self._validate_record_count(len(result))
return result
def _select_zakupki_card_url(self, card: Any) -> str:
    """Pick the detail-page URL out of an EIS card.

    Every ``<a href>`` inside ``card`` is considered. Empty hrefs and
    ``#`` / ``javascript:`` pseudo-links are dropped, and the rest are
    resolved against ``ZAKUPKI_BASE_URL``.

    Returns:
        The first URL containing one of ``ZAKUPKI_DETAIL_URL_MARKERS``
        (markers are tried in their declared order, so an earlier marker
        wins over an earlier link). If no marker matches, the first usable
        link of the card. An empty string if the card has no usable links.
    """
    stripped_hrefs = (
        str(anchor.get("href") or "").strip()
        for anchor in card.find_all("a", href=True)
    )
    candidates = [
        urljoin(ZAKUPKI_BASE_URL, target)
        for target in stripped_hrefs
        if target and not target.startswith(("#", "javascript:"))
    ]
    if not candidates:
        return ""
    # Marker-major iteration keeps the marker priority of the constant.
    prioritized = (
        candidate
        for marker in ZAKUPKI_DETAIL_URL_MARKERS
        for candidate in candidates
        if marker in candidate
    )
    return next(prioritized, candidates[0])
def _enrich_zakupki_card_from_detail(self, row: dict[str, Any]) -> None:
    """Fill in missing INN/KPP/OGRN and organisation name from the detail page.

    The row is mutated in place. Nothing happens when enrichment is
    disabled via ``enrich_eis_detail_pages``, when the row already has
    both ``inn`` and ``ogrn``, or when the row carries neither a
    ``detail_url`` nor a ``url``.

    A failed download (``HTTPClientError``) is logged at INFO level and
    leaves the row untouched. Values already present in the row are never
    overwritten.
    """
    if not self.enrich_eis_detail_pages or (row.get("inn") and row.get("ogrn")):
        return

    target_url = str(row.get("detail_url") or row.get("url") or "").strip()
    if target_url == "":
        return

    try:
        payload = self.http_client.download_file(
            target_url,
            max_size_bytes=ZAKUPKI_DETAIL_MAX_FILE_SIZE_BYTES,
        )
    except HTTPClientError as exc:
        logger.info("EIS detail enrichment skipped for %s: %s", target_url, exc)
        return

    extracted = self._extract_zakupki_detail_identity(payload)
    # Only fill gaps: keep whatever the search-result card already provided.
    additions = {
        key: extracted.get(key)
        for key in ("inn", "kpp", "ogrn", "organisation_name")
        if extracted.get(key) and not row.get(key)
    }
    row.update(additions)
def _extract_zakupki_detail_identity(self, content: bytes) -> dict[str, str]:
    """Extract organisation requisites from the HTML of an EIS detail page.

    The page is decoded, parsed with ``html.parser`` and flattened into
    text lines. Each requisite is then looked up by its caption.

    Returns:
        A dict with the keys ``inn``, ``kpp``, ``ogrn`` and
        ``organisation_name``. A requisite that was not found is an
        empty string.
    """
    markup = self._decode(content)
    text_lines = self._extract_text_lines(BeautifulSoup(markup, "html.parser"))

    # (result key, captions to look for, pattern of the value)
    identifier_specs = (
        ("inn", ("инн", "инн заказчика", "инн / кпп", "инн/кпп"), INN_RE),
        ("kpp", ("кпп", "инн / кпп", "инн/кпп"), KPP_RE),
        ("ogrn", ("огрн", "огрн заказчика"), OGRN_RE),
    )
    identity = {
        key: self._find_labeled_identifier(
            text_lines,
            labels=captions,
            pattern=value_pattern,
        )
        for key, captions, value_pattern in identifier_specs
    }
    identity["organisation_name"] = self._find_labeled_text(
        text_lines,
        labels=(
            "полное наименование",
            "наименование заказчика",
            "заказчик",
        ),
    )
    return identity
def _find_labeled_identifier(
    self,
    lines: list[str],
    *,
    labels: tuple[str, ...],
    pattern: re.Pattern[str],
) -> str:
    """Find an identifier located next to its caption on a detail page.

    For every line that contains one of ``labels``, the line itself and
    up to three lines after it are searched with ``pattern``.

    Returns:
        The text of the first match, or an empty string when no labelled
        line yields a match.
    """
    for position, text in enumerate(lines):
        if not self._line_contains_label(text, labels):
            continue
        # The caption line plus the three lines that follow it.
        window = lines[position : position + 4]
        for candidate in window:
            if (hit := pattern.search(candidate)) is not None:
                return hit.group(0)
    return ""
def _find_labeled_text(
    self,
    lines: list[str],
    *,
    labels: tuple[str, ...],
) -> str:
    """Find the text value that follows a caption on a detail page.

    A line containing one of ``labels`` is treated as a caption and the
    line right after it as the candidate value. A candidate is rejected
    when it is blank or is itself a known caption; the search then
    continues with the next caption line.

    Returns:
        The first accepted value with surrounding whitespace removed,
        or an empty string when nothing qualifies.
    """
    # Pair each line with its successor; the last line has none.
    for caption, following in zip(lines, lines[1:]):
        if not self._line_contains_label(caption, labels):
            continue
        candidate = following.strip()
        if not candidate:
            continue
        if self._looks_like_detail_label(candidate):
            continue
        return candidate
    return ""
@staticmethod
def _line_contains_label(line: str, labels: tuple[str, ...]) -> bool:
    """Tell whether ``line`` contains any of ``labels`` after normalization.

    Both the line and every label go through ``_normalize_detail_label``
    before the substring test. Normalizing only the line meant that a
    label written with ``/``, punctuation or upper-case letters (for
    example ``"инн / кпп"``) could never match, because those characters
    do not survive normalization of the line.

    Args:
        line: Raw text line taken from the page.
        labels: Caption variants to look for, in any spelling.

    Returns:
        True if at least one non-empty normalized label is a substring of
        the normalized line.
    """
    normalize = StructuredDataClient._normalize_detail_label
    normalized_line = normalize(line)
    for label in labels:
        normalized_label = normalize(label)
        # A label that normalizes to "" would be a substring of every line.
        if normalized_label and normalized_label in normalized_line:
            return True
    return False
@staticmethod
def _looks_like_detail_label(value: str) -> bool:
    """Tell whether ``value`` is itself a known caption rather than data.

    The value is normalized with ``_normalize_detail_label`` and must be
    exactly equal to one of the known captions; a partial match does not
    count.
    """
    known_captions = frozenset(
        (
            "инн",
            "кпп",
            "инн кпп",
            "огрн",
            "полное наименование",
            "наименование заказчика",
            "заказчик",
        )
    )
    candidate = StructuredDataClient._normalize_detail_label(value)
    return candidate in known_captions
@staticmethod
def _normalize_detail_label(value: str) -> str:
normalized = value.lower().replace("ё", "е").replace("/", " ")
normalized = re.sub(r"[^0-9a-zа-я]+", " ", normalized)
return re.sub(r"\s+", " ", normalized).strip()
def _parse_html_table(self, table: Any) -> list[dict]:
"""Распарсить HTML-таблицу с th или строкой-заголовком в td."""
rows = table.find_all("tr")