fix(parsers): unblock source status and heavy product import
All checks were successful
CI/CD Pipeline / Quality Gate (push) Successful in 19s
CI/CD Pipeline / Build and Push Images (push) Successful in 6s
CI/CD Pipeline / Internal Notify (push) Successful in 1s
CI/CD Pipeline / Deploy Dev in Dokploy (push) Successful in 1s

This commit is contained in:
2026-04-28 22:51:52 +02:00
parent e921f4b204
commit 01387ae13b
6 changed files with 182 additions and 25 deletions

View File

@@ -93,6 +93,12 @@ def _normalize_header(value) -> str:
return re.sub(r"[^a-zа-я0-9]+", "", text)
# Built once at import time: every field name maps to the set of its aliases
# after they have been passed through _normalize_header, so header matching
# can be a plain set-membership test on an already-normalized cell value.
NORMALIZED_HEADER_ALIASES = {
    canonical_field: set(map(_normalize_header, raw_aliases))
    for canonical_field, raw_aliases in HEADER_ALIASES.items()
}
@dataclass
class IndustrialProductsClient:
"""
@@ -205,7 +211,12 @@ class IndustrialProductsClient:
logger.info("Downloading Excel file: %s", file_url)
content = self.http_client.download_file(file_url)
workbook = load_workbook(filename=BytesIO(content), data_only=True)
workbook = load_workbook(
filename=BytesIO(content),
data_only=True,
read_only=True,
)
try:
worksheet = workbook.active
header_row_index, header_map = self._detect_headers(worksheet)
@@ -219,20 +230,17 @@ class IndustrialProductsClient:
if product:
products.append(product)
workbook.close()
return products
finally:
workbook.close()
def _detect_headers(self, worksheet) -> tuple[int, dict[str, int]]:
best_map: dict[str, int] = {}
for row_index in range(1, min(worksheet.max_row, 10) + 1):
row = next(
worksheet.iter_rows(
min_row=row_index,
max_row=row_index,
values_only=True,
)
)
for row_index, row in enumerate(
worksheet.iter_rows(max_row=10, values_only=True),
start=1,
):
header_map = self._build_header_map(row)
if len(header_map) > len(best_map):
best_map = header_map
@@ -254,8 +262,8 @@ class IndustrialProductsClient:
if not normalized:
continue
for field_name, aliases in HEADER_ALIASES.items():
if normalized in {_normalize_header(alias) for alias in aliases}:
for field_name, aliases in NORMALIZED_HEADER_ALIASES.items():
if normalized in aliases:
header_map[field_name] = index
break

View File

@@ -13,6 +13,7 @@ from apps.core.services import BackgroundJobService
from apps.parsers.models import (
FinancialReport,
FinancialReportLine,
GenericParserRecord,
IndustrialCertificateRecord,
IndustrialProductRecord,
InspectionRecord,
@@ -100,6 +101,9 @@ SOURCE_CARD_DEFINITIONS: tuple[SourceCardDefinition, ...] = (
task_names=(
"apps.parsers.tasks.parse_procurements",
"apps.parsers.tasks.sync_procurements",
"apps.parsers.tasks.parse_procurements_44fz",
"apps.parsers.tasks.parse_procurements_223fz",
"apps.parsers.tasks.parse_contracts",
),
source_items=(
SourceItemDefinition(
@@ -108,6 +112,24 @@ SOURCE_CARD_DEFINITIONS: tuple[SourceCardDefinition, ...] = (
description=("Закупки и связанные данные из ЕИС по 44-ФЗ и 223-ФЗ."),
parser_source=ParserLoadLog.Source.PROCUREMENTS,
),
SourceItemDefinition(
code="procurements_44fz",
title="Закупки 44-ФЗ",
description="Извещения и закупочные процедуры 44-ФЗ.",
parser_source=ParserLoadLog.Source.PROCUREMENTS_44FZ,
),
SourceItemDefinition(
code="procurements_223fz",
title="Закупки 223-ФЗ",
description="Извещения и закупочные процедуры 223-ФЗ.",
parser_source=ParserLoadLog.Source.PROCUREMENTS_223FZ,
),
SourceItemDefinition(
code="contracts",
title="Контракты ЕИС",
description="Государственные и корпоративные контракты.",
parser_source=ParserLoadLog.Source.CONTRACTS,
),
),
refresh_params=(
RefreshParamDefinition(
@@ -203,6 +225,11 @@ SOURCE_CARD_BY_PARSER_SOURCE = {
for source_item in definition.source_items
if source_item.parser_source
}
# Source-item codes whose rows are looked up in GenericParserRecord, mapped to
# the ParserLoadLog.Source value used as the `source` filter for those rows.
GENERIC_RECORD_SOURCES_BY_ITEM_CODE = {
"procurements_44fz": ParserLoadLog.Source.PROCUREMENTS_44FZ,
"procurements_223fz": ParserLoadLog.Source.PROCUREMENTS_223FZ,
"contracts": ParserLoadLog.Source.CONTRACTS,
}
class SourceCardService:
@@ -576,6 +603,9 @@ class SourceCardService:
@classmethod
def _get_source_records_count(cls, item_code: str) -> int:
generic_source = GENERIC_RECORD_SOURCES_BY_ITEM_CODE.get(item_code)
if generic_source:
return GenericParserRecord.objects.filter(source=generic_source).count()
if item_code == "fns_reports":
return FinancialReportLine.objects.count()
if item_code == "industrial":
@@ -592,6 +622,15 @@ class SourceCardService:
@classmethod
def _get_source_organizations_count(cls, item_code: str) -> int:
generic_source = GENERIC_RECORD_SOURCES_BY_ITEM_CODE.get(item_code)
if generic_source:
return (
GenericParserRecord.objects.filter(source=generic_source)
.exclude(inn="")
.values("inn")
.distinct()
.count()
)
if item_code == "fns_reports":
return (
FinancialReport.objects.exclude(ogrn="")
@@ -638,6 +677,11 @@ class SourceCardService:
@classmethod
def _get_source_data_timestamp(cls, item_code: str):
generic_source = GENERIC_RECORD_SOURCES_BY_ITEM_CODE.get(item_code)
if generic_source:
return GenericParserRecord.objects.filter(source=generic_source).aggregate(
last_updated=Max("updated_at")
)["last_updated"]
if item_code == "fns_reports":
return FinancialReport.objects.aggregate(last_updated=Max("updated_at"))[
"last_updated"
@@ -670,6 +714,24 @@ class SourceCardService:
definition: SourceCardDefinition,
source_items: list[dict[str, Any]],
) -> int:
if definition.slug == "public-procurements":
legacy_inns = (
ProcurementRecord.objects.exclude(customer_inn="")
.order_by()
.values_list("customer_inn", flat=True)
.distinct()
)
generic_inns = (
GenericParserRecord.objects.filter(
source__in=GENERIC_RECORD_SOURCES_BY_ITEM_CODE.values()
)
.exclude(inn="")
.order_by()
.values_list("inn", flat=True)
.distinct()
)
return legacy_inns.union(generic_inns).count()
if definition.slug != "manufacturers-and-products":
return sum(item["organizations_count"] for item in source_items)

View File

@@ -59,6 +59,8 @@ FEDRESURS_CHECKO_FALLBACK_LIMIT = 100
PARSER_STALE_LOAD_MAX_AGE_MINUTES = 90
PARSER_SOFT_TIME_LIMIT_SECONDS = 15 * 60
PARSER_TIME_LIMIT_SECONDS = 20 * 60
# Task time limits, in seconds, passed to the parse_industrial_products task:
# a 45-minute soft limit and a 60-minute hard limit.
INDUSTRIAL_PRODUCTS_SOFT_TIME_LIMIT_SECONDS = 45 * 60
INDUSTRIAL_PRODUCTS_TIME_LIMIT_SECONDS = 60 * 60
class ParserSourceSkipped(Exception):
@@ -792,8 +794,8 @@ def parse_manufactures(
@shared_task(
bind=True,
soft_time_limit=PARSER_SOFT_TIME_LIMIT_SECONDS,
time_limit=PARSER_TIME_LIMIT_SECONDS,
soft_time_limit=INDUSTRIAL_PRODUCTS_SOFT_TIME_LIMIT_SECONDS,
time_limit=INDUSTRIAL_PRODUCTS_TIME_LIMIT_SECONDS,
)
def parse_industrial_products(
self,

View File

@@ -2,6 +2,7 @@
from __future__ import annotations
from unittest.mock import patch
from urllib.parse import urlparse
import requests
@@ -22,6 +23,7 @@ from apps.parsers.clients.minpromtorg.schemas import (
from apps.parsers.clients.proverki import ProverkiClient
from apps.parsers.clients.proverki.schemas import Inspection
from django.test import TestCase, tag
from openpyxl import load_workbook as openpyxl_load_workbook
from requests.adapters import BaseAdapter
from tests.utils import Response, TestHTTPServer
@@ -513,6 +515,40 @@ class IndustrialProductsClientTest(TestCase):
self.assertEqual(products, [])
def test_fetch_products_uses_read_only_workbook(self):
    """fetch_products must open the downloaded workbook with read_only=True.

    The real openpyxl loader is wrapped (not replaced), so the file is
    still parsed and the returned product count can be checked as well.
    """
    excel_bytes, rows = build_minpromtorg_products_excel(count=2)
    file_name = "industrial_products_20260428.xlsx"
    file_path = f"/files/{file_name}"

    with TestHTTPServer() as server:
        preview_payload = {
            "data": [
                {
                    "name": IndustrialProductsClient().query,
                    "files": [{"name": file_name, "url": file_path}],
                }
            ]
        }
        server.add_json("/api/kss-document-preview", preview_payload)
        server.add_bytes(file_path, excel_bytes)

        client = IndustrialProductsClient(
            host=_host_from_base_url(server.base_url),
            scheme="http",
            http_adapter=server.adapter,
        )

        # Spy on the loader as imported by the products module.
        loader_patch = patch(
            "apps.parsers.clients.minpromtorg.products.load_workbook",
            wraps=openpyxl_load_workbook,
        )
        with loader_patch as load_workbook_mock:
            products = client.fetch_products()

    loader_kwargs = load_workbook_mock.call_args.kwargs
    self.assertEqual(len(products), len(rows))
    self.assertTrue(loader_kwargs["read_only"])
def test_get_latest_file_url_falls_back_to_excel_file(self):
client = IndustrialProductsClient()
files = [

View File

@@ -5,6 +5,7 @@ from types import SimpleNamespace
from unittest.mock import MagicMock, patch
from apps.core.models import BackgroundJob, JobStatus
from apps.parsers.models import GenericParserRecord, ParserLoadLog
from apps.parsers.source_cards import (
SourceCardDefinition,
SourceCardService,
@@ -296,6 +297,42 @@ class SourceCardServiceUnitTest(SimpleTestCase):
@override_settings(PARSER_STALE_LOAD_MAX_AGE_MINUTES=90)
class SourceCardServiceDatabaseTest(TestCase):
def test_public_procurements_counts_generic_eis_sources(self):
    """Generic EIS records are reflected in the public-procurements card.

    Two generic records from different sources share one INN, so the card
    is expected to report two records but only one organization.
    """
    batch = 1
    fixtures = (
        (ParserLoadLog.Source.PROCUREMENTS_44FZ, "notice-1", "Закупка 44-ФЗ"),
        (ParserLoadLog.Source.CONTRACTS, "contract-1", "Контракт ЕИС"),
    )

    # Records first, then the matching successful load logs.
    for source, external_id, title in fixtures:
        GenericParserRecord.objects.create(
            source=source,
            load_batch=batch,
            external_id=external_id,
            inn="7701234567",
            title=title,
            payload={"number": external_id},
        )
    for source, _external_id, _title in fixtures:
        ParserLoadLog.objects.create(
            source=source,
            batch_id=batch,
            records_count=1,
            status=ParserLoadLog.Status.SUCCESS,
        )

    card = SourceCardService.get_card("public-procurements")

    self.assertEqual(card["status"], "success")
    self.assertEqual(card["records_count"], 2)
    self.assertEqual(card["organizations_count"], 1)
def test_get_active_tasks_ignores_old_jobs_even_when_updated_recently(self):
job = BackgroundJob.objects.create(
task_id="old-source-task",

View File

@@ -40,6 +40,8 @@ from apps.parsers.models import (
)
from apps.parsers.services import ParserLoadLogService
from apps.parsers.tasks import (
INDUSTRIAL_PRODUCTS_SOFT_TIME_LIMIT_SECONDS,
INDUSTRIAL_PRODUCTS_TIME_LIMIT_SECONDS,
_move_to_dir,
_process_fns_file_sync,
_remove_lock,
@@ -589,6 +591,16 @@ class SyncProcurementsTaskTestCase(TestCase):
class MinpromtorgTasksTestCase(TestCase):
"""Tests for Minpromtorg tasks."""
def test_parse_industrial_products_has_extended_time_limits(self):
    """The task is registered with the dedicated industrial-products limits."""
    task = parse_industrial_products

    self.assertEqual(
        task.soft_time_limit, INDUSTRIAL_PRODUCTS_SOFT_TIME_LIMIT_SECONDS
    )
    self.assertEqual(task.time_limit, INDUSTRIAL_PRODUCTS_TIME_LIMIT_SECONDS)
def _add_minpromtorg_routes(self, server: TestHTTPServer):
certificates_bytes, cert_rows = build_minpromtorg_certificates_excel(count=2)
manufacturers_bytes, manuf_rows = build_minpromtorg_manufacturers_excel(count=2)