diff --git a/src/apps/parsers/clients/minpromtorg/products.py b/src/apps/parsers/clients/minpromtorg/products.py index e352219..b9dd81b 100644 --- a/src/apps/parsers/clients/minpromtorg/products.py +++ b/src/apps/parsers/clients/minpromtorg/products.py @@ -93,6 +93,12 @@ def _normalize_header(value) -> str: return re.sub(r"[^a-zа-я0-9]+", "", text) +NORMALIZED_HEADER_ALIASES = { + field_name: {_normalize_header(alias) for alias in aliases} + for field_name, aliases in HEADER_ALIASES.items() +} + + @dataclass class IndustrialProductsClient: """ @@ -205,34 +211,36 @@ class IndustrialProductsClient: logger.info("Downloading Excel file: %s", file_url) content = self.http_client.download_file(file_url) - workbook = load_workbook(filename=BytesIO(content), data_only=True) - worksheet = workbook.active + workbook = load_workbook( + filename=BytesIO(content), + data_only=True, + read_only=True, + ) + try: + worksheet = workbook.active - header_row_index, header_map = self._detect_headers(worksheet) - products: list[IndustrialProduct] = [] + header_row_index, header_map = self._detect_headers(worksheet) + products: list[IndustrialProduct] = [] - for row in worksheet.iter_rows( - min_row=header_row_index + 1, - values_only=True, - ): - product = self._parse_row(row, header_map) - if product: - products.append(product) + for row in worksheet.iter_rows( + min_row=header_row_index + 1, + values_only=True, + ): + product = self._parse_row(row, header_map) + if product: + products.append(product) - workbook.close() - return products + return products + finally: + workbook.close() def _detect_headers(self, worksheet) -> tuple[int, dict[str, int]]: best_map: dict[str, int] = {} - for row_index in range(1, min(worksheet.max_row, 10) + 1): - row = next( - worksheet.iter_rows( - min_row=row_index, - max_row=row_index, - values_only=True, - ) - ) + for row_index, row in enumerate( + worksheet.iter_rows(max_row=10, values_only=True), + start=1, + ): header_map = self._build_header_map(row) if len(header_map) > len(best_map): best_map = header_map @@ -254,8 +262,8 @@ class IndustrialProductsClient: if not normalized: continue - for field_name, aliases in HEADER_ALIASES.items(): - if normalized in {_normalize_header(alias) for alias in aliases}: + for field_name, aliases in NORMALIZED_HEADER_ALIASES.items(): + if normalized in aliases: header_map[field_name] = index break diff --git a/src/apps/parsers/source_cards.py b/src/apps/parsers/source_cards.py index 0b721fb..06ed6dd 100644 --- a/src/apps/parsers/source_cards.py +++ b/src/apps/parsers/source_cards.py @@ -13,6 +13,7 @@ from apps.core.services import BackgroundJobService from apps.parsers.models import ( FinancialReport, FinancialReportLine, + GenericParserRecord, IndustrialCertificateRecord, IndustrialProductRecord, InspectionRecord, @@ -100,6 +101,9 @@ SOURCE_CARD_DEFINITIONS: tuple[SourceCardDefinition, ...] = ( task_names=( "apps.parsers.tasks.parse_procurements", "apps.parsers.tasks.sync_procurements", + "apps.parsers.tasks.parse_procurements_44fz", + "apps.parsers.tasks.parse_procurements_223fz", + "apps.parsers.tasks.parse_contracts", ), source_items=( SourceItemDefinition( @@ -108,6 +112,24 @@ SOURCE_CARD_DEFINITIONS: tuple[SourceCardDefinition, ...] = ( description=("Закупки и связанные данные из ЕИС по 44-ФЗ и 223-ФЗ."), parser_source=ParserLoadLog.Source.PROCUREMENTS, ), + SourceItemDefinition( + code="procurements_44fz", + title="Закупки 44-ФЗ", + description="Извещения и закупочные процедуры 44-ФЗ.", + parser_source=ParserLoadLog.Source.PROCUREMENTS_44FZ, + ), + SourceItemDefinition( + code="procurements_223fz", + title="Закупки 223-ФЗ", + description="Извещения и закупочные процедуры 223-ФЗ.", + parser_source=ParserLoadLog.Source.PROCUREMENTS_223FZ, + ), + SourceItemDefinition( + code="contracts", + title="Контракты ЕИС", + description="Государственные и корпоративные контракты.", + parser_source=ParserLoadLog.Source.CONTRACTS, + ), ), refresh_params=( RefreshParamDefinition( @@ -203,6 +225,11 @@ SOURCE_CARD_BY_PARSER_SOURCE = { for source_item in definition.source_items if source_item.parser_source } +GENERIC_RECORD_SOURCES_BY_ITEM_CODE = { + "procurements_44fz": ParserLoadLog.Source.PROCUREMENTS_44FZ, + "procurements_223fz": ParserLoadLog.Source.PROCUREMENTS_223FZ, + "contracts": ParserLoadLog.Source.CONTRACTS, +} class SourceCardService: @@ -576,6 +603,9 @@ class SourceCardService: @classmethod def _get_source_records_count(cls, item_code: str) -> int: + generic_source = GENERIC_RECORD_SOURCES_BY_ITEM_CODE.get(item_code) + if generic_source: + return GenericParserRecord.objects.filter(source=generic_source).count() if item_code == "fns_reports": return FinancialReportLine.objects.count() if item_code == "industrial": @@ -592,6 +622,15 @@ class SourceCardService: @classmethod def _get_source_organizations_count(cls, item_code: str) -> int: + generic_source = GENERIC_RECORD_SOURCES_BY_ITEM_CODE.get(item_code) + if generic_source: + return ( + GenericParserRecord.objects.filter(source=generic_source) + .exclude(inn="") + .values("inn") + .distinct() + .count() + ) if item_code == "fns_reports": return ( FinancialReport.objects.exclude(ogrn="") @@ -638,6 +677,11 @@ class SourceCardService: @classmethod def _get_source_data_timestamp(cls, item_code: str): + generic_source = GENERIC_RECORD_SOURCES_BY_ITEM_CODE.get(item_code) + if generic_source: + return GenericParserRecord.objects.filter(source=generic_source).aggregate( + last_updated=Max("updated_at") + )["last_updated"] if item_code == "fns_reports": return FinancialReport.objects.aggregate(last_updated=Max("updated_at"))[ "last_updated" @@ -670,6 +714,24 @@ class SourceCardService: definition: SourceCardDefinition, source_items: list[dict[str, Any]], ) -> int: + if definition.slug == "public-procurements": + legacy_inns = ( + ProcurementRecord.objects.exclude(customer_inn="") + .order_by() + .values_list("customer_inn", flat=True) + .distinct() + ) + generic_inns = ( + GenericParserRecord.objects.filter( + source__in=GENERIC_RECORD_SOURCES_BY_ITEM_CODE.values() + ) + .exclude(inn="") + .order_by() + .values_list("inn", flat=True) + .distinct() + ) + return legacy_inns.union(generic_inns).count() + if definition.slug != "manufacturers-and-products": return sum(item["organizations_count"] for item in source_items) diff --git a/src/apps/parsers/tasks.py b/src/apps/parsers/tasks.py index f62652f..20b99a1 100644 --- a/src/apps/parsers/tasks.py +++ b/src/apps/parsers/tasks.py @@ -59,6 +59,8 @@ FEDRESURS_CHECKO_FALLBACK_LIMIT = 100 PARSER_STALE_LOAD_MAX_AGE_MINUTES = 90 PARSER_SOFT_TIME_LIMIT_SECONDS = 15 * 60 PARSER_TIME_LIMIT_SECONDS = 20 * 60 +INDUSTRIAL_PRODUCTS_SOFT_TIME_LIMIT_SECONDS = 45 * 60 +INDUSTRIAL_PRODUCTS_TIME_LIMIT_SECONDS = 60 * 60 class ParserSourceSkipped(Exception): @@ -792,8 +794,8 @@ def parse_manufactures( @shared_task( bind=True, - soft_time_limit=PARSER_SOFT_TIME_LIMIT_SECONDS, - time_limit=PARSER_TIME_LIMIT_SECONDS, + soft_time_limit=INDUSTRIAL_PRODUCTS_SOFT_TIME_LIMIT_SECONDS, + time_limit=INDUSTRIAL_PRODUCTS_TIME_LIMIT_SECONDS, ) def parse_industrial_products( self, diff --git a/tests/apps/parsers/test_clients.py b/tests/apps/parsers/test_clients.py index c13e980..cc9110f 100644 --- a/tests/apps/parsers/test_clients.py +++ b/tests/apps/parsers/test_clients.py @@ -2,6 +2,7 @@ from __future__ import annotations +from unittest.mock import patch from urllib.parse import urlparse import requests @@ -22,6 +23,7 @@ from apps.parsers.clients.minpromtorg.schemas import ( from apps.parsers.clients.proverki import ProverkiClient from apps.parsers.clients.proverki.schemas import Inspection from django.test import TestCase, tag +from openpyxl import load_workbook as openpyxl_load_workbook from requests.adapters import BaseAdapter from tests.utils import Response, TestHTTPServer @@ -513,6 +515,40 @@ class IndustrialProductsClientTest(TestCase): self.assertEqual(products, []) + def test_fetch_products_uses_read_only_workbook(self): + excel_bytes, rows = build_minpromtorg_products_excel(count=2) + file_name = "industrial_products_20260428.xlsx" + + with TestHTTPServer() as server: + server.add_json( + "/api/kss-document-preview", + { + "data": [ + { + "name": IndustrialProductsClient().query, + "files": [ + {"name": file_name, "url": f"/files/{file_name}"} + ], + } + ] + }, + ) + server.add_bytes(f"/files/{file_name}", excel_bytes) + + client = IndustrialProductsClient( + host=_host_from_base_url(server.base_url), + scheme="http", + http_adapter=server.adapter, + ) + with patch( + "apps.parsers.clients.minpromtorg.products.load_workbook", + wraps=openpyxl_load_workbook, + ) as load_workbook_mock: + products = client.fetch_products() + + self.assertEqual(len(products), len(rows)) + self.assertTrue(load_workbook_mock.call_args.kwargs["read_only"]) + def test_get_latest_file_url_falls_back_to_excel_file(self): client = IndustrialProductsClient() files = [ diff --git a/tests/apps/parsers/test_source_cards_service.py b/tests/apps/parsers/test_source_cards_service.py index 3ba3bce..80c2ebc 100644 --- a/tests/apps/parsers/test_source_cards_service.py +++ b/tests/apps/parsers/test_source_cards_service.py @@ -5,6 +5,7 @@ from types import SimpleNamespace from unittest.mock import MagicMock, patch from apps.core.models import BackgroundJob, JobStatus +from apps.parsers.models import GenericParserRecord, ParserLoadLog from apps.parsers.source_cards import ( SourceCardDefinition, SourceCardService, @@ -296,6 +297,42 @@ class SourceCardServiceUnitTest(SimpleTestCase): @override_settings(PARSER_STALE_LOAD_MAX_AGE_MINUTES=90) class SourceCardServiceDatabaseTest(TestCase): + def test_public_procurements_counts_generic_eis_sources(self): + GenericParserRecord.objects.create( + source=ParserLoadLog.Source.PROCUREMENTS_44FZ, + load_batch=1, + external_id="notice-1", + inn="7701234567", + title="Закупка 44-ФЗ", + payload={"number": "notice-1"}, + ) + GenericParserRecord.objects.create( + source=ParserLoadLog.Source.CONTRACTS, + load_batch=1, + external_id="contract-1", + inn="7701234567", + title="Контракт ЕИС", + payload={"number": "contract-1"}, + ) + ParserLoadLog.objects.create( + source=ParserLoadLog.Source.PROCUREMENTS_44FZ, + batch_id=1, + records_count=1, + status=ParserLoadLog.Status.SUCCESS, + ) + ParserLoadLog.objects.create( + source=ParserLoadLog.Source.CONTRACTS, + batch_id=1, + records_count=1, + status=ParserLoadLog.Status.SUCCESS, + ) + + card = SourceCardService.get_card("public-procurements") + + self.assertEqual(card["status"], "success") + self.assertEqual(card["records_count"], 2) + self.assertEqual(card["organizations_count"], 1) + def test_get_active_tasks_ignores_old_jobs_even_when_updated_recently(self): job = BackgroundJob.objects.create( task_id="old-source-task", diff --git a/tests/apps/parsers/test_tasks.py b/tests/apps/parsers/test_tasks.py index 143e124..2eb8e8f 100644 --- a/tests/apps/parsers/test_tasks.py +++ b/tests/apps/parsers/test_tasks.py @@ -40,6 +40,8 @@ from apps.parsers.models import ( ) from apps.parsers.services import ParserLoadLogService from apps.parsers.tasks import ( + INDUSTRIAL_PRODUCTS_SOFT_TIME_LIMIT_SECONDS, + INDUSTRIAL_PRODUCTS_TIME_LIMIT_SECONDS, _move_to_dir, _process_fns_file_sync, _remove_lock, @@ -589,6 +591,16 @@ class SyncProcurementsTaskTestCase(TestCase): class MinpromtorgTasksTestCase(TestCase): """Tests for Minpromtorg tasks.""" + def test_parse_industrial_products_has_extended_time_limits(self): + self.assertEqual( + parse_industrial_products.soft_time_limit, + INDUSTRIAL_PRODUCTS_SOFT_TIME_LIMIT_SECONDS, + ) + self.assertEqual( + parse_industrial_products.time_limit, + INDUSTRIAL_PRODUCTS_TIME_LIMIT_SECONDS, + ) + def _add_minpromtorg_routes(self, server: TestHTTPServer): certificates_bytes, cert_rows = build_minpromtorg_certificates_excel(count=2) manufacturers_bytes, manuf_rows = build_minpromtorg_manufacturers_excel(count=2)