fix(parsers): unblock source status and heavy product import
This commit is contained in:
@@ -93,6 +93,12 @@ def _normalize_header(value) -> str:
|
||||
return re.sub(r"[^a-zа-я0-9]+", "", text)
|
||||
|
||||
|
||||
NORMALIZED_HEADER_ALIASES = {
|
||||
field_name: {_normalize_header(alias) for alias in aliases}
|
||||
for field_name, aliases in HEADER_ALIASES.items()
|
||||
}
|
||||
|
||||
|
||||
@dataclass
|
||||
class IndustrialProductsClient:
|
||||
"""
|
||||
@@ -205,34 +211,36 @@ class IndustrialProductsClient:
|
||||
logger.info("Downloading Excel file: %s", file_url)
|
||||
|
||||
content = self.http_client.download_file(file_url)
|
||||
workbook = load_workbook(filename=BytesIO(content), data_only=True)
|
||||
worksheet = workbook.active
|
||||
workbook = load_workbook(
|
||||
filename=BytesIO(content),
|
||||
data_only=True,
|
||||
read_only=True,
|
||||
)
|
||||
try:
|
||||
worksheet = workbook.active
|
||||
|
||||
header_row_index, header_map = self._detect_headers(worksheet)
|
||||
products: list[IndustrialProduct] = []
|
||||
header_row_index, header_map = self._detect_headers(worksheet)
|
||||
products: list[IndustrialProduct] = []
|
||||
|
||||
for row in worksheet.iter_rows(
|
||||
min_row=header_row_index + 1,
|
||||
values_only=True,
|
||||
):
|
||||
product = self._parse_row(row, header_map)
|
||||
if product:
|
||||
products.append(product)
|
||||
for row in worksheet.iter_rows(
|
||||
min_row=header_row_index + 1,
|
||||
values_only=True,
|
||||
):
|
||||
product = self._parse_row(row, header_map)
|
||||
if product:
|
||||
products.append(product)
|
||||
|
||||
workbook.close()
|
||||
return products
|
||||
return products
|
||||
finally:
|
||||
workbook.close()
|
||||
|
||||
def _detect_headers(self, worksheet) -> tuple[int, dict[str, int]]:
|
||||
best_map: dict[str, int] = {}
|
||||
|
||||
for row_index in range(1, min(worksheet.max_row, 10) + 1):
|
||||
row = next(
|
||||
worksheet.iter_rows(
|
||||
min_row=row_index,
|
||||
max_row=row_index,
|
||||
values_only=True,
|
||||
)
|
||||
)
|
||||
for row_index, row in enumerate(
|
||||
worksheet.iter_rows(max_row=10, values_only=True),
|
||||
start=1,
|
||||
):
|
||||
header_map = self._build_header_map(row)
|
||||
if len(header_map) > len(best_map):
|
||||
best_map = header_map
|
||||
@@ -254,8 +262,8 @@ class IndustrialProductsClient:
|
||||
if not normalized:
|
||||
continue
|
||||
|
||||
for field_name, aliases in HEADER_ALIASES.items():
|
||||
if normalized in {_normalize_header(alias) for alias in aliases}:
|
||||
for field_name, aliases in NORMALIZED_HEADER_ALIASES.items():
|
||||
if normalized in aliases:
|
||||
header_map[field_name] = index
|
||||
break
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@ from apps.core.services import BackgroundJobService
|
||||
from apps.parsers.models import (
|
||||
FinancialReport,
|
||||
FinancialReportLine,
|
||||
GenericParserRecord,
|
||||
IndustrialCertificateRecord,
|
||||
IndustrialProductRecord,
|
||||
InspectionRecord,
|
||||
@@ -100,6 +101,9 @@ SOURCE_CARD_DEFINITIONS: tuple[SourceCardDefinition, ...] = (
|
||||
task_names=(
|
||||
"apps.parsers.tasks.parse_procurements",
|
||||
"apps.parsers.tasks.sync_procurements",
|
||||
"apps.parsers.tasks.parse_procurements_44fz",
|
||||
"apps.parsers.tasks.parse_procurements_223fz",
|
||||
"apps.parsers.tasks.parse_contracts",
|
||||
),
|
||||
source_items=(
|
||||
SourceItemDefinition(
|
||||
@@ -108,6 +112,24 @@ SOURCE_CARD_DEFINITIONS: tuple[SourceCardDefinition, ...] = (
|
||||
description=("Закупки и связанные данные из ЕИС по 44-ФЗ и 223-ФЗ."),
|
||||
parser_source=ParserLoadLog.Source.PROCUREMENTS,
|
||||
),
|
||||
SourceItemDefinition(
|
||||
code="procurements_44fz",
|
||||
title="Закупки 44-ФЗ",
|
||||
description="Извещения и закупочные процедуры 44-ФЗ.",
|
||||
parser_source=ParserLoadLog.Source.PROCUREMENTS_44FZ,
|
||||
),
|
||||
SourceItemDefinition(
|
||||
code="procurements_223fz",
|
||||
title="Закупки 223-ФЗ",
|
||||
description="Извещения и закупочные процедуры 223-ФЗ.",
|
||||
parser_source=ParserLoadLog.Source.PROCUREMENTS_223FZ,
|
||||
),
|
||||
SourceItemDefinition(
|
||||
code="contracts",
|
||||
title="Контракты ЕИС",
|
||||
description="Государственные и корпоративные контракты.",
|
||||
parser_source=ParserLoadLog.Source.CONTRACTS,
|
||||
),
|
||||
),
|
||||
refresh_params=(
|
||||
RefreshParamDefinition(
|
||||
@@ -203,6 +225,11 @@ SOURCE_CARD_BY_PARSER_SOURCE = {
|
||||
for source_item in definition.source_items
|
||||
if source_item.parser_source
|
||||
}
|
||||
GENERIC_RECORD_SOURCES_BY_ITEM_CODE = {
|
||||
"procurements_44fz": ParserLoadLog.Source.PROCUREMENTS_44FZ,
|
||||
"procurements_223fz": ParserLoadLog.Source.PROCUREMENTS_223FZ,
|
||||
"contracts": ParserLoadLog.Source.CONTRACTS,
|
||||
}
|
||||
|
||||
|
||||
class SourceCardService:
|
||||
@@ -576,6 +603,9 @@ class SourceCardService:
|
||||
|
||||
@classmethod
|
||||
def _get_source_records_count(cls, item_code: str) -> int:
|
||||
generic_source = GENERIC_RECORD_SOURCES_BY_ITEM_CODE.get(item_code)
|
||||
if generic_source:
|
||||
return GenericParserRecord.objects.filter(source=generic_source).count()
|
||||
if item_code == "fns_reports":
|
||||
return FinancialReportLine.objects.count()
|
||||
if item_code == "industrial":
|
||||
@@ -592,6 +622,15 @@ class SourceCardService:
|
||||
|
||||
@classmethod
|
||||
def _get_source_organizations_count(cls, item_code: str) -> int:
|
||||
generic_source = GENERIC_RECORD_SOURCES_BY_ITEM_CODE.get(item_code)
|
||||
if generic_source:
|
||||
return (
|
||||
GenericParserRecord.objects.filter(source=generic_source)
|
||||
.exclude(inn="")
|
||||
.values("inn")
|
||||
.distinct()
|
||||
.count()
|
||||
)
|
||||
if item_code == "fns_reports":
|
||||
return (
|
||||
FinancialReport.objects.exclude(ogrn="")
|
||||
@@ -638,6 +677,11 @@ class SourceCardService:
|
||||
|
||||
@classmethod
|
||||
def _get_source_data_timestamp(cls, item_code: str):
|
||||
generic_source = GENERIC_RECORD_SOURCES_BY_ITEM_CODE.get(item_code)
|
||||
if generic_source:
|
||||
return GenericParserRecord.objects.filter(source=generic_source).aggregate(
|
||||
last_updated=Max("updated_at")
|
||||
)["last_updated"]
|
||||
if item_code == "fns_reports":
|
||||
return FinancialReport.objects.aggregate(last_updated=Max("updated_at"))[
|
||||
"last_updated"
|
||||
@@ -670,6 +714,24 @@ class SourceCardService:
|
||||
definition: SourceCardDefinition,
|
||||
source_items: list[dict[str, Any]],
|
||||
) -> int:
|
||||
if definition.slug == "public-procurements":
|
||||
legacy_inns = (
|
||||
ProcurementRecord.objects.exclude(customer_inn="")
|
||||
.order_by()
|
||||
.values_list("customer_inn", flat=True)
|
||||
.distinct()
|
||||
)
|
||||
generic_inns = (
|
||||
GenericParserRecord.objects.filter(
|
||||
source__in=GENERIC_RECORD_SOURCES_BY_ITEM_CODE.values()
|
||||
)
|
||||
.exclude(inn="")
|
||||
.order_by()
|
||||
.values_list("inn", flat=True)
|
||||
.distinct()
|
||||
)
|
||||
return legacy_inns.union(generic_inns).count()
|
||||
|
||||
if definition.slug != "manufacturers-and-products":
|
||||
return sum(item["organizations_count"] for item in source_items)
|
||||
|
||||
|
||||
@@ -59,6 +59,8 @@ FEDRESURS_CHECKO_FALLBACK_LIMIT = 100
|
||||
PARSER_STALE_LOAD_MAX_AGE_MINUTES = 90
|
||||
PARSER_SOFT_TIME_LIMIT_SECONDS = 15 * 60
|
||||
PARSER_TIME_LIMIT_SECONDS = 20 * 60
|
||||
INDUSTRIAL_PRODUCTS_SOFT_TIME_LIMIT_SECONDS = 45 * 60
|
||||
INDUSTRIAL_PRODUCTS_TIME_LIMIT_SECONDS = 60 * 60
|
||||
|
||||
|
||||
class ParserSourceSkipped(Exception):
|
||||
@@ -792,8 +794,8 @@ def parse_manufactures(
|
||||
|
||||
@shared_task(
|
||||
bind=True,
|
||||
soft_time_limit=PARSER_SOFT_TIME_LIMIT_SECONDS,
|
||||
time_limit=PARSER_TIME_LIMIT_SECONDS,
|
||||
soft_time_limit=INDUSTRIAL_PRODUCTS_SOFT_TIME_LIMIT_SECONDS,
|
||||
time_limit=INDUSTRIAL_PRODUCTS_TIME_LIMIT_SECONDS,
|
||||
)
|
||||
def parse_industrial_products(
|
||||
self,
|
||||
|
||||
Reference in New Issue
Block a user