Files
mostovik-backend/src/organizations/api_enrichment.py
Aleksandr Meshchriakov 19a7d5a91c
All checks were successful
CI/CD Pipeline / Quality Gate (push) Successful in 28s
CI/CD Pipeline / Build and Push Images (push) Successful in 10s
CI/CD Pipeline / Internal Notify (push) Successful in 0s
CI/CD Pipeline / Deploy Dev in Dokploy (push) Successful in 1s
perf(organizations): speed up filtered API lists
2026-05-14 17:08:03 +02:00

788 lines
28 KiB
Python

"""Batch enrichment helpers for organizations API v2."""
from __future__ import annotations
from dataclasses import dataclass
from datetime import date, datetime
from typing import Any
from apps.parsers.models import (
VACANCY_RECORD_SOURCES,
FinancialReport,
FinancialReportLine,
GenericParserRecord,
IndustrialCertificateRecord,
IndustrialProductRecord,
InspectionRecord,
ManufacturerRecord,
ParserLoadLog,
ProcurementRecord,
)
from django.db.models import Count, Prefetch, Q
from registers.models import RegistryMembershipPeriod
from organizations.data_sources import to_api_data_source, to_internal_data_source
from organizations.models import Organization
# Parser sources whose rows are stored in the shared GenericParserRecord table.
GENERIC_SOURCES = (
    ParserLoadLog.Source.PROCUREMENTS_44FZ,
    ParserLoadLog.Source.PROCUREMENTS_223FZ,
    ParserLoadLog.Source.CONTRACTS,
    ParserLoadLog.Source.UNFAIR_SUPPLIERS,
    ParserLoadLog.Source.FAS_GOZ,
    ParserLoadLog.Source.ARBITRATION,
    ParserLoadLog.Source.FEDRESURS_BANKRUPTCY,
    ParserLoadLog.Source.FSTEC,
    ParserLoadLog.Source.TRUDVSEM,
)

# Every source reported in ``data_presence``; the tuple order is the order in
# which keys are inserted into an empty presence mapping.
DATA_PRESENCE_KEYS = (
    ParserLoadLog.Source.INDUSTRIAL,
    ParserLoadLog.Source.INDUSTRIAL_PRODUCTS,
    ParserLoadLog.Source.MANUFACTURES,
    ParserLoadLog.Source.INSPECTIONS,
    ParserLoadLog.Source.PROCUREMENTS,
    *GENERIC_SOURCES,
    ParserLoadLog.Source.FNS_REPORTS,
)

# Internal source keys as plain strings.
DATA_PRESENCE_KEY_SET = set(map(str, DATA_PRESENCE_KEYS))

# The same sources translated to their API-facing names.
API_DATA_SOURCE_KEY_SET = set(map(to_api_data_source, DATA_PRESENCE_KEYS))
@dataclass(frozen=True)
class RegistrySummary:
    """Immutable registry identity exposed by the organizations API."""

    # Registry identifier in string form.
    id: str
    # Registry name.
    name: str
@dataclass(frozen=True)
class OrganizationEnrichment:
    """Immutable parser-data and registry availability for one organization."""

    # Serialized parser records keyed by API data-source name.
    data_presence: dict[str, Any]
    # Registry summaries attached to the organization.
    registries: list[RegistrySummary]
def active_registry_identity_values(
    *,
    registry_id: str | None = None,
    registry_name: str | None = None,
) -> tuple[set[str], set[str]]:
    """Return INN/OGRN values of organizations with active registry membership.

    A membership is treated as active when its ``ended_at`` is NULL.

    Args:
        registry_id: When truthy, only memberships of this registry are used.
        registry_name: When truthy, only memberships whose registry name
            contains this text (case-insensitive) are used.

    Returns:
        A ``(inn_values, ogrn_values)`` pair of string sets. Blank or missing
        identifiers are left out.
    """
    memberships = RegistryMembershipPeriod.objects.filter(ended_at__isnull=True)
    if registry_id:
        memberships = memberships.filter(registry_id=registry_id)
    if registry_name:
        memberships = memberships.filter(registry__name__icontains=registry_name)

    inn_values: set[str] = set()
    ogrn_values: set[str] = set()
    identity_rows = memberships.values_list(
        "organization__mn_inn",
        "organization__mn_ogrn",
    )
    for inn, ogrn in identity_rows:
        # str(None) would yield the literal "None" and "" would match any
        # organization with a blank identifier, so blanks are skipped.
        if inn:
            inn_values.add(str(inn))
        if ogrn:
            ogrn_values.add(str(ogrn))
    return inn_values, ogrn_values
def data_presence_identity_values(source: str) -> tuple[set[str], set[str]]:
    """Return INN/OGRN values of organizations with data for a parser source.

    The source is translated to its internal key before lookup; the result is
    an ``(inn_values, ogrn_values)`` pair.
    """
    internal_source = to_internal_data_source(source)
    found = _source_matches(internal_source)
    return found["inn"], found["ogrn"]
def _source_matches(source: str) -> dict[str, set[str]]:
    """Collect every INN and OGRN that has at least one record for ``source``.

    Args:
        source: Internal parser source key.

    Returns:
        ``{"inn": set, "ogrn": set}`` with the non-blank identifiers found in
        the table backing the source. FNS reports are looked up by OGRN only,
        so their ``"inn"`` set is always empty.

    Raises:
        ValueError: If ``source`` is not a supported data-presence source.
    """
    collect_all = OrganizationApiEnrichmentService._matching_identifiers_for_all

    # Sources with a dedicated table: (source, model, INN field, OGRN field).
    dedicated_models = (
        (ParserLoadLog.Source.INDUSTRIAL, IndustrialCertificateRecord, "inn", "ogrn"),
        (
            ParserLoadLog.Source.INDUSTRIAL_PRODUCTS,
            IndustrialProductRecord,
            "inn",
            "ogrn",
        ),
        (ParserLoadLog.Source.MANUFACTURES, ManufacturerRecord, "inn", "ogrn"),
        (ParserLoadLog.Source.INSPECTIONS, InspectionRecord, "inn", "ogrn"),
        (
            ParserLoadLog.Source.PROCUREMENTS,
            ProcurementRecord,
            "customer_inn",
            "customer_ogrn",
        ),
    )
    for model_source, model, inn_field, ogrn_field in dedicated_models:
        if source == model_source:
            return collect_all(
                model.objects,
                inn_field=inn_field,
                ogrn_field=ogrn_field,
            )

    if source == ParserLoadLog.Source.FNS_REPORTS:
        return {
            "inn": set(),
            # Blank OGRNs are excluded, as in every other branch, so that
            # organizations without an OGRN are not reported as matches.
            "ogrn": set(
                FinancialReport.objects.exclude(ogrn="")
                .values_list("ogrn", flat=True)
                .distinct()
            ),
        }

    # TRUDVSEM is checked before the generic branch because its rows are
    # stored under several record sources rather than under its own key.
    if source == ParserLoadLog.Source.TRUDVSEM:
        return collect_all(
            GenericParserRecord.objects.filter(source__in=VACANCY_RECORD_SOURCES),
            inn_field="inn",
            ogrn_field="ogrn",
        )
    if source in GENERIC_SOURCES:
        return collect_all(
            GenericParserRecord.objects.filter(source=source),
            inn_field="inn",
            ogrn_field="ogrn",
        )
    raise ValueError(f"Unsupported data_presence source: {source}")
class OrganizationApiEnrichmentService:
    """Compute list/detail enrichment without per-row database queries.

    Parser records and active registry memberships are fetched in bulk for a
    whole batch of organizations and matched back to each organization in
    memory through its ``inn``, ``ogrn`` and ``ogrip`` attributes.
    """

    @classmethod
    def build_for(
        cls,
        organizations: list[Organization],
        data_sources: set[str] | None = None,
    ) -> dict[str, OrganizationEnrichment]:
        """Build enrichment for a batch of organizations.

        Args:
            organizations: Organizations to enrich.
            data_sources: Source keys to include in ``data_presence``;
                ``None`` selects every known source.

        Returns:
            Mapping of ``str(organization.uid)`` to its enrichment. Empty when
            ``organizations`` is empty.
        """
        if not organizations:
            return {}
        selected_sources = cls._selected_api_sources(data_sources)
        identifiers = cls._collect_identifiers(organizations)
        presence = cls._build_presence(organizations, identifiers, selected_sources)
        registries = cls._build_registries(organizations, identifiers)
        return {
            key: OrganizationEnrichment(
                data_presence=presence[key],
                registries=registries[key],
            )
            for key in (str(organization.uid) for organization in organizations)
        }

    @staticmethod
    def _selected_api_sources(data_sources: set[str] | None) -> set[str]:
        """Translate requested sources to API names; ``None`` means all."""
        if data_sources is None:
            return API_DATA_SOURCE_KEY_SET
        return {to_api_data_source(source) for source in data_sources}

    @staticmethod
    def empty_presence(data_sources: set[str] | None = None) -> dict[str, Any]:
        """Return a presence mapping with an empty list per selected source.

        Keys are API source names in ``DATA_PRESENCE_KEYS`` order; every call
        creates fresh lists.
        """
        selected_sources = OrganizationApiEnrichmentService._selected_api_sources(
            data_sources
        )
        presence: dict[str, Any] = {}
        for source in DATA_PRESENCE_KEYS:
            api_source = to_api_data_source(source)
            if api_source in selected_sources:
                presence[api_source] = []
        return presence

    @classmethod
    def _collect_identifiers(
        cls, organizations: list[Organization]
    ) -> dict[str, set[str]]:
        """Gather the distinct truthy ``inn``/``ogrn``/``ogrip`` values."""
        return {
            "inn": {
                organization.inn for organization in organizations if organization.inn
            },
            "ogrn": {
                organization.ogrn for organization in organizations if organization.ogrn
            },
            "ogrip": {
                organization.ogrip
                for organization in organizations
                if organization.ogrip
            },
        }

    @classmethod
    def _build_presence(
        cls,
        organizations: list[Organization],
        identifiers: dict[str, set[str]],
        selected_sources: set[str],
    ) -> dict[str, dict[str, Any]]:
        """Build the ``data_presence`` mapping of every organization.

        Args:
            organizations: Organizations being enriched.
            identifiers: Output of ``_collect_identifiers`` for the batch.
            selected_sources: API source names to populate.

        Returns:
            Mapping of ``str(organization.uid)`` to its presence mapping.
        """
        presence = {
            str(organization.uid): cls.empty_presence(selected_sources)
            for organization in organizations
        }

        def is_selected(source) -> bool:
            return to_api_data_source(source) in selected_sources

        if is_selected(ParserLoadLog.Source.INDUSTRIAL):
            cls._attach_industrial_certificates(presence, organizations, identifiers)

        # Sources with a dedicated table and no extra queryset set-up:
        # (source, model, INN field, OGRN field, serializer).
        table_sources = (
            (
                ParserLoadLog.Source.INDUSTRIAL_PRODUCTS,
                IndustrialProductRecord,
                "inn",
                "ogrn",
                cls._serialize_industrial_product,
            ),
            (
                ParserLoadLog.Source.MANUFACTURES,
                ManufacturerRecord,
                "inn",
                "ogrn",
                cls._serialize_manufacturer,
            ),
            (
                ParserLoadLog.Source.INSPECTIONS,
                InspectionRecord,
                "inn",
                "ogrn",
                cls._serialize_inspection,
            ),
            (
                ParserLoadLog.Source.PROCUREMENTS,
                ProcurementRecord,
                "customer_inn",
                "customer_ogrn",
                cls._serialize_procurement,
            ),
        )
        for source, model, inn_field, ogrn_field, serializer in table_sources:
            if is_selected(source):
                cls._attach_source_records(
                    presence,
                    organizations,
                    source,
                    model.objects,
                    identifiers,
                    inn_field=inn_field,
                    ogrn_field=ogrn_field,
                    serializer=serializer,
                )

        if is_selected(ParserLoadLog.Source.FNS_REPORTS):
            # Report lines are prefetched in a fixed order so that serializing
            # a report does not issue a query per report.
            ordered_lines = FinancialReportLine.objects.order_by(
                "year",
                "form_code",
                "line_code",
            )
            reports = FinancialReport.objects.annotate(
                lines_count=Count("lines")
            ).prefetch_related(Prefetch("lines", queryset=ordered_lines))
            cls._attach_source_records(
                presence,
                organizations,
                ParserLoadLog.Source.FNS_REPORTS,
                reports,
                identifiers,
                inn_field=None,
                ogrn_field="ogrn",
                serializer=cls._serialize_financial_report,
            )

        selected_generic_sources = [
            source for source in GENERIC_SOURCES if is_selected(source)
        ]
        if selected_generic_sources:
            cls._attach_generic_records(
                presence,
                organizations,
                identifiers,
                selected_generic_sources,
            )
        return presence

    @classmethod
    def _attach_industrial_certificates(
        cls,
        presence: dict[str, dict[str, Any]],
        organizations: list[Organization],
        identifiers: dict[str, set[str]],
    ) -> None:
        """Fill the INDUSTRIAL bucket from industrial certificate records."""
        cls._attach_source_records(
            presence,
            organizations,
            ParserLoadLog.Source.INDUSTRIAL,
            IndustrialCertificateRecord.objects,
            identifiers,
            inn_field="inn",
            ogrn_field="ogrn",
            serializer=cls._serialize_industrial_certificate,
        )

    @staticmethod
    def _merge_unique(groups, *, key) -> list[Any]:
        """Concatenate ``groups`` in order, keeping the first item per key.

        Args:
            groups: Iterable of item sequences.
            key: Callable returning the hashable identity of an item.
        """
        seen = set()
        merged = []
        for group in groups:
            for item in group:
                marker = key(item)
                if marker in seen:
                    continue
                seen.add(marker)
                merged.append(item)
        return merged

    @classmethod
    def _attach_source_records(
        cls,
        presence: dict[str, dict[str, Any]],
        organizations: list[Organization],
        source: str,
        queryset,
        identifiers: dict[str, set[str]],
        *,
        inn_field: str | None,
        ogrn_field: str,
        serializer,
    ) -> None:
        """Fetch one source's records for the batch and attach them in place.

        Args:
            presence: Per-organization presence mappings, updated in place.
            organizations: Organizations being enriched.
            source: Internal source key whose bucket is filled.
            queryset: Manager or queryset of the source's records.
            identifiers: Output of ``_collect_identifiers`` for the batch.
            inn_field: Record field holding the INN, or ``None`` when the
                source can only be matched by OGRN.
            ogrn_field: Record field holding the OGRN.
            serializer: Callable turning a record into a dict with an ``"id"``.

        Nothing is attached when the batch has no usable identifiers.
        """
        # _identity_filter already ignores the INN when inn_field is None.
        identity_filter = cls._identity_filter(
            identifiers,
            inn_field=inn_field,
            ogrn_field=ogrn_field,
        )
        if identity_filter is None:
            return

        records_by_inn: dict[str, list[dict[str, Any]]] = {}
        records_by_ogrn: dict[str, list[dict[str, Any]]] = {}
        records = queryset.filter(identity_filter).order_by("-created_at", "-id")
        for record in records:
            item = serializer(record)
            if inn_field is not None:
                inn_value = getattr(record, inn_field)
                if inn_value:
                    records_by_inn.setdefault(inn_value, []).append(item)
            ogrn_value = getattr(record, ogrn_field)
            if ogrn_value:
                records_by_ogrn.setdefault(ogrn_value, []).append(item)

        api_source = to_api_data_source(source)
        for organization in organizations:
            # A record may match through several identifiers; keep it once.
            presence[str(organization.uid)][api_source] = cls._merge_unique(
                (
                    records_by_inn.get(organization.inn, []),
                    records_by_ogrn.get(organization.ogrn, []),
                    records_by_ogrn.get(organization.ogrip, []),
                ),
                key=lambda item: item["id"],
            )

    @staticmethod
    def _generic_query_sources(
        selected_sources: list[str],
    ) -> tuple[list[str], dict[str, str]]:
        """Expand selected generic sources into stored record sources.

        TRUDVSEM expands to every entry of ``VACANCY_RECORD_SOURCES``; any
        other source maps to itself.

        Returns:
            ``(query_sources, bucket_by_record_source)``: the record sources to
            query, and for each of them the selected source it belongs to.
        """
        query_sources: list[str] = []
        source_bucket_by_record_source: dict[str, str] = {}
        for source in selected_sources:
            source_key = str(source)
            if source == ParserLoadLog.Source.TRUDVSEM:
                expanded_sources = VACANCY_RECORD_SOURCES
            else:
                expanded_sources = (source_key,)
            for expanded_source in expanded_sources:
                record_source = str(expanded_source)
                query_sources.append(record_source)
                source_bucket_by_record_source[record_source] = source_key
        return query_sources, source_bucket_by_record_source

    @classmethod
    def _attach_generic_records(
        cls,
        presence: dict[str, dict[str, Any]],
        organizations: list[Organization],
        identifiers: dict[str, set[str]],
        selected_sources: list[str],
    ) -> None:
        """Attach generic parser records for all selected sources at once.

        A single query covers every selected generic source; rows are then
        bucketed per source and matched to organizations in memory. Nothing is
        attached when the batch has no usable identifiers.
        """
        identity_filter = cls._identity_filter(
            identifiers,
            inn_field="inn",
            ogrn_field="ogrn",
        )
        if identity_filter is None:
            return

        query_sources, source_bucket_by_record_source = cls._generic_query_sources(
            selected_sources
        )
        records_by_source_and_inn: dict[str, dict[str, list[dict[str, Any]]]] = {
            str(source): {} for source in selected_sources
        }
        records_by_source_and_ogrn: dict[str, dict[str, list[dict[str, Any]]]] = {
            str(source): {} for source in selected_sources
        }
        records = (
            GenericParserRecord.objects.filter(source__in=query_sources)
            .filter(identity_filter)
            .order_by("source", "-created_at", "-id")
        )
        for record in records:
            item = cls._serialize_generic_record(record)
            bucket = source_bucket_by_record_source[str(record.source)]
            if record.inn:
                records_by_source_and_inn[bucket].setdefault(record.inn, []).append(
                    item
                )
            if record.ogrn:
                records_by_source_and_ogrn[bucket].setdefault(record.ogrn, []).append(
                    item
                )

        for organization in organizations:
            organization_presence = presence[str(organization.uid)]
            for source in selected_sources:
                source_key = str(source)
                records_by_inn = records_by_source_and_inn[source_key]
                records_by_ogrn = records_by_source_and_ogrn[source_key]
                organization_presence[to_api_data_source(source_key)] = (
                    cls._merge_unique(
                        (
                            records_by_inn.get(organization.inn, []),
                            records_by_ogrn.get(organization.ogrn, []),
                            records_by_ogrn.get(organization.ogrip, []),
                        ),
                        key=lambda item: item["id"],
                    )
                )

    @staticmethod
    def _serialize_industrial_certificate(
        record: IndustrialCertificateRecord,
    ) -> dict[str, Any]:
        """Serialize an industrial certificate record for the API."""
        return {
            "id": record.id,
            "load_batch": record.load_batch,
            "issue_date": record.issue_date,
            "issue_date_normalized": _isoformat(record.issue_date_normalized),
            "certificate_number": record.certificate_number,
            "expiry_date": record.expiry_date,
            "expiry_date_normalized": _isoformat(record.expiry_date_normalized),
            "certificate_file_url": record.certificate_file_url,
            "organisation_name": record.organisation_name,
            "inn": record.inn,
            "ogrn": record.ogrn,
            "registry_organization": record.registry_organization_id,
            "created_at": _isoformat(record.created_at),
            "updated_at": _isoformat(record.updated_at),
        }

    @staticmethod
    def _serialize_industrial_product(
        record: IndustrialProductRecord,
    ) -> dict[str, Any]:
        """Serialize an industrial product record for the API."""
        return {
            "id": record.id,
            "load_batch": record.load_batch,
            "full_organisation_name": record.full_organisation_name,
            "ogrn": record.ogrn,
            "inn": record.inn,
            "registry_number": record.registry_number,
            "product_name": record.product_name,
            "product_model": record.product_model,
            "okpd2_code": record.okpd2_code,
            "tnved_code": record.tnved_code,
            "regulatory_document": record.regulatory_document,
            "registry_organization": record.registry_organization_id,
            "created_at": _isoformat(record.created_at),
            "updated_at": _isoformat(record.updated_at),
        }

    @staticmethod
    def _serialize_manufacturer(record: ManufacturerRecord) -> dict[str, Any]:
        """Serialize a manufacturer record for the API."""
        return {
            "id": record.id,
            "load_batch": record.load_batch,
            "full_legal_name": record.full_legal_name,
            "inn": record.inn,
            "ogrn": record.ogrn,
            "address": record.address,
            "registry_organization": record.registry_organization_id,
            "created_at": _isoformat(record.created_at),
            "updated_at": _isoformat(record.updated_at),
        }

    @staticmethod
    def _serialize_inspection(record: InspectionRecord) -> dict[str, Any]:
        """Serialize an inspection record for the API."""
        return {
            "id": record.id,
            "load_batch": record.load_batch,
            "registration_number": record.registration_number,
            "inn": record.inn,
            "ogrn": record.ogrn,
            "organisation_name": record.organisation_name,
            "control_authority": record.control_authority,
            "inspection_type": record.inspection_type,
            "inspection_form": record.inspection_form,
            "start_date": record.start_date,
            "start_date_normalized": _isoformat(record.start_date_normalized),
            "end_date": record.end_date,
            "end_date_normalized": _isoformat(record.end_date_normalized),
            "status": record.status,
            "legal_basis": record.legal_basis,
            "result": record.result,
            "is_federal_law_248": record.is_federal_law_248,
            "data_year": record.data_year,
            "data_month": record.data_month,
            "registry_organization": record.registry_organization_id,
            "created_at": _isoformat(record.created_at),
            "updated_at": _isoformat(record.updated_at),
        }

    @staticmethod
    def _serialize_procurement(record: ProcurementRecord) -> dict[str, Any]:
        """Serialize a procurement record for the API."""
        return {
            "id": record.id,
            "load_batch": record.load_batch,
            "purchase_number": record.purchase_number,
            "purchase_name": record.purchase_name,
            "customer_inn": record.customer_inn,
            "customer_kpp": record.customer_kpp,
            "customer_ogrn": record.customer_ogrn,
            "customer_name": record.customer_name,
            "max_price": record.max_price,
            "max_price_amount": _decimal_string(record.max_price_amount),
            "currency_code": record.currency_code,
            "placement_method": record.placement_method,
            "publish_date": record.publish_date,
            "publish_date_normalized": _isoformat(record.publish_date_normalized),
            "end_date": record.end_date,
            "end_date_normalized": _isoformat(record.end_date_normalized),
            "status": record.status,
            "law_type": record.law_type,
            "purchase_object_info": record.purchase_object_info,
            "href": record.href,
            "region_code": record.region_code,
            "data_year": record.data_year,
            "data_month": record.data_month,
            "registry_organization": record.registry_organization_id,
            "created_at": _isoformat(record.created_at),
            "updated_at": _isoformat(record.updated_at),
        }

    @staticmethod
    def _serialize_generic_record(record: GenericParserRecord) -> dict[str, Any]:
        """Serialize a generic parser record for the API."""
        return {
            "id": record.id,
            "load_batch": record.load_batch,
            "source": record.source,
            "external_id": record.external_id,
            "inn": record.inn,
            "ogrn": record.ogrn,
            "organisation_name": record.organisation_name,
            "title": record.title,
            "record_date": record.record_date,
            "amount": _decimal_string(record.amount),
            "status": record.status,
            "url": record.url,
            "payload": record.payload,
            "registry_organization": record.registry_organization_id,
            "created_at": _isoformat(record.created_at),
            "updated_at": _isoformat(record.updated_at),
        }

    @staticmethod
    def _serialize_financial_report(record: FinancialReport) -> dict[str, Any]:
        """Serialize a financial report together with its grouped lines."""
        return {
            "id": record.id,
            "external_id": record.external_id,
            "ogrn": record.ogrn,
            "registry_organization": record.registry_organization_id,
            "file_name": record.file_name,
            "file_hash": record.file_hash,
            "load_batch": record.load_batch,
            "status": record.status,
            "source": record.source,
            "error_message": record.error_message,
            "created_at": _isoformat(record.created_at),
            "updated_at": _isoformat(record.updated_at),
            # lines_count is an annotation; fall back to 0 when it is absent.
            "lines_count": getattr(record, "lines_count", 0),
            "lines": _financial_report_lines_by_year(record),
        }

    @staticmethod
    def _matching_identifiers(
        queryset,
        identifiers: dict[str, set[str]],
        *,
        inn_field: str,
        ogrn_field: str,
    ) -> dict[str, set[str]]:
        """Return which of the given identifiers occur in ``queryset``.

        ``ogrn`` and ``ogrip`` values are both looked up in ``ogrn_field``.
        An empty identifier set issues no query.
        """
        matched_inn = set()
        matched_ogrn = set()
        if identifiers["inn"]:
            matched_inn = set(
                queryset.filter(**{f"{inn_field}__in": identifiers["inn"]})
                .values_list(inn_field, flat=True)
                .distinct()
            )
        ogrn_identifiers = identifiers["ogrn"] | identifiers["ogrip"]
        if ogrn_identifiers:
            matched_ogrn = set(
                queryset.filter(**{f"{ogrn_field}__in": ogrn_identifiers})
                .values_list(ogrn_field, flat=True)
                .distinct()
            )
        return {"inn": matched_inn, "ogrn": matched_ogrn}

    @staticmethod
    def _identity_filter(
        identifiers: dict[str, set[str]],
        *,
        inn_field: str | None,
        ogrn_field: str,
    ) -> Q | None:
        """Build an OR filter matching records by INN or OGRN/``ogrip``.

        Returns:
            The combined ``Q`` object, or ``None`` when there is nothing to
            match on (so callers can skip the query instead of matching all).
        """
        identity_filter = Q()
        has_identity = False
        if inn_field is not None and identifiers["inn"]:
            identity_filter |= Q(**{f"{inn_field}__in": identifiers["inn"]})
            has_identity = True
        ogrn_identifiers = identifiers["ogrn"] | identifiers["ogrip"]
        if ogrn_identifiers:
            identity_filter |= Q(**{f"{ogrn_field}__in": ogrn_identifiers})
            has_identity = True
        return identity_filter if has_identity else None

    @staticmethod
    def _matching_identifiers_for_all(
        queryset,
        *,
        inn_field: str,
        ogrn_field: str,
    ) -> dict[str, set[str]]:
        """Return every non-empty INN and OGRN present in ``queryset``."""
        matched_inn = set(
            queryset.exclude(**{inn_field: ""})
            .values_list(inn_field, flat=True)
            .distinct()
        )
        matched_ogrn = set(
            queryset.exclude(**{ogrn_field: ""})
            .values_list(ogrn_field, flat=True)
            .distinct()
        )
        return {"inn": matched_inn, "ogrn": matched_ogrn}

    @staticmethod
    def _identity_key(value: Any) -> str | None:
        """Return ``value`` as a string lookup key, or ``None`` when blank."""
        return str(value) if value else None

    @staticmethod
    def _build_registries(
        organizations: list[Organization],
        identifiers: dict[str, set[str]],
    ) -> dict[str, list[RegistrySummary]]:
        """Collect active registry memberships for every organization.

        Memberships with ``ended_at`` NULL are matched by INN or OGRN only
        (``ogrip`` is not used here) and returned ordered by registry name,
        with each registry listed once per organization.

        Returns:
            Mapping of ``str(organization.uid)`` to its registry summaries.
        """
        registries = {str(organization.uid): [] for organization in organizations}
        if not identifiers["inn"] and not identifiers["ogrn"]:
            return registries

        identity_filter = Q()
        if identifiers["inn"]:
            identity_filter |= Q(organization__mn_inn__in=identifiers["inn"])
        if identifiers["ogrn"]:
            identity_filter |= Q(organization__mn_ogrn__in=identifiers["ogrn"])
        memberships = (
            RegistryMembershipPeriod.objects.filter(ended_at__isnull=True)
            .filter(identity_filter)
            .select_related("registry", "organization")
            .order_by("registry__name")
        )

        identity_key = OrganizationApiEnrichmentService._identity_key
        membership_by_inn: dict[str, list[RegistrySummary]] = {}
        membership_by_ogrn: dict[str, list[RegistrySummary]] = {}
        for membership in memberships:
            summary = RegistrySummary(
                id=str(membership.registry_id),
                name=membership.registry.name,
            )
            member = membership.organization
            # A member matched through one identifier may have the other one
            # blank. Indexing it under "" (or "None") would hand its registry
            # to every organization whose identifier is blank as well.
            inn_key = identity_key(member.mn_inn)
            if inn_key is not None:
                membership_by_inn.setdefault(inn_key, []).append(summary)
            ogrn_key = identity_key(member.mn_ogrn)
            if ogrn_key is not None:
                membership_by_ogrn.setdefault(ogrn_key, []).append(summary)

        for organization in organizations:
            registries[str(organization.uid)] = (
                OrganizationApiEnrichmentService._merge_unique(
                    (
                        membership_by_inn.get(organization.inn, []),
                        membership_by_ogrn.get(organization.ogrn, []),
                    ),
                    key=lambda summary: summary.id,
                )
            )
        return registries
def _isoformat(value: date | datetime | None) -> str | None:
if value is None:
return None
return value.isoformat().replace("+00:00", "Z")
def _decimal_string(value: Any | None) -> str | None:
if value is None:
return None
return str(value)
def _financial_report_lines_by_year(
    record: FinancialReport,
) -> dict[str, dict[str, Any]]:
    """Group a report's lines by year, then section, then line code.

    Returns:
        ``{year: {section: {line_code: details}}}`` where ``year`` is the
        stringified line year and ``section`` comes from
        ``_financial_report_line_section``.
    """
    grouped: dict[str, dict[str, Any]] = {}
    for line in record.lines.all():
        year_bucket = grouped.setdefault(str(line.year), {})
        section_bucket = year_bucket.setdefault(
            _financial_report_line_section(line), {}
        )
        # A later line with the same code in the same bucket replaces the
        # earlier one.
        section_bucket[line.line_code] = {
            "form_code": line.form_code,
            "name": line.line_name,
            "period_start": line.period_start,
            "period_end": line.period_end,
        }
    return grouped
def _financial_report_line_section(line: FinancialReportLine) -> str:
if line.form_code != "1":
return f"form_{line.form_code}"
try:
line_code = int(line.line_code)
except ValueError:
return "balance"
if 1000 <= line_code < 1300 or line_code == 1600:
return "active"
if 1300 <= line_code < 1600 or line_code == 1700:
return "passive"
return "balance"