"""Batch enrichment helpers for organizations API v2.""" from __future__ import annotations from dataclasses import dataclass from datetime import date, datetime from typing import Any from apps.parsers.models import ( VACANCY_RECORD_SOURCES, FinancialReport, FinancialReportLine, GenericParserRecord, IndustrialCertificateRecord, IndustrialProductRecord, InspectionRecord, ManufacturerRecord, ParserLoadLog, ProcurementRecord, ) from django.db.models import Count, Prefetch, Q from registers.models import RegistryMembershipPeriod from organizations.data_sources import to_api_data_source, to_internal_data_source from organizations.models import Organization GENERIC_SOURCES = ( ParserLoadLog.Source.PROCUREMENTS_44FZ, ParserLoadLog.Source.PROCUREMENTS_223FZ, ParserLoadLog.Source.CONTRACTS, ParserLoadLog.Source.UNFAIR_SUPPLIERS, ParserLoadLog.Source.FAS_GOZ, ParserLoadLog.Source.ARBITRATION, ParserLoadLog.Source.FEDRESURS_BANKRUPTCY, ParserLoadLog.Source.FSTEC, ParserLoadLog.Source.TRUDVSEM, ) DATA_PRESENCE_KEYS = ( ParserLoadLog.Source.INDUSTRIAL, ParserLoadLog.Source.INDUSTRIAL_PRODUCTS, ParserLoadLog.Source.MANUFACTURES, ParserLoadLog.Source.INSPECTIONS, ParserLoadLog.Source.PROCUREMENTS, *GENERIC_SOURCES, ParserLoadLog.Source.FNS_REPORTS, ) DATA_PRESENCE_KEY_SET = {str(source) for source in DATA_PRESENCE_KEYS} API_DATA_SOURCE_KEY_SET = {to_api_data_source(source) for source in DATA_PRESENCE_KEYS} @dataclass(frozen=True) class RegistrySummary: """Registry identity returned in organizations API.""" id: str name: str @dataclass(frozen=True) class OrganizationEnrichment: """Computed parser and registry availability for one organization.""" data_presence: dict[str, Any] registries: list[RegistrySummary] def active_registry_identity_values( *, registry_id: str | None = None, registry_name: str | None = None, ) -> tuple[set[str], set[str]]: """Return INN/OGRN values of organizations with active registry membership.""" memberships = RegistryMembershipPeriod.objects.filter(ended_at__isnull=True) if registry_id: memberships = memberships.filter(registry_id=registry_id) if registry_name: memberships = memberships.filter(registry__name__icontains=registry_name) inn_values: set[str] = set() ogrn_values: set[str] = set() for inn, ogrn in memberships.values_list( "organization__mn_inn", "organization__mn_ogrn", ): inn_values.add(str(inn)) ogrn_values.add(str(ogrn)) return inn_values, ogrn_values def data_presence_identity_values(source: str) -> tuple[set[str], set[str]]: """Return INN/OGRN values of organizations with data for a parser source.""" matches = _source_matches(to_internal_data_source(source)) return matches["inn"], matches["ogrn"] def _source_matches(source: str) -> dict[str, set[str]]: if source == ParserLoadLog.Source.INDUSTRIAL: return OrganizationApiEnrichmentService._matching_identifiers_for_all( IndustrialCertificateRecord.objects, inn_field="inn", ogrn_field="ogrn", ) if source == ParserLoadLog.Source.INDUSTRIAL_PRODUCTS: return OrganizationApiEnrichmentService._matching_identifiers_for_all( IndustrialProductRecord.objects, inn_field="inn", ogrn_field="ogrn", ) if source == ParserLoadLog.Source.MANUFACTURES: return OrganizationApiEnrichmentService._matching_identifiers_for_all( ManufacturerRecord.objects, inn_field="inn", ogrn_field="ogrn", ) if source == ParserLoadLog.Source.INSPECTIONS: return OrganizationApiEnrichmentService._matching_identifiers_for_all( InspectionRecord.objects, inn_field="inn", ogrn_field="ogrn", ) if source == ParserLoadLog.Source.PROCUREMENTS: return OrganizationApiEnrichmentService._matching_identifiers_for_all( ProcurementRecord.objects, inn_field="customer_inn", ogrn_field="customer_ogrn", ) if source == ParserLoadLog.Source.FNS_REPORTS: return { "inn": set(), "ogrn": set( FinancialReport.objects.values_list("ogrn", flat=True).distinct() ), } if source == ParserLoadLog.Source.TRUDVSEM: return OrganizationApiEnrichmentService._matching_identifiers_for_all( GenericParserRecord.objects.filter(source__in=VACANCY_RECORD_SOURCES), inn_field="inn", ogrn_field="ogrn", ) if source in GENERIC_SOURCES: return OrganizationApiEnrichmentService._matching_identifiers_for_all( GenericParserRecord.objects.filter(source=source), inn_field="inn", ogrn_field="ogrn", ) raise ValueError(f"Unsupported data_presence source: {source}") class OrganizationApiEnrichmentService: """Computes list/detail enrichment without per-row database queries.""" @classmethod def build_for( cls, organizations: list[Organization], data_sources: set[str] | None = None, ) -> dict[str, OrganizationEnrichment]: if not organizations: return {} selected_sources = ( API_DATA_SOURCE_KEY_SET if data_sources is None else {to_api_data_source(source) for source in data_sources} ) identifiers = cls._collect_identifiers(organizations) presence = cls._build_presence(organizations, identifiers, selected_sources) registries = cls._build_registries(organizations, identifiers) return { str(organization.uid): OrganizationEnrichment( data_presence=presence[str(organization.uid)], registries=registries[str(organization.uid)], ) for organization in organizations } @staticmethod def empty_presence(data_sources: set[str] | None = None) -> dict[str, Any]: selected_sources = ( API_DATA_SOURCE_KEY_SET if data_sources is None else {to_api_data_source(source) for source in data_sources} ) return { to_api_data_source(source): [] for source in DATA_PRESENCE_KEYS if to_api_data_source(source) in selected_sources } @classmethod def _collect_identifiers( cls, organizations: list[Organization] ) -> dict[str, set[str]]: return { "inn": { organization.inn for organization in organizations if organization.inn }, "ogrn": { organization.ogrn for organization in organizations if organization.ogrn }, "ogrip": { organization.ogrip for organization in organizations if organization.ogrip }, } @classmethod def _build_presence( cls, organizations: list[Organization], identifiers: dict[str, set[str]], selected_sources: set[str], ) -> dict[str, dict[str, Any]]: presence = { str(organization.uid): cls.empty_presence(selected_sources) for organization in organizations } if to_api_data_source(ParserLoadLog.Source.INDUSTRIAL) in selected_sources: cls._attach_industrial_certificates(presence, organizations, identifiers) if ( to_api_data_source(ParserLoadLog.Source.INDUSTRIAL_PRODUCTS) in selected_sources ): cls._attach_source_records( presence, organizations, ParserLoadLog.Source.INDUSTRIAL_PRODUCTS, IndustrialProductRecord.objects, identifiers, inn_field="inn", ogrn_field="ogrn", serializer=cls._serialize_industrial_product, ) if to_api_data_source(ParserLoadLog.Source.MANUFACTURES) in selected_sources: cls._attach_source_records( presence, organizations, ParserLoadLog.Source.MANUFACTURES, ManufacturerRecord.objects, identifiers, inn_field="inn", ogrn_field="ogrn", serializer=cls._serialize_manufacturer, ) if to_api_data_source(ParserLoadLog.Source.INSPECTIONS) in selected_sources: cls._attach_source_records( presence, organizations, ParserLoadLog.Source.INSPECTIONS, InspectionRecord.objects, identifiers, inn_field="inn", ogrn_field="ogrn", serializer=cls._serialize_inspection, ) if to_api_data_source(ParserLoadLog.Source.PROCUREMENTS) in selected_sources: cls._attach_source_records( presence, organizations, ParserLoadLog.Source.PROCUREMENTS, ProcurementRecord.objects, identifiers, inn_field="customer_inn", ogrn_field="customer_ogrn", serializer=cls._serialize_procurement, ) if to_api_data_source(ParserLoadLog.Source.FNS_REPORTS) in selected_sources: cls._attach_source_records( presence, organizations, ParserLoadLog.Source.FNS_REPORTS, FinancialReport.objects.annotate( lines_count=Count("lines") ).prefetch_related( Prefetch( "lines", queryset=FinancialReportLine.objects.order_by( "year", "form_code", "line_code", ), ) ), identifiers, inn_field=None, ogrn_field="ogrn", serializer=cls._serialize_financial_report, ) selected_generic_sources = [ source for source in GENERIC_SOURCES if to_api_data_source(source) in selected_sources ] if selected_generic_sources: cls._attach_generic_records( presence, organizations, identifiers, selected_generic_sources, ) return presence @classmethod def _attach_industrial_certificates( cls, presence: dict[str, dict[str, Any]], organizations: list[Organization], identifiers: dict[str, set[str]], ) -> None: cls._attach_source_records( presence, organizations, ParserLoadLog.Source.INDUSTRIAL, IndustrialCertificateRecord.objects, identifiers, inn_field="inn", ogrn_field="ogrn", serializer=cls._serialize_industrial_certificate, ) @classmethod def _attach_source_records( cls, presence: dict[str, dict[str, Any]], organizations: list[Organization], source: str, queryset, identifiers: dict[str, set[str]], *, inn_field: str | None, ogrn_field: str, serializer, ) -> None: if inn_field is not None: identity_filter = cls._identity_filter( identifiers, inn_field=inn_field, ogrn_field=ogrn_field, ) else: identity_filter = cls._identity_filter( { "inn": set(), "ogrn": identifiers["ogrn"], "ogrip": identifiers["ogrip"], }, inn_field=None, ogrn_field=ogrn_field, ) if identity_filter is None: return records_by_inn: dict[str, list[dict[str, Any]]] = {} records_by_ogrn: dict[str, list[dict[str, Any]]] = {} records = queryset.filter(identity_filter).order_by("-created_at", "-id") for record in records: item = serializer(record) if inn_field is not None: inn_value = getattr(record, inn_field) if inn_value: records_by_inn.setdefault(inn_value, []).append(item) ogrn_value = getattr(record, ogrn_field) if ogrn_value: records_by_ogrn.setdefault(ogrn_value, []).append(item) for organization in organizations: seen: set[int] = set() items = [] for item in ( records_by_inn.get(organization.inn, []) + records_by_ogrn.get(organization.ogrn, []) + records_by_ogrn.get(organization.ogrip, []) ): item_id = item["id"] if item_id in seen: continue seen.add(item_id) items.append(item) presence[str(organization.uid)][to_api_data_source(source)] = items @staticmethod def _generic_query_sources( selected_sources: list[str], ) -> tuple[list[str], dict[str, str]]: query_sources: list[str] = [] source_bucket_by_record_source: dict[str, str] = {} for source in selected_sources: source_key = str(source) expanded_sources = ( VACANCY_RECORD_SOURCES if source == ParserLoadLog.Source.TRUDVSEM else (source_key,) ) for expanded_source in expanded_sources: query_sources.append(str(expanded_source)) source_bucket_by_record_source[str(expanded_source)] = source_key return query_sources, source_bucket_by_record_source @classmethod def _attach_generic_records( cls, presence: dict[str, dict[str, Any]], organizations: list[Organization], identifiers: dict[str, set[str]], selected_sources: list[str], ) -> None: identity_filter = cls._identity_filter( identifiers, inn_field="inn", ogrn_field="ogrn", ) if identity_filter is None: return query_sources, source_bucket_by_record_source = cls._generic_query_sources( selected_sources ) records_by_source_and_inn: dict[str, dict[str, list[dict[str, Any]]]] = { str(source): {} for source in selected_sources } records_by_source_and_ogrn: dict[str, dict[str, list[dict[str, Any]]]] = { str(source): {} for source in selected_sources } records = ( GenericParserRecord.objects.filter(source__in=query_sources) .filter(identity_filter) .order_by("source", "-created_at", "-id") ) for record in records: item = cls._serialize_generic_record(record) source = source_bucket_by_record_source[str(record.source)] if record.inn: records_by_source_and_inn[source].setdefault(record.inn, []).append( item ) if record.ogrn: records_by_source_and_ogrn[source].setdefault(record.ogrn, []).append( item ) for organization in organizations: organization_key = str(organization.uid) for source in selected_sources: source_key = str(source) seen: set[int] = set() items = [] records_by_inn = records_by_source_and_inn[source_key] records_by_ogrn = records_by_source_and_ogrn[source_key] for item in ( records_by_inn.get(organization.inn, []) + records_by_ogrn.get(organization.ogrn, []) + records_by_ogrn.get(organization.ogrip, []) ): item_id = item["id"] if item_id in seen: continue seen.add(item_id) items.append(item) presence[organization_key][to_api_data_source(source_key)] = items @staticmethod def _serialize_industrial_certificate( record: IndustrialCertificateRecord, ) -> dict[str, Any]: return { "id": record.id, "load_batch": record.load_batch, "issue_date": record.issue_date, "issue_date_normalized": _isoformat(record.issue_date_normalized), "certificate_number": record.certificate_number, "expiry_date": record.expiry_date, "expiry_date_normalized": _isoformat(record.expiry_date_normalized), "certificate_file_url": record.certificate_file_url, "organisation_name": record.organisation_name, "inn": record.inn, "ogrn": record.ogrn, "registry_organization": record.registry_organization_id, "created_at": _isoformat(record.created_at), "updated_at": _isoformat(record.updated_at), } @staticmethod def _serialize_industrial_product( record: IndustrialProductRecord, ) -> dict[str, Any]: return { "id": record.id, "load_batch": record.load_batch, "full_organisation_name": record.full_organisation_name, "ogrn": record.ogrn, "inn": record.inn, "registry_number": record.registry_number, "product_name": record.product_name, "product_model": record.product_model, "okpd2_code": record.okpd2_code, "tnved_code": record.tnved_code, "regulatory_document": record.regulatory_document, "registry_organization": record.registry_organization_id, "created_at": _isoformat(record.created_at), "updated_at": _isoformat(record.updated_at), } @staticmethod def _serialize_manufacturer(record: ManufacturerRecord) -> dict[str, Any]: return { "id": record.id, "load_batch": record.load_batch, "full_legal_name": record.full_legal_name, "inn": record.inn, "ogrn": record.ogrn, "address": record.address, "registry_organization": record.registry_organization_id, "created_at": _isoformat(record.created_at), "updated_at": _isoformat(record.updated_at), } @staticmethod def _serialize_inspection(record: InspectionRecord) -> dict[str, Any]: return { "id": record.id, "load_batch": record.load_batch, "registration_number": record.registration_number, "inn": record.inn, "ogrn": record.ogrn, "organisation_name": record.organisation_name, "control_authority": record.control_authority, "inspection_type": record.inspection_type, "inspection_form": record.inspection_form, "start_date": record.start_date, "start_date_normalized": _isoformat(record.start_date_normalized), "end_date": record.end_date, "end_date_normalized": _isoformat(record.end_date_normalized), "status": record.status, "legal_basis": record.legal_basis, "result": record.result, "is_federal_law_248": record.is_federal_law_248, "data_year": record.data_year, "data_month": record.data_month, "registry_organization": record.registry_organization_id, "created_at": _isoformat(record.created_at), "updated_at": _isoformat(record.updated_at), } @staticmethod def _serialize_procurement(record: ProcurementRecord) -> dict[str, Any]: return { "id": record.id, "load_batch": record.load_batch, "purchase_number": record.purchase_number, "purchase_name": record.purchase_name, "customer_inn": record.customer_inn, "customer_kpp": record.customer_kpp, "customer_ogrn": record.customer_ogrn, "customer_name": record.customer_name, "max_price": record.max_price, "max_price_amount": _decimal_string(record.max_price_amount), "currency_code": record.currency_code, "placement_method": record.placement_method, "publish_date": record.publish_date, "publish_date_normalized": _isoformat(record.publish_date_normalized), "end_date": record.end_date, "end_date_normalized": _isoformat(record.end_date_normalized), "status": record.status, "law_type": record.law_type, "purchase_object_info": record.purchase_object_info, "href": record.href, "region_code": record.region_code, "data_year": record.data_year, "data_month": record.data_month, "registry_organization": record.registry_organization_id, "created_at": _isoformat(record.created_at), "updated_at": _isoformat(record.updated_at), } @staticmethod def _serialize_generic_record(record: GenericParserRecord) -> dict[str, Any]: return { "id": record.id, "load_batch": record.load_batch, "source": record.source, "external_id": record.external_id, "inn": record.inn, "ogrn": record.ogrn, "organisation_name": record.organisation_name, "title": record.title, "record_date": record.record_date, "amount": _decimal_string(record.amount), "status": record.status, "url": record.url, "payload": record.payload, "registry_organization": record.registry_organization_id, "created_at": _isoformat(record.created_at), "updated_at": _isoformat(record.updated_at), } @staticmethod def _serialize_financial_report(record: FinancialReport) -> dict[str, Any]: return { "id": record.id, "external_id": record.external_id, "ogrn": record.ogrn, "registry_organization": record.registry_organization_id, "file_name": record.file_name, "file_hash": record.file_hash, "load_batch": record.load_batch, "status": record.status, "source": record.source, "error_message": record.error_message, "created_at": _isoformat(record.created_at), "updated_at": _isoformat(record.updated_at), "lines_count": getattr(record, "lines_count", 0), "lines": _financial_report_lines_by_year(record), } @staticmethod def _matching_identifiers( queryset, identifiers: dict[str, set[str]], *, inn_field: str, ogrn_field: str, ) -> dict[str, set[str]]: matched_inn = set() matched_ogrn = set() if identifiers["inn"]: matched_inn = set( queryset.filter(**{f"{inn_field}__in": identifiers["inn"]}) .values_list(inn_field, flat=True) .distinct() ) ogrn_identifiers = identifiers["ogrn"] | identifiers["ogrip"] if ogrn_identifiers: matched_ogrn = set( queryset.filter(**{f"{ogrn_field}__in": ogrn_identifiers}) .values_list(ogrn_field, flat=True) .distinct() ) return {"inn": matched_inn, "ogrn": matched_ogrn} @staticmethod def _identity_filter( identifiers: dict[str, set[str]], *, inn_field: str | None, ogrn_field: str, ) -> Q | None: identity_filter = Q() has_identity = False if inn_field is not None and identifiers["inn"]: identity_filter |= Q(**{f"{inn_field}__in": identifiers["inn"]}) has_identity = True ogrn_identifiers = identifiers["ogrn"] | identifiers["ogrip"] if ogrn_identifiers: identity_filter |= Q(**{f"{ogrn_field}__in": ogrn_identifiers}) has_identity = True if not has_identity: return None return identity_filter @staticmethod def _matching_identifiers_for_all( queryset, *, inn_field: str, ogrn_field: str, ) -> dict[str, set[str]]: matched_inn = set( queryset.exclude(**{inn_field: ""}) .values_list(inn_field, flat=True) .distinct() ) matched_ogrn = set( queryset.exclude(**{ogrn_field: ""}) .values_list(ogrn_field, flat=True) .distinct() ) return {"inn": matched_inn, "ogrn": matched_ogrn} @staticmethod def _build_registries( organizations: list[Organization], identifiers: dict[str, set[str]], ) -> dict[str, list[RegistrySummary]]: registries = {str(organization.uid): [] for organization in organizations} if not identifiers["inn"] and not identifiers["ogrn"]: return registries identity_filter = Q() if identifiers["inn"]: identity_filter |= Q(organization__mn_inn__in=identifiers["inn"]) if identifiers["ogrn"]: identity_filter |= Q(organization__mn_ogrn__in=identifiers["ogrn"]) memberships = ( RegistryMembershipPeriod.objects.filter(ended_at__isnull=True) .filter(identity_filter) .select_related("registry", "organization") .order_by("registry__name") ) membership_by_inn: dict[str, list[RegistrySummary]] = {} membership_by_ogrn: dict[str, list[RegistrySummary]] = {} for membership in memberships: summary = RegistrySummary( id=str(membership.registry_id), name=membership.registry.name, ) membership_by_inn.setdefault( str(membership.organization.mn_inn), [], ).append(summary) membership_by_ogrn.setdefault( str(membership.organization.mn_ogrn), [], ).append(summary) for organization in organizations: seen: set[str] = set() summaries = [] for summary in membership_by_inn.get( organization.inn, [] ) + membership_by_ogrn.get(organization.ogrn, []): if summary.id in seen: continue seen.add(summary.id) summaries.append(summary) registries[str(organization.uid)] = summaries return registries def _isoformat(value: date | datetime | None) -> str | None: if value is None: return None return value.isoformat().replace("+00:00", "Z") def _decimal_string(value: Any | None) -> str | None: if value is None: return None return str(value) def _financial_report_lines_by_year( record: FinancialReport, ) -> dict[str, dict[str, Any]]: lines_by_year: dict[str, dict[str, Any]] = {} for line in record.lines.all(): year = str(line.year) section = _financial_report_line_section(line) lines_by_year.setdefault(year, {}).setdefault(section, {})[line.line_code] = { "form_code": line.form_code, "name": line.line_name, "period_start": line.period_start, "period_end": line.period_end, } return lines_by_year def _financial_report_line_section(line: FinancialReportLine) -> str: if line.form_code != "1": return f"form_{line.form_code}" try: line_code = int(line.line_code) except ValueError: return "balance" if 1000 <= line_code < 1300 or line_code == 1600: return "active" if 1300 <= line_code < 1600 or line_code == 1700: return "passive" return "balance"