"""Backfill organization source extensions from legacy parser tables.""" from __future__ import annotations from collections.abc import Iterable from dataclasses import dataclass from datetime import date, datetime from decimal import Decimal from typing import Any from uuid import UUID from apps.parsers.models import ( FinancialReport, FinancialReportLine, GenericParserRecord, IndustrialCertificateRecord, IndustrialProductRecord, InspectionRecord, ManufacturerRecord, ParserLoadLog, ProcurementRecord, ) from django.db import transaction from django.db.models import Count, Max, Min, Model from organizations.models import ( Organization, OrganizationSourceExtension, OrganizationSourceFinancialLine, OrganizationSourceRecord, ) from organizations.name_normalization import normalize_organization_name from organizations.source_groups import ( SOURCE_GROUP_DESCRIPTORS, SourceGroupDescriptor, get_source_group_descriptor, ) from organizations.source_identity import normalize_identity_fields @dataclass(frozen=True) class OrganizationSourceBackfillResult: """Counters returned by organization source backfill.""" scanned: int = 0 created_organizations: int = 0 created_extensions: int = 0 updated_extensions: int = 0 created_records: int = 0 updated_records: int = 0 created_financial_lines: int = 0 updated_financial_lines: int = 0 unresolved: int = 0 def plus( self, other: OrganizationSourceBackfillResult ) -> OrganizationSourceBackfillResult: return OrganizationSourceBackfillResult( scanned=self.scanned + other.scanned, created_organizations=self.created_organizations + other.created_organizations, created_extensions=self.created_extensions + other.created_extensions, updated_extensions=self.updated_extensions + other.updated_extensions, created_records=self.created_records + other.created_records, updated_records=self.updated_records + other.updated_records, created_financial_lines=self.created_financial_lines + other.created_financial_lines, updated_financial_lines=self.updated_financial_lines + other.updated_financial_lines, unresolved=self.unresolved + other.unresolved, ) @dataclass(frozen=True) class LegacyRecordAdapter: """Normalized legacy source record data.""" source: str record_type: str external_id: str title: str organization_name: str inn: str kpp: str ogrn: str ogrip: str record_date: str amount: Decimal | None status: str url: str payload: dict[str, Any] legacy_model: str legacy_pk: str load_batch: int | None class OrganizationSourceBackfillService: """Build organization source extensions from current parser tables.""" @classmethod def backfill( cls, *, source: str | None = None, batch_id: int | None = None, ) -> OrganizationSourceBackfillResult: if source is None: result = OrganizationSourceBackfillResult() for descriptor_source in SOURCE_GROUP_DESCRIPTORS: result = result.plus( cls.backfill(source=descriptor_source, batch_id=batch_id) ) return result descriptor = get_source_group_descriptor(str(source)) adapters = cls._iter_adapters(descriptor, batch_id=batch_id) return cls._backfill_adapters(descriptor, adapters) @classmethod def _backfill_adapters( cls, descriptor: SourceGroupDescriptor, adapters: Iterable[LegacyRecordAdapter], ) -> OrganizationSourceBackfillResult: scanned = 0 created_organizations = 0 created_extensions = 0 updated_extensions = 0 created_records = 0 updated_records = 0 created_financial_lines = 0 updated_financial_lines = 0 unresolved = 0 touched_extension_ids: set[str] = set() with transaction.atomic(): for adapter in adapters: scanned += 1 ( organization, organization_created, ) = cls._resolve_or_create_organization(adapter) if organization is None: unresolved += 1 continue if organization_created: created_organizations += 1 ( extension, extension_created, ) = descriptor.extension_model.objects.get_or_create( organization=organization, defaults={ "source_group": descriptor.source_group, "title": descriptor.title, "last_load_batch": adapter.load_batch, }, ) if extension_created: created_extensions += 1 else: updated_extensions += cls._update_extension( extension, descriptor, adapter ) ( source_record, record_created, ) = OrganizationSourceRecord.objects.update_or_create( legacy_model=adapter.legacy_model, legacy_pk=adapter.legacy_pk, defaults={ "extension": extension, "record_type": adapter.record_type, "source": adapter.source, "external_id": adapter.external_id, "title": adapter.title, "record_date": adapter.record_date, "amount": adapter.amount, "status": adapter.status, "url": adapter.url, "payload": adapter.payload, "load_batch": adapter.load_batch, }, ) if record_created: created_records += 1 else: updated_records += 1 if adapter.source == ParserLoadLog.Source.FNS_REPORTS: line_result = cls._backfill_financial_lines( source_record, adapter.legacy_pk ) created_financial_lines += line_result[0] updated_financial_lines += line_result[1] touched_extension_ids.add(str(extension.uid)) cls._refresh_extension_counters(touched_extension_ids) return OrganizationSourceBackfillResult( scanned=scanned, created_organizations=created_organizations, created_extensions=created_extensions, updated_extensions=updated_extensions, created_records=created_records, updated_records=updated_records, created_financial_lines=created_financial_lines, updated_financial_lines=updated_financial_lines, unresolved=unresolved, ) @staticmethod def _update_extension( extension: OrganizationSourceExtension, descriptor: SourceGroupDescriptor, adapter: LegacyRecordAdapter, ) -> int: changed = False if extension.title != descriptor.title: extension.title = descriptor.title changed = True if ( adapter.load_batch is not None and extension.last_load_batch != adapter.load_batch ): extension.last_load_batch = adapter.load_batch changed = True if changed: extension.save(update_fields=["title", "last_load_batch", "updated_at"]) return 1 return 0 @classmethod def _iter_adapters( cls, descriptor: SourceGroupDescriptor, *, batch_id: int | None, ) -> Iterable[LegacyRecordAdapter]: source = str(descriptor.source) model_and_adapter = cls._legacy_model_adapter(source) if model_and_adapter is None: queryset = GenericParserRecord.objects.filter(source=source) adapter_factory = cls._generic_adapter else: model, adapter_factory = model_and_adapter queryset = model.objects.all() if batch_id is not None: queryset = queryset.filter(load_batch=batch_id) for record in queryset.iterator(): yield adapter_factory(record, descriptor) @classmethod def _legacy_model_adapter(cls, source: str): return { ParserLoadLog.Source.INSPECTIONS: ( InspectionRecord, cls._inspection_adapter, ), ParserLoadLog.Source.FNS_REPORTS: ( FinancialReport, cls._financial_report_adapter, ), ParserLoadLog.Source.PROCUREMENTS: ( ProcurementRecord, cls._procurement_adapter, ), ParserLoadLog.Source.INDUSTRIAL: ( IndustrialCertificateRecord, cls._industrial_certificate_adapter, ), ParserLoadLog.Source.INDUSTRIAL_PRODUCTS: ( IndustrialProductRecord, cls._industrial_product_adapter, ), ParserLoadLog.Source.MANUFACTURES: ( ManufacturerRecord, cls._manufacturer_adapter, ), }.get(source) @classmethod def _resolve_or_create_organization( cls, adapter: LegacyRecordAdapter, ) -> tuple[Organization | None, bool]: organization = cls._resolve_organization(adapter) if organization is not None: return organization, False name = adapter.organization_name or adapter.title or adapter.external_id if not name: return None, False organization = Organization.objects.create( name=name, inn=adapter.inn, kpp=adapter.kpp, ogrn=adapter.ogrn if len(adapter.ogrn) == 13 else "", ogrip=adapter.ogrip, ) return organization, True @classmethod def _resolve_organization(cls, adapter: LegacyRecordAdapter) -> Organization | None: for resolver in ( cls._resolve_by_inn_kpp, cls._resolve_by_ogrn_or_ogrip, cls._resolve_by_ogrip, cls._resolve_by_unique_inn, cls._resolve_by_exact_normalized_name, ): organization = resolver(adapter) if organization is not None: return organization return None @staticmethod def _resolve_by_inn_kpp(adapter: LegacyRecordAdapter) -> Organization | None: if not adapter.inn or not adapter.kpp: return None return Organization.objects.filter( inn=adapter.inn, kpp=adapter.kpp, ).first() @staticmethod def _resolve_by_ogrn_or_ogrip(adapter: LegacyRecordAdapter) -> Organization | None: if not adapter.ogrn: return None return ( Organization.objects.filter(ogrn=adapter.ogrn).first() or Organization.objects.filter(ogrip=adapter.ogrn).first() ) @staticmethod def _resolve_by_ogrip(adapter: LegacyRecordAdapter) -> Organization | None: if not adapter.ogrip: return None return Organization.objects.filter(ogrip=adapter.ogrip).first() @staticmethod def _resolve_by_unique_inn(adapter: LegacyRecordAdapter) -> Organization | None: if not adapter.inn: return None organizations = list(Organization.objects.filter(inn=adapter.inn)[:2]) return organizations[0] if len(organizations) == 1 else None @staticmethod def _resolve_by_exact_normalized_name( adapter: LegacyRecordAdapter, ) -> Organization | None: normalized_name = normalize_organization_name(adapter.organization_name) if not normalized_name: return None matches = list( Organization.objects.filter(name__iexact=adapter.organization_name)[:2] ) return matches[0] if len(matches) == 1 else None @classmethod def _backfill_financial_lines( cls, source_record: OrganizationSourceRecord, report_pk: str, ) -> tuple[int, int]: created = 0 updated = 0 for line in FinancialReportLine.objects.filter(report_id=report_pk).iterator(): _, was_created = OrganizationSourceFinancialLine.objects.update_or_create( source_record=source_record, form_code=line.form_code, line_code=line.line_code, year=line.year, defaults={ "line_name": line.line_name, "period_start": line.period_start, "period_end": line.period_end, }, ) if was_created: created += 1 else: updated += 1 return created, updated @staticmethod def _refresh_extension_counters(extension_ids: set[str]) -> None: for extension in OrganizationSourceExtension.objects.filter( uid__in=extension_ids ): aggregate = extension.records.aggregate( records_count=Count("uid"), first_seen_at=Min("created_at"), last_seen_at=Max("created_at"), last_load_batch=Max("load_batch"), ) extension.records_count = aggregate["records_count"] or 0 extension.first_seen_at = aggregate["first_seen_at"] extension.last_seen_at = aggregate["last_seen_at"] extension.last_load_batch = aggregate["last_load_batch"] extension.save( update_fields=[ "records_count", "first_seen_at", "last_seen_at", "last_load_batch", "updated_at", ] ) @classmethod def _inspection_adapter( cls, record: InspectionRecord, descriptor: SourceGroupDescriptor, ) -> LegacyRecordAdapter: return cls._adapter( record, descriptor, external_id=record.registration_number, title=record.organisation_name, organization_name=record.organisation_name, inn=record.inn, ogrn=record.ogrn, record_date=record.start_date, status=record.status, ) @classmethod def _financial_report_adapter( cls, record: FinancialReport, descriptor: SourceGroupDescriptor, ) -> LegacyRecordAdapter: return cls._adapter( record, descriptor, external_id=record.external_id, title=record.file_name, organization_name="", ogrn=record.ogrn, status=record.status, ) @classmethod def _procurement_adapter( cls, record: ProcurementRecord, descriptor: SourceGroupDescriptor, ) -> LegacyRecordAdapter: return cls._adapter( record, descriptor, external_id=record.purchase_number, title=record.purchase_name, organization_name=record.customer_name, inn=record.customer_inn, kpp=record.customer_kpp, ogrn=record.customer_ogrn, record_date=record.publish_date, amount=record.max_price_amount, status=record.status, url=record.href, ) @classmethod def _industrial_certificate_adapter( cls, record: IndustrialCertificateRecord, descriptor: SourceGroupDescriptor, ) -> LegacyRecordAdapter: return cls._adapter( record, descriptor, external_id=record.certificate_number, title=record.certificate_number, organization_name=record.organisation_name, inn=record.inn, ogrn=record.ogrn, record_date=record.issue_date, url=record.certificate_file_url, ) @classmethod def _industrial_product_adapter( cls, record: IndustrialProductRecord, descriptor: SourceGroupDescriptor, ) -> LegacyRecordAdapter: return cls._adapter( record, descriptor, external_id=record.registry_number, title=record.product_name, organization_name=record.full_organisation_name, inn=record.inn, ogrn=record.ogrn, ) @classmethod def _manufacturer_adapter( cls, record: ManufacturerRecord, descriptor: SourceGroupDescriptor, ) -> LegacyRecordAdapter: return cls._adapter( record, descriptor, external_id=record.inn, title=record.full_legal_name, organization_name=record.full_legal_name, inn=record.inn, ogrn=record.ogrn, ) @classmethod def _generic_adapter( cls, record: GenericParserRecord, descriptor: SourceGroupDescriptor, ) -> LegacyRecordAdapter: payload = cls._model_payload(record) if isinstance(record.payload, dict): payload.update(record.payload) return cls._adapter( record, descriptor, external_id=record.external_id, title=record.title, organization_name=record.organisation_name, inn=record.inn, ogrn=record.ogrn, record_date=record.record_date, amount=record.amount, status=record.status, url=record.url, payload=payload, ) @classmethod def _adapter( cls, record, descriptor: SourceGroupDescriptor, *, external_id: str, title: str, organization_name: str, inn: str = "", kpp: str = "", ogrn: str = "", ogrip: str = "", record_date: str = "", amount: Decimal | None = None, status: str = "", url: str = "", payload: dict[str, Any] | None = None, ) -> LegacyRecordAdapter: ( normalized_inn, normalized_kpp, normalized_ogrn, normalized_ogrip, ) = normalize_identity_fields( inn=inn, kpp=kpp, ogrn=ogrn, ogrip=ogrip, ) return LegacyRecordAdapter( source=str(descriptor.source), record_type=descriptor.record_type, external_id=str(external_id or ""), title=str(title or ""), organization_name=str(organization_name or ""), inn=normalized_inn, kpp=normalized_kpp, ogrn=normalized_ogrn, ogrip=normalized_ogrip, record_date=str(record_date or ""), amount=amount, status=str(status or ""), url=str(url or ""), payload=payload if payload is not None else cls._model_payload(record), legacy_model=cls._legacy_model_name(record), legacy_pk=str(record.pk), load_batch=getattr(record, "load_batch", None), ) @staticmethod def _legacy_model_name(record: Model) -> str: module = record.__class__.__module__.removesuffix(".models") return f"{module}.{record.__class__.__name__}" @staticmethod def _model_payload(record: Model) -> dict[str, Any]: payload = {"id": record.pk} for field in record._meta.concrete_fields: name = field.name value = ( getattr(record, field.attname) if field.is_relation and field.many_to_one else getattr(record, name) ) if name == "id": continue if isinstance(value, datetime | date): payload[name] = value.isoformat() elif isinstance(value, Decimal | UUID): payload[name] = str(value) else: payload[name] = value return payload