feat(organizations): migrate source storage to polymorphic records

This commit is contained in:
2026-05-19 10:23:53 +02:00
parent 19a7d5a91c
commit 4ca2fa25d5
44 changed files with 7129 additions and 1551 deletions

View File

@@ -0,0 +1,594 @@
"""Backfill organization source extensions from legacy parser tables."""
from __future__ import annotations
from collections.abc import Iterable
from dataclasses import dataclass
from datetime import date, datetime
from decimal import Decimal
from typing import Any
from uuid import UUID
from apps.parsers.models import (
FinancialReport,
FinancialReportLine,
GenericParserRecord,
IndustrialCertificateRecord,
IndustrialProductRecord,
InspectionRecord,
ManufacturerRecord,
ParserLoadLog,
ProcurementRecord,
)
from django.db import transaction
from django.db.models import Count, Max, Min, Model
from organizations.models import (
Organization,
OrganizationSourceExtension,
OrganizationSourceFinancialLine,
OrganizationSourceRecord,
)
from organizations.name_normalization import normalize_organization_name
from organizations.source_groups import (
SOURCE_GROUP_DESCRIPTORS,
SourceGroupDescriptor,
get_source_group_descriptor,
)
from organizations.source_identity import normalize_identity_fields
@dataclass(frozen=True)
class OrganizationSourceBackfillResult:
    """Immutable tally of what one organization source backfill run did."""

    # Adapters read from the legacy tables.
    scanned: int = 0
    # Rows created during the run, per target table.
    created_organizations: int = 0
    created_extensions: int = 0
    updated_extensions: int = 0
    created_records: int = 0
    updated_records: int = 0
    created_financial_lines: int = 0
    updated_financial_lines: int = 0
    # Adapters skipped because no organization could be resolved or created.
    unresolved: int = 0

    def plus(self, other: OrganizationSourceBackfillResult) -> OrganizationSourceBackfillResult:
        """Return a new result holding the field-wise sum of ``self`` and ``other``."""

        def combined(counter: str) -> int:
            # Both operands are frozen, so summing never mutates either of them.
            return getattr(self, counter) + getattr(other, counter)

        return OrganizationSourceBackfillResult(
            scanned=combined("scanned"),
            created_organizations=combined("created_organizations"),
            created_extensions=combined("created_extensions"),
            updated_extensions=combined("updated_extensions"),
            created_records=combined("created_records"),
            updated_records=combined("updated_records"),
            created_financial_lines=combined("created_financial_lines"),
            updated_financial_lines=combined("updated_financial_lines"),
            unresolved=combined("unresolved"),
        )
@dataclass(frozen=True)
class LegacyRecordAdapter:
    """One legacy parser row reduced to the fields the backfill needs.

    Instances are immutable; field order is part of the constructor contract.
    """

    # What kind of record this is and where it came from.
    source: str
    record_type: str
    external_id: str
    title: str

    # Identity hints used to match the row to an organization.
    organization_name: str
    inn: str
    kpp: str
    ogrn: str
    ogrip: str

    # Descriptive attributes copied onto the stored source record.
    record_date: str
    amount: Decimal | None
    status: str
    url: str
    payload: dict[str, Any]

    # Pointer back to the legacy row and the load batch it arrived in.
    legacy_model: str
    legacy_pk: str
    load_batch: int | None
class OrganizationSourceBackfillService:
"""Build organization source extensions from current parser tables."""
@classmethod
def backfill(
cls,
*,
source: str | None = None,
batch_id: int | None = None,
) -> OrganizationSourceBackfillResult:
if source is None:
result = OrganizationSourceBackfillResult()
for descriptor_source in SOURCE_GROUP_DESCRIPTORS:
result = result.plus(
cls.backfill(source=descriptor_source, batch_id=batch_id)
)
return result
descriptor = get_source_group_descriptor(str(source))
adapters = cls._iter_adapters(descriptor, batch_id=batch_id)
return cls._backfill_adapters(descriptor, adapters)
@classmethod
def _backfill_adapters(
cls,
descriptor: SourceGroupDescriptor,
adapters: Iterable[LegacyRecordAdapter],
) -> OrganizationSourceBackfillResult:
scanned = 0
created_organizations = 0
created_extensions = 0
updated_extensions = 0
created_records = 0
updated_records = 0
created_financial_lines = 0
updated_financial_lines = 0
unresolved = 0
touched_extension_ids: set[str] = set()
with transaction.atomic():
for adapter in adapters:
scanned += 1
organization, organization_created = cls._resolve_or_create_organization(adapter)
if organization is None:
unresolved += 1
continue
if organization_created:
created_organizations += 1
extension, extension_created = descriptor.extension_model.objects.get_or_create(
organization=organization,
defaults={
"source_group": descriptor.source_group,
"title": descriptor.title,
"last_load_batch": adapter.load_batch,
},
)
if extension_created:
created_extensions += 1
else:
updated_extensions += cls._update_extension(extension, descriptor, adapter)
source_record, record_created = OrganizationSourceRecord.objects.update_or_create(
legacy_model=adapter.legacy_model,
legacy_pk=adapter.legacy_pk,
defaults={
"extension": extension,
"record_type": adapter.record_type,
"source": adapter.source,
"external_id": adapter.external_id,
"title": adapter.title,
"record_date": adapter.record_date,
"amount": adapter.amount,
"status": adapter.status,
"url": adapter.url,
"payload": adapter.payload,
"load_batch": adapter.load_batch,
},
)
if record_created:
created_records += 1
else:
updated_records += 1
if adapter.source == ParserLoadLog.Source.FNS_REPORTS:
line_result = cls._backfill_financial_lines(source_record, adapter.legacy_pk)
created_financial_lines += line_result[0]
updated_financial_lines += line_result[1]
touched_extension_ids.add(str(extension.uid))
cls._refresh_extension_counters(touched_extension_ids)
return OrganizationSourceBackfillResult(
scanned=scanned,
created_organizations=created_organizations,
created_extensions=created_extensions,
updated_extensions=updated_extensions,
created_records=created_records,
updated_records=updated_records,
created_financial_lines=created_financial_lines,
updated_financial_lines=updated_financial_lines,
unresolved=unresolved,
)
@staticmethod
def _update_extension(
extension: OrganizationSourceExtension,
descriptor: SourceGroupDescriptor,
adapter: LegacyRecordAdapter,
) -> int:
changed = False
if extension.title != descriptor.title:
extension.title = descriptor.title
changed = True
if adapter.load_batch is not None and extension.last_load_batch != adapter.load_batch:
extension.last_load_batch = adapter.load_batch
changed = True
if changed:
extension.save(update_fields=["title", "last_load_batch", "updated_at"])
return 1
return 0
@classmethod
def _iter_adapters(
cls,
descriptor: SourceGroupDescriptor,
*,
batch_id: int | None,
) -> Iterable[LegacyRecordAdapter]:
source = str(descriptor.source)
model_and_adapter = cls._legacy_model_adapter(source)
if model_and_adapter is None:
queryset = GenericParserRecord.objects.filter(source=source)
adapter_factory = cls._generic_adapter
else:
model, adapter_factory = model_and_adapter
queryset = model.objects.all()
if batch_id is not None:
queryset = queryset.filter(load_batch=batch_id)
for record in queryset.iterator():
yield adapter_factory(record, descriptor)
@classmethod
def _legacy_model_adapter(cls, source: str):
return {
ParserLoadLog.Source.INSPECTIONS: (
InspectionRecord,
cls._inspection_adapter,
),
ParserLoadLog.Source.FNS_REPORTS: (
FinancialReport,
cls._financial_report_adapter,
),
ParserLoadLog.Source.PROCUREMENTS: (
ProcurementRecord,
cls._procurement_adapter,
),
ParserLoadLog.Source.INDUSTRIAL: (
IndustrialCertificateRecord,
cls._industrial_certificate_adapter,
),
ParserLoadLog.Source.INDUSTRIAL_PRODUCTS: (
IndustrialProductRecord,
cls._industrial_product_adapter,
),
ParserLoadLog.Source.MANUFACTURES: (
ManufacturerRecord,
cls._manufacturer_adapter,
),
}.get(source)
@classmethod
def _resolve_or_create_organization(
cls,
adapter: LegacyRecordAdapter,
) -> tuple[Organization | None, bool]:
organization = cls._resolve_organization(adapter)
if organization is not None:
return organization, False
name = adapter.organization_name or adapter.title or adapter.external_id
if not name:
return None, False
organization = Organization.objects.create(
name=name,
inn=adapter.inn,
kpp=adapter.kpp,
ogrn=adapter.ogrn if len(adapter.ogrn) == 13 else "",
ogrip=adapter.ogrip,
)
return organization, True
@classmethod
def _resolve_organization(cls, adapter: LegacyRecordAdapter) -> Organization | None:
for resolver in (
cls._resolve_by_inn_kpp,
cls._resolve_by_ogrn_or_ogrip,
cls._resolve_by_ogrip,
cls._resolve_by_unique_inn,
cls._resolve_by_exact_normalized_name,
):
organization = resolver(adapter)
if organization is not None:
return organization
return None
@staticmethod
def _resolve_by_inn_kpp(adapter: LegacyRecordAdapter) -> Organization | None:
if not adapter.inn or not adapter.kpp:
return None
return Organization.objects.filter(
inn=adapter.inn,
kpp=adapter.kpp,
).first()
@staticmethod
def _resolve_by_ogrn_or_ogrip(adapter: LegacyRecordAdapter) -> Organization | None:
if not adapter.ogrn:
return None
return (
Organization.objects.filter(ogrn=adapter.ogrn).first()
or Organization.objects.filter(ogrip=adapter.ogrn).first()
)
@staticmethod
def _resolve_by_ogrip(adapter: LegacyRecordAdapter) -> Organization | None:
if not adapter.ogrip:
return None
return Organization.objects.filter(ogrip=adapter.ogrip).first()
@staticmethod
def _resolve_by_unique_inn(adapter: LegacyRecordAdapter) -> Organization | None:
if not adapter.inn:
return None
organizations = list(Organization.objects.filter(inn=adapter.inn)[:2])
return organizations[0] if len(organizations) == 1 else None
@staticmethod
def _resolve_by_exact_normalized_name(
adapter: LegacyRecordAdapter,
) -> Organization | None:
normalized_name = normalize_organization_name(adapter.organization_name)
if not normalized_name:
return None
matches = list(
Organization.objects.filter(name__iexact=adapter.organization_name)[:2]
)
return matches[0] if len(matches) == 1 else None
@classmethod
def _backfill_financial_lines(
cls,
source_record: OrganizationSourceRecord,
report_pk: str,
) -> tuple[int, int]:
created = 0
updated = 0
for line in FinancialReportLine.objects.filter(report_id=report_pk).iterator():
_, was_created = OrganizationSourceFinancialLine.objects.update_or_create(
source_record=source_record,
form_code=line.form_code,
line_code=line.line_code,
year=line.year,
defaults={
"line_name": line.line_name,
"period_start": line.period_start,
"period_end": line.period_end,
},
)
if was_created:
created += 1
else:
updated += 1
return created, updated
@staticmethod
def _refresh_extension_counters(extension_ids: set[str]) -> None:
for extension in OrganizationSourceExtension.objects.filter(uid__in=extension_ids):
aggregate = extension.records.aggregate(
records_count=Count("uid"),
first_seen_at=Min("created_at"),
last_seen_at=Max("created_at"),
last_load_batch=Max("load_batch"),
)
extension.records_count = aggregate["records_count"] or 0
extension.first_seen_at = aggregate["first_seen_at"]
extension.last_seen_at = aggregate["last_seen_at"]
extension.last_load_batch = aggregate["last_load_batch"]
extension.save(
update_fields=[
"records_count",
"first_seen_at",
"last_seen_at",
"last_load_batch",
"updated_at",
]
)
@classmethod
def _inspection_adapter(
cls,
record: InspectionRecord,
descriptor: SourceGroupDescriptor,
) -> LegacyRecordAdapter:
return cls._adapter(
record,
descriptor,
external_id=record.registration_number,
title=record.organisation_name,
organization_name=record.organisation_name,
inn=record.inn,
ogrn=record.ogrn,
record_date=record.start_date,
status=record.status,
)
@classmethod
def _financial_report_adapter(
cls,
record: FinancialReport,
descriptor: SourceGroupDescriptor,
) -> LegacyRecordAdapter:
return cls._adapter(
record,
descriptor,
external_id=record.external_id,
title=record.file_name,
organization_name="",
ogrn=record.ogrn,
status=record.status,
)
@classmethod
def _procurement_adapter(
cls,
record: ProcurementRecord,
descriptor: SourceGroupDescriptor,
) -> LegacyRecordAdapter:
return cls._adapter(
record,
descriptor,
external_id=record.purchase_number,
title=record.purchase_name,
organization_name=record.customer_name,
inn=record.customer_inn,
kpp=record.customer_kpp,
ogrn=record.customer_ogrn,
record_date=record.publish_date,
amount=record.max_price_amount,
status=record.status,
url=record.href,
)
@classmethod
def _industrial_certificate_adapter(
cls,
record: IndustrialCertificateRecord,
descriptor: SourceGroupDescriptor,
) -> LegacyRecordAdapter:
return cls._adapter(
record,
descriptor,
external_id=record.certificate_number,
title=record.certificate_number,
organization_name=record.organisation_name,
inn=record.inn,
ogrn=record.ogrn,
record_date=record.issue_date,
url=record.certificate_file_url,
)
@classmethod
def _industrial_product_adapter(
cls,
record: IndustrialProductRecord,
descriptor: SourceGroupDescriptor,
) -> LegacyRecordAdapter:
return cls._adapter(
record,
descriptor,
external_id=record.registry_number,
title=record.product_name,
organization_name=record.full_organisation_name,
inn=record.inn,
ogrn=record.ogrn,
)
@classmethod
def _manufacturer_adapter(
cls,
record: ManufacturerRecord,
descriptor: SourceGroupDescriptor,
) -> LegacyRecordAdapter:
return cls._adapter(
record,
descriptor,
external_id=record.inn,
title=record.full_legal_name,
organization_name=record.full_legal_name,
inn=record.inn,
ogrn=record.ogrn,
)
@classmethod
def _generic_adapter(
cls,
record: GenericParserRecord,
descriptor: SourceGroupDescriptor,
) -> LegacyRecordAdapter:
payload = cls._model_payload(record)
if isinstance(record.payload, dict):
payload.update(record.payload)
return cls._adapter(
record,
descriptor,
external_id=record.external_id,
title=record.title,
organization_name=record.organisation_name,
inn=record.inn,
ogrn=record.ogrn,
record_date=record.record_date,
amount=record.amount,
status=record.status,
url=record.url,
payload=payload,
)
@classmethod
def _adapter(
cls,
record,
descriptor: SourceGroupDescriptor,
*,
external_id: str,
title: str,
organization_name: str,
inn: str = "",
kpp: str = "",
ogrn: str = "",
ogrip: str = "",
record_date: str = "",
amount: Decimal | None = None,
status: str = "",
url: str = "",
payload: dict[str, Any] | None = None,
) -> LegacyRecordAdapter:
normalized_inn, normalized_kpp, normalized_ogrn, normalized_ogrip = (
normalize_identity_fields(
inn=inn,
kpp=kpp,
ogrn=ogrn,
ogrip=ogrip,
)
)
return LegacyRecordAdapter(
source=str(descriptor.source),
record_type=descriptor.record_type,
external_id=str(external_id or ""),
title=str(title or ""),
organization_name=str(organization_name or ""),
inn=normalized_inn,
kpp=normalized_kpp,
ogrn=normalized_ogrn,
ogrip=normalized_ogrip,
record_date=str(record_date or ""),
amount=amount,
status=str(status or ""),
url=str(url or ""),
payload=payload if payload is not None else cls._model_payload(record),
legacy_model=cls._legacy_model_name(record),
legacy_pk=str(record.pk),
load_batch=getattr(record, "load_batch", None),
)
@staticmethod
def _legacy_model_name(record: Model) -> str:
module = record.__class__.__module__.removesuffix(".models")
return f"{module}.{record.__class__.__name__}"
@staticmethod
def _model_payload(record: Model) -> dict[str, Any]:
payload = {"id": record.pk}
for field in record._meta.concrete_fields:
name = field.name
value = (
getattr(record, field.attname)
if field.is_relation and field.many_to_one
else getattr(record, name)
)
if name == "id":
continue
if isinstance(value, datetime | date):
payload[name] = value.isoformat()
elif isinstance(value, Decimal | UUID):
payload[name] = str(value)
else:
payload[name] = value
return payload