feat(organizations): migrate source storage to polymorphic records
This commit is contained in:
594
src/organizations/source_backfill.py
Normal file
594
src/organizations/source_backfill.py
Normal file
@@ -0,0 +1,594 @@
|
||||
"""Backfill organization source extensions from legacy parser tables."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Iterable
|
||||
from dataclasses import dataclass
|
||||
from datetime import date, datetime
|
||||
from decimal import Decimal
|
||||
from typing import Any
|
||||
from uuid import UUID
|
||||
|
||||
from apps.parsers.models import (
|
||||
FinancialReport,
|
||||
FinancialReportLine,
|
||||
GenericParserRecord,
|
||||
IndustrialCertificateRecord,
|
||||
IndustrialProductRecord,
|
||||
InspectionRecord,
|
||||
ManufacturerRecord,
|
||||
ParserLoadLog,
|
||||
ProcurementRecord,
|
||||
)
|
||||
from django.db import transaction
|
||||
from django.db.models import Count, Max, Min, Model
|
||||
|
||||
from organizations.models import (
|
||||
Organization,
|
||||
OrganizationSourceExtension,
|
||||
OrganizationSourceFinancialLine,
|
||||
OrganizationSourceRecord,
|
||||
)
|
||||
from organizations.name_normalization import normalize_organization_name
|
||||
from organizations.source_groups import (
|
||||
SOURCE_GROUP_DESCRIPTORS,
|
||||
SourceGroupDescriptor,
|
||||
get_source_group_descriptor,
|
||||
)
|
||||
from organizations.source_identity import normalize_identity_fields
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class OrganizationSourceBackfillResult:
    """Counters returned by organization source backfill.

    Instances are immutable; combine two results with :meth:`plus`.
    Every field is an integer counter that defaults to zero.
    """

    # Legacy records iterated, whether or not an organization was resolved.
    scanned: int = 0
    # Organizations that had to be created because no existing one matched.
    created_organizations: int = 0
    # Source extensions created / modified during the run.
    created_extensions: int = 0
    updated_extensions: int = 0
    # Source records created / refreshed by ``update_or_create``.
    created_records: int = 0
    updated_records: int = 0
    # Financial lines created / refreshed by ``update_or_create``.
    created_financial_lines: int = 0
    updated_financial_lines: int = 0
    # Legacy records skipped because no organization could be resolved or created.
    unresolved: int = 0

    def plus(self, other: OrganizationSourceBackfillResult) -> OrganizationSourceBackfillResult:
        """Return a new result whose counters are the field-wise sums.

        The sum is driven by the dataclass field list rather than a
        hand-written enumeration, so a counter added to this class later is
        included automatically instead of being silently dropped.

        Args:
            other: The result to add to this one.

        Returns:
            A new ``OrganizationSourceBackfillResult``; neither operand is
            modified.
        """
        # Function-scope import keeps this block self-contained.
        from dataclasses import fields

        return OrganizationSourceBackfillResult(
            **{
                counter.name: getattr(self, counter.name) + getattr(other, counter.name)
                for counter in fields(self)
            }
        )
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class LegacyRecordAdapter:
    """Normalized legacy source record data.

    An immutable, model-agnostic snapshot of one row from a legacy parser
    table, carrying the values the backfill needs to resolve an organization
    and write a source record.
    """

    # --- Where the record came from -------------------------------------
    source: str
    record_type: str
    external_id: str
    title: str

    # --- Organization identity used for matching ------------------------
    organization_name: str
    inn: str
    kpp: str
    ogrn: str
    ogrip: str

    # --- Record attributes copied onto the source record ----------------
    record_date: str
    amount: Decimal | None
    status: str
    url: str
    payload: dict[str, Any]

    # --- Back-reference to the legacy row -------------------------------
    legacy_model: str
    legacy_pk: str
    load_batch: int | None
|
||||
|
||||
|
||||
class OrganizationSourceBackfillService:
    """Build organization source extensions from current parser tables.

    For every legacy parser row the service:

    1. converts the row into a :class:`LegacyRecordAdapter`;
    2. resolves (or creates) the owning ``Organization``;
    3. gets or creates the source-group extension for that organization;
    4. upserts an ``OrganizationSourceRecord`` keyed by the legacy model
       name and primary key;
    5. for FNS reports, also upserts the report's financial lines;
    6. finally recomputes the aggregate counters of every touched extension.
    """

    # Identifier lengths used to tell a legal-entity OGRN from an
    # individual-entrepreneur OGRNIP when both arrive in the same slot.
    _OGRN_LENGTH = 13
    _OGRNIP_LENGTH = 15

    @classmethod
    def backfill(
        cls,
        *,
        source: str | None = None,
        batch_id: int | None = None,
    ) -> OrganizationSourceBackfillResult:
        """Backfill one source, or every known source when ``source`` is None.

        Args:
            source: Source identifier to backfill. ``None`` iterates over
                ``SOURCE_GROUP_DESCRIPTORS`` and backfills each entry in turn.
            batch_id: When given, only legacy rows whose ``load_batch`` equals
                this value are processed.

        Returns:
            Counters for the run; summed across sources when ``source`` is None.
        """
        if source is None:
            result = OrganizationSourceBackfillResult()
            for descriptor_source in SOURCE_GROUP_DESCRIPTORS:
                result = result.plus(
                    cls.backfill(source=descriptor_source, batch_id=batch_id)
                )
            return result

        descriptor = get_source_group_descriptor(str(source))
        adapters = cls._iter_adapters(descriptor, batch_id=batch_id)
        return cls._backfill_adapters(descriptor, adapters)

    @classmethod
    def _backfill_adapters(
        cls,
        descriptor: SourceGroupDescriptor,
        adapters: Iterable[LegacyRecordAdapter],
    ) -> OrganizationSourceBackfillResult:
        """Write extensions and source records for ``adapters`` atomically.

        Args:
            descriptor: Source-group descriptor supplying the extension model,
                source group and title.
            adapters: Normalized legacy records to process.

        Returns:
            Counters describing what was scanned, created, updated or skipped.
        """
        scanned = 0
        created_organizations = 0
        created_extensions = 0
        updated_extensions = 0
        created_records = 0
        updated_records = 0
        created_financial_lines = 0
        updated_financial_lines = 0
        unresolved = 0
        touched_extension_ids: set[str] = set()

        # One transaction per source: either the whole source is backfilled
        # (including the refreshed counters) or nothing is.
        with transaction.atomic():
            for adapter in adapters:
                scanned += 1
                organization, organization_created = cls._resolve_or_create_organization(adapter)
                if organization is None:
                    unresolved += 1
                    continue
                if organization_created:
                    created_organizations += 1

                extension, extension_created = descriptor.extension_model.objects.get_or_create(
                    organization=organization,
                    defaults={
                        "source_group": descriptor.source_group,
                        "title": descriptor.title,
                        "last_load_batch": adapter.load_batch,
                    },
                )
                if extension_created:
                    created_extensions += 1
                else:
                    updated_extensions += cls._update_extension(extension, descriptor, adapter)

                # The legacy model name + pk pair identifies the source record,
                # which makes re-running the backfill an update, not a duplicate.
                source_record, record_created = OrganizationSourceRecord.objects.update_or_create(
                    legacy_model=adapter.legacy_model,
                    legacy_pk=adapter.legacy_pk,
                    defaults={
                        "extension": extension,
                        "record_type": adapter.record_type,
                        "source": adapter.source,
                        "external_id": adapter.external_id,
                        "title": adapter.title,
                        "record_date": adapter.record_date,
                        "amount": adapter.amount,
                        "status": adapter.status,
                        "url": adapter.url,
                        "payload": adapter.payload,
                        "load_batch": adapter.load_batch,
                    },
                )
                if record_created:
                    created_records += 1
                else:
                    updated_records += 1

                # Only FNS financial reports carry child report lines.
                if adapter.source == ParserLoadLog.Source.FNS_REPORTS:
                    line_result = cls._backfill_financial_lines(source_record, adapter.legacy_pk)
                    created_financial_lines += line_result[0]
                    updated_financial_lines += line_result[1]

                touched_extension_ids.add(str(extension.uid))

            cls._refresh_extension_counters(touched_extension_ids)

        return OrganizationSourceBackfillResult(
            scanned=scanned,
            created_organizations=created_organizations,
            created_extensions=created_extensions,
            updated_extensions=updated_extensions,
            created_records=created_records,
            updated_records=updated_records,
            created_financial_lines=created_financial_lines,
            updated_financial_lines=updated_financial_lines,
            unresolved=unresolved,
        )

    @staticmethod
    def _update_extension(
        extension: OrganizationSourceExtension,
        descriptor: SourceGroupDescriptor,
        adapter: LegacyRecordAdapter,
    ) -> int:
        """Sync an existing extension's title and last load batch.

        Args:
            extension: The already-persisted extension.
            descriptor: Descriptor whose ``title`` is authoritative.
            adapter: Record whose ``load_batch`` (if not None) becomes the
                extension's ``last_load_batch``.

        Returns:
            ``1`` if the extension was changed and saved, otherwise ``0``.
        """
        changed = False
        if extension.title != descriptor.title:
            extension.title = descriptor.title
            changed = True
        # A record without a batch never clears an existing batch reference.
        if adapter.load_batch is not None and extension.last_load_batch != adapter.load_batch:
            extension.last_load_batch = adapter.load_batch
            changed = True
        if changed:
            extension.save(update_fields=["title", "last_load_batch", "updated_at"])
            return 1
        return 0

    @classmethod
    def _iter_adapters(
        cls,
        descriptor: SourceGroupDescriptor,
        *,
        batch_id: int | None,
    ) -> Iterable[LegacyRecordAdapter]:
        """Yield adapters for every legacy row belonging to ``descriptor``.

        Sources with a dedicated legacy table read that whole table; any other
        source falls back to ``GenericParserRecord`` rows filtered by source.

        Args:
            descriptor: Source-group descriptor to read rows for.
            batch_id: Optional ``load_batch`` filter applied to the queryset.

        Yields:
            One :class:`LegacyRecordAdapter` per legacy row.
        """
        source = str(descriptor.source)
        model_and_adapter = cls._legacy_model_adapter(source)
        if model_and_adapter is None:
            queryset = GenericParserRecord.objects.filter(source=source)
            adapter_factory = cls._generic_adapter
        else:
            model, adapter_factory = model_and_adapter
            queryset = model.objects.all()

        if batch_id is not None:
            queryset = queryset.filter(load_batch=batch_id)

        # ``iterator()`` streams rows instead of caching the full result set.
        for record in queryset.iterator():
            yield adapter_factory(record, descriptor)

    @classmethod
    def _legacy_model_adapter(cls, source: str) -> tuple[type[Model], Any] | None:
        """Return the ``(legacy model, adapter factory)`` pair for ``source``.

        Args:
            source: Source identifier as a string.
                NOTE(review): the lookup relies on ``ParserLoadLog.Source``
                members hashing/comparing equal to their string values
                (as Django ``TextChoices`` do) -- confirm.

        Returns:
            The pair for a source with a dedicated legacy table, or ``None``
            when the source should be read from ``GenericParserRecord``.
        """
        return {
            ParserLoadLog.Source.INSPECTIONS: (
                InspectionRecord,
                cls._inspection_adapter,
            ),
            ParserLoadLog.Source.FNS_REPORTS: (
                FinancialReport,
                cls._financial_report_adapter,
            ),
            ParserLoadLog.Source.PROCUREMENTS: (
                ProcurementRecord,
                cls._procurement_adapter,
            ),
            ParserLoadLog.Source.INDUSTRIAL: (
                IndustrialCertificateRecord,
                cls._industrial_certificate_adapter,
            ),
            ParserLoadLog.Source.INDUSTRIAL_PRODUCTS: (
                IndustrialProductRecord,
                cls._industrial_product_adapter,
            ),
            ParserLoadLog.Source.MANUFACTURES: (
                ManufacturerRecord,
                cls._manufacturer_adapter,
            ),
        }.get(source)

    @classmethod
    def _resolve_or_create_organization(
        cls,
        adapter: LegacyRecordAdapter,
    ) -> tuple[Organization | None, bool]:
        """Find the organization for ``adapter`` or create a new one.

        Args:
            adapter: Normalized legacy record.

        Returns:
            ``(organization, created)``. ``organization`` is ``None`` (and
            ``created`` is ``False``) when nothing matched and the record has
            no name, title or external id to name a new organization with.
        """
        organization = cls._resolve_organization(adapter)
        if organization is not None:
            return organization, False

        name = adapter.organization_name or adapter.title or adapter.external_id
        if not name:
            return None, False

        # Only a 13-character value is stored as a legal-entity OGRN.
        ogrn = adapter.ogrn if len(adapter.ogrn) == cls._OGRN_LENGTH else ""
        # ``_resolve_by_ogrn_or_ogrip`` looks the ``ogrn`` slot up in
        # ``Organization.ogrip`` as well, so a 15-character OGRNIP arriving
        # there must be kept on creation. Dropping it would make every later
        # record identified only by that OGRNIP create a duplicate organization.
        ogrip = adapter.ogrip
        if not ogrip and len(adapter.ogrn) == cls._OGRNIP_LENGTH:
            ogrip = adapter.ogrn

        organization = Organization.objects.create(
            name=name,
            inn=adapter.inn,
            kpp=adapter.kpp,
            ogrn=ogrn,
            ogrip=ogrip,
        )
        return organization, True

    @classmethod
    def _resolve_organization(cls, adapter: LegacyRecordAdapter) -> Organization | None:
        """Return the first organization matched by the resolver chain.

        Resolvers run from the most specific identifier to the least:
        INN+KPP, OGRN (also tried as OGRNIP), OGRNIP, unambiguous INN, and
        finally an unambiguous case-insensitive name match.

        Args:
            adapter: Normalized legacy record.

        Returns:
            The matched organization, or ``None`` if no resolver matched.
        """
        for resolver in (
            cls._resolve_by_inn_kpp,
            cls._resolve_by_ogrn_or_ogrip,
            cls._resolve_by_ogrip,
            cls._resolve_by_unique_inn,
            cls._resolve_by_exact_normalized_name,
        ):
            organization = resolver(adapter)
            if organization is not None:
                return organization

        return None

    @staticmethod
    def _resolve_by_inn_kpp(adapter: LegacyRecordAdapter) -> Organization | None:
        """Match on the INN + KPP pair; requires both to be non-empty."""
        if not adapter.inn or not adapter.kpp:
            return None
        return Organization.objects.filter(
            inn=adapter.inn,
            kpp=adapter.kpp,
        ).first()

    @staticmethod
    def _resolve_by_ogrn_or_ogrip(adapter: LegacyRecordAdapter) -> Organization | None:
        """Match the record's ``ogrn`` against ``ogrn`` first, then ``ogrip``."""
        if not adapter.ogrn:
            return None
        return (
            Organization.objects.filter(ogrn=adapter.ogrn).first()
            or Organization.objects.filter(ogrip=adapter.ogrn).first()
        )

    @staticmethod
    def _resolve_by_ogrip(adapter: LegacyRecordAdapter) -> Organization | None:
        """Match on the record's ``ogrip`` value when present."""
        if not adapter.ogrip:
            return None
        return Organization.objects.filter(ogrip=adapter.ogrip).first()

    @staticmethod
    def _resolve_by_unique_inn(adapter: LegacyRecordAdapter) -> Organization | None:
        """Match on INN alone, but only when exactly one organization has it."""
        if not adapter.inn:
            return None
        # Fetching at most two rows is enough to tell "unique" from "ambiguous".
        organizations = list(Organization.objects.filter(inn=adapter.inn)[:2])
        return organizations[0] if len(organizations) == 1 else None

    @staticmethod
    def _resolve_by_exact_normalized_name(
        adapter: LegacyRecordAdapter,
    ) -> Organization | None:
        """Match on organization name when exactly one organization has it.

        The normalized name is used only to reject names that normalize to
        nothing; the query itself is a case-insensitive match on the raw name.
        """
        normalized_name = normalize_organization_name(adapter.organization_name)
        if not normalized_name:
            return None
        # NOTE(review): the lookup compares the raw name, not
        # ``normalized_name`` -- confirm whether a normalized-name column
        # should be queried here instead.
        matches = list(
            Organization.objects.filter(name__iexact=adapter.organization_name)[:2]
        )
        return matches[0] if len(matches) == 1 else None

    @classmethod
    def _backfill_financial_lines(
        cls,
        source_record: OrganizationSourceRecord,
        report_pk: str,
    ) -> tuple[int, int]:
        """Upsert the financial lines of one legacy report.

        Lines are keyed by source record, form code, line code and year.

        Args:
            source_record: The source record the lines belong to.
            report_pk: Primary key of the legacy ``FinancialReport``.

        Returns:
            ``(created, updated)`` line counts.
        """
        created = 0
        updated = 0
        for line in FinancialReportLine.objects.filter(report_id=report_pk).iterator():
            # NOTE(review): only the line name and period bounds are copied;
            # whether the legacy line has value columns that should also be
            # carried over is not visible here -- confirm.
            _, was_created = OrganizationSourceFinancialLine.objects.update_or_create(
                source_record=source_record,
                form_code=line.form_code,
                line_code=line.line_code,
                year=line.year,
                defaults={
                    "line_name": line.line_name,
                    "period_start": line.period_start,
                    "period_end": line.period_end,
                },
            )
            if was_created:
                created += 1
            else:
                updated += 1
        return created, updated

    @staticmethod
    def _refresh_extension_counters(extension_ids: set[str]) -> None:
        """Recompute aggregate fields on the given extensions from their records.

        Sets ``records_count``, ``first_seen_at``, ``last_seen_at`` and
        ``last_load_batch`` from the extension's related records and saves
        those fields together with ``updated_at``.

        Args:
            extension_ids: ``uid`` values (as strings) of extensions to refresh.
        """
        for extension in OrganizationSourceExtension.objects.filter(uid__in=extension_ids):
            aggregate = extension.records.aggregate(
                records_count=Count("uid"),
                first_seen_at=Min("created_at"),
                last_seen_at=Max("created_at"),
                last_load_batch=Max("load_batch"),
            )
            extension.records_count = aggregate["records_count"] or 0
            extension.first_seen_at = aggregate["first_seen_at"]
            extension.last_seen_at = aggregate["last_seen_at"]
            extension.last_load_batch = aggregate["last_load_batch"]
            extension.save(
                update_fields=[
                    "records_count",
                    "first_seen_at",
                    "last_seen_at",
                    "last_load_batch",
                    "updated_at",
                ]
            )

    @classmethod
    def _inspection_adapter(
        cls,
        record: InspectionRecord,
        descriptor: SourceGroupDescriptor,
    ) -> LegacyRecordAdapter:
        """Adapt an ``InspectionRecord``; the organisation name doubles as title."""
        return cls._adapter(
            record,
            descriptor,
            external_id=record.registration_number,
            title=record.organisation_name,
            organization_name=record.organisation_name,
            inn=record.inn,
            ogrn=record.ogrn,
            record_date=record.start_date,
            status=record.status,
        )

    @classmethod
    def _financial_report_adapter(
        cls,
        record: FinancialReport,
        descriptor: SourceGroupDescriptor,
    ) -> LegacyRecordAdapter:
        """Adapt a ``FinancialReport``; only ``ogrn`` identifies the organization."""
        return cls._adapter(
            record,
            descriptor,
            external_id=record.external_id,
            title=record.file_name,
            organization_name="",
            ogrn=record.ogrn,
            status=record.status,
        )

    @classmethod
    def _procurement_adapter(
        cls,
        record: ProcurementRecord,
        descriptor: SourceGroupDescriptor,
    ) -> LegacyRecordAdapter:
        """Adapt a ``ProcurementRecord`` using the customer's identity fields."""
        return cls._adapter(
            record,
            descriptor,
            external_id=record.purchase_number,
            title=record.purchase_name,
            organization_name=record.customer_name,
            inn=record.customer_inn,
            kpp=record.customer_kpp,
            ogrn=record.customer_ogrn,
            record_date=record.publish_date,
            amount=record.max_price_amount,
            status=record.status,
            url=record.href,
        )

    @classmethod
    def _industrial_certificate_adapter(
        cls,
        record: IndustrialCertificateRecord,
        descriptor: SourceGroupDescriptor,
    ) -> LegacyRecordAdapter:
        """Adapt an ``IndustrialCertificateRecord``; the number is id and title."""
        return cls._adapter(
            record,
            descriptor,
            external_id=record.certificate_number,
            title=record.certificate_number,
            organization_name=record.organisation_name,
            inn=record.inn,
            ogrn=record.ogrn,
            record_date=record.issue_date,
            url=record.certificate_file_url,
        )

    @classmethod
    def _industrial_product_adapter(
        cls,
        record: IndustrialProductRecord,
        descriptor: SourceGroupDescriptor,
    ) -> LegacyRecordAdapter:
        """Adapt an ``IndustrialProductRecord`` keyed by its registry number."""
        return cls._adapter(
            record,
            descriptor,
            external_id=record.registry_number,
            title=record.product_name,
            organization_name=record.full_organisation_name,
            inn=record.inn,
            ogrn=record.ogrn,
        )

    @classmethod
    def _manufacturer_adapter(
        cls,
        record: ManufacturerRecord,
        descriptor: SourceGroupDescriptor,
    ) -> LegacyRecordAdapter:
        """Adapt a ``ManufacturerRecord``; the INN is used as the external id."""
        return cls._adapter(
            record,
            descriptor,
            external_id=record.inn,
            title=record.full_legal_name,
            organization_name=record.full_legal_name,
            inn=record.inn,
            ogrn=record.ogrn,
        )

    @classmethod
    def _generic_adapter(
        cls,
        record: GenericParserRecord,
        descriptor: SourceGroupDescriptor,
    ) -> LegacyRecordAdapter:
        """Adapt a ``GenericParserRecord``.

        The payload starts as the row's own column snapshot and is then
        overlaid with the row's ``payload`` dict (when it is a dict), so keys
        from the stored payload win over same-named columns.
        """
        payload = cls._model_payload(record)
        if isinstance(record.payload, dict):
            payload.update(record.payload)
        return cls._adapter(
            record,
            descriptor,
            external_id=record.external_id,
            title=record.title,
            organization_name=record.organisation_name,
            inn=record.inn,
            ogrn=record.ogrn,
            record_date=record.record_date,
            amount=record.amount,
            status=record.status,
            url=record.url,
            payload=payload,
        )

    @classmethod
    def _adapter(
        cls,
        record,
        descriptor: SourceGroupDescriptor,
        *,
        external_id: str,
        title: str,
        organization_name: str,
        inn: str = "",
        kpp: str = "",
        ogrn: str = "",
        ogrip: str = "",
        record_date: str = "",
        amount: Decimal | None = None,
        status: str = "",
        url: str = "",
        payload: dict[str, Any] | None = None,
    ) -> LegacyRecordAdapter:
        """Build a :class:`LegacyRecordAdapter` from raw legacy values.

        Identity fields go through ``normalize_identity_fields``; every other
        text value is coerced with ``str(value or "")`` so falsy values such
        as ``None`` become an empty string.

        Args:
            record: The legacy model instance (source of pk, model name,
                ``load_batch`` and the default payload).
            descriptor: Descriptor supplying ``source`` and ``record_type``.
            external_id: Identifier of the record in the external system.
            title: Human-readable record title.
            organization_name: Raw organization name from the legacy row.
            inn: Raw INN.
            kpp: Raw KPP.
            ogrn: Raw OGRN.
            ogrip: Raw OGRNIP.
            record_date: Record date; stringified as-is.
            amount: Monetary amount, passed through unchanged.
            status: Record status.
            url: Link to the record.
            payload: Explicit payload; when ``None`` a snapshot of the row's
                concrete fields is used.

        Returns:
            The normalized adapter.
        """
        normalized_inn, normalized_kpp, normalized_ogrn, normalized_ogrip = (
            normalize_identity_fields(
                inn=inn,
                kpp=kpp,
                ogrn=ogrn,
                ogrip=ogrip,
            )
        )
        return LegacyRecordAdapter(
            source=str(descriptor.source),
            record_type=descriptor.record_type,
            external_id=str(external_id or ""),
            title=str(title or ""),
            organization_name=str(organization_name or ""),
            inn=normalized_inn,
            kpp=normalized_kpp,
            ogrn=normalized_ogrn,
            ogrip=normalized_ogrip,
            record_date=str(record_date or ""),
            amount=amount,
            status=str(status or ""),
            url=str(url or ""),
            payload=payload if payload is not None else cls._model_payload(record),
            legacy_model=cls._legacy_model_name(record),
            legacy_pk=str(record.pk),
            # Not every legacy model is guaranteed to have ``load_batch``.
            load_batch=getattr(record, "load_batch", None),
        )

    @staticmethod
    def _legacy_model_name(record: Model) -> str:
        """Return ``"<module without .models suffix>.<ClassName>"`` for ``record``."""
        module = record.__class__.__module__.removesuffix(".models")
        return f"{module}.{record.__class__.__name__}"

    @staticmethod
    def _model_payload(record: Model) -> dict[str, Any]:
        """Snapshot a model instance's concrete fields as a JSON-friendly dict.

        Dates and datetimes become ISO-8601 strings; ``Decimal`` and ``UUID``
        values become strings. Many-to-one relations are stored under the
        field name but hold the raw foreign-key value (``attname``), so no
        related object is fetched. The primary key is stored under ``"id"``
        with the same conversion applied, so a UUID primary key does not leave
        a non-serializable value in the payload.

        Args:
            record: Model instance to snapshot.

        Returns:
            Mapping of field name to JSON-friendly value, plus ``"id"``.
        """

        def json_safe(value: Any) -> Any:
            # Convert the value types the stock JSON encoder rejects.
            if isinstance(value, (datetime, date)):
                return value.isoformat()
            if isinstance(value, (Decimal, UUID)):
                return str(value)
            return value

        payload: dict[str, Any] = {"id": json_safe(record.pk)}
        for field in record._meta.concrete_fields:
            name = field.name
            if name == "id":
                # Already recorded from ``record.pk`` above.
                continue
            attribute = field.attname if field.is_relation and field.many_to_one else name
            payload[name] = json_safe(getattr(record, attribute))
        return payload
|
||||
Reference in New Issue
Block a user