621 lines
20 KiB
Python
621 lines
20 KiB
Python
"""Backfill organization source extensions from legacy parser tables."""
|
|
|
|
from __future__ import annotations
|
|
|
|
from collections.abc import Iterable
|
|
from dataclasses import dataclass
|
|
from datetime import date, datetime
|
|
from decimal import Decimal
|
|
from typing import Any
|
|
from uuid import UUID
|
|
|
|
from apps.parsers.models import (
|
|
FinancialReport,
|
|
FinancialReportLine,
|
|
GenericParserRecord,
|
|
IndustrialCertificateRecord,
|
|
IndustrialProductRecord,
|
|
InspectionRecord,
|
|
ManufacturerRecord,
|
|
ParserLoadLog,
|
|
ProcurementRecord,
|
|
)
|
|
from django.db import transaction
|
|
from django.db.models import Count, Max, Min, Model
|
|
|
|
from organizations.models import (
|
|
Organization,
|
|
OrganizationSourceExtension,
|
|
OrganizationSourceFinancialLine,
|
|
OrganizationSourceRecord,
|
|
)
|
|
from organizations.name_normalization import normalize_organization_name
|
|
from organizations.source_groups import (
|
|
SOURCE_GROUP_DESCRIPTORS,
|
|
SourceGroupDescriptor,
|
|
get_source_group_descriptor,
|
|
)
|
|
from organizations.source_identity import normalize_identity_fields
|
|
|
|
|
|
@dataclass(frozen=True)
class OrganizationSourceBackfillResult:
    """Immutable set of counters describing one backfill run.

    Every counter defaults to zero. Because the dataclass is frozen, results
    are combined with :meth:`plus`, which builds a new instance instead of
    mutating either operand.
    """

    scanned: int = 0
    created_organizations: int = 0
    created_extensions: int = 0
    updated_extensions: int = 0
    created_records: int = 0
    updated_records: int = 0
    created_financial_lines: int = 0
    updated_financial_lines: int = 0
    unresolved: int = 0

    def plus(
        self, other: OrganizationSourceBackfillResult
    ) -> OrganizationSourceBackfillResult:
        """Return a new result holding the field-wise sum of both operands."""
        counter_names = OrganizationSourceBackfillResult.__dataclass_fields__
        totals = {
            counter: getattr(self, counter) + getattr(other, counter)
            for counter in counter_names
        }
        return OrganizationSourceBackfillResult(**totals)
|
|
|
|
|
|
@dataclass(frozen=True)
class LegacyRecordAdapter:
    """Uniform, immutable view of one row read from a legacy parser table.

    All fields are required; the declaration order below is also the
    positional constructor order.
    """

    # What kind of record this is and how it is labelled.
    source: str
    record_type: str
    external_id: str
    title: str

    # Identity attributes used to match the row to an organization.
    organization_name: str
    inn: str
    kpp: str
    ogrn: str
    ogrip: str

    # Descriptive attributes copied onto the stored source record.
    record_date: str
    amount: Decimal | None
    status: str
    url: str
    payload: dict[str, Any]

    # Provenance: which legacy model/row and which load batch it came from.
    legacy_model: str
    legacy_pk: str
    load_batch: int | None
|
|
|
|
|
|
class OrganizationSourceBackfillService:
|
|
"""Build organization source extensions from current parser tables."""
|
|
|
|
@classmethod
|
|
def backfill(
|
|
cls,
|
|
*,
|
|
source: str | None = None,
|
|
batch_id: int | None = None,
|
|
) -> OrganizationSourceBackfillResult:
|
|
if source is None:
|
|
result = OrganizationSourceBackfillResult()
|
|
for descriptor_source in SOURCE_GROUP_DESCRIPTORS:
|
|
result = result.plus(
|
|
cls.backfill(source=descriptor_source, batch_id=batch_id)
|
|
)
|
|
return result
|
|
|
|
descriptor = get_source_group_descriptor(str(source))
|
|
adapters = cls._iter_adapters(descriptor, batch_id=batch_id)
|
|
return cls._backfill_adapters(descriptor, adapters)
|
|
|
|
@classmethod
|
|
def _backfill_adapters(
|
|
cls,
|
|
descriptor: SourceGroupDescriptor,
|
|
adapters: Iterable[LegacyRecordAdapter],
|
|
) -> OrganizationSourceBackfillResult:
|
|
scanned = 0
|
|
created_organizations = 0
|
|
created_extensions = 0
|
|
updated_extensions = 0
|
|
created_records = 0
|
|
updated_records = 0
|
|
created_financial_lines = 0
|
|
updated_financial_lines = 0
|
|
unresolved = 0
|
|
touched_extension_ids: set[str] = set()
|
|
|
|
with transaction.atomic():
|
|
for adapter in adapters:
|
|
scanned += 1
|
|
(
|
|
organization,
|
|
organization_created,
|
|
) = cls._resolve_or_create_organization(adapter)
|
|
if organization is None:
|
|
unresolved += 1
|
|
continue
|
|
if organization_created:
|
|
created_organizations += 1
|
|
|
|
(
|
|
extension,
|
|
extension_created,
|
|
) = descriptor.extension_model.objects.get_or_create(
|
|
organization=organization,
|
|
defaults={
|
|
"source_group": descriptor.source_group,
|
|
"title": descriptor.title,
|
|
"last_load_batch": adapter.load_batch,
|
|
},
|
|
)
|
|
if extension_created:
|
|
created_extensions += 1
|
|
else:
|
|
updated_extensions += cls._update_extension(
|
|
extension, descriptor, adapter
|
|
)
|
|
|
|
(
|
|
source_record,
|
|
record_created,
|
|
) = OrganizationSourceRecord.objects.update_or_create(
|
|
legacy_model=adapter.legacy_model,
|
|
legacy_pk=adapter.legacy_pk,
|
|
defaults={
|
|
"extension": extension,
|
|
"record_type": adapter.record_type,
|
|
"source": adapter.source,
|
|
"external_id": adapter.external_id,
|
|
"title": adapter.title,
|
|
"record_date": adapter.record_date,
|
|
"amount": adapter.amount,
|
|
"status": adapter.status,
|
|
"url": adapter.url,
|
|
"payload": adapter.payload,
|
|
"load_batch": adapter.load_batch,
|
|
},
|
|
)
|
|
if record_created:
|
|
created_records += 1
|
|
else:
|
|
updated_records += 1
|
|
|
|
if adapter.source == ParserLoadLog.Source.FNS_REPORTS:
|
|
line_result = cls._backfill_financial_lines(
|
|
source_record, adapter.legacy_pk
|
|
)
|
|
created_financial_lines += line_result[0]
|
|
updated_financial_lines += line_result[1]
|
|
|
|
touched_extension_ids.add(str(extension.uid))
|
|
|
|
cls._refresh_extension_counters(touched_extension_ids)
|
|
|
|
return OrganizationSourceBackfillResult(
|
|
scanned=scanned,
|
|
created_organizations=created_organizations,
|
|
created_extensions=created_extensions,
|
|
updated_extensions=updated_extensions,
|
|
created_records=created_records,
|
|
updated_records=updated_records,
|
|
created_financial_lines=created_financial_lines,
|
|
updated_financial_lines=updated_financial_lines,
|
|
unresolved=unresolved,
|
|
)
|
|
|
|
@staticmethod
|
|
def _update_extension(
|
|
extension: OrganizationSourceExtension,
|
|
descriptor: SourceGroupDescriptor,
|
|
adapter: LegacyRecordAdapter,
|
|
) -> int:
|
|
changed = False
|
|
if extension.title != descriptor.title:
|
|
extension.title = descriptor.title
|
|
changed = True
|
|
if (
|
|
adapter.load_batch is not None
|
|
and extension.last_load_batch != adapter.load_batch
|
|
):
|
|
extension.last_load_batch = adapter.load_batch
|
|
changed = True
|
|
if changed:
|
|
extension.save(update_fields=["title", "last_load_batch", "updated_at"])
|
|
return 1
|
|
return 0
|
|
|
|
@classmethod
|
|
def _iter_adapters(
|
|
cls,
|
|
descriptor: SourceGroupDescriptor,
|
|
*,
|
|
batch_id: int | None,
|
|
) -> Iterable[LegacyRecordAdapter]:
|
|
source = str(descriptor.source)
|
|
model_and_adapter = cls._legacy_model_adapter(source)
|
|
if model_and_adapter is None:
|
|
queryset = GenericParserRecord.objects.filter(source=source)
|
|
adapter_factory = cls._generic_adapter
|
|
else:
|
|
model, adapter_factory = model_and_adapter
|
|
queryset = model.objects.all()
|
|
|
|
if batch_id is not None:
|
|
queryset = queryset.filter(load_batch=batch_id)
|
|
|
|
for record in queryset.iterator():
|
|
yield adapter_factory(record, descriptor)
|
|
|
|
@classmethod
|
|
def _legacy_model_adapter(cls, source: str):
|
|
return {
|
|
ParserLoadLog.Source.INSPECTIONS: (
|
|
InspectionRecord,
|
|
cls._inspection_adapter,
|
|
),
|
|
ParserLoadLog.Source.FNS_REPORTS: (
|
|
FinancialReport,
|
|
cls._financial_report_adapter,
|
|
),
|
|
ParserLoadLog.Source.PROCUREMENTS: (
|
|
ProcurementRecord,
|
|
cls._procurement_adapter,
|
|
),
|
|
ParserLoadLog.Source.INDUSTRIAL: (
|
|
IndustrialCertificateRecord,
|
|
cls._industrial_certificate_adapter,
|
|
),
|
|
ParserLoadLog.Source.INDUSTRIAL_PRODUCTS: (
|
|
IndustrialProductRecord,
|
|
cls._industrial_product_adapter,
|
|
),
|
|
ParserLoadLog.Source.MANUFACTURES: (
|
|
ManufacturerRecord,
|
|
cls._manufacturer_adapter,
|
|
),
|
|
}.get(source)
|
|
|
|
@classmethod
|
|
def _resolve_or_create_organization(
|
|
cls,
|
|
adapter: LegacyRecordAdapter,
|
|
) -> tuple[Organization | None, bool]:
|
|
organization = cls._resolve_organization(adapter)
|
|
if organization is not None:
|
|
return organization, False
|
|
|
|
name = adapter.organization_name or adapter.title or adapter.external_id
|
|
if not name:
|
|
return None, False
|
|
|
|
organization = Organization.objects.create(
|
|
name=name,
|
|
inn=adapter.inn,
|
|
kpp=adapter.kpp,
|
|
ogrn=adapter.ogrn if len(adapter.ogrn) == 13 else "",
|
|
ogrip=adapter.ogrip,
|
|
)
|
|
return organization, True
|
|
|
|
@classmethod
|
|
def _resolve_organization(cls, adapter: LegacyRecordAdapter) -> Organization | None:
|
|
for resolver in (
|
|
cls._resolve_by_inn_kpp,
|
|
cls._resolve_by_ogrn_or_ogrip,
|
|
cls._resolve_by_ogrip,
|
|
cls._resolve_by_unique_inn,
|
|
cls._resolve_by_exact_normalized_name,
|
|
):
|
|
organization = resolver(adapter)
|
|
if organization is not None:
|
|
return organization
|
|
|
|
return None
|
|
|
|
@staticmethod
|
|
def _resolve_by_inn_kpp(adapter: LegacyRecordAdapter) -> Organization | None:
|
|
if not adapter.inn or not adapter.kpp:
|
|
return None
|
|
return Organization.objects.filter(
|
|
inn=adapter.inn,
|
|
kpp=adapter.kpp,
|
|
).first()
|
|
|
|
@staticmethod
|
|
def _resolve_by_ogrn_or_ogrip(adapter: LegacyRecordAdapter) -> Organization | None:
|
|
if not adapter.ogrn:
|
|
return None
|
|
return (
|
|
Organization.objects.filter(ogrn=adapter.ogrn).first()
|
|
or Organization.objects.filter(ogrip=adapter.ogrn).first()
|
|
)
|
|
|
|
@staticmethod
|
|
def _resolve_by_ogrip(adapter: LegacyRecordAdapter) -> Organization | None:
|
|
if not adapter.ogrip:
|
|
return None
|
|
return Organization.objects.filter(ogrip=adapter.ogrip).first()
|
|
|
|
@staticmethod
|
|
def _resolve_by_unique_inn(adapter: LegacyRecordAdapter) -> Organization | None:
|
|
if not adapter.inn:
|
|
return None
|
|
organizations = list(Organization.objects.filter(inn=adapter.inn)[:2])
|
|
return organizations[0] if len(organizations) == 1 else None
|
|
|
|
@staticmethod
|
|
def _resolve_by_exact_normalized_name(
|
|
adapter: LegacyRecordAdapter,
|
|
) -> Organization | None:
|
|
normalized_name = normalize_organization_name(adapter.organization_name)
|
|
if not normalized_name:
|
|
return None
|
|
matches = list(
|
|
Organization.objects.filter(name__iexact=adapter.organization_name)[:2]
|
|
)
|
|
return matches[0] if len(matches) == 1 else None
|
|
|
|
@classmethod
|
|
def _backfill_financial_lines(
|
|
cls,
|
|
source_record: OrganizationSourceRecord,
|
|
report_pk: str,
|
|
) -> tuple[int, int]:
|
|
created = 0
|
|
updated = 0
|
|
for line in FinancialReportLine.objects.filter(report_id=report_pk).iterator():
|
|
_, was_created = OrganizationSourceFinancialLine.objects.update_or_create(
|
|
source_record=source_record,
|
|
form_code=line.form_code,
|
|
line_code=line.line_code,
|
|
year=line.year,
|
|
defaults={
|
|
"line_name": line.line_name,
|
|
"period_start": line.period_start,
|
|
"period_end": line.period_end,
|
|
},
|
|
)
|
|
if was_created:
|
|
created += 1
|
|
else:
|
|
updated += 1
|
|
return created, updated
|
|
|
|
@staticmethod
|
|
def _refresh_extension_counters(extension_ids: set[str]) -> None:
|
|
for extension in OrganizationSourceExtension.objects.filter(
|
|
uid__in=extension_ids
|
|
):
|
|
aggregate = extension.records.aggregate(
|
|
records_count=Count("uid"),
|
|
first_seen_at=Min("created_at"),
|
|
last_seen_at=Max("created_at"),
|
|
last_load_batch=Max("load_batch"),
|
|
)
|
|
extension.records_count = aggregate["records_count"] or 0
|
|
extension.first_seen_at = aggregate["first_seen_at"]
|
|
extension.last_seen_at = aggregate["last_seen_at"]
|
|
extension.last_load_batch = aggregate["last_load_batch"]
|
|
extension.save(
|
|
update_fields=[
|
|
"records_count",
|
|
"first_seen_at",
|
|
"last_seen_at",
|
|
"last_load_batch",
|
|
"updated_at",
|
|
]
|
|
)
|
|
|
|
@classmethod
|
|
def _inspection_adapter(
|
|
cls,
|
|
record: InspectionRecord,
|
|
descriptor: SourceGroupDescriptor,
|
|
) -> LegacyRecordAdapter:
|
|
return cls._adapter(
|
|
record,
|
|
descriptor,
|
|
external_id=record.registration_number,
|
|
title=record.organisation_name,
|
|
organization_name=record.organisation_name,
|
|
inn=record.inn,
|
|
ogrn=record.ogrn,
|
|
record_date=record.start_date,
|
|
status=record.status,
|
|
)
|
|
|
|
@classmethod
|
|
def _financial_report_adapter(
|
|
cls,
|
|
record: FinancialReport,
|
|
descriptor: SourceGroupDescriptor,
|
|
) -> LegacyRecordAdapter:
|
|
return cls._adapter(
|
|
record,
|
|
descriptor,
|
|
external_id=record.external_id,
|
|
title=record.file_name,
|
|
organization_name="",
|
|
ogrn=record.ogrn,
|
|
status=record.status,
|
|
)
|
|
|
|
@classmethod
|
|
def _procurement_adapter(
|
|
cls,
|
|
record: ProcurementRecord,
|
|
descriptor: SourceGroupDescriptor,
|
|
) -> LegacyRecordAdapter:
|
|
return cls._adapter(
|
|
record,
|
|
descriptor,
|
|
external_id=record.purchase_number,
|
|
title=record.purchase_name,
|
|
organization_name=record.customer_name,
|
|
inn=record.customer_inn,
|
|
kpp=record.customer_kpp,
|
|
ogrn=record.customer_ogrn,
|
|
record_date=record.publish_date,
|
|
amount=record.max_price_amount,
|
|
status=record.status,
|
|
url=record.href,
|
|
)
|
|
|
|
@classmethod
|
|
def _industrial_certificate_adapter(
|
|
cls,
|
|
record: IndustrialCertificateRecord,
|
|
descriptor: SourceGroupDescriptor,
|
|
) -> LegacyRecordAdapter:
|
|
return cls._adapter(
|
|
record,
|
|
descriptor,
|
|
external_id=record.certificate_number,
|
|
title=record.certificate_number,
|
|
organization_name=record.organisation_name,
|
|
inn=record.inn,
|
|
ogrn=record.ogrn,
|
|
record_date=record.issue_date,
|
|
url=record.certificate_file_url,
|
|
)
|
|
|
|
@classmethod
|
|
def _industrial_product_adapter(
|
|
cls,
|
|
record: IndustrialProductRecord,
|
|
descriptor: SourceGroupDescriptor,
|
|
) -> LegacyRecordAdapter:
|
|
return cls._adapter(
|
|
record,
|
|
descriptor,
|
|
external_id=record.registry_number,
|
|
title=record.product_name,
|
|
organization_name=record.full_organisation_name,
|
|
inn=record.inn,
|
|
ogrn=record.ogrn,
|
|
)
|
|
|
|
@classmethod
|
|
def _manufacturer_adapter(
|
|
cls,
|
|
record: ManufacturerRecord,
|
|
descriptor: SourceGroupDescriptor,
|
|
) -> LegacyRecordAdapter:
|
|
return cls._adapter(
|
|
record,
|
|
descriptor,
|
|
external_id=record.inn,
|
|
title=record.full_legal_name,
|
|
organization_name=record.full_legal_name,
|
|
inn=record.inn,
|
|
ogrn=record.ogrn,
|
|
)
|
|
|
|
@classmethod
|
|
def _generic_adapter(
|
|
cls,
|
|
record: GenericParserRecord,
|
|
descriptor: SourceGroupDescriptor,
|
|
) -> LegacyRecordAdapter:
|
|
payload = cls._model_payload(record)
|
|
if isinstance(record.payload, dict):
|
|
payload.update(record.payload)
|
|
return cls._adapter(
|
|
record,
|
|
descriptor,
|
|
external_id=record.external_id,
|
|
title=record.title,
|
|
organization_name=record.organisation_name,
|
|
inn=record.inn,
|
|
ogrn=record.ogrn,
|
|
record_date=record.record_date,
|
|
amount=record.amount,
|
|
status=record.status,
|
|
url=record.url,
|
|
payload=payload,
|
|
)
|
|
|
|
@classmethod
|
|
def _adapter(
|
|
cls,
|
|
record,
|
|
descriptor: SourceGroupDescriptor,
|
|
*,
|
|
external_id: str,
|
|
title: str,
|
|
organization_name: str,
|
|
inn: str = "",
|
|
kpp: str = "",
|
|
ogrn: str = "",
|
|
ogrip: str = "",
|
|
record_date: str = "",
|
|
amount: Decimal | None = None,
|
|
status: str = "",
|
|
url: str = "",
|
|
payload: dict[str, Any] | None = None,
|
|
) -> LegacyRecordAdapter:
|
|
(
|
|
normalized_inn,
|
|
normalized_kpp,
|
|
normalized_ogrn,
|
|
normalized_ogrip,
|
|
) = normalize_identity_fields(
|
|
inn=inn,
|
|
kpp=kpp,
|
|
ogrn=ogrn,
|
|
ogrip=ogrip,
|
|
)
|
|
return LegacyRecordAdapter(
|
|
source=str(descriptor.source),
|
|
record_type=descriptor.record_type,
|
|
external_id=str(external_id or ""),
|
|
title=str(title or ""),
|
|
organization_name=str(organization_name or ""),
|
|
inn=normalized_inn,
|
|
kpp=normalized_kpp,
|
|
ogrn=normalized_ogrn,
|
|
ogrip=normalized_ogrip,
|
|
record_date=str(record_date or ""),
|
|
amount=amount,
|
|
status=str(status or ""),
|
|
url=str(url or ""),
|
|
payload=payload if payload is not None else cls._model_payload(record),
|
|
legacy_model=cls._legacy_model_name(record),
|
|
legacy_pk=str(record.pk),
|
|
load_batch=getattr(record, "load_batch", None),
|
|
)
|
|
|
|
@staticmethod
|
|
def _legacy_model_name(record: Model) -> str:
|
|
module = record.__class__.__module__.removesuffix(".models")
|
|
return f"{module}.{record.__class__.__name__}"
|
|
|
|
@staticmethod
|
|
def _model_payload(record: Model) -> dict[str, Any]:
|
|
payload = {"id": record.pk}
|
|
for field in record._meta.concrete_fields:
|
|
name = field.name
|
|
value = (
|
|
getattr(record, field.attname)
|
|
if field.is_relation and field.many_to_one
|
|
else getattr(record, name)
|
|
)
|
|
if name == "id":
|
|
continue
|
|
if isinstance(value, datetime | date):
|
|
payload[name] = value.isoformat()
|
|
elif isinstance(value, Decimal | UUID):
|
|
payload[name] = str(value)
|
|
else:
|
|
payload[name] = value
|
|
return payload
|