feat: migrate parser data to source records
This commit is contained in:
@@ -20,6 +20,7 @@ from organizations.models import (
|
||||
OrganizationSourceRecord,
|
||||
)
|
||||
from organizations.name_normalization import normalize_organization_name
|
||||
from organizations.source_cache import invalidate_source_data_cache
|
||||
from organizations.source_groups import (
|
||||
SourceGroupDescriptor,
|
||||
get_source_group_descriptor,
|
||||
@@ -128,18 +129,21 @@ class OrganizationSourceIngestionService:
|
||||
|
||||
with transaction.atomic():
|
||||
normalized_records = cls._normalize_records(records)
|
||||
organizations_by_index, created_organizations = (
|
||||
cls._resolve_or_create_organizations(normalized_records)
|
||||
)
|
||||
(
|
||||
organizations_by_index,
|
||||
created_organizations,
|
||||
) = cls._resolve_or_create_organizations(normalized_records)
|
||||
del normalized_records
|
||||
unresolved = scanned - len(organizations_by_index)
|
||||
|
||||
extensions_by_organization_id, created_extensions, updated_extensions = (
|
||||
cls._resolve_or_create_extensions(
|
||||
descriptor=descriptor,
|
||||
load_batch=load_batch,
|
||||
organizations=organizations_by_index.values(),
|
||||
)
|
||||
(
|
||||
extensions_by_organization_id,
|
||||
created_extensions,
|
||||
updated_extensions,
|
||||
) = cls._resolve_or_create_extensions(
|
||||
descriptor=descriptor,
|
||||
load_batch=load_batch,
|
||||
organizations=organizations_by_index.values(),
|
||||
)
|
||||
|
||||
touched_extension_ids: set[str] = set()
|
||||
@@ -166,6 +170,8 @@ class OrganizationSourceIngestionService:
|
||||
|
||||
cls._refresh_extension_counters(touched_extension_ids)
|
||||
|
||||
invalidate_source_data_cache()
|
||||
|
||||
return OrganizationSourceIngestionResult(
|
||||
scanned=scanned,
|
||||
created_organizations=created_organizations,
|
||||
@@ -256,10 +262,18 @@ class OrganizationSourceIngestionService:
|
||||
organizations_by_index,
|
||||
)
|
||||
|
||||
return cls._create_missing_organizations(
|
||||
(
|
||||
organizations_by_index,
|
||||
created_organizations,
|
||||
) = cls._create_missing_organizations(
|
||||
normalized_records,
|
||||
organizations_by_index,
|
||||
)
|
||||
cls._update_resolved_organization_identities(
|
||||
normalized_records,
|
||||
organizations_by_index,
|
||||
)
|
||||
return organizations_by_index, created_organizations
|
||||
|
||||
@classmethod
|
||||
def _resolve_organizations_by_inn_kpp(
|
||||
@@ -285,7 +299,9 @@ class OrganizationSourceIngestionService:
|
||||
for inn, kpp in chunk:
|
||||
query |= Q(inn=inn, kpp=kpp)
|
||||
for organization in Organization.objects.filter(query):
|
||||
organizations_by_key[(organization.inn, organization.kpp)] = organization
|
||||
organizations_by_key[
|
||||
(organization.inn, organization.kpp)
|
||||
] = organization
|
||||
|
||||
for record in normalized_records:
|
||||
if record.index in organizations_by_index:
|
||||
@@ -384,6 +400,7 @@ class OrganizationSourceIngestionService:
|
||||
record.organization_name.strip()
|
||||
for record in normalized_records
|
||||
if record.index not in organizations_by_index
|
||||
and not cls._record_has_identity(record)
|
||||
and normalize_organization_name(record.organization_name)
|
||||
}
|
||||
)
|
||||
@@ -406,10 +423,136 @@ class OrganizationSourceIngestionService:
|
||||
for record in normalized_records:
|
||||
if record.index in organizations_by_index:
|
||||
continue
|
||||
if cls._record_has_identity(record):
|
||||
continue
|
||||
organization = unique_by_name.get(record.organization_name.strip().lower())
|
||||
if organization is not None:
|
||||
organizations_by_index[record.index] = organization
|
||||
|
||||
@staticmethod
|
||||
def _record_has_identity(record: _NormalizedRecordInput) -> bool:
|
||||
return bool(record.inn or record.ogrn or record.ogrip)
|
||||
|
||||
@classmethod
|
||||
def _update_resolved_organization_identities(
|
||||
cls,
|
||||
normalized_records: list[_NormalizedRecordInput],
|
||||
organizations_by_index: dict[int, Organization],
|
||||
) -> None:
|
||||
safe_inn_by_organization_id = cls._safe_missing_inn_updates(
|
||||
normalized_records,
|
||||
organizations_by_index,
|
||||
)
|
||||
changed_by_uid: dict[str, Organization] = {}
|
||||
for record in normalized_records:
|
||||
organization = organizations_by_index.get(record.index)
|
||||
if organization is None:
|
||||
continue
|
||||
if cls._apply_missing_identity_fields(
|
||||
organization,
|
||||
record,
|
||||
safe_inn_by_organization_id=safe_inn_by_organization_id,
|
||||
):
|
||||
changed_by_uid[str(organization.uid)] = organization
|
||||
|
||||
if changed_by_uid:
|
||||
Organization.objects.bulk_update(
|
||||
list(changed_by_uid.values()),
|
||||
fields=[
|
||||
"name",
|
||||
"inn",
|
||||
"kpp",
|
||||
"ogrn",
|
||||
"ogrip",
|
||||
"identity_status",
|
||||
"primary_identity",
|
||||
],
|
||||
batch_size=cls.chunk_size,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def _safe_missing_inn_updates(
|
||||
cls,
|
||||
normalized_records: list[_NormalizedRecordInput],
|
||||
organizations_by_index: dict[int, Organization],
|
||||
) -> dict[str, str]:
|
||||
desired_inn_by_organization_id: dict[str, str] = {}
|
||||
desired_organization_ids_by_inn: dict[str, set[str]] = defaultdict(set)
|
||||
for record in normalized_records:
|
||||
organization = organizations_by_index.get(record.index)
|
||||
if organization is None or organization.inn or not record.inn:
|
||||
continue
|
||||
organization_id = str(organization.uid)
|
||||
desired_inn_by_organization_id.setdefault(organization_id, record.inn)
|
||||
desired_organization_ids_by_inn[record.inn].add(organization_id)
|
||||
|
||||
conflicting_inns = set()
|
||||
if desired_inn_by_organization_id:
|
||||
conflicting_inns = set(
|
||||
Organization.objects.filter(
|
||||
inn__in=set(desired_inn_by_organization_id.values())
|
||||
)
|
||||
.exclude(uid__in=list(desired_inn_by_organization_id))
|
||||
.values_list("inn", flat=True)
|
||||
)
|
||||
|
||||
safe_updates = {}
|
||||
for organization_id, inn in desired_inn_by_organization_id.items():
|
||||
if (
|
||||
len(desired_organization_ids_by_inn[inn]) == 1
|
||||
and inn not in conflicting_inns
|
||||
):
|
||||
safe_updates[organization_id] = inn
|
||||
return safe_updates
|
||||
|
||||
@classmethod
|
||||
def _apply_missing_identity_fields(
|
||||
cls,
|
||||
organization: Organization,
|
||||
record: _NormalizedRecordInput,
|
||||
*,
|
||||
safe_inn_by_organization_id: dict[str, str],
|
||||
) -> bool:
|
||||
changed = False
|
||||
organization_id = str(organization.uid)
|
||||
safe_inn = safe_inn_by_organization_id.get(organization_id)
|
||||
if not organization.inn and safe_inn == record.inn:
|
||||
organization.inn = record.inn
|
||||
changed = True
|
||||
if not organization.kpp and record.kpp:
|
||||
organization.kpp = record.kpp
|
||||
changed = True
|
||||
if not organization.ogrn and record.ogrn:
|
||||
organization.ogrn = record.ogrn
|
||||
changed = True
|
||||
if not organization.ogrip and record.ogrip:
|
||||
organization.ogrip = record.ogrip
|
||||
changed = True
|
||||
if cls._should_replace_placeholder_name(organization, record.organization_name):
|
||||
organization.name = record.organization_name.strip()
|
||||
changed = True
|
||||
if changed:
|
||||
organization.identity_status = organization._resolve_identity_status()
|
||||
organization.primary_identity = organization._resolve_primary_identity()
|
||||
return changed
|
||||
|
||||
@staticmethod
|
||||
def _should_replace_placeholder_name(
|
||||
organization: Organization,
|
||||
candidate_name: str,
|
||||
) -> bool:
|
||||
normalized_candidate = normalize_organization_name(candidate_name)
|
||||
if not normalized_candidate:
|
||||
return False
|
||||
current_name = organization.name.strip()
|
||||
if not normalize_organization_name(current_name):
|
||||
return True
|
||||
return current_name in {
|
||||
organization.inn,
|
||||
organization.ogrn,
|
||||
organization.ogrip,
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def _create_missing_organizations(
|
||||
cls,
|
||||
@@ -510,7 +653,9 @@ class OrganizationSourceIngestionService:
|
||||
load_batch: int | None,
|
||||
organizations: Iterable[Organization],
|
||||
) -> tuple[dict[Any, OrganizationSourceExtension], int, int]:
|
||||
unique_organizations = {organization.uid: organization for organization in organizations}
|
||||
unique_organizations = {
|
||||
organization.uid: organization for organization in organizations
|
||||
}
|
||||
if not unique_organizations:
|
||||
return {}, 0, 0
|
||||
|
||||
@@ -526,13 +671,13 @@ class OrganizationSourceIngestionService:
|
||||
for organization_id, organization in unique_organizations.items():
|
||||
if organization_id in extensions_by_organization_id:
|
||||
continue
|
||||
extensions_by_organization_id[organization_id] = (
|
||||
descriptor.extension_model.objects.create(
|
||||
organization=organization,
|
||||
source_group=descriptor.source_group,
|
||||
title=descriptor.title,
|
||||
last_load_batch=load_batch,
|
||||
)
|
||||
extensions_by_organization_id[
|
||||
organization_id
|
||||
] = descriptor.extension_model.objects.create(
|
||||
organization=organization,
|
||||
source_group=descriptor.source_group,
|
||||
title=descriptor.title,
|
||||
last_load_batch=load_batch,
|
||||
)
|
||||
created_extensions += 1
|
||||
|
||||
@@ -579,12 +724,14 @@ class OrganizationSourceIngestionService:
|
||||
updated_financial_lines = 0
|
||||
|
||||
for chunk in cls._iter_chunks(record_inputs_with_extensions, cls.chunk_size):
|
||||
source_records_by_index, chunk_created, chunk_updated = (
|
||||
cls._bulk_upsert_source_records_chunk(
|
||||
descriptor=descriptor,
|
||||
load_batch=load_batch,
|
||||
record_inputs_with_extensions=chunk,
|
||||
)
|
||||
(
|
||||
source_records_by_index,
|
||||
chunk_created,
|
||||
chunk_updated,
|
||||
) = cls._bulk_upsert_source_records_chunk(
|
||||
descriptor=descriptor,
|
||||
load_batch=load_batch,
|
||||
record_inputs_with_extensions=chunk,
|
||||
)
|
||||
created_records += chunk_created
|
||||
updated_records += chunk_updated
|
||||
|
||||
Reference in New Issue
Block a user