feat: migrate parser data to source records
Some checks failed
CI/CD Pipeline / Quality Gate (push) Failing after 14s
CI/CD Pipeline / Build and Push Images (push) Has been skipped
CI/CD Pipeline / Deploy Dev in Dokploy (push) Has been skipped
CI/CD Pipeline / Internal Notify (push) Successful in 0s

This commit is contained in:
2026-05-19 20:21:31 +02:00
parent 1c7c7238be
commit b8a18d6da4
46 changed files with 2689 additions and 6179 deletions

View File

@@ -20,6 +20,7 @@ from organizations.models import (
OrganizationSourceRecord,
)
from organizations.name_normalization import normalize_organization_name
from organizations.source_cache import invalidate_source_data_cache
from organizations.source_groups import (
SourceGroupDescriptor,
get_source_group_descriptor,
@@ -128,18 +129,21 @@ class OrganizationSourceIngestionService:
with transaction.atomic():
normalized_records = cls._normalize_records(records)
organizations_by_index, created_organizations = (
cls._resolve_or_create_organizations(normalized_records)
)
(
organizations_by_index,
created_organizations,
) = cls._resolve_or_create_organizations(normalized_records)
del normalized_records
unresolved = scanned - len(organizations_by_index)
extensions_by_organization_id, created_extensions, updated_extensions = (
cls._resolve_or_create_extensions(
descriptor=descriptor,
load_batch=load_batch,
organizations=organizations_by_index.values(),
)
(
extensions_by_organization_id,
created_extensions,
updated_extensions,
) = cls._resolve_or_create_extensions(
descriptor=descriptor,
load_batch=load_batch,
organizations=organizations_by_index.values(),
)
touched_extension_ids: set[str] = set()
@@ -166,6 +170,8 @@ class OrganizationSourceIngestionService:
cls._refresh_extension_counters(touched_extension_ids)
invalidate_source_data_cache()
return OrganizationSourceIngestionResult(
scanned=scanned,
created_organizations=created_organizations,
@@ -256,10 +262,18 @@ class OrganizationSourceIngestionService:
organizations_by_index,
)
return cls._create_missing_organizations(
(
organizations_by_index,
created_organizations,
) = cls._create_missing_organizations(
normalized_records,
organizations_by_index,
)
cls._update_resolved_organization_identities(
normalized_records,
organizations_by_index,
)
return organizations_by_index, created_organizations
@classmethod
def _resolve_organizations_by_inn_kpp(
@@ -285,7 +299,9 @@ class OrganizationSourceIngestionService:
for inn, kpp in chunk:
query |= Q(inn=inn, kpp=kpp)
for organization in Organization.objects.filter(query):
organizations_by_key[(organization.inn, organization.kpp)] = organization
organizations_by_key[
(organization.inn, organization.kpp)
] = organization
for record in normalized_records:
if record.index in organizations_by_index:
@@ -384,6 +400,7 @@ class OrganizationSourceIngestionService:
record.organization_name.strip()
for record in normalized_records
if record.index not in organizations_by_index
and not cls._record_has_identity(record)
and normalize_organization_name(record.organization_name)
}
)
@@ -406,10 +423,136 @@ class OrganizationSourceIngestionService:
for record in normalized_records:
if record.index in organizations_by_index:
continue
if cls._record_has_identity(record):
continue
organization = unique_by_name.get(record.organization_name.strip().lower())
if organization is not None:
organizations_by_index[record.index] = organization
@staticmethod
def _record_has_identity(record: _NormalizedRecordInput) -> bool:
return bool(record.inn or record.ogrn or record.ogrip)
@classmethod
def _update_resolved_organization_identities(
cls,
normalized_records: list[_NormalizedRecordInput],
organizations_by_index: dict[int, Organization],
) -> None:
safe_inn_by_organization_id = cls._safe_missing_inn_updates(
normalized_records,
organizations_by_index,
)
changed_by_uid: dict[str, Organization] = {}
for record in normalized_records:
organization = organizations_by_index.get(record.index)
if organization is None:
continue
if cls._apply_missing_identity_fields(
organization,
record,
safe_inn_by_organization_id=safe_inn_by_organization_id,
):
changed_by_uid[str(organization.uid)] = organization
if changed_by_uid:
Organization.objects.bulk_update(
list(changed_by_uid.values()),
fields=[
"name",
"inn",
"kpp",
"ogrn",
"ogrip",
"identity_status",
"primary_identity",
],
batch_size=cls.chunk_size,
)
@classmethod
def _safe_missing_inn_updates(
cls,
normalized_records: list[_NormalizedRecordInput],
organizations_by_index: dict[int, Organization],
) -> dict[str, str]:
desired_inn_by_organization_id: dict[str, str] = {}
desired_organization_ids_by_inn: dict[str, set[str]] = defaultdict(set)
for record in normalized_records:
organization = organizations_by_index.get(record.index)
if organization is None or organization.inn or not record.inn:
continue
organization_id = str(organization.uid)
desired_inn_by_organization_id.setdefault(organization_id, record.inn)
desired_organization_ids_by_inn[record.inn].add(organization_id)
conflicting_inns = set()
if desired_inn_by_organization_id:
conflicting_inns = set(
Organization.objects.filter(
inn__in=set(desired_inn_by_organization_id.values())
)
.exclude(uid__in=list(desired_inn_by_organization_id))
.values_list("inn", flat=True)
)
safe_updates = {}
for organization_id, inn in desired_inn_by_organization_id.items():
if (
len(desired_organization_ids_by_inn[inn]) == 1
and inn not in conflicting_inns
):
safe_updates[organization_id] = inn
return safe_updates
@classmethod
def _apply_missing_identity_fields(
cls,
organization: Organization,
record: _NormalizedRecordInput,
*,
safe_inn_by_organization_id: dict[str, str],
) -> bool:
changed = False
organization_id = str(organization.uid)
safe_inn = safe_inn_by_organization_id.get(organization_id)
if not organization.inn and safe_inn == record.inn:
organization.inn = record.inn
changed = True
if not organization.kpp and record.kpp:
organization.kpp = record.kpp
changed = True
if not organization.ogrn and record.ogrn:
organization.ogrn = record.ogrn
changed = True
if not organization.ogrip and record.ogrip:
organization.ogrip = record.ogrip
changed = True
if cls._should_replace_placeholder_name(organization, record.organization_name):
organization.name = record.organization_name.strip()
changed = True
if changed:
organization.identity_status = organization._resolve_identity_status()
organization.primary_identity = organization._resolve_primary_identity()
return changed
@staticmethod
def _should_replace_placeholder_name(
organization: Organization,
candidate_name: str,
) -> bool:
normalized_candidate = normalize_organization_name(candidate_name)
if not normalized_candidate:
return False
current_name = organization.name.strip()
if not normalize_organization_name(current_name):
return True
return current_name in {
organization.inn,
organization.ogrn,
organization.ogrip,
}
@classmethod
def _create_missing_organizations(
cls,
@@ -510,7 +653,9 @@ class OrganizationSourceIngestionService:
load_batch: int | None,
organizations: Iterable[Organization],
) -> tuple[dict[Any, OrganizationSourceExtension], int, int]:
unique_organizations = {organization.uid: organization for organization in organizations}
unique_organizations = {
organization.uid: organization for organization in organizations
}
if not unique_organizations:
return {}, 0, 0
@@ -526,13 +671,13 @@ class OrganizationSourceIngestionService:
for organization_id, organization in unique_organizations.items():
if organization_id in extensions_by_organization_id:
continue
extensions_by_organization_id[organization_id] = (
descriptor.extension_model.objects.create(
organization=organization,
source_group=descriptor.source_group,
title=descriptor.title,
last_load_batch=load_batch,
)
extensions_by_organization_id[
organization_id
] = descriptor.extension_model.objects.create(
organization=organization,
source_group=descriptor.source_group,
title=descriptor.title,
last_load_batch=load_batch,
)
created_extensions += 1
@@ -579,12 +724,14 @@ class OrganizationSourceIngestionService:
updated_financial_lines = 0
for chunk in cls._iter_chunks(record_inputs_with_extensions, cls.chunk_size):
source_records_by_index, chunk_created, chunk_updated = (
cls._bulk_upsert_source_records_chunk(
descriptor=descriptor,
load_batch=load_batch,
record_inputs_with_extensions=chunk,
)
(
source_records_by_index,
chunk_created,
chunk_updated,
) = cls._bulk_upsert_source_records_chunk(
descriptor=descriptor,
load_batch=load_batch,
record_inputs_with_extensions=chunk,
)
created_records += chunk_created
updated_records += chunk_updated