Files
mostovik-backend/src/organizations/source_ingestion.py
Aleksandr Meshchriakov 76a99a4a1e
All checks were successful
CI/CD Pipeline / Quality Gate (push) Successful in 35s
CI/CD Pipeline / Build and Push Images (push) Successful in 19s
CI/CD Pipeline / Internal Notify (push) Successful in 0s
CI/CD Pipeline / Deploy Dev in Dokploy (push) Successful in 1s
fix: keep organization identity types exclusive
2026-05-20 12:45:06 +02:00

1114 lines
38 KiB
Python

"""Direct parser ingestion into organization source storage."""
from __future__ import annotations
from collections import defaultdict
from collections.abc import Iterable, Iterator
from dataclasses import dataclass, field
from decimal import Decimal
from typing import Any
from django.db import transaction
from django.db.models import Count, Max, Min, Q
from django.db.models.functions import Lower
from django.utils import timezone
from organizations.models import (
Organization,
OrganizationSourceExtension,
OrganizationSourceFinancialLine,
OrganizationSourceRecord,
)
from organizations.name_normalization import normalize_organization_name
from organizations.source_cache import invalidate_source_data_cache
from organizations.source_groups import (
SourceGroupDescriptor,
get_source_group_descriptor,
)
from organizations.source_identity import normalize_identity_fields
@dataclass(frozen=True)
class SourceFinancialLineInput:
"""Structured financial line parsed from a source record."""
form_code: str
line_code: str
line_name: str
year: int
period_start: int | None = None
period_end: int | None = None
@dataclass(frozen=True)
class SourceRecordInput:
"""Normalized parser output ready for organization source storage."""
external_id: str
title: str
organization_name: str
inn: str = ""
kpp: str = ""
ogrn: str = ""
ogrip: str = ""
record_date: str = ""
amount: Decimal | None = None
status: str = ""
url: str = ""
payload: dict[str, Any] = field(default_factory=dict)
financial_lines: Iterable[SourceFinancialLineInput] = field(default_factory=tuple)
@dataclass(frozen=True)
class OrganizationSourceIngestionResult:
"""Counters returned by direct source ingestion."""
scanned: int = 0
created_organizations: int = 0
created_extensions: int = 0
updated_extensions: int = 0
created_records: int = 0
updated_records: int = 0
created_financial_lines: int = 0
updated_financial_lines: int = 0
unresolved: int = 0
@dataclass(frozen=True)
class _NormalizedRecordInput:
index: int
record: SourceRecordInput
inn: str
kpp: str
ogrn: str
ogrip: str
organization_name: str
class OrganizationSourceIngestionService:
"""Persist parser outputs directly into organization source storage."""
chunk_size = 500
@classmethod
def save_records(
cls,
*,
source: str,
load_batch: int | None,
records: Iterable[SourceRecordInput],
) -> OrganizationSourceIngestionResult:
descriptor = get_source_group_descriptor(str(source))
deduplicated = cls._deduplicate_records(records)
return cls._save_records(descriptor, load_batch, deduplicated)
@staticmethod
def _deduplicate_records(
records: Iterable[SourceRecordInput],
) -> list[SourceRecordInput]:
by_external_id: dict[str, SourceRecordInput] = {}
without_external_id = []
for record in records:
external_id = str(record.external_id or "")
if not external_id:
without_external_id.append(record)
continue
by_external_id[external_id] = record
return [*by_external_id.values(), *without_external_id]
@classmethod
def _save_records(
cls,
descriptor: SourceGroupDescriptor,
load_batch: int | None,
records: list[SourceRecordInput],
) -> OrganizationSourceIngestionResult:
scanned = len(records)
if not records:
return OrganizationSourceIngestionResult()
with transaction.atomic():
normalized_records = cls._normalize_records(records)
(
organizations_by_index,
created_organizations,
) = cls._resolve_or_create_organizations(normalized_records)
del normalized_records
unresolved = scanned - len(organizations_by_index)
(
extensions_by_organization_id,
created_extensions,
updated_extensions,
) = cls._resolve_or_create_extensions(
descriptor=descriptor,
load_batch=load_batch,
organizations=organizations_by_index.values(),
)
touched_extension_ids: set[str] = set()
for organization in organizations_by_index.values():
extension = extensions_by_organization_id.get(organization.uid)
if extension is not None:
touched_extension_ids.add(str(extension.uid))
(
created_records,
updated_records,
created_financial_lines,
updated_financial_lines,
) = cls._bulk_upsert_source_records(
descriptor=descriptor,
load_batch=load_batch,
records=records,
record_inputs_with_extensions=cls._iter_record_inputs_with_extensions(
records=records,
organizations_by_index=organizations_by_index,
extensions_by_organization_id=extensions_by_organization_id,
),
)
cls._refresh_extension_counters(touched_extension_ids)
invalidate_source_data_cache()
return OrganizationSourceIngestionResult(
scanned=scanned,
created_organizations=created_organizations,
created_extensions=created_extensions,
updated_extensions=updated_extensions,
created_records=created_records,
updated_records=updated_records,
created_financial_lines=created_financial_lines,
updated_financial_lines=updated_financial_lines,
unresolved=unresolved,
)
@staticmethod
def _chunks(values: list[Any], chunk_size: int) -> Iterable[list[Any]]:
for offset in range(0, len(values), chunk_size):
yield values[offset : offset + chunk_size]
@staticmethod
def _iter_chunks(values: Iterable[Any], chunk_size: int) -> Iterator[list[Any]]:
chunk = []
for value in values:
chunk.append(value)
if len(chunk) >= chunk_size:
yield chunk
chunk = []
if chunk:
yield chunk
@staticmethod
def _iter_record_inputs_with_extensions(
*,
records: list[SourceRecordInput],
organizations_by_index: dict[int, Organization],
extensions_by_organization_id: dict[Any, OrganizationSourceExtension],
) -> Iterator[tuple[int, SourceRecordInput, OrganizationSourceExtension]]:
for index, organization in organizations_by_index.items():
extension = extensions_by_organization_id.get(organization.uid)
if extension is not None:
yield index, records[index], extension
@classmethod
def _normalize_records(
cls,
records: list[SourceRecordInput],
) -> list[_NormalizedRecordInput]:
normalized_records = []
for index, record_input in enumerate(records):
inn, kpp, ogrn, ogrip = normalize_identity_fields(
inn=record_input.inn,
kpp=record_input.kpp,
ogrn=record_input.ogrn,
ogrip=record_input.ogrip,
)
normalized_records.append(
_NormalizedRecordInput(
index=index,
record=record_input,
inn=inn,
kpp=kpp,
ogrn=ogrn,
ogrip=ogrip,
organization_name=str(record_input.organization_name or ""),
)
)
return normalized_records
@classmethod
def _resolve_or_create_organizations(
cls,
normalized_records: list[_NormalizedRecordInput],
) -> tuple[dict[int, Organization], int]:
organizations_by_index: dict[int, Organization] = {}
cls._resolve_organizations_by_inn_kpp(
normalized_records,
organizations_by_index,
)
cls._resolve_organizations_by_ogrn_or_ogrip(
normalized_records,
organizations_by_index,
)
cls._resolve_organizations_by_unique_inn(
normalized_records,
organizations_by_index,
)
cls._resolve_organizations_by_exact_name(
normalized_records,
organizations_by_index,
)
(
organizations_by_index,
created_organizations,
) = cls._create_missing_organizations(
normalized_records,
organizations_by_index,
)
cls._update_resolved_organization_identities(
normalized_records,
organizations_by_index,
)
return organizations_by_index, created_organizations
@classmethod
def _resolve_organizations_by_inn_kpp(
cls,
normalized_records: list[_NormalizedRecordInput],
organizations_by_index: dict[int, Organization],
) -> None:
keys = sorted(
{
(record.inn, record.kpp)
for record in normalized_records
if record.index not in organizations_by_index
and record.inn
and record.kpp
}
)
if not keys:
return
organizations_by_key: dict[tuple[str, str], Organization] = {}
for chunk in cls._chunks(keys, cls.chunk_size):
query = Q()
for inn, kpp in chunk:
query |= Q(inn=inn, kpp=kpp)
for organization in Organization.objects.filter(query):
organizations_by_key[
(organization.inn, organization.kpp)
] = organization
for record in normalized_records:
if record.index in organizations_by_index:
continue
organization = organizations_by_key.get((record.inn, record.kpp))
if organization is not None:
organizations_by_index[record.index] = organization
@classmethod
def _resolve_organizations_by_ogrn_or_ogrip(
cls,
normalized_records: list[_NormalizedRecordInput],
organizations_by_index: dict[int, Organization],
) -> None:
ogrn_values = sorted(
{
record.ogrn
for record in normalized_records
if record.index not in organizations_by_index and record.ogrn
}
)
ogrip_values = sorted(
{
record.ogrip
for record in normalized_records
if record.index not in organizations_by_index and record.ogrip
}
)
lookup_values = sorted({*ogrn_values, *ogrip_values})
if not lookup_values:
return
by_ogrn: dict[str, Organization] = {}
by_ogrip: dict[str, Organization] = {}
for chunk in cls._chunks(lookup_values, cls.chunk_size):
for organization in Organization.objects.filter(
Q(ogrn__in=chunk) | Q(ogrip__in=chunk)
):
if organization.ogrn:
by_ogrn[organization.ogrn] = organization
if organization.ogrip:
by_ogrip[organization.ogrip] = organization
for record in normalized_records:
if record.index in organizations_by_index:
continue
organization = by_ogrn.get(record.ogrn) or by_ogrip.get(record.ogrn)
if organization is None and record.ogrip:
organization = by_ogrip.get(record.ogrip)
if organization is not None:
organizations_by_index[record.index] = organization
@classmethod
def _resolve_organizations_by_unique_inn(
cls,
normalized_records: list[_NormalizedRecordInput],
organizations_by_index: dict[int, Organization],
) -> None:
inn_values = sorted(
{
record.inn
for record in normalized_records
if record.index not in organizations_by_index and record.inn
}
)
if not inn_values:
return
organizations_by_inn: dict[str, list[Organization]] = defaultdict(list)
for chunk in cls._chunks(inn_values, cls.chunk_size):
for organization in Organization.objects.filter(inn__in=chunk).order_by(
"inn"
):
organizations_by_inn[organization.inn].append(organization)
unique_by_inn = {
inn: organizations[0]
for inn, organizations in organizations_by_inn.items()
if len(organizations) == 1
}
for record in normalized_records:
if record.index in organizations_by_index:
continue
organization = unique_by_inn.get(record.inn)
if organization is not None:
organizations_by_index[record.index] = organization
@classmethod
def _resolve_organizations_by_exact_name(
cls,
normalized_records: list[_NormalizedRecordInput],
organizations_by_index: dict[int, Organization],
) -> None:
names = sorted(
{
record.organization_name.strip()
for record in normalized_records
if record.index not in organizations_by_index
and not cls._record_has_identity(record)
and normalize_organization_name(record.organization_name)
}
)
if not names:
return
names_lower = [name.lower() for name in names]
organizations_by_name: dict[str, list[Organization]] = defaultdict(list)
for chunk in cls._chunks(names_lower, cls.chunk_size):
for organization in Organization.objects.annotate(
name_lower=Lower("name")
).filter(name_lower__in=chunk):
organizations_by_name[organization.name.lower()].append(organization)
unique_by_name = {
name: organizations[0]
for name, organizations in organizations_by_name.items()
if len(organizations) == 1
}
for record in normalized_records:
if record.index in organizations_by_index:
continue
if cls._record_has_identity(record):
continue
organization = unique_by_name.get(record.organization_name.strip().lower())
if organization is not None:
organizations_by_index[record.index] = organization
@staticmethod
def _record_has_identity(record: _NormalizedRecordInput) -> bool:
return bool(record.inn or record.ogrn or record.ogrip)
@classmethod
def _update_resolved_organization_identities(
cls,
normalized_records: list[_NormalizedRecordInput],
organizations_by_index: dict[int, Organization],
) -> None:
safe_inn_by_organization_id = cls._safe_missing_inn_updates(
normalized_records,
organizations_by_index,
)
changed_by_uid: dict[str, Organization] = {}
for record in normalized_records:
organization = organizations_by_index.get(record.index)
if organization is None:
continue
if cls._apply_missing_identity_fields(
organization,
record,
safe_inn_by_organization_id=safe_inn_by_organization_id,
):
changed_by_uid[str(organization.uid)] = organization
if changed_by_uid:
Organization.objects.bulk_update(
list(changed_by_uid.values()),
fields=[
"name",
"inn",
"kpp",
"ogrn",
"ogrip",
"identity_status",
"primary_identity",
],
batch_size=cls.chunk_size,
)
@classmethod
def _safe_missing_inn_updates(
cls,
normalized_records: list[_NormalizedRecordInput],
organizations_by_index: dict[int, Organization],
) -> dict[str, str]:
desired_inn_by_organization_id: dict[str, str] = {}
desired_organization_ids_by_inn: dict[str, set[str]] = defaultdict(set)
for record in normalized_records:
organization = organizations_by_index.get(record.index)
if organization is None or organization.inn or not record.inn:
continue
organization_id = str(organization.uid)
desired_inn_by_organization_id.setdefault(organization_id, record.inn)
desired_organization_ids_by_inn[record.inn].add(organization_id)
conflicting_inns = set()
if desired_inn_by_organization_id:
conflicting_inns = set(
Organization.objects.filter(
inn__in=set(desired_inn_by_organization_id.values())
)
.exclude(uid__in=list(desired_inn_by_organization_id))
.values_list("inn", flat=True)
)
safe_updates = {}
for organization_id, inn in desired_inn_by_organization_id.items():
if (
len(desired_organization_ids_by_inn[inn]) == 1
and inn not in conflicting_inns
):
safe_updates[organization_id] = inn
return safe_updates
@classmethod
def _apply_missing_identity_fields(
cls,
organization: Organization,
record: _NormalizedRecordInput,
*,
safe_inn_by_organization_id: dict[str, str],
) -> bool:
changed = False
organization_id = str(organization.uid)
safe_inn = safe_inn_by_organization_id.get(organization_id)
if not organization.inn and safe_inn == record.inn:
organization.inn = record.inn
changed = True
has_entrepreneur_identity = bool(organization.ogrip or record.ogrip)
has_legal_identity = bool(
organization.kpp or organization.ogrn or record.kpp or record.ogrn
)
if not organization.kpp and record.kpp and not has_entrepreneur_identity:
organization.kpp = record.kpp
changed = True
if not organization.ogrn and record.ogrn and not has_entrepreneur_identity:
organization.ogrn = record.ogrn
changed = True
if not organization.ogrip and record.ogrip and not has_legal_identity:
organization.ogrip = record.ogrip
changed = True
if cls._should_replace_placeholder_name(organization, record.organization_name):
organization.name = record.organization_name.strip()
changed = True
if changed:
organization.identity_status = organization._resolve_identity_status()
organization.primary_identity = organization._resolve_primary_identity()
return changed
@staticmethod
def _should_replace_placeholder_name(
organization: Organization,
candidate_name: str,
) -> bool:
normalized_candidate = normalize_organization_name(candidate_name)
if not normalized_candidate:
return False
current_name = organization.name.strip()
if not normalize_organization_name(current_name):
return True
return current_name in {
organization.inn,
organization.ogrn,
organization.ogrip,
}
@classmethod
def _create_missing_organizations(
cls,
normalized_records: list[_NormalizedRecordInput],
organizations_by_index: dict[int, Organization],
) -> tuple[dict[int, Organization], int]:
indexes_by_create_key: dict[tuple[str, str], list[int]] = defaultdict(list)
organizations_by_create_key: dict[tuple[str, str], Organization] = {}
for record in normalized_records:
if record.index in organizations_by_index:
continue
name = (
record.organization_name.strip()
or str(record.record.title or "").strip()
or str(record.record.external_id or "").strip()
)
if not name:
continue
create_key = cls._organization_create_key(record, name)
indexes_by_create_key[create_key].append(record.index)
if create_key in organizations_by_create_key:
continue
organization = Organization(
name=name,
inn=record.inn,
kpp=record.kpp,
ogrn=record.ogrn,
ogrip=record.ogrip,
)
organization.identity_status = organization._resolve_identity_status()
organization.primary_identity = organization._resolve_primary_identity()
organizations_by_create_key[create_key] = organization
create_instances = list(organizations_by_create_key.values())
if not create_instances:
return organizations_by_index, 0
created_count = 0
for chunk in cls._chunks(create_instances, cls.chunk_size):
created_count += len(
Organization.objects.bulk_create(chunk, ignore_conflicts=True)
)
for create_key, organization in organizations_by_create_key.items():
persisted = cls._refetch_created_organization(organization)
if persisted is None:
continue
for index in indexes_by_create_key[create_key]:
organizations_by_index[index] = persisted
return organizations_by_index, created_count
@staticmethod
def _organization_create_key(
record: _NormalizedRecordInput,
name: str,
) -> tuple[str, str]:
if record.inn and record.kpp:
return ("inn_kpp", f"{record.inn}:{record.kpp}")
if record.ogrip:
return ("ogrip", record.ogrip)
if record.ogrn:
return ("ogrn", record.ogrn)
if record.inn:
return ("inn", record.inn)
normalized_name = normalize_organization_name(name)
if normalized_name:
return ("name", normalized_name)
return ("external_id", str(record.record.external_id or name))
@staticmethod
def _refetch_created_organization(
organization: Organization,
) -> Organization | None:
if organization.inn and organization.kpp:
return Organization.objects.filter(
inn=organization.inn,
kpp=organization.kpp,
).first()
if organization.ogrn:
return Organization.objects.filter(ogrn=organization.ogrn).first()
if organization.ogrip:
return Organization.objects.filter(ogrip=organization.ogrip).first()
if organization.inn:
matches = list(Organization.objects.filter(inn=organization.inn)[:2])
return matches[0] if len(matches) == 1 else None
return Organization.objects.filter(name__iexact=organization.name).first()
@classmethod
def _resolve_or_create_extensions(
cls,
*,
descriptor: SourceGroupDescriptor,
load_batch: int | None,
organizations: Iterable[Organization],
) -> tuple[dict[Any, OrganizationSourceExtension], int, int]:
unique_organizations = {
organization.uid: organization for organization in organizations
}
if not unique_organizations:
return {}, 0, 0
existing_extensions = {
extension.organization_id: extension
for extension in descriptor.extension_model.objects.filter(
organization_id__in=list(unique_organizations)
)
}
created_extensions = 0
extensions_by_organization_id = dict(existing_extensions)
for organization_id, organization in unique_organizations.items():
if organization_id in extensions_by_organization_id:
continue
extensions_by_organization_id[
organization_id
] = descriptor.extension_model.objects.create(
organization=organization,
source_group=descriptor.source_group,
title=descriptor.title,
last_load_batch=load_batch,
)
created_extensions += 1
changed_extension_ids = []
for extension in existing_extensions.values():
changed = False
if extension.title != descriptor.title:
extension.title = descriptor.title
changed = True
if load_batch is not None and extension.last_load_batch != load_batch:
extension.last_load_batch = load_batch
changed = True
if changed:
changed_extension_ids.append(extension.uid)
updated_extensions = 0
if changed_extension_ids:
update_kwargs = {
"title": descriptor.title,
"updated_at": timezone.now(),
}
if load_batch is not None:
update_kwargs["last_load_batch"] = load_batch
updated_extensions = descriptor.extension_model.objects.filter(
uid__in=changed_extension_ids
).update(**update_kwargs)
return extensions_by_organization_id, created_extensions, updated_extensions
@classmethod
def _bulk_upsert_source_records(
cls,
*,
descriptor: SourceGroupDescriptor,
load_batch: int | None,
records: list[SourceRecordInput],
record_inputs_with_extensions: Iterable[
tuple[int, SourceRecordInput, OrganizationSourceExtension]
],
) -> tuple[int, int, int, int]:
created_records = 0
updated_records = 0
created_financial_lines = 0
updated_financial_lines = 0
for chunk in cls._iter_chunks(record_inputs_with_extensions, cls.chunk_size):
(
source_records_by_index,
chunk_created,
chunk_updated,
) = cls._bulk_upsert_source_records_chunk(
descriptor=descriptor,
load_batch=load_batch,
record_inputs_with_extensions=chunk,
)
created_records += chunk_created
updated_records += chunk_updated
line_result = cls._save_financial_lines_for_records(
records=records,
source_records_by_index=source_records_by_index,
)
created_financial_lines += line_result[0]
updated_financial_lines += line_result[1]
source_records_by_index.clear()
return (
created_records,
updated_records,
created_financial_lines,
updated_financial_lines,
)
@classmethod
def _bulk_upsert_source_records_chunk(
cls,
*,
descriptor: SourceGroupDescriptor,
load_batch: int | None,
record_inputs_with_extensions: list[
tuple[int, SourceRecordInput, OrganizationSourceExtension]
],
) -> tuple[dict[int, OrganizationSourceRecord], int, int]:
if not record_inputs_with_extensions:
return {}, 0, 0
source = str(descriptor.source)
external_ids = [
str(record_input.external_id or "")
for _, record_input, _ in record_inputs_with_extensions
if str(record_input.external_id or "")
]
existing_by_external_id: dict[str, OrganizationSourceRecord] = {}
for source_record in OrganizationSourceRecord.objects.filter(
source=source,
external_id__in=sorted(set(external_ids)),
):
existing_by_external_id[source_record.external_id] = source_record
now = timezone.now()
create_instances: list[OrganizationSourceRecord] = []
update_instances: list[OrganizationSourceRecord] = []
source_records_by_index: dict[int, OrganizationSourceRecord] = {}
update_fields = [
"extension",
"record_type",
"title",
"record_date",
"amount",
"status",
"url",
"payload",
"load_batch",
"legacy_model",
"legacy_pk",
"updated_at",
]
for index, record_input, extension in record_inputs_with_extensions:
external_id = str(record_input.external_id or "")
defaults = {
"extension": extension,
"record_type": descriptor.record_type,
"title": str(record_input.title or ""),
"record_date": str(record_input.record_date or ""),
"amount": record_input.amount,
"status": str(record_input.status or ""),
"url": str(record_input.url or ""),
"payload": dict(record_input.payload or {}),
"load_batch": load_batch,
"legacy_model": "",
"legacy_pk": "",
"updated_at": now,
}
source_record = existing_by_external_id.get(external_id)
if source_record is None:
source_record = OrganizationSourceRecord(
source=source,
external_id=external_id,
created_at=now,
**defaults,
)
create_instances.append(source_record)
else:
for field_name, value in defaults.items():
setattr(source_record, field_name, value)
update_instances.append(source_record)
source_records_by_index[index] = source_record
created_records = 0
for chunk in cls._chunks(create_instances, cls.chunk_size):
created_records += len(OrganizationSourceRecord.objects.bulk_create(chunk))
for chunk in cls._chunks(update_instances, cls.chunk_size):
OrganizationSourceRecord.objects.bulk_update(
chunk,
fields=update_fields,
batch_size=cls.chunk_size,
)
return source_records_by_index, created_records, len(update_instances)
@classmethod
def _save_financial_lines_for_records(
cls,
*,
records: list[SourceRecordInput],
source_records_by_index: dict[int, OrganizationSourceRecord],
) -> tuple[int, int]:
created = 0
updated = 0
for index, source_record in source_records_by_index.items():
line_result = cls._save_financial_lines(
source_record,
records[index].financial_lines,
)
created += line_result[0]
updated += line_result[1]
return created, updated
@classmethod
def _resolve_or_create_organization(
cls,
record_input: SourceRecordInput,
) -> tuple[Organization | None, bool]:
inn, kpp, ogrn, ogrip = normalize_identity_fields(
inn=record_input.inn,
kpp=record_input.kpp,
ogrn=record_input.ogrn,
ogrip=record_input.ogrip,
)
organization = cls._resolve_organization(
inn=inn,
kpp=kpp,
ogrn=ogrn,
ogrip=ogrip,
organization_name=record_input.organization_name,
)
if organization is not None:
return organization, False
name = (
str(record_input.organization_name or "").strip()
or str(record_input.title or "").strip()
or str(record_input.external_id or "").strip()
)
if not name:
return None, False
return (
Organization.objects.create(
name=name,
inn=inn,
kpp=kpp,
ogrn=ogrn,
ogrip=ogrip,
),
True,
)
@classmethod
def _resolve_organization(
cls,
*,
inn: str,
kpp: str,
ogrn: str,
ogrip: str,
organization_name: str,
) -> Organization | None:
for resolver in (
cls._resolve_by_inn_kpp,
cls._resolve_by_ogrn_or_ogrip,
cls._resolve_by_ogrip,
cls._resolve_by_unique_inn,
cls._resolve_by_exact_normalized_name,
):
organization = resolver(
inn=inn,
kpp=kpp,
ogrn=ogrn,
ogrip=ogrip,
organization_name=organization_name,
)
if organization is not None:
return organization
return None
@staticmethod
def _resolve_by_inn_kpp(
*,
inn: str,
kpp: str,
**_kwargs,
) -> Organization | None:
if not inn or not kpp:
return None
return Organization.objects.filter(inn=inn, kpp=kpp).first()
@staticmethod
def _resolve_by_ogrn_or_ogrip(
*,
ogrn: str,
**_kwargs,
) -> Organization | None:
if not ogrn:
return None
return (
Organization.objects.filter(ogrn=ogrn).first()
or Organization.objects.filter(ogrip=ogrn).first()
)
@staticmethod
def _resolve_by_ogrip(
*,
ogrip: str,
**_kwargs,
) -> Organization | None:
if not ogrip:
return None
return Organization.objects.filter(ogrip=ogrip).first()
@staticmethod
def _resolve_by_unique_inn(
*,
inn: str,
**_kwargs,
) -> Organization | None:
if not inn:
return None
organizations = list(Organization.objects.filter(inn=inn)[:2])
return organizations[0] if len(organizations) == 1 else None
@staticmethod
def _resolve_by_exact_normalized_name(
*,
organization_name: str,
**_kwargs,
) -> Organization | None:
normalized_name = normalize_organization_name(organization_name)
if not normalized_name:
return None
matches = list(Organization.objects.filter(name__iexact=organization_name)[:2])
return matches[0] if len(matches) == 1 else None
@staticmethod
def _update_extension(
extension: OrganizationSourceExtension,
descriptor: SourceGroupDescriptor,
load_batch: int | None,
) -> int:
changed = False
if extension.title != descriptor.title:
extension.title = descriptor.title
changed = True
if load_batch is not None and extension.last_load_batch != load_batch:
extension.last_load_batch = load_batch
changed = True
if changed:
extension.save(update_fields=["title", "last_load_batch", "updated_at"])
return 1
return 0
@staticmethod
def _upsert_source_record(
*,
descriptor: SourceGroupDescriptor,
extension: OrganizationSourceExtension,
record_input: SourceRecordInput,
load_batch: int | None,
) -> tuple[OrganizationSourceRecord, bool]:
defaults = {
"extension": extension,
"record_type": descriptor.record_type,
"title": str(record_input.title or ""),
"record_date": str(record_input.record_date or ""),
"amount": record_input.amount,
"status": str(record_input.status or ""),
"url": str(record_input.url or ""),
"payload": dict(record_input.payload or {}),
"load_batch": load_batch,
"legacy_model": "",
"legacy_pk": "",
}
external_id = str(record_input.external_id or "")
if external_id:
return OrganizationSourceRecord.objects.update_or_create(
source=str(descriptor.source),
external_id=external_id,
defaults=defaults,
)
return (
OrganizationSourceRecord.objects.create(
source=str(descriptor.source),
external_id="",
**defaults,
),
True,
)
@staticmethod
def _save_financial_lines(
source_record: OrganizationSourceRecord,
financial_lines: Iterable[SourceFinancialLineInput],
) -> tuple[int, int]:
created = 0
updated = 0
for line in financial_lines:
_, was_created = OrganizationSourceFinancialLine.objects.update_or_create(
source_record=source_record,
form_code=str(line.form_code),
line_code=str(line.line_code),
year=line.year,
defaults={
"line_name": str(line.line_name),
"period_start": line.period_start,
"period_end": line.period_end,
},
)
if was_created:
created += 1
else:
updated += 1
return created, updated
@staticmethod
def _refresh_extension_counters(extension_ids: set[str]) -> None:
if not extension_ids:
return
aggregates = {
row["extension_id"]: row
for row in OrganizationSourceRecord.objects.filter(
extension_id__in=extension_ids
)
.values("extension_id")
.annotate(
records_count=Count("uid"),
first_seen_at=Min("created_at"),
last_seen_at=Max("created_at"),
last_load_batch=Max("load_batch"),
)
}
now = timezone.now()
extensions = list(
OrganizationSourceExtension.objects.filter(uid__in=extension_ids)
)
for extension in extensions:
aggregate = aggregates.get(extension.uid, {})
extension.records_count = aggregate.get("records_count") or 0
extension.first_seen_at = aggregate.get("first_seen_at")
extension.last_seen_at = aggregate.get("last_seen_at")
extension.last_load_batch = aggregate.get("last_load_batch")
extension.updated_at = now
OrganizationSourceExtension.objects.bulk_update(
extensions,
fields=[
"records_count",
"first_seen_at",
"last_seen_at",
"last_load_batch",
"updated_at",
],
batch_size=1000,
)