"""Direct parser ingestion into organization source storage.""" from __future__ import annotations from collections import defaultdict from collections.abc import Iterable, Iterator from dataclasses import dataclass, field from decimal import Decimal from typing import Any from django.db import transaction from django.db.models import Count, Max, Min, Q from django.db.models.functions import Lower from django.utils import timezone from organizations.models import ( Organization, OrganizationSourceExtension, OrganizationSourceFinancialLine, OrganizationSourceRecord, ) from organizations.name_normalization import normalize_organization_name from organizations.source_cache import invalidate_source_data_cache from organizations.source_groups import ( SourceGroupDescriptor, get_source_group_descriptor, ) from organizations.source_identity import normalize_identity_fields @dataclass(frozen=True) class SourceFinancialLineInput: """Structured financial line parsed from a source record.""" form_code: str line_code: str line_name: str year: int period_start: int | None = None period_end: int | None = None @dataclass(frozen=True) class SourceRecordInput: """Normalized parser output ready for organization source storage.""" external_id: str title: str organization_name: str inn: str = "" kpp: str = "" ogrn: str = "" ogrip: str = "" record_date: str = "" amount: Decimal | None = None status: str = "" url: str = "" payload: dict[str, Any] = field(default_factory=dict) financial_lines: Iterable[SourceFinancialLineInput] = field(default_factory=tuple) @dataclass(frozen=True) class OrganizationSourceIngestionResult: """Counters returned by direct source ingestion.""" scanned: int = 0 created_organizations: int = 0 created_extensions: int = 0 updated_extensions: int = 0 created_records: int = 0 updated_records: int = 0 created_financial_lines: int = 0 updated_financial_lines: int = 0 unresolved: int = 0 @dataclass(frozen=True) class _NormalizedRecordInput: index: int record: SourceRecordInput inn: str kpp: str ogrn: str ogrip: str organization_name: str class OrganizationSourceIngestionService: """Persist parser outputs directly into organization source storage.""" chunk_size = 500 @classmethod def save_records( cls, *, source: str, load_batch: int | None, records: Iterable[SourceRecordInput], ) -> OrganizationSourceIngestionResult: descriptor = get_source_group_descriptor(str(source)) deduplicated = cls._deduplicate_records(records) return cls._save_records(descriptor, load_batch, deduplicated) @staticmethod def _deduplicate_records( records: Iterable[SourceRecordInput], ) -> list[SourceRecordInput]: by_external_id: dict[str, SourceRecordInput] = {} without_external_id = [] for record in records: external_id = str(record.external_id or "") if not external_id: without_external_id.append(record) continue by_external_id[external_id] = record return [*by_external_id.values(), *without_external_id] @classmethod def _save_records( cls, descriptor: SourceGroupDescriptor, load_batch: int | None, records: list[SourceRecordInput], ) -> OrganizationSourceIngestionResult: scanned = len(records) if not records: return OrganizationSourceIngestionResult() with transaction.atomic(): normalized_records = cls._normalize_records(records) ( organizations_by_index, created_organizations, ) = cls._resolve_or_create_organizations(normalized_records) del normalized_records unresolved = scanned - len(organizations_by_index) ( extensions_by_organization_id, created_extensions, updated_extensions, ) = cls._resolve_or_create_extensions( descriptor=descriptor, load_batch=load_batch, organizations=organizations_by_index.values(), ) touched_extension_ids: set[str] = set() for organization in organizations_by_index.values(): extension = extensions_by_organization_id.get(organization.uid) if extension is not None: touched_extension_ids.add(str(extension.uid)) ( created_records, updated_records, created_financial_lines, updated_financial_lines, ) = cls._bulk_upsert_source_records( descriptor=descriptor, load_batch=load_batch, records=records, record_inputs_with_extensions=cls._iter_record_inputs_with_extensions( records=records, organizations_by_index=organizations_by_index, extensions_by_organization_id=extensions_by_organization_id, ), ) cls._refresh_extension_counters(touched_extension_ids) invalidate_source_data_cache() return OrganizationSourceIngestionResult( scanned=scanned, created_organizations=created_organizations, created_extensions=created_extensions, updated_extensions=updated_extensions, created_records=created_records, updated_records=updated_records, created_financial_lines=created_financial_lines, updated_financial_lines=updated_financial_lines, unresolved=unresolved, ) @staticmethod def _chunks(values: list[Any], chunk_size: int) -> Iterable[list[Any]]: for offset in range(0, len(values), chunk_size): yield values[offset : offset + chunk_size] @staticmethod def _iter_chunks(values: Iterable[Any], chunk_size: int) -> Iterator[list[Any]]: chunk = [] for value in values: chunk.append(value) if len(chunk) >= chunk_size: yield chunk chunk = [] if chunk: yield chunk @staticmethod def _iter_record_inputs_with_extensions( *, records: list[SourceRecordInput], organizations_by_index: dict[int, Organization], extensions_by_organization_id: dict[Any, OrganizationSourceExtension], ) -> Iterator[tuple[int, SourceRecordInput, OrganizationSourceExtension]]: for index, organization in organizations_by_index.items(): extension = extensions_by_organization_id.get(organization.uid) if extension is not None: yield index, records[index], extension @classmethod def _normalize_records( cls, records: list[SourceRecordInput], ) -> list[_NormalizedRecordInput]: normalized_records = [] for index, record_input in enumerate(records): inn, kpp, ogrn, ogrip = normalize_identity_fields( inn=record_input.inn, kpp=record_input.kpp, ogrn=record_input.ogrn, ogrip=record_input.ogrip, ) normalized_records.append( _NormalizedRecordInput( index=index, record=record_input, inn=inn, kpp=kpp, ogrn=ogrn, ogrip=ogrip, organization_name=str(record_input.organization_name or ""), ) ) return normalized_records @classmethod def _resolve_or_create_organizations( cls, normalized_records: list[_NormalizedRecordInput], ) -> tuple[dict[int, Organization], int]: organizations_by_index: dict[int, Organization] = {} cls._resolve_organizations_by_inn_kpp( normalized_records, organizations_by_index, ) cls._resolve_organizations_by_ogrn_or_ogrip( normalized_records, organizations_by_index, ) cls._resolve_organizations_by_unique_inn( normalized_records, organizations_by_index, ) cls._resolve_organizations_by_exact_name( normalized_records, organizations_by_index, ) ( organizations_by_index, created_organizations, ) = cls._create_missing_organizations( normalized_records, organizations_by_index, ) cls._update_resolved_organization_identities( normalized_records, organizations_by_index, ) return organizations_by_index, created_organizations @classmethod def _resolve_organizations_by_inn_kpp( cls, normalized_records: list[_NormalizedRecordInput], organizations_by_index: dict[int, Organization], ) -> None: keys = sorted( { (record.inn, record.kpp) for record in normalized_records if record.index not in organizations_by_index and record.inn and record.kpp } ) if not keys: return organizations_by_key: dict[tuple[str, str], Organization] = {} for chunk in cls._chunks(keys, cls.chunk_size): query = Q() for inn, kpp in chunk: query |= Q(inn=inn, kpp=kpp) for organization in Organization.objects.filter(query): organizations_by_key[ (organization.inn, organization.kpp) ] = organization for record in normalized_records: if record.index in organizations_by_index: continue organization = organizations_by_key.get((record.inn, record.kpp)) if organization is not None: organizations_by_index[record.index] = organization @classmethod def _resolve_organizations_by_ogrn_or_ogrip( cls, normalized_records: list[_NormalizedRecordInput], organizations_by_index: dict[int, Organization], ) -> None: ogrn_values = sorted( { record.ogrn for record in normalized_records if record.index not in organizations_by_index and record.ogrn } ) ogrip_values = sorted( { record.ogrip for record in normalized_records if record.index not in organizations_by_index and record.ogrip } ) lookup_values = sorted({*ogrn_values, *ogrip_values}) if not lookup_values: return by_ogrn: dict[str, Organization] = {} by_ogrip: dict[str, Organization] = {} for chunk in cls._chunks(lookup_values, cls.chunk_size): for organization in Organization.objects.filter( Q(ogrn__in=chunk) | Q(ogrip__in=chunk) ): if organization.ogrn: by_ogrn[organization.ogrn] = organization if organization.ogrip: by_ogrip[organization.ogrip] = organization for record in normalized_records: if record.index in organizations_by_index: continue organization = by_ogrn.get(record.ogrn) or by_ogrip.get(record.ogrn) if organization is None and record.ogrip: organization = by_ogrip.get(record.ogrip) if organization is not None: organizations_by_index[record.index] = organization @classmethod def _resolve_organizations_by_unique_inn( cls, normalized_records: list[_NormalizedRecordInput], organizations_by_index: dict[int, Organization], ) -> None: inn_values = sorted( { record.inn for record in normalized_records if record.index not in organizations_by_index and record.inn } ) if not inn_values: return organizations_by_inn: dict[str, list[Organization]] = defaultdict(list) for chunk in cls._chunks(inn_values, cls.chunk_size): for organization in Organization.objects.filter(inn__in=chunk).order_by( "inn" ): organizations_by_inn[organization.inn].append(organization) unique_by_inn = { inn: organizations[0] for inn, organizations in organizations_by_inn.items() if len(organizations) == 1 } for record in normalized_records: if record.index in organizations_by_index: continue organization = unique_by_inn.get(record.inn) if organization is not None: organizations_by_index[record.index] = organization @classmethod def _resolve_organizations_by_exact_name( cls, normalized_records: list[_NormalizedRecordInput], organizations_by_index: dict[int, Organization], ) -> None: names = sorted( { record.organization_name.strip() for record in normalized_records if record.index not in organizations_by_index and not cls._record_has_identity(record) and normalize_organization_name(record.organization_name) } ) if not names: return names_lower = [name.lower() for name in names] organizations_by_name: dict[str, list[Organization]] = defaultdict(list) for chunk in cls._chunks(names_lower, cls.chunk_size): for organization in Organization.objects.annotate( name_lower=Lower("name") ).filter(name_lower__in=chunk): organizations_by_name[organization.name.lower()].append(organization) unique_by_name = { name: organizations[0] for name, organizations in organizations_by_name.items() if len(organizations) == 1 } for record in normalized_records: if record.index in organizations_by_index: continue if cls._record_has_identity(record): continue organization = unique_by_name.get(record.organization_name.strip().lower()) if organization is not None: organizations_by_index[record.index] = organization @staticmethod def _record_has_identity(record: _NormalizedRecordInput) -> bool: return bool(record.inn or record.ogrn or record.ogrip) @classmethod def _update_resolved_organization_identities( cls, normalized_records: list[_NormalizedRecordInput], organizations_by_index: dict[int, Organization], ) -> None: safe_inn_by_organization_id = cls._safe_missing_inn_updates( normalized_records, organizations_by_index, ) changed_by_uid: dict[str, Organization] = {} for record in normalized_records: organization = organizations_by_index.get(record.index) if organization is None: continue if cls._apply_missing_identity_fields( organization, record, safe_inn_by_organization_id=safe_inn_by_organization_id, ): changed_by_uid[str(organization.uid)] = organization if changed_by_uid: Organization.objects.bulk_update( list(changed_by_uid.values()), fields=[ "name", "inn", "kpp", "ogrn", "ogrip", "identity_status", "primary_identity", ], batch_size=cls.chunk_size, ) @classmethod def _safe_missing_inn_updates( cls, normalized_records: list[_NormalizedRecordInput], organizations_by_index: dict[int, Organization], ) -> dict[str, str]: desired_inn_by_organization_id: dict[str, str] = {} desired_organization_ids_by_inn: dict[str, set[str]] = defaultdict(set) for record in normalized_records: organization = organizations_by_index.get(record.index) if organization is None or organization.inn or not record.inn: continue organization_id = str(organization.uid) desired_inn_by_organization_id.setdefault(organization_id, record.inn) desired_organization_ids_by_inn[record.inn].add(organization_id) conflicting_inns = set() if desired_inn_by_organization_id: conflicting_inns = set( Organization.objects.filter( inn__in=set(desired_inn_by_organization_id.values()) ) .exclude(uid__in=list(desired_inn_by_organization_id)) .values_list("inn", flat=True) ) safe_updates = {} for organization_id, inn in desired_inn_by_organization_id.items(): if ( len(desired_organization_ids_by_inn[inn]) == 1 and inn not in conflicting_inns ): safe_updates[organization_id] = inn return safe_updates @classmethod def _apply_missing_identity_fields( cls, organization: Organization, record: _NormalizedRecordInput, *, safe_inn_by_organization_id: dict[str, str], ) -> bool: changed = False organization_id = str(organization.uid) safe_inn = safe_inn_by_organization_id.get(organization_id) if not organization.inn and safe_inn == record.inn: organization.inn = record.inn changed = True has_entrepreneur_identity = bool(organization.ogrip or record.ogrip) has_legal_identity = bool( organization.kpp or organization.ogrn or record.kpp or record.ogrn ) if not organization.kpp and record.kpp and not has_entrepreneur_identity: organization.kpp = record.kpp changed = True if not organization.ogrn and record.ogrn and not has_entrepreneur_identity: organization.ogrn = record.ogrn changed = True if not organization.ogrip and record.ogrip and not has_legal_identity: organization.ogrip = record.ogrip changed = True if cls._should_replace_placeholder_name(organization, record.organization_name): organization.name = record.organization_name.strip() changed = True if changed: organization.identity_status = organization._resolve_identity_status() organization.primary_identity = organization._resolve_primary_identity() return changed @staticmethod def _should_replace_placeholder_name( organization: Organization, candidate_name: str, ) -> bool: normalized_candidate = normalize_organization_name(candidate_name) if not normalized_candidate: return False current_name = organization.name.strip() if not normalize_organization_name(current_name): return True return current_name in { organization.inn, organization.ogrn, organization.ogrip, } @classmethod def _create_missing_organizations( cls, normalized_records: list[_NormalizedRecordInput], organizations_by_index: dict[int, Organization], ) -> tuple[dict[int, Organization], int]: indexes_by_create_key: dict[tuple[str, str], list[int]] = defaultdict(list) organizations_by_create_key: dict[tuple[str, str], Organization] = {} for record in normalized_records: if record.index in organizations_by_index: continue name = ( record.organization_name.strip() or str(record.record.title or "").strip() or str(record.record.external_id or "").strip() ) if not name: continue create_key = cls._organization_create_key(record, name) indexes_by_create_key[create_key].append(record.index) if create_key in organizations_by_create_key: continue organization = Organization( name=name, inn=record.inn, kpp=record.kpp, ogrn=record.ogrn, ogrip=record.ogrip, ) organization.identity_status = organization._resolve_identity_status() organization.primary_identity = organization._resolve_primary_identity() organizations_by_create_key[create_key] = organization create_instances = list(organizations_by_create_key.values()) if not create_instances: return organizations_by_index, 0 created_count = 0 for chunk in cls._chunks(create_instances, cls.chunk_size): created_count += len( Organization.objects.bulk_create(chunk, ignore_conflicts=True) ) for create_key, organization in organizations_by_create_key.items(): persisted = cls._refetch_created_organization(organization) if persisted is None: continue for index in indexes_by_create_key[create_key]: organizations_by_index[index] = persisted return organizations_by_index, created_count @staticmethod def _organization_create_key( record: _NormalizedRecordInput, name: str, ) -> tuple[str, str]: if record.inn and record.kpp: return ("inn_kpp", f"{record.inn}:{record.kpp}") if record.ogrip: return ("ogrip", record.ogrip) if record.ogrn: return ("ogrn", record.ogrn) if record.inn: return ("inn", record.inn) normalized_name = normalize_organization_name(name) if normalized_name: return ("name", normalized_name) return ("external_id", str(record.record.external_id or name)) @staticmethod def _refetch_created_organization( organization: Organization, ) -> Organization | None: if organization.inn and organization.kpp: return Organization.objects.filter( inn=organization.inn, kpp=organization.kpp, ).first() if organization.ogrn: return Organization.objects.filter(ogrn=organization.ogrn).first() if organization.ogrip: return Organization.objects.filter(ogrip=organization.ogrip).first() if organization.inn: matches = list(Organization.objects.filter(inn=organization.inn)[:2]) return matches[0] if len(matches) == 1 else None return Organization.objects.filter(name__iexact=organization.name).first() @classmethod def _resolve_or_create_extensions( cls, *, descriptor: SourceGroupDescriptor, load_batch: int | None, organizations: Iterable[Organization], ) -> tuple[dict[Any, OrganizationSourceExtension], int, int]: unique_organizations = { organization.uid: organization for organization in organizations } if not unique_organizations: return {}, 0, 0 existing_extensions = { extension.organization_id: extension for extension in descriptor.extension_model.objects.filter( organization_id__in=list(unique_organizations) ) } created_extensions = 0 extensions_by_organization_id = dict(existing_extensions) for organization_id, organization in unique_organizations.items(): if organization_id in extensions_by_organization_id: continue extensions_by_organization_id[ organization_id ] = descriptor.extension_model.objects.create( organization=organization, source_group=descriptor.source_group, title=descriptor.title, last_load_batch=load_batch, ) created_extensions += 1 changed_extension_ids = [] for extension in existing_extensions.values(): changed = False if extension.title != descriptor.title: extension.title = descriptor.title changed = True if load_batch is not None and extension.last_load_batch != load_batch: extension.last_load_batch = load_batch changed = True if changed: changed_extension_ids.append(extension.uid) updated_extensions = 0 if changed_extension_ids: update_kwargs = { "title": descriptor.title, "updated_at": timezone.now(), } if load_batch is not None: update_kwargs["last_load_batch"] = load_batch updated_extensions = descriptor.extension_model.objects.filter( uid__in=changed_extension_ids ).update(**update_kwargs) return extensions_by_organization_id, created_extensions, updated_extensions @classmethod def _bulk_upsert_source_records( cls, *, descriptor: SourceGroupDescriptor, load_batch: int | None, records: list[SourceRecordInput], record_inputs_with_extensions: Iterable[ tuple[int, SourceRecordInput, OrganizationSourceExtension] ], ) -> tuple[int, int, int, int]: created_records = 0 updated_records = 0 created_financial_lines = 0 updated_financial_lines = 0 for chunk in cls._iter_chunks(record_inputs_with_extensions, cls.chunk_size): ( source_records_by_index, chunk_created, chunk_updated, ) = cls._bulk_upsert_source_records_chunk( descriptor=descriptor, load_batch=load_batch, record_inputs_with_extensions=chunk, ) created_records += chunk_created updated_records += chunk_updated line_result = cls._save_financial_lines_for_records( records=records, source_records_by_index=source_records_by_index, ) created_financial_lines += line_result[0] updated_financial_lines += line_result[1] source_records_by_index.clear() return ( created_records, updated_records, created_financial_lines, updated_financial_lines, ) @classmethod def _bulk_upsert_source_records_chunk( cls, *, descriptor: SourceGroupDescriptor, load_batch: int | None, record_inputs_with_extensions: list[ tuple[int, SourceRecordInput, OrganizationSourceExtension] ], ) -> tuple[dict[int, OrganizationSourceRecord], int, int]: if not record_inputs_with_extensions: return {}, 0, 0 source = str(descriptor.source) external_ids = [ str(record_input.external_id or "") for _, record_input, _ in record_inputs_with_extensions if str(record_input.external_id or "") ] existing_by_external_id: dict[str, OrganizationSourceRecord] = {} for source_record in OrganizationSourceRecord.objects.filter( source=source, external_id__in=sorted(set(external_ids)), ): existing_by_external_id[source_record.external_id] = source_record now = timezone.now() create_instances: list[OrganizationSourceRecord] = [] update_instances: list[OrganizationSourceRecord] = [] source_records_by_index: dict[int, OrganizationSourceRecord] = {} update_fields = [ "extension", "record_type", "title", "record_date", "amount", "status", "url", "payload", "load_batch", "legacy_model", "legacy_pk", "updated_at", ] for index, record_input, extension in record_inputs_with_extensions: external_id = str(record_input.external_id or "") defaults = { "extension": extension, "record_type": descriptor.record_type, "title": str(record_input.title or ""), "record_date": str(record_input.record_date or ""), "amount": record_input.amount, "status": str(record_input.status or ""), "url": str(record_input.url or ""), "payload": dict(record_input.payload or {}), "load_batch": load_batch, "legacy_model": "", "legacy_pk": "", "updated_at": now, } source_record = existing_by_external_id.get(external_id) if source_record is None: source_record = OrganizationSourceRecord( source=source, external_id=external_id, created_at=now, **defaults, ) create_instances.append(source_record) else: for field_name, value in defaults.items(): setattr(source_record, field_name, value) update_instances.append(source_record) source_records_by_index[index] = source_record created_records = 0 for chunk in cls._chunks(create_instances, cls.chunk_size): created_records += len(OrganizationSourceRecord.objects.bulk_create(chunk)) for chunk in cls._chunks(update_instances, cls.chunk_size): OrganizationSourceRecord.objects.bulk_update( chunk, fields=update_fields, batch_size=cls.chunk_size, ) return source_records_by_index, created_records, len(update_instances) @classmethod def _save_financial_lines_for_records( cls, *, records: list[SourceRecordInput], source_records_by_index: dict[int, OrganizationSourceRecord], ) -> tuple[int, int]: created = 0 updated = 0 for index, source_record in source_records_by_index.items(): line_result = cls._save_financial_lines( source_record, records[index].financial_lines, ) created += line_result[0] updated += line_result[1] return created, updated @classmethod def _resolve_or_create_organization( cls, record_input: SourceRecordInput, ) -> tuple[Organization | None, bool]: inn, kpp, ogrn, ogrip = normalize_identity_fields( inn=record_input.inn, kpp=record_input.kpp, ogrn=record_input.ogrn, ogrip=record_input.ogrip, ) organization = cls._resolve_organization( inn=inn, kpp=kpp, ogrn=ogrn, ogrip=ogrip, organization_name=record_input.organization_name, ) if organization is not None: return organization, False name = ( str(record_input.organization_name or "").strip() or str(record_input.title or "").strip() or str(record_input.external_id or "").strip() ) if not name: return None, False return ( Organization.objects.create( name=name, inn=inn, kpp=kpp, ogrn=ogrn, ogrip=ogrip, ), True, ) @classmethod def _resolve_organization( cls, *, inn: str, kpp: str, ogrn: str, ogrip: str, organization_name: str, ) -> Organization | None: for resolver in ( cls._resolve_by_inn_kpp, cls._resolve_by_ogrn_or_ogrip, cls._resolve_by_ogrip, cls._resolve_by_unique_inn, cls._resolve_by_exact_normalized_name, ): organization = resolver( inn=inn, kpp=kpp, ogrn=ogrn, ogrip=ogrip, organization_name=organization_name, ) if organization is not None: return organization return None @staticmethod def _resolve_by_inn_kpp( *, inn: str, kpp: str, **_kwargs, ) -> Organization | None: if not inn or not kpp: return None return Organization.objects.filter(inn=inn, kpp=kpp).first() @staticmethod def _resolve_by_ogrn_or_ogrip( *, ogrn: str, **_kwargs, ) -> Organization | None: if not ogrn: return None return ( Organization.objects.filter(ogrn=ogrn).first() or Organization.objects.filter(ogrip=ogrn).first() ) @staticmethod def _resolve_by_ogrip( *, ogrip: str, **_kwargs, ) -> Organization | None: if not ogrip: return None return Organization.objects.filter(ogrip=ogrip).first() @staticmethod def _resolve_by_unique_inn( *, inn: str, **_kwargs, ) -> Organization | None: if not inn: return None organizations = list(Organization.objects.filter(inn=inn)[:2]) return organizations[0] if len(organizations) == 1 else None @staticmethod def _resolve_by_exact_normalized_name( *, organization_name: str, **_kwargs, ) -> Organization | None: normalized_name = normalize_organization_name(organization_name) if not normalized_name: return None matches = list(Organization.objects.filter(name__iexact=organization_name)[:2]) return matches[0] if len(matches) == 1 else None @staticmethod def _update_extension( extension: OrganizationSourceExtension, descriptor: SourceGroupDescriptor, load_batch: int | None, ) -> int: changed = False if extension.title != descriptor.title: extension.title = descriptor.title changed = True if load_batch is not None and extension.last_load_batch != load_batch: extension.last_load_batch = load_batch changed = True if changed: extension.save(update_fields=["title", "last_load_batch", "updated_at"]) return 1 return 0 @staticmethod def _upsert_source_record( *, descriptor: SourceGroupDescriptor, extension: OrganizationSourceExtension, record_input: SourceRecordInput, load_batch: int | None, ) -> tuple[OrganizationSourceRecord, bool]: defaults = { "extension": extension, "record_type": descriptor.record_type, "title": str(record_input.title or ""), "record_date": str(record_input.record_date or ""), "amount": record_input.amount, "status": str(record_input.status or ""), "url": str(record_input.url or ""), "payload": dict(record_input.payload or {}), "load_batch": load_batch, "legacy_model": "", "legacy_pk": "", } external_id = str(record_input.external_id or "") if external_id: return OrganizationSourceRecord.objects.update_or_create( source=str(descriptor.source), external_id=external_id, defaults=defaults, ) return ( OrganizationSourceRecord.objects.create( source=str(descriptor.source), external_id="", **defaults, ), True, ) @staticmethod def _save_financial_lines( source_record: OrganizationSourceRecord, financial_lines: Iterable[SourceFinancialLineInput], ) -> tuple[int, int]: created = 0 updated = 0 for line in financial_lines: _, was_created = OrganizationSourceFinancialLine.objects.update_or_create( source_record=source_record, form_code=str(line.form_code), line_code=str(line.line_code), year=line.year, defaults={ "line_name": str(line.line_name), "period_start": line.period_start, "period_end": line.period_end, }, ) if was_created: created += 1 else: updated += 1 return created, updated @staticmethod def _refresh_extension_counters(extension_ids: set[str]) -> None: if not extension_ids: return aggregates = { row["extension_id"]: row for row in OrganizationSourceRecord.objects.filter( extension_id__in=extension_ids ) .values("extension_id") .annotate( records_count=Count("uid"), first_seen_at=Min("created_at"), last_seen_at=Max("created_at"), last_load_batch=Max("load_batch"), ) } now = timezone.now() extensions = list( OrganizationSourceExtension.objects.filter(uid__in=extension_ids) ) for extension in extensions: aggregate = aggregates.get(extension.uid, {}) extension.records_count = aggregate.get("records_count") or 0 extension.first_seen_at = aggregate.get("first_seen_at") extension.last_seen_at = aggregate.get("last_seen_at") extension.last_load_batch = aggregate.get("last_load_batch") extension.updated_at = now OrganizationSourceExtension.objects.bulk_update( extensions, fields=[ "records_count", "first_seen_at", "last_seen_at", "last_load_batch", "updated_at", ], batch_size=1000, )