"""Services for building the canonical organizations directory.""" from __future__ import annotations import re from collections.abc import Iterable from dataclasses import dataclass from apps.parsers.models import ( VACANCY_RECORD_SOURCES, FinancialReport, GenericParserRecord, IndustrialCertificateRecord, IndustrialProductRecord, InspectionRecord, ManufacturerRecord, ParserLoadLog, ProcurementRecord, ) from django.db import transaction from django.db.models import Q from django.utils import timezone from registers.models import Organization as RegisterOrganization from organizations.api_enrichment import OrganizationApiEnrichmentService from organizations.data_sources import data_source_summary from organizations.models import Organization, OrganizationDataSnapshot _QUOTE_CHARS = "\"'«»„“”" _LEGAL_FORM_PATTERNS = ( r"\bобщество\s+с\s+ограниченной\s+ответственностью\b", r"\bооо\b", r"\booo\b", r"\bакционерное\s+общество\b", r"\bао\b", r"\bao\b", r"\bпубличное\s+акционерное\s+общество\b", r"\bпао\b", r"\bpao\b", r"\bзакрытое\s+акционерное\s+общество\b", r"\bзао\b", r"\bzao\b", r"\bиндивидуальный\s+предприниматель\b", r"\bип\b", ) _ABBREVIATED_PREFIXES = ( "ооо ", "ooo ", "ао ", "ao ", "пао ", "pao ", "зао ", "zao ", "ип ", ) @dataclass(frozen=True) class OrganizationCandidate: """Organization data extracted from an existing source table.""" name: str inn: str = "" kpp: str = "" ogrn: str = "" ogrip: str = "" @dataclass(frozen=True) class PopulateOrganizationsResult: """Result counters for organization population.""" scanned: int created: int updated: int skipped: int @dataclass(frozen=True) class RefreshOrganizationDataSnapshotsResult: """Result counters for precomputed organization API data snapshots.""" processed: int created: int updated: int @dataclass class OrganizationLookup: """In-memory indexes for matching organization candidates.""" by_inn_kpp: dict[tuple[str, str], Organization] by_ogrn_kpp: dict[tuple[str, str], Organization] by_inn: dict[str, list[Organization]] by_ogrn: dict[str, list[Organization]] by_ogrip: dict[str, Organization] by_normalized_name: dict[str, Organization] class OrganizationDataSnapshotRefreshService: """Refreshes precomputed v2 data JSON for canonical organizations.""" @classmethod def refresh( cls, *, organization_uids: Iterable[str] | None = None, batch_size: int = 100, ) -> RefreshOrganizationDataSnapshotsResult: queryset = Organization.objects.all().order_by("uid") if organization_uids is not None: queryset = queryset.filter(uid__in=list(organization_uids)) processed = 0 created = 0 updated = 0 for organizations in cls._iter_batches(queryset, batch_size): enrichment = OrganizationApiEnrichmentService.build_for(organizations) existing_snapshots = { str(snapshot.organization_id): snapshot for snapshot in OrganizationDataSnapshot.objects.filter( organization_id__in=[ organization.uid for organization in organizations ] ) } create_instances: list[OrganizationDataSnapshot] = [] update_instances: list[OrganizationDataSnapshot] = [] for organization in organizations: processed += 1 item = enrichment[str(organization.uid)] data = item.data_presence data_source_counts = data_source_summary(data) registries = [ { "id": registry.id, "name": registry.name, } for registry in item.registries ] snapshot = existing_snapshots.get(str(organization.uid)) if snapshot is None: create_instances.append( OrganizationDataSnapshot( organization=organization, data=data, data_source_counts=data_source_counts, registries=registries, ) ) continue snapshot.data = data snapshot.data_source_counts = data_source_counts snapshot.registries = registries snapshot.updated_at = timezone.now() update_instances.append(snapshot) if create_instances: OrganizationDataSnapshot.objects.bulk_create( create_instances, batch_size=batch_size, ) created += len(create_instances) if update_instances: OrganizationDataSnapshot.objects.bulk_update( update_instances, fields=["data", "data_source_counts", "registries", "updated_at"], batch_size=batch_size, ) updated += len(update_instances) return RefreshOrganizationDataSnapshotsResult( processed=processed, created=created, updated=updated, ) @classmethod def refresh_for_parser_batch( cls, *, source: str, batch_id: int, batch_size: int = 100, ) -> RefreshOrganizationDataSnapshotsResult: organization_uids = cls.organization_uids_for_parser_batch( source=source, batch_id=batch_id, ) return cls.refresh( organization_uids=organization_uids, batch_size=batch_size, ) @classmethod def organization_uids_for_parser_batch( cls, *, source: str, batch_id: int, ) -> list[str]: inn_values, ogrn_values = cls._parser_batch_identities( source=source, batch_id=batch_id, ) if not inn_values and not ogrn_values: return [] query = Q() if inn_values: query |= Q(inn__in=inn_values) if ogrn_values: query |= Q(ogrn__in=ogrn_values) | Q(ogrip__in=ogrn_values) return [ str(uid) for uid in Organization.objects.filter(query).values_list("uid", flat=True) ] @staticmethod def _parser_batch_identities( *, source: str, batch_id: int, ) -> tuple[set[str], set[str]]: if source == ParserLoadLog.Source.INDUSTRIAL: return _identity_values( IndustrialCertificateRecord.objects.filter(load_batch=batch_id), inn_field="inn", ogrn_field="ogrn", ) if source == ParserLoadLog.Source.INDUSTRIAL_PRODUCTS: return _identity_values( IndustrialProductRecord.objects.filter(load_batch=batch_id), inn_field="inn", ogrn_field="ogrn", ) if source == ParserLoadLog.Source.MANUFACTURES: return _identity_values( ManufacturerRecord.objects.filter(load_batch=batch_id), inn_field="inn", ogrn_field="ogrn", ) if source == ParserLoadLog.Source.INSPECTIONS: return _identity_values( InspectionRecord.objects.filter(load_batch=batch_id), inn_field="inn", ogrn_field="ogrn", ) if source == ParserLoadLog.Source.PROCUREMENTS: return _identity_values( ProcurementRecord.objects.filter(load_batch=batch_id), inn_field="customer_inn", ogrn_field="customer_ogrn", ) if source == ParserLoadLog.Source.FNS_REPORTS: return ( set(), set( FinancialReport.objects.filter(load_batch=batch_id) .exclude(ogrn="") .values_list("ogrn", flat=True) .distinct() ), ) if source in { ParserLoadLog.Source.PROCUREMENTS_44FZ, ParserLoadLog.Source.PROCUREMENTS_223FZ, ParserLoadLog.Source.CONTRACTS, ParserLoadLog.Source.UNFAIR_SUPPLIERS, ParserLoadLog.Source.FAS_GOZ, ParserLoadLog.Source.ARBITRATION, ParserLoadLog.Source.FEDRESURS_BANKRUPTCY, ParserLoadLog.Source.FSTEC, ParserLoadLog.Source.TRUDVSEM, }: sources = ( VACANCY_RECORD_SOURCES if source == ParserLoadLog.Source.TRUDVSEM else (source,) ) return _identity_values( GenericParserRecord.objects.filter( source__in=sources, load_batch=batch_id, ), inn_field="inn", ogrn_field="ogrn", ) return set(), set() @staticmethod def _iter_batches(queryset, batch_size: int) -> Iterable[list[Organization]]: batch: list[Organization] = [] for organization in queryset.iterator(chunk_size=batch_size): batch.append(organization) if len(batch) >= batch_size: yield batch batch = [] if batch: yield batch def _identity_values( queryset, *, inn_field: str, ogrn_field: str ) -> tuple[set[str], set[str]]: inn_values = set( queryset.exclude(**{inn_field: ""}).values_list(inn_field, flat=True).distinct() ) ogrn_values = set( queryset.exclude(**{ogrn_field: ""}) .values_list(ogrn_field, flat=True) .distinct() ) return inn_values, ogrn_values def normalize_identifier(value: str | int | None, *, max_length: int) -> str: """Return digits-only identifier bounded by the target field length.""" if value is None: return "" normalized = re.sub(r"\D+", "", str(value)) if not normalized or len(normalized) > max_length: return "" return normalized def normalize_organization_name(value: str | None) -> str: """Normalize organization names for matching spelling variants.""" if value is None: return "" normalized = str(value).strip().lower().replace("ё", "е") normalized = normalized.translate( str.maketrans({char: " " for char in _QUOTE_CHARS}) ) normalized = re.sub(r"[^\w\s-]+", " ", normalized, flags=re.UNICODE) normalized = normalized.replace("-", " ") normalized = re.sub(r"\s+", " ", normalized).strip() for pattern in _LEGAL_FORM_PATTERNS: normalized = re.sub(pattern, " ", normalized, flags=re.IGNORECASE) return re.sub(r"\s+", " ", normalized).strip() class OrganizationPopulationService: """Builds organizations from currently available source tables.""" @classmethod def populate(cls) -> PopulateOrganizationsResult: scanned = 0 created = 0 updated = 0 skipped = 0 with transaction.atomic(): existing = list(Organization.objects.all()) lookup = cls._build_lookup(existing) create_instances: list[Organization] = [] update_instances_by_uid: dict[str, Organization] = {} for candidate in cls.iter_candidates(): scanned += 1 if not normalize_organization_name(candidate.name): skipped += 1 continue organization = cls._find_existing(lookup, candidate) if organization is None: organization = Organization( name=candidate.name.strip(), inn=candidate.inn, kpp=candidate.kpp, ogrn=candidate.ogrn, ogrip=candidate.ogrip, ) existing.append(organization) create_instances.append(organization) cls._index_organization(lookup, organization) created += 1 continue if cls._assign_existing_fields(organization, candidate): cls._index_organization(lookup, organization) update_instances_by_uid[str(organization.uid)] = organization updated += 1 if create_instances: Organization.objects.bulk_create(create_instances, batch_size=1000) update_instances = list(update_instances_by_uid.values()) if update_instances: Organization.objects.bulk_update( update_instances, fields=["name", "inn", "kpp", "ogrn", "ogrip"], batch_size=1000, ) return PopulateOrganizationsResult( scanned=scanned, created=created, updated=updated, skipped=skipped, ) @classmethod def iter_candidates(cls) -> Iterable[OrganizationCandidate]: """Yield organization candidates from all current source tables.""" for row in RegisterOrganization.objects.iterator(): yield cls._candidate( name=row.pn_name, inn=row.mn_inn, kpp=row.in_kpp, ogrn=row.mn_ogrn, ) for row in IndustrialCertificateRecord.objects.iterator(): yield cls._candidate( name=row.organisation_name, inn=row.inn, ogrn=row.ogrn, ) for row in ManufacturerRecord.objects.iterator(): yield cls._candidate( name=row.full_legal_name, inn=row.inn, ogrn=row.ogrn, ) for row in IndustrialProductRecord.objects.iterator(): yield cls._candidate( name=row.full_organisation_name, inn=row.inn, ogrn=row.ogrn, ) for row in GenericParserRecord.objects.iterator(): yield cls._candidate( name=row.organisation_name or row.title, inn=row.inn, kpp=cls._payload_kpp(row.payload), ogrn=row.ogrn, ) for row in InspectionRecord.objects.iterator(): yield cls._candidate( name=row.organisation_name, inn=row.inn, ogrn=row.ogrn, ) for row in ProcurementRecord.objects.iterator(): yield cls._candidate( name=row.customer_name, inn=row.customer_inn, kpp=row.customer_kpp, ogrn=row.customer_ogrn, ) @staticmethod def _candidate( *, name: str | None, inn: str | int | None = None, kpp: str | int | None = None, ogrn: str | int | None = None, ) -> OrganizationCandidate: normalized_inn = normalize_identifier(inn, max_length=12) normalized_ogrn = normalize_identifier(ogrn, max_length=15) ogrip = ( normalized_ogrn if len(normalized_ogrn) == 15 and len(normalized_inn) == 12 else "" ) legal_ogrn = normalized_ogrn if len(normalized_ogrn) == 13 else "" return OrganizationCandidate( name=(name or "").strip(), inn=normalized_inn, kpp="" if ogrip else normalize_identifier(kpp, max_length=9), ogrn=legal_ogrn, ogrip=ogrip, ) @classmethod def _payload_kpp(cls, payload: object) -> str: if not isinstance(payload, dict): return "" company = payload.get("company") if isinstance(company, dict): company_kpp = normalize_identifier(company.get("kpp"), max_length=9) if company_kpp: return company_kpp return cls._find_payload_identifier(payload, {"kpp", "кпп"}, max_length=9) @classmethod def _find_payload_identifier( cls, value: object, keys: set[str], *, max_length: int, ) -> str: if isinstance(value, dict): for key, item in value.items(): if str(key).strip().lower() in keys: identifier = normalize_identifier(item, max_length=max_length) if identifier: return identifier nested = cls._find_payload_identifier(item, keys, max_length=max_length) if nested: return nested elif isinstance(value, list): for item in value: nested = cls._find_payload_identifier(item, keys, max_length=max_length) if nested: return nested return "" @classmethod def _build_lookup(cls, organizations: list[Organization]) -> OrganizationLookup: lookup = OrganizationLookup( by_inn_kpp={}, by_ogrn_kpp={}, by_inn={}, by_ogrn={}, by_ogrip={}, by_normalized_name={}, ) for organization in organizations: cls._index_organization(lookup, organization) return lookup @staticmethod def _index_organization( lookup: OrganizationLookup, organization: Organization, ) -> None: if organization.inn and organization.kpp: lookup.by_inn_kpp.setdefault( (organization.inn, organization.kpp), organization ) if organization.ogrn and organization.kpp: lookup.by_ogrn_kpp.setdefault( (organization.ogrn, organization.kpp), organization ) if organization.inn: lookup.by_inn.setdefault(organization.inn, []) if organization not in lookup.by_inn[organization.inn]: lookup.by_inn[organization.inn].append(organization) if organization.ogrn: lookup.by_ogrn.setdefault(organization.ogrn, []) if organization not in lookup.by_ogrn[organization.ogrn]: lookup.by_ogrn[organization.ogrn].append(organization) if organization.ogrip: lookup.by_ogrip.setdefault(organization.ogrip, organization) normalized_name = normalize_organization_name(organization.name) if normalized_name: lookup.by_normalized_name.setdefault(normalized_name, organization) @staticmethod def _find_exact_identifier_match( lookup: OrganizationLookup, candidate: OrganizationCandidate, ) -> Organization | None: if candidate.inn and candidate.kpp: organization = lookup.by_inn_kpp.get((candidate.inn, candidate.kpp)) if organization is not None: return organization if candidate.ogrn and candidate.kpp: organization = lookup.by_ogrn_kpp.get((candidate.ogrn, candidate.kpp)) if organization is not None: return organization if candidate.ogrip and candidate.ogrip in lookup.by_ogrip: return lookup.by_ogrip[candidate.ogrip] return None @staticmethod def _find_blank_kpp_match( lookup: OrganizationLookup, candidate: OrganizationCandidate, ) -> Organization | None: if candidate.inn and candidate.kpp: blank_kpp_matches = [ organization for organization in lookup.by_inn.get(candidate.inn, []) if not organization.kpp ] if len(blank_kpp_matches) == 1: return blank_kpp_matches[0] if candidate.ogrn and candidate.kpp: blank_kpp_matches = [ organization for organization in lookup.by_ogrn.get(candidate.ogrn, []) if not organization.kpp ] if len(blank_kpp_matches) == 1: return blank_kpp_matches[0] return None @staticmethod def _find_single_identifier_match( lookup: OrganizationLookup, candidate: OrganizationCandidate, ) -> Organization | None: if candidate.inn and not candidate.kpp: organizations = lookup.by_inn.get(candidate.inn, []) if len(organizations) == 1: return organizations[0] if candidate.ogrn and not candidate.kpp: organizations = lookup.by_ogrn.get(candidate.ogrn, []) if len(organizations) == 1: return organizations[0] return None @staticmethod def _find_name_match( lookup: OrganizationLookup, candidate: OrganizationCandidate, ) -> Organization | None: candidate_name = normalize_organization_name(candidate.name) if not candidate_name: return None return lookup.by_normalized_name.get(candidate_name) @classmethod def _find_existing( cls, lookup: OrganizationLookup, candidate: OrganizationCandidate, ) -> Organization | None: organization = cls._find_exact_identifier_match(lookup, candidate) if organization is not None: return organization organization = cls._find_blank_kpp_match(lookup, candidate) if organization is not None: return organization organization = cls._find_single_identifier_match(lookup, candidate) if organization is not None: return organization if candidate.kpp and (candidate.inn or candidate.ogrn): return None return cls._find_name_match(lookup, candidate) @classmethod def _update_existing( cls, organization: Organization, candidate: OrganizationCandidate, ) -> bool: if not cls._assign_existing_fields(organization, candidate): return False organization.save(update_fields=["name", "inn", "kpp", "ogrn", "ogrip"]) return True @classmethod def _assign_existing_fields( cls, organization: Organization, candidate: OrganizationCandidate, ) -> bool: changed = False selected_name = cls._select_name(organization.name, candidate.name) if selected_name != organization.name: organization.name = selected_name changed = True for field_name in ("inn", "kpp", "ogrn", "ogrip"): if getattr(organization, field_name): continue if field_name == "ogrip" and (organization.kpp or organization.ogrn): continue if field_name in {"kpp", "ogrn"} and organization.ogrip: continue candidate_value = getattr(candidate, field_name) if candidate_value: setattr(organization, field_name, candidate_value) changed = True return changed @staticmethod def _select_name(current: str, candidate: str) -> str: current_clean = current.strip() candidate_clean = candidate.strip() if not candidate_clean: return current_clean if not current_clean: return candidate_clean current_is_abbreviated = current_clean.lower().startswith(_ABBREVIATED_PREFIXES) candidate_is_abbreviated = candidate_clean.lower().startswith( _ABBREVIATED_PREFIXES ) if current_is_abbreviated and not candidate_is_abbreviated: return candidate_clean if len(candidate_clean) > len(current_clean) and not candidate_is_abbreviated: return candidate_clean return current_clean