Add organizations v2 API and registry enrichment
All checks were successful
CI/CD Pipeline / Quality Gate (push) Successful in 26s
CI/CD Pipeline / Build and Push Images (push) Successful in 6s
CI/CD Pipeline / Internal Notify (push) Successful in 0s
CI/CD Pipeline / Deploy Dev in Dokploy (push) Successful in 1s

This commit is contained in:
2026-05-06 19:04:46 +02:00
parent f54aa4cb0b
commit 0f17ff6773
62 changed files with 10311 additions and 430 deletions

View File

@@ -0,0 +1,703 @@
"""Services for building the canonical organizations directory."""
from __future__ import annotations
import re
from collections.abc import Iterable
from dataclasses import dataclass
from apps.parsers.models import (
FinancialReport,
GenericParserRecord,
IndustrialCertificateRecord,
IndustrialProductRecord,
InspectionRecord,
ManufacturerRecord,
ParserLoadLog,
ProcurementRecord,
)
from django.db import transaction
from django.db.models import Q
from django.utils import timezone
from registers.models import Organization as RegisterOrganization
from organizations.api_enrichment import OrganizationApiEnrichmentService
from organizations.models import Organization, OrganizationDataSnapshot
_QUOTE_CHARS = "\"'«»„“”"
_LEGAL_FORM_PATTERNS = (
r"\bобщество\s+с\s+ограниченной\s+ответственностью\b",
r"\bооо\b",
r"\booo\b",
r"\bакционерное\s+общество\b",
r"\bао\b",
r"\bao\b",
r"\bпубличное\s+акционерное\s+общество\b",
r"\bпао\b",
r"\bpao\b",
r"\акрытое\s+акционерное\s+общество\b",
r"\ао\b",
r"\bzao\b",
r"\bиндивидуальный\s+предприниматель\b",
r"\bип\b",
)
_ABBREVIATED_PREFIXES = (
"ооо ",
"ooo ",
"ао ",
"ao ",
"пао ",
"pao ",
"зао ",
"zao ",
"ип ",
)
@dataclass(frozen=True)
class OrganizationCandidate:
"""Organization data extracted from an existing source table."""
name: str
inn: str = ""
kpp: str = ""
ogrn: str = ""
ogrip: str = ""
@dataclass(frozen=True)
class PopulateOrganizationsResult:
"""Result counters for organization population."""
scanned: int
created: int
updated: int
skipped: int
@dataclass(frozen=True)
class RefreshOrganizationDataSnapshotsResult:
"""Result counters for precomputed organization API data snapshots."""
processed: int
created: int
updated: int
@dataclass
class OrganizationLookup:
"""In-memory indexes for matching organization candidates."""
by_inn_kpp: dict[tuple[str, str], Organization]
by_ogrn_kpp: dict[tuple[str, str], Organization]
by_inn: dict[str, list[Organization]]
by_ogrn: dict[str, list[Organization]]
by_ogrip: dict[str, Organization]
by_normalized_name: dict[str, Organization]
class OrganizationDataSnapshotRefreshService:
"""Refreshes precomputed v2 data JSON for canonical organizations."""
@classmethod
def refresh(
cls,
*,
organization_uids: Iterable[str] | None = None,
batch_size: int = 100,
) -> RefreshOrganizationDataSnapshotsResult:
queryset = Organization.objects.all().order_by("uid")
if organization_uids is not None:
queryset = queryset.filter(uid__in=list(organization_uids))
processed = 0
created = 0
updated = 0
for organizations in cls._iter_batches(queryset, batch_size):
enrichment = OrganizationApiEnrichmentService.build_for(organizations)
existing_snapshots = {
str(snapshot.organization_id): snapshot
for snapshot in OrganizationDataSnapshot.objects.filter(
organization_id__in=[
organization.uid for organization in organizations
]
)
}
create_instances: list[OrganizationDataSnapshot] = []
update_instances: list[OrganizationDataSnapshot] = []
for organization in organizations:
processed += 1
item = enrichment[str(organization.uid)]
data = item.data_presence
registries = [
{
"id": registry.id,
"name": registry.name,
}
for registry in item.registries
]
snapshot = existing_snapshots.get(str(organization.uid))
if snapshot is None:
create_instances.append(
OrganizationDataSnapshot(
organization=organization,
data=data,
registries=registries,
)
)
continue
snapshot.data = data
snapshot.registries = registries
snapshot.updated_at = timezone.now()
update_instances.append(snapshot)
if create_instances:
OrganizationDataSnapshot.objects.bulk_create(
create_instances,
batch_size=batch_size,
)
created += len(create_instances)
if update_instances:
OrganizationDataSnapshot.objects.bulk_update(
update_instances,
fields=["data", "registries", "updated_at"],
batch_size=batch_size,
)
updated += len(update_instances)
return RefreshOrganizationDataSnapshotsResult(
processed=processed,
created=created,
updated=updated,
)
@classmethod
def refresh_for_parser_batch(
cls,
*,
source: str,
batch_id: int,
batch_size: int = 100,
) -> RefreshOrganizationDataSnapshotsResult:
organization_uids = cls.organization_uids_for_parser_batch(
source=source,
batch_id=batch_id,
)
return cls.refresh(
organization_uids=organization_uids,
batch_size=batch_size,
)
@classmethod
def organization_uids_for_parser_batch(
cls,
*,
source: str,
batch_id: int,
) -> list[str]:
inn_values, ogrn_values = cls._parser_batch_identities(
source=source,
batch_id=batch_id,
)
if not inn_values and not ogrn_values:
return []
query = Q()
if inn_values:
query |= Q(inn__in=inn_values)
if ogrn_values:
query |= Q(ogrn__in=ogrn_values) | Q(ogrip__in=ogrn_values)
return [
str(uid)
for uid in Organization.objects.filter(query).values_list("uid", flat=True)
]
@staticmethod
def _parser_batch_identities(
*,
source: str,
batch_id: int,
) -> tuple[set[str], set[str]]:
if source == ParserLoadLog.Source.INDUSTRIAL:
return _identity_values(
IndustrialCertificateRecord.objects.filter(load_batch=batch_id),
inn_field="inn",
ogrn_field="ogrn",
)
if source == ParserLoadLog.Source.INDUSTRIAL_PRODUCTS:
return _identity_values(
IndustrialProductRecord.objects.filter(load_batch=batch_id),
inn_field="inn",
ogrn_field="ogrn",
)
if source == ParserLoadLog.Source.MANUFACTURES:
return _identity_values(
ManufacturerRecord.objects.filter(load_batch=batch_id),
inn_field="inn",
ogrn_field="ogrn",
)
if source == ParserLoadLog.Source.INSPECTIONS:
return _identity_values(
InspectionRecord.objects.filter(load_batch=batch_id),
inn_field="inn",
ogrn_field="ogrn",
)
if source == ParserLoadLog.Source.PROCUREMENTS:
return _identity_values(
ProcurementRecord.objects.filter(load_batch=batch_id),
inn_field="customer_inn",
ogrn_field="customer_ogrn",
)
if source == ParserLoadLog.Source.FNS_REPORTS:
return (
set(),
set(
FinancialReport.objects.filter(load_batch=batch_id)
.exclude(ogrn="")
.values_list("ogrn", flat=True)
.distinct()
),
)
if source in {
ParserLoadLog.Source.PROCUREMENTS_44FZ,
ParserLoadLog.Source.PROCUREMENTS_223FZ,
ParserLoadLog.Source.CONTRACTS,
ParserLoadLog.Source.UNFAIR_SUPPLIERS,
ParserLoadLog.Source.FAS_GOZ,
ParserLoadLog.Source.ARBITRATION,
ParserLoadLog.Source.FEDRESURS_BANKRUPTCY,
ParserLoadLog.Source.FSTEC,
ParserLoadLog.Source.TRUDVSEM,
}:
return _identity_values(
GenericParserRecord.objects.filter(source=source, load_batch=batch_id),
inn_field="inn",
ogrn_field="ogrn",
)
return set(), set()
@staticmethod
def _iter_batches(queryset, batch_size: int) -> Iterable[list[Organization]]:
batch: list[Organization] = []
for organization in queryset.iterator(chunk_size=batch_size):
batch.append(organization)
if len(batch) >= batch_size:
yield batch
batch = []
if batch:
yield batch
def _identity_values(
queryset, *, inn_field: str, ogrn_field: str
) -> tuple[set[str], set[str]]:
inn_values = set(
queryset.exclude(**{inn_field: ""}).values_list(inn_field, flat=True).distinct()
)
ogrn_values = set(
queryset.exclude(**{ogrn_field: ""})
.values_list(ogrn_field, flat=True)
.distinct()
)
return inn_values, ogrn_values
def normalize_identifier(value: str | int | None, *, max_length: int) -> str:
"""Return digits-only identifier bounded by the target field length."""
if value is None:
return ""
normalized = re.sub(r"\D+", "", str(value))
if not normalized or len(normalized) > max_length:
return ""
return normalized
def normalize_organization_name(value: str | None) -> str:
"""Normalize organization names for matching spelling variants."""
if value is None:
return ""
normalized = str(value).strip().lower().replace("ё", "е")
normalized = normalized.translate(
str.maketrans({char: " " for char in _QUOTE_CHARS})
)
normalized = re.sub(r"[^\w\s-]+", " ", normalized, flags=re.UNICODE)
normalized = normalized.replace("-", " ")
normalized = re.sub(r"\s+", " ", normalized).strip()
for pattern in _LEGAL_FORM_PATTERNS:
normalized = re.sub(pattern, " ", normalized, flags=re.IGNORECASE)
return re.sub(r"\s+", " ", normalized).strip()
class OrganizationPopulationService:
"""Builds organizations from currently available source tables."""
@classmethod
def populate(cls) -> PopulateOrganizationsResult:
scanned = 0
created = 0
updated = 0
skipped = 0
with transaction.atomic():
existing = list(Organization.objects.all())
lookup = cls._build_lookup(existing)
create_instances: list[Organization] = []
update_instances_by_uid: dict[str, Organization] = {}
for candidate in cls.iter_candidates():
scanned += 1
if not normalize_organization_name(candidate.name):
skipped += 1
continue
organization = cls._find_existing(lookup, candidate)
if organization is None:
organization = Organization(
name=candidate.name.strip(),
inn=candidate.inn,
kpp=candidate.kpp,
ogrn=candidate.ogrn,
ogrip=candidate.ogrip,
)
existing.append(organization)
create_instances.append(organization)
cls._index_organization(lookup, organization)
created += 1
continue
if cls._assign_existing_fields(organization, candidate):
cls._index_organization(lookup, organization)
update_instances_by_uid[str(organization.uid)] = organization
updated += 1
if create_instances:
Organization.objects.bulk_create(create_instances, batch_size=1000)
update_instances = list(update_instances_by_uid.values())
if update_instances:
Organization.objects.bulk_update(
update_instances,
fields=["name", "inn", "kpp", "ogrn", "ogrip"],
batch_size=1000,
)
return PopulateOrganizationsResult(
scanned=scanned,
created=created,
updated=updated,
skipped=skipped,
)
@classmethod
def iter_candidates(cls) -> Iterable[OrganizationCandidate]:
"""Yield organization candidates from all current source tables."""
for row in RegisterOrganization.objects.iterator():
yield cls._candidate(
name=row.pn_name,
inn=row.mn_inn,
kpp=row.in_kpp,
ogrn=row.mn_ogrn,
)
for row in IndustrialCertificateRecord.objects.iterator():
yield cls._candidate(
name=row.organisation_name,
inn=row.inn,
ogrn=row.ogrn,
)
for row in ManufacturerRecord.objects.iterator():
yield cls._candidate(
name=row.full_legal_name,
inn=row.inn,
ogrn=row.ogrn,
)
for row in IndustrialProductRecord.objects.iterator():
yield cls._candidate(
name=row.full_organisation_name,
inn=row.inn,
ogrn=row.ogrn,
)
for row in GenericParserRecord.objects.iterator():
yield cls._candidate(
name=row.organisation_name or row.title,
inn=row.inn,
kpp=cls._payload_kpp(row.payload),
ogrn=row.ogrn,
)
for row in InspectionRecord.objects.iterator():
yield cls._candidate(
name=row.organisation_name,
inn=row.inn,
ogrn=row.ogrn,
)
for row in ProcurementRecord.objects.iterator():
yield cls._candidate(
name=row.customer_name,
inn=row.customer_inn,
kpp=row.customer_kpp,
ogrn=row.customer_ogrn,
)
@staticmethod
def _candidate(
*,
name: str | None,
inn: str | int | None = None,
kpp: str | int | None = None,
ogrn: str | int | None = None,
) -> OrganizationCandidate:
normalized_inn = normalize_identifier(inn, max_length=12)
normalized_ogrn = normalize_identifier(ogrn, max_length=15)
ogrip = (
normalized_ogrn
if len(normalized_ogrn) == 15 and len(normalized_inn) == 12
else ""
)
legal_ogrn = normalized_ogrn if len(normalized_ogrn) == 13 else ""
return OrganizationCandidate(
name=(name or "").strip(),
inn=normalized_inn,
kpp="" if ogrip else normalize_identifier(kpp, max_length=9),
ogrn=legal_ogrn,
ogrip=ogrip,
)
@classmethod
def _payload_kpp(cls, payload: object) -> str:
if not isinstance(payload, dict):
return ""
company = payload.get("company")
if isinstance(company, dict):
company_kpp = normalize_identifier(company.get("kpp"), max_length=9)
if company_kpp:
return company_kpp
return cls._find_payload_identifier(payload, {"kpp", "кпп"}, max_length=9)
@classmethod
def _find_payload_identifier(
cls,
value: object,
keys: set[str],
*,
max_length: int,
) -> str:
if isinstance(value, dict):
for key, item in value.items():
if str(key).strip().lower() in keys:
identifier = normalize_identifier(item, max_length=max_length)
if identifier:
return identifier
nested = cls._find_payload_identifier(item, keys, max_length=max_length)
if nested:
return nested
elif isinstance(value, list):
for item in value:
nested = cls._find_payload_identifier(item, keys, max_length=max_length)
if nested:
return nested
return ""
@classmethod
def _build_lookup(cls, organizations: list[Organization]) -> OrganizationLookup:
lookup = OrganizationLookup(
by_inn_kpp={},
by_ogrn_kpp={},
by_inn={},
by_ogrn={},
by_ogrip={},
by_normalized_name={},
)
for organization in organizations:
cls._index_organization(lookup, organization)
return lookup
@staticmethod
def _index_organization(
lookup: OrganizationLookup,
organization: Organization,
) -> None:
if organization.inn and organization.kpp:
lookup.by_inn_kpp.setdefault(
(organization.inn, organization.kpp), organization
)
if organization.ogrn and organization.kpp:
lookup.by_ogrn_kpp.setdefault(
(organization.ogrn, organization.kpp), organization
)
if organization.inn:
lookup.by_inn.setdefault(organization.inn, [])
if organization not in lookup.by_inn[organization.inn]:
lookup.by_inn[organization.inn].append(organization)
if organization.ogrn:
lookup.by_ogrn.setdefault(organization.ogrn, [])
if organization not in lookup.by_ogrn[organization.ogrn]:
lookup.by_ogrn[organization.ogrn].append(organization)
if organization.ogrip:
lookup.by_ogrip.setdefault(organization.ogrip, organization)
normalized_name = normalize_organization_name(organization.name)
if normalized_name:
lookup.by_normalized_name.setdefault(normalized_name, organization)
@staticmethod
def _find_exact_identifier_match(
lookup: OrganizationLookup,
candidate: OrganizationCandidate,
) -> Organization | None:
if candidate.inn and candidate.kpp:
organization = lookup.by_inn_kpp.get((candidate.inn, candidate.kpp))
if organization is not None:
return organization
if candidate.ogrn and candidate.kpp:
organization = lookup.by_ogrn_kpp.get((candidate.ogrn, candidate.kpp))
if organization is not None:
return organization
if candidate.ogrip and candidate.ogrip in lookup.by_ogrip:
return lookup.by_ogrip[candidate.ogrip]
return None
@staticmethod
def _find_blank_kpp_match(
lookup: OrganizationLookup,
candidate: OrganizationCandidate,
) -> Organization | None:
if candidate.inn and candidate.kpp:
blank_kpp_matches = [
organization
for organization in lookup.by_inn.get(candidate.inn, [])
if not organization.kpp
]
if len(blank_kpp_matches) == 1:
return blank_kpp_matches[0]
if candidate.ogrn and candidate.kpp:
blank_kpp_matches = [
organization
for organization in lookup.by_ogrn.get(candidate.ogrn, [])
if not organization.kpp
]
if len(blank_kpp_matches) == 1:
return blank_kpp_matches[0]
return None
@staticmethod
def _find_single_identifier_match(
lookup: OrganizationLookup,
candidate: OrganizationCandidate,
) -> Organization | None:
if candidate.inn and not candidate.kpp:
organizations = lookup.by_inn.get(candidate.inn, [])
if len(organizations) == 1:
return organizations[0]
if candidate.ogrn and not candidate.kpp:
organizations = lookup.by_ogrn.get(candidate.ogrn, [])
if len(organizations) == 1:
return organizations[0]
return None
@staticmethod
def _find_name_match(
lookup: OrganizationLookup,
candidate: OrganizationCandidate,
) -> Organization | None:
candidate_name = normalize_organization_name(candidate.name)
if not candidate_name:
return None
return lookup.by_normalized_name.get(candidate_name)
@classmethod
def _find_existing(
cls,
lookup: OrganizationLookup,
candidate: OrganizationCandidate,
) -> Organization | None:
organization = cls._find_exact_identifier_match(lookup, candidate)
if organization is not None:
return organization
organization = cls._find_blank_kpp_match(lookup, candidate)
if organization is not None:
return organization
organization = cls._find_single_identifier_match(lookup, candidate)
if organization is not None:
return organization
if candidate.kpp and (candidate.inn or candidate.ogrn):
return None
return cls._find_name_match(lookup, candidate)
@classmethod
def _update_existing(
cls,
organization: Organization,
candidate: OrganizationCandidate,
) -> bool:
if not cls._assign_existing_fields(organization, candidate):
return False
organization.save(update_fields=["name", "inn", "kpp", "ogrn", "ogrip"])
return True
@classmethod
def _assign_existing_fields(
cls,
organization: Organization,
candidate: OrganizationCandidate,
) -> bool:
changed = False
selected_name = cls._select_name(organization.name, candidate.name)
if selected_name != organization.name:
organization.name = selected_name
changed = True
for field_name in ("inn", "kpp", "ogrn", "ogrip"):
if getattr(organization, field_name):
continue
if field_name == "ogrip" and (organization.kpp or organization.ogrn):
continue
if field_name in {"kpp", "ogrn"} and organization.ogrip:
continue
candidate_value = getattr(candidate, field_name)
if candidate_value:
setattr(organization, field_name, candidate_value)
changed = True
return changed
@staticmethod
def _select_name(current: str, candidate: str) -> str:
current_clean = current.strip()
candidate_clean = candidate.strip()
if not candidate_clean:
return current_clean
if not current_clean:
return candidate_clean
current_is_abbreviated = current_clean.lower().startswith(_ABBREVIATED_PREFIXES)
candidate_is_abbreviated = candidate_clean.lower().startswith(
_ABBREVIATED_PREFIXES
)
if current_is_abbreviated and not candidate_is_abbreviated:
return candidate_clean
if len(candidate_clean) > len(current_clean) and not candidate_is_abbreviated:
return candidate_clean
return current_clean