Files
mostovik-backend/src/organizations/services.py
Aleksandr Meshchriakov 19a7d5a91c
All checks were successful
CI/CD Pipeline / Quality Gate (push) Successful in 28s
CI/CD Pipeline / Build and Push Images (push) Successful in 10s
CI/CD Pipeline / Internal Notify (push) Successful in 0s
CI/CD Pipeline / Deploy Dev in Dokploy (push) Successful in 1s
perf(organizations): speed up filtered API lists
2026-05-14 17:08:03 +02:00

717 lines
24 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Services for building the canonical organizations directory."""
from __future__ import annotations
import re
from collections.abc import Iterable
from dataclasses import dataclass
from apps.parsers.models import (
VACANCY_RECORD_SOURCES,
FinancialReport,
GenericParserRecord,
IndustrialCertificateRecord,
IndustrialProductRecord,
InspectionRecord,
ManufacturerRecord,
ParserLoadLog,
ProcurementRecord,
)
from django.db import transaction
from django.db.models import Q
from django.utils import timezone
from registers.models import Organization as RegisterOrganization
from organizations.api_enrichment import OrganizationApiEnrichmentService
from organizations.data_sources import data_source_summary
from organizations.models import Organization, OrganizationDataSnapshot
# Quote characters (ASCII and typographic) that name normalization replaces
# with spaces.
_QUOTE_CHARS = "\"'«»„“”"

# Regexes for legal-form wording removed from organization names before
# matching. Each Cyrillic abbreviation is followed by its Latin look-alike
# spelling.
#
# Order matters when the patterns are applied one after another: the qualified
# "публичное/закрытое акционерное общество" forms must come BEFORE the bare
# "акционерное общество" form, otherwise the short pattern eats the tail of the
# long one and leaves a stray "публичное"/"закрытое" in the normalized name.
#
# Every pattern is anchored with ``\b`` on both sides so that only whole words
# are removed (an unanchored "ао" would also strip the ending of "какао").
_LEGAL_FORM_PATTERNS = (
    r"\bобщество\s+с\s+ограниченной\s+ответственностью\b",
    r"\bооо\b",
    r"\booo\b",
    r"\bпубличное\s+акционерное\s+общество\b",
    r"\bпао\b",
    r"\bpao\b",
    r"\bзакрытое\s+акционерное\s+общество\b",
    r"\bзао\b",
    r"\bzao\b",
    r"\bакционерное\s+общество\b",
    r"\bао\b",
    r"\bao\b",
    r"\bиндивидуальный\s+предприниматель\b",
    r"\bип\b",
)

# Lower-case prefixes (abbreviation plus trailing space) that mark a name as
# using the abbreviated legal form; used when choosing between two spellings
# of the same organization name.
_ABBREVIATED_PREFIXES = (
    "ооо ",
    "ooo ",
    "ао ",
    "ao ",
    "пао ",
    "pao ",
    "зао ",
    "zao ",
    "ип ",
)
@dataclass(frozen=True)
class OrganizationCandidate:
    """Immutable organization identity read from one row of a source table.

    Only ``name`` is required; every identifier defaults to an empty string,
    which the matching code treats as "not provided".
    """

    # Organization name as found in the source row.
    name: str
    # Taxpayer number (INN); empty when absent.
    inn: str = ""
    # KPP code; empty when absent.
    kpp: str = ""
    # OGRN registration number; empty when absent.
    ogrn: str = ""
    # Registration number kept in the separate ``ogrip`` field; empty when absent.
    ogrip: str = ""
@dataclass(frozen=True)
class PopulateOrganizationsResult:
    """Immutable counters reported by one organization population run."""

    # Candidates read from the source tables.
    scanned: int
    # Organizations newly created.
    created: int
    # Times an already known organization had fields changed.
    updated: int
    # Candidates ignored because their name normalized to nothing.
    skipped: int
@dataclass(frozen=True)
class RefreshOrganizationDataSnapshotsResult:
    """Immutable counters reported by one snapshot refresh run."""

    # Organizations visited.
    processed: int
    # Snapshots inserted for organizations that had none.
    created: int
    # Existing snapshots rewritten.
    updated: int
@dataclass
class OrganizationLookup:
    """Mutable in-memory indexes used to match candidates to organizations.

    The composite and single-valued indexes keep one organization per key;
    ``by_inn`` and ``by_ogrn`` keep every organization sharing the identifier.
    """

    # (INN, KPP) -> organization.
    by_inn_kpp: dict[tuple[str, str], Organization]
    # (OGRN, KPP) -> organization.
    by_ogrn_kpp: dict[tuple[str, str], Organization]
    # INN -> all organizations carrying that INN.
    by_inn: dict[str, list[Organization]]
    # OGRN -> all organizations carrying that OGRN.
    by_ogrn: dict[str, list[Organization]]
    # ``ogrip`` value -> organization.
    by_ogrip: dict[str, Organization]
    # Normalized name -> organization.
    by_normalized_name: dict[str, Organization]
class OrganizationDataSnapshotRefreshService:
    """Refreshes precomputed v2 data JSON for canonical organizations.

    Snapshot content comes from ``OrganizationApiEnrichmentService`` and is
    written with bulk inserts/updates, one batch of organizations at a time.
    """

    # Length bounds used when normalizing parser identifiers. They mirror the
    # bounds ``OrganizationPopulationService._candidate`` applies when it
    # writes canonical organizations.
    _INN_MAX_LENGTH = 12
    _OGRN_MAX_LENGTH = 15

    @classmethod
    def refresh(
        cls,
        *,
        organization_uids: Iterable[str] | None = None,
        batch_size: int = 100,
    ) -> RefreshOrganizationDataSnapshotsResult:
        """Create or rewrite the data snapshot of each selected organization.

        Args:
            organization_uids: Restrict the refresh to these organization uids.
                ``None`` refreshes every organization.
            batch_size: Number of organizations handled per batch; also used
                as the batch size of the bulk insert/update statements.

        Returns:
            Counters of processed organizations and created/updated snapshots.
        """
        queryset = Organization.objects.all().order_by("uid")
        if organization_uids is not None:
            queryset = queryset.filter(uid__in=list(organization_uids))

        processed = 0
        created = 0
        updated = 0
        for organizations in cls._iter_batches(queryset, batch_size):
            enrichment = OrganizationApiEnrichmentService.build_for(organizations)
            existing_snapshots = {
                str(snapshot.organization_id): snapshot
                for snapshot in OrganizationDataSnapshot.objects.filter(
                    organization_id__in=[
                        organization.uid for organization in organizations
                    ]
                )
            }
            create_instances: list[OrganizationDataSnapshot] = []
            update_instances: list[OrganizationDataSnapshot] = []
            for organization in organizations:
                processed += 1
                uid = str(organization.uid)
                # The enrichment result is indexed by the stringified uid.
                item = enrichment[uid]
                data = item.data_presence
                data_source_counts = data_source_summary(data)
                registries = [
                    {"id": registry.id, "name": registry.name}
                    for registry in item.registries
                ]
                snapshot = existing_snapshots.get(uid)
                if snapshot is None:
                    create_instances.append(
                        OrganizationDataSnapshot(
                            organization=organization,
                            data=data,
                            data_source_counts=data_source_counts,
                            registries=registries,
                        )
                    )
                    continue
                snapshot.data = data
                snapshot.data_source_counts = data_source_counts
                snapshot.registries = registries
                # Set explicitly: the timestamp is part of the bulk_update
                # field list below.
                snapshot.updated_at = timezone.now()
                update_instances.append(snapshot)

            if create_instances:
                OrganizationDataSnapshot.objects.bulk_create(
                    create_instances,
                    batch_size=batch_size,
                )
                created += len(create_instances)
            if update_instances:
                OrganizationDataSnapshot.objects.bulk_update(
                    update_instances,
                    fields=["data", "data_source_counts", "registries", "updated_at"],
                    batch_size=batch_size,
                )
                updated += len(update_instances)

        return RefreshOrganizationDataSnapshotsResult(
            processed=processed,
            created=created,
            updated=updated,
        )

    @classmethod
    def refresh_for_parser_batch(
        cls,
        *,
        source: str,
        batch_id: int,
        batch_size: int = 100,
    ) -> RefreshOrganizationDataSnapshotsResult:
        """Refresh snapshots of the organizations referenced by a parser batch.

        Args:
            source: Parser source identifier (a ``ParserLoadLog.Source`` value).
            batch_id: Load batch whose records select the organizations.
            batch_size: Forwarded to :meth:`refresh`.

        Returns:
            The counters produced by :meth:`refresh`.
        """
        organization_uids = cls.organization_uids_for_parser_batch(
            source=source,
            batch_id=batch_id,
        )
        return cls.refresh(
            organization_uids=organization_uids,
            batch_size=batch_size,
        )

    @classmethod
    def organization_uids_for_parser_batch(
        cls,
        *,
        source: str,
        batch_id: int,
    ) -> list[str]:
        """Return uids of organizations whose identifiers occur in the batch.

        INN values are matched against ``Organization.inn``; OGRN values are
        matched against both ``Organization.ogrn`` and ``Organization.ogrip``.

        Parser rows may carry identifiers with formatting characters, while
        this module only writes canonical organizations with identifiers in
        their ``normalize_identifier`` form. Each batch value is therefore
        looked up both as stored and in normalized form; the raw value is
        kept, so everything that matched before still matches.

        Returns:
            Stringified uids, or an empty list when the batch yields no
            identifiers (including unsupported sources).
        """
        raw_inn_values, raw_ogrn_values = cls._parser_batch_identities(
            source=source,
            batch_id=batch_id,
        )
        inn_values = cls._with_normalized_identifiers(
            raw_inn_values, max_length=cls._INN_MAX_LENGTH
        )
        ogrn_values = cls._with_normalized_identifiers(
            raw_ogrn_values, max_length=cls._OGRN_MAX_LENGTH
        )
        if not inn_values and not ogrn_values:
            return []

        query = Q()
        if inn_values:
            query |= Q(inn__in=inn_values)
        if ogrn_values:
            query |= Q(ogrn__in=ogrn_values) | Q(ogrip__in=ogrn_values)
        return [
            str(uid)
            for uid in Organization.objects.filter(query).values_list("uid", flat=True)
        ]

    @staticmethod
    def _with_normalized_identifiers(values: Iterable, *, max_length: int) -> set:
        """Return non-empty ``values`` plus their normalized digit-only forms."""
        expanded = set()
        for value in values:
            if not value:
                # Blank / missing identifiers cannot match any organization.
                continue
            expanded.add(value)
            normalized = normalize_identifier(value, max_length=max_length)
            if normalized:
                expanded.add(normalized)
        return expanded

    @staticmethod
    def _parser_batch_identities(
        *,
        source: str,
        batch_id: int,
    ) -> tuple[set[str], set[str]]:
        """Collect the distinct INN and OGRN values stored for a parser batch.

        Returns:
            ``(inn_values, ogrn_values)``; two empty sets for a source that is
            not handled here.
        """
        known = ParserLoadLog.Source

        # Sources with a dedicated record table:
        # (source, model, INN column, OGRN column).
        dedicated_tables = (
            (known.INDUSTRIAL, IndustrialCertificateRecord, "inn", "ogrn"),
            (known.INDUSTRIAL_PRODUCTS, IndustrialProductRecord, "inn", "ogrn"),
            (known.MANUFACTURES, ManufacturerRecord, "inn", "ogrn"),
            (known.INSPECTIONS, InspectionRecord, "inn", "ogrn"),
            (known.PROCUREMENTS, ProcurementRecord, "customer_inn", "customer_ogrn"),
        )
        for table_source, model, inn_field, ogrn_field in dedicated_tables:
            if source == table_source:
                return _identity_values(
                    model.objects.filter(load_batch=batch_id),
                    inn_field=inn_field,
                    ogrn_field=ogrn_field,
                )

        if source == known.FNS_REPORTS:
            # Only the OGRN column is read for financial reports.
            return (
                set(),
                set(
                    FinancialReport.objects.filter(load_batch=batch_id)
                    .exclude(ogrn="")
                    .values_list("ogrn", flat=True)
                    .distinct()
                ),
            )

        if source in {
            known.PROCUREMENTS_44FZ,
            known.PROCUREMENTS_223FZ,
            known.CONTRACTS,
            known.UNFAIR_SUPPLIERS,
            known.FAS_GOZ,
            known.ARBITRATION,
            known.FEDRESURS_BANKRUPTCY,
            known.FSTEC,
            known.TRUDVSEM,
        }:
            # These sources share the generic record table; TRUDVSEM expands
            # to the whole VACANCY_RECORD_SOURCES collection.
            sources = (
                VACANCY_RECORD_SOURCES if source == known.TRUDVSEM else (source,)
            )
            return _identity_values(
                GenericParserRecord.objects.filter(
                    source__in=sources,
                    load_batch=batch_id,
                ),
                inn_field="inn",
                ogrn_field="ogrn",
            )

        return set(), set()

    @staticmethod
    def _iter_batches(queryset, batch_size: int) -> Iterable[list[Organization]]:
        """Yield the queryset rows as lists of at most ``batch_size`` items.

        Rows are read through ``queryset.iterator(chunk_size=batch_size)``;
        the final list may be shorter and no empty list is yielded.
        """
        batch: list[Organization] = []
        for organization in queryset.iterator(chunk_size=batch_size):
            batch.append(organization)
            if len(batch) >= batch_size:
                yield batch
                batch = []
        if batch:
            yield batch
def _identity_values(
    queryset, *, inn_field: str, ogrn_field: str
) -> tuple[set[str], set[str]]:
    """Return the distinct non-blank INN and OGRN values of ``queryset``.

    Args:
        queryset: Record queryset to read from.
        inn_field: Name of the column holding the INN.
        ogrn_field: Name of the column holding the OGRN.

    Returns:
        ``(inn_values, ogrn_values)``; rows whose column equals the empty
        string are excluded from the respective set.
    """

    def distinct_non_blank(field_name: str) -> set[str]:
        # One query per column: drop "" rows, then collect distinct values.
        column = queryset.exclude(**{field_name: ""}).values_list(
            field_name, flat=True
        )
        return set(column.distinct())

    collected_inn = distinct_non_blank(inn_field)
    collected_ogrn = distinct_non_blank(ogrn_field)
    return collected_inn, collected_ogrn
def normalize_identifier(value: str | int | None, *, max_length: int) -> str:
    """Return digits-only identifier bounded by the target field length.

    All non-digit characters are removed. The result is always made of ASCII
    digits ``0-9``: other Unicode decimal digits (for example full-width
    forms) are converted to their ASCII value instead of being passed through.

    Args:
        value: Raw identifier. ``None`` yields ``""``. An integral ``float``
            (e.g. a JSON number ``7701234567.0``) is treated as the integer it
            represents; a non-integral ``float`` yields ``""``.
        max_length: Maximum number of digits accepted.

    Returns:
        The digit string, or ``""`` when no digits remain or there are more
        than ``max_length`` of them.
    """
    if value is None:
        return ""
    if isinstance(value, float):
        # str(7701234567.0) is "7701234567.0", which would gain a bogus
        # trailing zero once the dot is stripped.
        if not value.is_integer():
            return ""
        value = int(value)
    normalized = re.sub(r"\D+", "", str(value))
    if not normalized.isascii():
        # For str patterns "\d" matches every Unicode decimal digit; int()
        # accepts each of them, so map them to plain ASCII digits.
        normalized = "".join(str(int(char)) for char in normalized)
    if not normalized or len(normalized) > max_length:
        return ""
    return normalized
def normalize_organization_name(value: str | None) -> str:
    """Normalize organization names for matching spelling variants.

    Steps: trim and lower-case, fold "ё" into "е", turn quotes, punctuation
    and hyphens into spaces, collapse whitespace, then remove the legal-form
    wording listed in ``_LEGAL_FORM_PATTERNS``.

    The legal-form patterns are applied as one alternation in a single pass.
    The regex engine takes the leftmost match, so a long form such as
    "публичное акционерное общество" is removed whole even though the shorter
    "акционерное общество" pattern is also in the list; the result does not
    depend on the order of the patterns in the tuple.

    Args:
        value: Raw organization name, or ``None``.

    Returns:
        The normalized name; ``""`` for ``None`` or when nothing is left.
    """
    if value is None:
        return ""
    normalized = str(value).strip().lower().replace("ё", "е")
    normalized = normalized.translate(
        str.maketrans({char: " " for char in _QUOTE_CHARS})
    )
    normalized = re.sub(r"[^\w\s-]+", " ", normalized, flags=re.UNICODE)
    normalized = normalized.replace("-", " ")
    normalized = re.sub(r"\s+", " ", normalized).strip()
    if _LEGAL_FORM_PATTERNS:
        # Guarded: an empty alternation would match at every position.
        legal_forms = "|".join(f"(?:{pattern})" for pattern in _LEGAL_FORM_PATTERNS)
        normalized = re.sub(legal_forms, " ", normalized, flags=re.IGNORECASE)
    return re.sub(r"\s+", " ", normalized).strip()
class OrganizationPopulationService:
    """Builds organizations from currently available source tables.

    Candidates read from the source tables are matched against the existing
    organizations through in-memory indexes; unmatched candidates become new
    organizations and matched ones may fill in blank fields of the match.
    """

    @classmethod
    def populate(cls) -> PopulateOrganizationsResult:
        """Scan every source table and create/update canonical organizations.

        All work happens inside one transaction. Inserts and updates are
        collected in memory and written with bulk statements after the scan.

        Returns:
            Counters: ``scanned`` candidates, ``created`` organizations,
            ``updated`` (number of times a candidate changed a matched
            organization) and ``skipped`` candidates whose name normalizes to
            an empty string.
        """
        scanned = 0
        created = 0
        updated = 0
        skipped = 0
        with transaction.atomic():
            lookup = cls._build_lookup(list(Organization.objects.all()))
            create_instances: list[Organization] = []
            # id() of every instance waiting in ``create_instances``. The
            # instances stay referenced by that list, so the ids are stable.
            pending_create_ids: set[int] = set()
            update_instances_by_uid: dict[str, Organization] = {}

            for candidate in cls.iter_candidates():
                scanned += 1
                if not normalize_organization_name(candidate.name):
                    skipped += 1
                    continue

                organization = cls._find_existing(lookup, candidate)
                if organization is None:
                    organization = Organization(
                        name=candidate.name.strip(),
                        inn=candidate.inn,
                        kpp=candidate.kpp,
                        ogrn=candidate.ogrn,
                        ogrip=candidate.ogrip,
                    )
                    create_instances.append(organization)
                    pending_create_ids.add(id(organization))
                    cls._index_organization(lookup, organization)
                    created += 1
                    continue

                if cls._assign_existing_fields(organization, candidate):
                    # Re-index so newly filled identifiers become searchable.
                    cls._index_organization(lookup, organization)
                    # An organization created earlier in this run is inserted
                    # below with its final field values, so it needs no
                    # additional UPDATE.
                    if id(organization) not in pending_create_ids:
                        update_instances_by_uid[str(organization.uid)] = organization
                    updated += 1

            if create_instances:
                Organization.objects.bulk_create(create_instances, batch_size=1000)
            update_instances = list(update_instances_by_uid.values())
            if update_instances:
                Organization.objects.bulk_update(
                    update_instances,
                    fields=["name", "inn", "kpp", "ogrn", "ogrip"],
                    batch_size=1000,
                )

        return PopulateOrganizationsResult(
            scanned=scanned,
            created=created,
            updated=updated,
            skipped=skipped,
        )

    @classmethod
    def iter_candidates(cls) -> Iterable[OrganizationCandidate]:
        """Yield organization candidates from all current source tables."""
        for row in RegisterOrganization.objects.iterator():
            yield cls._candidate(
                name=row.pn_name,
                inn=row.mn_inn,
                kpp=row.in_kpp,
                ogrn=row.mn_ogrn,
            )
        for row in IndustrialCertificateRecord.objects.iterator():
            yield cls._candidate(
                name=row.organisation_name,
                inn=row.inn,
                ogrn=row.ogrn,
            )
        for row in ManufacturerRecord.objects.iterator():
            yield cls._candidate(
                name=row.full_legal_name,
                inn=row.inn,
                ogrn=row.ogrn,
            )
        for row in IndustrialProductRecord.objects.iterator():
            yield cls._candidate(
                name=row.full_organisation_name,
                inn=row.inn,
                ogrn=row.ogrn,
            )
        for row in GenericParserRecord.objects.iterator():
            # Generic records fall back to the title when no organization name
            # is stored, and carry the KPP (if any) inside the payload.
            yield cls._candidate(
                name=row.organisation_name or row.title,
                inn=row.inn,
                kpp=cls._payload_kpp(row.payload),
                ogrn=row.ogrn,
            )
        for row in InspectionRecord.objects.iterator():
            yield cls._candidate(
                name=row.organisation_name,
                inn=row.inn,
                ogrn=row.ogrn,
            )
        for row in ProcurementRecord.objects.iterator():
            yield cls._candidate(
                name=row.customer_name,
                inn=row.customer_inn,
                kpp=row.customer_kpp,
                ogrn=row.customer_ogrn,
            )

    @staticmethod
    def _candidate(
        *,
        name: str | None,
        inn: str | int | None = None,
        kpp: str | int | None = None,
        ogrn: str | int | None = None,
    ) -> OrganizationCandidate:
        """Build a candidate with normalized identifiers from raw row values.

        The registration number is routed by length: 15 digits together with
        a 12-digit INN go to ``ogrip`` (and the KPP is dropped), 13 digits go
        to ``ogrn``, anything else is discarded.
        """
        normalized_inn = normalize_identifier(inn, max_length=12)
        normalized_ogrn = normalize_identifier(ogrn, max_length=15)
        ogrip = (
            normalized_ogrn
            if len(normalized_ogrn) == 15 and len(normalized_inn) == 12
            else ""
        )
        legal_ogrn = normalized_ogrn if len(normalized_ogrn) == 13 else ""
        return OrganizationCandidate(
            name=(name or "").strip(),
            inn=normalized_inn,
            kpp="" if ogrip else normalize_identifier(kpp, max_length=9),
            ogrn=legal_ogrn,
            ogrip=ogrip,
        )

    @classmethod
    def _payload_kpp(cls, payload: object) -> str:
        """Extract a KPP from a record payload, or ``""`` when none is found.

        ``payload["company"]["kpp"]`` is preferred; otherwise the payload is
        searched recursively for a key named "kpp" or "кпп".
        """
        if not isinstance(payload, dict):
            return ""
        company = payload.get("company")
        if isinstance(company, dict):
            company_kpp = normalize_identifier(company.get("kpp"), max_length=9)
            if company_kpp:
                return company_kpp
        return cls._find_payload_identifier(payload, {"kpp", "кпп"}, max_length=9)

    @classmethod
    def _find_payload_identifier(
        cls,
        value: object,
        keys: set[str],
        *,
        max_length: int,
    ) -> str:
        """Depth-first search of nested dicts/lists for an identifier.

        Args:
            value: Payload fragment to search.
            keys: Lower-case key names to look for; keys are compared after
                ``str()``, trimming and lower-casing.
            max_length: Bound forwarded to ``normalize_identifier``.

        Returns:
            The first value under a matching key that normalizes to a
            non-empty identifier, or ``""``.
        """
        if isinstance(value, dict):
            for key, item in value.items():
                if str(key).strip().lower() in keys:
                    identifier = normalize_identifier(item, max_length=max_length)
                    if identifier:
                        return identifier
                nested = cls._find_payload_identifier(item, keys, max_length=max_length)
                if nested:
                    return nested
        elif isinstance(value, list):
            for item in value:
                nested = cls._find_payload_identifier(item, keys, max_length=max_length)
                if nested:
                    return nested
        return ""

    @classmethod
    def _build_lookup(cls, organizations: list[Organization]) -> OrganizationLookup:
        """Create the lookup indexes and register every given organization."""
        lookup = OrganizationLookup(
            by_inn_kpp={},
            by_ogrn_kpp={},
            by_inn={},
            by_ogrn={},
            by_ogrip={},
            by_normalized_name={},
        )
        for organization in organizations:
            cls._index_organization(lookup, organization)
        return lookup

    @staticmethod
    def _index_organization(
        lookup: OrganizationLookup,
        organization: Organization,
    ) -> None:
        """Register ``organization`` under every key it currently provides.

        Single-valued indexes keep the first organization seen for a key
        (``setdefault``); the per-INN and per-OGRN lists never hold the same
        organization twice, so the method can be called again after an update.
        """
        if organization.inn and organization.kpp:
            lookup.by_inn_kpp.setdefault(
                (organization.inn, organization.kpp), organization
            )
        if organization.ogrn and organization.kpp:
            lookup.by_ogrn_kpp.setdefault(
                (organization.ogrn, organization.kpp), organization
            )
        if organization.inn:
            same_inn = lookup.by_inn.setdefault(organization.inn, [])
            if organization not in same_inn:
                same_inn.append(organization)
        if organization.ogrn:
            same_ogrn = lookup.by_ogrn.setdefault(organization.ogrn, [])
            if organization not in same_ogrn:
                same_ogrn.append(organization)
        if organization.ogrip:
            lookup.by_ogrip.setdefault(organization.ogrip, organization)
        normalized_name = normalize_organization_name(organization.name)
        if normalized_name:
            lookup.by_normalized_name.setdefault(normalized_name, organization)

    @staticmethod
    def _find_exact_identifier_match(
        lookup: OrganizationLookup,
        candidate: OrganizationCandidate,
    ) -> Organization | None:
        """Match on (INN, KPP), then (OGRN, KPP), then ``ogrip``."""
        if candidate.inn and candidate.kpp:
            organization = lookup.by_inn_kpp.get((candidate.inn, candidate.kpp))
            if organization is not None:
                return organization
        if candidate.ogrn and candidate.kpp:
            organization = lookup.by_ogrn_kpp.get((candidate.ogrn, candidate.kpp))
            if organization is not None:
                return organization
        if candidate.ogrip and candidate.ogrip in lookup.by_ogrip:
            return lookup.by_ogrip[candidate.ogrip]
        return None

    @staticmethod
    def _find_blank_kpp_match(
        lookup: OrganizationLookup,
        candidate: OrganizationCandidate,
    ) -> Organization | None:
        """Match a candidate that has a KPP to an organization that has none.

        Accepted only when exactly one organization with the candidate's INN
        (checked first) or OGRN has a blank KPP.
        """
        if candidate.inn and candidate.kpp:
            blank_kpp_matches = [
                organization
                for organization in lookup.by_inn.get(candidate.inn, [])
                if not organization.kpp
            ]
            if len(blank_kpp_matches) == 1:
                return blank_kpp_matches[0]
        if candidate.ogrn and candidate.kpp:
            blank_kpp_matches = [
                organization
                for organization in lookup.by_ogrn.get(candidate.ogrn, [])
                if not organization.kpp
            ]
            if len(blank_kpp_matches) == 1:
                return blank_kpp_matches[0]
        return None

    @staticmethod
    def _find_single_identifier_match(
        lookup: OrganizationLookup,
        candidate: OrganizationCandidate,
    ) -> Organization | None:
        """Match a candidate without KPP by INN, then OGRN.

        Accepted only when exactly one organization carries the identifier.
        """
        if candidate.inn and not candidate.kpp:
            organizations = lookup.by_inn.get(candidate.inn, [])
            if len(organizations) == 1:
                return organizations[0]
        if candidate.ogrn and not candidate.kpp:
            organizations = lookup.by_ogrn.get(candidate.ogrn, [])
            if len(organizations) == 1:
                return organizations[0]
        return None

    @staticmethod
    def _find_name_match(
        lookup: OrganizationLookup,
        candidate: OrganizationCandidate,
    ) -> Organization | None:
        """Match on the normalized organization name."""
        candidate_name = normalize_organization_name(candidate.name)
        if not candidate_name:
            return None
        return lookup.by_normalized_name.get(candidate_name)

    @classmethod
    def _find_existing(
        cls,
        lookup: OrganizationLookup,
        candidate: OrganizationCandidate,
    ) -> Organization | None:
        """Return the organization matching ``candidate``, or ``None``.

        Strategies are tried from most to least specific: exact identifiers,
        blank-KPP match, single-identifier match, normalized name. The name
        fallback is skipped for candidates that carry a KPP together with an
        INN or OGRN.
        """
        organization = cls._find_exact_identifier_match(lookup, candidate)
        if organization is not None:
            return organization
        organization = cls._find_blank_kpp_match(lookup, candidate)
        if organization is not None:
            return organization
        organization = cls._find_single_identifier_match(lookup, candidate)
        if organization is not None:
            return organization
        if candidate.kpp and (candidate.inn or candidate.ogrn):
            return None
        return cls._find_name_match(lookup, candidate)

    @classmethod
    def _update_existing(
        cls,
        organization: Organization,
        candidate: OrganizationCandidate,
    ) -> bool:
        """Apply ``candidate`` to ``organization`` and save it when changed.

        Returns:
            ``True`` when at least one field changed and the row was saved.
        """
        if not cls._assign_existing_fields(organization, candidate):
            return False
        organization.save(update_fields=["name", "inn", "kpp", "ogrn", "ogrip"])
        return True

    @classmethod
    def _assign_existing_fields(
        cls,
        organization: Organization,
        candidate: OrganizationCandidate,
    ) -> bool:
        """Copy better or missing values from ``candidate`` in memory only.

        The name is replaced when ``_select_name`` prefers the candidate's
        spelling. Identifiers are only filled in when blank, never
        overwritten. ``ogrip`` is kept exclusive with ``kpp``/``ogrn``: it is
        not set when either is present, and they are not set once ``ogrip``
        is.

        Returns:
            ``True`` when any attribute of ``organization`` was modified.
        """
        changed = False
        selected_name = cls._select_name(organization.name, candidate.name)
        if selected_name != organization.name:
            organization.name = selected_name
            changed = True
        for field_name in ("inn", "kpp", "ogrn", "ogrip"):
            if getattr(organization, field_name):
                continue
            if field_name == "ogrip" and (organization.kpp or organization.ogrn):
                continue
            if field_name in {"kpp", "ogrn"} and organization.ogrip:
                continue
            candidate_value = getattr(candidate, field_name)
            if candidate_value:
                setattr(organization, field_name, candidate_value)
                changed = True
        return changed

    @staticmethod
    def _select_name(current: str, candidate: str) -> str:
        """Choose the preferred spelling between two names.

        The candidate wins when the current name is blank, when the current
        name starts with an abbreviated legal form and the candidate does
        not, or when the candidate is longer and not abbreviated. Otherwise
        the trimmed current name is kept.
        """
        current_clean = current.strip()
        candidate_clean = candidate.strip()
        if not candidate_clean:
            return current_clean
        if not current_clean:
            return candidate_clean
        current_is_abbreviated = current_clean.lower().startswith(_ABBREVIATED_PREFIXES)
        candidate_is_abbreviated = candidate_clean.lower().startswith(
            _ABBREVIATED_PREFIXES
        )
        if current_is_abbreviated and not candidate_is_abbreviated:
            return candidate_clean
        if len(candidate_clean) > len(current_clean) and not candidate_is_abbreviated:
            return candidate_clean
        return current_clean