perf(organizations): speed up filtered API lists
All checks were successful
CI/CD Pipeline / Quality Gate (push) Successful in 28s
CI/CD Pipeline / Build and Push Images (push) Successful in 10s
CI/CD Pipeline / Internal Notify (push) Successful in 0s
CI/CD Pipeline / Deploy Dev in Dokploy (push) Successful in 1s

This commit is contained in:
2026-05-14 17:08:03 +02:00
parent df89e498cc
commit 19a7d5a91c
10 changed files with 360 additions and 62 deletions

View File

@@ -21,6 +21,7 @@ from apps.parsers.models import (
from django.db.models import Count, Prefetch, Q
from registers.models import RegistryMembershipPeriod
from organizations.data_sources import to_api_data_source, to_internal_data_source
from organizations.models import Organization
GENERIC_SOURCES = (
@@ -45,12 +46,7 @@ DATA_PRESENCE_KEYS = (
ParserLoadLog.Source.FNS_REPORTS,
)
DATA_PRESENCE_KEY_SET = {str(source) for source in DATA_PRESENCE_KEYS}
API_DATA_SOURCE_ALIASES = {
ParserLoadLog.Source.TRUDVSEM: "vacancies",
}
API_DATA_SOURCE_KEY_SET = {
API_DATA_SOURCE_ALIASES.get(source, str(source)) for source in DATA_PRESENCE_KEYS
}
API_DATA_SOURCE_KEY_SET = {to_api_data_source(source) for source in DATA_PRESENCE_KEYS}
@dataclass(frozen=True)
@@ -98,19 +94,6 @@ def data_presence_identity_values(source: str) -> tuple[set[str], set[str]]:
return matches["inn"], matches["ogrn"]
def to_api_data_source(source: str) -> str:
"""Return v2 public data source key for an internal parser source."""
return API_DATA_SOURCE_ALIASES.get(source, str(source))
def to_internal_data_source(source: str) -> str:
"""Return internal parser source key from a v2 public key."""
for internal_source, api_source in API_DATA_SOURCE_ALIASES.items():
if source == api_source:
return str(internal_source)
return source
def _source_matches(source: str) -> dict[str, set[str]]:
if source == ParserLoadLog.Source.INDUSTRIAL:
return OrganizationApiEnrichmentService._matching_identifiers_for_all(

View File

@@ -0,0 +1,48 @@
"""Helpers for organization API data source keys and summaries."""
from __future__ import annotations
from typing import Any
API_DATA_SOURCE_ALIASES = {
"trudvsem": "vacancies",
}
def to_api_data_source(source: str) -> str:
"""Return v2 public data source key for an internal parser source."""
return API_DATA_SOURCE_ALIASES.get(str(source), str(source))
def to_internal_data_source(source: str) -> str:
"""Return internal parser source key from a v2 public key."""
for internal_source, api_source in API_DATA_SOURCE_ALIASES.items():
if source == api_source:
return internal_source
return source
def snapshot_data_with_api_keys(data: dict[str, Any]) -> dict[str, Any]:
"""Return snapshot data keyed by public API source names."""
return {to_api_data_source(source): value for source, value in data.items()}
def data_source_summary(data: dict[str, Any]) -> list[dict[str, int | str]]:
"""Return non-empty source counters for a data payload."""
summary: list[dict[str, int | str]] = []
for source in sorted(data):
value = data[source]
if isinstance(value, list):
count = len(value)
elif value:
count = 1
else:
count = 0
if count:
summary.append({"source": source, "count": count})
return summary
def snapshot_data_source_summary(data: dict[str, Any]) -> list[dict[str, int | str]]:
"""Return non-empty public source counters for stored snapshot data."""
return data_source_summary(snapshot_data_with_api_keys(data))

View File

@@ -1,6 +1,7 @@
"""Filters for organizations API v2."""
from django.db.models import CharField, Exists, OuterRef, Q
from apps.parsers.models import FinancialReport, ParserLoadLog
from django.db.models import CharField, Q
from django.db.models.functions import Cast
from django_filters import rest_framework as filters
from registers.models import RegistryMembershipPeriod
@@ -84,6 +85,9 @@ class OrganizationFilter(filters.FilterSet):
if source not in DATA_PRESENCE_KEYS:
return queryset.none()
if source == ParserLoadLog.Source.FNS_REPORTS:
return self._filter_by_fns_report_presence(queryset, value)
inn_values, ogrn_values = data_presence_identity_values(source)
filtered = self._filter_by_registry_identities(
queryset, inn_values, ogrn_values
@@ -106,6 +110,17 @@ class OrganizationFilter(filters.FilterSet):
query |= Q(ogrn__in=ogrn_values) | Q(ogrip__in=ogrn_values)
return queryset.filter(query)
@staticmethod
def _filter_by_fns_report_presence(queryset, value):
report_ogrns = FinancialReport.objects.order_by().values_list(
"ogrn",
flat=True,
)
query = Q(ogrn__in=report_ogrns) | Q(ogrip__in=report_ogrns)
if value:
return queryset.filter(query)
return queryset.exclude(query)
@classmethod
def _filter_by_registry_membership(
cls,
@@ -115,23 +130,44 @@ class OrganizationFilter(filters.FilterSet):
registry_name: str | None = None,
has_registry: bool = True,
):
membership = cls._registry_membership_subquery(
query = cls._registry_identity_query(
registry_id=registry_id,
registry_name=registry_name,
)
return queryset.annotate(_has_registry=Exists(membership)).filter(
_has_registry=has_registry
if has_registry:
return queryset.filter(query)
return queryset.exclude(query)
@classmethod
def _registry_identity_query(
cls,
*,
registry_id: str | None = None,
registry_name: str | None = None,
):
inn_values, ogrn_values = cls._registry_identity_value_querysets(
registry_id=registry_id,
registry_name=registry_name,
)
return (
Q(inn__in=inn_values) | Q(ogrn__in=ogrn_values) | Q(ogrip__in=ogrn_values)
)
@staticmethod
def _registry_membership_subquery(
def _registry_identity_value_querysets(
*,
registry_id: str | None = None,
registry_name: str | None = None,
):
membership = RegistryMembershipPeriod.objects.filter(
ended_at__isnull=True,
).annotate(
).order_by()
if registry_id:
membership = membership.filter(registry_id=registry_id)
if registry_name:
membership = membership.filter(registry__name__icontains=registry_name)
membership = membership.annotate(
organization_inn_text=Cast(
"organization__mn_inn", output_field=CharField()
),
@@ -139,13 +175,8 @@ class OrganizationFilter(filters.FilterSet):
"organization__mn_ogrn", output_field=CharField()
),
)
if registry_id:
membership = membership.filter(registry_id=registry_id)
if registry_name:
membership = membership.filter(registry__name__icontains=registry_name)
return membership.filter(
Q(organization_inn_text=OuterRef("inn"))
| Q(organization_ogrn_text=OuterRef("ogrn"))
| Q(organization_ogrn_text=OuterRef("ogrip"))
return (
membership.values_list("organization_inn_text", flat=True),
membership.values_list("organization_ogrn_text", flat=True),
)

View File

@@ -0,0 +1,119 @@
from django.db import migrations, models
API_DATA_SOURCE_ALIASES = {
"trudvsem": "vacancies",
}
def to_api_data_source(source):
return API_DATA_SOURCE_ALIASES.get(str(source), str(source))
def data_source_summary(data):
summary = []
for source in sorted(data):
value = data[source]
if isinstance(value, list):
count = len(value)
elif value:
count = 1
else:
count = 0
if count:
summary.append({"source": to_api_data_source(source), "count": count})
return summary
def backfill_data_source_counts_python(apps):
snapshot_model = apps.get_model("organizations", "OrganizationDataSnapshot")
updates = []
for snapshot in snapshot_model.objects.only(
"organization_id",
"data",
"data_source_counts",
).iterator(chunk_size=100):
snapshot.data_source_counts = data_source_summary(snapshot.data)
updates.append(snapshot)
if len(updates) >= 100:
snapshot_model.objects.bulk_update(updates, ["data_source_counts"])
updates = []
if updates:
snapshot_model.objects.bulk_update(updates, ["data_source_counts"])
def backfill_data_source_counts(apps, schema_editor):
if schema_editor.connection.vendor != "postgresql":
backfill_data_source_counts_python(apps)
return
with schema_editor.connection.cursor() as cursor:
cursor.execute(
"""
UPDATE organizations_data_snapshot snapshot
SET data_source_counts = COALESCE(
(
SELECT jsonb_agg(
jsonb_build_object(
'source',
source_counts.source,
'count',
source_counts.record_count
)
ORDER BY source_counts.source
)
FROM (
SELECT CASE source_items.key
WHEN 'trudvsem' THEN 'vacancies'
ELSE source_items.key
END AS source,
CASE
WHEN jsonb_typeof(source_items.value) = 'array'
THEN jsonb_array_length(source_items.value)
WHEN source_items.value IN (
'null'::jsonb,
'false'::jsonb,
'[]'::jsonb,
'{}'::jsonb,
'""'::jsonb
)
THEN 0
ELSE 1
END AS record_count
FROM jsonb_each(snapshot.data) AS source_items
) AS source_counts
WHERE source_counts.record_count > 0
),
'[]'::jsonb
)
"""
)
def clear_data_source_counts(apps, schema_editor):
snapshot_model = apps.get_model("organizations", "OrganizationDataSnapshot")
snapshot_model.objects.update(data_source_counts=[])
class Migration(migrations.Migration):
dependencies = [
("organizations", "0004_seed_daily_snapshot_refresh_schedule"),
]
operations = [
migrations.AddField(
model_name="organizationdatasnapshot",
name="data_source_counts",
field=models.JSONField(
default=list,
help_text="Готовый JSON data_sources для API v2",
verbose_name="счетчики источников",
),
),
migrations.RunPython(
backfill_data_source_counts,
reverse_code=clear_data_source_counts,
),
]

View File

@@ -6,6 +6,7 @@ from django.db import models
from django.db.models import Q
from django.utils.translation import gettext_lazy as _
from organizations.data_sources import snapshot_data_source_summary
from organizations.name_normalization import normalize_organization_name
@@ -116,6 +117,11 @@ class OrganizationDataSnapshot(models.Model):
default=list,
help_text=_("Готовый JSON registries для API v2"),
)
data_source_counts = models.JSONField(
_("счетчики источников"),
default=list,
help_text=_("Готовый JSON data_sources для API v2"),
)
updated_at = models.DateTimeField(
_("дата обновления"),
auto_now=True,
@@ -129,3 +135,14 @@ class OrganizationDataSnapshot(models.Model):
def __str__(self) -> str:
return f"Snapshot for {self.organization_id}"
def save(self, *args, **kwargs) -> None:
update_fields = kwargs.get("update_fields")
if update_fields is None or "data" in update_fields:
self.data_source_counts = snapshot_data_source_summary(self.data)
if update_fields is not None:
kwargs["update_fields"] = list(
dict.fromkeys([*update_fields, "data_source_counts"])
)
super().save(*args, **kwargs)

View File

@@ -4,7 +4,10 @@ from typing import Any
from rest_framework import serializers
from organizations.api_enrichment import to_api_data_source
from organizations.data_sources import (
data_source_summary,
snapshot_data_with_api_keys,
)
from organizations.models import Organization
@@ -36,8 +39,11 @@ class OrganizationSerializer(serializers.ModelSerializer):
def get_data(self, obj) -> dict[str, Any]:
snapshot = getattr(obj, "data_snapshot", None)
if snapshot is not None:
data = _snapshot_data_with_api_keys(snapshot.data)
data_sources = self.context.get("data_sources")
if data_sources is not None and not data_sources:
return {}
data = snapshot_data_with_api_keys(snapshot.data)
if data_sources is None:
return data
return {
@@ -61,13 +67,17 @@ class OrganizationSerializer(serializers.ModelSerializer):
def get_data_sources(self, obj) -> list[dict[str, int | str]]:
snapshot = getattr(obj, "data_snapshot", None)
if snapshot is not None:
data = _snapshot_data_with_api_keys(snapshot.data)
return _data_source_summary(data)
data_source_counts = getattr(snapshot, "data_source_counts", None)
if data_source_counts:
return data_source_counts
if "data" in snapshot.get_deferred_fields():
return []
return data_source_summary(snapshot_data_with_api_keys(snapshot.data))
enrichment = self.context.get("enrichment", {}).get(str(obj.uid))
if enrichment is None:
return []
return _data_source_summary(enrichment.data_presence)
return data_source_summary(enrichment.data_presence)
def get_registries(self, obj) -> list[dict[str, str]]:
snapshot = getattr(obj, "data_snapshot", None)
@@ -84,22 +94,3 @@ class OrganizationSerializer(serializers.ModelSerializer):
}
for registry in enrichment.registries
]
def _snapshot_data_with_api_keys(data: dict[str, Any]) -> dict[str, Any]:
return {to_api_data_source(source): value for source, value in data.items()}
def _data_source_summary(data: dict[str, Any]) -> list[dict[str, int | str]]:
summary: list[dict[str, int | str]] = []
for source in sorted(data):
value = data[source]
if isinstance(value, list):
count = len(value)
elif value:
count = 1
else:
count = 0
if count:
summary.append({"source": source, "count": count})
return summary

View File

@@ -23,6 +23,7 @@ from django.utils import timezone
from registers.models import Organization as RegisterOrganization
from organizations.api_enrichment import OrganizationApiEnrichmentService
from organizations.data_sources import data_source_summary
from organizations.models import Organization, OrganizationDataSnapshot
_QUOTE_CHARS = "\"'«»„“”"
@@ -132,6 +133,7 @@ class OrganizationDataSnapshotRefreshService:
processed += 1
item = enrichment[str(organization.uid)]
data = item.data_presence
data_source_counts = data_source_summary(data)
registries = [
{
"id": registry.id,
@@ -146,12 +148,14 @@ class OrganizationDataSnapshotRefreshService:
OrganizationDataSnapshot(
organization=organization,
data=data,
data_source_counts=data_source_counts,
registries=registries,
)
)
continue
snapshot.data = data
snapshot.data_source_counts = data_source_counts
snapshot.registries = registries
snapshot.updated_at = timezone.now()
update_instances.append(snapshot)
@@ -165,7 +169,7 @@ class OrganizationDataSnapshotRefreshService:
if update_instances:
OrganizationDataSnapshot.objects.bulk_update(
update_instances,
fields=["data", "registries", "updated_at"],
fields=["data", "data_source_counts", "registries", "updated_at"],
batch_size=batch_size,
)
updated += len(update_instances)

View File

@@ -285,9 +285,7 @@ class CachedReadOnlyMixin:
class OrganizationViewSet(CachedReadOnlyMixin, ReadOnlyModelViewSet):
"""Read-only API for canonical organizations."""
queryset = Organization.objects.select_related("data_snapshot").order_by(
"name", "uid"
)
queryset = Organization.objects.order_by("name", "uid")
serializer_class = OrganizationSerializer
permission_classes = [IsAuthenticated]
lookup_field = "uid"
@@ -307,7 +305,10 @@ class OrganizationViewSet(CachedReadOnlyMixin, ReadOnlyModelViewSet):
return super().get_permissions()
def get_queryset(self):
queryset = super().get_queryset()
queryset = super().get_queryset().select_related("data_snapshot")
if self._should_defer_snapshot_data():
queryset = queryset.defer("data_snapshot__data")
if self.action != "list" or "has_registry" in self.request.query_params:
return queryset
@@ -320,6 +321,20 @@ class OrganizationViewSet(CachedReadOnlyMixin, ReadOnlyModelViewSet):
return filterset.qs
return queryset
def _should_defer_snapshot_data(self) -> bool:
if getattr(self, "action", None) != "list":
return False
return not any(
name in self.request.query_params
for name in (
"data",
"data_sources",
"exclude_data",
"exclude_data_sources",
)
)
@swagger_auto_schema(
tags=[ORGANIZATIONS_TAG],
operation_id="v2_organizations_list",