perf(organizations): speed up filtered API lists
This commit is contained in:
@@ -21,6 +21,7 @@ from apps.parsers.models import (
|
||||
from django.db.models import Count, Prefetch, Q
|
||||
from registers.models import RegistryMembershipPeriod
|
||||
|
||||
from organizations.data_sources import to_api_data_source, to_internal_data_source
|
||||
from organizations.models import Organization
|
||||
|
||||
GENERIC_SOURCES = (
|
||||
@@ -45,12 +46,7 @@ DATA_PRESENCE_KEYS = (
|
||||
ParserLoadLog.Source.FNS_REPORTS,
|
||||
)
|
||||
DATA_PRESENCE_KEY_SET = {str(source) for source in DATA_PRESENCE_KEYS}
|
||||
API_DATA_SOURCE_ALIASES = {
|
||||
ParserLoadLog.Source.TRUDVSEM: "vacancies",
|
||||
}
|
||||
API_DATA_SOURCE_KEY_SET = {
|
||||
API_DATA_SOURCE_ALIASES.get(source, str(source)) for source in DATA_PRESENCE_KEYS
|
||||
}
|
||||
API_DATA_SOURCE_KEY_SET = {to_api_data_source(source) for source in DATA_PRESENCE_KEYS}
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
@@ -98,19 +94,6 @@ def data_presence_identity_values(source: str) -> tuple[set[str], set[str]]:
|
||||
return matches["inn"], matches["ogrn"]
|
||||
|
||||
|
||||
def to_api_data_source(source: str) -> str:
|
||||
"""Return v2 public data source key for an internal parser source."""
|
||||
return API_DATA_SOURCE_ALIASES.get(source, str(source))
|
||||
|
||||
|
||||
def to_internal_data_source(source: str) -> str:
|
||||
"""Return internal parser source key from a v2 public key."""
|
||||
for internal_source, api_source in API_DATA_SOURCE_ALIASES.items():
|
||||
if source == api_source:
|
||||
return str(internal_source)
|
||||
return source
|
||||
|
||||
|
||||
def _source_matches(source: str) -> dict[str, set[str]]:
|
||||
if source == ParserLoadLog.Source.INDUSTRIAL:
|
||||
return OrganizationApiEnrichmentService._matching_identifiers_for_all(
|
||||
|
||||
48
src/organizations/data_sources.py
Normal file
48
src/organizations/data_sources.py
Normal file
@@ -0,0 +1,48 @@
|
||||
"""Helpers for organization API data source keys and summaries."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
API_DATA_SOURCE_ALIASES = {
|
||||
"trudvsem": "vacancies",
|
||||
}
|
||||
|
||||
|
||||
def to_api_data_source(source: str) -> str:
|
||||
"""Return v2 public data source key for an internal parser source."""
|
||||
return API_DATA_SOURCE_ALIASES.get(str(source), str(source))
|
||||
|
||||
|
||||
def to_internal_data_source(source: str) -> str:
|
||||
"""Return internal parser source key from a v2 public key."""
|
||||
for internal_source, api_source in API_DATA_SOURCE_ALIASES.items():
|
||||
if source == api_source:
|
||||
return internal_source
|
||||
return source
|
||||
|
||||
|
||||
def snapshot_data_with_api_keys(data: dict[str, Any]) -> dict[str, Any]:
|
||||
"""Return snapshot data keyed by public API source names."""
|
||||
return {to_api_data_source(source): value for source, value in data.items()}
|
||||
|
||||
|
||||
def data_source_summary(data: dict[str, Any]) -> list[dict[str, int | str]]:
|
||||
"""Return non-empty source counters for a data payload."""
|
||||
summary: list[dict[str, int | str]] = []
|
||||
for source in sorted(data):
|
||||
value = data[source]
|
||||
if isinstance(value, list):
|
||||
count = len(value)
|
||||
elif value:
|
||||
count = 1
|
||||
else:
|
||||
count = 0
|
||||
if count:
|
||||
summary.append({"source": source, "count": count})
|
||||
return summary
|
||||
|
||||
|
||||
def snapshot_data_source_summary(data: dict[str, Any]) -> list[dict[str, int | str]]:
|
||||
"""Return non-empty public source counters for stored snapshot data."""
|
||||
return data_source_summary(snapshot_data_with_api_keys(data))
|
||||
@@ -1,6 +1,7 @@
|
||||
"""Filters for organizations API v2."""
|
||||
|
||||
from django.db.models import CharField, Exists, OuterRef, Q
|
||||
from apps.parsers.models import FinancialReport, ParserLoadLog
|
||||
from django.db.models import CharField, Q
|
||||
from django.db.models.functions import Cast
|
||||
from django_filters import rest_framework as filters
|
||||
from registers.models import RegistryMembershipPeriod
|
||||
@@ -84,6 +85,9 @@ class OrganizationFilter(filters.FilterSet):
|
||||
if source not in DATA_PRESENCE_KEYS:
|
||||
return queryset.none()
|
||||
|
||||
if source == ParserLoadLog.Source.FNS_REPORTS:
|
||||
return self._filter_by_fns_report_presence(queryset, value)
|
||||
|
||||
inn_values, ogrn_values = data_presence_identity_values(source)
|
||||
filtered = self._filter_by_registry_identities(
|
||||
queryset, inn_values, ogrn_values
|
||||
@@ -106,6 +110,17 @@ class OrganizationFilter(filters.FilterSet):
|
||||
query |= Q(ogrn__in=ogrn_values) | Q(ogrip__in=ogrn_values)
|
||||
return queryset.filter(query)
|
||||
|
||||
@staticmethod
|
||||
def _filter_by_fns_report_presence(queryset, value):
|
||||
report_ogrns = FinancialReport.objects.order_by().values_list(
|
||||
"ogrn",
|
||||
flat=True,
|
||||
)
|
||||
query = Q(ogrn__in=report_ogrns) | Q(ogrip__in=report_ogrns)
|
||||
if value:
|
||||
return queryset.filter(query)
|
||||
return queryset.exclude(query)
|
||||
|
||||
@classmethod
|
||||
def _filter_by_registry_membership(
|
||||
cls,
|
||||
@@ -115,23 +130,44 @@ class OrganizationFilter(filters.FilterSet):
|
||||
registry_name: str | None = None,
|
||||
has_registry: bool = True,
|
||||
):
|
||||
membership = cls._registry_membership_subquery(
|
||||
query = cls._registry_identity_query(
|
||||
registry_id=registry_id,
|
||||
registry_name=registry_name,
|
||||
)
|
||||
return queryset.annotate(_has_registry=Exists(membership)).filter(
|
||||
_has_registry=has_registry
|
||||
if has_registry:
|
||||
return queryset.filter(query)
|
||||
return queryset.exclude(query)
|
||||
|
||||
@classmethod
|
||||
def _registry_identity_query(
|
||||
cls,
|
||||
*,
|
||||
registry_id: str | None = None,
|
||||
registry_name: str | None = None,
|
||||
):
|
||||
inn_values, ogrn_values = cls._registry_identity_value_querysets(
|
||||
registry_id=registry_id,
|
||||
registry_name=registry_name,
|
||||
)
|
||||
return (
|
||||
Q(inn__in=inn_values) | Q(ogrn__in=ogrn_values) | Q(ogrip__in=ogrn_values)
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _registry_membership_subquery(
|
||||
def _registry_identity_value_querysets(
|
||||
*,
|
||||
registry_id: str | None = None,
|
||||
registry_name: str | None = None,
|
||||
):
|
||||
membership = RegistryMembershipPeriod.objects.filter(
|
||||
ended_at__isnull=True,
|
||||
).annotate(
|
||||
).order_by()
|
||||
if registry_id:
|
||||
membership = membership.filter(registry_id=registry_id)
|
||||
if registry_name:
|
||||
membership = membership.filter(registry__name__icontains=registry_name)
|
||||
|
||||
membership = membership.annotate(
|
||||
organization_inn_text=Cast(
|
||||
"organization__mn_inn", output_field=CharField()
|
||||
),
|
||||
@@ -139,13 +175,8 @@ class OrganizationFilter(filters.FilterSet):
|
||||
"organization__mn_ogrn", output_field=CharField()
|
||||
),
|
||||
)
|
||||
if registry_id:
|
||||
membership = membership.filter(registry_id=registry_id)
|
||||
if registry_name:
|
||||
membership = membership.filter(registry__name__icontains=registry_name)
|
||||
|
||||
return membership.filter(
|
||||
Q(organization_inn_text=OuterRef("inn"))
|
||||
| Q(organization_ogrn_text=OuterRef("ogrn"))
|
||||
| Q(organization_ogrn_text=OuterRef("ogrip"))
|
||||
return (
|
||||
membership.values_list("organization_inn_text", flat=True),
|
||||
membership.values_list("organization_ogrn_text", flat=True),
|
||||
)
|
||||
|
||||
119
src/organizations/migrations/0005_snapshot_data_source_counts.py
Normal file
119
src/organizations/migrations/0005_snapshot_data_source_counts.py
Normal file
@@ -0,0 +1,119 @@
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
API_DATA_SOURCE_ALIASES = {
|
||||
"trudvsem": "vacancies",
|
||||
}
|
||||
|
||||
|
||||
def to_api_data_source(source):
|
||||
return API_DATA_SOURCE_ALIASES.get(str(source), str(source))
|
||||
|
||||
|
||||
def data_source_summary(data):
|
||||
summary = []
|
||||
for source in sorted(data):
|
||||
value = data[source]
|
||||
if isinstance(value, list):
|
||||
count = len(value)
|
||||
elif value:
|
||||
count = 1
|
||||
else:
|
||||
count = 0
|
||||
if count:
|
||||
summary.append({"source": to_api_data_source(source), "count": count})
|
||||
return summary
|
||||
|
||||
|
||||
def backfill_data_source_counts_python(apps):
|
||||
snapshot_model = apps.get_model("organizations", "OrganizationDataSnapshot")
|
||||
updates = []
|
||||
|
||||
for snapshot in snapshot_model.objects.only(
|
||||
"organization_id",
|
||||
"data",
|
||||
"data_source_counts",
|
||||
).iterator(chunk_size=100):
|
||||
snapshot.data_source_counts = data_source_summary(snapshot.data)
|
||||
updates.append(snapshot)
|
||||
if len(updates) >= 100:
|
||||
snapshot_model.objects.bulk_update(updates, ["data_source_counts"])
|
||||
updates = []
|
||||
|
||||
if updates:
|
||||
snapshot_model.objects.bulk_update(updates, ["data_source_counts"])
|
||||
|
||||
|
||||
def backfill_data_source_counts(apps, schema_editor):
|
||||
if schema_editor.connection.vendor != "postgresql":
|
||||
backfill_data_source_counts_python(apps)
|
||||
return
|
||||
|
||||
with schema_editor.connection.cursor() as cursor:
|
||||
cursor.execute(
|
||||
"""
|
||||
UPDATE organizations_data_snapshot snapshot
|
||||
SET data_source_counts = COALESCE(
|
||||
(
|
||||
SELECT jsonb_agg(
|
||||
jsonb_build_object(
|
||||
'source',
|
||||
source_counts.source,
|
||||
'count',
|
||||
source_counts.record_count
|
||||
)
|
||||
ORDER BY source_counts.source
|
||||
)
|
||||
FROM (
|
||||
SELECT CASE source_items.key
|
||||
WHEN 'trudvsem' THEN 'vacancies'
|
||||
ELSE source_items.key
|
||||
END AS source,
|
||||
CASE
|
||||
WHEN jsonb_typeof(source_items.value) = 'array'
|
||||
THEN jsonb_array_length(source_items.value)
|
||||
WHEN source_items.value IN (
|
||||
'null'::jsonb,
|
||||
'false'::jsonb,
|
||||
'[]'::jsonb,
|
||||
'{}'::jsonb,
|
||||
'""'::jsonb
|
||||
)
|
||||
THEN 0
|
||||
ELSE 1
|
||||
END AS record_count
|
||||
FROM jsonb_each(snapshot.data) AS source_items
|
||||
) AS source_counts
|
||||
WHERE source_counts.record_count > 0
|
||||
),
|
||||
'[]'::jsonb
|
||||
)
|
||||
"""
|
||||
)
|
||||
|
||||
|
||||
def clear_data_source_counts(apps, schema_editor):
|
||||
snapshot_model = apps.get_model("organizations", "OrganizationDataSnapshot")
|
||||
snapshot_model.objects.update(data_source_counts=[])
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
dependencies = [
|
||||
("organizations", "0004_seed_daily_snapshot_refresh_schedule"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AddField(
|
||||
model_name="organizationdatasnapshot",
|
||||
name="data_source_counts",
|
||||
field=models.JSONField(
|
||||
default=list,
|
||||
help_text="Готовый JSON data_sources для API v2",
|
||||
verbose_name="счетчики источников",
|
||||
),
|
||||
),
|
||||
migrations.RunPython(
|
||||
backfill_data_source_counts,
|
||||
reverse_code=clear_data_source_counts,
|
||||
),
|
||||
]
|
||||
@@ -6,6 +6,7 @@ from django.db import models
|
||||
from django.db.models import Q
|
||||
from django.utils.translation import gettext_lazy as _
|
||||
|
||||
from organizations.data_sources import snapshot_data_source_summary
|
||||
from organizations.name_normalization import normalize_organization_name
|
||||
|
||||
|
||||
@@ -116,6 +117,11 @@ class OrganizationDataSnapshot(models.Model):
|
||||
default=list,
|
||||
help_text=_("Готовый JSON registries для API v2"),
|
||||
)
|
||||
data_source_counts = models.JSONField(
|
||||
_("счетчики источников"),
|
||||
default=list,
|
||||
help_text=_("Готовый JSON data_sources для API v2"),
|
||||
)
|
||||
updated_at = models.DateTimeField(
|
||||
_("дата обновления"),
|
||||
auto_now=True,
|
||||
@@ -129,3 +135,14 @@ class OrganizationDataSnapshot(models.Model):
|
||||
|
||||
def __str__(self) -> str:
|
||||
return f"Snapshot for {self.organization_id}"
|
||||
|
||||
def save(self, *args, **kwargs) -> None:
|
||||
update_fields = kwargs.get("update_fields")
|
||||
if update_fields is None or "data" in update_fields:
|
||||
self.data_source_counts = snapshot_data_source_summary(self.data)
|
||||
if update_fields is not None:
|
||||
kwargs["update_fields"] = list(
|
||||
dict.fromkeys([*update_fields, "data_source_counts"])
|
||||
)
|
||||
|
||||
super().save(*args, **kwargs)
|
||||
|
||||
@@ -4,7 +4,10 @@ from typing import Any
|
||||
|
||||
from rest_framework import serializers
|
||||
|
||||
from organizations.api_enrichment import to_api_data_source
|
||||
from organizations.data_sources import (
|
||||
data_source_summary,
|
||||
snapshot_data_with_api_keys,
|
||||
)
|
||||
from organizations.models import Organization
|
||||
|
||||
|
||||
@@ -36,8 +39,11 @@ class OrganizationSerializer(serializers.ModelSerializer):
|
||||
def get_data(self, obj) -> dict[str, Any]:
|
||||
snapshot = getattr(obj, "data_snapshot", None)
|
||||
if snapshot is not None:
|
||||
data = _snapshot_data_with_api_keys(snapshot.data)
|
||||
data_sources = self.context.get("data_sources")
|
||||
if data_sources is not None and not data_sources:
|
||||
return {}
|
||||
|
||||
data = snapshot_data_with_api_keys(snapshot.data)
|
||||
if data_sources is None:
|
||||
return data
|
||||
return {
|
||||
@@ -61,13 +67,17 @@ class OrganizationSerializer(serializers.ModelSerializer):
|
||||
def get_data_sources(self, obj) -> list[dict[str, int | str]]:
|
||||
snapshot = getattr(obj, "data_snapshot", None)
|
||||
if snapshot is not None:
|
||||
data = _snapshot_data_with_api_keys(snapshot.data)
|
||||
return _data_source_summary(data)
|
||||
data_source_counts = getattr(snapshot, "data_source_counts", None)
|
||||
if data_source_counts:
|
||||
return data_source_counts
|
||||
if "data" in snapshot.get_deferred_fields():
|
||||
return []
|
||||
return data_source_summary(snapshot_data_with_api_keys(snapshot.data))
|
||||
|
||||
enrichment = self.context.get("enrichment", {}).get(str(obj.uid))
|
||||
if enrichment is None:
|
||||
return []
|
||||
return _data_source_summary(enrichment.data_presence)
|
||||
return data_source_summary(enrichment.data_presence)
|
||||
|
||||
def get_registries(self, obj) -> list[dict[str, str]]:
|
||||
snapshot = getattr(obj, "data_snapshot", None)
|
||||
@@ -84,22 +94,3 @@ class OrganizationSerializer(serializers.ModelSerializer):
|
||||
}
|
||||
for registry in enrichment.registries
|
||||
]
|
||||
|
||||
|
||||
def _snapshot_data_with_api_keys(data: dict[str, Any]) -> dict[str, Any]:
|
||||
return {to_api_data_source(source): value for source, value in data.items()}
|
||||
|
||||
|
||||
def _data_source_summary(data: dict[str, Any]) -> list[dict[str, int | str]]:
|
||||
summary: list[dict[str, int | str]] = []
|
||||
for source in sorted(data):
|
||||
value = data[source]
|
||||
if isinstance(value, list):
|
||||
count = len(value)
|
||||
elif value:
|
||||
count = 1
|
||||
else:
|
||||
count = 0
|
||||
if count:
|
||||
summary.append({"source": source, "count": count})
|
||||
return summary
|
||||
|
||||
@@ -23,6 +23,7 @@ from django.utils import timezone
|
||||
from registers.models import Organization as RegisterOrganization
|
||||
|
||||
from organizations.api_enrichment import OrganizationApiEnrichmentService
|
||||
from organizations.data_sources import data_source_summary
|
||||
from organizations.models import Organization, OrganizationDataSnapshot
|
||||
|
||||
_QUOTE_CHARS = "\"'«»„“”"
|
||||
@@ -132,6 +133,7 @@ class OrganizationDataSnapshotRefreshService:
|
||||
processed += 1
|
||||
item = enrichment[str(organization.uid)]
|
||||
data = item.data_presence
|
||||
data_source_counts = data_source_summary(data)
|
||||
registries = [
|
||||
{
|
||||
"id": registry.id,
|
||||
@@ -146,12 +148,14 @@ class OrganizationDataSnapshotRefreshService:
|
||||
OrganizationDataSnapshot(
|
||||
organization=organization,
|
||||
data=data,
|
||||
data_source_counts=data_source_counts,
|
||||
registries=registries,
|
||||
)
|
||||
)
|
||||
continue
|
||||
|
||||
snapshot.data = data
|
||||
snapshot.data_source_counts = data_source_counts
|
||||
snapshot.registries = registries
|
||||
snapshot.updated_at = timezone.now()
|
||||
update_instances.append(snapshot)
|
||||
@@ -165,7 +169,7 @@ class OrganizationDataSnapshotRefreshService:
|
||||
if update_instances:
|
||||
OrganizationDataSnapshot.objects.bulk_update(
|
||||
update_instances,
|
||||
fields=["data", "registries", "updated_at"],
|
||||
fields=["data", "data_source_counts", "registries", "updated_at"],
|
||||
batch_size=batch_size,
|
||||
)
|
||||
updated += len(update_instances)
|
||||
|
||||
@@ -285,9 +285,7 @@ class CachedReadOnlyMixin:
|
||||
class OrganizationViewSet(CachedReadOnlyMixin, ReadOnlyModelViewSet):
|
||||
"""Read-only API for canonical organizations."""
|
||||
|
||||
queryset = Organization.objects.select_related("data_snapshot").order_by(
|
||||
"name", "uid"
|
||||
)
|
||||
queryset = Organization.objects.order_by("name", "uid")
|
||||
serializer_class = OrganizationSerializer
|
||||
permission_classes = [IsAuthenticated]
|
||||
lookup_field = "uid"
|
||||
@@ -307,7 +305,10 @@ class OrganizationViewSet(CachedReadOnlyMixin, ReadOnlyModelViewSet):
|
||||
return super().get_permissions()
|
||||
|
||||
def get_queryset(self):
|
||||
queryset = super().get_queryset()
|
||||
queryset = super().get_queryset().select_related("data_snapshot")
|
||||
if self._should_defer_snapshot_data():
|
||||
queryset = queryset.defer("data_snapshot__data")
|
||||
|
||||
if self.action != "list" or "has_registry" in self.request.query_params:
|
||||
return queryset
|
||||
|
||||
@@ -320,6 +321,20 @@ class OrganizationViewSet(CachedReadOnlyMixin, ReadOnlyModelViewSet):
|
||||
return filterset.qs
|
||||
return queryset
|
||||
|
||||
def _should_defer_snapshot_data(self) -> bool:
|
||||
if getattr(self, "action", None) != "list":
|
||||
return False
|
||||
|
||||
return not any(
|
||||
name in self.request.query_params
|
||||
for name in (
|
||||
"data",
|
||||
"data_sources",
|
||||
"exclude_data",
|
||||
"exclude_data_sources",
|
||||
)
|
||||
)
|
||||
|
||||
@swagger_auto_schema(
|
||||
tags=[ORGANIZATIONS_TAG],
|
||||
operation_id="v2_organizations_list",
|
||||
|
||||
Reference in New Issue
Block a user