From 0f17ff677361b965292473f8ac9a790378a14b2d Mon Sep 17 00:00:00 2001 From: Aleksandr Meshchriakov Date: Wed, 6 May 2026 19:04:46 +0200 Subject: [PATCH] Add organizations v2 API and registry enrichment --- .env.dev | 2 +- .env.prod.example | 5 +- .gitignore | 2 + check_organizations_v2_api.py | 79 + docker-compose.dev.yml | 6 - docker/Dockerfile | 17 +- ...dashboard-registry-enrichment-analytics.md | 205 +++ ...rd-registry-enrichment-analytics-design.md | 198 ++ src/apps/core/services.py | 96 +- src/apps/parsers/clients/proverki/client.py | 94 +- src/apps/parsers/clients/trudvsem/client.py | 63 +- src/apps/parsers/clients/vacancies.py | 369 ++++ src/apps/parsers/frontend_compat.py | 30 +- ...seed_daily_registry_enrichment_schedule.py | 61 + src/apps/parsers/organization_enrichment.py | 236 +++ src/apps/parsers/serializers.py | 134 +- src/apps/parsers/services.py | 193 +- src/apps/parsers/source_cards.py | 2 + src/apps/parsers/source_registry.py | 15 +- src/apps/parsers/tasks.py | 984 ++++++++-- src/apps/parsers/views.py | 437 ++++- src/core/api_v2_urls.py | 10 + src/core/urls.py | 1 + src/organizations/__init__.py | 1 + src/organizations/admin.py | 22 + src/organizations/api_enrichment.py | 775 ++++++++ src/organizations/apps.py | 9 + src/organizations/filters.py | 151 ++ src/organizations/management/__init__.py | 1 + .../management/commands/__init__.py | 1 + .../commands/populate_organizations.py | 31 + .../refresh_organization_data_snapshots.py | 49 + src/organizations/migrations/0001_initial.py | 131 ++ .../0002_organization_data_snapshot.py | 57 + .../0003_allow_branch_kpp_organizations.py | 36 + src/organizations/migrations/__init__.py | 1 + src/organizations/models.py | 131 ++ src/organizations/name_normalization.py | 159 ++ src/organizations/serializers.py | 98 + src/organizations/services.py | 703 ++++++++ src/organizations/tasks.py | 37 + src/organizations/urls.py | 13 + src/organizations/views.py | 456 +++++ src/registers/serializers.py | 2 + 
src/registers/services.py | 125 +- src/settings/base.py | 5 + src/settings/dev.py | 3 + src/templates/dashboard.html | 1598 ++++++++++++++++- tests/apps/core/test_bulk_operations.py | 35 + tests/apps/organizations/__init__.py | 1 + tests/apps/organizations/test_api_v2.py | 921 ++++++++++ tests/apps/organizations/test_models.py | 57 + .../organizations/test_populate_command.py | 143 ++ tests/apps/organizations/test_services.py | 128 ++ tests/apps/parsers/test_dashboard_page.py | 75 + tests/apps/parsers/test_proverki_client.py | 173 +- tests/apps/parsers/test_services.py | 28 + tests/apps/parsers/test_tasks.py | 656 ++++++- tests/apps/parsers/test_vacancy_clients.py | 255 +++ tests/apps/parsers/test_views.py | 358 ++++ tests/apps/registers/test_services.py | 72 + tests/test_api_inventory_e2e.py | 5 + 62 files changed, 10311 insertions(+), 430 deletions(-) create mode 100644 check_organizations_v2_api.py create mode 100644 docs/superpowers/plans/2026-05-06-dashboard-registry-enrichment-analytics.md create mode 100644 docs/superpowers/specs/2026-05-06-dashboard-registry-enrichment-analytics-design.md create mode 100644 src/apps/parsers/clients/vacancies.py create mode 100644 src/apps/parsers/migrations/0022_seed_daily_registry_enrichment_schedule.py create mode 100644 src/apps/parsers/organization_enrichment.py create mode 100644 src/core/api_v2_urls.py create mode 100644 src/organizations/__init__.py create mode 100644 src/organizations/admin.py create mode 100644 src/organizations/api_enrichment.py create mode 100644 src/organizations/apps.py create mode 100644 src/organizations/filters.py create mode 100644 src/organizations/management/__init__.py create mode 100644 src/organizations/management/commands/__init__.py create mode 100644 src/organizations/management/commands/populate_organizations.py create mode 100644 src/organizations/management/commands/refresh_organization_data_snapshots.py create mode 100644 src/organizations/migrations/0001_initial.py create mode 
100644 src/organizations/migrations/0002_organization_data_snapshot.py create mode 100644 src/organizations/migrations/0003_allow_branch_kpp_organizations.py create mode 100644 src/organizations/migrations/__init__.py create mode 100644 src/organizations/models.py create mode 100644 src/organizations/name_normalization.py create mode 100644 src/organizations/serializers.py create mode 100644 src/organizations/services.py create mode 100644 src/organizations/tasks.py create mode 100644 src/organizations/urls.py create mode 100644 src/organizations/views.py create mode 100644 tests/apps/organizations/__init__.py create mode 100644 tests/apps/organizations/test_api_v2.py create mode 100644 tests/apps/organizations/test_models.py create mode 100644 tests/apps/organizations/test_populate_command.py create mode 100644 tests/apps/organizations/test_services.py create mode 100644 tests/apps/parsers/test_vacancy_clients.py diff --git a/.env.dev b/.env.dev index ed46e27..f5abbb2 100644 --- a/.env.dev +++ b/.env.dev @@ -1,5 +1,5 @@ # Docker Compose development environment -DJANGO_SETTINGS_MODULE=config.settings.dev +DJANGO_SETTINGS_MODULE=settings.dev POSTGRES_HOST=db POSTGRES_PORT=5432 diff --git a/.env.prod.example b/.env.prod.example index f266c09..20622cb 100644 --- a/.env.prod.example +++ b/.env.prod.example @@ -24,8 +24,9 @@ CELERY_LOG_LEVEL=INFO CELERY_WORKER_CONCURRENCY=2 # Parsers API keys -CHECKO_API_KEY=pRiEnJuD1tclsLCb -ZAKUPKI_TOKEN=019c03d7-e1f6-7091-b296-8c88b4c585dd +CHECKO_API_KEY= +ZAKUPKI_TOKEN= +SUPERJOB_APP_ID= # Optional: comma-separated HTTP(S) proxies for parser tasks # Example: PARSER_PROXIES=http://user:pass@proxy1:8080,http://user:pass@proxy2:8080 PARSER_PROXIES= diff --git a/.gitignore b/.gitignore index b70dc6d..32760f3 100644 --- a/.gitignore +++ b/.gitignore @@ -14,6 +14,7 @@ build/ .venv venv/ .env.local +.env.devstack .env.*.local # Django @@ -41,6 +42,7 @@ Thumbs.db *.bak *.backupdata/ data/ +deteil.json .zed/ .env.prod tmp/ diff --git 
#!/usr/bin/env python3
"""Smoke-check organizations API v2 list and detail endpoints."""

from __future__ import annotations

import argparse
import json
import sys
from typing import Any
from urllib.error import HTTPError, URLError
from urllib.parse import quote, urlencode, urlparse
from urllib.request import Request, urlopen


def request_json(url: str) -> dict[str, Any]:
    """Send a GET request to *url* and return the decoded JSON object.

    Args:
        url: Absolute ``http``/``https`` URL to request.

    Returns:
        The response body parsed as a JSON object.

    Raises:
        RuntimeError: If the scheme is not http(s), the request fails at the
            HTTP or transport level, or the body is not a UTF-8 JSON object.
            Every failure is normalized to RuntimeError so the script entry
            point can report it without a traceback.
    """
    parsed_url = urlparse(url)
    if parsed_url.scheme not in {"http", "https"}:
        raise RuntimeError(f"Unsupported URL scheme for {url}")

    request = Request(url, headers={"Accept": "application/json"})  # noqa: S310
    try:
        with urlopen(request, timeout=30) as response:  # noqa: S310
            raw_body = response.read()
    except HTTPError as exc:
        detail = exc.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"GET {url} failed: HTTP {exc.code}: {detail}") from exc
    except URLError as exc:
        raise RuntimeError(f"GET {url} failed: {exc.reason}") from exc
    except OSError as exc:
        # Socket timeouts while reading the body are not wrapped in URLError.
        raise RuntimeError(f"GET {url} failed: {exc}") from exc

    try:
        # UnicodeDecodeError and json.JSONDecodeError are both ValueError.
        payload = json.loads(raw_body.decode("utf-8"))
    except ValueError as exc:
        raise RuntimeError(f"GET {url} returned invalid JSON: {exc}") from exc

    if not isinstance(payload, dict):
        raise RuntimeError(
            f"GET {url} returned {type(payload).__name__}, expected a JSON object"
        )
    return payload


def first_result_uid(payload: dict[str, Any]) -> str:
    """Return the ``uid`` of the first organization in a list response.

    Args:
        payload: Decoded list response; organizations are expected under the
            ``data`` key.

    Raises:
        RuntimeError: If ``data`` is missing, empty, or not a list, or if the
            first item is not an object carrying a non-empty string ``uid``.
    """
    data = payload.get("data")
    if not isinstance(data, list) or not data:
        raise RuntimeError("List response does not contain any organizations in data")

    first = data[0]
    uid = first.get("uid") if isinstance(first, dict) else None
    if not isinstance(uid, str) or not uid:
        raise RuntimeError("First organization does not contain uid")
    return uid


def main() -> int:
    """Request the organizations list, then the detail of its first item.

    Both payloads are pretty-printed to stdout.

    Returns:
        Process exit code (``0`` on success). Failures surface as
        RuntimeError raised by the helpers.
    """
    parser = argparse.ArgumentParser(
        description="Send GET list and GET item requests to organizations API v2."
    )
    parser.add_argument(
        "--base-url",
        default="http://127.0.0.1:8000",
        help="Backend base URL, default: http://127.0.0.1:8000",
    )
    parser.add_argument("--page-size", type=int, default=1)
    args = parser.parse_args()

    base_url = args.base_url.rstrip("/")
    query = urlencode({"page_size": args.page_size})
    list_url = f"{base_url}/api/v2/organizations/?{query}"

    print(f"GET list: {list_url}")
    list_payload = request_json(list_url)
    print(json.dumps(list_payload, ensure_ascii=False, indent=2))

    uid = first_result_uid(list_payload)
    # Quote the identifier so an unexpected character cannot alter the path.
    item_url = f"{base_url}/api/v2/organizations/{quote(uid, safe='')}/"

    print(f"\nGET item: {item_url}")
    item_payload = request_json(item_url)
    print(json.dumps(item_payload, ensure_ascii=False, indent=2))

    return 0


if __name__ == "__main__":
    try:
        raise SystemExit(main())
    except RuntimeError as exc:
        print(exc, file=sys.stderr)
        raise SystemExit(1) from exc
POSTGRES_HOST=db \ POSTGRES_PORT=5432 \ POSTGRES_DB=mostovik \ POSTGRES_USER=postgres \ POSTGRES_PASSWORD=postgres \ POSTGRES_SSLMODE=disable \ - REDIS_HOST=10.10.0.110 \ - REDIS_CACHE_URL=redis://10.10.0.110:6379/1 \ - CELERY_BROKER_URL=redis://10.10.0.110:6379/0 \ - CELERY_RESULT_BACKEND=redis://10.10.0.110:6379/0 \ + REDIS_HOST=redis \ + REDIS_CACHE_URL=redis://redis:6379/1 \ + CELERY_BROKER_URL=redis://redis:6379/0 \ + CELERY_RESULT_BACKEND=redis://redis:6379/0 \ PORT=8000 \ GUNICORN_WORKERS=4 \ GUNICORN_TIMEOUT=60 \ CELERY_LOG_LEVEL=INFO \ CELERY_WORKER_CONCURRENCY=2 \ - CHECKO_API_KEY=pRiEnJuD1tclsLCb \ - ZAKUPKI_TOKEN=019c03d7-e1f6-7091-b296-8c88b4c585dd \ + CHECKO_API_KEY= \ + ZAKUPKI_TOKEN= \ + SUPERJOB_APP_ID= \ COLLECTSTATIC_ON_MIGRATE=0 \ - BACKUP_ENCRYPTION_KEY=a2tra2tra2tra2tra2tra2tra2tra2tra2tra2s \ + BACKUP_ENCRYPTION_KEY= \ BACKUP_KEY_ID=default \ BACKUP_EXPORT_DIRECTORY=/app/media/backups \ STATE_CORP_EXCHANGE_URL= \ diff --git a/docs/superpowers/plans/2026-05-06-dashboard-registry-enrichment-analytics.md b/docs/superpowers/plans/2026-05-06-dashboard-registry-enrichment-analytics.md new file mode 100644 index 0000000..9f3ff57 --- /dev/null +++ b/docs/superpowers/plans/2026-05-06-dashboard-registry-enrichment-analytics.md @@ -0,0 +1,205 @@ +# Dashboard Registry Enrichment Analytics Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Rebuild the dashboard analytics tab around active registry organizations and the enrichment pipeline that fills data for them. + +**Architecture:** Keep the existing Django API endpoint `/api/v1/parsers/dashboard/` and template `src/templates/dashboard.html`. 
Add a focused backend aggregate `registry_enrichment_analytics` beside existing fields, then make the analytics tab render registry coverage, matrix, pipeline, action queue, and secondary technical counters from that aggregate. + +**Tech Stack:** Django 3.2, DRF, existing parser/register ORM models, server-rendered HTML with inline vanilla JS/CSS, pytest. + +--- + +### Task 1: Backend Analytics Contract + +**Files:** +- Modify: `src/apps/parsers/views.py` +- Test: `tests/apps/parsers/test_views.py` + +- [ ] **Step 1: Write failing API test** + +Add a test that creates two active registry organizations in two registries, FNS data for one organization, industrial data for one organization, and `unfair_suppliers` for one organization. Assert: + +- `registry_enrichment_analytics` exists. +- active population is `2`. +- `source_coverage` contains FNS/industrial and excludes `unfair_suppliers`. +- `risk_signals` contains `unfair_suppliers`. +- `registry_source_matrix` has per-registry source counts. +- `core_profile_complete` is `1` when one organization has FNS and industrial coverage. + +Run: + +```bash +PYTHONPATH=src .venv/bin/pytest tests/apps/parsers/test_views.py::ParsersViewSetTest::test_dashboard_data_exposes_registry_enrichment_analytics -v +``` + +Expected: FAIL because `registry_enrichment_analytics` is missing. + +- [ ] **Step 2: Implement aggregate helpers** + +In `src/apps/parsers/views.py`, add helpers based on the existing registry coverage matching: + +- active registry organization identity indexes. +- source matched organization id sets. +- source coverage entries. +- risk signal entries. +- registry/source matrix rows. +- pipeline summary from schedules, jobs, and load logs. + +Use source matching rules already present: + +- default fields: `inn`, `ogrn`. +- FNS: `ogrn` only. +- legacy procurements: `customer_inn`, `customer_ogrn`. +- `unfair_suppliers`: risk signal, not completeness. 
+ +- [ ] **Step 3: Add aggregate to dashboard response** + +Return `registry_enrichment_analytics` from `ParserDashboardDataView.get()` while keeping existing `registry_data_coverage`. + +- [ ] **Step 4: Verify backend test passes** + +Run: + +```bash +PYTHONPATH=src .venv/bin/pytest tests/apps/parsers/test_views.py::ParsersViewSetTest::test_dashboard_data_exposes_registry_enrichment_analytics -v +``` + +Expected: PASS. + +### Task 2: Analytics Template Structure + +**Files:** +- Modify: `src/templates/dashboard.html` +- Test: `tests/apps/parsers/test_dashboard_page.py` + +- [ ] **Step 1: Write failing template test** + +Add assertions that `/dashboard` contains: + +- `analyticsRegistryKpis` +- `registrySourceCoverageChart` +- `registrySourceMatrix` +- `enrichmentPipelinePanel` +- `analyticsActionQueue` +- `technicalSourceCounters` +- `renderRegistryEnrichmentAnalytics` +- `renderRegistrySourceMatrix` + +Run: + +```bash +PYTHONPATH=src .venv/bin/pytest tests/apps/parsers/test_dashboard_page.py::ParserDashboardPageTest::test_dashboard_prioritizes_registry_enrichment_analytics -v +``` + +Expected: FAIL because the new DOM/functions are missing. + +- [ ] **Step 2: Replace analytics panel markup** + +Update `analyticsPanel` so its first visible blocks are: + +- registry organization KPI grid. +- registry source coverage + matrix. +- enrichment pipeline + action queue. +- secondary technical counters below. + +- [ ] **Step 3: Add CSS for compact bars and matrix** + +Add focused classes: + +- `.analytics-hero-grid` +- `.registry-coverage-layout` +- `.source-coverage-list` +- `.registry-matrix` +- `.matrix-cell` +- `.action-list` +- `.technical-counters` + +- [ ] **Step 4: Verify template test passes** + +Run: + +```bash +PYTHONPATH=src .venv/bin/pytest tests/apps/parsers/test_dashboard_page.py::ParserDashboardPageTest::test_dashboard_prioritizes_registry_enrichment_analytics -v +``` + +Expected: PASS. 
+ +### Task 3: Frontend Rendering + +**Files:** +- Modify: `src/templates/dashboard.html` +- Test: `tests/apps/parsers/test_dashboard_page.py` + +- [ ] **Step 1: Implement render functions** + +Add or update inline JS: + +- `renderRegistryEnrichmentAnalytics()` +- `renderRegistrySourceCoverage()` +- `renderRegistrySourceMatrix()` +- `renderEnrichmentPipeline()` +- `renderAnalyticsActionQueue()` +- `renderTechnicalSourceCounters()` + +Change `renderAnalytics()` to call these functions and keep existing status/source totals as secondary content. + +- [ ] **Step 2: Add defensive empty states** + +If `dashboardData.registry_enrichment_analytics` is missing, render an empty state and keep secondary counters visible. + +- [ ] **Step 3: Syntax-check JS** + +Run: + +```bash +perl -0ne 'while (m{<script[^>]*>(.*?)</script>}sg) { print $1 }' src/templates/dashboard.html > /tmp/mostovik-dashboard-inline.js +node --check /tmp/mostovik-dashboard-inline.js +``` + +Expected: exit code 0. + +### Task 4: Verification + +**Files:** +- No new files. + +- [ ] **Step 1: Run focused tests** + +Run: + +```bash +PYTHONPATH=src .venv/bin/pytest tests/apps/parsers/test_views.py::ParsersViewSetTest::test_dashboard_data_exposes_registry_enrichment_analytics tests/apps/parsers/test_dashboard_page.py::ParserDashboardPageTest::test_dashboard_prioritizes_registry_enrichment_analytics -v +``` + +Expected: PASS. + +- [ ] **Step 2: Run dashboard/parser regression suite** + +Run: + +```bash +PYTHONPATH=src .venv/bin/pytest tests/apps/parsers/test_views.py tests/apps/parsers/test_dashboard_page.py tests/apps/organizations tests/apps/registers/test_services.py +``` + +Expected: PASS. + +- [ ] **Step 3: Run Django system check** + +Run: + +```bash +PYTHONPATH=src .venv/bin/python src/manage.py check +``` + +Expected: `System check identified no issues`. + +- [ ] **Step 4: Browser smoke test** + +Open `/dashboard`, confirm: + +- first analytics section is registry coverage, not raw source records. +- matrix renders.
+- pipeline renders. +- source totals moved below primary analytics. +- existing organization/FNS drill-down still works. diff --git a/docs/superpowers/specs/2026-05-06-dashboard-registry-enrichment-analytics-design.md b/docs/superpowers/specs/2026-05-06-dashboard-registry-enrichment-analytics-design.md new file mode 100644 index 0000000..a6fa911 --- /dev/null +++ b/docs/superpowers/specs/2026-05-06-dashboard-registry-enrichment-analytics-design.md @@ -0,0 +1,198 @@ +# Dashboard Registry Enrichment Analytics Design + +Date: 2026-05-06 + +## Goal + +Rework the dashboard analytics tab so it treats active registry organizations as the primary population and parser/enrichment jobs as the operational process that fills data for those organizations. + +The existing analytics page is source-centric: total records, source counts, and load quality. That remains useful, but secondary. The new first screen must answer: + +- How many active registry organizations are under control? +- How many have additional data from enrichment sources? +- Which registries are under-covered by source? +- Which enrichment jobs are scheduled, running, successful, failed, or stale? +- What actions should the operator take next? + +## Scope + +In scope: + +- Rebuild only the `analyticsPanel` dashboard tab. +- Keep current navigation and other dashboard tabs unchanged. +- Add dashboard API aggregate data under `/api/v1/parsers/dashboard/`. +- Use active `RegistryMembershipPeriod` rows as the population. +- Exclude `unfair_suppliers` from completeness degradation. It is a risk signal, not a required enrichment source. +- Keep source record totals available, but move them below the primary registry analytics. + +Out of scope: + +- Changing v2 organization API contracts. +- Changing parser execution behavior. +- Changing Celery scheduling semantics. +- Adding external chart dependencies. + +## UX Structure + +The analytics tab becomes a hybrid of "coverage center" and "enrichment pipeline". 
+ +Top section: Registry Organization Coverage + +- KPI cards: + - Active registry organizations. + - Organizations with at least one enrichment source. + - Organizations with core profile coverage. + - Organizations requiring attention. +- Coverage by source: + - Bar rows for FNS reports, industrial certificates, products, manufacturers, inspections, procurements, arbitration, bankruptcy, FSTEC, vacancies, etc. + - Each row shows matched organization count and percent of active registry organizations. + - `unfair_suppliers` is not included here. +- Registry × source matrix: + - Rows are registries. + - Columns are important enrichment sources. + - Cells show percent coverage for organizations in that registry. + - This gives a fast view of which registry/source pair needs work. + +Second section: Enrichment Pipeline + +- Job KPI cards: + - Active schedules. + - Running jobs. + - Recent successes. + - Recent failures. +- Recent job quality meter: + - Reuse existing load log status data, but frame it as enrichment pipeline health. +- Action queue: + - Organizations without enrichment data. + - Organizations with identifier/matching problems. + - Snapshots older than latest parser batches. + - Risk signals such as unfair suppliers, bankruptcy, GOZ evasion shown separately from coverage. + +Third section: Secondary Technical Counters + +- Current source record totals and source mode breakdown move below the registry-focused blocks. +- These remain useful for diagnostics, but no longer dominate the page. 
+ +## Backend Data Contract + +Extend `/api/v1/parsers/dashboard/` with an analytics object: + +```json +{ + "registry_enrichment_analytics": { + "population": { + "active_registry_organizations": 252, + "active_memberships": 647, + "registries_with_data_percent": 100 + }, + "coverage_summary": { + "with_any_enrichment": 68, + "with_any_enrichment_percent": 27.0, + "core_profile_complete": 21, + "core_profile_complete_percent": 8.3, + "requires_attention": 184 + }, + "source_coverage": [ + { + "source": "fns_reports", + "label": "ФНС отчетность", + "organizations_count": 45, + "coverage_percent": 17.9, + "required_for_core_profile": true, + "risk_signal": false + } + ], + "registry_source_matrix": [ + { + "registry_id": "uuid", + "registry_name": "Реестр ГК Росатом ГОЗ", + "active_organizations": 139, + "sources": { + "fns_reports": { + "organizations_count": 20, + "coverage_percent": 14.4 + } + } + } + ], + "risk_signals": [ + { + "source": "unfair_suppliers", + "label": "Недобросовестные поставщики", + "organizations_count": 3, + "coverage_percent": 1.2 + } + ], + "pipeline": { + "active_schedules": 15, + "running_jobs": 0, + "recent_success": 13, + "recent_failed": 0, + "recent_other": 1 + } + } +} +``` + +The existing `registry_data_coverage` can remain temporarily for compatibility inside dashboard JS, but new UI should read `registry_enrichment_analytics`. + +## Aggregation Rules + +- Population is distinct organizations from active registry memberships: `ended_at IS NULL`. +- Source coverage matches parser records to registry organizations by INN or OGRN. +- `FinancialReport` matches by OGRN. +- legacy `ProcurementRecord` matches by `customer_inn` and `customer_ogrn`. +- `unfair_suppliers` is excluded from completeness and shown as a risk signal. +- Percent values use one decimal place. +- If a source has records but no identifiers, it does not count as organization coverage. 
+ +Core profile completeness for the first version: + +- Organization has FNS reports. +- Organization has at least one industrial/manufacturer/product source. + +This is intentionally conservative and can become configurable later. + +## Frontend Design + +Implementation remains in `src/templates/dashboard.html` for now, following the current dashboard pattern. + +New/updated DOM blocks: + +- `analyticsRegistryKpis` +- `registrySourceCoverageChart` +- `registrySourceMatrix` +- `enrichmentPipelinePanel` +- `analyticsActionQueue` +- `technicalSourceCounters` + +No chart dependency is added. Use CSS bars, compact matrix cells, and existing badges/cards. This keeps the dashboard self-contained. + +## Error Handling + +- If analytics aggregate is missing, show empty states instead of crashing. +- If registries are unavailable, keep the pipeline and technical counters visible. +- If coverage has zero population, render zeroed KPIs and explanatory empty states. + +## Testing + +Add/update tests: + +- Dashboard API returns `registry_enrichment_analytics`. +- `unfair_suppliers` appears in `risk_signals`, not `source_coverage`. +- Matrix counts source coverage per registry. +- Template contains new analytics sections and still includes secondary source counters. +- Existing parser/dashboard tests continue to pass. + +Manual validation: + +- Open `/dashboard`. +- Confirm first visible analytics content is registry organization coverage. +- Confirm source record totals are below primary registry analytics. +- Confirm FNS table and existing organization drill-down are unaffected. + +## Risks + +- Matching by INN/OGRN can undercount sources with incomplete identifiers. +- Current dashboard API may become heavier with matrix aggregation. Keep queries bounded and use grouped SQL where practical. +- Core completeness definition is a business rule; first implementation uses a conservative default and should be easy to adjust. 
diff --git a/src/apps/core/services.py b/src/apps/core/services.py index bc6cebe..1d2cd73 100644 --- a/src/apps/core/services.py +++ b/src/apps/core/services.py @@ -300,6 +300,7 @@ class BulkOperationsMixin: unique_fields: list[str], update_fields: list[str], create_defaults: dict | None = None, + chunk_size: int = 500, ) -> tuple[int, int]: """ Upsert: обновить существующие или создать новые. @@ -309,31 +310,96 @@ class BulkOperationsMixin: unique_fields: Поля для поиска существующих update_fields: Поля для обновления create_defaults: Значения по умолчанию для создания + chunk_size: Размер чанка для bulk_create/bulk_update Returns: (created_count, updated_count) """ - created_count = 0 - updated_count = 0 + if not items: + return 0, 0 + defaults = create_defaults or {} - for item in items: - lookup = {field: item[field] for field in unique_fields} - update_data = { - field: item[field] for field in update_fields if field in item - } + items_by_lookup = { + tuple(item[field] for field in unique_fields): item for item in items + } + existing_by_lookup = cls._bulk_existing_by_lookup( + lookup_keys=list(items_by_lookup), + unique_fields=unique_fields, + chunk_size=chunk_size, + ) - obj, created = cls.model.objects.update_or_create( - **lookup, - defaults={**update_data, **defaults}, + create_instances = [] + update_instances = [] + effective_update_fields = set(update_fields) | set(defaults) + if hasattr(cls.model, "updated_at"): + effective_update_fields.add("updated_at") + now = timezone.now() + + for lookup_key, item in items_by_lookup.items(): + existing = existing_by_lookup.get(lookup_key) + if existing is None: + create_instances.append(cls.model(**{**item, **defaults})) + continue + + for field in update_fields: + if field in item: + setattr(existing, field, item[field]) + for field, value in defaults.items(): + setattr(existing, field, value) + if hasattr(existing, "updated_at"): + existing.updated_at = now + update_instances.append(existing) + + created_count 
@classmethod
def _bulk_existing_by_lookup(
    cls,
    *,
    lookup_keys: list[tuple],
    unique_fields: list[str],
    chunk_size: int,
) -> dict[tuple, models.Model]:
    """Fetch existing rows for the given lookup keys without per-row queries.

    Args:
        lookup_keys: Tuples of values, ordered like ``unique_fields``.
        unique_fields: Model field names that identify a row.
        chunk_size: Maximum number of lookup keys sent in one query.

    Returns:
        Mapping from a tuple of the row's ``unique_fields`` values to the
        model instance returned by the query.
    """
    found: dict[tuple, models.Model] = {}
    if not lookup_keys:
        return found

    # A single unique field can use one ``__in`` filter per chunk; composite
    # keys need an OR of exact-match conditions instead.
    single_field = unique_fields[0] if len(unique_fields) == 1 else None

    for start in range(0, len(lookup_keys), chunk_size):
        batch = lookup_keys[start : start + chunk_size]
        if single_field is not None:
            rows = cls.model.objects.filter(
                **{f"{single_field}__in": [key[0] for key in batch]}
            )
        else:
            condition = Q()
            for key in batch:
                condition |= Q(**dict(zip(unique_fields, key, strict=True)))
            rows = cls.model.objects.filter(condition)

        for row in rows:
            found[tuple(getattr(row, name) for name in unique_fields)] = row

    return found
# ProverkiClient helper methods (the class header lies outside this hunk).


def _playwright_proxy(self) -> dict[str, str] | None:
    """Return the first configured proxy as Playwright proxy settings.

    Returns:
        ``None`` when no proxies are configured. Otherwise a dict with
        ``server`` (scheme, host and port only) plus ``username`` and
        ``password`` when the proxy URL carries credentials. A value that
        does not parse into scheme and netloc is passed through unchanged
        as ``server``.
    """
    from urllib.parse import unquote

    if not self.proxies:
        return None

    proxy_url = self.proxies[0]
    parsed = urlsplit(proxy_url)
    if not parsed.scheme or not parsed.netloc:
        return {"server": proxy_url}

    host = parsed.hostname or ""
    if ":" in host:
        # ``hostname`` strips the brackets of an IPv6 literal; restore them
        # so the appended port stays unambiguous.
        host = f"[{host}]"
    if parsed.port:
        host = f"{host}:{parsed.port}"

    proxy = {"server": urlunsplit((parsed.scheme, host, "", "", ""))}
    # Credentials embedded in a URL are percent-encoded; they are handed over
    # here as separate fields, so decode them first.
    if parsed.username:
        proxy["username"] = unquote(parsed.username)
    if parsed.password:
        proxy["password"] = unquote(parsed.password)
    return proxy


def _should_download_portal_href_directly(self, href: str | None) -> bool:
    """Tell whether a portal href can be fetched with a plain HTTP request.

    True only when the URL path ends with ``.zip`` (case-insensitive) and
    contains ``/blob/`` or ``/opendata/``. Query string and fragment are
    ignored.
    """
    if not href:
        return False

    path = urlsplit(href).path or href
    if not path.lower().endswith(".zip"):
        return False
    return "/blob/" in path or "/opendata/" in path


def _download_portal_href(self, portal_url: str, href: str) -> bytes:
    """Download the file behind a link found on the portal page.

    Args:
        portal_url: Page the link was found on; used to resolve a relative
            ``href`` and sent as the ``Referer`` header.
        href: Absolute or relative link to the file.

    Returns:
        Raw file content as returned by the HTTP client.
    """
    download_url = urljoin(portal_url, href)
    headers = {
        "Accept": "application/zip, application/octet-stream, */*",
        "Referer": portal_url,
    }
    content = self.http_client.download_file(download_url, headers=headers)
    logger.info(
        "Downloaded %d bytes from portal href %s",
        len(content),
        download_url,
    )
    return content
" "AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/120.0.0.0 Safari/537.36" ), - accept_downloads=True, - ) + "accept_downloads": True, + } + proxy = self._playwright_proxy() + if proxy: + context_options["proxy"] = proxy + context = browser.new_context(**context_options) page = context.new_page() try: # Переходим на страницу датасета logger.info("Navigating to dataset page: %s", portal_url) - page.goto(portal_url, wait_until="networkidle", timeout=60000) + page.goto(portal_url, wait_until="domcontentloaded", timeout=60000) # Ждём загрузки SPA контента (Angular) - ищем признаки загруженной страницы # На proverki.gov.ru используется Angular, контент загружается динамически @@ -418,6 +470,16 @@ class ProverkiClient: href = zip_link.get_attribute("href") logger.info("Found ZIP download link: %s", href) + if self._should_download_portal_href_directly(href): + try: + return self._download_portal_href(portal_url, href) + except HTTPClientError as exc: + logger.warning( + "Direct portal href download failed, falling back to " + "browser download: %s", + exc, + ) + with page.expect_download(timeout=120000) as download_info: zip_link.click() @@ -997,20 +1059,24 @@ class ProverkiClient: logger.info("Using Playwright to fetch: %s", url) browser = self._get_browser() - context = browser.new_context( - user_agent=( + context_options = { + "user_agent": ( "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " "AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/120.0.0.0 Safari/537.36" ), - accept_downloads=True, - ) + "accept_downloads": True, + } + proxy = self._playwright_proxy() + if proxy: + context_options["proxy"] = proxy + context = browser.new_context(**context_options) page = context.new_page() try: # Сначала пробуем прямой переход на URL (с рендерингом JS) logger.info("Trying direct URL with JS rendering: %s", url) - response = page.goto(url, wait_until="networkidle", timeout=60000) + response = page.goto(url, wait_until="domcontentloaded", timeout=60000) # Проверяем, получили ли 
мы данные напрямую content_type = response.headers.get("content-type", "") if response else "" @@ -1057,7 +1123,11 @@ class ProverkiClient: # Последняя попытка - идём на портал открытых данных logger.info("Navigating to portal as last resort: %s", OPEN_DATA_PORTAL_URL) - page.goto(OPEN_DATA_PORTAL_URL, wait_until="networkidle", timeout=60000) + page.goto( + OPEN_DATA_PORTAL_URL, + wait_until="domcontentloaded", + timeout=60000, + ) page.wait_for_timeout(3000) # Ищем ссылку на нужный dataset (план проверок) diff --git a/src/apps/parsers/clients/trudvsem/client.py b/src/apps/parsers/clients/trudvsem/client.py index 3987547..ae940e4 100644 --- a/src/apps/parsers/clients/trudvsem/client.py +++ b/src/apps/parsers/clients/trudvsem/client.py @@ -10,11 +10,13 @@ from typing import Any from apps.parsers.clients.base import BaseHTTPClient, HTTPClientError from apps.parsers.clients.common.schemas import GenericParserItem from apps.parsers.models import ParserLoadLog +from requests.adapters import BaseAdapter logger = logging.getLogger(__name__) DEFAULT_BASE_URL = "http://opendata.trudvsem.ru/api/v1" VACANCIES_ENDPOINT = "/vacancies" +COMPANY_INN_VACANCIES_ENDPOINT = "/vacancies/company/inn/{inn}" class TrudvsemClientError(HTTPClientError): @@ -31,7 +33,9 @@ class TrudvsemClient: base_url: str = DEFAULT_BASE_URL timeout: int = 120 company_search_max_pages: int = 20 + http_adapter: BaseAdapter | None = None _http_client: BaseHTTPClient | None = field(default=None, repr=False) + supports_company_inn: bool = True @property def http_client(self) -> BaseHTTPClient: @@ -41,6 +45,7 @@ class TrudvsemClient: base_url=self.base_url, proxies=self.proxies, timeout=self.timeout, + adapter=self.http_adapter, ) return self._http_client @@ -99,42 +104,30 @@ class TrudvsemClient: company_inn: str, text: str | None, ) -> list[GenericParserItem]: - """Искать вакансии работодателя по страницам, чтобы не дать ложный ноль.""" - records: list[GenericParserItem] = [] - current_offset = offset - 
page_size = max(limit, 1) + """Получить вакансии работодателя через официальный endpoint по ИНН.""" + params: dict[str, Any] = {"limit": limit, "offset": offset} + if region_code: + params["region"] = region_code + if text: + params["text"] = text - for _ in range(self.company_search_max_pages): - params: dict[str, Any] = {"limit": page_size, "offset": current_offset} - if region_code: - params["region"] = region_code - if text: - params["text"] = text - - try: - response = self.http_client.get_json(VACANCIES_ENDPOINT, params=params) - except HTTPClientError: - raise - except Exception as e: - raise TrudvsemClientError(f"Failed to fetch vacancies: {e}") from e - - vacancies = self._extract_vacancies(response) - page_records = [self._map_vacancy(vacancy) for vacancy in vacancies] - records.extend( - record for record in page_records if record.inn == company_inn + try: + response = self.http_client.get_json( + COMPANY_INN_VACANCIES_ENDPOINT.format(inn=company_inn), + params=params, ) - if len(records) >= limit: - result = records[:limit] - logger.info("Fetched %d Trudvsem vacancies by INN", len(result)) - return result - if len(vacancies) < page_size: - logger.info("Fetched %d Trudvsem vacancies by INN", len(records)) - return records - current_offset += page_size + except HTTPClientError: + raise + except Exception as e: + raise TrudvsemClientError( + f"Failed to fetch vacancies by company INN: {e}" + ) from e - raise TrudvsemClientError( - "Company INN search reached page limit before exhausting vacancies" - ) + records = [ + self._map_vacancy(vacancy) for vacancy in self._extract_vacancies(response) + ] + logger.info("Fetched %d Trudvsem vacancies by INN", len(records)) + return records def _extract_vacancies(self, response: dict) -> list[dict]: """Достать список вакансий из ответа API.""" @@ -157,6 +150,8 @@ class TrudvsemClient: if not external_id: raw = json.dumps(vacancy, ensure_ascii=False, sort_keys=True, default=str) external_id = 
hashlib.sha256(raw.encode("utf-8")).hexdigest()[:24] + payload = dict(vacancy) + payload["vacancy_source"] = "trudvsem" return GenericParserItem( source=ParserLoadLog.Source.TRUDVSEM, external_id=external_id, @@ -168,7 +163,7 @@ class TrudvsemClient: amount=self._parse_salary(salary), status=str(vacancy.get("state") or ""), url=str(vacancy.get("vac_url") or vacancy.get("url") or ""), - payload=vacancy, + payload=payload, ) def _parse_salary(self, salary: Any) -> Decimal | None: diff --git a/src/apps/parsers/clients/vacancies.py b/src/apps/parsers/clients/vacancies.py new file mode 100644 index 0000000..1fde525 --- /dev/null +++ b/src/apps/parsers/clients/vacancies.py @@ -0,0 +1,369 @@ +"""Клиенты вакансий для общего источника vacancies.""" + +from __future__ import annotations + +import hashlib +import json +import logging +from collections.abc import Mapping +from dataclasses import dataclass, field +from datetime import UTC, datetime +from decimal import Decimal, InvalidOperation +from typing import Any, Protocol + +from apps.parsers.clients.base import BaseHTTPClient, HTTPClientError +from apps.parsers.clients.common.schemas import GenericParserItem +from apps.parsers.clients.trudvsem import TrudvsemClient +from apps.parsers.models import ParserLoadLog +from requests.adapters import BaseAdapter + +logger = logging.getLogger(__name__) + +HH_BASE_URL = "https://api.hh.ru" +SUPERJOB_BASE_URL = "https://api.superjob.ru" +TRUDVSEM_SOURCE = "trudvsem" +HH_SOURCE = "hh" +SUPERJOB_SOURCE = "superjob" +SUPPORTED_VACANCY_SOURCES = (TRUDVSEM_SOURCE, HH_SOURCE, SUPERJOB_SOURCE) + + +class VacanciesClientError(HTTPClientError): + """Ошибка клиента вакансий.""" + + pass + + +class VacancyProvider(Protocol): + """Минимальный контракт клиента одного источника вакансий.""" + + supports_company_inn: bool + + def fetch_vacancies( + self, + *, + limit: int = 100, + offset: int = 0, + region_code: str | None = None, + company_inn: str | None = None, + text: str | None = None, + ) -> 
list[GenericParserItem]: + """Получить вакансии.""" + + def close(self) -> None: + """Закрыть ресурсы клиента.""" + + +def _parse_decimal(value: Any) -> Decimal | None: + if value in (None, ""): + return None + try: + return Decimal(str(value).replace(" ", "").replace(",", ".")) + except InvalidOperation: + return None + + +def _normalize_limit(limit: int, *, default: int = 100, max_value: int = 100) -> int: + if limit <= 0: + return default + return min(limit, max_value) + + +def _unix_timestamp_to_date(value: Any) -> str: + if value in (None, ""): + return "" + try: + return datetime.fromtimestamp(int(value), tz=UTC).date().isoformat() + except (TypeError, ValueError, OSError): + return str(value) + + +def _stable_external_id(source: str, payload: Mapping[str, Any]) -> str: + raw = json.dumps(payload, ensure_ascii=False, sort_keys=True, default=str) + digest = hashlib.sha256(raw.encode("utf-8")).hexdigest()[:24] + return f"{source}:{digest}" + + +@dataclass +class HHVacanciesClient: + """Клиент публичного API HeadHunter.""" + + proxies: list[str] | None = None + base_url: str = HH_BASE_URL + timeout: int = 120 + http_adapter: BaseAdapter | None = None + _http_client: BaseHTTPClient | None = field(default=None, repr=False) + supports_company_inn: bool = False + + @property + def http_client(self) -> BaseHTTPClient: + if self._http_client is None: + self._http_client = BaseHTTPClient( + base_url=self.base_url, + proxies=self.proxies, + timeout=self.timeout, + adapter=self.http_adapter, + headers={"Accept": "application/json"}, + ) + return self._http_client + + def fetch_vacancies( + self, + *, + limit: int = 100, + offset: int = 0, + region_code: str | None = None, + company_inn: str | None = None, + text: str | None = None, + ) -> list[GenericParserItem]: + """Получить вакансии HH.""" + if company_inn: + return [] + + per_page = _normalize_limit(limit) + params: dict[str, Any] = { + "per_page": per_page, + "page": max(offset, 0) // per_page, + } + if region_code: 
+ params["area"] = region_code + if text: + params["text"] = text + + response = self.http_client.get_json("/vacancies", params=params) + items = response.get("items", []) + if not isinstance(items, list): + return [] + records = [ + self._map_vacancy(item) for item in items if isinstance(item, Mapping) + ] + logger.info("Fetched %d HH vacancies", len(records)) + return records + + def _map_vacancy(self, vacancy: Mapping[str, Any]) -> GenericParserItem: + employer = vacancy.get("employer") or {} + if not isinstance(employer, Mapping): + employer = {} + salary = vacancy.get("salary") or {} + if not isinstance(salary, Mapping): + salary = {} + vacancy_type = vacancy.get("type") or {} + if not isinstance(vacancy_type, Mapping): + vacancy_type = {} + vacancy_id = str(vacancy.get("id") or "") + external_id = f"{HH_SOURCE}:{vacancy_id}" if vacancy_id else "" + if not external_id: + external_id = _stable_external_id(HH_SOURCE, vacancy) + status = str(vacancy_type.get("name") or "") + if vacancy.get("archived"): + status = "archived" + + payload = dict(vacancy) + payload["vacancy_source"] = HH_SOURCE + return GenericParserItem( + source=ParserLoadLog.Source.TRUDVSEM, + external_id=external_id, + organisation_name=str(employer.get("name") or ""), + title=str(vacancy.get("name") or ""), + record_date=str(vacancy.get("published_at") or ""), + amount=_parse_decimal(salary.get("from") or salary.get("to")), + status=status, + url=str(vacancy.get("alternate_url") or vacancy.get("url") or ""), + payload=payload, + ) + + def close(self) -> None: + if self._http_client is not None: + self._http_client.close() + self._http_client = None + + +@dataclass +class SuperJobVacanciesClient: + """Клиент API SuperJob.""" + + app_id: str + proxies: list[str] | None = None + base_url: str = SUPERJOB_BASE_URL + timeout: int = 120 + http_adapter: BaseAdapter | None = None + _http_client: BaseHTTPClient | None = field(default=None, repr=False) + supports_company_inn: bool = False + + @property + 
def http_client(self) -> BaseHTTPClient: + if self._http_client is None: + self._http_client = BaseHTTPClient( + base_url=self.base_url, + proxies=self.proxies, + timeout=self.timeout, + adapter=self.http_adapter, + headers={ + "Accept": "application/json", + "X-Api-App-Id": self.app_id, + }, + ) + return self._http_client + + def fetch_vacancies( + self, + *, + limit: int = 100, + offset: int = 0, + region_code: str | None = None, + company_inn: str | None = None, + text: str | None = None, + ) -> list[GenericParserItem]: + """Получить вакансии SuperJob.""" + if company_inn: + return [] + + count = _normalize_limit(limit) + params: dict[str, Any] = { + "count": count, + "page": max(offset, 0) // count, + } + if region_code: + params["town"] = region_code + if text: + params["keyword"] = text + + response = self.http_client.get_json("/2.0/vacancies/", params=params) + items = response.get("objects", []) + if not isinstance(items, list): + return [] + records = [ + self._map_vacancy(item) for item in items if isinstance(item, Mapping) + ] + logger.info("Fetched %d SuperJob vacancies", len(records)) + return records + + def _map_vacancy(self, vacancy: Mapping[str, Any]) -> GenericParserItem: + vacancy_id = str(vacancy.get("id") or "") + external_id = f"{SUPERJOB_SOURCE}:{vacancy_id}" if vacancy_id else "" + if not external_id: + external_id = _stable_external_id(SUPERJOB_SOURCE, vacancy) + payload = dict(vacancy) + payload["vacancy_source"] = SUPERJOB_SOURCE + return GenericParserItem( + source=ParserLoadLog.Source.TRUDVSEM, + external_id=external_id, + organisation_name=str(vacancy.get("firm_name") or ""), + title=str(vacancy.get("profession") or ""), + record_date=_unix_timestamp_to_date(vacancy.get("date_published")), + amount=_parse_decimal( + vacancy.get("payment_from") or vacancy.get("payment_to") + ), + status="archived" if vacancy.get("is_archive") else "open", + url=str(vacancy.get("link") or ""), + payload=payload, + ) + + def close(self) -> None: + if 
self._http_client is not None: + self._http_client.close() + self._http_client = None + + +@dataclass +class VacanciesClient: + """Комбинированный клиент вакансий.""" + + proxies: list[str] | None = None + superjob_app_id: str = "" + sources: list[str] | None = None + source_clients: dict[str, VacancyProvider] | None = None + _source_clients_cache: dict[str, VacancyProvider] | None = field( + default=None, + init=False, + repr=False, + ) + + def _build_source_clients(self) -> dict[str, VacancyProvider]: + if self.source_clients is not None: + return self.source_clients + + if self._source_clients_cache is not None: + return self._source_clients_cache + + clients: dict[str, VacancyProvider] = { + TRUDVSEM_SOURCE: TrudvsemClient(proxies=self.proxies), + HH_SOURCE: HHVacanciesClient(proxies=self.proxies), + } + if self.superjob_app_id: + clients[SUPERJOB_SOURCE] = SuperJobVacanciesClient( + app_id=self.superjob_app_id, + proxies=self.proxies, + ) + self._source_clients_cache = clients + return clients + + def _selected_sources(self) -> list[str]: + selected = self.sources or list(SUPPORTED_VACANCY_SOURCES) + unknown = sorted(set(selected) - set(SUPPORTED_VACANCY_SOURCES)) + if unknown: + raise VacanciesClientError( + f"Unsupported vacancy sources: {', '.join(unknown)}" + ) + return list(dict.fromkeys(selected)) + + def fetch_vacancies( + self, + *, + limit: int = 100, + offset: int = 0, + region_code: str | None = None, + company_inn: str | None = None, + text: str | None = None, + ) -> list[GenericParserItem]: + """Получить вакансии из включённых источников.""" + clients = self._build_source_clients() + records: list[GenericParserItem] = [] + errors: list[str] = [] + attempts = 0 + + for source in self._selected_sources(): + client = clients.get(source) + if client is None: + if self.sources and source == SUPERJOB_SOURCE: + raise VacanciesClientError("SUPERJOB_APP_ID is required") + logger.info("Vacancy source %s is skipped: not configured", source) + continue + if 
company_inn and not getattr(client, "supports_company_inn", False): + logger.info( + "Vacancy source %s is skipped: company_inn is not supported", + source, + ) + continue + + attempts += 1 + try: + records.extend( + client.fetch_vacancies( + limit=limit, + offset=offset, + region_code=region_code, + company_inn=company_inn, + text=text, + ) + ) + except Exception as exc: + logger.warning("Vacancy source %s failed: %s", source, exc) + errors.append(f"{source}: {exc}") + + if errors and not records and attempts: + raise VacanciesClientError( + f"All vacancy sources failed; first error: {errors[0]}" + ) + return records + + def close(self) -> None: + for client in self._build_source_clients().values(): + close = getattr(client, "close", None) + if close: + close() + + def __enter__(self) -> VacanciesClient: + return self + + def __exit__(self, exc_type, exc_val, exc_tb) -> None: + self.close() diff --git a/src/apps/parsers/frontend_compat.py b/src/apps/parsers/frontend_compat.py index 1dacf54..40fad2f 100644 --- a/src/apps/parsers/frontend_compat.py +++ b/src/apps/parsers/frontend_compat.py @@ -171,36 +171,10 @@ SOURCE_CARD_DEFINITIONS = ( ), FrontendSourceCardDefinition( slug="labor-vacancies", - title="Вакансии Работа России", - description="Вакансии работодателей из ЕЦП Работа в России.", + title="Вакансии", + description="Вакансии работодателей, состоящих в активных реестрах.", order=90, source_keys=("trudvsem",), - refresh_params=( - { - "name": "company_inn", - "label": "ИНН работодателя", - "description": "Фильтр вакансий по ИНН работодателя.", - "required": False, - "type": "string", - "default": None, - }, - { - "name": "text", - "label": "Текст", - "description": "Поисковая строка вакансии.", - "required": False, - "type": "string", - "default": None, - }, - { - "name": "limit", - "label": "Лимит", - "description": "Размер страницы API Работа России.", - "required": False, - "type": "integer", - "default": 100, - }, - ), ), ) SOURCE_CARD_BY_SLUG = 
{item.slug: item for item in SOURCE_CARD_DEFINITIONS} diff --git a/src/apps/parsers/migrations/0022_seed_daily_registry_enrichment_schedule.py b/src/apps/parsers/migrations/0022_seed_daily_registry_enrichment_schedule.py new file mode 100644 index 0000000..53b567b --- /dev/null +++ b/src/apps/parsers/migrations/0022_seed_daily_registry_enrichment_schedule.py @@ -0,0 +1,61 @@ +import json + +from django.db import migrations + +DAILY_REGISTRY_ENRICHMENT_TASK_NAME = "parser:registry-enrichment:daily-msk" +DAILY_REGISTRY_ENRICHMENT_TASK_PATH = ( + "apps.parsers.tasks.parse_registry_enrichment_sources" +) +DAILY_MSK_CRON = { + "minute": "0", + "hour": "2", + "day_of_week": "*", + "day_of_month": "*", + "month_of_year": "*", + "timezone": "Europe/Moscow", +} + + +def seed_daily_registry_enrichment_schedule(apps, schema_editor): + CrontabSchedule = apps.get_model("django_celery_beat", "CrontabSchedule") + PeriodicTask = apps.get_model("django_celery_beat", "PeriodicTask") + + crontab, _ = CrontabSchedule.objects.get_or_create(**DAILY_MSK_CRON) + field_names = {field.name for field in PeriodicTask._meta.fields} + schedule_fields = {"crontab": crontab} + for field_name in ("interval", "solar", "clocked"): + if field_name in field_names: + schedule_fields[field_name] = None + + PeriodicTask.objects.update_or_create( + name=DAILY_REGISTRY_ENRICHMENT_TASK_NAME, + defaults={ + "task": DAILY_REGISTRY_ENRICHMENT_TASK_PATH, + "args": json.dumps([]), + "kwargs": json.dumps({}), + "enabled": True, + "description": ( + "Daily enrichment lookup for active registry organizations." 
+ ), + **schedule_fields, + }, + ) + + +def remove_daily_registry_enrichment_schedule(apps, schema_editor): + PeriodicTask = apps.get_model("django_celery_beat", "PeriodicTask") + PeriodicTask.objects.filter(name=DAILY_REGISTRY_ENRICHMENT_TASK_NAME).delete() + + +class Migration(migrations.Migration): + dependencies = [ + ("django_celery_beat", "0018_improve_crontab_helptext"), + ("parsers", "0021_widen_industrial_product_codes"), + ] + + operations = [ + migrations.RunPython( + seed_daily_registry_enrichment_schedule, + reverse_code=remove_daily_registry_enrichment_schedule, + ), + ] diff --git a/src/apps/parsers/organization_enrichment.py b/src/apps/parsers/organization_enrichment.py new file mode 100644 index 0000000..3550874 --- /dev/null +++ b/src/apps/parsers/organization_enrichment.py @@ -0,0 +1,236 @@ +"""Canonical organization enrichment for legacy parser API responses.""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any + +from django.db.models import Q +from organizations.models import Organization + +ORGANIZATION_ENRICHMENT_CONTEXT_KEY = "canonical_organization_enrichment" + + +@dataclass(frozen=True) +class OrganizationFieldSet: + """Response/source fields carrying one organization's identity.""" + + name: str | None = None + inn: str | None = None + kpp: str | None = None + ogrn: str | None = None + + +CanonicalOrganizationMap = dict[tuple[str, ...], Organization] + + +def _value_from(item: Any, field_name: str) -> Any: + if isinstance(item, dict): + return item.get(field_name) + return getattr(item, field_name, None) + + +def _normalized_value(value: Any) -> str: + if value is None: + return "" + return str(value).strip() + + +def _is_blank(value: Any) -> bool: + return _normalized_value(value) == "" + + +def _collect_identifiers( + items: list[Any], + field_sets: tuple[OrganizationFieldSet, ...], +) -> tuple[set[str], set[str], set[str]]: + inns: set[str] = set() + kpps: set[str] = set() + 
registration_numbers: set[str] = set() + for item in items: + for field_set in field_sets: + if field_set.inn: + inn = _normalized_value(_value_from(item, field_set.inn)) + if inn: + inns.add(inn) + if field_set.kpp: + kpp = _normalized_value(_value_from(item, field_set.kpp)) + if kpp: + kpps.add(kpp) + if field_set.ogrn: + ogrn = _normalized_value(_value_from(item, field_set.ogrn)) + if ogrn: + registration_numbers.add(ogrn) + return inns, kpps, registration_numbers + + +def _build_organization_query( + inns: set[str], + kpps: set[str], + registration_numbers: set[str], +) -> Q | None: + query = Q() + if inns: + query |= Q(inn__in=inns) + if kpps: + query |= Q(kpp__in=kpps) + if registration_numbers: + query |= Q(ogrn__in=registration_numbers) | Q(ogrip__in=registration_numbers) + return query or None + + +def _index_canonical_organization( + enrichment: CanonicalOrganizationMap, + by_inn: dict[str, list[Organization]], + by_ogrn: dict[str, list[Organization]], + organization: Organization, +) -> None: + if organization.inn and organization.kpp: + enrichment[("inn_kpp", organization.inn, organization.kpp)] = organization + if organization.ogrn and organization.kpp: + enrichment[("ogrn_kpp", organization.ogrn, organization.kpp)] = organization + if organization.inn: + by_inn.setdefault(organization.inn, []).append(organization) + if organization.ogrn: + by_ogrn.setdefault(organization.ogrn, []).append(organization) + if organization.ogrip: + enrichment[("ogrn", organization.ogrip)] = organization + + +def _index_unique_identity_matches( + enrichment: CanonicalOrganizationMap, + by_inn: dict[str, list[Organization]], + by_ogrn: dict[str, list[Organization]], +) -> None: + for inn, organizations in by_inn.items(): + if len(organizations) == 1: + enrichment[("inn", inn)] = organizations[0] + for ogrn, organizations in by_ogrn.items(): + if len(organizations) == 1: + enrichment[("ogrn", ogrn)] = organizations[0] + + +def build_canonical_organization_enrichment( + 
items: list[Any], + field_sets: tuple[OrganizationFieldSet, ...], +) -> CanonicalOrganizationMap: + """Build a page-level canonical org lookup without per-row DB queries.""" + + if not items or not field_sets: + return {} + + inns, kpps, registration_numbers = _collect_identifiers(items, field_sets) + query = _build_organization_query(inns, kpps, registration_numbers) + if query is None: + return {} + + enrichment: CanonicalOrganizationMap = {} + by_inn: dict[str, list[Organization]] = {} + by_ogrn: dict[str, list[Organization]] = {} + for organization in Organization.objects.filter(query).only( + "name", + "inn", + "kpp", + "ogrn", + "ogrip", + ): + _index_canonical_organization(enrichment, by_inn, by_ogrn, organization) + _index_unique_identity_matches(enrichment, by_inn, by_ogrn) + return enrichment + + +def find_canonical_organization( + item: Any, + field_sets: tuple[OrganizationFieldSet, ...], + enrichment: CanonicalOrganizationMap, +) -> Organization | None: + """Find canonical organization for one response item by available identity.""" + + for field_set in field_sets: + kpp = ( + _normalized_value(_value_from(item, field_set.kpp)) if field_set.kpp else "" + ) + if field_set.inn: + inn = _normalized_value(_value_from(item, field_set.inn)) + if inn and kpp and ("inn_kpp", inn, kpp) in enrichment: + return enrichment[("inn_kpp", inn, kpp)] + if inn and ("inn", inn) in enrichment: + return enrichment[("inn", inn)] + if field_set.ogrn: + ogrn = _normalized_value(_value_from(item, field_set.ogrn)) + if ogrn and kpp and ("ogrn_kpp", ogrn, kpp) in enrichment: + return enrichment[("ogrn_kpp", ogrn, kpp)] + if ogrn and ("ogrn", ogrn) in enrichment: + return enrichment[("ogrn", ogrn)] + return None + + +def enrich_organization_fields( + data: dict[str, Any], + *, + source: Any, + field_sets: tuple[OrganizationFieldSet, ...], + enrichment: CanonicalOrganizationMap, +) -> dict[str, Any]: + """Fill blank v1 organization fields from canonical Organization data.""" + + 
organization = find_canonical_organization(source, field_sets, enrichment) + if organization is None: + return data + + enriched = dict(data) + for field_set in field_sets: + if ( + field_set.name + and field_set.name in enriched + and _is_blank(enriched[field_set.name]) + ): + enriched[field_set.name] = organization.name + if ( + field_set.inn + and field_set.inn in enriched + and _is_blank(enriched[field_set.inn]) + ): + enriched[field_set.inn] = organization.inn + if ( + field_set.kpp + and field_set.kpp in enriched + and _is_blank(enriched[field_set.kpp]) + ): + enriched[field_set.kpp] = organization.kpp + if ( + field_set.ogrn + and field_set.ogrn in enriched + and _is_blank(enriched[field_set.ogrn]) + ): + enriched[field_set.ogrn] = organization.ogrn or organization.ogrip + return enriched + + +PARSER_RESULT_ORGANIZATION_FIELDS = ( + OrganizationFieldSet( + name="organisation_name", + inn="inn", + ogrn="ogrn", + ), +) + + +def enrich_parser_result_rows(rows: list[dict[str, Any]]) -> list[dict[str, Any]]: + """Enrich unified v1 parser result dictionaries without changing their shape.""" + + enrichment = build_canonical_organization_enrichment( + rows, + PARSER_RESULT_ORGANIZATION_FIELDS, + ) + if not enrichment: + return rows + return [ + enrich_organization_fields( + row, + source=row, + field_sets=PARSER_RESULT_ORGANIZATION_FIELDS, + enrichment=enrichment, + ) + for row in rows + ] diff --git a/src/apps/parsers/serializers.py b/src/apps/parsers/serializers.py index 22e3800..ea72310 100644 --- a/src/apps/parsers/serializers.py +++ b/src/apps/parsers/serializers.py @@ -22,6 +22,12 @@ from apps.parsers.models import ( ParsingSettings, ProcurementRecord, ) +from apps.parsers.organization_enrichment import ( + ORGANIZATION_ENRICHMENT_CONTEXT_KEY, + OrganizationFieldSet, + build_canonical_organization_enrichment, + enrich_organization_fields, +) from rest_framework import serializers BLOCKED_FILE_HOSTS = {"localhost", "localhost.localdomain"} @@ -76,15 +82,63 @@ 
def _validate_public_file_host(host: str) -> None: # ============================================================================= -class IndustrialCertificateSerializer(serializers.ModelSerializer): +class CanonicalOrganizationListSerializer(serializers.ListSerializer): + """Adds one canonical organization lookup for the current serialized page.""" + + def to_representation(self, data): + items = list(data.all() if hasattr(data, "all") else data) + field_sets = getattr(self.child, "canonical_organization_fields", ()) + self.context[ + ORGANIZATION_ENRICHMENT_CONTEXT_KEY + ] = build_canonical_organization_enrichment(items, field_sets) + return super().to_representation(items) + + +class CanonicalOrganizationEnrichmentMixin: + """Fill blank v1 organization fields while preserving existing contract.""" + + canonical_organization_fields: tuple[OrganizationFieldSet, ...] = () + + def to_representation(self, instance): + data = super().to_representation(instance) + if not self.canonical_organization_fields: + return data + + enrichment = self.context.get(ORGANIZATION_ENRICHMENT_CONTEXT_KEY) + if enrichment is None: + enrichment = build_canonical_organization_enrichment( + [instance], + self.canonical_organization_fields, + ) + return enrich_organization_fields( + data, + source=instance, + field_sets=self.canonical_organization_fields, + enrichment=enrichment, + ) + + +class IndustrialCertificateSerializer( + CanonicalOrganizationEnrichmentMixin, + serializers.ModelSerializer, +): """ Сертификат промышленного производства РФ. Данные загружаются из Минпромторга. 
""" + canonical_organization_fields = ( + OrganizationFieldSet( + name="organisation_name", + inn="inn", + ogrn="ogrn", + ), + ) + class Meta: model = IndustrialCertificateRecord + list_serializer_class = CanonicalOrganizationListSerializer fields = [ "id", "load_batch", @@ -109,15 +163,27 @@ class IndustrialCertificateSerializer(serializers.ModelSerializer): # ============================================================================= -class ManufacturerSerializer(serializers.ModelSerializer): +class ManufacturerSerializer( + CanonicalOrganizationEnrichmentMixin, + serializers.ModelSerializer, +): """ Производитель из реестра Минпромторга. Данные загружаются из Минпромторга. """ + canonical_organization_fields = ( + OrganizationFieldSet( + name="full_legal_name", + inn="inn", + ogrn="ogrn", + ), + ) + class Meta: model = ManufacturerRecord + list_serializer_class = CanonicalOrganizationListSerializer fields = [ "id", "load_batch", @@ -132,15 +198,27 @@ class ManufacturerSerializer(serializers.ModelSerializer): read_only_fields = fields -class IndustrialProductSerializer(serializers.ModelSerializer): +class IndustrialProductSerializer( + CanonicalOrganizationEnrichmentMixin, + serializers.ModelSerializer, +): """ Промышленная продукция из реестра Минпромторга. Данные загружаются из Минпромторга. """ + canonical_organization_fields = ( + OrganizationFieldSet( + name="full_organisation_name", + inn="inn", + ogrn="ogrn", + ), + ) + class Meta: model = IndustrialProductRecord + list_serializer_class = CanonicalOrganizationListSerializer fields = [ "id", "load_batch", @@ -165,15 +243,27 @@ class IndustrialProductSerializer(serializers.ModelSerializer): # ============================================================================= -class InspectionSerializer(serializers.ModelSerializer): +class InspectionSerializer( + CanonicalOrganizationEnrichmentMixin, + serializers.ModelSerializer, +): """ Проверка из Единого реестра проверок. Поддерживает ФЗ-294 и ФЗ-248. 
""" + canonical_organization_fields = ( + OrganizationFieldSet( + name="organisation_name", + inn="inn", + ogrn="ogrn", + ), + ) + class Meta: model = InspectionRecord + list_serializer_class = CanonicalOrganizationListSerializer fields = [ "id", "load_batch", @@ -206,15 +296,28 @@ class InspectionSerializer(serializers.ModelSerializer): # ============================================================================= -class ProcurementSerializer(serializers.ModelSerializer): +class ProcurementSerializer( + CanonicalOrganizationEnrichmentMixin, + serializers.ModelSerializer, +): """ Государственная закупка из ЕИС zakupki.gov.ru. Поддерживает 44-ФЗ и 223-ФЗ. """ + canonical_organization_fields = ( + OrganizationFieldSet( + name="customer_name", + inn="customer_inn", + kpp="customer_kpp", + ogrn="customer_ogrn", + ), + ) + class Meta: model = ProcurementRecord + list_serializer_class = CanonicalOrganizationListSerializer fields = [ "id", "load_batch", @@ -432,6 +535,13 @@ class ParserRunRequestSerializer(serializers.Serializer): required=False, allow_empty=True, ) + vacancy_sources = serializers.ListField( + child=serializers.ChoiceField(choices=["trudvsem", "hh", "superjob"]), + required=False, + allow_empty=False, + ) + registry_organizations_only = serializers.BooleanField(required=False) + registry_organization_limit = serializers.IntegerField(required=False, min_value=1) year = serializers.IntegerField(required=False, min_value=2000, max_value=2100) month = serializers.IntegerField(required=False, min_value=1, max_value=12) limit = serializers.IntegerField(required=False, min_value=1, max_value=1000) @@ -695,11 +805,23 @@ class ParserLoadLogSerializer(serializers.ModelSerializer): return 0 -class GenericParserRecordSerializer(serializers.ModelSerializer): +class GenericParserRecordSerializer( + CanonicalOrganizationEnrichmentMixin, + serializers.ModelSerializer, +): """Сериализатор универсальных записей новых источников.""" + canonical_organization_fields = ( + 
OrganizationFieldSet( + name="organisation_name", + inn="inn", + ogrn="ogrn", + ), + ) + class Meta: model = GenericParserRecord + list_serializer_class = CanonicalOrganizationListSerializer fields = [ "id", "load_batch", diff --git a/src/apps/parsers/services.py b/src/apps/parsers/services.py index e3c154e..4c69731 100644 --- a/src/apps/parsers/services.py +++ b/src/apps/parsers/services.py @@ -55,17 +55,6 @@ _DATE_FORMATS = ( ) -def _model_defaults(instance, lookup_fields: list[str]) -> dict: - """Собрать defaults для get_or_create из Django model instance.""" - lookup = set(lookup_fields) - defaults = {} - for field in instance._meta.concrete_fields: - if field.primary_key or field.name in lookup: - continue - defaults[field.name] = getattr(instance, field.name) - return defaults - - def normalize_to_date(value: str | None) -> date | None: """Нормализовать строку с датой в date.""" if value is None: @@ -665,6 +654,8 @@ class IndustrialProductService( cls, products: list[IndustrialProduct], batch_id: int, + *, + chunk_size: int = 1000, ) -> int: """Сохранить список промышленной продукции из парсера.""" if not products: @@ -679,8 +670,12 @@ class IndustrialProductService( [(product.inn, product.ogrn) for product in products] ) - items = [ - { + items_by_registry_number = {} + for product in products: + if not product.registry_number: + continue + + items_by_registry_number[product.registry_number] = { "load_batch": batch_id, "full_organisation_name": product.full_organisation_name, "ogrn": product.ogrn, @@ -697,26 +692,62 @@ class IndustrialProductService( ogrn=product.ogrn, ), } - for product in products - if product.registry_number - ] - created_count, updated_count = cls.bulk_update_or_create( - items, - unique_fields=["registry_number"], - update_fields=[ - "load_batch", - "full_organisation_name", - "ogrn", - "inn", - "product_name", - "product_model", - "okpd2_code", - "tnved_code", - "regulatory_document", - "registry_organization_id", - ], + 
existing_ids_by_registry_number = {} + registry_numbers = list(items_by_registry_number) + for i in range(0, len(registry_numbers), chunk_size): + chunk = registry_numbers[i : i + chunk_size] + existing_ids_by_registry_number.update( + cls.model.objects.filter(registry_number__in=chunk).values_list( + "registry_number", + "id", + ) + ) + + create_instances = [] + update_instances = [] + now = timezone.now() + for registry_number, item in items_by_registry_number.items(): + existing_id = existing_ids_by_registry_number.get(registry_number) + if existing_id is None: + create_instances.append(cls.model(**item)) + continue + + update_instances.append( + cls.model( + id=existing_id, + updated_at=now, + **item, + ) + ) + + created_count = cls.bulk_create_chunked( + create_instances, + chunk_size=chunk_size, ) + updated_count = 0 + update_fields = [ + "load_batch", + "full_organisation_name", + "ogrn", + "inn", + "product_name", + "product_model", + "okpd2_code", + "tnved_code", + "regulatory_document", + "registry_organization_id", + "updated_at", + ] + for i in range(0, len(update_instances), chunk_size): + chunk = update_instances[i : i + chunk_size] + cls.model.objects.bulk_update( + chunk, + fields=update_fields, + batch_size=chunk_size, + ) + updated_count += len(chunk) + saved_count = created_count + updated_count logger.info( "Saved %d industrial products (created=%d, updated=%d)", @@ -764,18 +795,32 @@ class GenericParserRecordService(BulkOperationsMixin, BaseService[GenericParserR cls.bulk_create_chunked(instances, chunk_size=chunk_size) return len(instances) except IntegrityError: - logger.info("Falling back to get_or_create after generic record conflict") + logger.info("Retrying generic record insert with ignored conflicts") - created_count = 0 - for instance in instances: - lookup = {field: getattr(instance, field) for field in unique_fields} - _, created = cls.model.objects.get_or_create( - defaults=_model_defaults(instance, unique_fields), - **lookup, + 
lookup_keys = [ + tuple(getattr(instance, field) for field in unique_fields) + for instance in instances + ] + existing_before = set( + cls._bulk_existing_by_lookup( + lookup_keys=lookup_keys, + unique_fields=unique_fields, + chunk_size=chunk_size, ) - if created: - created_count += 1 - return created_count + ) + cls.bulk_create_chunked( + instances, + chunk_size=chunk_size, + ignore_conflicts=True, + ) + existing_after = set( + cls._bulk_existing_by_lookup( + lookup_keys=lookup_keys, + unique_fields=unique_fields, + chunk_size=chunk_size, + ) + ) + return len(existing_after - existing_before) @classmethod @transaction.atomic @@ -1015,19 +1060,28 @@ class ProxyService(BaseService[Proxy]): Returns: Количество добавленных прокси """ - created_count = 0 - for address in addresses: - _, created = cls.model.objects.get_or_create( - address=address, - defaults={ - "is_active": True, - "source": source, - "country_code": country_code.upper(), - }, + unique_addresses = list(dict.fromkeys(addresses)) + if not unique_addresses: + return 0 + + existing_addresses = set( + cls.model.objects.filter(address__in=unique_addresses).values_list( + "address", + flat=True, ) - if created: - created_count += 1 - return created_count + ) + create_instances = [ + cls.model( + address=address, + is_active=True, + source=source, + country_code=country_code.upper(), + ) + for address in unique_addresses + if address not in existing_addresses + ] + cls.model.objects.bulk_create(create_instances, batch_size=1000) + return len(create_instances) class ProxyToolsSyncError(Exception): @@ -1272,16 +1326,20 @@ class ProxyToolsSyncService: } created = 0 - updated = 0 + create_instances: list[Proxy] = [] + update_instances: list[Proxy] = [] + now = timezone.now() for address in addresses: proxy = existing_by_address.get(address) if proxy is None: - Proxy.objects.create( - address=address, - is_active=True, - description="Imported from Proxy-Tools", - source=cls.SOURCE, - country_code=cls.COUNTRY_CODE, 
+ create_instances.append( + Proxy( + address=address, + is_active=True, + description="Imported from Proxy-Tools", + source=cls.SOURCE, + country_code=cls.COUNTRY_CODE, + ) ) created += 1 continue @@ -1294,8 +1352,17 @@ class ProxyToolsSyncService: proxy.description = "Imported from Proxy-Tools" changed_fields.append("description") if changed_fields: - proxy.save(update_fields=[*changed_fields, "updated_at"]) - updated += 1 + proxy.updated_at = now + update_instances.append(proxy) + + if create_instances: + Proxy.objects.bulk_create(create_instances, batch_size=1000) + if update_instances: + Proxy.objects.bulk_update( + update_instances, + fields=["is_active", "description", "updated_at"], + batch_size=1000, + ) deactivated = 0 active_imported = existing_qs.filter(is_active=True) @@ -1308,7 +1375,7 @@ class ProxyToolsSyncService: return { "created": created, - "updated": updated, + "updated": len(update_instances), "deactivated": deactivated, } diff --git a/src/apps/parsers/source_cards.py b/src/apps/parsers/source_cards.py index 06ed6dd..ed14d33 100644 --- a/src/apps/parsers/source_cards.py +++ b/src/apps/parsers/source_cards.py @@ -104,6 +104,7 @@ SOURCE_CARD_DEFINITIONS: tuple[SourceCardDefinition, ...] = ( "apps.parsers.tasks.parse_procurements_44fz", "apps.parsers.tasks.parse_procurements_223fz", "apps.parsers.tasks.parse_contracts", + "apps.parsers.tasks.parse_registry_contracts", ), source_items=( SourceItemDefinition( @@ -206,6 +207,7 @@ SOURCE_CARD_DEFINITIONS: tuple[SourceCardDefinition, ...] 
= ( task_names=( "apps.parsers.tasks.parse_inspections", "apps.parsers.tasks.sync_inspections", + "apps.parsers.tasks.parse_registry_inspections", ), source_items=( SourceItemDefinition( diff --git a/src/apps/parsers/source_registry.py b/src/apps/parsers/source_registry.py index 744a2cc..d7a665c 100644 --- a/src/apps/parsers/source_registry.py +++ b/src/apps/parsers/source_registry.py @@ -236,8 +236,7 @@ PARSER_SOURCES: dict[str, ParserSourceDescriptor] = { access_method="official_search_api", parser_strategy="checko_legal_cases_by_inn_ogrn", source_notes=( - "Поиск дел выполняется по ИНН/ОГРН организаций из реестров; " - "если реестр пустой, используются ИНН из уже загруженных источников. " + "Поиск дел выполняется по ИНН/ОГРН активных организаций из реестров. " "Checko отдаёт карточки со ссылками на КАД Арбитр." ), api_route="arbitration/cases", @@ -281,13 +280,17 @@ PARSER_SOURCES: dict[str, ParserSourceDescriptor] = { "trudvsem": ParserSourceDescriptor( key="trudvsem", source=ParserLoadLog.Source.TRUDVSEM, - title="Вакансии Работа России", - agency="ЕЦП Работа в России", - data_scope="Вакансии работодателей", + title="Вакансии", + agency="Работа России / HH / SuperJob", + data_scope="Вакансии работодателей из нескольких job-board источников", task_name="apps.parsers.tasks.parse_trudvsem_vacancies", upstream_url="https://opendata.trudvsem.ru/api/v1/vacancies", access_method="public_api", - parser_strategy="trudvsem_vacancies_api", + parser_strategy="multi_source_vacancies_api", + source_notes=( + "Internal source key remains trudvsem for backward compatibility; " + "payload.vacancy_source distinguishes trudvsem, hh and superjob." 
+ ), api_route="trudvsem/vacancies", ), } diff --git a/src/apps/parsers/tasks.py b/src/apps/parsers/tasks.py index 2cec33b..b106467 100644 --- a/src/apps/parsers/tasks.py +++ b/src/apps/parsers/tasks.py @@ -10,7 +10,7 @@ import logging import shutil import time import uuid -from dataclasses import dataclass +from dataclasses import dataclass, replace from datetime import datetime from decimal import Decimal from pathlib import Path @@ -18,7 +18,14 @@ from pathlib import Path from apps.core.services import BackgroundJobService from apps.core.tasks import PeriodicTask as CorePeriodicTask from apps.parsers.clients.base import HTTPClientError -from apps.parsers.clients.checko import CheckoClient, CompanyRequest, LegalCasesRequest +from apps.parsers.clients.checko import ( + CheckoClient, + CompanyRequest, + ContractLaw, + ContractsRequest, + InspectionsRequest, + LegalCasesRequest, +) from apps.parsers.clients.checko.exceptions import CheckoError from apps.parsers.clients.common import GenericParserItem, StructuredDataClient from apps.parsers.clients.minpromtorg import ( @@ -27,18 +34,10 @@ from apps.parsers.clients.minpromtorg import ( ManufacturesClient, ) from apps.parsers.clients.proverki import ProverkiClient -from apps.parsers.clients.trudvsem import TrudvsemClient +from apps.parsers.clients.proverki.schemas import Inspection as ProverkiInspection +from apps.parsers.clients.vacancies import VacanciesClient from apps.parsers.clients.zakupki import ZakupkiClient -from apps.parsers.models import ( - FinancialReport, - GenericParserRecord, - IndustrialCertificateRecord, - IndustrialProductRecord, - InspectionRecord, - ManufacturerRecord, - ParserLoadLog, - ProcurementRecord, -) +from apps.parsers.models import ParserLoadLog from apps.parsers.services import ( FNSReportService, GenericParserRecordService, @@ -54,20 +53,20 @@ from apps.parsers.services import ( from apps.parsers.source_registry import PARSER_SOURCES from celery import shared_task from django.conf 
import settings -from registers.models import Organization +from django.db import transaction +from registers.models import RegistryMembershipPeriod from requests.adapters import BaseAdapter logger = logging.getLogger(__name__) -# Константы для синхронизации проверок -DEFAULT_START_YEAR = 2025 -DEFAULT_START_MONTH = 1 STRUCTURED_SOURCE_OPTIONS = { "fstec": {"verify_ssl": False}, "fedresurs_bankruptcy": {"timeout": 30}, } FEDRESURS_CHECKO_FALLBACK_LIMIT = 100 ARBITRATION_CHECKO_LIMIT = 100 +REGISTRY_INSPECTIONS_CHECKO_LIMIT = 1000 +REGISTRY_CONTRACTS_CHECKO_LIMIT = 1000 PARSER_STALE_LOAD_MAX_AGE_MINUTES = 90 PARSER_SOFT_TIME_LIMIT_SECONDS = 15 * 60 PARSER_TIME_LIMIT_SECONDS = 20 * 60 @@ -81,17 +80,85 @@ class ParserSourceSkipped(Exception): pass +@dataclass(frozen=True) +class RegistryLookupTarget: + """Организация из активных реестров для lookup-загрузок.""" + + organization_id: int + inn: str + ogrn: str + name: str + + +VACANCY_REGISTRY_MAX_PAGES_PER_ORGANIZATION = 100 + + +def _resolve_lookup_limit( + limit: int | None, + *, + default: int, +) -> int: + """Нормализовать лимит lookup-загрузки.""" + if limit is None: + limit = default + try: + resolved = int(limit) + except (TypeError, ValueError): + resolved = default + return max(resolved, 0) + + +def _active_registry_lookup_targets( + *, + limit: int | None = None, +) -> list[RegistryLookupTarget]: + """Вернуть организации, которые сейчас состоят хотя бы в одном реестре.""" + queryset = ( + RegistryMembershipPeriod.objects.filter(ended_at__isnull=True) + .order_by("organization_id") + .values( + "organization_id", + "organization__mn_inn", + "organization__mn_ogrn", + "organization__pn_name", + ) + .distinct() + ) + if limit is not None: + queryset = queryset[: max(0, limit)] + + targets: list[RegistryLookupTarget] = [] + for row in queryset: + inn = str(row["organization__mn_inn"] or "").strip() + ogrn = str(row["organization__mn_ogrn"] or "").strip() + if not inn and not ogrn: + continue + targets.append( + 
RegistryLookupTarget( + organization_id=int(row["organization_id"]), + inn=inn, + ogrn=ogrn, + name=str(row["organization__pn_name"] or "").strip(), + ) + ) + return targets + + def _resolve_proxies(proxies: list[str] | None) -> list[str] | None: """ Разрешить итоговый список прокси. Приоритет: 1. Явно переданные в задачу `proxies` - 2. Runtime-прокси из БД (с приоритетом Proxy-Tools RU) + 2. Runtime-прокси из БД, только если включён явный opt-in + `PARSER_USE_RUNTIME_PROXIES` """ if proxies is not None: return proxies + if not getattr(settings, "PARSER_USE_RUNTIME_PROXIES", False): + return None + db_proxies = ProxyService.get_runtime_proxies_or_none() if db_proxies: return db_proxies @@ -134,6 +201,22 @@ def _get_or_create_background_job( return job +def _queue_organization_snapshot_refresh(source: str, batch_id: int) -> None: + """Queue snapshot refresh only after parser writes are committed.""" + + def enqueue() -> None: + from organizations.tasks import ( + refresh_organization_data_snapshots_for_parser_batch, + ) + + refresh_organization_data_snapshots_for_parser_batch.delay( + source=str(source), + batch_id=batch_id, + ) + + transaction.on_commit(enqueue) + + def _run_generic_parser( self, *, @@ -176,6 +259,73 @@ def _run_generic_parser( ) result = {"batch_id": batch_id, "saved": saved_count, "status": "success"} job.complete(result=result) + _queue_organization_snapshot_refresh(source, batch_id) + return result + except ParserSourceSkipped as e: + message = str(e) + logger.warning("%s skipped: %s", task_name, message) + ParserLoadLogService.update( + load_log, + status=ParserLoadLog.Status.SKIPPED, + error_message=message, + ) + result = { + "batch_id": batch_id, + "saved": 0, + "status": "skipped", + "reason": message, + } + job.complete(result=result) + return result + except Exception as e: + logger.error("%s failed: %s", task_name, e, exc_info=True) + ParserLoadLogService.mark_failed(load_log, str(e)) + job.fail(error=str(e)) + raise + + +def 
_run_inspection_parser( + self, + *, + source_key: str, + task_name: str, + fetch_records, + requested_by_id: int | None = None, +) -> dict: + """Единый runner для lookup-загрузок проверок в native storage.""" + source = ParserLoadLog.Source.INSPECTIONS + load_log, batch_id = ParserLoadLogService.create_load_log_with_next_batch_id( + source=source, + status="in_progress", + ) + task_id = self.request.id or str(uuid.uuid4()) + job = _get_or_create_background_job( + task_id=task_id, + task_name=task_name, + source=source, + batch_id=batch_id, + requested_by_id=requested_by_id, + meta={"source_key": source_key}, + ) + job.mark_started() + job.update_progress(0, "Инициализация парсера...") + + try: + job.update_progress(20, "Lookup проверок по организациям из реестров...") + records = fetch_records() + job.update_progress(70, f"Сохранение {len(records)} проверок...") + saved_count = InspectionService.save_inspections( + records, + batch_id=batch_id, + ) + ParserLoadLogService.update( + load_log, + status="success", + records_count=saved_count, + ) + result = {"batch_id": batch_id, "saved": saved_count, "status": "success"} + job.complete(result=result) + _queue_organization_snapshot_refresh(source, batch_id) return result except ParserSourceSkipped as e: message = str(e) @@ -292,27 +442,20 @@ def _fetch_checko_bankruptcy_records( logger.warning("CHECKO_API_KEY is empty; Fedresurs fallback skipped") return [] - limit = getattr( - settings, - "FEDRESURS_CHECKO_FALLBACK_LIMIT", - FEDRESURS_CHECKO_FALLBACK_LIMIT, + limit = _resolve_lookup_limit( + getattr( + settings, + "FEDRESURS_CHECKO_FALLBACK_LIMIT", + FEDRESURS_CHECKO_FALLBACK_LIMIT, + ), + default=FEDRESURS_CHECKO_FALLBACK_LIMIT, ) - try: - limit = int(limit) - except (TypeError, ValueError): - limit = FEDRESURS_CHECKO_FALLBACK_LIMIT if limit <= 0: logger.info("Fedresurs Checko fallback is disabled by limit=%s", limit) return [] - organizations = list( - Organization.objects.order_by("mn_inn").values( - "mn_inn", - 
"mn_ogrn", - "pn_name", - )[:limit] - ) - if not organizations: - logger.info("No registry organizations found for Fedresurs fallback") + targets = _active_registry_lookup_targets(limit=limit) + if not targets: + logger.info("No active registry organizations found for Fedresurs fallback") return [] checko_proxies = ( @@ -320,14 +463,17 @@ def _fetch_checko_bankruptcy_records( ) client = CheckoClient(api_key=api_key, proxies=checko_proxies, timeout=30) records: list[GenericParserItem] = [] - for organization in organizations: - inn = str(organization["mn_inn"]) - ogrn = str(organization["mn_ogrn"]) - name = organization["pn_name"] + for target in targets: try: - response = client.get_company(CompanyRequest(inn=inn)) + response = client.get_company( + CompanyRequest(inn=target.inn or None, ogrn=target.ogrn or None) + ) except CheckoError as exc: - logger.info("Checko bankruptcy lookup skipped for inn=%s: %s", inn, exc) + logger.info( + "Checko bankruptcy lookup skipped for target=%s: %s", + target.inn or target.ogrn, + exc, + ) continue company = response.data if company is None: @@ -335,9 +481,9 @@ def _fetch_checko_bankruptcy_records( records.extend( _checko_bankruptcy_items( company=company, - fallback_inn=inn, - fallback_ogrn=ogrn, - fallback_name=name, + fallback_inn=target.inn, + fallback_ogrn=target.ogrn, + fallback_name=target.name, ) ) logger.info("Fetched %d bankruptcy records through Checko fallback", len(records)) @@ -451,85 +597,19 @@ def _add_arbitration_subject( subjects.setdefault(key, subject) -def _extend_arbitration_subjects_from_model( - subjects: dict[str, ArbitrationSubject], - *, - model, - limit: int, - inn_field: str | None, - ogrn_field: str | None, - name_field: str | None, -) -> None: - """Добрать организации из уже загруженных parser-таблиц.""" - if len(subjects) >= limit: - return - - fields = [ - field - for field in (inn_field, ogrn_field, name_field, "registry_organization_id") - if field - ] - queryset = 
model.objects.all().order_by("-updated_at") - if inn_field: - queryset = queryset.exclude(**{inn_field: ""}) - elif ogrn_field: - queryset = queryset.exclude(**{ogrn_field: ""}) - - # Берем небольшой запас: в источниках часто несколько строк на один ИНН. - row_limit = max((limit - len(subjects)) * 5, 10) - for row in queryset.values(*fields)[:row_limit]: - _add_arbitration_subject( - subjects, - inn=row.get(inn_field) if inn_field else "", - ogrn=row.get(ogrn_field) if ogrn_field else "", - name=row.get(name_field) if name_field else "", - registry_organization_id=row.get("registry_organization_id"), - limit=limit, - ) - if len(subjects) >= limit: - return - - def _arbitration_subjects(limit: int) -> list[ArbitrationSubject]: - """Собрать организации для арбитражного lookup.""" + """Собрать активные организации из реестров для арбитражного lookup.""" subjects: dict[str, ArbitrationSubject] = {} - - for organization in Organization.objects.order_by("pn_name").values( - "id", - "mn_inn", - "mn_ogrn", - "pn_name", - )[:limit]: + for target in _active_registry_lookup_targets(limit=limit): _add_arbitration_subject( subjects, - inn=organization["mn_inn"], - ogrn=organization["mn_ogrn"], - name=organization["pn_name"], - registry_organization_id=organization["id"], + inn=target.inn, + ogrn=target.ogrn, + name=target.name, + registry_organization_id=target.organization_id, limit=limit, ) - fallbacks = ( - (ManufacturerRecord, "inn", "ogrn", "full_legal_name"), - (IndustrialCertificateRecord, "inn", "ogrn", "organisation_name"), - (IndustrialProductRecord, "inn", "ogrn", "full_organisation_name"), - (InspectionRecord, "inn", "ogrn", "organisation_name"), - (ProcurementRecord, "customer_inn", "customer_ogrn", "customer_name"), - (GenericParserRecord, "inn", "ogrn", "organisation_name"), - (FinancialReport, None, "ogrn", None), - ) - for model, inn_field, ogrn_field, name_field in fallbacks: - _extend_arbitration_subjects_from_model( - subjects, - model=model, - 
inn_field=inn_field, - ogrn_field=ogrn_field, - name_field=name_field, - limit=limit, - ) - if len(subjects) >= limit: - break - return list(subjects.values()) @@ -561,7 +641,7 @@ def _fetch_checko_arbitration_records( subjects = _arbitration_subjects(resolved_limit) if not subjects: raise ParserSourceSkipped( - "no registry organizations or parser identifiers found for arbitration" + "no active registry organizations found for arbitration" ) checko_proxies = ( @@ -718,6 +798,293 @@ def _arbitration_external_id( return f"checko-arbitration:{digest}" +def _checko_inspection_registration_number( + inspection, + *, + target: RegistryLookupTarget, +) -> str: + registration_number = ( + getattr(inspection, "erp_id", None) or getattr(inspection, "id", None) or "" + ) + if registration_number: + return str(registration_number) + + raw = ":".join( + [ + target.inn, + target.ogrn, + str(getattr(inspection, "plan_date_from", None) or ""), + str(getattr(inspection, "plan_date_to", None) or ""), + str(getattr(inspection, "authority_ogrn", None) or ""), + str(getattr(inspection, "subject", None) or ""), + ] + ) + digest = hashlib.sha256(raw.encode("utf-8")).hexdigest()[:24] + return f"checko-inspection:{digest}" + + +def _checko_inspection_item( + inspection, + *, + target: RegistryLookupTarget, +) -> ProverkiInspection: + result = str(getattr(inspection, "result", None) or "") + if not result and getattr(inspection, "violations_found", False): + result = "violations_found=true" + return ProverkiInspection( + registration_number=_checko_inspection_registration_number( + inspection, + target=target, + ), + inn=target.inn, + ogrn=target.ogrn, + organisation_name=target.name, + control_authority=str(getattr(inspection, "authority_name", None) or ""), + inspection_type=str(getattr(inspection, "type", None) or ""), + inspection_form=str(getattr(inspection, "form", None) or ""), + start_date=str( + getattr(inspection, "actual_date_from", None) + or getattr(inspection, 
"plan_date_from", None) + or "" + ), + end_date=str( + getattr(inspection, "actual_date_to", None) + or getattr(inspection, "plan_date_to", None) + or "" + ), + status=str(getattr(inspection, "status", None) or ""), + legal_basis="checko", + result=result, + is_federal_law_248=False, + ) + + +def _fetch_checko_registry_inspections( + *, + limit: int | None, + proxies: list[str] | None, +) -> list[ProverkiInspection]: + """Получить проверки по активным организациям из реестров через Checko.""" + api_key = getattr(settings, "CHECKO_API_KEY", "") + if not api_key: + raise ParserSourceSkipped( + "CHECKO_API_KEY is empty; registry inspections parser skipped" + ) + + resolved_limit = _resolve_lookup_limit( + limit, + default=getattr( + settings, + "REGISTRY_INSPECTIONS_CHECKO_LIMIT", + REGISTRY_INSPECTIONS_CHECKO_LIMIT, + ), + ) + if resolved_limit <= 0: + logger.info("Registry inspections Checko parser is disabled by limit=%s", limit) + return [] + + targets = _active_registry_lookup_targets(limit=resolved_limit) + if not targets: + raise ParserSourceSkipped( + "no active registry organizations found for registry inspections" + ) + + checko_proxies = ( + proxies if getattr(settings, "CHECKO_USE_RUNTIME_PROXIES", False) else None + ) + client = CheckoClient(api_key=api_key, proxies=checko_proxies, timeout=30) + records: list[ProverkiInspection] = [] + failed_lookups = 0 + for target in targets: + try: + request = InspectionsRequest( + inn=target.inn or None, + ogrn=target.ogrn or None, + limit=100, + ) + for inspection in client.iter_inspections(request): + records.append(_checko_inspection_item(inspection, target=target)) + except CheckoError as exc: + failed_lookups += 1 + logger.info( + "Checko inspections lookup skipped for target=%s: %s", + target.inn or target.ogrn, + exc, + ) + + if failed_lookups == len(targets) and not records: + raise ParserSourceSkipped("Checko inspections lookups failed for all targets") + + logger.info( + "Fetched %d inspections through 
Checko for %d registry organizations", + len(records), + len(targets), + ) + return records + + +def _contract_party_payload(party) -> dict: + if party is None: + return {} + return { + "ogrn": getattr(party, "ogrn", None), + "inn": getattr(party, "inn", None), + "kpp": getattr(party, "kpp", None), + "name": getattr(party, "name", None), + "region_code": getattr(party, "region_code", None), + } + + +def _checko_contract_external_id( + contract, + *, + target: RegistryLookupTarget, + law: ContractLaw, +) -> str: + registry_number = str(getattr(contract, "registry_number", None) or "") + if registry_number: + return f"checko-contract:{law.value}:{registry_number}" + + raw = ":".join( + [ + law.value, + target.inn, + target.ogrn, + str(getattr(contract, "purchase_number", None) or ""), + str(getattr(contract, "publish_date", None) or ""), + str(getattr(contract, "subject", None) or ""), + ] + ) + digest = hashlib.sha256(raw.encode("utf-8")).hexdigest()[:24] + return f"checko-contract:{law.value}:{digest}" + + +def _checko_contract_item( + contract, + *, + target: RegistryLookupTarget, + law: ContractLaw, +) -> GenericParserItem: + price = getattr(contract, "price", None) + amount = Decimal(str(price)) if price is not None else None + customer = _contract_party_payload(getattr(contract, "customer", None)) + suppliers = [ + _contract_party_payload(supplier) + for supplier in getattr(contract, "suppliers", None) or () + ] + return GenericParserItem( + source=ParserLoadLog.Source.CONTRACTS, + external_id=_checko_contract_external_id( + contract, + target=target, + law=law, + ), + inn=target.inn, + ogrn=target.ogrn, + organisation_name=target.name, + title=str( + getattr(contract, "subject", None) + or getattr(contract, "registry_number", None) + or "Контракт ЕИС" + ), + record_date=str( + getattr(contract, "publish_date", None) + or getattr(contract, "sign_date", None) + or "" + ), + amount=amount, + status=str(getattr(contract, "status", None) or ""), + 
url=str(getattr(contract, "url", None) or ""), + payload={ + "provider": "checko", + "law": law.value, + "registry_number": getattr(contract, "registry_number", None), + "purchase_number": getattr(contract, "purchase_number", None), + "sign_date": getattr(contract, "sign_date", None), + "execution_date": getattr(contract, "execution_date", None), + "currency_code": getattr(contract, "currency_code", None), + "customer": customer, + "suppliers": suppliers, + "target": { + "registry_organization_id": target.organization_id, + "inn": target.inn, + "ogrn": target.ogrn, + "name": target.name, + }, + }, + ) + + +def _fetch_checko_registry_contract_records( + *, + limit: int | None, + proxies: list[str] | None, +) -> list[GenericParserItem]: + """Получить контракты по активным организациям из реестров через Checko.""" + api_key = getattr(settings, "CHECKO_API_KEY", "") + if not api_key: + raise ParserSourceSkipped( + "CHECKO_API_KEY is empty; registry contracts parser skipped" + ) + + resolved_limit = _resolve_lookup_limit( + limit, + default=getattr( + settings, + "REGISTRY_CONTRACTS_CHECKO_LIMIT", + REGISTRY_CONTRACTS_CHECKO_LIMIT, + ), + ) + if resolved_limit <= 0: + logger.info("Registry contracts Checko parser is disabled by limit=%s", limit) + return [] + + targets = _active_registry_lookup_targets(limit=resolved_limit) + if not targets: + raise ParserSourceSkipped( + "no active registry organizations found for registry contracts" + ) + + checko_proxies = ( + proxies if getattr(settings, "CHECKO_USE_RUNTIME_PROXIES", False) else None + ) + client = CheckoClient(api_key=api_key, proxies=checko_proxies, timeout=30) + records: list[GenericParserItem] = [] + failed_lookups = 0 + for target in targets: + for law in (ContractLaw.FZ44, ContractLaw.FZ223): + try: + request = ContractsRequest( + law=law, + inn=target.inn or None, + ogrn=target.ogrn or None, + limit=100, + ) + for contract in client.iter_contracts(request): + records.append( + _checko_contract_item(contract, 
target=target, law=law) + ) + except CheckoError as exc: + failed_lookups += 1 + logger.info( + "Checko contracts lookup skipped for target=%s law=%s: %s", + target.inn or target.ogrn, + law.value, + exc, + ) + + expected_lookups = len(targets) * 2 + if failed_lookups == expected_lookups and not records: + raise ParserSourceSkipped("Checko contracts lookups failed for all targets") + + logger.info( + "Fetched %d contracts through Checko for %d registry organizations", + len(records), + len(targets), + ) + return records + + @shared_task(bind=True, base=CorePeriodicTask) def sync_ru_proxies(self) -> dict[str, int | str]: # noqa: ARG001 """Периодически загружать RU-прокси из Proxy-Tools.""" @@ -886,6 +1253,7 @@ def _process_fns_file_sync( "lines_count": len(parsed.lines), } ) + _queue_organization_snapshot_refresh(source, batch_id) logger.info( "FNS file processed: %s (report_id=%d, lines=%d)", @@ -1006,6 +1374,7 @@ def parse_industrial_production( # Завершаем BackgroundJob job.complete(result={"batch_id": batch_id, "saved": saved_count}) + _queue_organization_snapshot_refresh(source, batch_id) logger.info( "Industrial production parsing completed (batch_id=%d, saved=%d)", @@ -1098,6 +1467,7 @@ def parse_manufactures( # Завершаем BackgroundJob job.complete(result={"batch_id": batch_id, "saved": saved_count}) + _queue_organization_snapshot_refresh(source, batch_id) logger.info( "Manufactures parsing completed (batch_id=%d, saved=%d)", @@ -1186,6 +1556,7 @@ def parse_industrial_products( ) job.complete(result={"batch_id": batch_id, "saved": saved_count}) + _queue_organization_snapshot_refresh(source, batch_id) logger.info( "Industrial products parsing completed (batch_id=%d, saved=%d)", @@ -1216,7 +1587,8 @@ def parse_all_minpromtorg( Args: proxies: Список прокси-серверов (опционально). - Если не передан, каждая задача возьмёт прокси из БД. + Если не передан, задачи идут напрямую. Runtime-прокси из БД + используются только при PARSER_USE_RUNTIME_PROXIES=true. 
client_adapter: HTTP-адаптер (опционально). Returns: @@ -1352,6 +1724,7 @@ def parse_inspections( # Завершаем BackgroundJob job.complete(result={"batch_id": batch_id, "saved": saved_count}) + _queue_organization_snapshot_refresh(source, batch_id) logger.info( "Inspections parsing completed (batch_id=%d, saved=%d)", @@ -1465,6 +1838,31 @@ def _get_next_month(year: int, month: int) -> tuple[int, int]: return year, month + 1 +def _subtract_months(year: int, month: int, months: int) -> tuple[int, int]: + """Вернуть период на `months` месяцев раньше.""" + total_months = year * 12 + (month - 1) - months + return total_months // 12, total_months % 12 + 1 + + +def _resolve_minimum_coverage_start_period( + *, + last_year: int | None, + last_month: int | None, + current_year: int, + current_month: int, +) -> tuple[int, int]: + """ + Рассчитать старт инкрементальной загрузки. + + Если данных по источнику ещё нет, загружаем rolling-окно в 12 месяцев. + Если данные уже есть, продолжаем с месяца после последнего периода, чтобы + не создавать дыр в историческом хвосте. + """ + if last_year and last_month: + return _get_next_month(last_year, last_month) + return _subtract_months(current_year, current_month, 11) + + @shared_task(bind=True) def sync_inspections( # noqa: C901 self, @@ -1485,11 +1883,10 @@ def sync_inspections( # noqa: C901 Синхронизация данных о проверках с proverki.gov.ru. Логика работы: - 1. Проверяет последнюю загруженную дату в БД - 2. Если данных нет - начинает с 01.01.2025 - 3. Загружает месяц за месяцем до конца текущего года + 1. Проверяет последний загруженный месяц в БД + 2. Если данных нет - загружает rolling-окно за последние 12 месяцев + 3. Если данные есть - дозагружает хвост с месяца после последнего периода 4. Загружает оба типа проверок (ФЗ-294 и ФЗ-248) - 5. 
Если данных нет за период - прекращает загрузку для этого типа Args: proxies: Список прокси-серверов (опционально) @@ -1541,6 +1938,9 @@ def sync_inspections( # noqa: C901 max_months_per_law = max(1, int(max_months_per_law)) total_saved = 0 results = {"fz294": [], "fz248": []} + attempted_fetches = 0 + failed_fetches = 0 + fetch_errors: list[str] = [] try: client_kwargs = {"proxies": proxies} @@ -1573,10 +1973,13 @@ def sync_inspections( # noqa: C901 last_year, last_month = InspectionService.get_last_loaded_period( is_federal_law_248=is_fz248 ) - + year, month = _resolve_minimum_coverage_start_period( + last_year=last_year, + last_month=last_month, + current_year=current_year, + current_month=current_month, + ) if last_year and last_month: - # Начинаем со следующего месяца после последнего загруженного - year, month = _get_next_month(last_year, last_month) logger.info( "%s: continuing from %d/%d (last loaded: %d/%d)", fz_name, @@ -1586,10 +1989,8 @@ def sync_inspections( # noqa: C901 last_month, ) else: - # Начинаем с дефолтной даты - year, month = DEFAULT_START_YEAR, DEFAULT_START_MONTH logger.info( - "%s: no data in DB, starting from %d/%d", + "%s: no data in DB, loading minimum coverage from %d/%d", fz_name, year, month, @@ -1612,21 +2013,13 @@ def sync_inspections( # noqa: C901 ) break - # Прекращаем если 2 месяца подряд нет данных - if empty_months_count >= 2: - logger.info( - "%s: stopping after %d empty months", - fz_name, - empty_months_count, - ) - break - job.update_progress( 20 + (50 if is_fz248 else 0), f"Загрузка {fz_name} за {month:02d}/{year}...", ) try: + attempted_fetches += 1 inspections = client.fetch_inspections( year=year, month=month, @@ -1670,6 +2063,8 @@ def sync_inspections( # noqa: C901 ) except Exception as e: + failed_fetches += 1 + fetch_errors.append(f"{fz_name} {year}/{month}: {e}") logger.warning( "%s %d/%d: error - %s", fz_name, @@ -1683,6 +2078,12 @@ def sync_inspections( # noqa: C901 # Переходим к следующему месяцу. 
year, month = _get_next_month(year, month) + if attempted_fetches and failed_fetches == attempted_fetches: + raise RuntimeError( + "All inspections fetch attempts failed; first error: " + f"{fetch_errors[0]}" + ) + # Обновляем лог ParserLoadLogService.update( load_log, @@ -1698,6 +2099,7 @@ def sync_inspections( # noqa: C901 "results": results, } ) + _queue_organization_snapshot_refresh(source, batch_id) logger.info("Inspections sync completed (total_saved=%d)", total_saved) @@ -1799,6 +2201,8 @@ def parse_procurements( client_kwargs["scheme"] = client_scheme if client_adapter: client_kwargs["http_adapter"] = client_adapter + if settings.ZAKUPKI_TOKEN: + client_kwargs["token"] = settings.ZAKUPKI_TOKEN with ZakupkiClient(**client_kwargs) as client: procurements = client.fetch_procurements( region_code=region_code, @@ -1828,6 +2232,7 @@ def parse_procurements( # Завершаем BackgroundJob job.complete(result={"batch_id": batch_id, "saved": saved_count}) + _queue_organization_snapshot_refresh(source, batch_id) logger.info( "Procurements parsing completed (batch_id=%d, saved=%d)", @@ -1866,9 +2271,9 @@ def sync_procurements( # noqa: C901 Синхронизация данных о закупках с zakupki.gov.ru. Логика работы: - 1. Проверяет последнюю загруженную дату в БД для региона - 2. Если данных нет - начинает с 01.01.2025 - 3. Загружает месяц за месяцем до текущего + 1. Проверяет последний загруженный месяц в БД для региона и закона + 2. Если данных нет - загружает rolling-окно за последние 12 месяцев + 3. 
Если данные есть - дозагружает хвост с месяца после последнего периода Args: region_code: Код региона (обязательный) @@ -1917,6 +2322,9 @@ def sync_procurements( # noqa: C901 current_month = current_month or now.month total_saved = 0 results = [] + attempted_fetches = 0 + failed_fetches = 0 + fetch_errors: list[str] = [] try: client_kwargs = {"proxies": proxies} @@ -1926,6 +2334,8 @@ def sync_procurements( # noqa: C901 client_kwargs["scheme"] = client_scheme if client_adapter: client_kwargs["http_adapter"] = client_adapter + if settings.ZAKUPKI_TOKEN: + client_kwargs["token"] = settings.ZAKUPKI_TOKEN with ZakupkiClient(**client_kwargs) as client: # Определяем начальную точку last_year, last_month = ProcurementService.get_last_loaded_period( @@ -1933,9 +2343,13 @@ def sync_procurements( # noqa: C901 law_type=f"{law_type}-FZ", ) + start_year, start_month = _resolve_minimum_coverage_start_period( + last_year=last_year, + last_month=last_month, + current_year=current_year, + current_month=current_month, + ) if last_year and last_month: - # Начинаем со следующего месяца после последнего загруженного - start_year, start_month = _get_next_month(last_year, last_month) logger.info( "Continuing from %d/%d (last loaded: %d/%d)", start_year, @@ -1944,10 +2358,8 @@ def sync_procurements( # noqa: C901 last_month, ) else: - # Начинаем с дефолтной даты - start_year, start_month = DEFAULT_START_YEAR, DEFAULT_START_MONTH logger.info( - "No data in DB, starting from %d/%d", + "No data in DB, loading minimum coverage from %d/%d", start_year, start_month, ) @@ -1959,17 +2371,13 @@ def sync_procurements( # noqa: C901 while year < current_year or ( year == current_year and month <= current_month ): - # Прекращаем если 2 месяца подряд нет данных - if empty_months_count >= 2: - logger.info("Stopping after %d empty months", empty_months_count) - break - job.update_progress( 20 + (60 * ((year - start_year) * 12 + month - start_month) // 24), f"Загрузка за {month:02d}/{year}...", ) try: + 
attempted_fetches += 1 procurements = client.fetch_procurements( region_code=region_code, year=year, @@ -2012,12 +2420,20 @@ def sync_procurements( # noqa: C901 ) except Exception as e: + failed_fetches += 1 + fetch_errors.append(f"{year}/{month}: {e}") logger.warning("%d/%d: error - %s", year, month, str(e)) empty_months_count += 1 # Переходим к следующему месяцу year, month = _get_next_month(year, month) + if attempted_fetches and failed_fetches == attempted_fetches: + raise RuntimeError( + "All procurements fetch attempts failed; first error: " + f"{fetch_errors[0]}" + ) + # Обновляем лог ParserLoadLogService.update( load_log, @@ -2033,6 +2449,7 @@ def sync_procurements( # noqa: C901 "results": results, } ) + _queue_organization_snapshot_refresh(source, batch_id) logger.info("Procurements sync completed (total_saved=%d)", total_saved) @@ -2128,6 +2545,29 @@ def parse_contracts( ) +@shared_task(bind=True) +def parse_registry_contracts( + self, + *, + limit: int | None = None, + proxies: list[str] | None = None, + requested_by_id: int | None = None, +) -> dict: + """Lookup контрактов ЕИС по активным организациям из реестров.""" + proxies = _resolve_proxies(proxies) + return _run_generic_parser( + self, + source_key="registry_contracts", + source=ParserLoadLog.Source.CONTRACTS, + task_name="apps.parsers.tasks.parse_registry_contracts", + requested_by_id=requested_by_id, + fetch_records=lambda: _fetch_checko_registry_contract_records( + limit=limit, + proxies=proxies, + ), + ) + + @shared_task(bind=True) def parse_unfair_suppliers( self, @@ -2220,6 +2660,28 @@ def parse_arbitration_cases( ) +@shared_task(bind=True) +def parse_registry_inspections( + self, + *, + limit: int | None = None, + proxies: list[str] | None = None, + requested_by_id: int | None = None, +) -> dict: + """Lookup проверок по активным организациям из реестров через Checko.""" + proxies = _resolve_proxies(proxies) + return _run_inspection_parser( + self, + source_key="registry_inspections", + 
task_name="apps.parsers.tasks.parse_registry_inspections", + requested_by_id=requested_by_id, + fetch_records=lambda: _fetch_checko_registry_inspections( + limit=limit, + proxies=proxies, + ), + ) + + @shared_task( bind=True, soft_time_limit=PARSER_SOFT_TIME_LIMIT_SECONDS, @@ -2249,6 +2711,42 @@ def parse_fedresurs_bankruptcy( ) +@shared_task(bind=True) +def parse_registry_enrichment_sources( + self, # noqa: ARG001 + *, + limit: int | None = None, + proxies: list[str] | None = None, + requested_by_id: int | None = None, +) -> dict[str, str]: + """ + Запустить daily-контур обогащения активных организаций из реестров. + + Внутри остаются независимые задачи: одни забирают полный официальный реестр, + другие делают lookup по ИНН/ОГРН активных организаций. + """ + proxies = _resolve_proxies(proxies) + tasks_to_run = { + "contracts": parse_registry_contracts, + "unfair_suppliers": parse_unfair_suppliers, + "arbitration": parse_arbitration_cases, + "bankruptcy": parse_fedresurs_bankruptcy, + "inspections": parse_registry_inspections, + "vacancies": parse_trudvsem_vacancies, + } + results: dict[str, str] = {} + for key, task in tasks_to_run.items(): + kwargs = { + "proxies": proxies, + "requested_by_id": requested_by_id, + } + if key in {"contracts", "arbitration", "inspections"}: + kwargs["limit"] = limit + result = task.delay(**kwargs) + results[key] = result.id + return results + + @shared_task( bind=True, soft_time_limit=PARSER_SOFT_TIME_LIMIT_SECONDS, @@ -2314,10 +2812,13 @@ def parse_trudvsem_vacancies( region_code: str | None = None, company_inn: str | None = None, text: str | None = None, + vacancy_sources: list[str] | None = None, + registry_organizations_only: bool = True, + registry_organization_limit: int | None = None, proxies: list[str] | None = None, requested_by_id: int | None = None, ) -> dict: - """Парсинг вакансий Работа России.""" + """Парсинг вакансий по активным организациям реестров.""" proxies = _resolve_proxies(proxies) return _run_generic_parser( 
self, @@ -2325,16 +2826,195 @@ def parse_trudvsem_vacancies( source=ParserLoadLog.Source.TRUDVSEM, task_name="apps.parsers.tasks.parse_trudvsem_vacancies", requested_by_id=requested_by_id, - fetch_records=lambda: TrudvsemClient(proxies=proxies).fetch_vacancies( + fetch_records=lambda: _fetch_vacancy_records( + proxies=proxies, limit=limit, offset=offset, region_code=region_code, company_inn=company_inn, text=text, + vacancy_sources=vacancy_sources, + registry_organizations_only=registry_organizations_only, + registry_organization_limit=registry_organization_limit, ), ) +def _fetch_vacancy_records( + *, + proxies: list[str] | None, + limit: int, + offset: int, + region_code: str | None, + company_inn: str | None, + text: str | None, + vacancy_sources: list[str] | None, + registry_organizations_only: bool, + registry_organization_limit: int | None, +) -> list[GenericParserItem]: + if _should_fetch_registry_organization_vacancies( + registry_organizations_only=registry_organizations_only, + region_code=region_code, + company_inn=company_inn, + text=text, + ): + return _fetch_registry_organization_vacancy_records( + proxies=proxies, + limit=limit, + vacancy_sources=vacancy_sources, + registry_organization_limit=registry_organization_limit, + ) + + with VacanciesClient( + proxies=proxies, + superjob_app_id=getattr(settings, "SUPERJOB_APP_ID", ""), + sources=vacancy_sources, + ) as client: + return client.fetch_vacancies( + limit=limit, + offset=offset, + region_code=region_code, + company_inn=company_inn, + text=text, + ) + + +def _should_fetch_registry_organization_vacancies( + *, + registry_organizations_only: bool, + region_code: str | None, + company_inn: str | None, + text: str | None, +) -> bool: + """Registry-mode включается только для пустого ручного фильтра.""" + return registry_organizations_only and not any( + [ + region_code, + company_inn, + text, + ] + ) + + +def _active_registry_vacancy_targets( + *, + limit: int | None = None, +) -> 
list[RegistryLookupTarget]: + return _active_registry_lookup_targets(limit=limit) + + +def _attach_registry_vacancy_target( + record: GenericParserItem, + target: RegistryLookupTarget, +) -> GenericParserItem: + payload = dict(record.payload) + payload["registry_organization"] = { + "id": target.organization_id, + "inn": target.inn, + "ogrn": target.ogrn, + "name": target.name, + } + return replace( + record, + inn=record.inn or target.inn, + ogrn=record.ogrn or target.ogrn, + organisation_name=record.organisation_name or target.name, + payload=payload, + ) + + +def _fetch_registry_target_vacancy_records( + client: VacanciesClient, + target: RegistryLookupTarget, + *, + page_size: int, +) -> list[GenericParserItem]: + records: list[GenericParserItem] = [] + offset = 0 + for _ in range(VACANCY_REGISTRY_MAX_PAGES_PER_ORGANIZATION): + page_records = client.fetch_vacancies( + limit=page_size, + offset=offset, + company_inn=target.inn, + ) + records.extend( + _attach_registry_vacancy_target(record, target) for record in page_records + ) + if len(page_records) < page_size: + return records + offset += page_size + + raise RuntimeError( + "Vacancy registry organization page limit reached " + f"for organization {target.organization_id} ({target.inn})" + ) + + +def _fetch_registry_organization_vacancy_records( + *, + proxies: list[str] | None, + limit: int, + vacancy_sources: list[str] | None, + registry_organization_limit: int | None, +) -> list[GenericParserItem]: + targets = _active_registry_vacancy_targets(limit=registry_organization_limit) + if not targets: + logger.info("No active registry organizations for vacancies sync") + return [] + + page_size = max(1, limit) + records: list[GenericParserItem] = [] + errors: list[str] = [] + successful_fetches = 0 + sources = vacancy_sources or ["trudvsem"] + with VacanciesClient( + proxies=proxies, + superjob_app_id=getattr(settings, "SUPERJOB_APP_ID", ""), + sources=sources, + ) as client: + for target in targets: + try: + 
organization_records = _fetch_registry_target_vacancy_records( + client, + target, + page_size=page_size, + ) + except Exception as exc: + logger.warning( + "Vacancy fetch failed for registry organization %s (%s): %s", + target.organization_id, + target.inn, + exc, + ) + errors.append(f"{target.inn}: {exc}") + continue + + successful_fetches += 1 + records.extend( + _attach_registry_vacancy_target(record, target) + for record in organization_records + ) + + if errors and successful_fetches == 0: + raise RuntimeError( + "All registry organization vacancy fetches failed; " + f"first error: {errors[0]}" + ) + if errors: + logger.warning( + "Vacancy registry organization sync completed with %d failed " + "organizations", + len(errors), + ) + + logger.info( + "Fetched %d vacancy records for %d active registry organizations", + len(records), + len(targets), + ) + return records + + # ============================================================================= # FNS Tasks (File Watch & Processing) # ============================================================================= diff --git a/src/apps/parsers/views.py b/src/apps/parsers/views.py index f6ace39..2fcd554 100644 --- a/src/apps/parsers/views.py +++ b/src/apps/parsers/views.py @@ -8,6 +8,7 @@ Views для приложения парсеров. 
import csv import json import uuid +from collections import defaultdict from apps.core.openapi import CommonResponses, ErrorResponses, swagger_tag from apps.core.response import api_error_response, api_response @@ -26,6 +27,7 @@ from apps.parsers.models import ( ParsingSettings, ProcurementRecord, ) +from apps.parsers.organization_enrichment import enrich_parser_result_rows from apps.parsers.serializers import ( FinancialReportDetailSerializer, FinancialReportSerializer, @@ -63,6 +65,7 @@ from apps.parsers.serializers import ( ) from apps.parsers.source_cards import SourceCardService from apps.parsers.source_registry import PARSER_SOURCES +from apps.registers.models import RegistryMembershipPeriod from django.core.files.storage import default_storage from django.core.paginator import Paginator from django.db.models import CharField, Count, Q @@ -72,7 +75,8 @@ from django.utils.text import get_valid_filename from django.views.generic import TemplateView from django_celery_beat.models import CrontabSchedule, IntervalSchedule, PeriodicTask from drf_yasg import openapi -from drf_yasg.utils import swagger_auto_schema +from drf_yasg.inspectors import SwaggerAutoSchema +from drf_yasg.utils import no_body, swagger_auto_schema from rest_framework import status from rest_framework.exceptions import ValidationError from rest_framework.parsers import FormParser, JSONParser, MultiPartParser @@ -130,14 +134,28 @@ TASKS_BY_NAME = { "apps.parsers.tasks.parse_procurements_44fz": tasks.parse_procurements_44fz, "apps.parsers.tasks.parse_procurements_223fz": tasks.parse_procurements_223fz, "apps.parsers.tasks.parse_contracts": tasks.parse_contracts, + "apps.parsers.tasks.parse_registry_contracts": tasks.parse_registry_contracts, "apps.parsers.tasks.parse_unfair_suppliers": tasks.parse_unfair_suppliers, "apps.parsers.tasks.parse_fas_goz_evasion": tasks.parse_fas_goz_evasion, "apps.parsers.tasks.scan_fns_directory": tasks.scan_fns_directory, "apps.parsers.tasks.parse_arbitration_cases": 
tasks.parse_arbitration_cases, + "apps.parsers.tasks.parse_registry_inspections": tasks.parse_registry_inspections, "apps.parsers.tasks.parse_fedresurs_bankruptcy": tasks.parse_fedresurs_bankruptcy, + "apps.parsers.tasks.parse_registry_enrichment_sources": ( + tasks.parse_registry_enrichment_sources + ), "apps.parsers.tasks.parse_fstec_registers": tasks.parse_fstec_registers, "apps.parsers.tasks.parse_trudvsem_vacancies": tasks.parse_trudvsem_vacancies, } + + +class MultipartFormSwaggerAutoSchema(SwaggerAutoSchema): + """Document mixed JSON/multipart upload endpoints as form-data in Swagger.""" + + def get_consumes(self) -> list[str]: + return ["multipart/form-data"] + + PARSER_TASK_NAMES = set(TASKS_BY_NAME) NATIVE_RECORD_MODELS = { ParserLoadLog.Source.INDUSTRIAL: IndustrialCertificateRecord, @@ -147,6 +165,26 @@ NATIVE_RECORD_MODELS = { ParserLoadLog.Source.PROCUREMENTS: ProcurementRecord, ParserLoadLog.Source.FNS_REPORTS: FinancialReport, } +REGISTRY_COVERAGE_EXCLUDED_SOURCES = { + ParserLoadLog.Source.UNFAIR_SUPPLIERS, + ParserLoadLog.Source.FEDRESURS_BANKRUPTCY, + ParserLoadLog.Source.INSPECTIONS, +} +REGISTRY_COVERAGE_NATIVE_IDENTITY_FIELDS = { + ParserLoadLog.Source.FNS_REPORTS: (None, "ogrn"), + ParserLoadLog.Source.PROCUREMENTS: ("customer_inn", "customer_ogrn"), +} +REGISTRY_RISK_SIGNAL_SEVERITY = { + ParserLoadLog.Source.UNFAIR_SUPPLIERS: "critical", + ParserLoadLog.Source.FEDRESURS_BANKRUPTCY: "critical", + ParserLoadLog.Source.INSPECTIONS: "soft", +} +REGISTRY_RISK_SIGNAL_SOURCES = set(REGISTRY_RISK_SIGNAL_SEVERITY) +CORE_PROFILE_INDUSTRIAL_SOURCES = { + ParserLoadLog.Source.INDUSTRIAL, + ParserLoadLog.Source.INDUSTRIAL_PRODUCTS, + ParserLoadLog.Source.MANUFACTURES, +} EXISTING_TASK_PARAMS = { "industrial": {"proxies", "requested_by_id"}, "manufactures": {"proxies", "requested_by_id"}, @@ -171,6 +209,9 @@ TRUDVSEM_PARAMS = { "region_code", "company_inn", "text", + "vacancy_sources", + "registry_organizations_only", + "registry_organization_limit", 
"proxies", "requested_by_id", } @@ -910,6 +951,7 @@ class FNSReportUploadView(APIView): permission_classes = [IsAdminUser] @swagger_auto_schema( + auto_schema=MultipartFormSwaggerAutoSchema, tags=[FNS_TAG], operation_summary="Загрузка файлов", operation_description=( @@ -939,6 +981,7 @@ class FNSReportUploadView(APIView): ), ), ], + request_body=no_body, consumes=["multipart/form-data"], responses={ 200: FNSFileUploadSuccessSerializer, @@ -1596,6 +1639,390 @@ def _get_parser_periodic_task_for_user(pk: int, user) -> PeriodicTask | None: return None +def _normalize_registry_identifier(value) -> str: + return str(value).strip() if value not in (None, "") else "" + + +def _active_registry_organization_indexes() -> ( + tuple[ + set[int], + dict[str, set[int]], + dict[str, set[int]], + ] +): + organization_ids: set[int] = set() + by_inn: dict[str, set[int]] = defaultdict(set) + by_ogrn: dict[str, set[int]] = defaultdict(set) + rows = ( + RegistryMembershipPeriod.objects.filter(ended_at__isnull=True) + .order_by() + .values_list( + "organization_id", + "organization__mn_inn", + "organization__mn_ogrn", + ) + .distinct() + ) + for organization_id, inn, ogrn in rows: + organization_ids.add(organization_id) + inn_text = _normalize_registry_identifier(inn) + ogrn_text = _normalize_registry_identifier(ogrn) + if inn_text: + by_inn[inn_text].add(organization_id) + if ogrn_text: + by_ogrn[ogrn_text].add(organization_id) + return organization_ids, by_inn, by_ogrn + + +def _matched_registry_organization_count( + queryset, + by_inn: dict[str, set[int]], + by_ogrn: dict[str, set[int]], + *, + inn_field: str | None = "inn", + ogrn_field: str | None = "ogrn", +) -> int: + filters = Q() + if inn_field and by_inn: + filters |= Q(**{f"{inn_field}__in": list(by_inn)}) + if ogrn_field and by_ogrn: + filters |= Q(**{f"{ogrn_field}__in": list(by_ogrn)}) + if not filters: + return 0 + + fields = [field for field in (inn_field, ogrn_field) if field] + matched_ids: set[int] = set() + for row in 
queryset.filter(filters).order_by().values_list(*fields).distinct(): + values = dict(zip(fields, row, strict=True)) + inn_text = _normalize_registry_identifier(values.get(inn_field)) + ogrn_text = _normalize_registry_identifier(values.get(ogrn_field)) + if inn_text: + matched_ids.update(by_inn.get(inn_text, ())) + if ogrn_text: + matched_ids.update(by_ogrn.get(ogrn_text, ())) + return len(matched_ids) + + +def _matched_registry_organization_ids( + queryset, + by_inn: dict[str, set[int]], + by_ogrn: dict[str, set[int]], + *, + inn_field: str | None = "inn", + ogrn_field: str | None = "ogrn", +) -> set[int]: + filters = Q() + if inn_field and by_inn: + filters |= Q(**{f"{inn_field}__in": list(by_inn)}) + if ogrn_field and by_ogrn: + filters |= Q(**{f"{ogrn_field}__in": list(by_ogrn)}) + if not filters: + return set() + + fields = [field for field in (inn_field, ogrn_field) if field] + matched_ids: set[int] = set() + for row in queryset.filter(filters).order_by().values_list(*fields).distinct(): + values = dict(zip(fields, row, strict=True)) + inn_text = _normalize_registry_identifier(values.get(inn_field)) + ogrn_text = _normalize_registry_identifier(values.get(ogrn_field)) + if inn_text: + matched_ids.update(by_inn.get(inn_text, ())) + if ogrn_text: + matched_ids.update(by_ogrn.get(ogrn_text, ())) + return matched_ids + + +def _source_registry_matches( + by_inn: dict[str, set[int]], + by_ogrn: dict[str, set[int]], +) -> dict[str, set[int]]: + matches: dict[str, set[int]] = {} + for source, _label in ParserLoadLog.Source.choices: + inn_field, ogrn_field = REGISTRY_COVERAGE_NATIVE_IDENTITY_FIELDS.get( + source, + ("inn", "ogrn"), + ) + if source == ParserLoadLog.Source.FNS_REPORTS: + queryset = FinancialReport.objects.all() + elif source in NATIVE_RECORD_MODELS: + queryset = NATIVE_RECORD_MODELS[source].objects.all() + else: + queryset = GenericParserRecord.objects.filter(source=source) + matches[source] = _matched_registry_organization_ids( + queryset, + by_inn, + 
by_ogrn, + inn_field=inn_field, + ogrn_field=ogrn_field, + ) + return matches + + +def _coverage_percent(count: int, total: int) -> float: + return round(count / total * 100, 1) if total else 0 + + +def _registry_source_matrix_rows( + *, + registry_organization_ids: set[int], + source_matches: dict[str, set[int]], + enrichment_sources: list[str], +) -> list[dict]: + total_organizations = len(registry_organization_ids) + matrix_rows = [] + registry_rows = ( + RegistryMembershipPeriod.objects.filter(ended_at__isnull=True) + .select_related("registry") + .order_by("registry__name") + .values_list("registry_id", "registry__name", "organization_id") + .distinct() + ) + registry_organizations: dict[tuple[str, str], set[int]] = {} + for registry_id, registry_name, organization_id in registry_rows: + registry_organizations.setdefault( + (str(registry_id), registry_name), + set(), + ).add(organization_id) + for ( + registry_id, + registry_name, + ), organization_ids in registry_organizations.items(): + source_cells = {} + for source in enrichment_sources: + count = len(organization_ids & source_matches.get(source, set())) + source_cells[source] = { + "organizations_count": count, + "coverage_percent": _coverage_percent(count, len(organization_ids)), + } + matrix_rows.append( + { + "registry_id": registry_id, + "registry_name": registry_name, + "active_organizations": len(organization_ids), + "sources": source_cells, + } + ) + total_source_cells = {} + for source in enrichment_sources: + matched_ids = registry_organization_ids & source_matches.get(source, set()) + total_source_cells[source] = { + "organizations_count": len(matched_ids), + "coverage_percent": _coverage_percent( + len(matched_ids), + total_organizations, + ), + } + matrix_rows.append( + { + "registry_id": "__all__", + "registry_name": "Все организации", + "active_organizations": total_organizations, + "sources": total_source_cells, + "is_total": True, + } + ) + return matrix_rows + + +def 
_registry_data_coverage() -> dict: + registry_organization_ids, by_inn, by_ogrn = _active_registry_organization_indexes() + total_organizations = len(registry_organization_ids) + descriptor_titles: dict[str, str] = {} + for descriptor in PARSER_SOURCES.values(): + descriptor_titles.setdefault(descriptor.source, descriptor.title) + source_labels = dict(ParserLoadLog.Source.choices) + + items = [] + for source, _label in ParserLoadLog.Source.choices: + if source in REGISTRY_COVERAGE_EXCLUDED_SOURCES: + continue + inn_field, ogrn_field = REGISTRY_COVERAGE_NATIVE_IDENTITY_FIELDS.get( + source, + ("inn", "ogrn"), + ) + if source == ParserLoadLog.Source.FNS_REPORTS: + organizations_count = _matched_registry_organization_count( + FinancialReport.objects.all(), + by_inn, + by_ogrn, + inn_field=inn_field, + ogrn_field=ogrn_field, + ) + elif source in NATIVE_RECORD_MODELS: + organizations_count = _matched_registry_organization_count( + NATIVE_RECORD_MODELS[source].objects.all(), + by_inn, + by_ogrn, + inn_field=inn_field, + ogrn_field=ogrn_field, + ) + else: + organizations_count = _matched_registry_organization_count( + GenericParserRecord.objects.filter(source=source), + by_inn, + by_ogrn, + ) + coverage_percent = ( + round(organizations_count / total_organizations * 100, 1) + if total_organizations + else 0 + ) + items.append( + { + "source": source, + "label": str(descriptor_titles.get(source) or source_labels[source]), + "organizations_count": organizations_count, + "coverage_percent": coverage_percent, + } + ) + + return { + "total_organizations": total_organizations, + "items": items, + } + + +def _registry_enrichment_analytics(jobs, schedules: list[dict]) -> dict: + registry_organization_ids, by_inn, by_ogrn = _active_registry_organization_indexes() + total_organizations = len(registry_organization_ids) + source_matches = _source_registry_matches(by_inn, by_ogrn) + descriptor_titles: dict[str, str] = {} + for descriptor in PARSER_SOURCES.values(): + 
descriptor_titles.setdefault(descriptor.source, descriptor.title) + source_labels = dict(ParserLoadLog.Source.choices) + + source_coverage = [] + risk_signals = [] + enrichment_sources = [ + source + for source, _label in ParserLoadLog.Source.choices + if source not in REGISTRY_RISK_SIGNAL_SOURCES + ] + for source, _label in ParserLoadLog.Source.choices: + matched_ids = source_matches.get(source, set()) + risk_severity = REGISTRY_RISK_SIGNAL_SEVERITY.get(source) + item = { + "source": source, + "label": str(descriptor_titles.get(source) or source_labels[source]), + "organizations_count": len(matched_ids), + "coverage_percent": _coverage_percent( + len(matched_ids), total_organizations + ), + "required_for_core_profile": source + in ({ParserLoadLog.Source.FNS_REPORTS} | CORE_PROFILE_INDUSTRIAL_SOURCES), + "risk_signal": risk_severity is not None, + } + if risk_severity: + item["risk_severity"] = risk_severity + risk_signals.append(item) + else: + source_coverage.append(item) + + enriched_ids: set[int] = set() + for source in enrichment_sources: + enriched_ids.update(source_matches.get(source, set())) + industrial_ids: set[int] = set() + for source in CORE_PROFILE_INDUSTRIAL_SOURCES: + industrial_ids.update(source_matches.get(source, set())) + core_profile_ids = ( + source_matches.get(ParserLoadLog.Source.FNS_REPORTS, set()) & industrial_ids + ) + risk_ids_by_severity = { + "critical": set(), + "soft": set(), + } + for source, severity in REGISTRY_RISK_SIGNAL_SEVERITY.items(): + risk_ids_by_severity.setdefault(severity, set()).update( + source_matches.get(source, set()) + ) + risk_ids_by_severity["any"] = ( + set().union(*risk_ids_by_severity.values()) if risk_ids_by_severity else set() + ) + risk_summary = { + severity: { + "organizations_count": len(organization_ids), + "coverage_percent": _coverage_percent( + len(organization_ids), + total_organizations, + ), + } + for severity, organization_ids in risk_ids_by_severity.items() + } + active_memberships = 
RegistryMembershipPeriod.objects.filter( + ended_at__isnull=True + ).count() + registries_total = ( + RegistryMembershipPeriod.objects.filter(ended_at__isnull=True) + .order_by() + .values("registry_id") + .distinct() + .count() + ) + registries_with_data = ( + RegistryMembershipPeriod.objects.filter(ended_at__isnull=True) + .filter(organization_id__in=enriched_ids) + .order_by() + .values("registry_id") + .distinct() + .count() + if enriched_ids + else 0 + ) + + matrix_rows = _registry_source_matrix_rows( + registry_organization_ids=registry_organization_ids, + source_matches=source_matches, + enrichment_sources=enrichment_sources, + ) + + latest_loads = list(ParserLoadLog.objects.all().order_by("-created_at")[:30]) + recent_success = sum( + 1 for load in latest_loads if load.status == ParserLoadLog.Status.SUCCESS + ) + recent_failed = sum( + 1 for load in latest_loads if load.status == ParserLoadLog.Status.FAILED + ) + recent_other = max(len(latest_loads) - recent_success - recent_failed, 0) + running_jobs = sum( + 1 for job in jobs if job.status in {"pending", "started", "retry"} + ) + + return { + "population": { + "active_registry_organizations": total_organizations, + "active_memberships": active_memberships, + "registries_with_data_percent": _coverage_percent( + registries_with_data, + registries_total, + ), + }, + "coverage_summary": { + "with_any_enrichment": len(enriched_ids), + "with_any_enrichment_percent": _coverage_percent( + len(enriched_ids), + total_organizations, + ), + "core_profile_complete": len(core_profile_ids), + "core_profile_complete_percent": _coverage_percent( + len(core_profile_ids), + total_organizations, + ), + "requires_attention": max(total_organizations - len(enriched_ids), 0), + }, + "source_coverage": source_coverage, + "registry_source_matrix": matrix_rows, + "risk_signals": risk_signals, + "risk_summary": risk_summary, + "pipeline": { + "active_schedules": sum(1 for schedule in schedules if schedule["enabled"]), + 
"running_jobs": running_jobs, + "recent_success": recent_success, + "recent_failed": recent_failed, + "recent_other": recent_other, + }, + } + + def source_result_swagger_tag(source_key: str) -> str: return SOURCE_RESULT_TAGS.get(source_key, PARSERS_TAG) @@ -1855,6 +2282,7 @@ class SourceResultListView(APIView): ) for record in page_obj.object_list ] + rows = enrich_parser_result_rows(rows) serializer = ParserResultRecordSerializer(rows, many=True) return api_response( serializer.data, @@ -1893,6 +2321,7 @@ class SourceResultDetailView(APIView): record, include_payload=params["include_payload"], ) + data = enrich_parser_result_rows([data])[0] return api_response(ParserResultRecordSerializer(data).data) @@ -2030,6 +2459,7 @@ class GenericParserRecordListView(APIView): _native_record_to_result(source, record) for record in NATIVE_RECORD_MODELS[source].objects.all()[:limit] ] + rows = enrich_parser_result_rows(rows) return api_response(rows) queryset = GenericParserRecord.objects.all() if source: @@ -2201,6 +2631,11 @@ class ParserDashboardDataView(APIView): "schedules": ParserScheduleSerializer(schedules, many=True).data, "jobs": BackgroundJobListSerializer(jobs, many=True).data, "source_counts": source_counts, + "registry_data_coverage": _registry_data_coverage(), + "registry_enrichment_analytics": _registry_enrichment_analytics( + jobs, + schedules, + ), "load_logs": ParserLoadLogSerializer( ParserLoadLog.objects.all().order_by("-created_at")[:30], many=True, diff --git a/src/core/api_v2_urls.py b/src/core/api_v2_urls.py new file mode 100644 index 0000000..b2bbeea --- /dev/null +++ b/src/core/api_v2_urls.py @@ -0,0 +1,10 @@ +"""API v2 URL configuration.""" + +from django.urls import include, path +from organizations.urls import organizations_urlpatterns + +app_name = "api_v2" + +urlpatterns = [ + path("", include((organizations_urlpatterns, "organizations"))), +] diff --git a/src/core/urls.py b/src/core/urls.py index e5e403f..5fa8d97 100644 --- a/src/core/urls.py 
+++ b/src/core/urls.py @@ -71,6 +71,7 @@ urlpatterns = [ path("admin/", admin.site.urls), path("health/", include("apps.core.urls")), path("api/v1/", include("core.api_v1_urls", namespace="api_v1")), + path("api/v2/", include("core.api_v2_urls", namespace="api_v2")), path("auth/", include("rest_framework.urls")), ] diff --git a/src/organizations/__init__.py b/src/organizations/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/organizations/__init__.py @@ -0,0 +1 @@ + diff --git a/src/organizations/admin.py b/src/organizations/admin.py new file mode 100644 index 0000000..dbf4438 --- /dev/null +++ b/src/organizations/admin.py @@ -0,0 +1,22 @@ +"""Admin configuration for organizations app.""" + +from django.contrib import admin + +from organizations.models import Organization + + +@admin.register(Organization) +class OrganizationAdmin(admin.ModelAdmin): + """Admin for canonical organizations.""" + + list_display = ["name_short", "inn", "kpp", "ogrn", "ogrip"] + search_fields = ["name", "inn", "kpp", "ogrn", "ogrip"] + ordering = ["name"] + readonly_fields = ["uid"] + + def name_short(self, obj: Organization) -> str: + name = obj.name or "" + return name[:80] + "..." 
if len(name) > 80 else name + + name_short.short_description = "Организация" + name_short.admin_order_field = "name" diff --git a/src/organizations/api_enrichment.py b/src/organizations/api_enrichment.py new file mode 100644 index 0000000..4478c04 --- /dev/null +++ b/src/organizations/api_enrichment.py @@ -0,0 +1,775 @@ +"""Batch enrichment helpers for organizations API v2.""" + +from __future__ import annotations + +from dataclasses import dataclass +from datetime import date, datetime +from typing import Any + +from apps.parsers.models import ( + FinancialReport, + FinancialReportLine, + GenericParserRecord, + IndustrialCertificateRecord, + IndustrialProductRecord, + InspectionRecord, + ManufacturerRecord, + ParserLoadLog, + ProcurementRecord, +) +from django.db.models import Count, Prefetch, Q +from registers.models import RegistryMembershipPeriod + +from organizations.models import Organization + +GENERIC_SOURCES = ( + ParserLoadLog.Source.PROCUREMENTS_44FZ, + ParserLoadLog.Source.PROCUREMENTS_223FZ, + ParserLoadLog.Source.CONTRACTS, + ParserLoadLog.Source.UNFAIR_SUPPLIERS, + ParserLoadLog.Source.FAS_GOZ, + ParserLoadLog.Source.ARBITRATION, + ParserLoadLog.Source.FEDRESURS_BANKRUPTCY, + ParserLoadLog.Source.FSTEC, + ParserLoadLog.Source.TRUDVSEM, +) + +DATA_PRESENCE_KEYS = ( + ParserLoadLog.Source.INDUSTRIAL, + ParserLoadLog.Source.INDUSTRIAL_PRODUCTS, + ParserLoadLog.Source.MANUFACTURES, + ParserLoadLog.Source.INSPECTIONS, + ParserLoadLog.Source.PROCUREMENTS, + *GENERIC_SOURCES, + ParserLoadLog.Source.FNS_REPORTS, +) +DATA_PRESENCE_KEY_SET = {str(source) for source in DATA_PRESENCE_KEYS} +API_DATA_SOURCE_ALIASES = { + ParserLoadLog.Source.TRUDVSEM: "vacancies", +} +API_DATA_SOURCE_KEY_SET = { + API_DATA_SOURCE_ALIASES.get(source, str(source)) for source in DATA_PRESENCE_KEYS +} + + +@dataclass(frozen=True) +class RegistrySummary: + """Registry identity returned in organizations API.""" + + id: str + name: str + + +@dataclass(frozen=True) +class 
OrganizationEnrichment: + """Computed parser and registry availability for one organization.""" + + data_presence: dict[str, Any] + registries: list[RegistrySummary] + + +def active_registry_identity_values( + *, + registry_id: str | None = None, + registry_name: str | None = None, +) -> tuple[set[str], set[str]]: + """Return INN/OGRN values of organizations with active registry membership.""" + memberships = RegistryMembershipPeriod.objects.filter(ended_at__isnull=True) + if registry_id: + memberships = memberships.filter(registry_id=registry_id) + if registry_name: + memberships = memberships.filter(registry__name__icontains=registry_name) + + inn_values: set[str] = set() + ogrn_values: set[str] = set() + for inn, ogrn in memberships.values_list( + "organization__mn_inn", + "organization__mn_ogrn", + ): + # Skip missing identifiers so "None"/blank never enters the sets. + if inn: + inn_values.add(str(inn)) + if ogrn: + ogrn_values.add(str(ogrn)) + return inn_values, ogrn_values + + +def data_presence_identity_values(source: str) -> tuple[set[str], set[str]]: + """Return INN/OGRN values of organizations with data for a parser source.""" + matches = _source_matches(to_internal_data_source(source)) + return matches["inn"], matches["ogrn"] + + +def to_api_data_source(source: str) -> str: + """Return v2 public data source key for an internal parser source.""" + return API_DATA_SOURCE_ALIASES.get(source, str(source)) + + +def to_internal_data_source(source: str) -> str: + """Return internal parser source key from a v2 public key.""" + for internal_source, api_source in API_DATA_SOURCE_ALIASES.items(): + if source == api_source: + return str(internal_source) + return source + + +def _source_matches(source: str) -> dict[str, set[str]]: + if source == ParserLoadLog.Source.INDUSTRIAL: + return OrganizationApiEnrichmentService._matching_identifiers_for_all( + IndustrialCertificateRecord.objects, + inn_field="inn", + ogrn_field="ogrn", + ) + if source == ParserLoadLog.Source.INDUSTRIAL_PRODUCTS: + return
OrganizationApiEnrichmentService._matching_identifiers_for_all( + IndustrialProductRecord.objects, + inn_field="inn", + ogrn_field="ogrn", + ) + if source == ParserLoadLog.Source.MANUFACTURES: + return OrganizationApiEnrichmentService._matching_identifiers_for_all( + ManufacturerRecord.objects, + inn_field="inn", + ogrn_field="ogrn", + ) + if source == ParserLoadLog.Source.INSPECTIONS: + return OrganizationApiEnrichmentService._matching_identifiers_for_all( + InspectionRecord.objects, + inn_field="inn", + ogrn_field="ogrn", + ) + if source == ParserLoadLog.Source.PROCUREMENTS: + return OrganizationApiEnrichmentService._matching_identifiers_for_all( + ProcurementRecord.objects, + inn_field="customer_inn", + ogrn_field="customer_ogrn", + ) + if source == ParserLoadLog.Source.FNS_REPORTS: + # Financial reports carry no INN, so only OGRN values are collected. + # Blank OGRNs are excluded like in the other branches: a "" entry would + # match every organization whose ogrn/ogrip is blank. + return { + "inn": set(), + "ogrn": set( + FinancialReport.objects.exclude(ogrn="") + .values_list("ogrn", flat=True) + .distinct() + ), + } + if source in GENERIC_SOURCES: + return OrganizationApiEnrichmentService._matching_identifiers_for_all( + GenericParserRecord.objects.filter(source=source), + inn_field="inn", + ogrn_field="ogrn", + ) + + raise ValueError(f"Unsupported data_presence source: {source}") + + +class OrganizationApiEnrichmentService: + """Computes list/detail enrichment without per-row database queries.""" + + @classmethod + def build_for( + cls, + organizations: list[Organization], + data_sources: set[str] | None = None, + ) -> dict[str, OrganizationEnrichment]: + if not organizations: + return {} + + selected_sources = ( + API_DATA_SOURCE_KEY_SET + if data_sources is None + else {to_api_data_source(source) for source in data_sources} + ) + identifiers = cls._collect_identifiers(organizations) + presence = cls._build_presence(organizations, identifiers, selected_sources) + registries = cls._build_registries(organizations, identifiers) + + return { + str(organization.uid): OrganizationEnrichment( + data_presence=presence[str(organization.uid)], +
registries=registries[str(organization.uid)], + ) + for organization in organizations + } + + @staticmethod + def empty_presence(data_sources: set[str] | None = None) -> dict[str, Any]: + selected_sources = ( + API_DATA_SOURCE_KEY_SET + if data_sources is None + else {to_api_data_source(source) for source in data_sources} + ) + return { + to_api_data_source(source): [] + for source in DATA_PRESENCE_KEYS + if to_api_data_source(source) in selected_sources + } + + @classmethod + def _collect_identifiers( + cls, organizations: list[Organization] + ) -> dict[str, set[str]]: + return { + "inn": { + organization.inn for organization in organizations if organization.inn + }, + "ogrn": { + organization.ogrn for organization in organizations if organization.ogrn + }, + "ogrip": { + organization.ogrip + for organization in organizations + if organization.ogrip + }, + } + + @classmethod + def _build_presence( + cls, + organizations: list[Organization], + identifiers: dict[str, set[str]], + selected_sources: set[str], + ) -> dict[str, dict[str, Any]]: + presence = { + str(organization.uid): cls.empty_presence(selected_sources) + for organization in organizations + } + + if to_api_data_source(ParserLoadLog.Source.INDUSTRIAL) in selected_sources: + cls._attach_industrial_certificates(presence, organizations, identifiers) + if ( + to_api_data_source(ParserLoadLog.Source.INDUSTRIAL_PRODUCTS) + in selected_sources + ): + cls._attach_source_records( + presence, + organizations, + ParserLoadLog.Source.INDUSTRIAL_PRODUCTS, + IndustrialProductRecord.objects, + identifiers, + inn_field="inn", + ogrn_field="ogrn", + serializer=cls._serialize_industrial_product, + ) + if to_api_data_source(ParserLoadLog.Source.MANUFACTURES) in selected_sources: + cls._attach_source_records( + presence, + organizations, + ParserLoadLog.Source.MANUFACTURES, + ManufacturerRecord.objects, + identifiers, + inn_field="inn", + ogrn_field="ogrn", + serializer=cls._serialize_manufacturer, + ) + if 
to_api_data_source(ParserLoadLog.Source.INSPECTIONS) in selected_sources: + cls._attach_source_records( + presence, + organizations, + ParserLoadLog.Source.INSPECTIONS, + InspectionRecord.objects, + identifiers, + inn_field="inn", + ogrn_field="ogrn", + serializer=cls._serialize_inspection, + ) + if to_api_data_source(ParserLoadLog.Source.PROCUREMENTS) in selected_sources: + cls._attach_source_records( + presence, + organizations, + ParserLoadLog.Source.PROCUREMENTS, + ProcurementRecord.objects, + identifiers, + inn_field="customer_inn", + ogrn_field="customer_ogrn", + serializer=cls._serialize_procurement, + ) + if to_api_data_source(ParserLoadLog.Source.FNS_REPORTS) in selected_sources: + cls._attach_source_records( + presence, + organizations, + ParserLoadLog.Source.FNS_REPORTS, + FinancialReport.objects.annotate( + lines_count=Count("lines") + ).prefetch_related( + Prefetch( + "lines", + queryset=FinancialReportLine.objects.order_by( + "year", + "form_code", + "line_code", + ), + ) + ), + identifiers, + inn_field=None, + ogrn_field="ogrn", + serializer=cls._serialize_financial_report, + ) + + selected_generic_sources = [ + source + for source in GENERIC_SOURCES + if to_api_data_source(source) in selected_sources + ] + if selected_generic_sources: + cls._attach_generic_records( + presence, + organizations, + identifiers, + selected_generic_sources, + ) + + return presence + + @classmethod + def _attach_industrial_certificates( + cls, + presence: dict[str, dict[str, Any]], + organizations: list[Organization], + identifiers: dict[str, set[str]], + ) -> None: + cls._attach_source_records( + presence, + organizations, + ParserLoadLog.Source.INDUSTRIAL, + IndustrialCertificateRecord.objects, + identifiers, + inn_field="inn", + ogrn_field="ogrn", + serializer=cls._serialize_industrial_certificate, + ) + + @classmethod + def _attach_source_records( + cls, + presence: dict[str, dict[str, Any]], + organizations: list[Organization], + source: str, + queryset, + 
identifiers: dict[str, set[str]], + *, + inn_field: str | None, + ogrn_field: str, + serializer, + ) -> None: + if inn_field is not None: + identity_filter = cls._identity_filter( + identifiers, + inn_field=inn_field, + ogrn_field=ogrn_field, + ) + else: + identity_filter = cls._identity_filter( + { + "inn": set(), + "ogrn": identifiers["ogrn"], + "ogrip": identifiers["ogrip"], + }, + inn_field=None, + ogrn_field=ogrn_field, + ) + if identity_filter is None: + return + + records_by_inn: dict[str, list[dict[str, Any]]] = {} + records_by_ogrn: dict[str, list[dict[str, Any]]] = {} + records = queryset.filter(identity_filter).order_by("-created_at", "-id") + for record in records: + item = serializer(record) + if inn_field is not None: + inn_value = getattr(record, inn_field) + if inn_value: + records_by_inn.setdefault(inn_value, []).append(item) + ogrn_value = getattr(record, ogrn_field) + if ogrn_value: + records_by_ogrn.setdefault(ogrn_value, []).append(item) + + for organization in organizations: + seen: set[int] = set() + items = [] + for item in ( + records_by_inn.get(organization.inn, []) + + records_by_ogrn.get(organization.ogrn, []) + + records_by_ogrn.get(organization.ogrip, []) + ): + item_id = item["id"] + if item_id in seen: + continue + seen.add(item_id) + items.append(item) + presence[str(organization.uid)][to_api_data_source(source)] = items + + @classmethod + def _attach_generic_records( + cls, + presence: dict[str, dict[str, Any]], + organizations: list[Organization], + identifiers: dict[str, set[str]], + selected_sources: list[str], + ) -> None: + identity_filter = cls._identity_filter( + identifiers, + inn_field="inn", + ogrn_field="ogrn", + ) + if identity_filter is None: + return + + records_by_source_and_inn: dict[str, dict[str, list[dict[str, Any]]]] = { + str(source): {} for source in selected_sources + } + records_by_source_and_ogrn: dict[str, dict[str, list[dict[str, Any]]]] = { + str(source): {} for source in selected_sources + } + + records 
= ( + GenericParserRecord.objects.filter(source__in=selected_sources) + .filter(identity_filter) + .order_by("source", "-created_at", "-id") + ) + for record in records: + item = cls._serialize_generic_record(record) + source = str(record.source) + if record.inn: + records_by_source_and_inn[source].setdefault(record.inn, []).append( + item + ) + if record.ogrn: + records_by_source_and_ogrn[source].setdefault(record.ogrn, []).append( + item + ) + + for organization in organizations: + organization_key = str(organization.uid) + for source in selected_sources: + source_key = str(source) + seen: set[int] = set() + items = [] + records_by_inn = records_by_source_and_inn[source_key] + records_by_ogrn = records_by_source_and_ogrn[source_key] + for item in ( + records_by_inn.get(organization.inn, []) + + records_by_ogrn.get(organization.ogrn, []) + + records_by_ogrn.get(organization.ogrip, []) + ): + item_id = item["id"] + if item_id in seen: + continue + seen.add(item_id) + items.append(item) + presence[organization_key][to_api_data_source(source_key)] = items + + @staticmethod + def _serialize_industrial_certificate( + record: IndustrialCertificateRecord, + ) -> dict[str, Any]: + return { + "id": record.id, + "load_batch": record.load_batch, + "issue_date": record.issue_date, + "issue_date_normalized": _isoformat(record.issue_date_normalized), + "certificate_number": record.certificate_number, + "expiry_date": record.expiry_date, + "expiry_date_normalized": _isoformat(record.expiry_date_normalized), + "certificate_file_url": record.certificate_file_url, + "organisation_name": record.organisation_name, + "inn": record.inn, + "ogrn": record.ogrn, + "registry_organization": record.registry_organization_id, + "created_at": _isoformat(record.created_at), + "updated_at": _isoformat(record.updated_at), + } + + @staticmethod + def _serialize_industrial_product( + record: IndustrialProductRecord, + ) -> dict[str, Any]: + return { + "id": record.id, + "load_batch": 
record.load_batch, + "full_organisation_name": record.full_organisation_name, + "ogrn": record.ogrn, + "inn": record.inn, + "registry_number": record.registry_number, + "product_name": record.product_name, + "product_model": record.product_model, + "okpd2_code": record.okpd2_code, + "tnved_code": record.tnved_code, + "regulatory_document": record.regulatory_document, + "registry_organization": record.registry_organization_id, + "created_at": _isoformat(record.created_at), + "updated_at": _isoformat(record.updated_at), + } + + @staticmethod + def _serialize_manufacturer(record: ManufacturerRecord) -> dict[str, Any]: + return { + "id": record.id, + "load_batch": record.load_batch, + "full_legal_name": record.full_legal_name, + "inn": record.inn, + "ogrn": record.ogrn, + "address": record.address, + "registry_organization": record.registry_organization_id, + "created_at": _isoformat(record.created_at), + "updated_at": _isoformat(record.updated_at), + } + + @staticmethod + def _serialize_inspection(record: InspectionRecord) -> dict[str, Any]: + return { + "id": record.id, + "load_batch": record.load_batch, + "registration_number": record.registration_number, + "inn": record.inn, + "ogrn": record.ogrn, + "organisation_name": record.organisation_name, + "control_authority": record.control_authority, + "inspection_type": record.inspection_type, + "inspection_form": record.inspection_form, + "start_date": record.start_date, + "start_date_normalized": _isoformat(record.start_date_normalized), + "end_date": record.end_date, + "end_date_normalized": _isoformat(record.end_date_normalized), + "status": record.status, + "legal_basis": record.legal_basis, + "result": record.result, + "is_federal_law_248": record.is_federal_law_248, + "data_year": record.data_year, + "data_month": record.data_month, + "registry_organization": record.registry_organization_id, + "created_at": _isoformat(record.created_at), + "updated_at": _isoformat(record.updated_at), + } + + @staticmethod + def 
_serialize_procurement(record: ProcurementRecord) -> dict[str, Any]: + return { + "id": record.id, + "load_batch": record.load_batch, + "purchase_number": record.purchase_number, + "purchase_name": record.purchase_name, + "customer_inn": record.customer_inn, + "customer_kpp": record.customer_kpp, + "customer_ogrn": record.customer_ogrn, + "customer_name": record.customer_name, + "max_price": record.max_price, + "max_price_amount": _decimal_string(record.max_price_amount), + "currency_code": record.currency_code, + "placement_method": record.placement_method, + "publish_date": record.publish_date, + "publish_date_normalized": _isoformat(record.publish_date_normalized), + "end_date": record.end_date, + "end_date_normalized": _isoformat(record.end_date_normalized), + "status": record.status, + "law_type": record.law_type, + "purchase_object_info": record.purchase_object_info, + "href": record.href, + "region_code": record.region_code, + "data_year": record.data_year, + "data_month": record.data_month, + "registry_organization": record.registry_organization_id, + "created_at": _isoformat(record.created_at), + "updated_at": _isoformat(record.updated_at), + } + + @staticmethod + def _serialize_generic_record(record: GenericParserRecord) -> dict[str, Any]: + return { + "id": record.id, + "load_batch": record.load_batch, + "source": record.source, + "external_id": record.external_id, + "inn": record.inn, + "ogrn": record.ogrn, + "organisation_name": record.organisation_name, + "title": record.title, + "record_date": record.record_date, + "amount": _decimal_string(record.amount), + "status": record.status, + "url": record.url, + "payload": record.payload, + "registry_organization": record.registry_organization_id, + "created_at": _isoformat(record.created_at), + "updated_at": _isoformat(record.updated_at), + } + + @staticmethod + def _serialize_financial_report(record: FinancialReport) -> dict[str, Any]: + return { + "id": record.id, + "external_id": record.external_id, + 
"ogrn": record.ogrn, + "registry_organization": record.registry_organization_id, + "file_name": record.file_name, + "file_hash": record.file_hash, + "load_batch": record.load_batch, + "status": record.status, + "source": record.source, + "error_message": record.error_message, + "created_at": _isoformat(record.created_at), + "updated_at": _isoformat(record.updated_at), + "lines_count": getattr(record, "lines_count", 0), + "lines": _financial_report_lines_by_year(record), + } + + @staticmethod + def _matching_identifiers( + queryset, + identifiers: dict[str, set[str]], + *, + inn_field: str, + ogrn_field: str, + ) -> dict[str, set[str]]: + matched_inn = set() + matched_ogrn = set() + + if identifiers["inn"]: + matched_inn = set( + queryset.filter(**{f"{inn_field}__in": identifiers["inn"]}) + .values_list(inn_field, flat=True) + .distinct() + ) + + ogrn_identifiers = identifiers["ogrn"] | identifiers["ogrip"] + if ogrn_identifiers: + matched_ogrn = set( + queryset.filter(**{f"{ogrn_field}__in": ogrn_identifiers}) + .values_list(ogrn_field, flat=True) + .distinct() + ) + + return {"inn": matched_inn, "ogrn": matched_ogrn} + + @staticmethod + def _identity_filter( + identifiers: dict[str, set[str]], + *, + inn_field: str | None, + ogrn_field: str, + ) -> Q | None: + identity_filter = Q() + has_identity = False + if inn_field is not None and identifiers["inn"]: + identity_filter |= Q(**{f"{inn_field}__in": identifiers["inn"]}) + has_identity = True + + ogrn_identifiers = identifiers["ogrn"] | identifiers["ogrip"] + if ogrn_identifiers: + identity_filter |= Q(**{f"{ogrn_field}__in": ogrn_identifiers}) + has_identity = True + + if not has_identity: + return None + return identity_filter + + @staticmethod + def _matching_identifiers_for_all( + queryset, + *, + inn_field: str, + ogrn_field: str, + ) -> dict[str, set[str]]: + matched_inn = set( + queryset.exclude(**{inn_field: ""}) + .values_list(inn_field, flat=True) + .distinct() + ) + matched_ogrn = set( + 
queryset.exclude(**{ogrn_field: ""}) + .values_list(ogrn_field, flat=True) + .distinct() + ) + return {"inn": matched_inn, "ogrn": matched_ogrn} + + @staticmethod + def _build_registries( + organizations: list[Organization], + identifiers: dict[str, set[str]], + ) -> dict[str, list[RegistrySummary]]: + """Return active registry summaries keyed by organization UID. + + Memberships are matched by INN and by OGRN; an entrepreneur's OGRIP is + looked up in the registry OGRN column as well, the same identity rule + that _identity_filter applies to parser records. + """ + registries = {str(organization.uid): [] for organization in organizations} + identity_filter = OrganizationApiEnrichmentService._identity_filter( + identifiers, + inn_field="organization__mn_inn", + ogrn_field="organization__mn_ogrn", + ) + if identity_filter is None: + return registries + + memberships = ( + RegistryMembershipPeriod.objects.filter(ended_at__isnull=True) + .filter(identity_filter) + .select_related("registry", "organization") + .order_by("registry__name") + ) + membership_by_inn: dict[str, list[RegistrySummary]] = {} + membership_by_ogrn: dict[str, list[RegistrySummary]] = {} + + for membership in memberships: + summary = RegistrySummary( + id=str(membership.registry_id), + name=membership.registry.name, + ) + # Index only real identifiers: a missing value must not become a + # shared "None"/blank key that unrelated organizations could hit. + mn_inn = membership.organization.mn_inn + if mn_inn: + membership_by_inn.setdefault(str(mn_inn), []).append(summary) + mn_ogrn = membership.organization.mn_ogrn + if mn_ogrn: + membership_by_ogrn.setdefault(str(mn_ogrn), []).append(summary) + + for organization in organizations: + seen: set[str] = set() + summaries = [] + for summary in ( + membership_by_inn.get(organization.inn, []) + + membership_by_ogrn.get(organization.ogrn, []) + + membership_by_ogrn.get(organization.ogrip, []) + ): + # One registry may be reached via several identifiers. + if summary.id in seen: + continue + seen.add(summary.id) + summaries.append(summary) + registries[str(organization.uid)] = summaries + + return registries + + +def _isoformat(value: date | datetime | None) -> str | None: + if value is None: + return None + return value.isoformat().replace("+00:00", "Z") + + +def _decimal_string(value: Any | None) -> str | None: + if value is None: + return None + return str(value) + + +def _financial_report_lines_by_year( + record: FinancialReport,
+) -> dict[str, dict[str, Any]]: + lines_by_year: dict[str, dict[str, Any]] = {} + for line in record.lines.all(): + year = str(line.year) + section = _financial_report_line_section(line) + lines_by_year.setdefault(year, {}).setdefault(section, {})[line.line_code] = { + "form_code": line.form_code, + "name": line.line_name, + "period_start": line.period_start, + "period_end": line.period_end, + } + return lines_by_year + + +def _financial_report_line_section(line: FinancialReportLine) -> str: + if line.form_code != "1": + return f"form_{line.form_code}" + + try: + line_code = int(line.line_code) + except ValueError: + return "balance" + + if 1000 <= line_code < 1300 or line_code == 1600: + return "active" + if 1300 <= line_code < 1600 or line_code == 1700: + return "passive" + return "balance" diff --git a/src/organizations/apps.py b/src/organizations/apps.py new file mode 100644 index 0000000..08c6a8d --- /dev/null +++ b/src/organizations/apps.py @@ -0,0 +1,9 @@ +from django.apps import AppConfig + + +class OrganizationsConfig(AppConfig): + """Конфигурация приложения организаций.""" + + default_auto_field = "django.db.models.BigAutoField" + name = "organizations" + verbose_name = "Организации" diff --git a/src/organizations/filters.py b/src/organizations/filters.py new file mode 100644 index 0000000..9059d38 --- /dev/null +++ b/src/organizations/filters.py @@ -0,0 +1,151 @@ +"""Filters for organizations API v2.""" + +from django.db.models import CharField, Exists, OuterRef, Q +from django.db.models.functions import Cast +from django_filters import rest_framework as filters +from registers.models import RegistryMembershipPeriod + +from organizations.api_enrichment import ( + DATA_PRESENCE_KEYS, + data_presence_identity_values, + to_internal_data_source, +) +from organizations.models import Organization + + +class OrganizationFilter(filters.FilterSet): + """Exact identifier filters plus partial name matching.""" + + name = filters.CharFilter(field_name="name", 
lookup_expr="icontains") + inn = filters.CharFilter(field_name="inn", lookup_expr="exact") + kpp = filters.CharFilter(field_name="kpp", lookup_expr="exact") + ogrn = filters.CharFilter(field_name="ogrn", lookup_expr="exact") + ogrip = filters.CharFilter(field_name="ogrip", lookup_expr="exact") + registry = filters.UUIDFilter(method="filter_registry") + registry_name = filters.CharFilter(method="filter_registry_name") + has_registry = filters.BooleanFilter(method="filter_has_registry") + has_industrial = filters.BooleanFilter(method="filter_data_presence") + has_industrial_products = filters.BooleanFilter(method="filter_data_presence") + has_manufactures = filters.BooleanFilter(method="filter_data_presence") + has_inspections = filters.BooleanFilter(method="filter_data_presence") + has_procurements = filters.BooleanFilter(method="filter_data_presence") + has_procurements_44fz = filters.BooleanFilter(method="filter_data_presence") + has_procurements_223fz = filters.BooleanFilter(method="filter_data_presence") + has_contracts = filters.BooleanFilter(method="filter_data_presence") + has_unfair_suppliers = filters.BooleanFilter(method="filter_data_presence") + has_fas_goz = filters.BooleanFilter(method="filter_data_presence") + has_arbitration = filters.BooleanFilter(method="filter_data_presence") + has_fedresurs_bankruptcy = filters.BooleanFilter(method="filter_data_presence") + has_fstec = filters.BooleanFilter(method="filter_data_presence") + has_trudvsem = filters.BooleanFilter(method="filter_data_presence") + has_vacancies = filters.BooleanFilter(method="filter_data_presence") + has_fns_reports = filters.BooleanFilter(method="filter_data_presence") + + class Meta: + model = Organization + fields = [ + "name", + "inn", + "kpp", + "ogrn", + "ogrip", + "registry", + "registry_name", + "has_registry", + "has_industrial", + "has_industrial_products", + "has_manufactures", + "has_inspections", + "has_procurements", + "has_procurements_44fz", + "has_procurements_223fz", + 
"has_contracts", + "has_unfair_suppliers", + "has_fas_goz", + "has_arbitration", + "has_fedresurs_bankruptcy", + "has_fstec", + "has_trudvsem", + "has_vacancies", + "has_fns_reports", + ] + + def filter_registry(self, queryset, _name, value): + return self._filter_by_registry_membership(queryset, registry_id=str(value)) + + def filter_registry_name(self, queryset, _name, value): + return self._filter_by_registry_membership(queryset, registry_name=value) + + def filter_has_registry(self, queryset, _name, value): + return self._filter_by_registry_membership(queryset, has_registry=value) + + def filter_data_presence(self, queryset, name, value): + source = to_internal_data_source(name.removeprefix("has_")) + if source not in DATA_PRESENCE_KEYS: + return queryset.none() + + inn_values, ogrn_values = data_presence_identity_values(source) + filtered = self._filter_by_registry_identities( + queryset, inn_values, ogrn_values + ) + if value: + return filtered + return queryset.exclude(uid__in=filtered.values("uid")) + + @staticmethod + def _filter_by_registry_identities( + queryset, inn_values: set[str], ogrn_values: set[str] + ): + if not inn_values and not ogrn_values: + return queryset.none() + + query = Q() + if inn_values: + query |= Q(inn__in=inn_values) + if ogrn_values: + query |= Q(ogrn__in=ogrn_values) | Q(ogrip__in=ogrn_values) + return queryset.filter(query) + + @classmethod + def _filter_by_registry_membership( + cls, + queryset, + *, + registry_id: str | None = None, + registry_name: str | None = None, + has_registry: bool = True, + ): + membership = cls._registry_membership_subquery( + registry_id=registry_id, + registry_name=registry_name, + ) + return queryset.annotate(_has_registry=Exists(membership)).filter( + _has_registry=has_registry + ) + + @staticmethod + def _registry_membership_subquery( + *, + registry_id: str | None = None, + registry_name: str | None = None, + ): + membership = RegistryMembershipPeriod.objects.filter( + ended_at__isnull=True, + 
).annotate( + organization_inn_text=Cast( + "organization__mn_inn", output_field=CharField() + ), + organization_ogrn_text=Cast( + "organization__mn_ogrn", output_field=CharField() + ), + ) + if registry_id: + membership = membership.filter(registry_id=registry_id) + if registry_name: + membership = membership.filter(registry__name__icontains=registry_name) + + return membership.filter( + Q(organization_inn_text=OuterRef("inn")) + | Q(organization_ogrn_text=OuterRef("ogrn")) + | Q(organization_ogrn_text=OuterRef("ogrip")) + ) diff --git a/src/organizations/management/__init__.py b/src/organizations/management/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/organizations/management/__init__.py @@ -0,0 +1 @@ + diff --git a/src/organizations/management/commands/__init__.py b/src/organizations/management/commands/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/src/organizations/management/commands/__init__.py @@ -0,0 +1 @@ + diff --git a/src/organizations/management/commands/populate_organizations.py b/src/organizations/management/commands/populate_organizations.py new file mode 100644 index 0000000..38a53ea --- /dev/null +++ b/src/organizations/management/commands/populate_organizations.py @@ -0,0 +1,31 @@ +"""Populate canonical organizations from existing source tables.""" + +from __future__ import annotations + +import json + +from apps.core.management.commands.base import BaseAppCommand + +from organizations.services import OrganizationPopulationService + + +class Command(BaseAppCommand): + """Populate organizations directory from current database records.""" + + help = "Заполняет справочник organizations.Organization из существующих данных БД" + use_transaction = True + + def execute_command(self, *args, **options) -> str: + result = OrganizationPopulationService.populate() + rendered = json.dumps( + { + "scanned": result.scanned, + "created": result.created, + "updated": result.updated, + "skipped": 
result.skipped, + }, + ensure_ascii=False, + sort_keys=True, + ) + self.log_success(rendered) + return rendered diff --git a/src/organizations/management/commands/refresh_organization_data_snapshots.py b/src/organizations/management/commands/refresh_organization_data_snapshots.py new file mode 100644 index 0000000..aee0278 --- /dev/null +++ b/src/organizations/management/commands/refresh_organization_data_snapshots.py @@ -0,0 +1,49 @@ +"""Refresh precomputed organization data snapshots.""" + +from __future__ import annotations + +import json + +from apps.core.management.commands.base import BaseAppCommand + +from organizations.services import OrganizationDataSnapshotRefreshService + + +class Command(BaseAppCommand): + """Refresh organizations_data_snapshot rows for API v2.""" + + help = "Пересобирает organizations_data_snapshot для API v2" + use_transaction = False + + def add_arguments(self, parser) -> None: + super().add_arguments(parser) + parser.add_argument( + "--uid", + action="append", + dest="uids", + default=None, + help="UID организации. Можно передать несколько раз.", + ) + parser.add_argument( + "--batch-size", + type=int, + default=100, + help="Размер пачки организаций для пересборки.", + ) + + def execute_command(self, *args, **options) -> str: + result = OrganizationDataSnapshotRefreshService.refresh( + organization_uids=options.get("uids"), + batch_size=options["batch_size"], + ) + rendered = json.dumps( + { + "processed": result.processed, + "created": result.created, + "updated": result.updated, + }, + ensure_ascii=False, + sort_keys=True, + ) + self.log_success(rendered) + return rendered diff --git a/src/organizations/migrations/0001_initial.py b/src/organizations/migrations/0001_initial.py new file mode 100644 index 0000000..b12fabf --- /dev/null +++ b/src/organizations/migrations/0001_initial.py @@ -0,0 +1,131 @@ +# Generated manually for organizations app. 
+ +import uuid + +from django.db import migrations, models +from django.db.models import Q + + +class Migration(migrations.Migration): + initial = True + + dependencies = [] + + operations = [ + migrations.CreateModel( + name="Organization", + fields=[ + ( + "uid", + models.UUIDField( + default=uuid.uuid4, + editable=False, + primary_key=True, + serialize=False, + verbose_name="UID", + ), + ), + ( + "name", + models.CharField( + db_index=True, + help_text="Наименование организации или ИП", + max_length=1024, + verbose_name="наименование", + ), + ), + ( + "inn", + models.CharField( + blank=True, + db_index=True, + help_text="ИНН ЮЛ или ИП", + max_length=12, + verbose_name="ИНН", + ), + ), + ( + "kpp", + models.CharField( + blank=True, + db_index=True, + help_text="КПП только для юридических лиц", + max_length=9, + verbose_name="КПП", + ), + ), + ( + "ogrn", + models.CharField( + blank=True, + db_index=True, + help_text="ОГРН только для юридических лиц", + max_length=13, + verbose_name="ОГРН", + ), + ), + ( + "ogrip", + models.CharField( + blank=True, + db_index=True, + help_text="ОГРИП только для индивидуальных предпринимателей", + max_length=15, + verbose_name="ОГРИП", + ), + ), + ], + options={ + "verbose_name": "организация", + "verbose_name_plural": "организации", + "db_table": "organizations_organization", + "ordering": ["name"], + }, + ), + migrations.AddIndex( + model_name="organization", + index=models.Index(fields=["inn", "kpp"], name="organizatio_inn_fc65af_idx"), + ), + migrations.AddIndex( + model_name="organization", + index=models.Index(fields=["inn", "ogrn"], name="organizatio_inn_75c78b_idx"), + ), + migrations.AddIndex( + model_name="organization", + index=models.Index( + fields=["inn", "ogrip"], + name="organizatio_inn_88504f_idx", + ), + ), + migrations.AddConstraint( + model_name="organization", + constraint=models.UniqueConstraint( + condition=~Q(inn=""), + fields=("inn",), + name="unique_organizations_inn_not_blank", + ), + ), + 
migrations.AddConstraint( + model_name="organization", + constraint=models.UniqueConstraint( + condition=~Q(ogrn=""), + fields=("ogrn",), + name="unique_organizations_ogrn_not_blank", + ), + ), + migrations.AddConstraint( + model_name="organization", + constraint=models.UniqueConstraint( + condition=~Q(ogrip=""), + fields=("ogrip",), + name="unique_organizations_ogrip_not_blank", + ), + ), + migrations.AddConstraint( + model_name="organization", + constraint=models.CheckConstraint( + check=Q(ogrip="") | (Q(kpp="") & Q(ogrn="")), + name="check_entrepreneur_has_no_kpp_ogrn", + ), + ), + ] diff --git a/src/organizations/migrations/0002_organization_data_snapshot.py b/src/organizations/migrations/0002_organization_data_snapshot.py new file mode 100644 index 0000000..05703a0 --- /dev/null +++ b/src/organizations/migrations/0002_organization_data_snapshot.py @@ -0,0 +1,57 @@ +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ("organizations", "0001_initial"), + ] + + operations = [ + migrations.CreateModel( + name="OrganizationDataSnapshot", + fields=[ + ( + "organization", + models.OneToOneField( + on_delete=django.db.models.deletion.CASCADE, + primary_key=True, + related_name="data_snapshot", + serialize=False, + to="organizations.organization", + verbose_name="организация", + ), + ), + ( + "data", + models.JSONField( + default=dict, + help_text="Готовый JSON data для API v2", + verbose_name="данные источников", + ), + ), + ( + "registries", + models.JSONField( + default=list, + help_text="Готовый JSON registries для API v2", + verbose_name="реестры", + ), + ), + ( + "updated_at", + models.DateTimeField( + auto_now=True, + db_index=True, + verbose_name="дата обновления", + ), + ), + ], + options={ + "verbose_name": "снапшот данных организации", + "verbose_name_plural": "снапшоты данных организаций", + "db_table": "organizations_data_snapshot", + }, + ), + ] diff --git 
class Organization(models.Model):
    """Canonical organization without source-specific relations.

    Every identifier column is optional and stored as a string; a blank
    string means "unknown". Uniqueness is enforced only through the partial
    constraints declared in ``Meta.constraints``.
    """

    # UUID primary key generated in Python, not editable through forms.
    uid = models.UUIDField(
        _("UID"),
        primary_key=True,
        default=uuid.uuid4,
        editable=False,
    )
    # Raw name as stored; see ``normalized_name`` for the derived display form.
    name = models.CharField(
        _("наименование"),
        max_length=1024,
        db_index=True,
        help_text=_("Наименование организации или ИП"),
    )
    inn = models.CharField(
        _("ИНН"),
        max_length=12,
        blank=True,
        db_index=True,
        help_text=_("ИНН ЮЛ или ИП"),
    )
    kpp = models.CharField(
        _("КПП"),
        max_length=9,
        blank=True,
        db_index=True,
        help_text=_("КПП только для юридических лиц"),
    )
    ogrn = models.CharField(
        _("ОГРН"),
        max_length=13,
        blank=True,
        db_index=True,
        help_text=_("ОГРН только для юридических лиц"),
    )
    ogrip = models.CharField(
        _("ОГРИП"),
        max_length=15,
        blank=True,
        db_index=True,
        help_text=_("ОГРИП только для индивидуальных предпринимателей"),
    )

    class Meta:
        db_table = "organizations_organization"
        verbose_name = _("организация")
        verbose_name_plural = _("организации")
        ordering = ["name"]
        # Composite indexes pairing INN with each of the other identifiers.
        indexes = [
            models.Index(fields=["inn", "kpp"]),
            models.Index(fields=["inn", "ogrn"]),
            models.Index(fields=["inn", "ogrip"]),
        ]
        constraints = [
            # (inn, kpp) is unique when both are non-blank, so several rows
            # may share one INN as long as their KPP values differ.
            models.UniqueConstraint(
                fields=["inn", "kpp"],
                condition=~Q(inn="") & ~Q(kpp=""),
                name="unique_org_inn_kpp_not_blank",
            ),
            # Among rows with a blank KPP and a blank OGRIP, INN is unique.
            models.UniqueConstraint(
                fields=["inn"],
                condition=~Q(inn="") & Q(kpp="") & Q(ogrip=""),
                name="unique_org_inn_without_kpp",
            ),
            # A non-blank OGRIP is globally unique.
            models.UniqueConstraint(
                fields=["ogrip"],
                condition=~Q(ogrip=""),
                name="unique_organizations_ogrip_not_blank",
            ),
            # A row with a non-blank OGRIP must have blank KPP and OGRN.
            models.CheckConstraint(
                check=Q(ogrip="") | (Q(kpp="") & Q(ogrn="")),
                name="check_entrepreneur_has_no_kpp_ogrn",
            ),
        ]

    def __str__(self) -> str:
        """Return the name, followed by the first non-blank of INN, OGRN, OGRIP."""
        identifier = self.inn or self.ogrn or self.ogrip
        if identifier:
            return f"{self.name} ({identifier})"
        return self.name

    @property
    def normalized_name(self) -> str:
        """Return ``name`` passed through ``normalize_organization_name``.

        Computed on every access; nothing is stored on the row.
        """
        return normalize_organization_name(self.name)
# (regex source, abbreviation) pairs in application order. Longer phrases are
# listed before the shorter phrases they contain, because the substitutions in
# ``normalize_organization_name`` run one after another.
_LEGAL_FORM_SOURCES = (
    (r"\bнаучно[-\s]+производственное\s+предприятие\b", "НПП"),
    (r"\bнаучно[-\s]+производственное\s+объединение\b", "НПО"),
    (r"\bнаучно[-\s]+производственная\s+фирма\b", "НПФ"),
    (r"\bнаучно[-\s]+исследовательский\s+институт\b", "НИИ"),
    (r"\bфедеральное\s+государственное\s+унитарное\s+предприятие\b", "ФГУП"),
    (r"\bгосударственное\s+унитарное\s+предприятие\b", "ГУП"),
    (r"\bмуниципальное\s+унитарное\s+предприятие\b", "МУП"),
    (r"\bавтономная\s+некоммерческая\s+организация\b", "АНО"),
    (r"\bпубличное\s+акционерное\s+общество\b", "ПАО"),
    (r"\bнепубличное\s+акционерное\s+общество\b", "НАО"),
    (r"\bзакрытое\s+акционерное\s+общество\b", "ЗАО"),
    (r"\bоткрытое\s+акционерное\s+общество\b", "ОАО"),
    (r"\bобщество\s+с\s+ограниченной\s+ответственностью\b", "ООО"),
    (r"\bакционерное\s+общество\b", "АО"),
    (r"\bиндивидуальный\s+предприниматель\b", "ИП"),
)

# Compiled, case-insensitive (pattern, abbreviation) pairs.
LEGAL_FORM_REPLACEMENTS: tuple[tuple[re.Pattern[str], str], ...] = tuple(
    (re.compile(source, flags=re.IGNORECASE), abbreviation)
    for source, abbreviation in _LEGAL_FORM_SOURCES
)

# Abbreviations that are always rendered fully upper-cased.
UPPERCASE_TOKENS = {
    "АНО", "АО", "ГУП", "ЗАО", "ИП",
    "МУП", "НАО", "НИИ", "НПО", "НПП",
    "НПФ", "ОАО", "ООО", "ПАО", "ФГУП",
}

WORD_RE = re.compile(r"[A-Za-zА-Яа-яЁё]+")
WHITESPACE_RE = re.compile(r"\s+")


def normalize_organization_name(name: str) -> str:
    """Return a compact display name without changing the stored raw name.

    Steps: collapse whitespace runs, replace spelled-out legal forms with
    their abbreviations, collapse whitespace again, then fix the letter case
    of every Latin/Cyrillic word via ``_normalize_word``.
    """

    def _squeeze(text: str) -> str:
        return WHITESPACE_RE.sub(" ", text).strip()

    compact = _squeeze(name)
    for legal_form_re, abbreviation in LEGAL_FORM_REPLACEMENTS:
        compact = legal_form_re.sub(abbreviation, compact)
    return WORD_RE.sub(_normalize_word, _squeeze(compact))


def _normalize_word(match: re.Match[str]) -> str:
    """Return the replacement for one word matched by ``WORD_RE``.

    Known abbreviations are upper-cased, other all-caps words become
    "Capitalized", and everything else is returned untouched.
    """
    token = match.group(0)
    shouted = token.upper()
    if shouted in UPPERCASE_TOKENS:
        return shouted
    if not _is_all_caps_word(token):
        return token
    return token[:1].upper() + token[1:].lower()


def _is_all_caps_word(word: str) -> bool:
    """Return True when ``word`` has at least one letter and all letters are upper-case."""
    seen_letter = False
    for char in word:
        if not char.isalpha():
            continue
        if not char.isupper():
            return False
        seen_letter = True
    return seen_letter
def _data_source_summary(data: dict[str, Any]) -> list[dict[str, int | str]]:
    """Summarise ``data`` as ``{"source", "count"}`` entries sorted by source key.

    A list value counts its items, any other truthy value counts as one, and
    sources whose count is zero are left out.
    """

    def _entry_count(value: Any) -> int:
        if isinstance(value, list):
            return len(value)
        return 1 if value else 0

    counted = ((source, _entry_count(data[source])) for source in sorted(data))
    return [
        {"source": source, "count": count}
        for source, count in counted
        if count
    ]
# Quote characters that are replaced with spaces before names are compared.
_QUOTE_CHARS = "\"'«»„“”"

# Legal-form spellings removed from names before matching. The patterns are
# substituted one after another in this order, so every multi-word phrase must
# come BEFORE any shorter phrase it contains: if "акционерное общество" ran
# first, "публичное акционерное общество X" would be reduced to "публичное X"
# and would never match "ПАО X".
_LEGAL_FORM_PATTERNS = (
    r"\bобщество\s+с\s+ограниченной\s+ответственностью\b",
    r"\bооо\b",
    r"\booo\b",
    r"\bпубличное\s+акционерное\s+общество\b",
    r"\bпао\b",
    r"\bpao\b",
    r"\bзакрытое\s+акционерное\s+общество\b",
    r"\bзао\b",
    r"\bzao\b",
    r"\bакционерное\s+общество\b",
    r"\bао\b",
    r"\bao\b",
    r"\bиндивидуальный\s+предприниматель\b",
    r"\bип\b",
)

# Lower-case prefixes (with trailing space) that mark a name as starting with
# an abbreviated legal form.
_ABBREVIATED_PREFIXES = (
    "ооо ",
    "ooo ",
    "ао ",
    "ao ",
    "пао ",
    "pao ",
    "зао ",
    "zao ",
    "ип ",
)
    @classmethod
    def refresh(
        cls,
        *,
        organization_uids: Iterable[str] | None = None,
        batch_size: int = 100,
    ) -> RefreshOrganizationDataSnapshotsResult:
        """Rebuild ``OrganizationDataSnapshot`` rows, creating or updating as needed.

        Args:
            organization_uids: Restrict the refresh to these organization UIDs.
                ``None`` refreshes every organization; an empty iterable
                refreshes none.
            batch_size: Number of organizations enriched per batch; also
                passed to ``bulk_create`` / ``bulk_update``.

        Returns:
            Counters of processed organizations and created/updated snapshots.
        """
        # Order by primary key before batching.
        queryset = Organization.objects.all().order_by("uid")
        if organization_uids is not None:
            queryset = queryset.filter(uid__in=list(organization_uids))

        processed = 0
        created = 0
        updated = 0

        for organizations in cls._iter_batches(queryset, batch_size):
            # One enrichment call and one snapshot query per batch.
            enrichment = OrganizationApiEnrichmentService.build_for(organizations)
            existing_snapshots = {
                str(snapshot.organization_id): snapshot
                for snapshot in OrganizationDataSnapshot.objects.filter(
                    organization_id__in=[
                        organization.uid for organization in organizations
                    ]
                )
            }

            create_instances: list[OrganizationDataSnapshot] = []
            update_instances: list[OrganizationDataSnapshot] = []
            for organization in organizations:
                processed += 1
                # NOTE(review): assumes build_for() returns an entry for every
                # organization passed in; a missing key raises KeyError here.
                item = enrichment[str(organization.uid)]
                data = item.data_presence
                registries = [
                    {
                        "id": registry.id,
                        "name": registry.name,
                    }
                    for registry in item.registries
                ]

                snapshot = existing_snapshots.get(str(organization.uid))
                if snapshot is None:
                    create_instances.append(
                        OrganizationDataSnapshot(
                            organization=organization,
                            data=data,
                            registries=registries,
                        )
                    )
                    continue

                snapshot.data = data
                snapshot.registries = registries
                # bulk_update() does not call save(), so the auto_now timestamp
                # has to be set by hand.
                snapshot.updated_at = timezone.now()
                update_instances.append(snapshot)

            if create_instances:
                OrganizationDataSnapshot.objects.bulk_create(
                    create_instances,
                    batch_size=batch_size,
                )
                created += len(create_instances)
            if update_instances:
                OrganizationDataSnapshot.objects.bulk_update(
                    update_instances,
                    fields=["data", "registries", "updated_at"],
                    batch_size=batch_size,
                )
                updated += len(update_instances)

        return RefreshOrganizationDataSnapshotsResult(
            processed=processed,
            created=created,
            updated=updated,
        )
source: str, + batch_id: int, + batch_size: int = 100, + ) -> RefreshOrganizationDataSnapshotsResult: + organization_uids = cls.organization_uids_for_parser_batch( + source=source, + batch_id=batch_id, + ) + return cls.refresh( + organization_uids=organization_uids, + batch_size=batch_size, + ) + + @classmethod + def organization_uids_for_parser_batch( + cls, + *, + source: str, + batch_id: int, + ) -> list[str]: + inn_values, ogrn_values = cls._parser_batch_identities( + source=source, + batch_id=batch_id, + ) + if not inn_values and not ogrn_values: + return [] + + query = Q() + if inn_values: + query |= Q(inn__in=inn_values) + if ogrn_values: + query |= Q(ogrn__in=ogrn_values) | Q(ogrip__in=ogrn_values) + return [ + str(uid) + for uid in Organization.objects.filter(query).values_list("uid", flat=True) + ] + + @staticmethod + def _parser_batch_identities( + *, + source: str, + batch_id: int, + ) -> tuple[set[str], set[str]]: + if source == ParserLoadLog.Source.INDUSTRIAL: + return _identity_values( + IndustrialCertificateRecord.objects.filter(load_batch=batch_id), + inn_field="inn", + ogrn_field="ogrn", + ) + if source == ParserLoadLog.Source.INDUSTRIAL_PRODUCTS: + return _identity_values( + IndustrialProductRecord.objects.filter(load_batch=batch_id), + inn_field="inn", + ogrn_field="ogrn", + ) + if source == ParserLoadLog.Source.MANUFACTURES: + return _identity_values( + ManufacturerRecord.objects.filter(load_batch=batch_id), + inn_field="inn", + ogrn_field="ogrn", + ) + if source == ParserLoadLog.Source.INSPECTIONS: + return _identity_values( + InspectionRecord.objects.filter(load_batch=batch_id), + inn_field="inn", + ogrn_field="ogrn", + ) + if source == ParserLoadLog.Source.PROCUREMENTS: + return _identity_values( + ProcurementRecord.objects.filter(load_batch=batch_id), + inn_field="customer_inn", + ogrn_field="customer_ogrn", + ) + if source == ParserLoadLog.Source.FNS_REPORTS: + return ( + set(), + set( + FinancialReport.objects.filter(load_batch=batch_id) 
def normalize_identifier(value: str | int | None, *, max_length: int) -> str:
    """Return an ASCII digits-only identifier bounded by the target field length.

    Every decimal digit found in ``str(value)`` is kept and every other
    character is dropped. Non-ASCII decimal digits (full-width, Arabic-Indic
    and so on) are converted to their ASCII equivalents, so the same number
    always yields the same string whichever script the source used.

    Args:
        value: Raw identifier; ``None`` is treated as missing.
        max_length: Maximum number of digits the target field accepts.

    Returns:
        The digit string, or ``""`` when ``value`` is ``None``, contains no
        digits, or has more than ``max_length`` digits.
    """
    if value is None:
        return ""

    # str.isdecimal() accepts the Unicode "Nd" category, the same set the
    # regex class \d matches; int() maps each such character to 0-9.
    normalized = "".join(
        str(int(char)) for char in str(value) if char.isdecimal()
    )
    if not normalized or len(normalized) > max_length:
        return ""
    return normalized
    @classmethod
    def populate(cls) -> PopulateOrganizationsResult:
        """Create or enrich ``Organization`` rows from every source candidate.

        All existing organizations are loaded into in-memory indexes, each
        candidate from ``iter_candidates`` is matched against them, and the
        resulting inserts and updates are written in bulk inside a single
        transaction.

        Returns:
            Counters: candidates scanned, organizations created, successful
            field merges (``updated``) and candidates skipped because their
            normalized name is empty.
        """
        scanned = 0
        created = 0
        updated = 0
        skipped = 0

        with transaction.atomic():
            existing = list(Organization.objects.all())
            lookup = cls._build_lookup(existing)
            create_instances: list[Organization] = []
            # Keyed by uid so an organization merged several times is written once.
            update_instances_by_uid: dict[str, Organization] = {}

            for candidate in cls.iter_candidates():
                scanned += 1
                # Candidates whose name normalizes to an empty string are skipped.
                if not normalize_organization_name(candidate.name):
                    skipped += 1
                    continue

                organization = cls._find_existing(lookup, candidate)
                if organization is None:
                    organization = Organization(
                        name=candidate.name.strip(),
                        inn=candidate.inn,
                        kpp=candidate.kpp,
                        ogrn=candidate.ogrn,
                        ogrip=candidate.ogrip,
                    )
                    existing.append(organization)
                    create_instances.append(organization)
                    # Index the unsaved instance right away so that later
                    # candidates in this run can match it.
                    cls._index_organization(lookup, organization)
                    created += 1
                    continue

                if cls._assign_existing_fields(organization, candidate):
                    # Re-index: newly filled identifiers add lookup keys.
                    cls._index_organization(lookup, organization)
                    update_instances_by_uid[str(organization.uid)] = organization
                    # Counts merges, not distinct organizations: one
                    # organization can be counted more than once, and an
                    # organization created earlier in this run can also be
                    # counted (and queued for bulk_update) here.
                    updated += 1

            if create_instances:
                Organization.objects.bulk_create(create_instances, batch_size=1000)
            update_instances = list(update_instances_by_uid.values())
            if update_instances:
                Organization.objects.bulk_update(
                    update_instances,
                    fields=["name", "inn", "kpp", "ogrn", "ogrip"],
                    batch_size=1000,
                )

        return PopulateOrganizationsResult(
            scanned=scanned,
            created=created,
            updated=updated,
            skipped=skipped,
        )
+ for row in RegisterOrganization.objects.iterator(): + yield cls._candidate( + name=row.pn_name, + inn=row.mn_inn, + kpp=row.in_kpp, + ogrn=row.mn_ogrn, + ) + + for row in IndustrialCertificateRecord.objects.iterator(): + yield cls._candidate( + name=row.organisation_name, + inn=row.inn, + ogrn=row.ogrn, + ) + + for row in ManufacturerRecord.objects.iterator(): + yield cls._candidate( + name=row.full_legal_name, + inn=row.inn, + ogrn=row.ogrn, + ) + + for row in IndustrialProductRecord.objects.iterator(): + yield cls._candidate( + name=row.full_organisation_name, + inn=row.inn, + ogrn=row.ogrn, + ) + + for row in GenericParserRecord.objects.iterator(): + yield cls._candidate( + name=row.organisation_name or row.title, + inn=row.inn, + kpp=cls._payload_kpp(row.payload), + ogrn=row.ogrn, + ) + + for row in InspectionRecord.objects.iterator(): + yield cls._candidate( + name=row.organisation_name, + inn=row.inn, + ogrn=row.ogrn, + ) + + for row in ProcurementRecord.objects.iterator(): + yield cls._candidate( + name=row.customer_name, + inn=row.customer_inn, + kpp=row.customer_kpp, + ogrn=row.customer_ogrn, + ) + + @staticmethod + def _candidate( + *, + name: str | None, + inn: str | int | None = None, + kpp: str | int | None = None, + ogrn: str | int | None = None, + ) -> OrganizationCandidate: + normalized_inn = normalize_identifier(inn, max_length=12) + normalized_ogrn = normalize_identifier(ogrn, max_length=15) + ogrip = ( + normalized_ogrn + if len(normalized_ogrn) == 15 and len(normalized_inn) == 12 + else "" + ) + legal_ogrn = normalized_ogrn if len(normalized_ogrn) == 13 else "" + + return OrganizationCandidate( + name=(name or "").strip(), + inn=normalized_inn, + kpp="" if ogrip else normalize_identifier(kpp, max_length=9), + ogrn=legal_ogrn, + ogrip=ogrip, + ) + + @classmethod + def _payload_kpp(cls, payload: object) -> str: + if not isinstance(payload, dict): + return "" + + company = payload.get("company") + if isinstance(company, dict): + company_kpp = 
normalize_identifier(company.get("kpp"), max_length=9) + if company_kpp: + return company_kpp + + return cls._find_payload_identifier(payload, {"kpp", "кпп"}, max_length=9) + + @classmethod + def _find_payload_identifier( + cls, + value: object, + keys: set[str], + *, + max_length: int, + ) -> str: + if isinstance(value, dict): + for key, item in value.items(): + if str(key).strip().lower() in keys: + identifier = normalize_identifier(item, max_length=max_length) + if identifier: + return identifier + nested = cls._find_payload_identifier(item, keys, max_length=max_length) + if nested: + return nested + elif isinstance(value, list): + for item in value: + nested = cls._find_payload_identifier(item, keys, max_length=max_length) + if nested: + return nested + return "" + + @classmethod + def _build_lookup(cls, organizations: list[Organization]) -> OrganizationLookup: + lookup = OrganizationLookup( + by_inn_kpp={}, + by_ogrn_kpp={}, + by_inn={}, + by_ogrn={}, + by_ogrip={}, + by_normalized_name={}, + ) + for organization in organizations: + cls._index_organization(lookup, organization) + return lookup + + @staticmethod + def _index_organization( + lookup: OrganizationLookup, + organization: Organization, + ) -> None: + if organization.inn and organization.kpp: + lookup.by_inn_kpp.setdefault( + (organization.inn, organization.kpp), organization + ) + if organization.ogrn and organization.kpp: + lookup.by_ogrn_kpp.setdefault( + (organization.ogrn, organization.kpp), organization + ) + if organization.inn: + lookup.by_inn.setdefault(organization.inn, []) + if organization not in lookup.by_inn[organization.inn]: + lookup.by_inn[organization.inn].append(organization) + if organization.ogrn: + lookup.by_ogrn.setdefault(organization.ogrn, []) + if organization not in lookup.by_ogrn[organization.ogrn]: + lookup.by_ogrn[organization.ogrn].append(organization) + if organization.ogrip: + lookup.by_ogrip.setdefault(organization.ogrip, organization) + + normalized_name = 
normalize_organization_name(organization.name) + if normalized_name: + lookup.by_normalized_name.setdefault(normalized_name, organization) + + @staticmethod + def _find_exact_identifier_match( + lookup: OrganizationLookup, + candidate: OrganizationCandidate, + ) -> Organization | None: + if candidate.inn and candidate.kpp: + organization = lookup.by_inn_kpp.get((candidate.inn, candidate.kpp)) + if organization is not None: + return organization + if candidate.ogrn and candidate.kpp: + organization = lookup.by_ogrn_kpp.get((candidate.ogrn, candidate.kpp)) + if organization is not None: + return organization + if candidate.ogrip and candidate.ogrip in lookup.by_ogrip: + return lookup.by_ogrip[candidate.ogrip] + return None + + @staticmethod + def _find_blank_kpp_match( + lookup: OrganizationLookup, + candidate: OrganizationCandidate, + ) -> Organization | None: + if candidate.inn and candidate.kpp: + blank_kpp_matches = [ + organization + for organization in lookup.by_inn.get(candidate.inn, []) + if not organization.kpp + ] + if len(blank_kpp_matches) == 1: + return blank_kpp_matches[0] + if candidate.ogrn and candidate.kpp: + blank_kpp_matches = [ + organization + for organization in lookup.by_ogrn.get(candidate.ogrn, []) + if not organization.kpp + ] + if len(blank_kpp_matches) == 1: + return blank_kpp_matches[0] + return None + + @staticmethod + def _find_single_identifier_match( + lookup: OrganizationLookup, + candidate: OrganizationCandidate, + ) -> Organization | None: + if candidate.inn and not candidate.kpp: + organizations = lookup.by_inn.get(candidate.inn, []) + if len(organizations) == 1: + return organizations[0] + if candidate.ogrn and not candidate.kpp: + organizations = lookup.by_ogrn.get(candidate.ogrn, []) + if len(organizations) == 1: + return organizations[0] + return None + + @staticmethod + def _find_name_match( + lookup: OrganizationLookup, + candidate: OrganizationCandidate, + ) -> Organization | None: + candidate_name = 
normalize_organization_name(candidate.name) + if not candidate_name: + return None + return lookup.by_normalized_name.get(candidate_name) + + @classmethod + def _find_existing( + cls, + lookup: OrganizationLookup, + candidate: OrganizationCandidate, + ) -> Organization | None: + organization = cls._find_exact_identifier_match(lookup, candidate) + if organization is not None: + return organization + + organization = cls._find_blank_kpp_match(lookup, candidate) + if organization is not None: + return organization + + organization = cls._find_single_identifier_match(lookup, candidate) + if organization is not None: + return organization + + if candidate.kpp and (candidate.inn or candidate.ogrn): + return None + return cls._find_name_match(lookup, candidate) + + @classmethod + def _update_existing( + cls, + organization: Organization, + candidate: OrganizationCandidate, + ) -> bool: + if not cls._assign_existing_fields(organization, candidate): + return False + + organization.save(update_fields=["name", "inn", "kpp", "ogrn", "ogrip"]) + return True + + @classmethod + def _assign_existing_fields( + cls, + organization: Organization, + candidate: OrganizationCandidate, + ) -> bool: + changed = False + + selected_name = cls._select_name(organization.name, candidate.name) + if selected_name != organization.name: + organization.name = selected_name + changed = True + + for field_name in ("inn", "kpp", "ogrn", "ogrip"): + if getattr(organization, field_name): + continue + + if field_name == "ogrip" and (organization.kpp or organization.ogrn): + continue + if field_name in {"kpp", "ogrn"} and organization.ogrip: + continue + + candidate_value = getattr(candidate, field_name) + if candidate_value: + setattr(organization, field_name, candidate_value) + changed = True + + return changed + + @staticmethod + def _select_name(current: str, candidate: str) -> str: + current_clean = current.strip() + candidate_clean = candidate.strip() + if not candidate_clean: + return current_clean 
@shared_task
def refresh_organization_data_snapshots_for_parser_batch(
    *,
    source: str,
    batch_id: int,
    batch_size: int = 100,
) -> dict:
    """Refresh snapshots for organizations affected by one parser batch.

    Delegates to ``OrganizationDataSnapshotRefreshService``, clears the cache,
    logs the counters and returns them as a plain dict.
    """
    counters = OrganizationDataSnapshotRefreshService.refresh_for_parser_batch(
        source=source,
        batch_id=batch_id,
        batch_size=batch_size,
    )

    # NOTE(review): cache.clear() empties the entire default cache, not only
    # organization-related keys. Presumably meant to invalidate cached
    # organizations API responses - confirm nothing else shares this cache.
    cache.clear()

    payload = dict(
        source=source,
        batch_id=batch_id,
        processed=counters.processed,
        created=counters.created,
        updated=counters.updated,
    )
    logger.info("Organization data snapshots refreshed: %s", payload)
    return payload
def _query_parameter(
    name: str,
    *,
    description: str,
    param_type: str = openapi.TYPE_STRING,
    default: str | int | bool | None = None,
    enum: list[str] | None = None,
    format_: str | None = None,
) -> openapi.Parameter:
    """Build an optional query-string ``openapi.Parameter``.

    The parameter is always declared ``in=query`` and ``required=False``; the
    remaining arguments are forwarded unchanged.
    """
    options = {
        "name": name,
        "in_": openapi.IN_QUERY,
        "type": param_type,
        "required": False,
        "description": description,
        "default": default,
        "enum": enum,
        "format": format_,
    }
    return openapi.Parameter(**options)
источниками. " + f"Допустимые значения: {ORGANIZATION_DATA_SOURCE_KEYS}. " + "Можно передать несколько параметров или CSV-строку." + ), + ), + _query_parameter( + "data_sources", + description=( + "Alias параметра data. Оставлен для явного указания набора источников." + ), + ), + _query_parameter( + "exclude_data", + description=( + "Исключить один или несколько источников из блока data. " + f"Допустимые значения: {ORGANIZATION_DATA_SOURCE_KEYS}." + ), + ), + _query_parameter( + "exclude_data_sources", + description=( + "Alias параметра exclude_data. Можно передать несколько значений " + "или CSV-строку." + ), + ), +] +ORGANIZATION_LIST_PARAMS = [ + _query_parameter( + "page", + description="Номер страницы пагинации.", + param_type=openapi.TYPE_INTEGER, + default=1, + ), + _query_parameter( + "page_size", + description="Размер страницы. Максимум 100.", + param_type=openapi.TYPE_INTEGER, + default=20, + ), + _query_parameter( + "search", + description="Полнотекстовый поиск по наименованию, ИНН, КПП, ОГРН и ОГРИП.", + ), + _query_parameter( + "ordering", + description=( + "Сортировка по uid, name, inn, kpp, ogrn или ogrip. " + "Префикс '-' включает обратный порядок." + ), + ), + _query_parameter("name", description="Фильтр по части полного наименования."), + _query_parameter("inn", description="Точный фильтр по ИНН."), + _query_parameter("kpp", description="Точный фильтр по КПП."), + _query_parameter("ogrn", description="Точный фильтр по ОГРН."), + _query_parameter("ogrip", description="Точный фильтр по ОГРИП."), + _query_parameter( + "registry", + description="UUID реестра. Возвращает организации из активного участия.", + format_=openapi.FORMAT_UUID, + ), + _query_parameter( + "registry_name", + description="Фильтр по части наименования реестра.", + ), + _query_parameter( + "has_registry", + description=( + "Фильтр наличия активного участия в любом реестре; по умолчанию true " + "для list endpoint, если параметр не передан." 
+ ), + param_type=openapi.TYPE_BOOLEAN, + default=True, + ), + *[ + _query_parameter( + f"has_{source}", + description=f"Фильтр наличия данных источника {source}.", + param_type=openapi.TYPE_BOOLEAN, + ) + for source in sorted(API_DATA_SOURCE_KEY_SET) + ], + *ORGANIZATION_DATA_PARAMS, +] +ORGANIZATION_DETAIL_PARAMS = [ + openapi.Parameter( + name="uid", + in_=openapi.IN_PATH, + type=openapi.TYPE_STRING, + format=openapi.FORMAT_UUID, + required=True, + description="UID организации.", + ), + *ORGANIZATION_DATA_PARAMS, +] +ORGANIZATION_SCHEMA = openapi.Schema( + type=openapi.TYPE_OBJECT, + required=["uid", "name", "inn", "data", "data_sources", "registries"], + properties={ + "uid": openapi.Schema(type=openapi.TYPE_STRING, format=openapi.FORMAT_UUID), + "name": openapi.Schema(type=openapi.TYPE_STRING), + "normalized_name": openapi.Schema(type=openapi.TYPE_STRING), + "inn": openapi.Schema(type=openapi.TYPE_STRING), + "kpp": openapi.Schema(type=openapi.TYPE_STRING), + "ogrn": openapi.Schema(type=openapi.TYPE_STRING), + "ogrip": openapi.Schema(type=openapi.TYPE_STRING), + "data": openapi.Schema( + type=openapi.TYPE_OBJECT, + description=( + "Данные по источникам. Ключи управляются параметрами data/" + "exclude_data." 
+ ), + additional_properties=openapi.Schema( + type=openapi.TYPE_ARRAY, + items=openapi.Schema(type=openapi.TYPE_OBJECT), + ), + ), + "data_sources": openapi.Schema( + type=openapi.TYPE_ARRAY, + items=openapi.Schema( + type=openapi.TYPE_OBJECT, + properties={ + "source": openapi.Schema(type=openapi.TYPE_STRING), + "count": openapi.Schema(type=openapi.TYPE_INTEGER), + }, + ), + ), + "registries": openapi.Schema( + type=openapi.TYPE_ARRAY, + items=openapi.Schema( + type=openapi.TYPE_OBJECT, + properties={ + "id": openapi.Schema(type=openapi.TYPE_STRING), + "name": openapi.Schema(type=openapi.TYPE_STRING), + }, + ), + ), + }, +) +ORGANIZATION_LIST_RESPONSE = openapi.Response( + description="Пагинированный список организаций v2.", + schema=openapi.Schema( + type=openapi.TYPE_OBJECT, + properties={ + "success": openapi.Schema(type=openapi.TYPE_BOOLEAN), + "data": openapi.Schema( + type=openapi.TYPE_ARRAY, + items=ORGANIZATION_SCHEMA, + ), + "errors": openapi.Schema( + type=openapi.TYPE_ARRAY, + items=openapi.Schema(type=openapi.TYPE_OBJECT), + description="Список ошибок; null при успешном ответе.", + ), + "meta": openapi.Schema( + type=openapi.TYPE_OBJECT, + properties={ + "pagination": openapi.Schema( + type=openapi.TYPE_OBJECT, + properties={ + "page": openapi.Schema(type=openapi.TYPE_INTEGER), + "page_size": openapi.Schema(type=openapi.TYPE_INTEGER), + "total_count": openapi.Schema(type=openapi.TYPE_INTEGER), + "total_pages": openapi.Schema(type=openapi.TYPE_INTEGER), + "has_next": openapi.Schema(type=openapi.TYPE_BOOLEAN), + "has_previous": openapi.Schema(type=openapi.TYPE_BOOLEAN), + }, + ), + }, + ), + }, + ), +) +ORGANIZATION_DETAIL_RESPONSE = openapi.Response( + description="Карточка организации v2.", + schema=ORGANIZATION_SCHEMA, +) + + +class CachedReadOnlyMixin: + """Cache successful GET list/retrieve responses by full request path.""" + + cache_timeout = ORGANIZATIONS_API_CACHE_TIMEOUT_SECONDS + cache_key_prefix = "api:v2:organizations" + + def 
_build_cache_key(self, request) -> str: + user_marker = "anonymous" + if request.user and request.user.is_authenticated: + user_marker = "authenticated" + + raw_key = f"{request.method}:{request.get_full_path()}:{user_marker}" + digest = hashlib.md5(raw_key.encode(), usedforsecurity=False).hexdigest() + return f"{self.cache_key_prefix}:{digest}" + + def _cached_response(self, request, producer) -> Response: + cache_key = self._build_cache_key(request) + cached_data = cache.get(cache_key) + if cached_data is not None: + response = Response(cached_data) + response["X-Cache"] = "HIT" + return response + + response = producer() + if 200 <= response.status_code < 300: + cache.set(cache_key, response.data, timeout=self.cache_timeout) + response["X-Cache"] = "MISS" + return response + + +class OrganizationViewSet(CachedReadOnlyMixin, ReadOnlyModelViewSet): + """Read-only API for canonical organizations.""" + + queryset = Organization.objects.select_related("data_snapshot").order_by( + "name", "uid" + ) + serializer_class = OrganizationSerializer + permission_classes = [IsAuthenticated] + lookup_field = "uid" + filter_backends = [ + filters.DjangoFilterBackend, + SearchFilter, + OrderingFilter, + ] + filterset_class = OrganizationFilter + search_fields = ["name", "inn", "kpp", "ogrn", "ogrip"] + ordering_fields = ["name", "inn", "kpp", "ogrn", "ogrip", "uid"] + ordering = ["name", "uid"] + + def get_permissions(self): + if getattr(settings, "ORGANIZATIONS_V2_ALLOW_ANONYMOUS", False): + return [AllowAny()] + return super().get_permissions() + + def get_queryset(self): + queryset = super().get_queryset() + if self.action != "list" or "has_registry" in self.request.query_params: + return queryset + + filterset = OrganizationFilter( + data={"has_registry": "true"}, + queryset=queryset, + request=self.request, + ) + if filterset.is_valid(): + return filterset.qs + return queryset + + @swagger_auto_schema( + tags=[ORGANIZATIONS_TAG], + operation_id="v2_organizations_list", + 
operation_summary="Список организаций", + operation_description=( + "Возвращает канонический справочник организаций API v2. " + "По умолчанию показывает только организации с активным участием " + "в реестрах; передайте has_registry=false, чтобы снять это ограничение. " + "Поддерживает пагинацию, поиск по наименованию и реквизитам, фильтры " + "по реестрам и наличию данных по источникам." + ), + manual_parameters=ORGANIZATION_LIST_PARAMS, + responses={200: ORGANIZATION_LIST_RESPONSE}, + ) + def list(self, request, *args: Any, **kwargs: Any) -> Response: + return self._cached_response( + request, + lambda: self._list_with_enrichment(request, *args, **kwargs), + ) + + @swagger_auto_schema( + tags=[ORGANIZATIONS_TAG], + operation_id="v2_organizations_retrieve", + operation_summary="Карточка организации", + operation_description=( + "Возвращает одну организацию по UID с реестрами и данными источников. " + "Параметры data/data_sources и exclude_data/exclude_data_sources " + "позволяют запросить только нужные блоки данных." 
+ ), + manual_parameters=ORGANIZATION_DETAIL_PARAMS, + responses={200: ORGANIZATION_DETAIL_RESPONSE, 404: "Организация не найдена"}, + ) + def retrieve(self, request, *args: Any, **kwargs: Any) -> Response: + return self._cached_response( + request, + lambda: self._retrieve_with_enrichment(request, *args, **kwargs), + )
+
+    def _list_with_enrichment(self, request, *args: Any, **kwargs: Any) -> Response:
+        """Build the uncached list response for the filtered organizations.
+
+        Serializes the current page when pagination is active, otherwise the
+        whole filtered queryset.
+        """
+        queryset = self.filter_queryset(self.get_queryset())
+        data_sources = self._parse_data_sources(request)
+
+        page = self.paginate_queryset(queryset)
+        if page is not None:
+            serializer = self._enriched_serializer(list(page), data_sources, many=True)
+            return self.get_paginated_response(serializer.data)
+
+        serializer = self._enriched_serializer(list(queryset), data_sources, many=True)
+        return Response(serializer.data)
+
+    def _retrieve_with_enrichment(
+        self,
+        request,
+        *args: Any,
+        **kwargs: Any,
+    ) -> Response:
+        """Build the uncached detail response for the object from get_object()."""
+        organization = self.get_object()
+        data_sources = self._parse_data_sources(request)
+        serializer = self._enriched_serializer(organization, data_sources)
+        return Response(serializer.data)
+
+    def _enriched_serializer(
+        self,
+        organizations: Organization | list[Organization],
+        data_sources: set[str] | None,
+        *,
+        many: bool = False,
+    ):
+        """Return a serializer whose context carries sources and enrichment.
+
+        Args:
+            organizations: One organization, or a list of them when ``many``
+                is true.
+            data_sources: Parsed source filter from ``_parse_data_sources``;
+                ``None`` when the request did not restrict sources.
+            many: Forwarded to ``get_serializer``.
+        """
+        # Enrichment is only built for organizations without a data snapshot.
+        enrichment = self._build_missing_snapshot_enrichment(
+            organizations if many else [organizations],
+            data_sources,
+        )
+        return self.get_serializer(
+            organizations,
+            many=many,
+            context={
+                **self.get_serializer_context(),
+                "data_sources": data_sources,
+                "enrichment": enrichment,
+            },
+        )
+
+ @staticmethod + def _build_missing_snapshot_enrichment( + organizations: list[Organization], + data_sources: set[str] | None, + ) -> dict: + missing = [ + organization + for organization in organizations + if not
hasattr(organization, "data_snapshot") + ] + if not missing: + return {} + return OrganizationApiEnrichmentService.build_for( + missing, + data_sources=data_sources, + ) + + @staticmethod + def _parse_data_sources(request) -> set[str] | None: + included = _query_param_values(request, "data", "data_sources") + excluded = _query_param_values(request, "exclude_data", "exclude_data_sources") + + unknown = (included | excluded) - API_DATA_SOURCE_KEY_SET + if unknown: + raise ValidationError( + { + "data": ( + "Unknown data source(s): " + + ", ".join(sorted(unknown)) + + ". Available sources: " + + ", ".join(sorted(API_DATA_SOURCE_KEY_SET)) + ) + } + ) + + if included: + return { + to_api_data_source(to_internal_data_source(source)) + for source in included - excluded + } + if excluded: + return API_DATA_SOURCE_KEY_SET - excluded + return None + + +def _query_param_values(request, *names: str) -> set[str]: + values: set[str] = set() + for name in names: + for raw_value in request.query_params.getlist(name): + values.update( + value.strip() for value in raw_value.split(",") if value.strip() + ) + return values diff --git a/src/registers/serializers.py b/src/registers/serializers.py index 209527b..3945be9 100644 --- a/src/registers/serializers.py +++ b/src/registers/serializers.py @@ -74,6 +74,7 @@ class OrganizationSerializer(serializers.ModelSerializer): class Meta: model = Organization + ref_name = "RegistryOrganization" fields = [ "id", "pn_name", @@ -97,6 +98,7 @@ class OrganizationDetailSerializer(OrganizationSerializer): ) class Meta(OrganizationSerializer.Meta): + ref_name = "RegistryOrganizationDetail" fields = OrganizationSerializer.Meta.fields + ["periods"] diff --git a/src/registers/services.py b/src/registers/services.py index 2feb614..aca67b7 100644 --- a/src/registers/services.py +++ b/src/registers/services.py @@ -153,33 +153,66 @@ class RegisterImportService: cls, rows: list[ParsedOrganization], ) -> tuple[set[int], int, int]: - snapshot_org_ids: set[int] 
= set() - organizations_created = 0 - organizations_updated = 0 + identities = [(row.mn_ogrn, row.mn_inn) for row in rows] + existing_by_identity = cls._get_organizations_by_identity(identities) + create_instances: list[Organization] = [] + update_instances: list[Organization] = [] + now = timezone.now() for row in rows: - organization, created = Organization.objects.get_or_create( - mn_ogrn=row.mn_ogrn, - mn_inn=row.mn_inn, - defaults={ - "pn_name": row.pn_name, - "in_kpp": row.in_kpp, - "mn_okpo": row.mn_okpo, - }, + identity = (row.mn_ogrn, row.mn_inn) + organization = existing_by_identity.get(identity) + if organization is None: + create_instances.append( + Organization( + mn_ogrn=row.mn_ogrn, + mn_inn=row.mn_inn, + pn_name=row.pn_name, + in_kpp=row.in_kpp, + mn_okpo=row.mn_okpo, + ) + ) + continue + + if cls._assign_organization_fields(organization=organization, row=row): + organization.updated_at = now + update_instances.append(organization) + + if create_instances: + Organization.objects.bulk_create(create_instances, batch_size=1000) + if update_instances: + Organization.objects.bulk_update( + update_instances, + fields=["pn_name", "in_kpp", "mn_okpo", "updated_at"], + batch_size=1000, ) - if created: - organizations_created += 1 - else: - updated = cls._update_organization_fields( - organization=organization, row=row - ) - if updated: - organizations_updated += 1 + snapshot_org_ids = { + organization.id + for organization in cls._get_organizations_by_identity(identities).values() + } + return snapshot_org_ids, len(create_instances), len(update_instances) - snapshot_org_ids.add(organization.id) + @classmethod + def _get_organizations_by_identity( + cls, + identities: list[tuple[int, int]], + ): + if not identities: + return {} - return snapshot_org_ids, organizations_created, organizations_updated + organizations = {} + for i in range(0, len(identities), 500): + query = Q() + for mn_ogrn, mn_inn in identities[i : i + 500]: + query |= Q(mn_ogrn=mn_ogrn, 
mn_inn=mn_inn) + organizations.update( + { + (organization.mn_ogrn, organization.mn_inn): organization + for organization in Organization.objects.filter(query) + } + ) + return organizations @classmethod def _update_organization_fields( @@ -188,25 +221,34 @@ class RegisterImportService: organization: Organization, row: ParsedOrganization, ) -> bool: - update_fields: list[str] = [] + if not cls._assign_organization_fields(organization=organization, row=row): + return False + + organization.save(update_fields=["pn_name", "in_kpp", "mn_okpo", "updated_at"]) + return True + + @classmethod + def _assign_organization_fields( + cls, + *, + organization: Organization, + row: ParsedOrganization, + ) -> bool: + changed = False if organization.pn_name != row.pn_name: organization.pn_name = row.pn_name - update_fields.append("pn_name") + changed = True - if organization.in_kpp != row.in_kpp: + if row.in_kpp is not None and organization.in_kpp != row.in_kpp: organization.in_kpp = row.in_kpp - update_fields.append("in_kpp") + changed = True if organization.mn_okpo != row.mn_okpo: organization.mn_okpo = row.mn_okpo - update_fields.append("mn_okpo") + changed = True - if not update_fields: - return False - - organization.save(update_fields=update_fields + ["updated_at"]) - return True + return changed @classmethod def get_organizations_queryset( @@ -327,19 +369,30 @@ class RegisterImportService: upload: RegisterUpload, ) -> int: """Закрыть активные периоды, которых нет в новом снимке.""" - closed_count = 0 + delete_ids: list[int] = [] + update_periods: list[RegistryMembershipPeriod] = [] for organization_id, period in active_by_org.items(): if organization_id in snapshot_org_ids: continue if period.started_at == snapshot_date: - period.delete() + delete_ids.append(period.id) else: period.ended_at = snapshot_date period.ended_by_upload = upload - period.save(update_fields=["ended_at", "ended_by_upload", "updated_at"]) - closed_count += 1 - return closed_count + period.updated_at = 
timezone.now() + update_periods.append(period) + + if delete_ids: + RegistryMembershipPeriod.objects.filter(id__in=delete_ids).delete() + if update_periods: + RegistryMembershipPeriod.objects.bulk_update( + update_periods, + fields=["ended_at", "ended_by_upload", "updated_at"], + batch_size=1000, + ) + + return len(delete_ids) + len(update_periods) @classmethod def _open_new_periods( diff --git a/src/settings/base.py b/src/settings/base.py index 42ccf1e..c6e17cd 100644 --- a/src/settings/base.py +++ b/src/settings/base.py @@ -46,6 +46,7 @@ INSTALLED_APPS = [ "drf_yasg", # Local apps "apps.core", + "organizations", "user", "apps.parsers", "registers", @@ -202,10 +203,14 @@ WSGI_APPLICATION = "core.wsgi.application" # ============================================================================= ZAKUPKI_TOKEN = os.getenv("ZAKUPKI_TOKEN", "") +SUPERJOB_APP_ID = os.getenv("SUPERJOB_APP_ID", "").strip() FNS_LOCK_TTL_SECONDS = 3600 PARSER_PROXIES = [ item.strip() for item in os.getenv("PARSER_PROXIES", "").split(",") if item.strip() ] +PARSER_USE_RUNTIME_PROXIES = ( + os.getenv("PARSER_USE_RUNTIME_PROXIES", "false").strip().lower() == "true" +) PROXY_TOOLS_API_KEY = os.getenv("PROXY_TOOLS_API_KEY", "").strip() PROXY_TOOLS_API_URL = os.getenv( "PROXY_TOOLS_API_URL", "https://proxy-tools.com/api/v1/proxies" diff --git a/src/settings/dev.py b/src/settings/dev.py index adf9c7e..30f7ea6 100644 --- a/src/settings/dev.py +++ b/src/settings/dev.py @@ -92,3 +92,6 @@ REST_FRAMEWORK = { "DEFAULT_THROTTLE_CLASSES": [], "DEFAULT_THROTTLE_RATES": {}, } + +# Dev-only: frontend/scripts can inspect the new v2 organizations API without JWT. 
+ORGANIZATIONS_V2_ALLOW_ANONYMOUS = True diff --git a/src/templates/dashboard.html b/src/templates/dashboard.html index 2da3874..0d0f2b9 100644 --- a/src/templates/dashboard.html +++ b/src/templates/dashboard.html @@ -95,7 +95,11 @@ outline: 2px solid rgb(97 175 239 / 26%); border-color: var(--accent); } - button { + button, + .button-link { + display: inline-flex; + align-items: center; + justify-content: center; min-height: 38px; border: 1px solid var(--accent); border-radius: 6px; @@ -105,10 +109,18 @@ font-weight: 650; cursor: pointer; white-space: nowrap; + text-decoration: none; } - button:hover { background: var(--accent-strong); border-color: var(--accent-strong); } - button.secondary { background: var(--field); color: var(--accent); } - button.secondary:hover { background: var(--panel-soft); } + button:hover, + .button-link:hover { + background: var(--accent-strong); + border-color: var(--accent-strong); + text-decoration: none; + } + button.secondary, + .button-link.secondary { background: var(--field); color: var(--accent); } + button.secondary:hover, + .button-link.secondary:hover { background: var(--panel-soft); } button.icon-button { width: 36px; min-width: 36px; @@ -321,6 +333,9 @@ font-size: clamp(20px, 2.3vw, 30px); line-height: 1; } + .analytics-hero-grid { + grid-template-columns: repeat(4, minmax(0, 1fr)); + } .chart-grid { display: grid; grid-template-columns: minmax(0, 1.15fr) minmax(300px, .85fr); @@ -397,6 +412,122 @@ padding: 9px; } .mode-pill strong { display: block; font-size: 18px; } + .registry-coverage-layout { + grid-template-columns: minmax(0, 1.1fr) minmax(360px, .9fr); + } + .source-coverage-list, + .action-list { + display: grid; + gap: 9px; + } + .source-coverage-row { + display: grid; + grid-template-columns: minmax(180px, 1fr) minmax(180px, 1.35fr) 90px; + gap: 10px; + align-items: center; + } + .source-coverage-row strong { + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; + } + .registry-matrix 
tr.matrix-total-row { + border-top: 2px solid var(--line-strong); + background: rgb(97 175 239 / 7%); + } + .registry-matrix tr.matrix-total-row td { + font-weight: 700; + } + .pipeline-kpis { + display: grid; + grid-template-columns: repeat(4, minmax(0, 1fr)); + gap: 8px; + margin-bottom: 12px; + } + .pipeline-kpi { + border: 1px solid var(--line); + border-radius: 8px; + background: var(--field); + padding: 10px; + } + .pipeline-kpi span { + display: block; + color: var(--muted); + font-size: 12px; + margin-bottom: 6px; + } + .pipeline-kpi strong { + color: #f4f6f8; + font-size: 22px; + line-height: 1; + } + .registry-matrix { + overflow: visible; + border: 1px solid var(--line); + border-radius: 8px; + } + .registry-matrix table { + width: 100%; + table-layout: fixed; + } + .registry-matrix td { + white-space: nowrap; + } + .registry-matrix th { + white-space: normal; + line-height: 1.15; + vertical-align: bottom; + } + .registry-matrix th:first-child, + .registry-matrix td:first-child { + width: 18%; + } + .registry-matrix th:nth-child(2), + .registry-matrix td:nth-child(2) { + width: 54px; + text-align: center; + } + .registry-matrix th:not(:first-child):not(:nth-child(2)), + .registry-matrix td:not(:first-child):not(:nth-child(2)) { + text-align: center; + } + .matrix-heading { + display: inline-flex; + align-items: center; + justify-content: flex-end; + height: 112px; + writing-mode: vertical-rl; + transform: rotate(180deg); + white-space: normal; + text-align: left; + } + .matrix-cell { + width: 58px; + min-width: 58px; + max-width: 58px; + margin: 0 auto; + border-radius: 6px; + padding: 5px 7px; + background: var(--field); + color: var(--muted); + text-align: center; + font-weight: 650; + } + .matrix-cell.ok { background: rgb(152 195 121 / 18%); color: var(--ok); } + .matrix-cell.warn { background: rgb(229 192 123 / 18%); color: var(--warn); } + .matrix-cell.fail { background: rgb(224 108 117 / 14%); color: var(--danger); } + .action-item { + border: 1px 
solid var(--line); + border-radius: 8px; + background: var(--field); + padding: 10px; + display: grid; + gap: 4px; + } + .technical-counters { + border-top: 1px solid rgb(75 82 99 / 58%); + padding-top: 18px; + } .registry-analytics { display: grid; grid-template-columns: minmax(220px, .9fr) minmax(0, 1.5fr); @@ -420,6 +551,25 @@ display: grid; gap: 10px; } + .registry-coverage-panel { + grid-column: 1 / -1; + border: 1px solid var(--line); + border-radius: 8px; + background: var(--field); + padding: 12px; + display: grid; + gap: 10px; + } + .registry-coverage-list { + display: grid; + gap: 8px; + } + .registry-coverage-row { + display: grid; + grid-template-columns: minmax(220px, 1fr) minmax(220px, 1.4fr) 92px; + gap: 10px; + align-items: center; + } .registry-bar-row { display: grid; grid-template-columns: minmax(280px, .9fr) minmax(240px, 1.3fr) 96px; @@ -443,6 +593,274 @@ text-align: right; } + .organizations-toolbar { + display: grid; + grid-template-columns: minmax(260px, 1.2fr) minmax(190px, .8fr) minmax(220px, .9fr) auto auto; + gap: 10px; + align-items: end; + padding: 14px; + border: 1px solid rgb(75 82 99 / 52%); + border-radius: 8px; + background: rgb(31 35 41 / 28%); + } + .checkbox-row { + min-height: 38px; + display: flex; + align-items: center; + gap: 8px; + color: var(--text); + font-size: 13px; + } + .checkbox-row input { + width: 16px; + height: 16px; + accent-color: var(--accent); + } + .organizations-table-wrap { + min-height: 560px; + overflow-x: auto; + border: 1px solid var(--line); + border-radius: 8px; + background: var(--panel); + } + .organizations-table { + table-layout: fixed; + min-width: 1100px; + } + .organizations-table th:nth-child(1), + .organizations-table td:nth-child(1) { width: 35%; } + .organizations-table th:nth-child(2), + .organizations-table td:nth-child(2) { width: 220px; } + .organizations-table th:nth-child(3), + .organizations-table td:nth-child(3) { width: 260px; } + .organizations-table th:nth-child(4), + 
.organizations-table td:nth-child(4) { width: 240px; } + .organizations-table th:nth-child(5), + .organizations-table td:nth-child(5) { width: 280px; } + .organization-name { + display: block; + color: #eef2f7; + font-weight: 650; + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; + } + .organization-meta { + display: block; + margin-top: 3px; + color: var(--muted); + font-size: 12px; + overflow: hidden; + text-overflow: ellipsis; + white-space: nowrap; + } + .registry-tags { + display: flex; + gap: 5px; + flex-wrap: wrap; + } + .data-source-tags { + display: flex; + gap: 5px; + flex-wrap: wrap; + } + .organizations-footer { + min-height: 40px; + display: flex; + align-items: center; + justify-content: space-between; + gap: 12px; + flex-wrap: wrap; + } + .pagination-controls { + display: flex; + align-items: center; + gap: 8px; + flex-wrap: wrap; + } + .pagination-controls select { + width: 96px; + } + .organizations-loading { + opacity: .66; + pointer-events: none; + } + .organizations-table tbody tr { + cursor: pointer; + } + .organization-source-grid { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(220px, 1fr)); + gap: 8px; + margin: 12px 0; + } + .source-choice { + min-height: 48px; + display: flex; + align-items: center; + gap: 9px; + border: 1px solid var(--line); + border-radius: 8px; + background: var(--field); + padding: 9px 10px; + color: var(--text); + cursor: pointer; + } + .source-choice input { + width: 16px; + height: 16px; + accent-color: var(--accent); + } + .organization-detail-head { + display: flex; + justify-content: space-between; + gap: 12px; + align-items: flex-start; + padding: 14px; + border: 1px solid var(--line); + border-radius: 8px; + background: var(--panel-soft); + } + .organization-data-grid { + display: grid; + gap: 14px; + } + .organization-source-tabs { + display: flex; + flex-wrap: wrap; + gap: 6px; + padding: 8px; + border: 1px solid var(--line); + border-radius: 8px; + background: 
var(--field); + } + .organization-source-tabs button { + min-height: 34px; + padding: 0 11px; + background: transparent; + color: var(--muted); + border-color: transparent; + } + .organization-source-tabs button.active { + color: var(--text); + border-color: var(--accent); + background: rgb(97 175 239 / 18%); + } + .organization-data-section { + border: 1px solid var(--line); + border-radius: 8px; + background: var(--panel); + padding: 14px; + min-width: 0; + } + .organization-data-section header { + position: static; + padding: 0 0 10px; + border: 0; + background: transparent; + box-shadow: none; + } + .data-table-wrap { + max-height: 520px; + overflow: auto; + border: 1px solid var(--line); + border-radius: 8px; + } + .data-table { + min-width: 760px; + } + .data-table td, + .data-table th { + overflow-wrap: anywhere; + } + .data-table tbody tr[data-organization-record-detail] { + cursor: pointer; + } + .data-table tbody tr[data-organization-record-detail]:hover { + background: rgb(97 175 239 / 10%); + } + .fns-form { + display: grid; + gap: 12px; + } + .fns-form-block { + display: grid; + gap: 8px; + } + .fns-form-block h3 { + margin: 0; + font-size: 15px; + } + .fns-balance-grid { + display: grid; + grid-template-columns: repeat(2, minmax(0, 1fr)); + gap: 12px; + } + .fns-balance-side { + display: grid; + gap: 8px; + min-width: 0; + } + .fns-balance-side h4 { + margin: 0; + color: var(--accent); + font-size: 13px; + text-transform: uppercase; + } + .fns-balance-side .data-table { + min-width: 620px; + } + .fns-code-column { + min-width: 58px; + width: 58px; + white-space: nowrap; + word-break: normal; + overflow-wrap: normal; + } + .fns-period-column { + min-width: 92px; + white-space: nowrap; + } + .fns-balance-side .data-table td:nth-child(2), + .fns-results-table td:nth-child(2) { + white-space: nowrap; + word-break: normal; + overflow-wrap: normal; + } + .fns-form-group-row td { + background: rgb(97 175 239 / 12%); + color: var(--text); + font-weight: 700; 
+ } + .fns-form-heading-row td { + background: var(--field); + color: var(--accent); + font-weight: 800; + text-transform: uppercase; + } + .fns-form-total td { + color: var(--text); + font-weight: 800; + } + .year-tabs { + display: flex; + flex-wrap: wrap; + gap: 6px; + margin: 10px 0; + } + .year-tabs button { + min-height: 30px; + padding: 0 10px; + background: var(--field); + color: var(--muted); + border-color: var(--line); + } + .year-tabs button.active { + color: var(--text); + border-color: var(--accent); + background: rgb(97 175 239 / 18%); + } + .source-list { display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); @@ -1030,8 +1448,11 @@ @media (max-width: 1100px) { .kpi-grid { grid-template-columns: repeat(3, minmax(0, 1fr)); } + .analytics-hero-grid, + .pipeline-kpis { grid-template-columns: repeat(2, minmax(0, 1fr)); } .chart-grid { grid-template-columns: 1fr; } .registry-analytics { grid-template-columns: 1fr; } + .fns-balance-grid { grid-template-columns: 1fr; } .bar-row { grid-template-columns: minmax(120px, 1fr) minmax(120px, 1.4fr) 74px; } } @media (max-width: 760px) { @@ -1040,6 +1461,9 @@ .grid, .source-detail-grid, .detail-grid { grid-template-columns: 1fr; } .dashboard-bar { align-items: stretch; flex-direction: column; } .kpi-grid { grid-template-columns: repeat(2, minmax(0, 1fr)); } + .analytics-hero-grid, + .pipeline-kpis, + .source-coverage-row { grid-template-columns: 1fr; } .source-meta { grid-template-columns: 1fr; } .source-list { grid-template-columns: 1fr; } .upload-row { grid-template-columns: 1fr; } @@ -1060,6 +1484,7 @@ + Swagger нет токена @@ -1194,6 +1619,7 @@