diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml index 7358f9c..97dd27b 100644 --- a/docker-compose.dev.yml +++ b/docker-compose.dev.yml @@ -4,6 +4,13 @@ x-app-build: &app-build args: INSTALL_DEV: "true" +x-parser-runtime-env: &parser-runtime-env + CHECKO_API_KEY: ${CHECKO_API_KEY:-} + SUPERJOB_APP_ID: ${SUPERJOB_APP_ID:-} + PROXY_TOOLS_API_KEY: ${PROXY_TOOLS_API_KEY:-} + PARSER_USE_RUNTIME_PROXIES: ${PARSER_USE_RUNTIME_PROXIES:-} + CHECKO_USE_RUNTIME_PROXIES: ${CHECKO_USE_RUNTIME_PROXIES:-} + services: db: image: postgres:15.10 @@ -65,6 +72,8 @@ services: restart: unless-stopped env_file: - .env.dev + environment: + <<: *parser-runtime-env depends_on: migrate: condition: service_completed_successfully @@ -83,6 +92,7 @@ services: container_name: mostovik_celery_worker restart: unless-stopped environment: + <<: *parser-runtime-env CELERY_WORKER_CONCURRENCY: "1" CELERY_WORKER_MAX_MEMORY_PER_CHILD_KB: "3145728" env_file: @@ -104,6 +114,8 @@ services: restart: unless-stopped env_file: - .env.dev + environment: + <<: *parser-runtime-env depends_on: migrate: condition: service_completed_successfully diff --git a/docs/parser-external-access-note-ru.md b/docs/parser-external-access-note-ru.md new file mode 100644 index 0000000..c67d634 --- /dev/null +++ b/docs/parser-external-access-note-ru.md @@ -0,0 +1,64 @@ +# Аналитическая записка по внешним обращениям парсеров + +Дата подготовки: 2026-05-18 + +## Краткое описание + +В backend Mostovik подсистема парсеров реализована как набор Celery-задач и HTTP-клиентов. Парсеры обращаются к открытым государственным источникам, публичным API и, при наличии ключей в переменных окружения, к платным/служебным API. Полученные данные нормализуются и сохраняются в PostgreSQL через сервисный слой. + +Основные точки входа находятся в `src/apps/parsers/tasks.py`. Внешние HTTP-запросы выполняются через клиенты в `src/apps/parsers/clients/`. 
Общий HTTP-клиент использует GET/POST, таймауты, стандартный User-Agent и, если включено `PARSER_USE_RUNTIME_PROXIES`, может использовать активные RU-прокси из БД. + +## Куда обращается и что скачивает + +| Источник | Адреса обращения | Что скачивается / получается | +|---|---|---| +| Минпромторг: сертификаты промышленного производства | `https://minpromtorg.gov.ru/api/kss-document-preview` | JSON-список документов, затем последний Excel-файл из поля `files[].url`. Из Excel берутся номер заключения, даты, ссылка на документ, наименование организации, ИНН, ОГРН. | +| Минпромторг: реестр производителей | `https://minpromtorg.gov.ru/api/kss-document-preview` | JSON-список документов, затем последний Excel-файл `data_orgs_YYYYMMDD...`. Из Excel берутся наименование производителя, ИНН, ОГРН, адрес. | +| Минпромторг: промышленная продукция | `https://minpromtorg.gov.ru/api/kss-document-preview` | JSON-список документов, затем Excel-файл реестра промышленной продукции. Из Excel берутся организация, ИНН, ОГРН, регистрационный номер, наименование продукции, модель, ОКПД2, ТН ВЭД, нормативный документ. | +| ГИСП: промышленная продукция | `https://gisp.gov.ru/pp719v2/pub/prod/`, технический API `https://gisp.gov.ru/pp719v2/pub/prod/b/` | В каталоге источников этот адрес указан как upstream для продукции. Универсальный файловый клиент умеет получить первую страницу JSON через POST на `/pp719v2/pub/prod/b/`; основная Celery-задача `parse_industrial_products` сейчас использует Excel discovery Минпромторга. | +| Генпрокуратура: единый реестр проверок | `https://proverki.gov.ru/portal/public-open-data/check/{year}/{month}?isFederalLaw248=true\|false`, `https://proverki.gov.ru/portal/public-open-data/check/{year}/plans?isFederalLaw248=true\|false` | ZIP/XML выгрузки с проверками или планами проверок. При необходимости используется Playwright: открывается страница портала, выбирается вкладка скачивания, загружается ZIP/XML. 
| +| ЕИС закупки: SOAP-интеграция | `https://int44.zakupki.gov.ru/eis-integration/services/getDocsIP` | SOAP-запрос возвращает `archiveUrl`; затем скачивается ZIP/XML-архив закупок/контрактов. Для доступа используется `ZAKUPKI_TOKEN` из окружения. | +| ЕИС закупки: HTTP fallback | `https://zakupki.gov.ru/opendata/download/notifications/{region}/{year}/...` | ZIP-архивы с XML-файлами закупок, если SOAP-токен не используется или передана прямая ссылка. | +| ЕИС/FAS generic-источники | `https://zakupki.gov.ru/epz/order/extendedsearch/results.html`, `https://zakupki.gov.ru/epz/orderclause/search/results.html`, `https://zakupki.gov.ru/epz/contract/search/results.html`, `https://zakupki.gov.ru/epz/dishonestsupplier/search/results.html`, `https://fas.gov.ru/pages/activity/reestr-uridicheskih-lic` | HTML-страницы официальных реестров. Парсер извлекает карточки/таблицы: закупки 44-ФЗ, закупки 223-ФЗ, контракты, недобросовестные поставщики, сведения ФАС по ГОЗ. | +| ФНС: бухгалтерская отчетность | Автоматического HTTP-скачивания с ФНС в текущем коде не найдено. В каталоге источников указан справочный URL `https://bo.nalog.gov.ru/advanced-search/organizations/search?...` | Обрабатываются локально загруженные или положенные в папку `input/fns` файлы `fin_{id}_{ogrn}.xlsx`, а также ZIP-архивы с такими файлами. Из Excel берутся строки форм N 1, 2, 3, 4, 6 бухгалтерской отчетности. | +| КАД Арбитр через Checko | Официальный источник в каталоге: `https://kad.arbitr.ru/`; фактический lookup в коде: `https://api.checko.ru/v2/legal-cases` | JSON-ответы по арбитражным делам для активных организаций из внутренних реестров. В запрос передаются ИНН/ОГРН. В payload сохраняются номер дела, суд, тип, статус, даты, суммы, стороны и ссылка на карточку. | +| Федресурс/ЕФРСБ | `https://bankrot.fedresurs.ru/`; fallback: `https://api.checko.ru/v2/company` | Официальный источник обрабатывается как HTML/структурированная выгрузка. 
При недоступности портала используется Checko: по ИНН/ОГРН организации берутся сведения о банкротных сообщениях из JSON. | +| ФСТЭК | `https://reestr.fstec.ru/reg3` и найденные на странице ссылки вида `module=rfiles` или `/uploads/reg...` | HTML-страница реестра, затем CSV/файловая выгрузка, если ссылка найдена. Для этого источника в коде отключена SSL-верификация. | +| Вакансии: Работа России | Клиент использует `http://opendata.trudvsem.ru/api/v1/vacancies`, `http://opendata.trudvsem.ru/api/v1/vacancies/company/inn/{inn}`; в каталоге источников указан `https://opendata.trudvsem.ru/api/v1/vacancies` | JSON-список вакансий, включая работодателя, ИНН/ОГРН при наличии, название вакансии, дату, зарплату, статус, ссылку. | +| Вакансии: HeadHunter | `https://api.hh.ru/vacancies` | JSON-список вакансий. Поиск выполняется по региону и/или тексту, для организаций без поиска по ИНН используется нормализованное название. | +| Вакансии: SuperJob | `https://api.superjob.ru/2.0/vacancies/` | JSON-список вакансий. Используется только если задан `SUPERJOB_APP_ID`; ключ передается в заголовке `X-Api-App-Id`. | +| Checko: контракты и проверки по организациям | `https://api.checko.ru/v2/contracts`, `https://api.checko.ru/v2/inspections` | JSON-данные по контрактам и проверкам для активных организаций из внутренних реестров. В запрос передаются ИНН/ОГРН, API-ключ передается параметром `key`. | +| Proxy-Tools | `https://proxy-tools.com/api/v1/proxies` | Служебная загрузка списка RU-прокси для парсеров. Используется только при заданном `PROXY_TOOLS_API_KEY`; запрос идет с Bearer-токеном. | + +## Форматы загружаемых данных + +Парсеры работают со следующими форматами: + +- JSON-ответы публичных/платных API. +- Excel-файлы `.xlsx`/`.xlsm` с реестрами Минпромторга и бухгалтерской отчетностью ФНС. +- ZIP-архивы с XML/CSV/JSON/HTML/XLSX-файлами. +- XML-файлы выгрузок проверок и закупок. +- HTML-страницы официальных реестров с таблицами или карточками. +- CSV-файлы, в частности для ФСТЭК. 
+ +Скачанные файлы не исполняются как код: они читаются как данные, парсятся и сохраняются в БД. + +## Передаваемые наружу параметры + +Во внешние запросы могут уходить: + +- периоды загрузки: год, месяц, дата; +- коды регионов; +- ИНН/ОГРН организаций из внутренних активных реестров; +- поисковая строка по названию организации для вакансий; +- служебные ключи API из окружения: `ZAKUPKI_TOKEN`, `CHECKO_API_KEY`, `SUPERJOB_APP_ID`, `PROXY_TOOLS_API_KEY`. + +Ключи в коде не захардкожены, берутся из переменных окружения. + +## Важные оговорки + +- Для ряда задач предусмотрен параметр `file_url`. Если оператор передает его вручную, парсер скачивает файл по переданной ссылке, а не только по дефолтному адресу источника. +- Для ФНС текущая реализация backend не скачивает файлы автоматически с сайта ФНС, а обрабатывает уже полученные Excel/ZIP-файлы через папку наблюдения или API-загрузку. +- Для `proverki.gov.ru` возможен запуск headless Chromium через Playwright, потому что часть загрузок доступна через JS-интерфейс портала. +- Для ФСТЭК SSL-верификация отключена настройкой клиента источника. +- Runtime-прокси из БД используются только при включенном `PARSER_USE_RUNTIME_PROXIES=true`; отдельная задача синхронизации прокси обращается к Proxy-Tools только при наличии `PROXY_TOOLS_API_KEY`. diff --git a/docs/superpowers/plans/2026-05-18-direct-parser-source-ingestion.md b/docs/superpowers/plans/2026-05-18-direct-parser-source-ingestion.md new file mode 100644 index 0000000..276f0e2 --- /dev/null +++ b/docs/superpowers/plans/2026-05-18-direct-parser-source-ingestion.md @@ -0,0 +1,85 @@ +# Direct Parser Source Ingestion Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Move parser runtime reads and writes from legacy parser record tables to organization source storage. 
+ +**Architecture:** Add a focused ingestion service in `organizations` that persists normalized source-record inputs directly into polymorphic source extensions. Parser services become adapters from parser dataclasses to ingestion inputs. Runtime reads use `OrganizationSourceRecord` and extension counters. + +**Tech Stack:** Django 3.2, PostgreSQL, DRF, django-polymorphic, pytest. + +--- + +### Task 1: Direct Ingestion Core + +**Files:** +- Create: `src/organizations/source_identity.py` +- Create: `src/organizations/source_ingestion.py` +- Modify: `src/organizations/source_backfill.py` +- Test: `tests/apps/organizations/test_source_ingestion.py` +- Test: `tests/apps/organizations/test_source_backfill.py` + +- [ ] Write failing tests for direct generic source ingestion. +- [ ] Write failing tests for FNS report ingestion with financial lines. +- [ ] Extract identity normalization from backfill into a shared helper. +- [ ] Implement `SourceRecordInput` and `SourceFinancialLineInput`. +- [ ] Implement `OrganizationSourceIngestionService.save_records`. +- [ ] Keep backfill behavior green by using the same identity normalization helper. + +### Task 2: Parser Save Services + +**Files:** +- Modify: `src/apps/parsers/services.py` +- Test: `tests/apps/parsers/test_services.py` + +- [ ] Switch generic source saves to `OrganizationSourceIngestionService`. +- [ ] Switch industrial certificate/manufacturer/product saves. +- [ ] Switch inspection and procurement saves. +- [ ] Switch FNS report saves and duplicate checks. +- [ ] Replace period/deduplication helpers with source-record queries. + +### Task 3: Parser Tasks + +**Files:** +- Modify: `src/apps/parsers/tasks.py` +- Test: `tests/apps/parsers/test_tasks.py` + +- [ ] Remove source backfill queueing from parser completion. +- [ ] Keep parser load logs and background job progress unchanged. +- [ ] Return source-record identifiers for FNS processing instead of legacy report ids. 
+ +### Task 4: Runtime Reads + +**Files:** +- Modify: `src/apps/parsers/source_cards.py` +- Modify: `src/apps/parsers/views.py` +- Modify: `src/apps/parsers/serializers.py` +- Modify: `src/apps/core/admin_dashboard.py` +- Modify: `src/apps/backups/services.py` +- Test: parser source-card and result endpoint tests. + +- [ ] Move source card counts and timestamps to source extensions/source records. +- [ ] Move parser log organization counts to source records. +- [ ] Adapt v1 parser result endpoints to read source records. +- [ ] Move dashboard/export runtime reads off legacy parser models. + +### Task 5: Frontend Record Detail + +**Files:** +- Modify: `mostovik-frontend/src/pages/main/model/source-record-detail/*` +- Test: frontend source-detail/source-record-detail unit tests. + +- [ ] Replace legacy generated v1 detail clients with organization source-record reads. +- [ ] Use `payload` plus top-level source-record fields for detail rendering. +- [ ] Keep source-detail lists on the new source-record list endpoint. + +### Task 6: Validation + +**Files:** +- No production files. + +- [ ] Run focused backend parser/organization tests. +- [ ] Run frontend source-detail/source-record-detail checks. +- [ ] Run live parser smoke against one small generic source. +- [ ] Confirm legacy parser record counts do not change during the smoke. +- [ ] Confirm new organization source-record counts do change. diff --git a/docs/superpowers/plans/2026-05-18-polymorphic-organization-sources.md b/docs/superpowers/plans/2026-05-18-polymorphic-organization-sources.md new file mode 100644 index 0000000..cbe221f --- /dev/null +++ b/docs/superpowers/plans/2026-05-18-polymorphic-organization-sources.md @@ -0,0 +1,100 @@ +# Polymorphic Organization Sources Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. 
Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Replace source-centric parser output access with organization-centric polymorphic source extensions. + +**Architecture:** Keep `Organization` as the root entity. Add polymorphic source extensions per product source group and a shared subordinate source-record table. Backfill legacy parser tables idempotently, then switch API v2 to the new extension data. + +**Tech Stack:** Django 3.2, Django REST Framework, django-filter, django-polymorphic, PostgreSQL, pytest. + +--- + +### Task 1: Dependency And Schema + +**Files:** +- Modify: `pyproject.toml` +- Modify: `uv.lock` +- Modify: `src/settings/base.py` +- Modify: `src/organizations/models.py` +- Create: `src/organizations/migrations/0006_polymorphic_source_extensions.py` +- Test: `tests/apps/organizations/test_source_extensions_models.py` + +- [ ] Add `django-polymorphic` to project dependencies. +- [ ] Add `"polymorphic"` to `INSTALLED_APPS` before local apps. +- [ ] Add source-group and identity-status choices. +- [ ] Add `identity_status` and `primary_identity` to `Organization`. +- [ ] Add `OrganizationSourceExtension` as `PolymorphicModel`. +- [ ] Add source extension subclasses. +- [ ] Add `OrganizationSourceRecord`. +- [ ] Add `OrganizationSourceFinancialLine`. +- [ ] Write tests proving: + - one extension per `(organization, source_group)`; + - polymorphic queries return subclass instances; + - source records are unique by legacy model/pk; + - financial lines attach to a source record. + +### Task 2: Backfill Service + +**Files:** +- Create: `src/organizations/source_groups.py` +- Create: `src/organizations/source_backfill.py` +- Create: `src/organizations/management/commands/backfill_organization_sources.py` +- Test: `tests/apps/organizations/test_source_backfill.py` + +- [ ] Define source group mapping for all legacy parser sources. +- [ ] Implement organization resolution by `inn + kpp`, `ogrn`, `ogrip`, unique `inn`, then normalized name. 
+- [ ] Implement idempotent extension creation/update. +- [ ] Implement idempotent source record creation/update. +- [ ] Preserve legacy row payload and `(legacy_model, legacy_pk)`. +- [ ] Backfill financial report lines into `OrganizationSourceFinancialLine`. +- [ ] Report scanned, created organizations, created extensions, updated extensions, created records, updated records, unresolved rows. + +### Task 3: API v2 Switch + +**Files:** +- Modify: `src/organizations/serializers.py` +- Modify: `src/organizations/filters.py` +- Modify: `src/organizations/views.py` +- Delete or stop using: `src/organizations/api_enrichment.py` +- Delete or stop using: `src/organizations/services.py` snapshot refresh paths +- Test: `tests/apps/organizations/test_api_v2.py` + +- [ ] Replace embedded `data` JSON with compact `sources`. +- [ ] Add source extension list/detail serializers. +- [ ] Add source records endpoint. +- [ ] Rework source filters to use `OrganizationSourceExtension`. +- [ ] Remove snapshot dependency from list/retrieve behavior. +- [ ] Keep old snapshot management command only as deprecated/no-op until cleanup. + +### Task 4: Parser Write Path + +**Files:** +- Modify: `src/apps/parsers/tasks.py` +- Modify: `src/organizations/tasks.py` +- Test: `tests/apps/parsers/test_tasks.py` +- Test: `tests/apps/organizations/test_tasks.py` + +- [ ] Replace snapshot refresh queueing with source backfill queueing for affected parser batches. +- [ ] For each parser completion, backfill only the completed source/batch. +- [ ] Keep full backfill command for initial migration and repair. + +### Task 5: Frontend Contract Repair + +**Files:** +- Modify frontend generated API clients after backend OpenAPI changes. +- Modify source detail table composables to consume `sources` and source records endpoints. + +- [ ] Regenerate API client. +- [ ] Update source pages to request extension records instead of embedded `organization.data[source]`. 
+- [ ] Verify planned inspections page loads from source records. + +### Task 6: Cleanup Phase + +**Files:** +- Modify migrations only after successful backfill validation. + +- [ ] Remove `OrganizationDataSnapshot`. +- [ ] Remove snapshot refresh schedules. +- [ ] Decide which legacy parser tables remain as ingestion staging and which can be dropped. +- [ ] Run full backend and frontend validation. diff --git a/docs/superpowers/specs/2026-05-18-direct-parser-source-ingestion-design.md b/docs/superpowers/specs/2026-05-18-direct-parser-source-ingestion-design.md new file mode 100644 index 0000000..3462053 --- /dev/null +++ b/docs/superpowers/specs/2026-05-18-direct-parser-source-ingestion-design.md @@ -0,0 +1,76 @@ +# Direct Parser Source Ingestion Design + +## Goal + +Parser runtime must write parsed source records directly into the organization-centric +polymorphic storage: + +- `organizations_organization` +- `organizations_source_extension` +- source extension subclass tables +- `organizations_source_record` +- `organizations_source_financial_line` + +Legacy parser record tables remain only as migration/audit inputs until a later +destructive cleanup. They must not be part of the parser runtime write path or the +runtime read path used by the application. + +## Current Runtime Problem + +Current parser tasks write source rows into legacy parser tables such as +`GenericParserRecord`, `InspectionRecord`, `ProcurementRecord`, +`IndustrialProductRecord`, and `FinancialReport`, then enqueue source backfill into +the new organization storage. This keeps old tables in the hot path and allows new +runtime data to diverge before the async backfill runs. + +## Target Runtime + +Parser tasks keep using `ParserLoadLog`, `ParserBatchSequence`, and `BackgroundJob` +as operational metadata. Parsed records are converted into normalized source-record +inputs and persisted through one ingestion service. 
+ +The ingestion service is responsible for: + +- normalizing identity fields before writing canonical organizations; +- resolving or creating `Organization`; +- creating or updating the source-group polymorphic extension; +- creating or updating `OrganizationSourceRecord` by `(source, external_id)`; +- writing structured financial lines for FNS reports; +- refreshing extension counters in the same transaction. + +Parser save services return the number of inserted or updated source records. They no +longer create or query legacy parser record models for runtime decisions. + +## Runtime Read Scope + +The following runtime reads must use organization source storage: + +- parser source cards and source item counters; +- parser log organization counts; +- source detail lists; +- source record detail reads; +- frontend-facing parser result compatibility endpoints while they remain exposed; +- admin/dashboard/export paths that are used by the app during normal operation. + +Legacy parser tables may still be read by explicit migration/backfill tooling only. + +## Compatibility + +Existing v1 parser-result URLs can remain during transition, but their data source must +be `OrganizationSourceRecord`, not the legacy parser models. Response shape can be +kept best-effort through serializers/adapters that read source-record payloads. + +## Non-Goals + +- Do not drop legacy parser tables in this phase. +- Do not rewrite parser clients. +- Do not remove parser load logs or background jobs. +- Do not make every payload strongly typed immediately. + +## Risks + +- Industrial product ingestion is large; the writer must avoid per-record table scans. +- Existing tests assert legacy model counts and must be updated to assert source-record + behavior. +- Some compatibility endpoints expose legacy primary keys. New records use UUIDs, so + compatibility adapters must accept source-record UUIDs where needed. 
diff --git a/docs/superpowers/specs/2026-05-18-polymorphic-organization-sources-design.md b/docs/superpowers/specs/2026-05-18-polymorphic-organization-sources-design.md new file mode 100644 index 0000000..6912b3a --- /dev/null +++ b/docs/superpowers/specs/2026-05-18-polymorphic-organization-sources-design.md @@ -0,0 +1,222 @@ +# Polymorphic Organization Sources Design + +## Goal + +Replace the current source-centric parser tables and API v2 JSON snapshot model with an organization-centric schema: + +- `Organization` is the main business entity and stores legal identity data. +- Each source group is represented as a polymorphic organization extension. +- Detailed source records hang under the extension as subordinate records. +- API compatibility with the current frontend is not required. + +## Current Data Facts + +The current dev database contains: + +- `organizations.Organization`: 29,667 rows. +- `OrganizationDataSnapshot`: 29,667 rows after refresh. +- `registers.Organization`: 5,138 rows. +- `InspectionRecord`: 14,059 rows. +- `ProcurementRecord`: 1,000 rows. +- `IndustrialCertificateRecord`: 23,640 rows. +- `ManufacturerRecord`: 8,762 rows. +- `IndustrialProductRecord`: 471,824 rows. +- `GenericParserRecord`: 3,506 rows. +- `FinancialReport`: 10 rows. + +Observed required-field candidates: + +- `Organization.name` is present in 100% of canonical organizations and can be required. +- `inn` is present in 95.34% of canonical organizations. +- `ogrn` is present in 74.84%. +- `kpp` is present in 20.56%. +- `ogrip` is present in 8.08%. +- 673 canonical organizations have no `inn`, `ogrn`, or `ogrip`. + +Therefore only `name` can be required on `Organization`. Identifiers must stay optional and indexed. Identity completeness should be explicit through a status field, not hidden in nullability. + +## Source Groups + +The new source groups match the product navigation: + +- Financial indicators. 
+- Government procurements, including 44-FZ, 223-FZ, contracts, and legacy procurement rows. +- Russian manufacturers and products, including certificates, manufacturers, and industrial products. +- Planned inspections. +- Bankruptcy procedures. +- Defense supplier risk, including unfair suppliers and FAS GOZ. +- Arbitration cases. +- Information security registries. +- Vacancies from Trudvsem, HH, and SuperJob. + +## Target Schema + +### Organization + +`organizations.Organization` remains the root table. + +Required: + +- `uid` +- `name` + +Optional but indexed: + +- `inn` +- `kpp` +- `ogrn` +- `ogrip` + +New fields: + +- `identity_status`: one of `complete`, `partial`, `missing`. +- `primary_identity`: short normalized search key used for deterministic deduplication and diagnostics. + +Existing uniqueness constraints on non-empty identifiers should remain conservative. They protect the API and backfill from accidental duplicate canonical organizations. + +### OrganizationSourceExtension + +Add a new polymorphic base model: + +- `uid` +- `organization` +- `source_group` +- `title` +- `status` +- `records_count` +- `first_seen_at` +- `last_seen_at` +- `last_load_batch` +- `metadata` +- timestamps + +Constraints: + +- One extension per `(organization, source_group)`. + +Subclasses: + +- `FinancialIndicatorsExtension` +- `GovernmentProcurementExtension` +- `IndustrialProductionExtension` +- `PlannedInspectionExtension` +- `BankruptcyExtension` +- `DefenseSupplierExtension` +- `ArbitrationExtension` +- `SecurityRegistryExtension` +- `VacancyExtension` + +Rationale: `Organization` itself must not be polymorphic because one organization can have many source groups simultaneously. The polymorphic boundary belongs to source extensions. 
+ +### OrganizationSourceRecord + +Use one subordinate detail table for most source rows: + +- `uid` +- `extension` +- `record_type` +- `source` +- `external_id` +- `title` +- `record_date` +- `amount` +- `status` +- `url` +- `payload` +- `legacy_model` +- `legacy_pk` +- `load_batch` +- timestamps + +Constraints: + +- Unique `(source, external_id)` when `external_id` is non-empty. +- Unique `(legacy_model, legacy_pk)` for migrated legacy rows. + +This keeps the number of tables low while still preserving every source-specific payload. + +### OrganizationSourceFinancialLine + +Financial report lines remain a subordinate table because they are structured, repeated, and likely to be queried by year/form/line: + +- `source_record` +- `form_code` +- `line_code` +- `line_name` +- `year` +- `period_start` +- `period_end` + +The existing legacy `FinancialReportLine` table is used only as a staging source after the migration. + +## Backfill Rules + +Backfill must be idempotent. + +For each legacy row: + +1. Resolve canonical `Organization` by identifiers in this order: + - exact `inn + kpp` where available, + - exact `ogrn`, + - exact `ogrip`, + - exact `inn` only when it maps to one canonical organization, + - normalized name fallback only when there is one unambiguous match. +2. If no organization can be resolved, create or reuse an organization with `identity_status=missing` or `partial`. +3. Create or update the matching `OrganizationSourceExtension`. +4. Create or update `OrganizationSourceRecord`. +5. Preserve the original source row in `payload`. +6. Store `legacy_model` and `legacy_pk` for audit and repeatable updates. 
+ +## API Shape + +The new API should be organization-centric: + +- `GET /api/v2/organizations/` +- `GET /api/v2/organizations/{uid}/` +- `GET /api/v2/organizations/{uid}/sources/` +- `GET /api/v2/organization-sources/{uid}/records/` + +The list endpoint can expose compact source summaries: + +```json +{ + "uid": "...", + "name": "...", + "inn": "...", + "ogrn": "...", + "identity_status": "complete", + "sources": [ + { + "uid": "...", + "source_group": "planned_inspections", + "title": "Плановые проверки Генпрокуратуры России", + "records_count": 12, + "last_seen_at": "2026-05-18T00:00:00Z" + } + ] +} +``` + +Source records are fetched on demand from the extension, not embedded into every organization list row. + +## Migration Phases + +1. Add dependency and schema. +2. Add idempotent backfill service and management command. +3. Backfill all existing legacy parser data into source extensions. +4. Switch API v2 to source extensions. +5. Update frontend generated clients and source pages. +6. After verification, remove `OrganizationDataSnapshot` and legacy parser tables in a separate cleanup phase. + +## Non-Goals For First Pass + +- No destructive deletion of legacy parser tables before backfill verification. +- No attempt to make all source payloads strongly typed immediately. +- No frontend visual redesign; only data contract changes needed for the new schema. + +## Risks + +- Polymorphic subclass queries add joins. List endpoints must query the base extension compactly and load records only on demand. +- Legacy rows without identifiers require name fallback and can create duplicates. The command must report unresolved/created organizations. +- Current v2 filters use source-specific existence checks. They must move to extension existence filters. +- Frontend generated API clients will break and must be regenerated after backend OpenAPI changes. 
diff --git a/pyproject.toml b/pyproject.toml index c52be62..3961748 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,6 +53,7 @@ dependencies = [ "whitenoise>=6.11.0", "djangorestframework-stubs>=3.14.5", "python-docx>=1.2.0", + "django-polymorphic>=3.1,<4.0", ] [project.optional-dependencies] diff --git a/src/apps/parsers/services.py b/src/apps/parsers/services.py index 91ac8dd..34f6539 100644 --- a/src/apps/parsers/services.py +++ b/src/apps/parsers/services.py @@ -27,7 +27,6 @@ from apps.parsers.clients.zakupki.schemas import Procurement from apps.parsers.models import ( VACANCY_RECORD_SOURCES, FinancialReport, - FinancialReportLine, GenericParserRecord, IndustrialCertificateRecord, IndustrialProductRecord, @@ -42,6 +41,12 @@ from django.conf import settings from django.db import IntegrityError, transaction from django.db.models import Q from django.utils import timezone +from organizations.models import OrganizationSourceRecord +from organizations.source_ingestion import ( + OrganizationSourceIngestionService, + SourceFinancialLineInput, + SourceRecordInput, +) from registers.models import Organization logger = logging.getLogger(__name__) @@ -122,6 +127,16 @@ def normalize_to_decimal(value: str | None) -> Decimal | None: return None +def _date_to_iso(value: date | None) -> str | None: + return value.isoformat() if value is not None else None + + +def _optional_int(value: object) -> int | None: + if value is None or value == "": + return None + return int(value) + + @dataclass(frozen=True) class RegistryOrganizationLookup: """Набор индексов для быстрого сопоставления parser-записей с организацией.""" @@ -476,39 +491,41 @@ class IndustrialCertificateService( logger.info("Saving %d certificates (batch_id=%d)", len(certificates), batch_id) - registry_lookup = RegistryOrganizationResolver.build_lookup( - [(cert.inn, cert.ogrn) for cert in certificates] - ) - - instances = [ - cls.model( - load_batch=batch_id, - issue_date=cert.issue_date, - 
issue_date_normalized=normalize_to_date(cert.issue_date), - certificate_number=cert.certificate_number, - expiry_date=cert.expiry_date, - expiry_date_normalized=normalize_to_date(cert.expiry_date), - certificate_file_url=cert.certificate_file_url, - organisation_name=cert.organisation_name, + source_records = [ + SourceRecordInput( + external_id=cert.certificate_number, + title=cert.certificate_number, + organization_name=cert.organisation_name, inn=cert.inn, ogrn=cert.ogrn, - registry_organization_id=RegistryOrganizationResolver.resolve_organization_id( - lookup=registry_lookup, - inn=cert.inn, - ogrn=cert.ogrn, - ), + record_date=cert.issue_date, + url=cert.certificate_file_url, + payload={ + "load_batch": batch_id, + "issue_date": cert.issue_date, + "issue_date_normalized": _date_to_iso( + normalize_to_date(cert.issue_date), + ), + "certificate_number": cert.certificate_number, + "expiry_date": cert.expiry_date, + "expiry_date_normalized": _date_to_iso( + normalize_to_date(cert.expiry_date), + ), + "certificate_file_url": cert.certificate_file_url, + "organisation_name": cert.organisation_name, + "inn": cert.inn, + "ogrn": cert.ogrn, + }, ) for cert in certificates ] - before_count = cls.model.objects.filter(load_batch=batch_id).count() - cls.bulk_create_chunked( - instances, - chunk_size=chunk_size, - ignore_conflicts=True, # Skip duplicates by certificate_number + result = OrganizationSourceIngestionService.save_records( + source=ParserLoadLog.Source.INDUSTRIAL, + load_batch=batch_id, + records=source_records, ) - after_count = cls.model.objects.filter(load_batch=batch_id).count() - saved_count = max(0, after_count - before_count) + saved_count = result.created_records + result.updated_records logger.info("Saved %d certificates", saved_count) return saved_count @@ -522,7 +539,10 @@ class IndustrialCertificateService( inn: ИНН организации batch_id: Фильтр по пакету загрузки (опционально) """ - qs = cls.filter(inn=inn) + qs = 
OrganizationSourceRecord.objects.filter( + source=ParserLoadLog.Source.INDUSTRIAL, + extension__organization__inn=inn, + ) if batch_id: qs = qs.filter(load_batch=batch_id) return qs @@ -530,7 +550,10 @@ class IndustrialCertificateService( @classmethod def find_by_certificate_number(cls, certificate_number: str): """Найти сертификаты по номеру.""" - return cls.filter(certificate_number=certificate_number) + return OrganizationSourceRecord.objects.filter( + source=ParserLoadLog.Source.INDUSTRIAL, + external_id=certificate_number, + ) class ManufacturerService(BulkOperationsMixin, BaseService[ManufacturerRecord]): @@ -575,37 +598,31 @@ class ManufacturerService(BulkOperationsMixin, BaseService[ManufacturerRecord]): "Saving %d manufacturers (batch_id=%d)", len(manufacturers), batch_id ) - registry_lookup = RegistryOrganizationResolver.build_lookup( - [(manufacturer.inn, manufacturer.ogrn) for manufacturer in manufacturers] - ) - - items = [ - { - "load_batch": batch_id, - "full_legal_name": m.full_legal_name, - "inn": m.inn, - "ogrn": m.ogrn, - "address": m.address, - "registry_organization_id": RegistryOrganizationResolver.resolve_organization_id( - lookup=registry_lookup, - inn=m.inn, - ogrn=m.ogrn, - ), - } + source_records = [ + SourceRecordInput( + external_id=m.inn, + title=m.full_legal_name, + organization_name=m.full_legal_name, + inn=m.inn, + ogrn=m.ogrn, + payload={ + "load_batch": batch_id, + "full_legal_name": m.full_legal_name, + "inn": m.inn, + "ogrn": m.ogrn, + "address": m.address, + }, + ) for m in manufacturers ] - created_count, updated_count = cls.bulk_update_or_create( - items, - unique_fields=["inn"], - update_fields=[ - "load_batch", - "full_legal_name", - "ogrn", - "address", - "registry_organization_id", - ], + result = OrganizationSourceIngestionService.save_records( + source=ParserLoadLog.Source.MANUFACTURES, + load_batch=batch_id, + records=source_records, ) + created_count = result.created_records + updated_count = result.updated_records 
saved_count = created_count + updated_count logger.info( "Saved %d manufacturers (created=%d, updated=%d)", @@ -625,7 +642,10 @@ class ManufacturerService(BulkOperationsMixin, BaseService[ManufacturerRecord]): inn: ИНН организации batch_id: Фильтр по пакету загрузки (опционально) """ - qs = cls.filter(inn=inn) + qs = OrganizationSourceRecord.objects.filter( + source=ParserLoadLog.Source.MANUFACTURES, + extension__organization__inn=inn, + ) if batch_id: qs = qs.filter(load_batch=batch_id) return qs @@ -633,7 +653,10 @@ class ManufacturerService(BulkOperationsMixin, BaseService[ManufacturerRecord]): @classmethod def find_by_ogrn(cls, ogrn: str): """Найти производителей по ОГРН.""" - return cls.filter(ogrn=ogrn) + return OrganizationSourceRecord.objects.filter( + source=ParserLoadLog.Source.MANUFACTURES, + extension__organization__ogrn=ogrn, + ) class IndustrialProductService( @@ -667,88 +690,48 @@ class IndustrialProductService( "Saving %d industrial products (batch_id=%d)", len(products), batch_id ) - registry_lookup = RegistryOrganizationResolver.build_lookup( - [(product.inn, product.ogrn) for product in products] - ) - - items_by_registry_number = {} + source_records = [] + seen_registry_numbers: set[str] = set() for product in products: if not product.registry_number: continue - - items_by_registry_number[product.registry_number] = { - "load_batch": batch_id, - "full_organisation_name": product.full_organisation_name, - "ogrn": product.ogrn, - "inn": product.inn, - "registry_number": product.registry_number, - "product_name": product.product_name, - "product_model": product.product_model, - "okpd2_code": product.okpd2_code, - "tnved_code": product.tnved_code, - "regulatory_document": product.regulatory_document, - "registry_organization_id": RegistryOrganizationResolver.resolve_organization_id( - lookup=registry_lookup, + if product.registry_number in seen_registry_numbers: + continue + seen_registry_numbers.add(product.registry_number) + source_records.append( 
+ SourceRecordInput( + external_id=product.registry_number, + title=product.product_name, + organization_name=product.full_organisation_name, inn=product.inn, ogrn=product.ogrn, - ), - } - - existing_ids_by_registry_number = {} - registry_numbers = list(items_by_registry_number) - for i in range(0, len(registry_numbers), chunk_size): - chunk = registry_numbers[i : i + chunk_size] - existing_ids_by_registry_number.update( - cls.model.objects.filter(registry_number__in=chunk).values_list( - "registry_number", - "id", + payload={ + "load_batch": batch_id, + "full_organisation_name": product.full_organisation_name, + "ogrn": product.ogrn, + "inn": product.inn, + "registry_number": product.registry_number, + "product_name": product.product_name, + "product_model": product.product_model, + "okpd2_code": product.okpd2_code, + "tnved_code": product.tnved_code, + "regulatory_document": product.regulatory_document, + }, ) ) - create_instances = [] - update_instances = [] - now = timezone.now() - for registry_number, item in items_by_registry_number.items(): - existing_id = existing_ids_by_registry_number.get(registry_number) - if existing_id is None: - create_instances.append(cls.model(**item)) - continue + # The products XLSX is large enough to exceed the worker memory limit if we + # keep both parser DTOs and normalized source records alive during ingestion. 
+ products.clear() + seen_registry_numbers.clear() - update_instances.append( - cls.model( - id=existing_id, - updated_at=now, - **item, - ) - ) - - created_count = cls.bulk_create_chunked( - create_instances, - chunk_size=chunk_size, + result = OrganizationSourceIngestionService.save_records( + source=ParserLoadLog.Source.INDUSTRIAL_PRODUCTS, + load_batch=batch_id, + records=source_records, ) - updated_count = 0 - update_fields = [ - "load_batch", - "full_organisation_name", - "ogrn", - "inn", - "product_name", - "product_model", - "okpd2_code", - "tnved_code", - "regulatory_document", - "registry_organization_id", - "updated_at", - ] - for i in range(0, len(update_instances), chunk_size): - chunk = update_instances[i : i + chunk_size] - cls.model.objects.bulk_update( - chunk, - fields=update_fields, - batch_size=chunk_size, - ) - updated_count += len(chunk) - + created_count = result.created_records + updated_count = result.updated_records saved_count = created_count + updated_count logger.info( "Saved %d industrial products (created=%d, updated=%d)", @@ -761,7 +744,10 @@ class IndustrialProductService( @classmethod def find_by_inn(cls, inn: str, batch_id: int | None = None): """Найти продукцию по ИНН.""" - qs = cls.filter(inn=inn) + qs = OrganizationSourceRecord.objects.filter( + source=ParserLoadLog.Source.INDUSTRIAL_PRODUCTS, + extension__organization__inn=inn, + ) if batch_id: qs = qs.filter(load_batch=batch_id) return qs @@ -769,12 +755,18 @@ class IndustrialProductService( @classmethod def find_by_ogrn(cls, ogrn: str): """Найти продукцию по ОГРН.""" - return cls.filter(ogrn=ogrn) + return OrganizationSourceRecord.objects.filter( + source=ParserLoadLog.Source.INDUSTRIAL_PRODUCTS, + extension__organization__ogrn=ogrn, + ) @classmethod def find_by_registry_number(cls, registry_number: str): """Найти продукцию по регистрационному номеру.""" - return cls.filter(registry_number=registry_number) + return OrganizationSourceRecord.objects.filter( + 
source=ParserLoadLog.Source.INDUSTRIAL_PRODUCTS, + external_id=registry_number, + ) class GenericParserRecordService(BulkOperationsMixin, BaseService[GenericParserRecord]): @@ -863,54 +855,54 @@ class GenericParserRecordService(BulkOperationsMixin, BaseService[GenericParserR ) unique_records.setdefault((record_source, record.external_id), record) - source_values = {record_source for record_source, _ in unique_records} - external_ids = {external_id for _, external_id in unique_records} - existing_keys = set( - cls.model.objects.filter( - source__in=source_values, - external_id__in=external_ids, - ).values_list("source", "external_id") - ) - registry_lookup = RegistryOrganizationResolver.build_lookup( - [(record.inn, record.ogrn) for record in unique_records.values()] - ) - instances = [ - cls.model( - load_batch=batch_id, - source=record_source, - external_id=record.external_id, - inn=record.inn, - ogrn=record.ogrn, - organisation_name=record.organisation_name, - title=record.title, - record_date=record.record_date, - amount=record.amount, - status=record.status, - url=record.url, - payload=record.payload, - registry_organization_id=RegistryOrganizationResolver.resolve_organization_id( - lookup=registry_lookup, + records_by_source: dict[str, list[SourceRecordInput]] = defaultdict(list) + for (record_source, _external_id), record in unique_records.items(): + payload = dict(record.payload) if isinstance(record.payload, dict) else {} + payload.update( + { + "load_batch": batch_id, + "source": record_source, + "external_id": record.external_id, + "inn": record.inn, + "ogrn": record.ogrn, + "organisation_name": record.organisation_name, + "title": record.title, + "record_date": record.record_date, + "amount": str(record.amount) if record.amount is not None else None, + "status": record.status, + "url": record.url, + } + ) + records_by_source[record_source].append( + SourceRecordInput( + external_id=record.external_id, + title=record.title, + 
organization_name=record.organisation_name, inn=record.inn, ogrn=record.ogrn, - ), + record_date=record.record_date, + amount=record.amount, + status=record.status, + url=record.url, + payload=payload, + ) ) - for (record_source, external_id), record in unique_records.items() - if (record_source, external_id) not in existing_keys - ] - if not instances: - logger.info("No new generic records to save (source=%s)", source) - return 0 - return cls._create_with_exact_count( - instances, - unique_fields=["source", "external_id"], - chunk_size=chunk_size, - ) + saved_count = 0 + for record_source, source_records in records_by_source.items(): + result = OrganizationSourceIngestionService.save_records( + source=record_source, + load_batch=batch_id, + records=source_records, + ) + saved_count += result.created_records + result.updated_records + + return saved_count @classmethod def find_by_inn(cls, inn: str, source: str | None = None): """Найти generic records по ИНН.""" - qs = cls.filter(inn=inn) + qs = OrganizationSourceRecord.objects.filter(extension__organization__inn=inn) if source: qs = qs.filter(source=source) return qs @@ -918,7 +910,7 @@ class GenericParserRecordService(BulkOperationsMixin, BaseService[GenericParserR @classmethod def find_by_ogrn(cls, ogrn: str, source: str | None = None): """Найти generic records по ОГРН.""" - qs = cls.filter(ogrn=ogrn) + qs = OrganizationSourceRecord.objects.filter(extension__organization__ogrn=ogrn) if source: qs = qs.filter(source=source) return qs @@ -1458,63 +1450,52 @@ class InspectionService(BulkOperationsMixin, BaseService[InspectionRecord]): data_month, ) - registry_lookup = RegistryOrganizationResolver.build_lookup( - [(inspection.inn, inspection.ogrn) for inspection in inspections] - ) - - items = [ - { - "load_batch": batch_id, - "registration_number": insp.registration_number, - "inn": insp.inn, - "ogrn": insp.ogrn, - "organisation_name": insp.organisation_name, - "control_authority": insp.control_authority, - 
"inspection_type": insp.inspection_type, - "inspection_form": insp.inspection_form, - "start_date": insp.start_date, - "start_date_normalized": normalize_to_date(insp.start_date), - "end_date": insp.end_date, - "end_date_normalized": normalize_to_date(insp.end_date), - "status": insp.status, - "legal_basis": insp.legal_basis, - "result": insp.result, - "is_federal_law_248": is_federal_law_248, - "data_year": data_year, - "data_month": data_month, - "registry_organization_id": RegistryOrganizationResolver.resolve_organization_id( - lookup=registry_lookup, - inn=insp.inn, - ogrn=insp.ogrn, - ), - } + source_records = [ + SourceRecordInput( + external_id=insp.registration_number, + title=insp.inspection_type + or insp.control_authority + or insp.registration_number, + organization_name=insp.organisation_name, + inn=insp.inn, + ogrn=insp.ogrn, + record_date=insp.start_date, + status=insp.status, + payload={ + "load_batch": batch_id, + "registration_number": insp.registration_number, + "inn": insp.inn, + "ogrn": insp.ogrn, + "organisation_name": insp.organisation_name, + "control_authority": insp.control_authority, + "inspection_type": insp.inspection_type, + "inspection_form": insp.inspection_form, + "start_date": insp.start_date, + "start_date_normalized": _date_to_iso( + normalize_to_date(insp.start_date), + ), + "end_date": insp.end_date, + "end_date_normalized": _date_to_iso( + normalize_to_date(insp.end_date), + ), + "status": insp.status, + "legal_basis": insp.legal_basis, + "result": insp.result, + "is_federal_law_248": is_federal_law_248, + "data_year": data_year, + "data_month": data_month, + }, + ) for insp in inspections ] - created_count, updated_count = cls.bulk_update_or_create( - items, - unique_fields=["registration_number"], - update_fields=[ - "load_batch", - "inn", - "ogrn", - "organisation_name", - "control_authority", - "inspection_type", - "inspection_form", - "start_date", - "start_date_normalized", - "end_date", - "end_date_normalized", - 
"status", - "legal_basis", - "result", - "is_federal_law_248", - "data_year", - "data_month", - "registry_organization_id", - ], + result = OrganizationSourceIngestionService.save_records( + source=ParserLoadLog.Source.INSPECTIONS, + load_batch=batch_id, + records=source_records, ) + created_count = result.created_records + updated_count = result.updated_records saved_count = created_count + updated_count logger.info( "Saved %d inspections (created=%d, updated=%d)", @@ -1538,17 +1519,24 @@ class InspectionService(BulkOperationsMixin, BaseService[InspectionRecord]): Returns: Кортеж (year, month) или (None, None) если данных нет """ - last_record = ( - cls.model.objects.filter(is_federal_law_248=is_federal_law_248) - .exclude(data_year__isnull=True) - .order_by("-data_year", "-data_month") - .values("data_year", "data_month") - .first() + payloads = ( + OrganizationSourceRecord.objects.filter( + source=ParserLoadLog.Source.INSPECTIONS, + payload__is_federal_law_248=is_federal_law_248, + ) + .exclude(payload__data_year__isnull=True) + .values_list("payload", flat=True) ) - if last_record: - return last_record["data_year"], last_record["data_month"] - return None, None + periods: list[tuple[int, int | None]] = [] + for payload in payloads: + year = payload.get("data_year") + if year is None: + continue + periods.append((int(year), _optional_int(payload.get("data_month")))) + if not periods: + return None, None + return max(periods, key=lambda value: (value[0], value[1] or 0)) @classmethod def has_data_for_period( @@ -1568,10 +1556,11 @@ class InspectionService(BulkOperationsMixin, BaseService[InspectionRecord]): Returns: True если данные есть """ - return cls.model.objects.filter( - data_year=year, - data_month=month, - is_federal_law_248=is_federal_law_248, + return OrganizationSourceRecord.objects.filter( + source=ParserLoadLog.Source.INSPECTIONS, + payload__data_year=year, + payload__data_month=month, + payload__is_federal_law_248=is_federal_law_248, ).exists() 
@classmethod @@ -1583,7 +1572,10 @@ class InspectionService(BulkOperationsMixin, BaseService[InspectionRecord]): inn: ИНН организации batch_id: Фильтр по пакету загрузки (опционально) """ - qs = cls.filter(inn=inn) + qs = OrganizationSourceRecord.objects.filter( + source=ParserLoadLog.Source.INSPECTIONS, + extension__organization__inn=inn, + ) if batch_id: qs = qs.filter(load_batch=batch_id) return qs @@ -1591,7 +1583,10 @@ class InspectionService(BulkOperationsMixin, BaseService[InspectionRecord]): @classmethod def find_by_registration_number(cls, registration_number: str): """Найти проверки по учётному номеру.""" - return cls.filter(registration_number=registration_number) + return OrganizationSourceRecord.objects.filter( + source=ParserLoadLog.Source.INSPECTIONS, + external_id=registration_number, + ) @classmethod def find_by_control_authority(cls, authority: str, batch_id: int | None = None): @@ -1602,7 +1597,10 @@ class InspectionService(BulkOperationsMixin, BaseService[InspectionRecord]): authority: Наименование контрольного органа (частичное совпадение) batch_id: Фильтр по пакету загрузки (опционально) """ - qs = cls.filter(control_authority__icontains=authority) + qs = OrganizationSourceRecord.objects.filter( + source=ParserLoadLog.Source.INSPECTIONS, + payload__control_authority__icontains=authority, + ) if batch_id: qs = qs.filter(load_batch=batch_id) return qs @@ -1661,74 +1659,59 @@ class ProcurementService(BulkOperationsMixin, BaseService[ProcurementRecord]): data_month, ) - registry_lookup = RegistryOrganizationResolver.build_lookup( - [ - (procurement.customer_inn, procurement.customer_ogrn) - for procurement in procurements - ] - ) - - items = [ - { - "load_batch": batch_id, - "purchase_number": proc.purchase_number, - "purchase_name": proc.purchase_name, - "customer_inn": proc.customer_inn, - "customer_kpp": proc.customer_kpp, - "customer_ogrn": proc.customer_ogrn, - "customer_name": proc.customer_name, - "max_price": proc.max_price, - 
"max_price_amount": normalize_to_decimal(proc.max_price), - "currency_code": proc.currency_code, - "placement_method": proc.placement_method, - "publish_date": proc.publish_date, - "publish_date_normalized": normalize_to_date(proc.publish_date), - "end_date": proc.end_date, - "end_date_normalized": normalize_to_date(proc.end_date), - "status": proc.status, - "law_type": proc.law_type, - "purchase_object_info": proc.purchase_object_info, - "href": proc.href, - "region_code": region_code or "", - "data_year": data_year, - "data_month": data_month, - "registry_organization_id": RegistryOrganizationResolver.resolve_organization_id( - lookup=registry_lookup, + source_records = [] + for proc in procurements: + amount = normalize_to_decimal(proc.max_price) + publish_date_normalized = normalize_to_date(proc.publish_date) + end_date_normalized = normalize_to_date(proc.end_date) + source_records.append( + SourceRecordInput( + external_id=proc.purchase_number, + title=proc.purchase_name, + organization_name=proc.customer_name, inn=proc.customer_inn, + kpp=proc.customer_kpp, ogrn=proc.customer_ogrn, - ), - } - for proc in procurements - ] + record_date=proc.publish_date, + amount=amount, + status=proc.status, + url=proc.href, + payload={ + "load_batch": batch_id, + "purchase_number": proc.purchase_number, + "purchase_name": proc.purchase_name, + "customer_inn": proc.customer_inn, + "customer_kpp": proc.customer_kpp, + "customer_ogrn": proc.customer_ogrn, + "customer_name": proc.customer_name, + "max_price": proc.max_price, + "max_price_amount": str(amount) if amount is not None else None, + "currency_code": proc.currency_code, + "placement_method": proc.placement_method, + "publish_date": proc.publish_date, + "publish_date_normalized": _date_to_iso( + publish_date_normalized, + ), + "end_date": proc.end_date, + "end_date_normalized": _date_to_iso(end_date_normalized), + "status": proc.status, + "law_type": proc.law_type, + "purchase_object_info": proc.purchase_object_info, + 
"href": proc.href, + "region_code": region_code or "", + "data_year": data_year, + "data_month": data_month, + }, + ) + ) - created_count, updated_count = cls.bulk_update_or_create( - items, - unique_fields=["purchase_number"], - update_fields=[ - "load_batch", - "purchase_name", - "customer_inn", - "customer_kpp", - "customer_ogrn", - "customer_name", - "max_price", - "max_price_amount", - "currency_code", - "placement_method", - "publish_date", - "publish_date_normalized", - "end_date", - "end_date_normalized", - "status", - "law_type", - "purchase_object_info", - "href", - "region_code", - "data_year", - "data_month", - "registry_organization_id", - ], + result = OrganizationSourceIngestionService.save_records( + source=ParserLoadLog.Source.PROCUREMENTS, + load_batch=batch_id, + records=source_records, ) + created_count = result.created_records + updated_count = result.updated_records saved_count = created_count + updated_count logger.info( "Saved %d procurements (created=%d, updated=%d)", @@ -1753,22 +1736,24 @@ class ProcurementService(BulkOperationsMixin, BaseService[ProcurementRecord]): Returns: Кортеж (year, month) или (None, None) если данных нет """ - qs = cls.model.objects.exclude(data_year__isnull=True) + qs = OrganizationSourceRecord.objects.filter( + source=ParserLoadLog.Source.PROCUREMENTS, + ).exclude(payload__data_year__isnull=True) if region_code: - qs = qs.filter(region_code=region_code) + qs = qs.filter(payload__region_code=region_code) if law_type: - qs = qs.filter(law_type=law_type) + qs = qs.filter(payload__law_type=law_type) - last_record = ( - qs.order_by("-data_year", "-data_month") - .values("data_year", "data_month") - .first() - ) - - if last_record: - return last_record["data_year"], last_record["data_month"] - return None, None + periods: list[tuple[int, int | None]] = [] + for payload in qs.values_list("payload", flat=True): + year = payload.get("data_year") + if year is None: + continue + periods.append((int(year), 
_optional_int(payload.get("data_month")))) + if not periods: + return None, None + return max(periods, key=lambda value: (value[0], value[1] or 0)) @classmethod def has_data_for_period( @@ -1790,12 +1775,16 @@ class ProcurementService(BulkOperationsMixin, BaseService[ProcurementRecord]): Returns: True если данные есть """ - qs = cls.model.objects.filter(data_year=year, data_month=month) + qs = OrganizationSourceRecord.objects.filter( + source=ParserLoadLog.Source.PROCUREMENTS, + payload__data_year=year, + payload__data_month=month, + ) if region_code: - qs = qs.filter(region_code=region_code) + qs = qs.filter(payload__region_code=region_code) if law_type: - qs = qs.filter(law_type=law_type) + qs = qs.filter(payload__law_type=law_type) return qs.exists() @@ -1808,7 +1797,10 @@ class ProcurementService(BulkOperationsMixin, BaseService[ProcurementRecord]): inn: ИНН заказчика batch_id: Фильтр по пакету загрузки (опционально) """ - qs = cls.filter(customer_inn=inn) + qs = OrganizationSourceRecord.objects.filter( + source=ParserLoadLog.Source.PROCUREMENTS, + extension__organization__inn=inn, + ) if batch_id: qs = qs.filter(load_batch=batch_id) return qs @@ -1816,7 +1808,10 @@ class ProcurementService(BulkOperationsMixin, BaseService[ProcurementRecord]): @classmethod def find_by_purchase_number(cls, purchase_number: str): """Найти закупки по реестровому номеру.""" - return cls.filter(purchase_number=purchase_number) + return OrganizationSourceRecord.objects.filter( + source=ParserLoadLog.Source.PROCUREMENTS, + external_id=purchase_number, + ) @classmethod def find_by_region(cls, region_code: str, batch_id: int | None = None): @@ -1827,7 +1822,10 @@ class ProcurementService(BulkOperationsMixin, BaseService[ProcurementRecord]): region_code: Код региона batch_id: Фильтр по пакету загрузки (опционально) """ - qs = cls.filter(region_code=region_code) + qs = OrganizationSourceRecord.objects.filter( + source=ParserLoadLog.Source.PROCUREMENTS, + payload__region_code=region_code, 
+ ) if batch_id: qs = qs.filter(load_batch=batch_id) return qs @@ -1841,13 +1839,16 @@ class ProcurementService(BulkOperationsMixin, BaseService[ProcurementRecord]): customer_name: Наименование заказчика (частичное совпадение) batch_id: Фильтр по пакету загрузки (опционально) """ - qs = cls.filter(customer_name__icontains=customer_name) + qs = OrganizationSourceRecord.objects.filter( + source=ParserLoadLog.Source.PROCUREMENTS, + payload__customer_name__icontains=customer_name, + ) if batch_id: qs = qs.filter(load_batch=batch_id) return qs -class FNSReportService(BulkOperationsMixin, BaseService[FinancialReport]): +class FNSReportService(BulkOperationsMixin, BaseService[OrganizationSourceRecord]): """ Сервис для работы с бухгалтерской отчетностью ФНС. @@ -1857,7 +1858,7 @@ class FNSReportService(BulkOperationsMixin, BaseService[FinancialReport]): - Поиск отчетов по ОГРН/external_id """ - model = FinancialReport + model = OrganizationSourceRecord @classmethod @transaction.atomic @@ -1871,7 +1872,7 @@ class FNSReportService(BulkOperationsMixin, BaseService[FinancialReport]): source: str, batch_id: int, lines_data: list[dict], - ) -> FinancialReport: + ) -> OrganizationSourceRecord: """ Сохранить отчет и все его строки. @@ -1885,7 +1886,7 @@ class FNSReportService(BulkOperationsMixin, BaseService[FinancialReport]): lines_data: Список словарей с данными строк Returns: - Созданный FinancialReport + Созданная или обновленная запись источника организации. 
""" logger.info( "Сохранение отчета external_id=%s, ogrn=%s, lines=%d", @@ -1894,80 +1895,113 @@ class FNSReportService(BulkOperationsMixin, BaseService[FinancialReport]): len(lines_data), ) - registry_lookup = RegistryOrganizationResolver.build_lookup([(None, ogrn)]) - - report = cls.create( - external_id=external_id, - ogrn=ogrn, - registry_organization_id=RegistryOrganizationResolver.resolve_organization_id( - lookup=registry_lookup, - inn=None, - ogrn=ogrn, - ), - file_name=file_name, - file_hash=file_hash, - source=source, - load_batch=batch_id, - status=FinancialReport.Status.SUCCESS, - ) - - if lines_data: - line_instances = [ - FinancialReportLine( - report=report, - form_code=line["form_code"], - line_code=line["line_code"], - line_name=line["line_name"], - year=line["year"], - period_start=line.get("period_start"), - period_end=line.get("period_end"), - ) - for line in lines_data - ] - FinancialReportLine.objects.bulk_create( - line_instances, ignore_conflicts=True + financial_lines = [ + SourceFinancialLineInput( + form_code=str(line["form_code"]), + line_code=str(line["line_code"]), + line_name=str(line["line_name"]), + year=int(line["year"]), + period_start=_optional_int(line.get("period_start")), + period_end=_optional_int(line.get("period_end")), ) - logger.info("Сохранено %d строк отчета", len(line_instances)) - + for line in lines_data + ] + OrganizationSourceIngestionService.save_records( + source=ParserLoadLog.Source.FNS_REPORTS, + load_batch=batch_id, + records=[ + SourceRecordInput( + external_id=external_id, + title=file_name or external_id, + organization_name=ogrn or external_id, + ogrn=ogrn, + status=FinancialReport.Status.SUCCESS, + payload={ + "load_batch": batch_id, + "external_id": external_id, + "ogrn": ogrn, + "file_name": file_name, + "file_hash": file_hash, + "source": source, + "status": FinancialReport.Status.SUCCESS, + "lines_count": len(lines_data), + }, + financial_lines=financial_lines, + ) + ], + ) + report = 
OrganizationSourceRecord.objects.get( + source=ParserLoadLog.Source.FNS_REPORTS, + external_id=external_id, + ) + logger.info("Сохранено %d строк отчета", len(financial_lines)) return report @classmethod def exists_by_hash(cls, file_hash: str) -> bool: """Проверить, существует ли отчет с таким хешем.""" - return cls.model.objects.filter(file_hash=file_hash).exists() + return cls.get_queryset().filter(payload__file_hash=file_hash).exists() @classmethod def exists_by_external_id(cls, external_id: str) -> bool: """Проверить, существует ли отчет с таким external_id.""" - return cls.model.objects.filter(external_id=external_id).exists() + return cls.get_queryset().filter(external_id=external_id).exists() @classmethod def find_by_ogrn(cls, ogrn: str): """Найти все отчеты по ОГРН.""" - return cls.filter(ogrn=ogrn) + return cls.get_queryset().filter( + Q(extension__organization__ogrn=ogrn) | Q(payload__ogrn=ogrn), + ) @classmethod def find_by_external_id(cls, external_id: str): """Найти отчет по external_id.""" - return cls.filter(external_id=external_id).first() + return cls.get_queryset().filter(external_id=external_id).first() @classmethod - def mark_processing(cls, report: FinancialReport) -> FinancialReport: + def get_queryset(cls): + return OrganizationSourceRecord.objects.filter( + source=ParserLoadLog.Source.FNS_REPORTS, + ) + + @classmethod + def mark_processing( + cls, + report: OrganizationSourceRecord, + ) -> OrganizationSourceRecord: """Отметить отчет как обрабатываемый.""" - return cls.update(report, status=FinancialReport.Status.PROCESSING) + return cls._update_status(report, FinancialReport.Status.PROCESSING) @classmethod - def mark_success(cls, report: FinancialReport) -> FinancialReport: + def mark_success(cls, report: OrganizationSourceRecord) -> OrganizationSourceRecord: """Отметить отчет как успешно обработанный.""" - return cls.update(report, status=FinancialReport.Status.SUCCESS) + return cls._update_status(report, FinancialReport.Status.SUCCESS) 
@classmethod def mark_failed( - cls, report: FinancialReport, error_message: str - ) -> FinancialReport: + cls, report: OrganizationSourceRecord, error_message: str + ) -> OrganizationSourceRecord: """Отметить отчет как неудавшийся.""" - return cls.update( + return cls._update_status( report, - status=FinancialReport.Status.FAILED, + FinancialReport.Status.FAILED, error_message=error_message, ) + + @classmethod + def _update_status( + cls, + report: OrganizationSourceRecord, + status: str, + *, + error_message: str = "", + ) -> OrganizationSourceRecord: + payload = dict(report.payload or {}) + payload["status"] = status + if error_message: + payload["error_message"] = error_message + report.status = status + report.payload = payload + report.save(update_fields=["status", "payload", "updated_at"]) + return report diff --git a/src/apps/parsers/source_cards.py b/src/apps/parsers/source_cards.py index be7cfb6..55627bc 100644 --- a/src/apps/parsers/source_cards.py +++ b/src/apps/parsers/source_cards.py @@ -12,21 +12,13 @@ from apps.core.models import JobStatus from apps.core.services import BackgroundJobService from apps.parsers.models import ( VACANCY_RECORD_SOURCES, - FinancialReport, - FinancialReportLine, - GenericParserRecord, - IndustrialCertificateRecord, - IndustrialProductRecord, - InspectionRecord, - ManufacturerRecord, ParserLoadLog, - ProcurementRecord, ) from django.conf import settings -from django.db.models import CharField, F, Max, Q, Value -from django.db.models.functions import Coalesce, NullIf +from django.db.models import Max, Q from django.http import Http404 from django.utils import timezone +from organizations.models import OrganizationSourceRecord from rest_framework.exceptions import ValidationError SUCCESSFUL_LOAD_STATUSES = {"success", "skipped"} @@ -328,10 +320,16 @@ GENERIC_RECORD_SOURCES_BY_ITEM_CODE = { "fstec": ParserLoadLog.Source.FSTEC, "trudvsem": ParserLoadLog.Source.TRUDVSEM, } -PROCUREMENT_BUYER_ITEM_CODES = { - "procurements_44fz", 
- "procurements_223fz", - "contracts", + +SOURCE_RECORD_SOURCES_BY_ITEM_CODE = { + item.code: ( + list(VACANCY_RECORD_SOURCES) + if item.parser_source == ParserLoadLog.Source.TRUDVSEM + else [item.parser_source] + ) + for definition in SOURCE_CARD_DEFINITIONS + for item in definition.source_items + if item.parser_source } @@ -779,116 +777,22 @@ class SourceCardService: @classmethod def _get_source_records_count(cls, item_code: str) -> int: - generic_sources = cls._get_generic_sources_for_item_code(item_code) - if generic_sources: - return GenericParserRecord.objects.filter( - source__in=generic_sources - ).count() - if item_code == "fns_reports": - return FinancialReportLine.objects.count() - if item_code == "industrial": - return IndustrialCertificateRecord.objects.count() - if item_code == "manufactures": - return ManufacturerRecord.objects.count() - if item_code == "industrial_products": - return IndustrialProductRecord.objects.count() - if item_code == "inspections": - return InspectionRecord.objects.count() - if item_code == "procurements": - return ProcurementRecord.objects.count() - return 0 + return cls._get_source_record_queryset(item_code).count() @classmethod def _get_source_organizations_count(cls, item_code: str) -> int: - generic_sources = cls._get_generic_sources_for_item_code(item_code) - if generic_sources: - if item_code in PROCUREMENT_BUYER_ITEM_CODES: - return cls._get_generic_procurement_buyer_identities( - generic_sources - ).count() - return ( - GenericParserRecord.objects.filter(source__in=generic_sources) - .exclude(inn="") - .values("inn") - .distinct() - .count() - ) - if item_code == "fns_reports": - return ( - FinancialReport.objects.exclude(ogrn="") - .values("ogrn") - .distinct() - .count() - ) - if item_code == "industrial": - return ( - IndustrialCertificateRecord.objects.exclude(inn="") - .values("inn") - .distinct() - .count() - ) - if item_code == "manufactures": - return ( - ManufacturerRecord.objects.exclude(inn="") - 
.values("inn") - .distinct() - .count() - ) - if item_code == "industrial_products": - return ( - IndustrialProductRecord.objects.exclude(inn="") - .values("inn") - .distinct() - .count() - ) - if item_code == "inspections": - return ( - InspectionRecord.objects.exclude(inn="") - .values("inn") - .distinct() - .count() - ) - if item_code == "procurements": - return ( - ProcurementRecord.objects.exclude(customer_inn="") - .values("customer_inn") - .distinct() - .count() - ) - return 0 + return ( + cls._get_source_record_queryset(item_code) + .values("extension__organization_id") + .distinct() + .count() + ) @classmethod def _get_source_data_timestamp(cls, item_code: str): - generic_sources = cls._get_generic_sources_for_item_code(item_code) - if generic_sources: - return GenericParserRecord.objects.filter( - source__in=generic_sources - ).aggregate(last_updated=Max("updated_at"))["last_updated"] - if item_code == "fns_reports": - return FinancialReport.objects.aggregate(last_updated=Max("updated_at"))[ - "last_updated" - ] - if item_code == "industrial": - return IndustrialCertificateRecord.objects.aggregate( - last_updated=Max("updated_at") - )["last_updated"] - if item_code == "manufactures": - return ManufacturerRecord.objects.aggregate(last_updated=Max("updated_at"))[ - "last_updated" - ] - if item_code == "industrial_products": - return IndustrialProductRecord.objects.aggregate( - last_updated=Max("updated_at") - )["last_updated"] - if item_code == "inspections": - return InspectionRecord.objects.aggregate(last_updated=Max("updated_at"))[ - "last_updated" - ] - if item_code == "procurements": - return ProcurementRecord.objects.aggregate(last_updated=Max("updated_at"))[ - "last_updated" - ] - return None + return cls._get_source_record_queryset(item_code).aggregate( + last_updated=Max("updated_at") + )["last_updated"] @classmethod def _get_card_organizations_count( @@ -896,54 +800,32 @@ class SourceCardService: definition: SourceCardDefinition, source_items: 
list[dict[str, Any]], ) -> int: - if definition.slug == "public-procurements": - generic_sources = cls._get_generic_sources_for_definition(definition) - legacy_inns = ( - ProcurementRecord.objects.exclude(customer_inn="") - .annotate(buyer_identity=F("customer_inn")) - .order_by() - .values_list("buyer_identity", flat=True) - .distinct() - ) - generic_inns = cls._get_generic_procurement_buyer_identities( - generic_sources - ) - return legacy_inns.union(generic_inns).count() - - if definition.slug != "manufacturers-and-products": - generic_sources = cls._get_generic_sources_for_definition(definition) - if generic_sources and len(generic_sources) == len(definition.source_items): - return ( - GenericParserRecord.objects.filter(source__in=generic_sources) - .exclude(inn="") - .values("inn") - .distinct() - .count() - ) + source_codes = [item.code for item in definition.source_items] + sources = cls._get_sources_for_item_codes(source_codes) + if not sources: return sum(item["organizations_count"] for item in source_items) - - industrial_inns = ( - IndustrialCertificateRecord.objects.exclude(inn="") - .order_by() - .values_list("inn", flat=True) + return ( + OrganizationSourceRecord.objects.filter(source__in=sources) + .values("extension__organization_id") .distinct() + .count() ) - manufacturer_inns = ( - ManufacturerRecord.objects.exclude(inn="") - .order_by() - .values_list("inn", flat=True) - .distinct() - ) - product_inns = ( - IndustrialProductRecord.objects.exclude(inn="") - .order_by() - .values_list("inn", flat=True) - .distinct() - ) - return industrial_inns.union(manufacturer_inns, product_inns).count() @staticmethod def _get_generic_sources_for_item_code(item_code: str) -> list[str]: + return SourceCardService._get_sources_for_item_code(item_code) + + @staticmethod + def _get_source_record_queryset(item_code: str): + return OrganizationSourceRecord.objects.filter( + source__in=SourceCardService._get_sources_for_item_code(item_code), + ) + + @staticmethod + 
def _get_sources_for_item_code(item_code: str) -> list[str]: + sources = SOURCE_RECORD_SOURCES_BY_ITEM_CODE.get(item_code) + if sources: + return list(sources) generic_source = GENERIC_RECORD_SOURCES_BY_ITEM_CODE.get(item_code) if not generic_source: return [] @@ -952,35 +834,21 @@ class SourceCardService: return [generic_source] @staticmethod - def _get_generic_procurement_buyer_identities(generic_sources: list[str]): - return ( - GenericParserRecord.objects.filter(source__in=generic_sources) - .annotate( - buyer_identity=Coalesce( - NullIf(F("inn"), Value("")), - NullIf(F("organisation_name"), Value("")), - output_field=CharField(), - ) + def _get_sources_for_item_codes(item_codes: list[str]) -> list[str]: + return list( + dict.fromkeys( + source + for item_code in item_codes + for source in SourceCardService._get_sources_for_item_code(item_code) ) - .exclude(buyer_identity__isnull=True) - .order_by() - .values_list("buyer_identity", flat=True) - .distinct() ) @staticmethod def _get_generic_sources_for_definition( definition: SourceCardDefinition, ) -> list[str]: - return list( - dict.fromkeys( - source - for item in definition.source_items - if item.code in GENERIC_RECORD_SOURCES_BY_ITEM_CODE - for source in SourceCardService._get_generic_sources_for_item_code( - item.code - ) - ) + return SourceCardService._get_sources_for_item_codes( + [item.code for item in definition.source_items] ) @classmethod diff --git a/src/apps/parsers/tasks.py b/src/apps/parsers/tasks.py index b729140..539d188 100644 --- a/src/apps/parsers/tasks.py +++ b/src/apps/parsers/tasks.py @@ -54,7 +54,6 @@ from apps.parsers.services import ( from apps.parsers.source_registry import PARSER_SOURCES from celery import shared_task from django.conf import settings -from django.db import transaction from registers.models import RegistryMembershipPeriod from requests.adapters import BaseAdapter @@ -226,22 +225,6 @@ def _get_or_create_background_job( return job -def 
_queue_organization_snapshot_refresh(source: str, batch_id: int) -> None: - """Queue snapshot refresh only after parser writes are committed.""" - - def enqueue() -> None: - from organizations.tasks import ( - refresh_organization_data_snapshots_for_parser_batch, - ) - - refresh_organization_data_snapshots_for_parser_batch.delay( - source=str(source), - batch_id=batch_id, - ) - - transaction.on_commit(enqueue) - - def _run_generic_parser( self, *, @@ -284,7 +267,6 @@ def _run_generic_parser( ) result = {"batch_id": batch_id, "saved": saved_count, "status": "success"} job.complete(result=result) - _queue_organization_snapshot_refresh(source, batch_id) return result except ParserSourceSkipped as e: message = str(e) @@ -350,7 +332,6 @@ def _run_inspection_parser( ) result = {"batch_id": batch_id, "saved": saved_count, "status": "success"} job.complete(result=result) - _queue_organization_snapshot_refresh(source, batch_id) return result except ParserSourceSkipped as e: message = str(e) @@ -523,20 +504,36 @@ def _checko_bankruptcy_items( fallback_name: str, ) -> list[GenericParserItem]: """Преобразовать банкротные сообщения Checko в generic records.""" - records: list[GenericParserItem] = [] inn = str(getattr(company, "inn", "") or fallback_inn) ogrn = str(getattr(company, "ogrn", "") or fallback_ogrn) name = getattr(company, "short_name", None) or fallback_name + messages_by_external_id: dict[str, list[dict[str, str]]] = {} + for message in getattr(company, "bankruptcy", ()): message_type = getattr(message, "type", "") or "Сообщение ЕФРСБ" message_date = getattr(message, "date", "") or "" case_number = getattr(message, "case_number", None) or "" external_id = _fedresurs_external_id( inn=inn, - message_type=message_type, - message_date=message_date, + ogrn=ogrn, case_number=case_number, ) + messages_by_external_id.setdefault(external_id, []).append( + { + "case_number": case_number, + "date": message_date, + "type": message_type, + } + ) + + records: 
list[GenericParserItem] = [] + for external_id, messages in messages_by_external_id.items(): + messages = sorted( + messages, + key=lambda item: item["date"], + reverse=True, + ) + latest_message = messages[0] records.append( GenericParserItem( source="fedresurs_bankruptcy", @@ -544,18 +541,20 @@ def _checko_bankruptcy_items( inn=inn, ogrn=ogrn, organisation_name=name, - title=message_type, - record_date=message_date, - status=message_type, + title=latest_message["type"], + record_date=latest_message["date"], + status=latest_message["type"], payload={ "provider": "checko", "declared_source": "ЕФРСБ", "inn": inn, "ogrn": ogrn, "organisation_name": name, - "type": message_type, - "date": message_date, - "case_number": case_number, + "type": latest_message["type"], + "date": latest_message["date"], + "case_number": latest_message["case_number"], + "messages": messages, + "messages_count": len(messages), }, ) ) @@ -565,14 +564,15 @@ def _checko_bankruptcy_items( def _fedresurs_external_id( *, inn: str, - message_type: str, - message_date: str, case_number: str, + ogrn: str = "", ) -> str: - """Стабильный ID для ЕФРСБ-сообщения из fallback-источника.""" - raw = f"{inn}:{message_type}:{message_date}:{case_number}" - digest = hashlib.sha256(raw.encode("utf-8")).hexdigest()[:24] - return f"checko-fedresurs:{digest}" + """Стабильный ID для процедуры банкротства из fallback-источника.""" + identity = _normalize_identifier(inn) or _normalize_identifier(ogrn) + normalized_case_number = re.sub(r"\s+", "", str(case_number or "")).upper() + if normalized_case_number: + return f"checko-fedresurs:{identity}:{normalized_case_number}" + return f"checko-fedresurs:{identity}" @dataclass(frozen=True) @@ -1272,24 +1272,23 @@ def _process_fns_file_sync( # Завершаем job.complete( result={ - "report_id": report.id, + "source_record_uid": str(report.uid), "external_id": parsed.external_id, "ogrn": parsed.ogrn, "lines_count": len(parsed.lines), } ) - 
_queue_organization_snapshot_refresh(source, batch_id) logger.info( - "FNS file processed: %s (report_id=%d, lines=%d)", + "FNS file processed: %s (source_record_uid=%s, lines=%d)", file_path.name, - report.id, + report.uid, len(parsed.lines), ) return { "status": "success", - "report_id": report.id, + "source_record_uid": str(report.uid), "external_id": parsed.external_id, "ogrn": parsed.ogrn, "lines_count": len(parsed.lines), @@ -1399,7 +1398,6 @@ def parse_industrial_production( # Завершаем BackgroundJob job.complete(result={"batch_id": batch_id, "saved": saved_count}) - _queue_organization_snapshot_refresh(source, batch_id) logger.info( "Industrial production parsing completed (batch_id=%d, saved=%d)", @@ -1492,7 +1490,6 @@ def parse_manufactures( # Завершаем BackgroundJob job.complete(result={"batch_id": batch_id, "saved": saved_count}) - _queue_organization_snapshot_refresh(source, batch_id) logger.info( "Manufactures parsing completed (batch_id=%d, saved=%d)", @@ -1581,7 +1578,6 @@ def parse_industrial_products( ) job.complete(result={"batch_id": batch_id, "saved": saved_count}) - _queue_organization_snapshot_refresh(source, batch_id) logger.info( "Industrial products parsing completed (batch_id=%d, saved=%d)", @@ -1749,7 +1745,6 @@ def parse_inspections( # Завершаем BackgroundJob job.complete(result={"batch_id": batch_id, "saved": saved_count}) - _queue_organization_snapshot_refresh(source, batch_id) logger.info( "Inspections parsing completed (batch_id=%d, saved=%d)", @@ -2124,7 +2119,6 @@ def sync_inspections( # noqa: C901 "results": results, } ) - _queue_organization_snapshot_refresh(source, batch_id) logger.info("Inspections sync completed (total_saved=%d)", total_saved) @@ -2257,7 +2251,6 @@ def parse_procurements( # Завершаем BackgroundJob job.complete(result={"batch_id": batch_id, "saved": saved_count}) - _queue_organization_snapshot_refresh(source, batch_id) logger.info( "Procurements parsing completed (batch_id=%d, saved=%d)", @@ -2474,7 +2467,6 @@ 
def sync_procurements( # noqa: C901 "results": results, } ) - _queue_organization_snapshot_refresh(source, batch_id) logger.info("Procurements sync completed (total_saved=%d)", total_saved) diff --git a/src/apps/parsers/views.py b/src/apps/parsers/views.py index d9c8c3f..c0a56dd 100644 --- a/src/apps/parsers/views.py +++ b/src/apps/parsers/views.py @@ -78,6 +78,7 @@ from django_celery_beat.models import CrontabSchedule, IntervalSchedule, Periodi from drf_yasg import openapi from drf_yasg.inspectors import SwaggerAutoSchema from drf_yasg.utils import no_body, swagger_auto_schema +from organizations.models import OrganizationSourceRecord from rest_framework import status from rest_framework.exceptions import ValidationError from rest_framework.parsers import FormParser, JSONParser, MultiPartParser @@ -1757,28 +1758,47 @@ def _matched_registry_organization_ids( return matched_ids +def _source_record_queryset_for_parser_source(source: str): + if source == ParserLoadLog.Source.TRUDVSEM: + return OrganizationSourceRecord.objects.filter(source__in=VACANCY_RECORD_SOURCES) + return OrganizationSourceRecord.objects.filter(source=source) + + +def _matched_source_record_registry_organization_ids( + source: str, + by_inn: dict[str, set[int]], + by_ogrn: dict[str, set[int]], +) -> set[int]: + matched_ids: set[int] = set() + queryset = _source_record_queryset_for_parser_source(source) + for inn, ogrn, ogrip in ( + queryset.order_by() + .values_list( + "extension__organization__inn", + "extension__organization__ogrn", + "extension__organization__ogrip", + ) + .distinct() + ): + inn_text = _normalize_registry_identifier(inn) + ogrn_text = _normalize_registry_identifier(ogrn or ogrip) + if inn_text: + matched_ids.update(by_inn.get(inn_text, ())) + if ogrn_text: + matched_ids.update(by_ogrn.get(ogrn_text, ())) + return matched_ids + + def _source_registry_matches( by_inn: dict[str, set[int]], by_ogrn: dict[str, set[int]], ) -> dict[str, set[int]]: matches: dict[str, set[int]] = {} 
for source, _label in ParserLoadLog.Source.choices: - inn_field, ogrn_field = REGISTRY_COVERAGE_NATIVE_IDENTITY_FIELDS.get( + matches[source] = _matched_source_record_registry_organization_ids( source, - ("inn", "ogrn"), - ) - if source == ParserLoadLog.Source.FNS_REPORTS: - queryset = FinancialReport.objects.all() - elif source in NATIVE_RECORD_MODELS: - queryset = NATIVE_RECORD_MODELS[source].objects.all() - else: - queryset = GenericParserRecord.objects.filter(source=source) - matches[source] = _matched_registry_organization_ids( - queryset, by_inn, by_ogrn, - inn_field=inn_field, - ogrn_field=ogrn_field, ) return matches @@ -1861,32 +1881,9 @@ def _registry_data_coverage() -> dict: for source, _label in ParserLoadLog.Source.choices: if source in REGISTRY_COVERAGE_EXCLUDED_SOURCES: continue - inn_field, ogrn_field = REGISTRY_COVERAGE_NATIVE_IDENTITY_FIELDS.get( - source, - ("inn", "ogrn"), + organizations_count = len( + _matched_source_record_registry_organization_ids(source, by_inn, by_ogrn) ) - if source == ParserLoadLog.Source.FNS_REPORTS: - organizations_count = _matched_registry_organization_count( - FinancialReport.objects.all(), - by_inn, - by_ogrn, - inn_field=inn_field, - ogrn_field=ogrn_field, - ) - elif source in NATIVE_RECORD_MODELS: - organizations_count = _matched_registry_organization_count( - NATIVE_RECORD_MODELS[source].objects.all(), - by_inn, - by_ogrn, - inn_field=inn_field, - ogrn_field=ogrn_field, - ) - else: - organizations_count = _matched_registry_organization_count( - GenericParserRecord.objects.filter(source=source), - by_inn, - by_ogrn, - ) coverage_percent = ( round(organizations_count / total_organizations * 100, 1) if total_organizations @@ -2781,20 +2778,11 @@ class ParserDashboardDataView(APIView): file_sources = [source for source in sources if source["supports_file_upload"]] jobs = BackgroundJobService.get_user_jobs(user_id=request.user.id, limit=30) source_counts = dict( - GenericParserRecord.objects.order_by() + 
OrganizationSourceRecord.objects.order_by() .values("source") - .annotate(count=Count("id")) + .annotate(count=Count("uid")) .values_list("source", "count") ) - source_counts.update( - { - source: model.objects.count() - for source, model in NATIVE_RECORD_MODELS.items() - } - ) - source_counts.update( - {ParserLoadLog.Source.FNS_REPORTS: FinancialReport.objects.count()} - ) schedules = [ _periodic_task_to_dict(task) for task in _parser_periodic_tasks_for_user(request.user) diff --git a/src/organizations/filters.py b/src/organizations/filters.py index 2758221..d2094c7 100644 --- a/src/organizations/filters.py +++ b/src/organizations/filters.py @@ -1,46 +1,76 @@ -"""Filters for organizations API v2.""" +"""Filters for organization-centric API v2.""" -from apps.parsers.models import FinancialReport, ParserLoadLog from django.db.models import CharField, Q from django.db.models.functions import Cast from django_filters import rest_framework as filters from registers.models import RegistryMembershipPeriod -from organizations.api_enrichment import ( - DATA_PRESENCE_KEYS, - data_presence_identity_values, - to_internal_data_source, -) -from organizations.models import Organization +from organizations.models import Organization, OrganizationSourceExtension, SourceGroup + +SOURCE_FILTER_ALIASES = { + "financial_indicators": SourceGroup.FINANCIAL_INDICATORS, + "fns_reports": SourceGroup.FINANCIAL_INDICATORS, + "government_procurements": SourceGroup.GOVERNMENT_PROCUREMENTS, + "procurements": SourceGroup.GOVERNMENT_PROCUREMENTS, + "procurements_44fz": SourceGroup.GOVERNMENT_PROCUREMENTS, + "procurements_223fz": SourceGroup.GOVERNMENT_PROCUREMENTS, + "contracts": SourceGroup.GOVERNMENT_PROCUREMENTS, + "industrial_production": SourceGroup.INDUSTRIAL_PRODUCTION, + "industrial": SourceGroup.INDUSTRIAL_PRODUCTION, + "industrial_products": SourceGroup.INDUSTRIAL_PRODUCTION, + "manufactures": SourceGroup.INDUSTRIAL_PRODUCTION, + "planned_inspections": SourceGroup.PLANNED_INSPECTIONS, + 
"inspections": SourceGroup.PLANNED_INSPECTIONS, + "bankruptcy": SourceGroup.BANKRUPTCY, + "fedresurs_bankruptcy": SourceGroup.BANKRUPTCY, + "defense_suppliers": SourceGroup.DEFENSE_SUPPLIERS, + "unfair_suppliers": SourceGroup.DEFENSE_SUPPLIERS, + "fas_goz": SourceGroup.DEFENSE_SUPPLIERS, + "arbitration": SourceGroup.ARBITRATION, + "security_registries": SourceGroup.SECURITY_REGISTRIES, + "fstec": SourceGroup.SECURITY_REGISTRIES, + "vacancies": SourceGroup.VACANCIES, + "trudvsem": SourceGroup.VACANCIES, +} class OrganizationFilter(filters.FilterSet): - """Exact identifier filters plus partial name matching.""" + """Identifier, registry, and source-extension filters.""" name = filters.CharFilter(field_name="name", lookup_expr="icontains") inn = filters.CharFilter(field_name="inn", lookup_expr="exact") kpp = filters.CharFilter(field_name="kpp", lookup_expr="exact") ogrn = filters.CharFilter(field_name="ogrn", lookup_expr="exact") ogrip = filters.CharFilter(field_name="ogrip", lookup_expr="exact") + identity_status = filters.CharFilter(field_name="identity_status", lookup_expr="exact") registry = filters.UUIDFilter(method="filter_registry") registry_name = filters.CharFilter(method="filter_registry_name") has_registry = filters.BooleanFilter(method="filter_has_registry") - has_industrial = filters.BooleanFilter(method="filter_data_presence") - has_industrial_products = filters.BooleanFilter(method="filter_data_presence") - has_manufactures = filters.BooleanFilter(method="filter_data_presence") - has_inspections = filters.BooleanFilter(method="filter_data_presence") - has_procurements = filters.BooleanFilter(method="filter_data_presence") - has_procurements_44fz = filters.BooleanFilter(method="filter_data_presence") - has_procurements_223fz = filters.BooleanFilter(method="filter_data_presence") - has_contracts = filters.BooleanFilter(method="filter_data_presence") - has_unfair_suppliers = filters.BooleanFilter(method="filter_data_presence") - has_fas_goz = 
filters.BooleanFilter(method="filter_data_presence") - has_arbitration = filters.BooleanFilter(method="filter_data_presence") - has_fedresurs_bankruptcy = filters.BooleanFilter(method="filter_data_presence") - has_fstec = filters.BooleanFilter(method="filter_data_presence") - has_trudvsem = filters.BooleanFilter(method="filter_data_presence") - has_vacancies = filters.BooleanFilter(method="filter_data_presence") - has_fns_reports = filters.BooleanFilter(method="filter_data_presence") + source_group = filters.CharFilter(method="filter_source_group") + + has_financial_indicators = filters.BooleanFilter(method="filter_source_presence") + has_fns_reports = filters.BooleanFilter(method="filter_source_presence") + has_government_procurements = filters.BooleanFilter(method="filter_source_presence") + has_procurements = filters.BooleanFilter(method="filter_source_presence") + has_procurements_44fz = filters.BooleanFilter(method="filter_source_presence") + has_procurements_223fz = filters.BooleanFilter(method="filter_source_presence") + has_contracts = filters.BooleanFilter(method="filter_source_presence") + has_industrial_production = filters.BooleanFilter(method="filter_source_presence") + has_industrial = filters.BooleanFilter(method="filter_source_presence") + has_industrial_products = filters.BooleanFilter(method="filter_source_presence") + has_manufactures = filters.BooleanFilter(method="filter_source_presence") + has_planned_inspections = filters.BooleanFilter(method="filter_source_presence") + has_inspections = filters.BooleanFilter(method="filter_source_presence") + has_bankruptcy = filters.BooleanFilter(method="filter_source_presence") + has_fedresurs_bankruptcy = filters.BooleanFilter(method="filter_source_presence") + has_defense_suppliers = filters.BooleanFilter(method="filter_source_presence") + has_unfair_suppliers = filters.BooleanFilter(method="filter_source_presence") + has_fas_goz = filters.BooleanFilter(method="filter_source_presence") + has_arbitration 
= filters.BooleanFilter(method="filter_source_presence") + has_security_registries = filters.BooleanFilter(method="filter_source_presence") + has_fstec = filters.BooleanFilter(method="filter_source_presence") + has_vacancies = filters.BooleanFilter(method="filter_source_presence") + has_trudvsem = filters.BooleanFilter(method="filter_source_presence") class Meta: model = Organization @@ -50,25 +80,11 @@ class OrganizationFilter(filters.FilterSet): "kpp", "ogrn", "ogrip", + "identity_status", "registry", "registry_name", "has_registry", - "has_industrial", - "has_industrial_products", - "has_manufactures", - "has_inspections", - "has_procurements", - "has_procurements_44fz", - "has_procurements_223fz", - "has_contracts", - "has_unfair_suppliers", - "has_fas_goz", - "has_arbitration", - "has_fedresurs_bankruptcy", - "has_fstec", - "has_trudvsem", - "has_vacancies", - "has_fns_reports", + "source_group", ] def filter_registry(self, queryset, _name, value): @@ -80,46 +96,25 @@ class OrganizationFilter(filters.FilterSet): def filter_has_registry(self, queryset, _name, value): return self._filter_by_registry_membership(queryset, has_registry=value) - def filter_data_presence(self, queryset, name, value): - source = to_internal_data_source(name.removeprefix("has_")) - if source not in DATA_PRESENCE_KEYS: + def filter_source_group(self, queryset, _name, value): + source_group = SOURCE_FILTER_ALIASES.get(str(value), str(value)) + return self._filter_by_source_group(queryset, source_group, True) + + def filter_source_presence(self, queryset, name, value): + source_key = name.removeprefix("has_") + source_group = SOURCE_FILTER_ALIASES.get(source_key) + if source_group is None: return queryset.none() - - if source == ParserLoadLog.Source.FNS_REPORTS: - return self._filter_by_fns_report_presence(queryset, value) - - inn_values, ogrn_values = data_presence_identity_values(source) - filtered = self._filter_by_registry_identities( - queryset, inn_values, ogrn_values - ) - if 
value: - return filtered - return queryset.exclude(uid__in=filtered.values("uid")) + return self._filter_by_source_group(queryset, source_group, value) @staticmethod - def _filter_by_registry_identities( - queryset, inn_values: set[str], ogrn_values: set[str] - ): - if not inn_values and not ogrn_values: - return queryset.none() - - query = Q() - if inn_values: - query |= Q(inn__in=inn_values) - if ogrn_values: - query |= Q(ogrn__in=ogrn_values) | Q(ogrip__in=ogrn_values) - return queryset.filter(query) - - @staticmethod - def _filter_by_fns_report_presence(queryset, value): - report_ogrns = FinancialReport.objects.order_by().values_list( - "ogrn", - flat=True, - ) - query = Q(ogrn__in=report_ogrns) | Q(ogrip__in=report_ogrns) + def _filter_by_source_group(queryset, source_group: str, value: bool): + organization_ids = OrganizationSourceExtension.objects.filter( + source_group=source_group, + ).values("organization_id") if value: - return queryset.filter(query) - return queryset.exclude(query) + return queryset.filter(uid__in=organization_ids) + return queryset.exclude(uid__in=organization_ids) @classmethod def _filter_by_registry_membership( @@ -169,10 +164,12 @@ class OrganizationFilter(filters.FilterSet): membership = membership.annotate( organization_inn_text=Cast( - "organization__mn_inn", output_field=CharField() + "organization__mn_inn", + output_field=CharField(), ), organization_ogrn_text=Cast( - "organization__mn_ogrn", output_field=CharField() + "organization__mn_ogrn", + output_field=CharField(), ), ) diff --git a/src/organizations/management/commands/backfill_organization_sources.py b/src/organizations/management/commands/backfill_organization_sources.py new file mode 100644 index 0000000..1f4faea --- /dev/null +++ b/src/organizations/management/commands/backfill_organization_sources.py @@ -0,0 +1,55 @@ +"""Backfill polymorphic organization source extensions.""" + +from __future__ import annotations + +import json + +from 
apps.core.management.commands.base import BaseAppCommand + +from organizations.source_backfill import OrganizationSourceBackfillService + + +class Command(BaseAppCommand): + """Backfill organization source extensions from legacy parser tables.""" + + help = "Переносит legacy parser records в polymorphic source extensions" + use_transaction = False + + def add_arguments(self, parser) -> None: + super().add_arguments(parser) + parser.add_argument( + "--source", + dest="source", + default=None, + help="Источник ParserLoadLog.Source. Если не задан, обрабатываются все.", + ) + parser.add_argument( + "--batch-id", + dest="batch_id", + type=int, + default=None, + help="Ограничить перенос одним load_batch.", + ) + + def execute_command(self, *args, **options) -> str: + result = OrganizationSourceBackfillService.backfill( + source=options.get("source"), + batch_id=options.get("batch_id"), + ) + rendered = json.dumps( + { + "scanned": result.scanned, + "created_organizations": result.created_organizations, + "created_extensions": result.created_extensions, + "updated_extensions": result.updated_extensions, + "created_records": result.created_records, + "updated_records": result.updated_records, + "created_financial_lines": result.created_financial_lines, + "updated_financial_lines": result.updated_financial_lines, + "unresolved": result.unresolved, + }, + ensure_ascii=False, + sort_keys=True, + ) + self.log_success(rendered) + return rendered diff --git a/src/organizations/migrations/0006_polymorphic_source_extensions.py b/src/organizations/migrations/0006_polymorphic_source_extensions.py new file mode 100644 index 0000000..34941ee --- /dev/null +++ b/src/organizations/migrations/0006_polymorphic_source_extensions.py @@ -0,0 +1,303 @@ +# Generated by Django 3.2.25 on 2026-05-18 17:49 + +from django.db import migrations, models +import django.db.models.deletion +import uuid + + +def populate_existing_organization_identity(apps, schema_editor): + organization_model = 
apps.get_model('organizations', 'Organization') + batch = [] + for organization in organization_model.objects.all().iterator(chunk_size=1000): + if organization.inn and (organization.ogrn or organization.ogrip): + organization.identity_status = 'complete' + elif organization.inn or organization.ogrn or organization.ogrip: + organization.identity_status = 'partial' + else: + organization.identity_status = 'missing' + + if organization.inn and organization.kpp: + organization.primary_identity = f'inn:{organization.inn}:kpp:{organization.kpp}' + elif organization.ogrn: + organization.primary_identity = f'ogrn:{organization.ogrn}' + elif organization.ogrip: + organization.primary_identity = f'ogrip:{organization.ogrip}' + elif organization.inn: + organization.primary_identity = f'inn:{organization.inn}' + else: + organization.primary_identity = f'name:{organization.name.strip().lower()}'[:255] + + batch.append(organization) + if len(batch) >= 1000: + organization_model.objects.bulk_update( + batch, + fields=['identity_status', 'primary_identity'], + batch_size=1000, + ) + batch = [] + + if batch: + organization_model.objects.bulk_update( + batch, + fields=['identity_status', 'primary_identity'], + batch_size=1000, + ) + + +def clear_existing_organization_identity(apps, schema_editor): + organization_model = apps.get_model('organizations', 'Organization') + organization_model.objects.update(identity_status='missing', primary_identity='') + + +class Migration(migrations.Migration): + + dependencies = [ + ('contenttypes', '0002_remove_content_type_name'), + ('organizations', '0005_snapshot_data_source_counts'), + ] + + operations = [ + migrations.CreateModel( + name='OrganizationSourceExtension', + fields=[ + ('uid', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False, verbose_name='UID')), + ('source_group', models.CharField(choices=[('financial_indicators', 'Финансово-экономические показатели'), ('government_procurements', 
'Государственные закупки'), ('industrial_production', 'Производители и продукция России'), ('planned_inspections', 'Плановые проверки'), ('bankruptcy', 'Сведения о процедурах банкротства'), ('defense_suppliers', 'Недобросовестные поставщики ГОЗ'), ('arbitration', 'Арбитражные дела'), ('security_registries', 'Реестры по информационной безопасности'), ('vacancies', 'Вакансии')], db_index=True, max_length=64, verbose_name='группа источников')), + ('title', models.CharField(help_text='Человекочитаемое название блока источника', max_length=255, verbose_name='название')), + ('status', models.CharField(choices=[('active', 'Активно'), ('inactive', 'Неактивно'), ('error', 'Ошибка')], db_index=True, default='active', max_length=16, verbose_name='статус')), + ('records_count', models.PositiveIntegerField(default=0, verbose_name='количество записей')), + ('first_seen_at', models.DateTimeField(blank=True, db_index=True, null=True, verbose_name='первая запись')), + ('last_seen_at', models.DateTimeField(blank=True, db_index=True, null=True, verbose_name='последняя запись')), + ('last_load_batch', models.PositiveIntegerField(blank=True, db_index=True, null=True, verbose_name='последний пакет загрузки')), + ('metadata', models.JSONField(blank=True, default=dict, verbose_name='метаданные')), + ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='дата создания')), + ('updated_at', models.DateTimeField(auto_now=True, verbose_name='дата обновления')), + ], + options={ + 'verbose_name': 'расширение источника организации', + 'verbose_name_plural': 'расширения источников организаций', + 'db_table': 'organizations_source_extension', + 'ordering': ['organization__name', 'source_group'], + }, + ), + migrations.AddField( + model_name='organization', + name='identity_status', + field=models.CharField(choices=[('complete', 'Полная'), ('partial', 'Частичная'), ('missing', 'Отсутствует')], db_index=True, default='missing', help_text='Оценка полноты идентификационных реквизитов', 
max_length=16, verbose_name='полнота реквизитов'), + ), + migrations.AddField( + model_name='organization', + name='primary_identity', + field=models.CharField(blank=True, db_index=True, editable=False, help_text='Нормализованный ключ для диагностики и дедупликации', max_length=255, verbose_name='основной идентификатор'), + ), + migrations.RunPython( + populate_existing_organization_identity, + clear_existing_organization_identity, + ), + migrations.CreateModel( + name='ArbitrationExtension', + fields=[ + ('organizationsourceextension_ptr', models.OneToOneField(auto_created=True, on_delete=django.db.models.deletion.CASCADE, parent_link=True, primary_key=True, serialize=False, to='organizations.organizationsourceextension')), + ], + options={ + 'verbose_name': 'арбитражные дела', + 'verbose_name_plural': 'арбитражные дела', + 'db_table': 'organizations_arbitration_extension', + }, + bases=('organizations.organizationsourceextension',), + ), + migrations.CreateModel( + name='BankruptcyExtension', + fields=[ + ('organizationsourceextension_ptr', models.OneToOneField(auto_created=True, on_delete=django.db.models.deletion.CASCADE, parent_link=True, primary_key=True, serialize=False, to='organizations.organizationsourceextension')), + ], + options={ + 'verbose_name': 'сведения о процедурах банкротства', + 'verbose_name_plural': 'сведения о процедурах банкротства', + 'db_table': 'organizations_bankruptcy_extension', + }, + bases=('organizations.organizationsourceextension',), + ), + migrations.CreateModel( + name='DefenseSupplierExtension', + fields=[ + ('organizationsourceextension_ptr', models.OneToOneField(auto_created=True, on_delete=django.db.models.deletion.CASCADE, parent_link=True, primary_key=True, serialize=False, to='organizations.organizationsourceextension')), + ], + options={ + 'verbose_name': 'недобросовестные поставщики ГОЗ', + 'verbose_name_plural': 'недобросовестные поставщики ГОЗ', + 'db_table': 'organizations_defense_supplier_extension', + }, + 
bases=('organizations.organizationsourceextension',), + ), + migrations.CreateModel( + name='FinancialIndicatorsExtension', + fields=[ + ('organizationsourceextension_ptr', models.OneToOneField(auto_created=True, on_delete=django.db.models.deletion.CASCADE, parent_link=True, primary_key=True, serialize=False, to='organizations.organizationsourceextension')), + ], + options={ + 'verbose_name': 'финансово-экономические показатели', + 'verbose_name_plural': 'финансово-экономические показатели', + 'db_table': 'organizations_financial_indicators_extension', + }, + bases=('organizations.organizationsourceextension',), + ), + migrations.CreateModel( + name='GovernmentProcurementExtension', + fields=[ + ('organizationsourceextension_ptr', models.OneToOneField(auto_created=True, on_delete=django.db.models.deletion.CASCADE, parent_link=True, primary_key=True, serialize=False, to='organizations.organizationsourceextension')), + ], + options={ + 'verbose_name': 'государственные закупки', + 'verbose_name_plural': 'государственные закупки', + 'db_table': 'organizations_government_procurement_extension', + }, + bases=('organizations.organizationsourceextension',), + ), + migrations.CreateModel( + name='IndustrialProductionExtension', + fields=[ + ('organizationsourceextension_ptr', models.OneToOneField(auto_created=True, on_delete=django.db.models.deletion.CASCADE, parent_link=True, primary_key=True, serialize=False, to='organizations.organizationsourceextension')), + ], + options={ + 'verbose_name': 'производители и продукция России', + 'verbose_name_plural': 'производители и продукция России', + 'db_table': 'organizations_industrial_production_extension', + }, + bases=('organizations.organizationsourceextension',), + ), + migrations.CreateModel( + name='PlannedInspectionExtension', + fields=[ + ('organizationsourceextension_ptr', models.OneToOneField(auto_created=True, on_delete=django.db.models.deletion.CASCADE, parent_link=True, primary_key=True, serialize=False, 
to='organizations.organizationsourceextension')), + ], + options={ + 'verbose_name': 'плановые проверки', + 'verbose_name_plural': 'плановые проверки', + 'db_table': 'organizations_planned_inspection_extension', + }, + bases=('organizations.organizationsourceextension',), + ), + migrations.CreateModel( + name='SecurityRegistryExtension', + fields=[ + ('organizationsourceextension_ptr', models.OneToOneField(auto_created=True, on_delete=django.db.models.deletion.CASCADE, parent_link=True, primary_key=True, serialize=False, to='organizations.organizationsourceextension')), + ], + options={ + 'verbose_name': 'реестры по информационной безопасности', + 'verbose_name_plural': 'реестры по информационной безопасности', + 'db_table': 'organizations_security_registry_extension', + }, + bases=('organizations.organizationsourceextension',), + ), + migrations.CreateModel( + name='VacancyExtension', + fields=[ + ('organizationsourceextension_ptr', models.OneToOneField(auto_created=True, on_delete=django.db.models.deletion.CASCADE, parent_link=True, primary_key=True, serialize=False, to='organizations.organizationsourceextension')), + ], + options={ + 'verbose_name': 'вакансии', + 'verbose_name_plural': 'вакансии', + 'db_table': 'organizations_vacancy_extension', + }, + bases=('organizations.organizationsourceextension',), + ), + migrations.CreateModel( + name='OrganizationSourceRecord', + fields=[ + ('uid', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False, verbose_name='UID')), + ('record_type', models.CharField(db_index=True, max_length=64, verbose_name='тип записи')), + ('source', models.CharField(db_index=True, max_length=64, verbose_name='источник')), + ('external_id', models.CharField(blank=True, db_index=True, max_length=255, verbose_name='внешний ID')), + ('title', models.TextField(blank=True, verbose_name='заголовок')), + ('record_date', models.CharField(blank=True, db_index=True, max_length=255, verbose_name='дата записи')), + 
('amount', models.DecimalField(blank=True, decimal_places=2, max_digits=20, null=True, verbose_name='сумма')), + ('status', models.CharField(blank=True, db_index=True, max_length=255, verbose_name='статус')), + ('url', models.TextField(blank=True, verbose_name='URL')), + ('payload', models.JSONField(blank=True, default=dict, verbose_name='исходные данные')), + ('legacy_model', models.CharField(blank=True, db_index=True, max_length=255, verbose_name='legacy model')), + ('legacy_pk', models.CharField(blank=True, db_index=True, max_length=64, verbose_name='legacy pk')), + ('load_batch', models.PositiveIntegerField(blank=True, db_index=True, null=True, verbose_name='ID пакета загрузки')), + ('created_at', models.DateTimeField(auto_now_add=True, verbose_name='дата создания')), + ('updated_at', models.DateTimeField(auto_now=True, verbose_name='дата обновления')), + ('extension', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='records', to='organizations.organizationsourceextension', verbose_name='расширение источника')), + ], + options={ + 'verbose_name': 'запись источника организации', + 'verbose_name_plural': 'записи источников организаций', + 'db_table': 'organizations_source_record', + 'ordering': ['-created_at'], + }, + ), + migrations.CreateModel( + name='OrganizationSourceFinancialLine', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('form_code', models.CharField(db_index=True, max_length=10, verbose_name='код формы')), + ('line_code', models.CharField(db_index=True, max_length=10, verbose_name='код строки')), + ('line_name', models.CharField(max_length=255, verbose_name='наименование строки')), + ('year', models.PositiveSmallIntegerField(db_index=True, verbose_name='год')), + ('period_start', models.BigIntegerField(blank=True, null=True, verbose_name='на начало периода')), + ('period_end', models.BigIntegerField(blank=True, null=True, verbose_name='на конец 
периода')), + ('source_record', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='financial_lines', to='organizations.organizationsourcerecord', verbose_name='запись источника')), + ], + options={ + 'verbose_name': 'строка финансового источника', + 'verbose_name_plural': 'строки финансовых источников', + 'db_table': 'organizations_source_financial_line', + }, + ), + migrations.AddField( + model_name='organizationsourceextension', + name='organization', + field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='source_extensions', to='organizations.organization', verbose_name='организация'), + ), + migrations.AddField( + model_name='organizationsourceextension', + name='polymorphic_ctype', + field=models.ForeignKey(editable=False, null=True, on_delete=django.db.models.deletion.CASCADE, related_name='polymorphic_organizations.organizationsourceextension_set+', to='contenttypes.contenttype'), + ), + migrations.AddIndex( + model_name='organizationsourcerecord', + index=models.Index(fields=['extension', 'source'], name='organizatio_extensi_4a1a50_idx'), + ), + migrations.AddIndex( + model_name='organizationsourcerecord', + index=models.Index(fields=['source', 'record_type'], name='organizatio_source_5836ed_idx'), + ), + migrations.AddIndex( + model_name='organizationsourcerecord', + index=models.Index(fields=['load_batch', 'source'], name='organizatio_load_ba_5b6485_idx'), + ), + migrations.AddConstraint( + model_name='organizationsourcerecord', + constraint=models.UniqueConstraint(condition=models.Q(('external_id', ''), _negated=True), fields=('source', 'external_id'), name='unique_source_record_external_id'), + ), + migrations.AddConstraint( + model_name='organizationsourcerecord', + constraint=models.UniqueConstraint(condition=models.Q(models.Q(('legacy_model', ''), _negated=True), models.Q(('legacy_pk', ''), _negated=True)), fields=('legacy_model', 'legacy_pk'), name='unique_source_record_legacy_identity'), + ), + 
migrations.AddIndex( + model_name='organizationsourcefinancialline', + index=models.Index(fields=['source_record', 'form_code', 'line_code'], name='organizatio_source__73f8c0_idx'), + ), + migrations.AddIndex( + model_name='organizationsourcefinancialline', + index=models.Index(fields=['year', 'line_code'], name='organizatio_year_391547_idx'), + ), + migrations.AddConstraint( + model_name='organizationsourcefinancialline', + constraint=models.UniqueConstraint(fields=('source_record', 'form_code', 'line_code', 'year'), name='unique_source_financial_line_year'), + ), + migrations.AddIndex( + model_name='organizationsourceextension', + index=models.Index(fields=['source_group', 'status'], name='organizatio_source__bcbc18_idx'), + ), + migrations.AddIndex( + model_name='organizationsourceextension', + index=models.Index(fields=['organization', 'source_group'], name='organizatio_organiz_ce3750_idx'), + ), + migrations.AddConstraint( + model_name='organizationsourceextension', + constraint=models.UniqueConstraint(fields=('organization', 'source_group'), name='unique_organization_source_group_extension'), + ), + ] diff --git a/src/organizations/models.py b/src/organizations/models.py index d5ca29c..5555aba 100644 --- a/src/organizations/models.py +++ b/src/organizations/models.py @@ -5,14 +5,44 @@ import uuid from django.db import models from django.db.models import Q from django.utils.translation import gettext_lazy as _ +from polymorphic.models import PolymorphicModel from organizations.data_sources import snapshot_data_source_summary from organizations.name_normalization import normalize_organization_name +class SourceGroup(models.TextChoices): + """Product-level organization source groups.""" + + FINANCIAL_INDICATORS = "financial_indicators", _("Финансово-экономические показатели") + GOVERNMENT_PROCUREMENTS = "government_procurements", _("Государственные закупки") + INDUSTRIAL_PRODUCTION = "industrial_production", _("Производители и продукция России") + 
def save(self, *args, **kwargs) -> None:
    """Recompute the derived identity columns, then persist the row.

    ``identity_status`` and ``primary_identity`` are refreshed from the
    current requisites on every call. When the caller restricts the write
    with a non-empty ``update_fields``, both derived columns are appended
    (order kept, duplicates removed) so the refreshed values are stored too.
    An empty ``update_fields`` is passed through untouched, because Django
    treats it as "skip the save" and that must stay a no-op.
    """
    self.identity_status = self._resolve_identity_status()
    self.primary_identity = self._resolve_primary_identity()
    update_fields = kwargs.get("update_fields")
    if update_fields is not None:
        # Materialize once: the caller may pass any iterable, even a generator.
        requested = list(update_fields)
        if requested:
            requested = list(
                dict.fromkeys([*requested, "identity_status", "primary_identity"])
            )
        kwargs["update_fields"] = requested
    super().save(*args, **kwargs)
kwargs["update_fields"] = list( + dict.fromkeys([*update_fields, "identity_status", "primary_identity"]) + ) + super().save(*args, **kwargs) + + def _resolve_identity_status(self) -> str: + if self.inn and (self.ogrn or self.ogrip): + return self.IdentityStatus.COMPLETE + if self.inn or self.ogrn or self.ogrip: + return self.IdentityStatus.PARTIAL + return self.IdentityStatus.MISSING + + def _resolve_primary_identity(self) -> str: + if self.inn and self.kpp: + return f"inn:{self.inn}:kpp:{self.kpp}" + if self.ogrn: + return f"ogrn:{self.ogrn}" + if self.ogrip: + return f"ogrip:{self.ogrip}" + if self.inn: + return f"inn:{self.inn}" + normalized_name = normalize_organization_name(self.name) + if normalized_name: + return f"name:{normalized_name}"[:255] + return "" + + +class OrganizationSourceExtension(PolymorphicModel): + """Base source group extension for one canonical organization.""" + + uid = models.UUIDField( + _("UID"), + primary_key=True, + default=uuid.uuid4, + editable=False, + ) + organization = models.ForeignKey( + Organization, + on_delete=models.CASCADE, + related_name="source_extensions", + verbose_name=_("организация"), + ) + source_group = models.CharField( + _("группа источников"), + max_length=64, + choices=SourceGroup.choices, + db_index=True, + ) + title = models.CharField( + _("название"), + max_length=255, + help_text=_("Человекочитаемое название блока источника"), + ) + status = models.CharField( + _("статус"), + max_length=16, + choices=SourceExtensionStatus.choices, + default=SourceExtensionStatus.ACTIVE, + db_index=True, + ) + records_count = models.PositiveIntegerField( + _("количество записей"), + default=0, + ) + first_seen_at = models.DateTimeField( + _("первая запись"), + null=True, + blank=True, + db_index=True, + ) + last_seen_at = models.DateTimeField( + _("последняя запись"), + null=True, + blank=True, + db_index=True, + ) + last_load_batch = models.PositiveIntegerField( + _("последний пакет загрузки"), + null=True, + blank=True, 
def save(self, *args, **kwargs) -> None:
    """Pin ``source_group`` to the class-level group before saving.

    Concrete subclasses declare ``source_group_value``. When that attribute
    exists and is non-empty it overwrites ``source_group`` on the instance;
    a class without it keeps whatever the caller assigned.
    """
    try:
        pinned_group = self.source_group_value
    except AttributeError:
        # No class-level group declared: leave the field untouched.
        pinned_group = ""
    if pinned_group:
        self.source_group = pinned_group
    super().save(*args, **kwargs)
"organizations_industrial_production_extension" + verbose_name = _("производители и продукция России") + verbose_name_plural = _("производители и продукция России") + + +class PlannedInspectionExtension(OrganizationSourceExtension): + """Planned inspections source group.""" + + source_group_value = SourceGroup.PLANNED_INSPECTIONS + + class Meta: + db_table = "organizations_planned_inspection_extension" + verbose_name = _("плановые проверки") + verbose_name_plural = _("плановые проверки") + + +class BankruptcyExtension(OrganizationSourceExtension): + """Bankruptcy procedures source group.""" + + source_group_value = SourceGroup.BANKRUPTCY + + class Meta: + db_table = "organizations_bankruptcy_extension" + verbose_name = _("сведения о процедурах банкротства") + verbose_name_plural = _("сведения о процедурах банкротства") + + +class DefenseSupplierExtension(OrganizationSourceExtension): + """Defense supplier risk source group.""" + + source_group_value = SourceGroup.DEFENSE_SUPPLIERS + + class Meta: + db_table = "organizations_defense_supplier_extension" + verbose_name = _("недобросовестные поставщики ГОЗ") + verbose_name_plural = _("недобросовестные поставщики ГОЗ") + + +class ArbitrationExtension(OrganizationSourceExtension): + """Arbitration cases source group.""" + + source_group_value = SourceGroup.ARBITRATION + + class Meta: + db_table = "organizations_arbitration_extension" + verbose_name = _("арбитражные дела") + verbose_name_plural = _("арбитражные дела") + + +class SecurityRegistryExtension(OrganizationSourceExtension): + """Information security registries source group.""" + + source_group_value = SourceGroup.SECURITY_REGISTRIES + + class Meta: + db_table = "organizations_security_registry_extension" + verbose_name = _("реестры по информационной безопасности") + verbose_name_plural = _("реестры по информационной безопасности") + + +class VacancyExtension(OrganizationSourceExtension): + """Vacancies source group.""" + + source_group_value = 
class OrganizationSourceRecord(models.Model):
    """One source record stored under an organization source extension.

    Each row belongs to exactly one ``OrganizationSourceExtension`` and is
    deleted together with it (``on_delete=CASCADE``). Two partial unique
    constraints guard against duplicates: ``(source, external_id)`` when
    ``external_id`` is non-empty, and ``(legacy_model, legacy_pk)`` when both
    are non-empty. Default ordering is newest ``created_at`` first.
    """

    uid = models.UUIDField(
        _("UID"),
        primary_key=True,
        default=uuid.uuid4,
        editable=False,
    )
    # Owning extension; reverse accessor is ``extension.records``.
    extension = models.ForeignKey(
        OrganizationSourceExtension,
        on_delete=models.CASCADE,
        related_name="records",
        verbose_name=_("расширение источника"),
    )
    record_type = models.CharField(
        _("тип записи"),
        max_length=64,
        db_index=True,
    )
    source = models.CharField(
        _("источник"),
        max_length=64,
        db_index=True,
    )
    # May be blank; blank values are excluded from the uniqueness constraint.
    external_id = models.CharField(
        _("внешний ID"),
        max_length=255,
        blank=True,
        db_index=True,
    )
    title = models.TextField(
        _("заголовок"),
        blank=True,
    )
    # Stored as free text (CharField), not as a date/datetime column.
    record_date = models.CharField(
        _("дата записи"),
        max_length=255,
        blank=True,
        db_index=True,
    )
    amount = models.DecimalField(
        _("сумма"),
        max_digits=20,
        decimal_places=2,
        null=True,
        blank=True,
    )
    status = models.CharField(
        _("статус"),
        max_length=255,
        blank=True,
        db_index=True,
    )
    url = models.TextField(
        _("URL"),
        blank=True,
    )
    # JSON payload; defaults to an empty dict.
    payload = models.JSONField(
        _("исходные данные"),
        default=dict,
        blank=True,
    )
    # NOTE(review): presumably the legacy parser model label and primary key
    # the row was backfilled from - confirm against source_backfill.py.
    legacy_model = models.CharField(
        _("legacy model"),
        max_length=255,
        blank=True,
        db_index=True,
    )
    legacy_pk = models.CharField(
        _("legacy pk"),
        max_length=64,
        blank=True,
        db_index=True,
    )
    load_batch = models.PositiveIntegerField(
        _("ID пакета загрузки"),
        null=True,
        blank=True,
        db_index=True,
    )
    created_at = models.DateTimeField(_("дата создания"), auto_now_add=True)
    updated_at = models.DateTimeField(_("дата обновления"), auto_now=True)

    class Meta:
        db_table = "organizations_source_record"
        verbose_name = _("запись источника организации")
        verbose_name_plural = _("записи источников организаций")
        ordering = ["-created_at"]
        constraints = [
            # Unique per source only when an external id is present.
            models.UniqueConstraint(
                fields=["source", "external_id"],
                condition=~Q(external_id=""),
                name="unique_source_record_external_id",
            ),
            # Unique legacy identity only when both parts are present.
            models.UniqueConstraint(
                fields=["legacy_model", "legacy_pk"],
                condition=~Q(legacy_model="") & ~Q(legacy_pk=""),
                name="unique_source_record_legacy_identity",
            ),
        ]
        indexes = [
            models.Index(fields=["extension", "source"]),
            models.Index(fields=["source", "record_type"]),
            models.Index(fields=["load_batch", "source"]),
        ]

    def __str__(self) -> str:
        # First non-empty of title, external id, then the UID as text.
        return self.title or self.external_id or str(self.uid)
class OrganizationSourceRecordSerializer(serializers.ModelSerializer):
    """Source record stored under one source extension.

    Besides the record's own columns the payload carries the owning
    organization's requisites. When an open registry membership
    (``ended_at`` is NULL) matches the organization by INN, OGRN or OGRIP,
    the registry's name, INN, KPP and OGRN take precedence over the
    canonical values. Matches are resolved in bulk for the records being
    serialized and memoized in the serializer context under
    ``REGISTRY_ORGANIZATION_CACHE_KEY``.
    """

    extension_uid = serializers.UUIDField(source="extension.uid", read_only=True)
    financial_lines = OrganizationSourceFinancialLineSerializer(many=True, read_only=True)
    organization = serializers.SerializerMethodField()
    source_group = serializers.CharField(source="extension.source_group", read_only=True)

    class Meta:
        model = OrganizationSourceRecord
        fields = [
            "uid",
            "extension_uid",
            "source_group",
            "record_type",
            "source",
            "external_id",
            "title",
            "record_date",
            "amount",
            "status",
            "url",
            "payload",
            "legacy_model",
            "legacy_pk",
            "load_batch",
            "created_at",
            "updated_at",
            "financial_lines",
            "organization",
        ]
        read_only_fields = fields

    def get_organization(self, obj) -> dict[str, str]:
        """Return the owning organization's requisites for ``obj``.

        Registry values replace the canonical name/INN/KPP/OGRN only when
        they are non-empty; ``uid`` and ``ogrip`` always come from the
        canonical organization.
        """
        organization = obj.extension.organization
        registry_organization = self._get_registry_organization(organization)

        name = organization.name
        inn = organization.inn
        kpp = organization.kpp
        ogrn = organization.ogrn
        if registry_organization is not None:
            name = registry_organization.pn_name or name
            inn = self._value_to_string(registry_organization.mn_inn) or inn
            kpp = self._value_to_string(registry_organization.in_kpp) or kpp
            ogrn = self._value_to_string(registry_organization.mn_ogrn) or ogrn

        return {
            "uid": str(organization.uid),
            "name": name,
            "inn": inn,
            "kpp": kpp,
            "ogrn": ogrn,
            "ogrip": organization.ogrip,
        }

    def _get_registry_organization(self, organization: Organization):
        """Return the registry organization matched to ``organization`` or ``None``.

        The first call builds the shared cache for every record reachable
        from the root serializer. An organization missing from that cache
        (for example when this serializer is nested and the root instance
        holds no records) is resolved on demand and memoized, so the
        registry overlay is not silently skipped.
        """
        cache = self.context.get(REGISTRY_ORGANIZATION_CACHE_KEY)
        if cache is None:
            cache = self._build_registry_organization_cache()
            self.context[REGISTRY_ORGANIZATION_CACHE_KEY] = cache
        if organization.uid not in cache:
            cache.update(
                self._match_registry_organizations({organization.uid: organization})
            )
        return cache.get(organization.uid)

    def _build_registry_organization_cache(self) -> dict:
        """Resolve registry matches for all records being serialized."""
        organizations_by_uid = {
            record.extension.organization.uid: record.extension.organization
            for record in self._get_source_records_for_cache()
        }
        return self._match_registry_organizations(organizations_by_uid)

    def _match_registry_organizations(self, organizations_by_uid: dict) -> dict:
        """Map each organization uid to its registry organization (or ``None``).

        Issues at most one query against open registry memberships. The
        registry INN/OGRN columns are cast to text before comparison. For one
        organization the INN match wins, then OGRN, then OGRIP (looked up in
        the OGRN column). When several memberships share an identifier, the
        first by ``organization__pn_name`` then ``organization_id`` is kept.
        """
        if not organizations_by_uid:
            return {}

        inn_values = {
            organization.inn
            for organization in organizations_by_uid.values()
            if organization.inn
        }
        ogrn_values = {
            organization.ogrn
            for organization in organizations_by_uid.values()
            if organization.ogrn
        }
        ogrn_values.update(
            organization.ogrip
            for organization in organizations_by_uid.values()
            if organization.ogrip
        )

        identity_filter = Q()
        if inn_values:
            identity_filter |= Q(registry_inn_text__in=inn_values)
        if ogrn_values:
            identity_filter |= Q(registry_ogrn_text__in=ogrn_values)
        if not identity_filter:
            # No identifiers at all: nothing to look up, skip the query.
            return {uid: None for uid in organizations_by_uid}

        memberships = (
            RegistryMembershipPeriod.objects.filter(ended_at__isnull=True)
            .select_related("organization")
            .annotate(
                registry_inn_text=Cast(
                    "organization__mn_inn",
                    output_field=CharField(),
                ),
                registry_ogrn_text=Cast(
                    "organization__mn_ogrn",
                    output_field=CharField(),
                ),
            )
            .filter(identity_filter)
            .order_by("organization__pn_name", "organization_id")
        )

        registry_by_inn = {}
        registry_by_ogrn = {}
        for membership in memberships:
            # setdefault keeps the first membership in the ordering above.
            registry_by_inn.setdefault(
                membership.registry_inn_text,
                membership.organization,
            )
            registry_by_ogrn.setdefault(
                membership.registry_ogrn_text,
                membership.organization,
            )

        cache = {}
        for uid, organization in organizations_by_uid.items():
            registry_organization = None
            if organization.inn:
                registry_organization = registry_by_inn.get(organization.inn)
            if registry_organization is None and organization.ogrn:
                registry_organization = registry_by_ogrn.get(organization.ogrn)
            if registry_organization is None and organization.ogrip:
                registry_organization = registry_by_ogrn.get(organization.ogrip)
            cache[uid] = registry_organization
        return cache

    def _get_source_records_for_cache(self) -> list[OrganizationSourceRecord]:
        """Collect the source records visible from the root serializer.

        Returns an empty list when there is no instance, or when the root
        instance is a single object of another type. Objects that are not
        ``OrganizationSourceRecord`` are filtered out of iterable instances,
        so nesting this serializer under another one cannot crash the
        cache build.
        """
        instance = getattr(self.root, "instance", None)
        if instance is None:
            instance = getattr(self, "instance", None)
        if instance is None:
            return []
        if isinstance(instance, OrganizationSourceRecord):
            return [instance]
        try:
            candidates = list(instance)
        except TypeError:
            # Root serializes one non-record object (nested usage).
            return []
        return [
            candidate
            for candidate in candidates
            if isinstance(candidate, OrganizationSourceRecord)
        ]

    @staticmethod
    def _value_to_string(value) -> str:
        """Return ``str(value)``, mapping ``None`` to an empty string."""
        if value is None:
            return ""
        return str(value)
"updated_at", + ] + read_only_fields = fields class OrganizationSerializer(serializers.ModelSerializer): - """Canonical organization representation.""" + """Canonical organization with compact source summaries.""" - data = serializers.SerializerMethodField() - data_sources = serializers.SerializerMethodField() normalized_name = serializers.CharField(read_only=True) + sources = serializers.SerializerMethodField() registries = serializers.SerializerMethodField() class Meta: model = Organization - ref_name = "CanonicalOrganization" fields = [ "uid", "name", @@ -30,67 +224,30 @@ class OrganizationSerializer(serializers.ModelSerializer): "kpp", "ogrn", "ogrip", - "data", - "data_sources", + "identity_status", + "primary_identity", + "sources", "registries", ] read_only_fields = fields - def get_data(self, obj) -> dict[str, Any]: - snapshot = getattr(obj, "data_snapshot", None) - if snapshot is not None: - data_sources = self.context.get("data_sources") - if data_sources is not None and not data_sources: - return {} - - data = snapshot_data_with_api_keys(snapshot.data) - if data_sources is None: - return data - return { - source: data.get(source, []) - for source in data_sources - if source in data - } - - enrichment = self.context.get("enrichment", {}).get(str(obj.uid)) - if enrichment is None: - return {} - data_sources = self.context.get("data_sources") - if data_sources is not None: - return { - source: enrichment.data_presence.get(source, []) - for source in data_sources - if source in enrichment.data_presence - } - return enrichment.data_presence - - def get_data_sources(self, obj) -> list[dict[str, int | str]]: - snapshot = getattr(obj, "data_snapshot", None) - if snapshot is not None: - data_source_counts = getattr(snapshot, "data_source_counts", None) - if data_source_counts: - return data_source_counts - if "data" in snapshot.get_deferred_fields(): - return [] - return data_source_summary(snapshot_data_with_api_keys(snapshot.data)) - - enrichment = 
self.context.get("enrichment", {}).get(str(obj.uid)) - if enrichment is None: - return [] - return data_source_summary(enrichment.data_presence) + def get_sources(self, obj) -> list[dict]: + extensions = obj.source_extensions.all() + return OrganizationSourceExtensionSerializer(extensions, many=True).data def get_registries(self, obj) -> list[dict[str, str]]: - snapshot = getattr(obj, "data_snapshot", None) - if snapshot is not None: - return snapshot.registries - - enrichment = self.context.get("enrichment", {}).get(str(obj.uid)) - if enrichment is None: + query = RegistryMembershipPeriod.objects.filter(ended_at__isnull=True) + if obj.inn: + query = query.filter(organization__mn_inn=obj.inn) + elif obj.ogrn: + query = query.filter(organization__mn_ogrn=obj.ogrn) + else: return [] + return [ { - "id": registry.id, - "name": registry.name, + "id": str(membership.registry_id), + "name": membership.registry.name, } - for registry in enrichment.registries + for membership in query.select_related("registry").order_by("registry__name") ] diff --git a/src/organizations/source_backfill.py b/src/organizations/source_backfill.py new file mode 100644 index 0000000..b53a574 --- /dev/null +++ b/src/organizations/source_backfill.py @@ -0,0 +1,594 @@ +"""Backfill organization source extensions from legacy parser tables.""" + +from __future__ import annotations + +from collections.abc import Iterable +from dataclasses import dataclass +from datetime import date, datetime +from decimal import Decimal +from typing import Any +from uuid import UUID + +from apps.parsers.models import ( + FinancialReport, + FinancialReportLine, + GenericParserRecord, + IndustrialCertificateRecord, + IndustrialProductRecord, + InspectionRecord, + ManufacturerRecord, + ParserLoadLog, + ProcurementRecord, +) +from django.db import transaction +from django.db.models import Count, Max, Min, Model + +from organizations.models import ( + Organization, + OrganizationSourceExtension, + 
OrganizationSourceFinancialLine, + OrganizationSourceRecord, +) +from organizations.name_normalization import normalize_organization_name +from organizations.source_groups import ( + SOURCE_GROUP_DESCRIPTORS, + SourceGroupDescriptor, + get_source_group_descriptor, +) +from organizations.source_identity import normalize_identity_fields + + +@dataclass(frozen=True) +class OrganizationSourceBackfillResult: + """Counters returned by organization source backfill.""" + + scanned: int = 0 + created_organizations: int = 0 + created_extensions: int = 0 + updated_extensions: int = 0 + created_records: int = 0 + updated_records: int = 0 + created_financial_lines: int = 0 + updated_financial_lines: int = 0 + unresolved: int = 0 + + def plus(self, other: OrganizationSourceBackfillResult) -> OrganizationSourceBackfillResult: + return OrganizationSourceBackfillResult( + scanned=self.scanned + other.scanned, + created_organizations=self.created_organizations + other.created_organizations, + created_extensions=self.created_extensions + other.created_extensions, + updated_extensions=self.updated_extensions + other.updated_extensions, + created_records=self.created_records + other.created_records, + updated_records=self.updated_records + other.updated_records, + created_financial_lines=self.created_financial_lines + other.created_financial_lines, + updated_financial_lines=self.updated_financial_lines + other.updated_financial_lines, + unresolved=self.unresolved + other.unresolved, + ) + + +@dataclass(frozen=True) +class LegacyRecordAdapter: + """Normalized legacy source record data.""" + + source: str + record_type: str + external_id: str + title: str + organization_name: str + inn: str + kpp: str + ogrn: str + ogrip: str + record_date: str + amount: Decimal | None + status: str + url: str + payload: dict[str, Any] + legacy_model: str + legacy_pk: str + load_batch: int | None + + +class OrganizationSourceBackfillService: + """Build organization source extensions from current parser 
tables.""" + + @classmethod + def backfill( + cls, + *, + source: str | None = None, + batch_id: int | None = None, + ) -> OrganizationSourceBackfillResult: + if source is None: + result = OrganizationSourceBackfillResult() + for descriptor_source in SOURCE_GROUP_DESCRIPTORS: + result = result.plus( + cls.backfill(source=descriptor_source, batch_id=batch_id) + ) + return result + + descriptor = get_source_group_descriptor(str(source)) + adapters = cls._iter_adapters(descriptor, batch_id=batch_id) + return cls._backfill_adapters(descriptor, adapters) + + @classmethod + def _backfill_adapters( + cls, + descriptor: SourceGroupDescriptor, + adapters: Iterable[LegacyRecordAdapter], + ) -> OrganizationSourceBackfillResult: + scanned = 0 + created_organizations = 0 + created_extensions = 0 + updated_extensions = 0 + created_records = 0 + updated_records = 0 + created_financial_lines = 0 + updated_financial_lines = 0 + unresolved = 0 + touched_extension_ids: set[str] = set() + + with transaction.atomic(): + for adapter in adapters: + scanned += 1 + organization, organization_created = cls._resolve_or_create_organization(adapter) + if organization is None: + unresolved += 1 + continue + if organization_created: + created_organizations += 1 + + extension, extension_created = descriptor.extension_model.objects.get_or_create( + organization=organization, + defaults={ + "source_group": descriptor.source_group, + "title": descriptor.title, + "last_load_batch": adapter.load_batch, + }, + ) + if extension_created: + created_extensions += 1 + else: + updated_extensions += cls._update_extension(extension, descriptor, adapter) + + source_record, record_created = OrganizationSourceRecord.objects.update_or_create( + legacy_model=adapter.legacy_model, + legacy_pk=adapter.legacy_pk, + defaults={ + "extension": extension, + "record_type": adapter.record_type, + "source": adapter.source, + "external_id": adapter.external_id, + "title": adapter.title, + "record_date": adapter.record_date, + 
"amount": adapter.amount, + "status": adapter.status, + "url": adapter.url, + "payload": adapter.payload, + "load_batch": adapter.load_batch, + }, + ) + if record_created: + created_records += 1 + else: + updated_records += 1 + + if adapter.source == ParserLoadLog.Source.FNS_REPORTS: + line_result = cls._backfill_financial_lines(source_record, adapter.legacy_pk) + created_financial_lines += line_result[0] + updated_financial_lines += line_result[1] + + touched_extension_ids.add(str(extension.uid)) + + cls._refresh_extension_counters(touched_extension_ids) + + return OrganizationSourceBackfillResult( + scanned=scanned, + created_organizations=created_organizations, + created_extensions=created_extensions, + updated_extensions=updated_extensions, + created_records=created_records, + updated_records=updated_records, + created_financial_lines=created_financial_lines, + updated_financial_lines=updated_financial_lines, + unresolved=unresolved, + ) + + @staticmethod + def _update_extension( + extension: OrganizationSourceExtension, + descriptor: SourceGroupDescriptor, + adapter: LegacyRecordAdapter, + ) -> int: + changed = False + if extension.title != descriptor.title: + extension.title = descriptor.title + changed = True + if adapter.load_batch is not None and extension.last_load_batch != adapter.load_batch: + extension.last_load_batch = adapter.load_batch + changed = True + if changed: + extension.save(update_fields=["title", "last_load_batch", "updated_at"]) + return 1 + return 0 + + @classmethod + def _iter_adapters( + cls, + descriptor: SourceGroupDescriptor, + *, + batch_id: int | None, + ) -> Iterable[LegacyRecordAdapter]: + source = str(descriptor.source) + model_and_adapter = cls._legacy_model_adapter(source) + if model_and_adapter is None: + queryset = GenericParserRecord.objects.filter(source=source) + adapter_factory = cls._generic_adapter + else: + model, adapter_factory = model_and_adapter + queryset = model.objects.all() + + if batch_id is not None: + 
queryset = queryset.filter(load_batch=batch_id) + + for record in queryset.iterator(): + yield adapter_factory(record, descriptor) + + @classmethod + def _legacy_model_adapter(cls, source: str): + return { + ParserLoadLog.Source.INSPECTIONS: ( + InspectionRecord, + cls._inspection_adapter, + ), + ParserLoadLog.Source.FNS_REPORTS: ( + FinancialReport, + cls._financial_report_adapter, + ), + ParserLoadLog.Source.PROCUREMENTS: ( + ProcurementRecord, + cls._procurement_adapter, + ), + ParserLoadLog.Source.INDUSTRIAL: ( + IndustrialCertificateRecord, + cls._industrial_certificate_adapter, + ), + ParserLoadLog.Source.INDUSTRIAL_PRODUCTS: ( + IndustrialProductRecord, + cls._industrial_product_adapter, + ), + ParserLoadLog.Source.MANUFACTURES: ( + ManufacturerRecord, + cls._manufacturer_adapter, + ), + }.get(source) + + @classmethod + def _resolve_or_create_organization( + cls, + adapter: LegacyRecordAdapter, + ) -> tuple[Organization | None, bool]: + organization = cls._resolve_organization(adapter) + if organization is not None: + return organization, False + + name = adapter.organization_name or adapter.title or adapter.external_id + if not name: + return None, False + + organization = Organization.objects.create( + name=name, + inn=adapter.inn, + kpp=adapter.kpp, + ogrn=adapter.ogrn if len(adapter.ogrn) == 13 else "", + ogrip=adapter.ogrip, + ) + return organization, True + + @classmethod + def _resolve_organization(cls, adapter: LegacyRecordAdapter) -> Organization | None: + for resolver in ( + cls._resolve_by_inn_kpp, + cls._resolve_by_ogrn_or_ogrip, + cls._resolve_by_ogrip, + cls._resolve_by_unique_inn, + cls._resolve_by_exact_normalized_name, + ): + organization = resolver(adapter) + if organization is not None: + return organization + + return None + + @staticmethod + def _resolve_by_inn_kpp(adapter: LegacyRecordAdapter) -> Organization | None: + if not adapter.inn or not adapter.kpp: + return None + return Organization.objects.filter( + inn=adapter.inn, + 
kpp=adapter.kpp, + ).first() + + @staticmethod + def _resolve_by_ogrn_or_ogrip(adapter: LegacyRecordAdapter) -> Organization | None: + if not adapter.ogrn: + return None + return ( + Organization.objects.filter(ogrn=adapter.ogrn).first() + or Organization.objects.filter(ogrip=adapter.ogrn).first() + ) + + @staticmethod + def _resolve_by_ogrip(adapter: LegacyRecordAdapter) -> Organization | None: + if not adapter.ogrip: + return None + return Organization.objects.filter(ogrip=adapter.ogrip).first() + + @staticmethod + def _resolve_by_unique_inn(adapter: LegacyRecordAdapter) -> Organization | None: + if not adapter.inn: + return None + organizations = list(Organization.objects.filter(inn=adapter.inn)[:2]) + return organizations[0] if len(organizations) == 1 else None + + @staticmethod + def _resolve_by_exact_normalized_name( + adapter: LegacyRecordAdapter, + ) -> Organization | None: + normalized_name = normalize_organization_name(adapter.organization_name) + if not normalized_name: + return None + matches = list( + Organization.objects.filter(name__iexact=adapter.organization_name)[:2] + ) + return matches[0] if len(matches) == 1 else None + + @classmethod + def _backfill_financial_lines( + cls, + source_record: OrganizationSourceRecord, + report_pk: str, + ) -> tuple[int, int]: + created = 0 + updated = 0 + for line in FinancialReportLine.objects.filter(report_id=report_pk).iterator(): + _, was_created = OrganizationSourceFinancialLine.objects.update_or_create( + source_record=source_record, + form_code=line.form_code, + line_code=line.line_code, + year=line.year, + defaults={ + "line_name": line.line_name, + "period_start": line.period_start, + "period_end": line.period_end, + }, + ) + if was_created: + created += 1 + else: + updated += 1 + return created, updated + + @staticmethod + def _refresh_extension_counters(extension_ids: set[str]) -> None: + for extension in OrganizationSourceExtension.objects.filter(uid__in=extension_ids): + aggregate = 
extension.records.aggregate( + records_count=Count("uid"), + first_seen_at=Min("created_at"), + last_seen_at=Max("created_at"), + last_load_batch=Max("load_batch"), + ) + extension.records_count = aggregate["records_count"] or 0 + extension.first_seen_at = aggregate["first_seen_at"] + extension.last_seen_at = aggregate["last_seen_at"] + extension.last_load_batch = aggregate["last_load_batch"] + extension.save( + update_fields=[ + "records_count", + "first_seen_at", + "last_seen_at", + "last_load_batch", + "updated_at", + ] + ) + + @classmethod + def _inspection_adapter( + cls, + record: InspectionRecord, + descriptor: SourceGroupDescriptor, + ) -> LegacyRecordAdapter: + return cls._adapter( + record, + descriptor, + external_id=record.registration_number, + title=record.organisation_name, + organization_name=record.organisation_name, + inn=record.inn, + ogrn=record.ogrn, + record_date=record.start_date, + status=record.status, + ) + + @classmethod + def _financial_report_adapter( + cls, + record: FinancialReport, + descriptor: SourceGroupDescriptor, + ) -> LegacyRecordAdapter: + return cls._adapter( + record, + descriptor, + external_id=record.external_id, + title=record.file_name, + organization_name="", + ogrn=record.ogrn, + status=record.status, + ) + + @classmethod + def _procurement_adapter( + cls, + record: ProcurementRecord, + descriptor: SourceGroupDescriptor, + ) -> LegacyRecordAdapter: + return cls._adapter( + record, + descriptor, + external_id=record.purchase_number, + title=record.purchase_name, + organization_name=record.customer_name, + inn=record.customer_inn, + kpp=record.customer_kpp, + ogrn=record.customer_ogrn, + record_date=record.publish_date, + amount=record.max_price_amount, + status=record.status, + url=record.href, + ) + + @classmethod + def _industrial_certificate_adapter( + cls, + record: IndustrialCertificateRecord, + descriptor: SourceGroupDescriptor, + ) -> LegacyRecordAdapter: + return cls._adapter( + record, + descriptor, + 
external_id=record.certificate_number, + title=record.certificate_number, + organization_name=record.organisation_name, + inn=record.inn, + ogrn=record.ogrn, + record_date=record.issue_date, + url=record.certificate_file_url, + ) + + @classmethod + def _industrial_product_adapter( + cls, + record: IndustrialProductRecord, + descriptor: SourceGroupDescriptor, + ) -> LegacyRecordAdapter: + return cls._adapter( + record, + descriptor, + external_id=record.registry_number, + title=record.product_name, + organization_name=record.full_organisation_name, + inn=record.inn, + ogrn=record.ogrn, + ) + + @classmethod + def _manufacturer_adapter( + cls, + record: ManufacturerRecord, + descriptor: SourceGroupDescriptor, + ) -> LegacyRecordAdapter: + return cls._adapter( + record, + descriptor, + external_id=record.inn, + title=record.full_legal_name, + organization_name=record.full_legal_name, + inn=record.inn, + ogrn=record.ogrn, + ) + + @classmethod + def _generic_adapter( + cls, + record: GenericParserRecord, + descriptor: SourceGroupDescriptor, + ) -> LegacyRecordAdapter: + payload = cls._model_payload(record) + if isinstance(record.payload, dict): + payload.update(record.payload) + return cls._adapter( + record, + descriptor, + external_id=record.external_id, + title=record.title, + organization_name=record.organisation_name, + inn=record.inn, + ogrn=record.ogrn, + record_date=record.record_date, + amount=record.amount, + status=record.status, + url=record.url, + payload=payload, + ) + + @classmethod + def _adapter( + cls, + record, + descriptor: SourceGroupDescriptor, + *, + external_id: str, + title: str, + organization_name: str, + inn: str = "", + kpp: str = "", + ogrn: str = "", + ogrip: str = "", + record_date: str = "", + amount: Decimal | None = None, + status: str = "", + url: str = "", + payload: dict[str, Any] | None = None, + ) -> LegacyRecordAdapter: + normalized_inn, normalized_kpp, normalized_ogrn, normalized_ogrip = ( + normalize_identity_fields( + inn=inn, 
+ kpp=kpp, + ogrn=ogrn, + ogrip=ogrip, + ) + ) + return LegacyRecordAdapter( + source=str(descriptor.source), + record_type=descriptor.record_type, + external_id=str(external_id or ""), + title=str(title or ""), + organization_name=str(organization_name or ""), + inn=normalized_inn, + kpp=normalized_kpp, + ogrn=normalized_ogrn, + ogrip=normalized_ogrip, + record_date=str(record_date or ""), + amount=amount, + status=str(status or ""), + url=str(url or ""), + payload=payload if payload is not None else cls._model_payload(record), + legacy_model=cls._legacy_model_name(record), + legacy_pk=str(record.pk), + load_batch=getattr(record, "load_batch", None), + ) + + @staticmethod + def _legacy_model_name(record: Model) -> str: + module = record.__class__.__module__.removesuffix(".models") + return f"{module}.{record.__class__.__name__}" + + @staticmethod + def _model_payload(record: Model) -> dict[str, Any]: + payload = {"id": record.pk} + for field in record._meta.concrete_fields: + name = field.name + value = ( + getattr(record, field.attname) + if field.is_relation and field.many_to_one + else getattr(record, name) + ) + if name == "id": + continue + if isinstance(value, datetime | date): + payload[name] = value.isoformat() + elif isinstance(value, Decimal | UUID): + payload[name] = str(value) + else: + payload[name] = value + return payload diff --git a/src/organizations/source_groups.py b/src/organizations/source_groups.py new file mode 100644 index 0000000..e44e664 --- /dev/null +++ b/src/organizations/source_groups.py @@ -0,0 +1,163 @@ +"""Source group mapping for organization source extensions.""" + +from __future__ import annotations + +from dataclasses import dataclass + +from apps.parsers.models import ParserLoadLog + +from organizations.models import ( + ArbitrationExtension, + BankruptcyExtension, + DefenseSupplierExtension, + FinancialIndicatorsExtension, + GovernmentProcurementExtension, + IndustrialProductionExtension, + OrganizationSourceExtension, + 
PlannedInspectionExtension, + SecurityRegistryExtension, + SourceGroup, + VacancyExtension, +) + + +@dataclass(frozen=True) +class SourceGroupDescriptor: + """Mapping from parser source to organization extension group.""" + + source: str + source_group: str + record_type: str + title: str + extension_model: type[OrganizationSourceExtension] + + +SOURCE_GROUP_DESCRIPTORS: dict[str, SourceGroupDescriptor] = { + ParserLoadLog.Source.FNS_REPORTS: SourceGroupDescriptor( + source=ParserLoadLog.Source.FNS_REPORTS, + source_group=SourceGroup.FINANCIAL_INDICATORS, + record_type="financial_report", + title="Финансово-экономические показатели", + extension_model=FinancialIndicatorsExtension, + ), + ParserLoadLog.Source.PROCUREMENTS: SourceGroupDescriptor( + source=ParserLoadLog.Source.PROCUREMENTS, + source_group=SourceGroup.GOVERNMENT_PROCUREMENTS, + record_type="procurement", + title="Государственные закупки по 44-ФЗ и 223-ФЗ", + extension_model=GovernmentProcurementExtension, + ), + ParserLoadLog.Source.PROCUREMENTS_44FZ: SourceGroupDescriptor( + source=ParserLoadLog.Source.PROCUREMENTS_44FZ, + source_group=SourceGroup.GOVERNMENT_PROCUREMENTS, + record_type="procurement_44fz", + title="Государственные закупки по 44-ФЗ и 223-ФЗ", + extension_model=GovernmentProcurementExtension, + ), + ParserLoadLog.Source.PROCUREMENTS_223FZ: SourceGroupDescriptor( + source=ParserLoadLog.Source.PROCUREMENTS_223FZ, + source_group=SourceGroup.GOVERNMENT_PROCUREMENTS, + record_type="procurement_223fz", + title="Государственные закупки по 44-ФЗ и 223-ФЗ", + extension_model=GovernmentProcurementExtension, + ), + ParserLoadLog.Source.CONTRACTS: SourceGroupDescriptor( + source=ParserLoadLog.Source.CONTRACTS, + source_group=SourceGroup.GOVERNMENT_PROCUREMENTS, + record_type="contract", + title="Государственные закупки по 44-ФЗ и 223-ФЗ", + extension_model=GovernmentProcurementExtension, + ), + ParserLoadLog.Source.INDUSTRIAL: SourceGroupDescriptor( + source=ParserLoadLog.Source.INDUSTRIAL, + 
source_group=SourceGroup.INDUSTRIAL_PRODUCTION, + record_type="industrial_certificate", + title="Производители и продукция России", + extension_model=IndustrialProductionExtension, + ), + ParserLoadLog.Source.INDUSTRIAL_PRODUCTS: SourceGroupDescriptor( + source=ParserLoadLog.Source.INDUSTRIAL_PRODUCTS, + source_group=SourceGroup.INDUSTRIAL_PRODUCTION, + record_type="industrial_product", + title="Производители и продукция России", + extension_model=IndustrialProductionExtension, + ), + ParserLoadLog.Source.MANUFACTURES: SourceGroupDescriptor( + source=ParserLoadLog.Source.MANUFACTURES, + source_group=SourceGroup.INDUSTRIAL_PRODUCTION, + record_type="manufacturer", + title="Производители и продукция России", + extension_model=IndustrialProductionExtension, + ), + ParserLoadLog.Source.INSPECTIONS: SourceGroupDescriptor( + source=ParserLoadLog.Source.INSPECTIONS, + source_group=SourceGroup.PLANNED_INSPECTIONS, + record_type="inspection", + title="Плановые проверки Генпрокуратуры России", + extension_model=PlannedInspectionExtension, + ), + ParserLoadLog.Source.FEDRESURS_BANKRUPTCY: SourceGroupDescriptor( + source=ParserLoadLog.Source.FEDRESURS_BANKRUPTCY, + source_group=SourceGroup.BANKRUPTCY, + record_type="bankruptcy", + title="Сведения о процедурах банкротства", + extension_model=BankruptcyExtension, + ), + ParserLoadLog.Source.UNFAIR_SUPPLIERS: SourceGroupDescriptor( + source=ParserLoadLog.Source.UNFAIR_SUPPLIERS, + source_group=SourceGroup.DEFENSE_SUPPLIERS, + record_type="unfair_supplier", + title="Недобросовестные поставщики ГОЗ", + extension_model=DefenseSupplierExtension, + ), + ParserLoadLog.Source.FAS_GOZ: SourceGroupDescriptor( + source=ParserLoadLog.Source.FAS_GOZ, + source_group=SourceGroup.DEFENSE_SUPPLIERS, + record_type="fas_goz", + title="Недобросовестные поставщики ГОЗ", + extension_model=DefenseSupplierExtension, + ), + ParserLoadLog.Source.ARBITRATION: SourceGroupDescriptor( + source=ParserLoadLog.Source.ARBITRATION, + 
source_group=SourceGroup.ARBITRATION, + record_type="arbitration_case", + title="Арбитражные дела", + extension_model=ArbitrationExtension, + ), + ParserLoadLog.Source.FSTEC: SourceGroupDescriptor( + source=ParserLoadLog.Source.FSTEC, + source_group=SourceGroup.SECURITY_REGISTRIES, + record_type="security_registry", + title="Реестры по информационной безопасности", + extension_model=SecurityRegistryExtension, + ), + ParserLoadLog.Source.TRUDVSEM: SourceGroupDescriptor( + source=ParserLoadLog.Source.TRUDVSEM, + source_group=SourceGroup.VACANCIES, + record_type="vacancy", + title="Вакансии Работа России", + extension_model=VacancyExtension, + ), + "hh": SourceGroupDescriptor( + source="hh", + source_group=SourceGroup.VACANCIES, + record_type="vacancy", + title="Вакансии Работа России", + extension_model=VacancyExtension, + ), + "superjob": SourceGroupDescriptor( + source="superjob", + source_group=SourceGroup.VACANCIES, + record_type="vacancy", + title="Вакансии Работа России", + extension_model=VacancyExtension, + ), +} + + +def get_source_group_descriptor(source: str) -> SourceGroupDescriptor: + """Return source group descriptor or raise for unsupported source.""" + try: + return SOURCE_GROUP_DESCRIPTORS[str(source)] + except KeyError as exc: + raise ValueError(f"Unsupported organization source group: {source}") from exc diff --git a/src/organizations/source_identity.py b/src/organizations/source_identity.py new file mode 100644 index 0000000..c5d7a81 --- /dev/null +++ b/src/organizations/source_identity.py @@ -0,0 +1,52 @@ +"""Identity normalization helpers for organization source ingestion.""" + +from __future__ import annotations + +import re + +VALID_INN_LENGTHS = frozenset({10, 12}) +VALID_KPP_LENGTH = 9 +VALID_OGRN_LENGTH = 13 +VALID_OGRIP_LENGTH = 15 + + +def digits(value: str | None) -> str: + """Return only decimal digits from a source identity value.""" + return re.sub(r"\D+", "", str(value or "")) + + +def normalize_identity_fields( + *, + inn: str | 
None = "", + kpp: str | None = "", + ogrn: str | None = "", + ogrip: str | None = "", +) -> tuple[str, str, str, str]: + """Normalize parser identity values to canonical Organization constraints.""" + inn_digits = digits(inn) + kpp_digits = digits(kpp) + ogrn_digits = digits(ogrn) + ogrip_digits = digits(ogrip) + + normalized_ogrip = "" + if len(ogrip_digits) == VALID_OGRIP_LENGTH: + normalized_ogrip = ogrip_digits + elif len(ogrn_digits) == VALID_OGRIP_LENGTH: + normalized_ogrip = ogrn_digits + + normalized_inn = inn_digits if len(inn_digits) in VALID_INN_LENGTHS else "" + normalized_kpp = ( + "" + if normalized_ogrip + else kpp_digits + if len(kpp_digits) == VALID_KPP_LENGTH + else "" + ) + normalized_ogrn = ( + "" + if normalized_ogrip + else ogrn_digits + if len(ogrn_digits) == VALID_OGRN_LENGTH + else "" + ) + return normalized_inn, normalized_kpp, normalized_ogrn, normalized_ogrip diff --git a/src/organizations/source_ingestion.py b/src/organizations/source_ingestion.py new file mode 100644 index 0000000..ad7cc47 --- /dev/null +++ b/src/organizations/source_ingestion.py @@ -0,0 +1,962 @@ +"""Direct parser ingestion into organization source storage.""" + +from __future__ import annotations + +from collections import defaultdict +from collections.abc import Iterable, Iterator +from dataclasses import dataclass, field +from decimal import Decimal +from typing import Any + +from django.db import transaction +from django.db.models import Count, Max, Min, Q +from django.db.models.functions import Lower +from django.utils import timezone + +from organizations.models import ( + Organization, + OrganizationSourceExtension, + OrganizationSourceFinancialLine, + OrganizationSourceRecord, +) +from organizations.name_normalization import normalize_organization_name +from organizations.source_groups import ( + SourceGroupDescriptor, + get_source_group_descriptor, +) +from organizations.source_identity import normalize_identity_fields + + +@dataclass(frozen=True) +class 
SourceFinancialLineInput: + """Structured financial line parsed from a source record.""" + + form_code: str + line_code: str + line_name: str + year: int + period_start: int | None = None + period_end: int | None = None + + +@dataclass(frozen=True) +class SourceRecordInput: + """Normalized parser output ready for organization source storage.""" + + external_id: str + title: str + organization_name: str + inn: str = "" + kpp: str = "" + ogrn: str = "" + ogrip: str = "" + record_date: str = "" + amount: Decimal | None = None + status: str = "" + url: str = "" + payload: dict[str, Any] = field(default_factory=dict) + financial_lines: Iterable[SourceFinancialLineInput] = field(default_factory=tuple) + + +@dataclass(frozen=True) +class OrganizationSourceIngestionResult: + """Counters returned by direct source ingestion.""" + + scanned: int = 0 + created_organizations: int = 0 + created_extensions: int = 0 + updated_extensions: int = 0 + created_records: int = 0 + updated_records: int = 0 + created_financial_lines: int = 0 + updated_financial_lines: int = 0 + unresolved: int = 0 + + +@dataclass(frozen=True) +class _NormalizedRecordInput: + index: int + record: SourceRecordInput + inn: str + kpp: str + ogrn: str + ogrip: str + organization_name: str + + +class OrganizationSourceIngestionService: + """Persist parser outputs directly into organization source storage.""" + + chunk_size = 500 + + @classmethod + def save_records( + cls, + *, + source: str, + load_batch: int | None, + records: Iterable[SourceRecordInput], + ) -> OrganizationSourceIngestionResult: + descriptor = get_source_group_descriptor(str(source)) + deduplicated = cls._deduplicate_records(records) + return cls._save_records(descriptor, load_batch, deduplicated) + + @staticmethod + def _deduplicate_records( + records: Iterable[SourceRecordInput], + ) -> list[SourceRecordInput]: + by_external_id: dict[str, SourceRecordInput] = {} + without_external_id = [] + for record in records: + external_id = 
str(record.external_id or "") + if not external_id: + without_external_id.append(record) + continue + by_external_id[external_id] = record + return [*by_external_id.values(), *without_external_id] + + @classmethod + def _save_records( + cls, + descriptor: SourceGroupDescriptor, + load_batch: int | None, + records: list[SourceRecordInput], + ) -> OrganizationSourceIngestionResult: + scanned = len(records) + if not records: + return OrganizationSourceIngestionResult() + + with transaction.atomic(): + normalized_records = cls._normalize_records(records) + organizations_by_index, created_organizations = ( + cls._resolve_or_create_organizations(normalized_records) + ) + del normalized_records + unresolved = scanned - len(organizations_by_index) + + extensions_by_organization_id, created_extensions, updated_extensions = ( + cls._resolve_or_create_extensions( + descriptor=descriptor, + load_batch=load_batch, + organizations=organizations_by_index.values(), + ) + ) + + touched_extension_ids: set[str] = set() + for organization in organizations_by_index.values(): + extension = extensions_by_organization_id.get(organization.uid) + if extension is not None: + touched_extension_ids.add(str(extension.uid)) + + ( + created_records, + updated_records, + created_financial_lines, + updated_financial_lines, + ) = cls._bulk_upsert_source_records( + descriptor=descriptor, + load_batch=load_batch, + records=records, + record_inputs_with_extensions=cls._iter_record_inputs_with_extensions( + records=records, + organizations_by_index=organizations_by_index, + extensions_by_organization_id=extensions_by_organization_id, + ), + ) + + cls._refresh_extension_counters(touched_extension_ids) + + return OrganizationSourceIngestionResult( + scanned=scanned, + created_organizations=created_organizations, + created_extensions=created_extensions, + updated_extensions=updated_extensions, + created_records=created_records, + updated_records=updated_records, + 
created_financial_lines=created_financial_lines, + updated_financial_lines=updated_financial_lines, + unresolved=unresolved, + ) + + @staticmethod + def _chunks(values: list[Any], chunk_size: int) -> Iterable[list[Any]]: + for offset in range(0, len(values), chunk_size): + yield values[offset : offset + chunk_size] + + @staticmethod + def _iter_chunks(values: Iterable[Any], chunk_size: int) -> Iterator[list[Any]]: + chunk = [] + for value in values: + chunk.append(value) + if len(chunk) >= chunk_size: + yield chunk + chunk = [] + if chunk: + yield chunk + + @staticmethod + def _iter_record_inputs_with_extensions( + *, + records: list[SourceRecordInput], + organizations_by_index: dict[int, Organization], + extensions_by_organization_id: dict[Any, OrganizationSourceExtension], + ) -> Iterator[tuple[int, SourceRecordInput, OrganizationSourceExtension]]: + for index, organization in organizations_by_index.items(): + extension = extensions_by_organization_id.get(organization.uid) + if extension is not None: + yield index, records[index], extension + + @classmethod + def _normalize_records( + cls, + records: list[SourceRecordInput], + ) -> list[_NormalizedRecordInput]: + normalized_records = [] + for index, record_input in enumerate(records): + inn, kpp, ogrn, ogrip = normalize_identity_fields( + inn=record_input.inn, + kpp=record_input.kpp, + ogrn=record_input.ogrn, + ogrip=record_input.ogrip, + ) + normalized_records.append( + _NormalizedRecordInput( + index=index, + record=record_input, + inn=inn, + kpp=kpp, + ogrn=ogrn, + ogrip=ogrip, + organization_name=str(record_input.organization_name or ""), + ) + ) + return normalized_records + + @classmethod + def _resolve_or_create_organizations( + cls, + normalized_records: list[_NormalizedRecordInput], + ) -> tuple[dict[int, Organization], int]: + organizations_by_index: dict[int, Organization] = {} + + cls._resolve_organizations_by_inn_kpp( + normalized_records, + organizations_by_index, + ) + 
cls._resolve_organizations_by_ogrn_or_ogrip( + normalized_records, + organizations_by_index, + ) + cls._resolve_organizations_by_unique_inn( + normalized_records, + organizations_by_index, + ) + cls._resolve_organizations_by_exact_name( + normalized_records, + organizations_by_index, + ) + + return cls._create_missing_organizations( + normalized_records, + organizations_by_index, + ) + + @classmethod + def _resolve_organizations_by_inn_kpp( + cls, + normalized_records: list[_NormalizedRecordInput], + organizations_by_index: dict[int, Organization], + ) -> None: + keys = sorted( + { + (record.inn, record.kpp) + for record in normalized_records + if record.index not in organizations_by_index + and record.inn + and record.kpp + } + ) + if not keys: + return + + organizations_by_key: dict[tuple[str, str], Organization] = {} + for chunk in cls._chunks(keys, cls.chunk_size): + query = Q() + for inn, kpp in chunk: + query |= Q(inn=inn, kpp=kpp) + for organization in Organization.objects.filter(query): + organizations_by_key[(organization.inn, organization.kpp)] = organization + + for record in normalized_records: + if record.index in organizations_by_index: + continue + organization = organizations_by_key.get((record.inn, record.kpp)) + if organization is not None: + organizations_by_index[record.index] = organization + + @classmethod + def _resolve_organizations_by_ogrn_or_ogrip( + cls, + normalized_records: list[_NormalizedRecordInput], + organizations_by_index: dict[int, Organization], + ) -> None: + ogrn_values = sorted( + { + record.ogrn + for record in normalized_records + if record.index not in organizations_by_index and record.ogrn + } + ) + ogrip_values = sorted( + { + record.ogrip + for record in normalized_records + if record.index not in organizations_by_index and record.ogrip + } + ) + lookup_values = sorted({*ogrn_values, *ogrip_values}) + if not lookup_values: + return + + by_ogrn: dict[str, Organization] = {} + by_ogrip: dict[str, Organization] = {} + for 
@classmethod
def _resolve_organizations_by_exact_name(
    cls,
    normalized_records: list[_NormalizedRecordInput],
    organizations_by_index: dict[int, Organization],
) -> None:
    """Attach organizations to unresolved records by case-insensitive name.

    Only records whose name yields a truthy ``normalize_organization_name``
    result take part in the lookup. A record is resolved only when exactly
    one organization carries the (stripped, lowercased) name; names shared by
    several organizations are left unresolved. ``organizations_by_index`` is
    updated in place.
    """
    # Deduplicate AFTER lowercasing. Deduplicating the original spellings and
    # lowercasing afterwards can leave the same lowercase name in two
    # different chunks; the organization is then fetched twice and wrongly
    # treated as ambiguous by the "exactly one match" rule below.
    names_lower = sorted(
        {
            record.organization_name.strip().lower()
            for record in normalized_records
            if record.index not in organizations_by_index
            and normalize_organization_name(record.organization_name)
        }
    )
    if not names_lower:
        return

    organizations_by_name: dict[str, list[Organization]] = defaultdict(list)
    for chunk in cls._chunks(names_lower, cls.chunk_size):
        for organization in Organization.objects.annotate(
            name_lower=Lower("name")
        ).filter(name_lower__in=chunk):
            organizations_by_name[organization.name.lower()].append(organization)

    # Keep only names that identify a single organization.
    unique_by_name = {
        name: organizations[0]
        for name, organizations in organizations_by_name.items()
        if len(organizations) == 1
    }
    for record in normalized_records:
        if record.index in organizations_by_index:
            continue
        organization = unique_by_name.get(record.organization_name.strip().lower())
        if organization is not None:
            organizations_by_index[record.index] = organization
create_instances = list(organizations_by_create_key.values()) + if not create_instances: + return organizations_by_index, 0 + + created_count = 0 + for chunk in cls._chunks(create_instances, cls.chunk_size): + created_count += len( + Organization.objects.bulk_create(chunk, ignore_conflicts=True) + ) + + for create_key, organization in organizations_by_create_key.items(): + persisted = cls._refetch_created_organization(organization) + if persisted is None: + continue + for index in indexes_by_create_key[create_key]: + organizations_by_index[index] = persisted + + return organizations_by_index, created_count + + @staticmethod + def _organization_create_key( + record: _NormalizedRecordInput, + name: str, + ) -> tuple[str, str]: + if record.inn and record.kpp: + return ("inn_kpp", f"{record.inn}:{record.kpp}") + if record.ogrip: + return ("ogrip", record.ogrip) + if record.ogrn: + return ("ogrn", record.ogrn) + if record.inn: + return ("inn", record.inn) + normalized_name = normalize_organization_name(name) + if normalized_name: + return ("name", normalized_name) + return ("external_id", str(record.record.external_id or name)) + + @staticmethod + def _refetch_created_organization( + organization: Organization, + ) -> Organization | None: + if organization.inn and organization.kpp: + return Organization.objects.filter( + inn=organization.inn, + kpp=organization.kpp, + ).first() + if organization.ogrn: + return Organization.objects.filter(ogrn=organization.ogrn).first() + if organization.ogrip: + return Organization.objects.filter(ogrip=organization.ogrip).first() + if organization.inn: + matches = list(Organization.objects.filter(inn=organization.inn)[:2]) + return matches[0] if len(matches) == 1 else None + return Organization.objects.filter(name__iexact=organization.name).first() + + @classmethod + def _resolve_or_create_extensions( + cls, + *, + descriptor: SourceGroupDescriptor, + load_batch: int | None, + organizations: Iterable[Organization], + ) -> 
tuple[dict[Any, OrganizationSourceExtension], int, int]: + unique_organizations = {organization.uid: organization for organization in organizations} + if not unique_organizations: + return {}, 0, 0 + + existing_extensions = { + extension.organization_id: extension + for extension in descriptor.extension_model.objects.filter( + organization_id__in=list(unique_organizations) + ) + } + + created_extensions = 0 + extensions_by_organization_id = dict(existing_extensions) + for organization_id, organization in unique_organizations.items(): + if organization_id in extensions_by_organization_id: + continue + extensions_by_organization_id[organization_id] = ( + descriptor.extension_model.objects.create( + organization=organization, + source_group=descriptor.source_group, + title=descriptor.title, + last_load_batch=load_batch, + ) + ) + created_extensions += 1 + + changed_extension_ids = [] + for extension in existing_extensions.values(): + changed = False + if extension.title != descriptor.title: + extension.title = descriptor.title + changed = True + if load_batch is not None and extension.last_load_batch != load_batch: + extension.last_load_batch = load_batch + changed = True + if changed: + changed_extension_ids.append(extension.uid) + + updated_extensions = 0 + if changed_extension_ids: + update_kwargs = { + "title": descriptor.title, + "updated_at": timezone.now(), + } + if load_batch is not None: + update_kwargs["last_load_batch"] = load_batch + updated_extensions = descriptor.extension_model.objects.filter( + uid__in=changed_extension_ids + ).update(**update_kwargs) + + return extensions_by_organization_id, created_extensions, updated_extensions + + @classmethod + def _bulk_upsert_source_records( + cls, + *, + descriptor: SourceGroupDescriptor, + load_batch: int | None, + records: list[SourceRecordInput], + record_inputs_with_extensions: Iterable[ + tuple[int, SourceRecordInput, OrganizationSourceExtension] + ], + ) -> tuple[int, int, int, int]: + created_records = 0 
@classmethod
def _bulk_upsert_source_records_chunk(
    cls,
    *,
    descriptor: SourceGroupDescriptor,
    load_batch: int | None,
    record_inputs_with_extensions: list[
        tuple[int, SourceRecordInput, OrganizationSourceExtension]
    ],
) -> tuple[dict[int, OrganizationSourceRecord], int, int]:
    """Create or update source records for one chunk of inputs.

    Inputs with a non-empty external id are matched against rows that share
    ``descriptor.source`` and that external id; matches are updated, the rest
    are created. Inputs with an empty external id are always created.

    Returns ``(source_records_by_index, created_count, updated_count)`` where
    the mapping is keyed by the input index carried in each tuple.
    """
    if not record_inputs_with_extensions:
        return {}, 0, 0

    source = str(descriptor.source)
    external_ids = {
        str(record_input.external_id or "")
        for _, record_input, _ in record_inputs_with_extensions
    }
    external_ids.discard("")
    existing_by_external_id: dict[str, OrganizationSourceRecord] = {
        source_record.external_id: source_record
        for source_record in OrganizationSourceRecord.objects.filter(
            source=source,
            external_id__in=sorted(external_ids),
        )
    }

    now = timezone.now()
    create_instances: list[OrganizationSourceRecord] = []
    update_instances: list[OrganizationSourceRecord] = []
    # Not-yet-inserted instances keyed by external id. A second input with
    # the same non-empty external id in this chunk overwrites the pending
    # instance instead of creating a second row, matching what already
    # happens when the repeat lands in a later chunk (it is found in the
    # database and updated) and what _upsert_source_record does per record.
    pending_by_external_id: dict[str, OrganizationSourceRecord] = {}
    merged_duplicates = 0
    source_records_by_index: dict[int, OrganizationSourceRecord] = {}
    update_fields = [
        "extension",
        "record_type",
        "title",
        "record_date",
        "amount",
        "status",
        "url",
        "payload",
        "load_batch",
        "legacy_model",
        "legacy_pk",
        "updated_at",
    ]

    for index, record_input, extension in record_inputs_with_extensions:
        external_id = str(record_input.external_id or "")
        defaults = {
            "extension": extension,
            "record_type": descriptor.record_type,
            "title": str(record_input.title or ""),
            "record_date": str(record_input.record_date or ""),
            "amount": record_input.amount,
            "status": str(record_input.status or ""),
            "url": str(record_input.url or ""),
            "payload": dict(record_input.payload or {}),
            "load_batch": load_batch,
            "legacy_model": "",
            "legacy_pk": "",
            "updated_at": now,
        }

        source_record = existing_by_external_id.get(external_id)
        if source_record is not None:
            for field_name, value in defaults.items():
                setattr(source_record, field_name, value)
            update_instances.append(source_record)
        else:
            pending = pending_by_external_id.get(external_id) if external_id else None
            if pending is not None:
                # In-chunk repeat of a new external id: last input wins.
                for field_name, value in defaults.items():
                    setattr(pending, field_name, value)
                merged_duplicates += 1
                source_record = pending
            else:
                source_record = OrganizationSourceRecord(
                    source=source,
                    external_id=external_id,
                    created_at=now,
                    **defaults,
                )
                create_instances.append(source_record)
                if external_id:
                    pending_by_external_id[external_id] = source_record

        source_records_by_index[index] = source_record

    created_records = 0
    for chunk in cls._chunks(create_instances, cls.chunk_size):
        created_records += len(OrganizationSourceRecord.objects.bulk_create(chunk))

    for chunk in cls._chunks(update_instances, cls.chunk_size):
        OrganizationSourceRecord.objects.bulk_update(
            chunk,
            fields=update_fields,
            batch_size=cls.chunk_size,
        )

    # Merged repeats are reported as updates: one create plus N-1 updates,
    # the same totals a record-by-record upsert would produce.
    updated_records = len(update_instances) + merged_duplicates
    return source_records_by_index, created_records, updated_records
@classmethod
def _resolve_organization(
    cls,
    *,
    inn: str,
    kpp: str,
    ogrn: str,
    ogrip: str,
    organization_name: str,
) -> Organization | None:
    """Return the first organization found by the ordered resolver chain.

    Resolvers are tried in this order: INN+KPP, OGRN (also matched against
    OGRIP), OGRIP, unique INN, exact name. Each receives the full set of
    identity keyword arguments. Returns ``None`` when no resolver matches.
    """
    identity = {
        "inn": inn,
        "kpp": kpp,
        "ogrn": ogrn,
        "ogrip": ogrip,
        "organization_name": organization_name,
    }
    strategies = (
        cls._resolve_by_inn_kpp,
        cls._resolve_by_ogrn_or_ogrip,
        cls._resolve_by_ogrip,
        cls._resolve_by_unique_inn,
        cls._resolve_by_exact_normalized_name,
    )
    # Lazy generator: later resolvers run only if earlier ones returned None.
    attempts = (strategy(**identity) for strategy in strategies)
    return next((found for found in attempts if found is not None), None)
**_kwargs, + ) -> Organization | None: + if not inn: + return None + organizations = list(Organization.objects.filter(inn=inn)[:2]) + return organizations[0] if len(organizations) == 1 else None + + @staticmethod + def _resolve_by_exact_normalized_name( + *, + organization_name: str, + **_kwargs, + ) -> Organization | None: + normalized_name = normalize_organization_name(organization_name) + if not normalized_name: + return None + matches = list(Organization.objects.filter(name__iexact=organization_name)[:2]) + return matches[0] if len(matches) == 1 else None + + @staticmethod + def _update_extension( + extension: OrganizationSourceExtension, + descriptor: SourceGroupDescriptor, + load_batch: int | None, + ) -> int: + changed = False + if extension.title != descriptor.title: + extension.title = descriptor.title + changed = True + if load_batch is not None and extension.last_load_batch != load_batch: + extension.last_load_batch = load_batch + changed = True + if changed: + extension.save(update_fields=["title", "last_load_batch", "updated_at"]) + return 1 + return 0 + + @staticmethod + def _upsert_source_record( + *, + descriptor: SourceGroupDescriptor, + extension: OrganizationSourceExtension, + record_input: SourceRecordInput, + load_batch: int | None, + ) -> tuple[OrganizationSourceRecord, bool]: + defaults = { + "extension": extension, + "record_type": descriptor.record_type, + "title": str(record_input.title or ""), + "record_date": str(record_input.record_date or ""), + "amount": record_input.amount, + "status": str(record_input.status or ""), + "url": str(record_input.url or ""), + "payload": dict(record_input.payload or {}), + "load_batch": load_batch, + "legacy_model": "", + "legacy_pk": "", + } + external_id = str(record_input.external_id or "") + if external_id: + return OrganizationSourceRecord.objects.update_or_create( + source=str(descriptor.source), + external_id=external_id, + defaults=defaults, + ) + return ( + 
OrganizationSourceRecord.objects.create( + source=str(descriptor.source), + external_id="", + **defaults, + ), + True, + ) + + @staticmethod + def _save_financial_lines( + source_record: OrganizationSourceRecord, + financial_lines: Iterable[SourceFinancialLineInput], + ) -> tuple[int, int]: + created = 0 + updated = 0 + for line in financial_lines: + _, was_created = OrganizationSourceFinancialLine.objects.update_or_create( + source_record=source_record, + form_code=str(line.form_code), + line_code=str(line.line_code), + year=line.year, + defaults={ + "line_name": str(line.line_name), + "period_start": line.period_start, + "period_end": line.period_end, + }, + ) + if was_created: + created += 1 + else: + updated += 1 + return created, updated + + @staticmethod + def _refresh_extension_counters(extension_ids: set[str]) -> None: + if not extension_ids: + return + + aggregates = { + row["extension_id"]: row + for row in OrganizationSourceRecord.objects.filter( + extension_id__in=extension_ids + ) + .values("extension_id") + .annotate( + records_count=Count("uid"), + first_seen_at=Min("created_at"), + last_seen_at=Max("created_at"), + last_load_batch=Max("load_batch"), + ) + } + + now = timezone.now() + extensions = list( + OrganizationSourceExtension.objects.filter(uid__in=extension_ids) + ) + for extension in extensions: + aggregate = aggregates.get(extension.uid, {}) + extension.records_count = aggregate.get("records_count") or 0 + extension.first_seen_at = aggregate.get("first_seen_at") + extension.last_seen_at = aggregate.get("last_seen_at") + extension.last_load_batch = aggregate.get("last_load_batch") + extension.updated_at = now + + OrganizationSourceExtension.objects.bulk_update( + extensions, + fields=[ + "records_count", + "first_seen_at", + "last_seen_at", + "last_load_batch", + "updated_at", + ], + batch_size=1000, + ) diff --git a/src/organizations/tasks.py b/src/organizations/tasks.py index dee8f5d..4d50d8a 100644 --- a/src/organizations/tasks.py +++ 
@shared_task
def backfill_all_organization_sources(batch_size: int = 100) -> dict:
    """Backfill all organization source extensions from legacy parser tables.

    Runs ``OrganizationSourceBackfillService.backfill()`` with no arguments,
    invalidates the organization API cache, logs the outcome and returns it
    as a dict. ``batch_size`` is only echoed into the returned payload; it is
    not passed to the service.
    """
    outcome = OrganizationSourceBackfillService.backfill()
    invalidate_organization_api_cache()

    summary = {"batch_size": batch_size}
    summary.update(asdict(outcome))
    logger.info("All organization source extensions backfilled: %s", summary)
    return summary
schedule.""" + return backfill_all_organization_sources(batch_size=batch_size) + + @shared_task def refresh_organization_data_snapshots_for_parser_batch( *, @@ -33,19 +59,9 @@ def refresh_organization_data_snapshots_for_parser_batch( batch_id: int, batch_size: int = 100, ) -> dict: - """Refresh snapshots for organizations affected by one parser batch.""" - result = OrganizationDataSnapshotRefreshService.refresh_for_parser_batch( + """Deprecated compatibility wrapper for old parser task imports.""" + return backfill_organization_sources_for_parser_batch( source=source, batch_id=batch_id, batch_size=batch_size, ) - invalidate_organization_api_cache() - payload = { - "source": source, - "batch_id": batch_id, - "processed": result.processed, - "created": result.created, - "updated": result.updated, - } - logger.info("Organization data snapshots refreshed: %s", payload) - return payload diff --git a/src/organizations/urls.py b/src/organizations/urls.py index 6d0a619..3632d4a 100644 --- a/src/organizations/urls.py +++ b/src/organizations/urls.py @@ -3,10 +3,24 @@ from django.urls import include, path from rest_framework.routers import DefaultRouter -from organizations.views import OrganizationViewSet +from organizations.views import ( + OrganizationSourceExtensionViewSet, + OrganizationSourceRecordViewSet, + OrganizationViewSet, +) router = DefaultRouter() router.register(r"organizations", OrganizationViewSet, basename="organizations") +router.register( + r"organization-sources", + OrganizationSourceExtensionViewSet, + basename="organization-sources", +) +router.register( + r"organization-source-records", + OrganizationSourceRecordViewSet, + basename="organization-source-records", +) organizations_urlpatterns = [ path("", include(router.urls)), diff --git a/src/organizations/views.py b/src/organizations/views.py index b2f01de..646f8d8 100644 --- a/src/organizations/views.py +++ b/src/organizations/views.py @@ -1,39 +1,46 @@ -"""Views for organizations API v2.""" 
+"""Views for organization-centric API v2.""" from __future__ import annotations import hashlib +import json from typing import Any from apps.core.openapi import swagger_tag from django.conf import settings from django.core.cache import cache +from django.db.models import CharField, Q +from django.db.models.functions import Cast from django_filters import rest_framework as filters from drf_yasg import openapi from drf_yasg.utils import swagger_auto_schema -from rest_framework.exceptions import ValidationError +from registers.models import RegistryMembershipPeriod +from rest_framework.decorators import action from rest_framework.filters import OrderingFilter, SearchFilter from rest_framework.permissions import AllowAny, IsAuthenticated from rest_framework.response import Response from rest_framework.viewsets import ReadOnlyModelViewSet -from organizations.api_enrichment import ( - API_DATA_SOURCE_KEY_SET, - OrganizationApiEnrichmentService, - to_api_data_source, - to_internal_data_source, -) from organizations.cache import ( DEFAULT_ORGANIZATION_API_CACHE_TIMEOUT_SECONDS, ORGANIZATION_API_CACHE_PREFIX, get_organization_api_cache_version, ) from organizations.filters import OrganizationFilter -from organizations.models import Organization -from organizations.serializers import OrganizationSerializer +from organizations.models import ( + Organization, + OrganizationSourceExtension, + OrganizationSourceRecord, + SourceGroup, +) +from organizations.serializers import ( + OrganizationSerializer, + OrganizationSourceExtensionSerializer, + OrganizationSourceRecordSerializer, +) ORGANIZATIONS_TAG = swagger_tag("Организации", "Organizations") -ORGANIZATION_DATA_SOURCE_KEYS = ", ".join(sorted(API_DATA_SOURCE_KEY_SET)) +FALSE_QUERY_VALUES = {"0", "false", "no", "off"} def _query_parameter( @@ -57,38 +64,12 @@ def _query_parameter( ) -ORGANIZATION_DATA_PARAMS = [ - _query_parameter( - "data", - description=( - "Ограничить блок data одним или несколькими источниками. 
" - f"Допустимые значения: {ORGANIZATION_DATA_SOURCE_KEYS}. " - "Можно передать несколько параметров или CSV-строку. " - "На list endpoint блок data по умолчанию пустой; передайте этот " - "параметр, чтобы вернуть данные источников." - ), - ), - _query_parameter( - "data_sources", - description=( - "Alias параметра data. Оставлен для явного указания набора источников." - ), - ), - _query_parameter( - "exclude_data", - description=( - "Исключить один или несколько источников из блока data. " - f"Допустимые значения: {ORGANIZATION_DATA_SOURCE_KEYS}." - ), - ), - _query_parameter( - "exclude_data_sources", - description=( - "Alias параметра exclude_data. Можно передать несколько значений " - "или CSV-строку." - ), - ), -] +def _is_truthy_query_value(value: str) -> bool: + return value.strip().lower() not in FALSE_QUERY_VALUES + + +SOURCE_GROUP_VALUES = [choice.value for choice in SourceGroup] + ORGANIZATION_LIST_PARAMS = [ _query_parameter( "page", @@ -104,12 +85,12 @@ ORGANIZATION_LIST_PARAMS = [ ), _query_parameter( "search", - description="Полнотекстовый поиск по наименованию, ИНН, КПП, ОГРН и ОГРИП.", + description="Поиск по наименованию, ИНН, КПП, ОГРН, ОГРИП и основному идентификатору.", ), _query_parameter( "ordering", description=( - "Сортировка по uid, name, inn, kpp, ogrn или ogrip. " + "Сортировка по uid, name, inn, kpp, ogrn, ogrip или identity_status. " "Префикс '-' включает обратный порядок." ), ), @@ -118,6 +99,11 @@ ORGANIZATION_LIST_PARAMS = [ _query_parameter("kpp", description="Точный фильтр по КПП."), _query_parameter("ogrn", description="Точный фильтр по ОГРН."), _query_parameter("ogrip", description="Точный фильтр по ОГРИП."), + _query_parameter( + "identity_status", + description="Фильтр полноты реквизитов организации.", + enum=[choice.value for choice in Organization.IdentityStatus], + ), _query_parameter( "registry", description="UUID реестра. 
Возвращает организации из активного участия.", @@ -136,15 +122,19 @@ ORGANIZATION_LIST_PARAMS = [ param_type=openapi.TYPE_BOOLEAN, default=True, ), + _query_parameter( + "source_group", + description="Фильтр по группе источников организации.", + enum=SOURCE_GROUP_VALUES, + ), *[ _query_parameter( - f"has_{source}", - description=f"Фильтр наличия данных источника {source}.", + f"has_{source_group}", + description=f"Фильтр наличия группы источников {source_group}.", param_type=openapi.TYPE_BOOLEAN, ) - for source in sorted(API_DATA_SOURCE_KEY_SET) + for source_group in SOURCE_GROUP_VALUES ], - *ORGANIZATION_DATA_PARAMS, ] ORGANIZATION_DETAIL_PARAMS = [ openapi.Parameter( @@ -155,89 +145,54 @@ ORGANIZATION_DETAIL_PARAMS = [ required=True, description="UID организации.", ), - *ORGANIZATION_DATA_PARAMS, ] -ORGANIZATION_SCHEMA = openapi.Schema( - type=openapi.TYPE_OBJECT, - required=["uid", "name", "inn", "data", "data_sources", "registries"], - properties={ - "uid": openapi.Schema(type=openapi.TYPE_STRING, format=openapi.FORMAT_UUID), - "name": openapi.Schema(type=openapi.TYPE_STRING), - "normalized_name": openapi.Schema(type=openapi.TYPE_STRING), - "inn": openapi.Schema(type=openapi.TYPE_STRING), - "kpp": openapi.Schema(type=openapi.TYPE_STRING), - "ogrn": openapi.Schema(type=openapi.TYPE_STRING), - "ogrip": openapi.Schema(type=openapi.TYPE_STRING), - "data": openapi.Schema( - type=openapi.TYPE_OBJECT, - description=( - "Данные по источникам. Ключи управляются параметрами data/" - "exclude_data." 
- ), - additional_properties=openapi.Schema( - type=openapi.TYPE_ARRAY, - items=openapi.Schema(type=openapi.TYPE_OBJECT), - ), - ), - "data_sources": openapi.Schema( - type=openapi.TYPE_ARRAY, - items=openapi.Schema( - type=openapi.TYPE_OBJECT, - properties={ - "source": openapi.Schema(type=openapi.TYPE_STRING), - "count": openapi.Schema(type=openapi.TYPE_INTEGER), - }, - ), - ), - "registries": openapi.Schema( - type=openapi.TYPE_ARRAY, - items=openapi.Schema( - type=openapi.TYPE_OBJECT, - properties={ - "id": openapi.Schema(type=openapi.TYPE_STRING), - "name": openapi.Schema(type=openapi.TYPE_STRING), - }, - ), - ), - }, -) -ORGANIZATION_LIST_RESPONSE = openapi.Response( - description="Пагинированный список организаций v2.", - schema=openapi.Schema( - type=openapi.TYPE_OBJECT, - properties={ - "success": openapi.Schema(type=openapi.TYPE_BOOLEAN), - "data": openapi.Schema( - type=openapi.TYPE_ARRAY, - items=ORGANIZATION_SCHEMA, - ), - "errors": openapi.Schema( - type=openapi.TYPE_ARRAY, - items=openapi.Schema(type=openapi.TYPE_OBJECT), - description="Список ошибок; null при успешном ответе.", - ), - "meta": openapi.Schema( - type=openapi.TYPE_OBJECT, - properties={ - "pagination": openapi.Schema( - type=openapi.TYPE_OBJECT, - properties={ - "page": openapi.Schema(type=openapi.TYPE_INTEGER), - "page_size": openapi.Schema(type=openapi.TYPE_INTEGER), - "total_count": openapi.Schema(type=openapi.TYPE_INTEGER), - "total_pages": openapi.Schema(type=openapi.TYPE_INTEGER), - "has_next": openapi.Schema(type=openapi.TYPE_BOOLEAN), - "has_previous": openapi.Schema(type=openapi.TYPE_BOOLEAN), - }, - ), - }, - ), - }, +SOURCE_EXTENSION_PATH_PARAMS = [ + openapi.Parameter( + name="uid", + in_=openapi.IN_PATH, + type=openapi.TYPE_STRING, + format=openapi.FORMAT_UUID, + required=True, + description="UID расширения источника.", ), +] +SOURCE_RECORD_LIST_PARAMS = [ + _query_parameter( + "source_group", + description="Фильтр по группе источников.", + enum=SOURCE_GROUP_VALUES, + ), + 
_query_parameter("source", description="Фильтр по legacy source внутри группы."), + _query_parameter("record_type", description="Фильтр по типу записи."), + _query_parameter( + "has_registry", + description="Фильтр наличия активного участия организации записи в любом реестре.", + param_type=openapi.TYPE_BOOLEAN, + ), + _query_parameter( + "organization", + description="UID организации.", + format_=openapi.FORMAT_UUID, + ), + _query_parameter( + "search", + description=( + "Поиск по организации, реквизитам, заголовку, внешнему ID, " + "статусу, датам, URL и исходным данным записи." + ), + ), + _query_parameter("page", description="Номер страницы.", param_type=openapi.TYPE_INTEGER), + _query_parameter( + "page_size", + description="Размер страницы. Максимум 100.", + param_type=openapi.TYPE_INTEGER, + ), +] +ORGANIZATION_LIST_RESPONSE = openapi.Response( + description="Пагинированный список организаций v2 с компактными источниками.", ) ORGANIZATION_DETAIL_RESPONSE = openapi.Response( description="Карточка организации v2.", - schema=ORGANIZATION_SCHEMA, ) @@ -283,7 +238,7 @@ class CachedReadOnlyMixin: class OrganizationViewSet(CachedReadOnlyMixin, ReadOnlyModelViewSet): - """Read-only API for canonical organizations.""" + """Read-only API for canonical organizations and source summaries.""" queryset = Organization.objects.order_by("name", "uid") serializer_class = OrganizationSerializer @@ -295,8 +250,23 @@ class OrganizationViewSet(CachedReadOnlyMixin, ReadOnlyModelViewSet): OrderingFilter, ] filterset_class = OrganizationFilter - search_fields = ["name", "inn", "kpp", "ogrn", "ogrip"] - ordering_fields = ["name", "inn", "kpp", "ogrn", "ogrip", "uid"] + search_fields = [ + "name", + "inn", + "kpp", + "ogrn", + "ogrip", + "primary_identity", + ] + ordering_fields = [ + "name", + "inn", + "kpp", + "ogrn", + "ogrip", + "identity_status", + "uid", + ] ordering = ["name", "uid"] def get_permissions(self): @@ -305,10 +275,7 @@ class OrganizationViewSet(CachedReadOnlyMixin, 
ReadOnlyModelViewSet): return super().get_permissions() def get_queryset(self): - queryset = super().get_queryset().select_related("data_snapshot") - if self._should_defer_snapshot_data(): - queryset = queryset.defer("data_snapshot__data") - + queryset = super().get_queryset().prefetch_related("source_extensions") if self.action != "list" or "has_registry" in self.request.query_params: return queryset @@ -321,20 +288,6 @@ class OrganizationViewSet(CachedReadOnlyMixin, ReadOnlyModelViewSet): return filterset.qs return queryset - def _should_defer_snapshot_data(self) -> bool: - if getattr(self, "action", None) != "list": - return False - - return not any( - name in self.request.query_params - for name in ( - "data", - "data_sources", - "exclude_data", - "exclude_data_sources", - ) - ) - @swagger_auto_schema( tags=[ORGANIZATIONS_TAG], operation_id="v2_organizations_list", @@ -343,10 +296,8 @@ class OrganizationViewSet(CachedReadOnlyMixin, ReadOnlyModelViewSet): "Возвращает канонический справочник организаций API v2. " "По умолчанию показывает только организации с активным участием " "в реестрах; передайте has_registry=false, чтобы снять это ограничение. " - "Поддерживает пагинацию, поиск по наименованию и реквизитам, фильтры " - "по реестрам и наличию данных по источникам. Для list endpoint " - "тяжелый блок data по умолчанию пустой; передайте data/data_sources, " - "чтобы вернуть данные конкретных источников." + "Данные источников возвращаются компактным списком sources; детальные " + "записи доступны через endpoints расширений источников." 
), manual_parameters=ORGANIZATION_LIST_PARAMS, responses={200: ORGANIZATION_LIST_RESPONSE}, @@ -354,7 +305,7 @@ class OrganizationViewSet(CachedReadOnlyMixin, ReadOnlyModelViewSet): def list(self, request, *args: Any, **kwargs: Any) -> Response: return self._cached_response( request, - lambda: self._list_with_enrichment(request, *args, **kwargs), + lambda: super(OrganizationViewSet, self).list(request, *args, **kwargs), ) @swagger_auto_schema( @@ -362,9 +313,8 @@ class OrganizationViewSet(CachedReadOnlyMixin, ReadOnlyModelViewSet): operation_id="v2_organizations_retrieve", operation_summary="Карточка организации", operation_description=( - "Возвращает одну организацию по UID с реестрами и данными источников. " - "Параметры data/data_sources и exclude_data/exclude_data_sources " - "позволяют запросить только нужные блоки данных." + "Возвращает одну организацию по UID с активными реестрами и компактными " + "группами источников." ), manual_parameters=ORGANIZATION_DETAIL_PARAMS, responses={200: ORGANIZATION_DETAIL_RESPONSE, 404: "Организация не найдена"}, @@ -372,123 +322,237 @@ class OrganizationViewSet(CachedReadOnlyMixin, ReadOnlyModelViewSet): def retrieve(self, request, *args: Any, **kwargs: Any) -> Response: return self._cached_response( request, - lambda: self._retrieve_with_enrichment(request, *args, **kwargs), + lambda: super(OrganizationViewSet, self).retrieve(request, *args, **kwargs), ) - def _list_with_enrichment(self, request, *args: Any, **kwargs: Any) -> Response: - queryset = self.filter_queryset(self.get_queryset()) - data_sources = self._parse_data_sources(request, default=set()) + @swagger_auto_schema( + tags=[ORGANIZATIONS_TAG], + operation_id="v2_organizations_sources", + operation_summary="Источники организации", + operation_description="Возвращает source extensions одной организации.", + responses={200: "Список source extensions", 404: "Организация не найдена"}, + ) + @action(detail=True, methods=["get"]) + def sources(self, request, *args: 
Any, **kwargs: Any) -> Response: + organization = self.get_object() + serializer = OrganizationSourceExtensionSerializer( + organization.source_extensions.all(), + many=True, + ) + return Response(serializer.data) + +class OrganizationSourceExtensionViewSet(ReadOnlyModelViewSet): + """Read-only API for source extensions and their records.""" + + queryset = OrganizationSourceExtension.objects.select_related("organization").order_by( + "organization__name", + "source_group", + ) + serializer_class = OrganizationSourceExtensionSerializer + permission_classes = [IsAuthenticated] + lookup_field = "uid" + filter_backends = [OrderingFilter] + ordering_fields = ["source_group", "records_count", "last_seen_at", "uid"] + ordering = ["source_group", "uid"] + + def get_permissions(self): + if getattr(settings, "ORGANIZATIONS_V2_ALLOW_ANONYMOUS", False): + return [AllowAny()] + return super().get_permissions() + + @swagger_auto_schema( + tags=[ORGANIZATIONS_TAG], + operation_id="v2_organization_sources_records", + operation_summary="Записи источника организации", + operation_description="Возвращает записи под конкретным source extension.", + manual_parameters=SOURCE_EXTENSION_PATH_PARAMS, + responses={200: "Пагинированный список записей источника", 404: "Источник не найден"}, + ) + @action(detail=True, methods=["get"]) + def records(self, request, *args: Any, **kwargs: Any) -> Response: + extension = self.get_object() + queryset = extension.records.prefetch_related("financial_lines").order_by( + "-created_at", + "-uid", + ) page = self.paginate_queryset(queryset) if page is not None: - organizations = list(page) - enrichment = self._build_missing_snapshot_enrichment( - organizations, - data_sources, - ) - serializer = self.get_serializer( - organizations, - many=True, - context={ - **self.get_serializer_context(), - "data_sources": data_sources, - "enrichment": enrichment, - }, - ) + serializer = OrganizationSourceRecordSerializer(page, many=True) return 
self.get_paginated_response(serializer.data) - organizations = list(queryset) - enrichment = self._build_missing_snapshot_enrichment( - organizations, - data_sources, - ) - serializer = self.get_serializer( - organizations, - many=True, - context={ - **self.get_serializer_context(), - "data_sources": data_sources, - "enrichment": enrichment, - }, - ) + serializer = OrganizationSourceRecordSerializer(queryset, many=True) return Response(serializer.data) - def _retrieve_with_enrichment( - self, - request, - *args: Any, - **kwargs: Any, - ) -> Response: - organization = self.get_object() - data_sources = self._parse_data_sources(request) - enrichment = self._build_missing_snapshot_enrichment( - [organization], - data_sources, - ) - serializer = self.get_serializer( - organization, - context={ - **self.get_serializer_context(), - "data_sources": data_sources, - "enrichment": enrichment, - }, - ) - return Response(serializer.data) + +class OrganizationSourceRecordViewSet(ReadOnlyModelViewSet): + """Read-only flat API for source records across source extensions.""" + + queryset = OrganizationSourceRecord.objects.select_related( + "extension", + "extension__organization", + ).prefetch_related("financial_lines").order_by("-created_at", "-uid") + serializer_class = OrganizationSourceRecordSerializer + permission_classes = [IsAuthenticated] + lookup_field = "uid" + filter_backends = [OrderingFilter] + search_fields = [ + "title", + "external_id", + "record_type", + "source", + "record_date", + "status", + "url", + "legacy_model", + "legacy_pk", + "source_record_amount_text", + "source_record_load_batch_text", + "source_record_payload_text", + "extension__title", + "extension__source_group", + "extension__organization__name", + "extension__organization__inn", + "extension__organization__kpp", + "extension__organization__ogrn", + "extension__organization__ogrip", + ] + ordering_fields = [ + "created_at", + "updated_at", + "record_date", + "title", + "uid", + 
"extension__organization__name", + "extension__organization__inn", + "extension__organization__ogrn", + ] + ordering = ["-created_at", "-uid"] + + def get_permissions(self): + if getattr(settings, "ORGANIZATIONS_V2_ALLOW_ANONYMOUS", False): + return [AllowAny()] + return super().get_permissions() + + def get_queryset(self): + queryset = super().get_queryset() + params = self.request.query_params + source_group = params.get("source_group") + source = params.get("source") + record_type = params.get("record_type") + organization = params.get("organization") + has_registry = params.get("has_registry") + search_terms = SearchFilter().get_search_terms(self.request) + + if source_group: + queryset = queryset.filter(extension__source_group=source_group) + if source: + queryset = queryset.filter(source=source) + if record_type: + queryset = queryset.filter(record_type=record_type) + if organization: + queryset = queryset.filter(extension__organization_id=organization) + if has_registry is not None: + registry_query = self._registry_membership_query() + if _is_truthy_query_value(has_registry): + queryset = queryset.filter(registry_query) + else: + queryset = queryset.exclude(registry_query) + if search_terms: + queryset = self._filter_search_queryset(queryset, search_terms) + + return queryset @staticmethod - def _build_missing_snapshot_enrichment( - organizations: list[Organization], - data_sources: set[str] | None, - ) -> dict: - missing = [ - organization - for organization in organizations - if not hasattr(organization, "data_snapshot") - ] - if not missing: - return {} - return OrganizationApiEnrichmentService.build_for( - missing, - data_sources=data_sources, + def _registry_membership_query(): + inn_values, ogrn_values = OrganizationFilter._registry_identity_value_querysets() + + return ( + Q(extension__organization__inn__in=inn_values) + | Q(extension__organization__ogrn__in=ogrn_values) + | Q(extension__organization__ogrip__in=ogrn_values) ) - @staticmethod - def 
_parse_data_sources( - request, - *, - default: set[str] | None = None, - ) -> set[str] | None: - included = _query_param_values(request, "data", "data_sources") - excluded = _query_param_values(request, "exclude_data", "exclude_data_sources") + @classmethod + def _filter_search_queryset(cls, queryset, search_terms: list[str]): + queryset = queryset.annotate( + source_record_amount_text=Cast("amount", output_field=CharField()), + source_record_load_batch_text=Cast( + "load_batch", + output_field=CharField(), + ), + source_record_payload_text=Cast("payload", output_field=CharField()), + ) - unknown = (included | excluded) - API_DATA_SOURCE_KEY_SET - if unknown: - raise ValidationError( - { - "data": ( - "Unknown data source(s): " - + ", ".join(sorted(unknown)) - + ". Available sources: " - + ", ".join(sorted(API_DATA_SOURCE_KEY_SET)) + for search_term in search_terms: + queryset = queryset.filter(cls._source_record_search_query(search_term)) + + return queryset + + @classmethod + def _source_record_search_query(cls, search_term: str) -> Q: + query = Q() + for field_name in cls.search_fields: + query |= Q(**{f"{field_name}__icontains": search_term}) + if field_name == "source_record_payload_text": + escaped_search_term = cls._json_escaped_search_term(search_term) + if escaped_search_term != search_term: + query |= Q( + **{f"{field_name}__icontains": escaped_search_term}, ) - } + return query | cls._registry_search_query(search_term) + + @staticmethod + def _json_escaped_search_term(search_term: str) -> str: + return json.dumps(search_term, ensure_ascii=True)[1:-1] + + @staticmethod + def _registry_search_query(search_term: str) -> Q: + registry_membership = ( + RegistryMembershipPeriod.objects.filter( + ended_at__isnull=True, ) - - if included: - return { - to_api_data_source(to_internal_data_source(source)) - for source in included - excluded - } - if excluded: - return API_DATA_SOURCE_KEY_SET - excluded - return default - - -def _query_param_values(request, *names: 
str) -> set[str]: - values: set[str] = set() - for name in names: - for raw_value in request.query_params.getlist(name): - values.update( - value.strip() for value in raw_value.split(",") if value.strip() + .order_by() + .annotate( + registry_inn_text=Cast( + "organization__mn_inn", + output_field=CharField(), + ), + registry_kpp_text=Cast( + "organization__in_kpp", + output_field=CharField(), + ), + registry_ogrn_text=Cast( + "organization__mn_ogrn", + output_field=CharField(), + ), ) - return values + .filter( + Q(organization__pn_name__icontains=search_term) + | Q(registry_inn_text__icontains=search_term) + | Q(registry_kpp_text__icontains=search_term) + | Q(registry_ogrn_text__icontains=search_term), + ) + ) + inn_values = registry_membership.values_list("registry_inn_text", flat=True) + ogrn_values = registry_membership.values_list("registry_ogrn_text", flat=True) + + return ( + Q(extension__organization__inn__in=inn_values) + | Q(extension__organization__ogrn__in=ogrn_values) + | Q(extension__organization__ogrip__in=ogrn_values) + ) + + @swagger_auto_schema( + tags=[ORGANIZATIONS_TAG], + operation_id="v2_organization_source_records_list", + operation_summary="Записи источников организаций", + operation_description=( + "Возвращает плоский пагинированный список записей источников с " + "данными организации и финансовыми строками при наличии." 
+ ), + manual_parameters=SOURCE_RECORD_LIST_PARAMS, + responses={200: "Пагинированный список записей источников"}, + ) + def list(self, request, *args: Any, **kwargs: Any) -> Response: + return super().list(request, *args, **kwargs) diff --git a/src/settings/base.py b/src/settings/base.py index ebd615f..f084185 100644 --- a/src/settings/base.py +++ b/src/settings/base.py @@ -44,6 +44,7 @@ INSTALLED_APPS = [ "django_celery_beat", "django_celery_results", "drf_yasg", + "polymorphic", # Local apps "apps.core", "organizations", diff --git a/tests/apps/organizations/test_api_v2.py b/tests/apps/organizations/test_api_v2.py index 88c007f..af6642c 100644 --- a/tests/apps/organizations/test_api_v2.py +++ b/tests/apps/organizations/test_api_v2.py @@ -1,11 +1,6 @@ """Tests for organizations API v2.""" -from apps.parsers.models import ( - FinancialReport, - FinancialReportLine, - GenericParserRecord, - ParserLoadLog, -) +from apps.parsers.models import ParserLoadLog from django.core.cache import cache from django.db import connection from django.test import override_settings @@ -13,18 +8,20 @@ from django.test.utils import CaptureQueriesContext from django.urls import reverse from organizations.cache import invalidate_organization_api_cache from organizations.filters import OrganizationFilter -from organizations.models import Organization, OrganizationDataSnapshot -from organizations.services import OrganizationDataSnapshotRefreshService +from organizations.models import ( + FinancialIndicatorsExtension, + GovernmentProcurementExtension, + IndustrialProductionExtension, + Organization, + OrganizationSourceFinancialLine, + OrganizationSourceRecord, + PlannedInspectionExtension, + SecurityRegistryExtension, + VacancyExtension, +) from rest_framework import status from rest_framework.test import APITestCase -from tests.apps.parsers.factories import ( - IndustrialCertificateRecordFactory, - IndustrialProductRecordFactory, - InspectionRecordFactory, - ManufacturerRecordFactory, - 
ProcurementRecordFactory, -) from tests.apps.registers.factories import ( OrganizationFactory as RegistryOrganizationFactory, ) @@ -101,25 +98,26 @@ class OrganizationsApiV2Test(APITestCase): "ordering", "registry", "has_registry", - "has_fns_reports", - "data", - "exclude_data", + "identity_status", + "source_group", + "has_financial_indicators", + "has_planned_inspections", ): self.assertIn(expected_name, list_parameters) self.assertIn( "по умолчанию true", list_parameters["has_registry"]["description"] ) self.assertIn( - "industrial_products", - list_parameters["data"]["description"], + "группе источников", + list_parameters["source_group"]["description"], ) detail_parameters = { parameter["name"]: parameter for parameter in detail_operation["parameters"] } self.assertEqual(detail_parameters["uid"]["type"], "string") self.assertEqual(detail_parameters["uid"]["format"], "uuid") - self.assertIn("data", detail_parameters) - self.assertIn("exclude_data", detail_parameters) + self.assertNotIn("data", detail_parameters) + self.assertNotIn("exclude_data", detail_parameters) self.assertIn( "Пагинированный", list_operation["responses"]["200"]["description"] ) @@ -127,6 +125,8 @@ class OrganizationsApiV2Test(APITestCase): "Карточка организации", detail_operation["responses"]["200"]["description"], ) + self.assertIn("/api/v2/organizations/{uid}/sources/", paths) + self.assertIn("/api/v2/organization-sources/{uid}/records/", paths) def test_retrieve_returns_item_by_uid(self): organization = Organization.objects.create( @@ -164,20 +164,25 @@ class OrganizationsApiV2Test(APITestCase): self.assertEqual(response.data["name"], 'АКЦИОНЕРНОЕ ОБЩЕСТВО "СЕВЕРНЫЙ МОСТ"') self.assertEqual(response.data["normalized_name"], 'АО "Северный Мост"') - def test_list_omits_full_snapshot_data_by_default_but_keeps_summary(self): + def test_list_returns_compact_source_summaries_without_records(self): organization = Organization.objects.create( name='ООО "Легкий список"', inn="7712345682", 
kpp="771201005", ogrn="1027700132200", ) - OrganizationDataSnapshot.objects.create( + extension = IndustrialProductionExtension.objects.create( organization=organization, - data={ - "industrial": [{"id": 1}], - "fns_reports": [{"id": 2}, {"id": 3}], - }, - registries=[], + title="Производители и продукция России", + records_count=2, + metadata={"sources": ["industrial", "industrial_products"]}, + ) + OrganizationSourceRecord.objects.create( + extension=extension, + record_type="industrial_certificate", + source="industrial", + external_id="cert-list-summary", + payload={"certificate_number": "CERT-LIST"}, ) response = self.client.get( @@ -187,30 +192,35 @@ class OrganizationsApiV2Test(APITestCase): self.assertEqual(response.status_code, status.HTTP_200_OK) item = response.data["data"][0] - self.assertEqual(item["data"], {}) + self.assertNotIn("data", item) + self.assertNotIn("data_sources", item) + self.assertEqual(item["sources"][0]["source_group"], "industrial_production") + self.assertEqual(item["sources"][0]["records_count"], 2) self.assertEqual( - item["data_sources"], - [ - {"source": "fns_reports", "count": 2}, - {"source": "industrial", "count": 1}, - ], + item["sources"][0]["metadata"], + {"sources": ["industrial", "industrial_products"]}, ) - def test_list_default_uses_snapshot_summary_without_loading_full_data(self): + def test_list_source_summaries_do_not_load_source_records(self): organization = Organization.objects.create( - name='ООО "Легкий снапшот"', + name='ООО "Легкие источники"', inn="7712345685", kpp="771201008", ogrn="1027700132203", ) - OrganizationDataSnapshot.objects.create( + extension = FinancialIndicatorsExtension.objects.create( organization=organization, - data={ - "industrial": [{"id": index} for index in range(100)], - "fns_reports": [{"id": index} for index in range(50)], - }, - registries=[], + title="Финансово-экономические показатели", + records_count=100, ) + for index in range(100): + OrganizationSourceRecord.objects.create( + 
extension=extension, + record_type="financial_report", + source="fns_reports", + external_id=f"summary-no-record-{index}", + title=f"Report {index}", + ) with CaptureQueriesContext(connection) as captured: response = self.client.get( @@ -220,64 +230,58 @@ class OrganizationsApiV2Test(APITestCase): self.assertEqual(response.status_code, status.HTTP_200_OK) item = response.data["data"][0] - self.assertEqual(item["data"], {}) - self.assertEqual( - item["data_sources"], - [ - {"source": "fns_reports", "count": 50}, - {"source": "industrial", "count": 100}, - ], - ) - full_snapshot_data_queries = [ + self.assertEqual(item["sources"][0]["records_count"], 100) + source_record_queries = [ query["sql"] for query in captured - if "ORGANIZATIONS_DATA_SNAPSHOT" in query["sql"].upper() - and '"data"' in query["sql"] + if "ORGANIZATIONS_SOURCE_RECORD" in query["sql"].upper() ] - self.assertEqual(full_snapshot_data_queries, []) + self.assertEqual(source_record_queries, []) - def test_list_returns_snapshot_data_when_sources_are_requested(self): + def test_source_records_endpoint_returns_requested_source_payload(self): organization = Organization.objects.create( name='ООО "Явные данные"', inn="7712345683", kpp="771201006", ogrn="1027700132201", ) - OrganizationDataSnapshot.objects.create( + extension = IndustrialProductionExtension.objects.create( organization=organization, - data={ - "industrial": [{"id": 1}], - "fns_reports": [{"id": 2}], - }, - registries=[], + title="Производители и продукция России", + records_count=1, + ) + OrganizationSourceRecord.objects.create( + extension=extension, + record_type="industrial_certificate", + source="industrial", + external_id="cert-source-records", + payload={"certificate_number": "CERT-SOURCE-RECORDS"}, ) response = self.client.get( - reverse("api_v2:organizations:organizations-list"), - { - "inn": organization.inn, - "has_registry": "false", - "data": "industrial", - }, + reverse( + "api_v2:organizations:organization-sources-records", + 
args=[extension.uid], + ) ) self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEqual( - response.data["data"][0]["data"], - {"industrial": [{"id": 1}]}, + response.data["data"][0]["payload"], + {"certificate_number": "CERT-SOURCE-RECORDS"}, ) - def test_detail_keeps_full_snapshot_data_by_default(self): + def test_detail_returns_compact_source_summaries_by_default(self): organization = Organization.objects.create( name='ООО "Полная карточка"', inn="7712345684", kpp="771201007", ogrn="1027700132202", ) - OrganizationDataSnapshot.objects.create( + FinancialIndicatorsExtension.objects.create( organization=organization, - data={"industrial": [{"id": 1}]}, - registries=[], + title="Финансово-экономические показатели", + records_count=1, ) response = self.client.get( @@ -287,44 +291,44 @@ class OrganizationsApiV2Test(APITestCase): ) self.assertEqual(response.status_code, status.HTTP_200_OK) - self.assertEqual(response.data["data"], {"industrial": [{"id": 1}]}) + self.assertNotIn("data", response.data) + self.assertEqual(response.data["sources"][0]["source_group"], "financial_indicators") - def test_list_keeps_data_source_summary_when_data_payload_is_excluded(self): + def test_source_group_filter_limits_organizations_by_source_extension(self): organization = Organization.objects.create( name='ООО "Сводка данных"', inn="7712345681", kpp="771201004", ogrn="1027700132199", ) - IndustrialCertificateRecordFactory( - inn=organization.inn, - ogrn=organization.ogrn, + other = Organization.objects.create( + name='ООО "Без промышленности"', + inn="7712345686", + kpp="771201009", + ogrn="1027700132204", ) - OrganizationDataSnapshotRefreshService.refresh( - organization_uids=[str(organization.uid)] + IndustrialProductionExtension.objects.create( + organization=organization, + title="Производители и продукция России", + records_count=1, + ) + FinancialIndicatorsExtension.objects.create( + organization=other, + title="Финансово-экономические показатели", + 
records_count=1, ) response = self.client.get( reverse("api_v2:organizations:organizations-list"), { - "inn": organization.inn, + "source_group": "industrial_production", "has_registry": "false", - "exclude_data": ( - "industrial,industrial_products,manufactures,inspections," - "procurements,procurements_44fz,procurements_223fz,contracts," - "unfair_suppliers,fas_goz,arbitration,fedresurs_bankruptcy," - "fstec,vacancies,fns_reports" - ), }, ) self.assertEqual(response.status_code, status.HTTP_200_OK) - item = response.data["data"][0] - self.assertEqual(item["data"], {}) - self.assertEqual( - item["data_sources"], - [{"source": "industrial", "count": 1}], - ) + self.assertEqual(response.data["meta"]["pagination"]["total_count"], 1) + self.assertEqual(response.data["data"][0]["uid"], str(organization.uid)) def test_normalized_name_compacts_scientific_production_forms(self): organization = Organization.objects.create( @@ -520,7 +524,7 @@ class OrganizationsApiV2Test(APITestCase): self.assertEqual(second_response["X-Cache"], "HIT") self.assertEqual(second_response.data["name"], 'ООО "Деталь"') - def test_list_includes_parser_presence_and_active_registries(self): + def test_list_includes_source_groups_and_active_registries(self): organization = Organization.objects.create( name='ООО "Данные"', inn="7777777777", @@ -536,61 +540,75 @@ class OrganizationsApiV2Test(APITestCase): registry=registry, organization=registry_organization, ) - certificate = IndustrialCertificateRecordFactory( - inn=organization.inn, - ogrn=organization.ogrn, - certificate_number="CERT-ORG-V2-1", + industrial = IndustrialProductionExtension.objects.create( + organization=organization, + title="Производители и продукция России", + records_count=3, ) - product = IndustrialProductRecordFactory( - inn=organization.inn, - ogrn=organization.ogrn, - registry_number="PRODUCT-ORG-V2-1", + planned = PlannedInspectionExtension.objects.create( + organization=organization, + title="Плановые проверки 
Генпрокуратуры России", + records_count=1, ) - manufacturer = ManufacturerRecordFactory( - inn=organization.inn, - ogrn=organization.ogrn, - full_legal_name='ООО "Данные"', + procurements = GovernmentProcurementExtension.objects.create( + organization=organization, + title="Государственные закупки по 44-ФЗ и 223-ФЗ", + records_count=2, ) - inspection = InspectionRecordFactory( - inn=organization.inn, - ogrn=organization.ogrn, - registration_number="INSPECTION-ORG-V2-1", + security = SecurityRegistryExtension.objects.create( + organization=organization, + title="Реестры по информационной безопасности", + records_count=1, ) - procurement = ProcurementRecordFactory( - customer_inn=organization.inn, - customer_ogrn=organization.ogrn, - purchase_number="PROCUREMENT-ORG-V2-1", + financial = FinancialIndicatorsExtension.objects.create( + organization=organization, + title="Финансово-экономические показатели", + records_count=1, ) - generic_record = GenericParserRecord.objects.create( - load_batch=1, - source=ParserLoadLog.Source.FSTEC, + OrganizationSourceRecord.objects.create( + extension=industrial, + record_type="industrial_certificate", + source="industrial", + external_id="CERT-ORG-V2-1", + payload={"certificate_number": "CERT-ORG-V2-1"}, + ) + OrganizationSourceRecord.objects.create( + extension=planned, + record_type="inspection", + source="inspections", + external_id="INSPECTION-ORG-V2-1", + payload={"registration_number": "INSPECTION-ORG-V2-1"}, + ) + OrganizationSourceRecord.objects.create( + extension=procurements, + record_type="procurement", + source="procurements", + external_id="PROCUREMENT-ORG-V2-1", + payload={"purchase_number": "PROCUREMENT-ORG-V2-1"}, + ) + OrganizationSourceRecord.objects.create( + extension=procurements, + record_type="procurement", + source="procurements_44fz", + external_id="procurements-44fz-1", + payload={"external_id": "procurements-44fz-1"}, + ) + OrganizationSourceRecord.objects.create( + extension=security, + 
record_type="security_registry", + source="fstec", external_id="fstec-1", - inn=organization.inn, - ogrn=organization.ogrn, - organisation_name=organization.name, - title="FSTEC record", payload={"source": "fstec"}, ) - GenericParserRecord.objects.create( - load_batch=1, - source=ParserLoadLog.Source.PROCUREMENTS_44FZ, - external_id="procurements-44fz-1", - inn=organization.inn, - ogrn=organization.ogrn, - organisation_name=organization.name, - title="44-FZ record", - ) - financial_report = FinancialReport.objects.create( + report_record = OrganizationSourceRecord.objects.create( + extension=financial, + record_type="financial_report", + source="fns_reports", external_id="fin-1", - ogrn=organization.ogrn, - file_name="fin.xlsx", - file_hash="a" * 64, - load_batch=1, - status=FinancialReport.Status.SUCCESS, - source=FinancialReport.SourceType.API, + payload={"external_id": "fin-1"}, ) - FinancialReportLine.objects.create( - report=financial_report, + OrganizationSourceFinancialLine.objects.create( + source_record=report_record, form_code="1", line_code="1100", line_name="Внеоборотные активы", @@ -598,8 +616,8 @@ class OrganizationsApiV2Test(APITestCase): period_start=100, period_end=200, ) - FinancialReportLine.objects.create( - report=financial_report, + OrganizationSourceFinancialLine.objects.create( + source_record=report_record, form_code="1", line_code="1300", line_name="Капитал и резервы", @@ -607,8 +625,8 @@ class OrganizationsApiV2Test(APITestCase): period_start=300, period_end=400, ) - FinancialReportLine.objects.create( - report=financial_report, + OrganizationSourceFinancialLine.objects.create( + source_record=report_record, form_code="2", line_code="2110", line_name="Выручка", @@ -621,10 +639,7 @@ class OrganizationsApiV2Test(APITestCase): reverse("api_v2:organizations:organizations-list"), { "inn": organization.inn, - "data": ( - "industrial,industrial_products,manufactures,inspections," - "procurements,procurements_44fz,fstec,fns_reports" - ), + 
"has_registry": "true", }, ) @@ -635,105 +650,30 @@ class OrganizationsApiV2Test(APITestCase): [{"id": str(registry.id), "name": "Росатом ГОЗ"}], ) self.assertNotIn("data_presence", item) - self.assertIn("data", item) + self.assertNotIn("data", item) + sources = {source["source_group"]: source for source in item["sources"]} self.assertEqual( - item["data"]["industrial"], - [ - { - "id": certificate.id, - "load_batch": certificate.load_batch, - "issue_date": certificate.issue_date, - "issue_date_normalized": None, - "certificate_number": "CERT-ORG-V2-1", - "expiry_date": certificate.expiry_date, - "expiry_date_normalized": None, - "certificate_file_url": certificate.certificate_file_url, - "organisation_name": certificate.organisation_name, - "inn": organization.inn, - "ogrn": organization.ogrn, - "registry_organization": None, - "created_at": certificate.created_at.isoformat().replace( - "+00:00", "Z" - ), - "updated_at": certificate.updated_at.isoformat().replace( - "+00:00", "Z" - ), - } - ], - ) - self.assertEqual( - item["data"]["industrial_products"][0]["registry_number"], - "PRODUCT-ORG-V2-1", - ) - self.assertEqual( - item["data"]["industrial_products"][0]["id"], - product.id, - ) - self.assertEqual( - item["data"]["manufactures"][0]["id"], - manufacturer.id, - ) - self.assertEqual( - item["data"]["inspections"][0]["registration_number"], - "INSPECTION-ORG-V2-1", - ) - self.assertEqual( - item["data"]["inspections"][0]["id"], - inspection.id, - ) - self.assertEqual( - item["data"]["procurements"][0]["purchase_number"], - "PROCUREMENT-ORG-V2-1", - ) - self.assertEqual( - item["data"]["procurements"][0]["id"], - procurement.id, - ) - self.assertEqual( - item["data"]["procurements_44fz"][0]["external_id"], - "procurements-44fz-1", - ) - self.assertEqual( - item["data"]["fstec"][0]["id"], - generic_record.id, - ) - self.assertEqual( - item["data"]["fns_reports"][0]["external_id"], - "fin-1", - ) - self.assertEqual( - item["data"]["fns_reports"][0]["id"], - 
financial_report.id, - ) - self.assertEqual(item["data"]["fns_reports"][0]["lines_count"], 3) - self.assertEqual( - item["data"]["fns_reports"][0]["lines"]["2021"]["active"]["1100"], + set(sources), { - "form_code": "1", - "name": "Внеоборотные активы", - "period_start": 100, - "period_end": 200, + "financial_indicators", + "government_procurements", + "industrial_production", + "planned_inspections", + "security_registries", }, ) - self.assertEqual( - item["data"]["fns_reports"][0]["lines"]["2021"]["passive"]["1300"], - { - "form_code": "1", - "name": "Капитал и резервы", - "period_start": 300, - "period_end": 400, - }, + self.assertEqual(sources["industrial_production"]["records_count"], 3) + self.assertEqual(sources["financial_indicators"]["records_count"], 1) + + records_response = self.client.get( + reverse( + "api_v2:organizations:organization-sources-records", + args=[financial.uid], + ) ) - self.assertEqual( - item["data"]["fns_reports"][0]["lines"]["2022"]["form_2"]["2110"], - { - "form_code": "2", - "name": "Выручка", - "period_start": None, - "period_end": 500, - }, - ) - self.assertTrue(all(isinstance(value, list) for value in item["data"].values())) + self.assertEqual(records_response.status_code, status.HTTP_200_OK) + self.assertEqual(records_response.data["data"][0]["external_id"], "fin-1") + self.assertEqual(len(records_response.data["data"][0]["financial_lines"]), 3) def test_filters_by_registry_and_has_registry(self): with_registry = Organization.objects.create( @@ -858,18 +798,15 @@ class OrganizationsApiV2Test(APITestCase): kpp="780001003", ogrn="1027700133003", ) - IndustrialCertificateRecordFactory( - inn=with_industrial.inn, - ogrn=with_industrial.ogrn, + IndustrialProductionExtension.objects.create( + organization=with_industrial, + title="Производители и продукция России", + records_count=1, ) - FinancialReport.objects.create( - external_id="fin-presence-1", - ogrn=with_fns.ogrn, - file_name="fin_presence.xlsx", - file_hash="b" * 64, - 
load_batch=1, - status=FinancialReport.Status.SUCCESS, - source=FinancialReport.SourceType.API, + FinancialIndicatorsExtension.objects.create( + organization=with_fns, + title="Финансово-экономические показатели", + records_count=1, ) has_industrial = self.client.get( @@ -912,14 +849,10 @@ class OrganizationsApiV2Test(APITestCase): kpp="780001004", ogrn="1027700133004", ) - FinancialReport.objects.create( - external_id="fin-presence-no-preload", - ogrn=organization.ogrn, - file_name="fin_presence_no_preload.xlsx", - file_hash="d" * 64, - load_batch=1, - status=FinancialReport.Status.SUCCESS, - source=FinancialReport.SourceType.API, + FinancialIndicatorsExtension.objects.create( + organization=organization, + title="Финансово-экономические показатели", + records_count=1, ) with CaptureQueriesContext(connection) as captured: @@ -938,218 +871,165 @@ class OrganizationsApiV2Test(APITestCase): ] self.assertEqual(distinct_report_queries, []) - def test_limits_response_data_sources(self): + def test_sources_action_returns_only_current_organization_extensions(self): organization = Organization.objects.create( name='ООО "Источник"', inn="7800000201", kpp="780002101", ogrn="1027700133201", ) - IndustrialCertificateRecordFactory( - inn=organization.inn, - ogrn=organization.ogrn, + other = Organization.objects.create( + name='ООО "Другой источник"', + inn="7800000205", + kpp="780002105", + ogrn="1027700133205", ) - GenericParserRecord.objects.create( - load_batch=1, - source=ParserLoadLog.Source.FSTEC, - external_id="fstec-source-limit", - inn=organization.inn, - ogrn=organization.ogrn, - organisation_name=organization.name, + industrial = IndustrialProductionExtension.objects.create( + organization=organization, + title="Производители и продукция России", + records_count=1, ) - FinancialReport.objects.create( - external_id="fin-source-limit", - ogrn=organization.ogrn, - file_name="fin_source_limit.xlsx", - file_hash="c" * 64, - load_batch=1, - 
status=FinancialReport.Status.SUCCESS, - source=FinancialReport.SourceType.API, + FinancialIndicatorsExtension.objects.create( + organization=organization, + title="Финансово-экономические показатели", + records_count=1, + ) + VacancyExtension.objects.create( + organization=other, + title="Вакансии", + records_count=1, ) - detail_url = reverse( - "api_v2:organizations:organizations-detail", - args=[organization.uid], - ) - only_response = self.client.get( - detail_url, - {"data": "industrial,fns_reports"}, - ) - excluded_response = self.client.get( - detail_url, - {"exclude_data": "fns_reports"}, - ) - repeated_response = self.client.get( - detail_url, - [("data", "industrial"), ("data", "fstec")], - ) - empty_response = self.client.get( - detail_url, - {"data": "industrial", "exclude_data": "industrial"}, + response = self.client.get( + reverse( + "api_v2:organizations:organizations-sources", + args=[organization.uid], + ) ) - self.assertEqual(only_response.status_code, status.HTTP_200_OK) + self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEqual( - set(only_response.data["data"]), - {"industrial", "fns_reports"}, + {source["source_group"] for source in response.data}, + {"financial_indicators", "industrial_production"}, ) - self.assertEqual(len(only_response.data["data"]["industrial"]), 1) - self.assertEqual(len(only_response.data["data"]["fns_reports"]), 1) + sources = {source["source_group"]: source for source in response.data} + self.assertEqual(sources["industrial_production"]["uid"], str(industrial.uid)) - self.assertEqual(excluded_response.status_code, status.HTTP_200_OK) - self.assertNotIn("fns_reports", excluded_response.data["data"]) - self.assertIn("industrial", excluded_response.data["data"]) - - self.assertEqual(repeated_response.status_code, status.HTTP_200_OK) - self.assertEqual( - set(repeated_response.data["data"]), - {"industrial", "fstec"}, - ) - self.assertEqual(empty_response.status_code, status.HTTP_200_OK) - 
self.assertEqual(empty_response.data["data"], {}) - - def test_rejects_unknown_response_data_source(self): - organization = Organization.objects.create( - name='ООО "Неверный источник"', + def test_unknown_source_group_filter_returns_empty_page(self): + Organization.objects.create( + name='ООО "Неверная группа"', inn="7800000202", kpp="780002102", ogrn="1027700133202", ) response = self.client.get( - reverse( - "api_v2:organizations:organizations-detail", args=[organization.uid] - ), - {"data": "unknown"}, + reverse("api_v2:organizations:organizations-list"), + {"source_group": "unknown", "has_registry": "false"}, ) - self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) - self.assertIn("data", response.data) + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.data["meta"]["pagination"]["total_count"], 0) - def test_selected_data_sources_keep_constant_query_count(self): + def test_detail_source_summaries_do_not_load_source_records(self): organization = Organization.objects.create( name='ООО "Без N+1"', inn="7800000203", kpp="780002103", ogrn="1027700133203", ) - IndustrialCertificateRecordFactory( - inn=organization.inn, - ogrn=organization.ogrn, + financial = FinancialIndicatorsExtension.objects.create( + organization=organization, + title="Финансово-экономические показатели", + records_count=4, ) - for index, year in enumerate((2021, 2022, 2023, 2024), start=1): - report = FinancialReport.objects.create( - external_id=f"fin-query-count-{year}", - ogrn=organization.ogrn, - file_name=f"fin_query_count_{year}.xlsx", - file_hash=f"{index}" * 64, - load_batch=1, - status=FinancialReport.Status.SUCCESS, - source=FinancialReport.SourceType.API, + security = SecurityRegistryExtension.objects.create( + organization=organization, + title="Реестры по информационной безопасности", + records_count=1, + ) + vacancies = VacancyExtension.objects.create( + organization=organization, + title="Вакансии", + records_count=1, + ) + 
for index in range(200): + OrganizationSourceRecord.objects.create( + extension=financial, + record_type="financial_report", + source="fns_reports", + external_id=f"financial-query-count-{index}", ) - FinancialReportLine.objects.bulk_create( - [ - FinancialReportLine( - report=report, - form_code="1", - line_code=str(1100 + line_index), - line_name=f"Line {line_index}", - year=year, - period_start=line_index, - period_end=line_index * 2, - ) - for line_index in range(1, 101) - ] - ) - for source in ( - ParserLoadLog.Source.FSTEC, - ParserLoadLog.Source.CONTRACTS, - ParserLoadLog.Source.TRUDVSEM, + for extension, source in ( + (security, "fstec"), + (vacancies, "vacancies"), ): - GenericParserRecord.objects.create( - load_batch=1, + OrganizationSourceRecord.objects.create( + extension=extension, + record_type=source, source=source, external_id=f"{source}-query-count", - inn=organization.inn, - ogrn=organization.ogrn, - organisation_name=organization.name, ) - OrganizationDataSnapshotRefreshService.refresh( - organization_uids=[str(organization.uid)], - ) url = reverse( "api_v2:organizations:organizations-detail", args=[organization.uid] ) with CaptureQueriesContext(connection) as captured: - response = self.client.get( - url, - {"data": "industrial,fns_reports,fstec,contracts,vacancies"}, - ) + response = self.client.get(url) self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertEqual( - set(response.data["data"]), - {"industrial", "fns_reports", "fstec", "contracts", "vacancies"}, + {source["source_group"] for source in response.data["sources"]}, + {"financial_indicators", "security_registries", "vacancies"}, ) - self.assertNotIn("trudvsem", response.data["data"]) - self.assertEqual(len(response.data["data"]["fns_reports"]), 4) - self.assertLessEqual(len(captured), 1) + source_record_queries = [ + query["sql"] + for query in captured + if "ORGANIZATIONS_SOURCE_RECORD" in query["sql"].upper() + ] + self.assertEqual(source_record_queries, []) - def 
test_v2_renames_trudvsem_data_source_to_vacancies(self): + def test_trudvsem_filter_alias_uses_vacancies_source_group(self): organization = Organization.objects.create( name='ООО "Вакансии"', inn="7800000204", kpp="780002104", ogrn="1027700133204", ) - GenericParserRecord.objects.create( - load_batch=1, - source=ParserLoadLog.Source.TRUDVSEM, + extension = VacancyExtension.objects.create( + organization=organization, + title="Вакансии", + records_count=1, + ) + OrganizationSourceRecord.objects.create( + extension=extension, + record_type="vacancy", + source="trudvsem", external_id="vacancy-1", - inn=organization.inn, - ogrn=organization.ogrn, - organisation_name=organization.name, title="Инженер", ) - OrganizationDataSnapshotRefreshService.refresh( - organization_uids=[str(organization.uid)], - ) - detail_url = reverse( - "api_v2:organizations:organizations-detail", - args=[organization.uid], - ) - vacancies_response = self.client.get(detail_url, {"data": "vacancies"}) - internal_source_response = self.client.get(detail_url, {"data": "trudvsem"}) has_vacancies_response = self.client.get( reverse("api_v2:organizations:organizations-list"), { - "has_vacancies": "true", + "has_trudvsem": "true", "has_registry": "false", - "data": "vacancies", }, ) - - self.assertEqual(vacancies_response.status_code, status.HTTP_200_OK) - self.assertEqual(set(vacancies_response.data["data"]), {"vacancies"}) - self.assertEqual( - vacancies_response.data["data"]["vacancies"][0]["external_id"], - "vacancy-1", + records_response = self.client.get( + reverse( + "api_v2:organizations:organization-sources-records", + args=[extension.uid], + ) ) - self.assertNotIn("trudvsem", vacancies_response.data["data"]) - - self.assertEqual( - internal_source_response.status_code, - status.HTTP_400_BAD_REQUEST, - ) - self.assertIn("trudvsem", str(internal_source_response.data)) self.assertEqual(has_vacancies_response.status_code, status.HTTP_200_OK) self.assertEqual( 
has_vacancies_response.data["data"][0]["uid"], str(organization.uid), ) + self.assertEqual(records_response.status_code, status.HTTP_200_OK) + self.assertEqual(records_response.data["data"][0]["source"], "trudvsem") @override_settings(ORGANIZATIONS_V2_ALLOW_ANONYMOUS=True) def test_dev_flag_allows_anonymous_access(self): diff --git a/tests/apps/organizations/test_api_v2_source_extensions.py b/tests/apps/organizations/test_api_v2_source_extensions.py new file mode 100644 index 0000000..f67deef --- /dev/null +++ b/tests/apps/organizations/test_api_v2_source_extensions.py @@ -0,0 +1,389 @@ +"""Tests for organization API v2 backed by source extensions.""" + +from django.core.cache import cache +from django.urls import reverse +from organizations.models import ( + Organization, + OrganizationSourceRecord, + PlannedInspectionExtension, +) +from rest_framework import status +from rest_framework.test import APITestCase + +from tests.apps.registers.factories import ( + OrganizationFactory as RegistryOrganizationFactory, +) +from tests.apps.registers.factories import ( + RegisterFactory, + RegistryMembershipPeriodFactory, +) +from tests.apps.user.factories import UserFactory + + +class OrganizationSourceExtensionsApiV2Test(APITestCase): + """Checks organization-centric source extension API contract.""" + + def setUp(self): + cache.clear() + self.user = UserFactory.create_user() + self.client.force_authenticate(self.user) + + def test_list_returns_compact_source_summaries_instead_of_embedded_data(self): + organization = Organization.objects.create( + name='ООО "API"', + inn="7707083810", + ogrn="1027700132010", + ) + extension = PlannedInspectionExtension.objects.create( + organization=organization, + title="Плановые проверки Генпрокуратуры России", + records_count=1, + ) + OrganizationSourceRecord.objects.create( + extension=extension, + record_type="inspection", + source="inspections", + external_id="INSP-API", + title="Проверка API", + payload={"registration_number": 
"INSP-API"}, + ) + + response = self.client.get( + reverse("api_v2:organizations:organizations-list"), + {"has_registry": "false"}, + ) + + self.assertEqual(response.status_code, status.HTTP_200_OK) + item = response.data["data"][0] + self.assertNotIn("data", item) + self.assertNotIn("data_sources", item) + self.assertEqual(item["identity_status"], Organization.IdentityStatus.COMPLETE) + self.assertEqual(item["sources"][0]["uid"], str(extension.uid)) + self.assertEqual(item["sources"][0]["source_group"], "planned_inspections") + self.assertEqual(item["sources"][0]["records_count"], 1) + + def test_organization_sources_action_returns_extensions(self): + organization = Organization.objects.create( + name='ООО "Sources"', + inn="7707083811", + ogrn="1027700132011", + ) + extension = PlannedInspectionExtension.objects.create( + organization=organization, + title="Плановые проверки Генпрокуратуры России", + ) + + response = self.client.get( + reverse( + "api_v2:organizations:organizations-sources", + args=[organization.uid], + ) + ) + + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.data[0]["uid"], str(extension.uid)) + + def test_source_records_endpoint_returns_extension_records(self): + organization = Organization.objects.create( + name='ООО "Records"', + inn="7707083812", + ogrn="1027700132012", + ) + extension = PlannedInspectionExtension.objects.create( + organization=organization, + title="Плановые проверки Генпрокуратуры России", + ) + OrganizationSourceRecord.objects.create( + extension=extension, + record_type="inspection", + source="inspections", + external_id="INSP-RECORD", + title="Проверка records", + payload={"registration_number": "INSP-RECORD"}, + ) + + response = self.client.get( + reverse( + "api_v2:organizations:organization-sources-records", + args=[extension.uid], + ) + ) + + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.data["data"][0]["external_id"], "INSP-RECORD") + 
self.assertEqual( + response.data["data"][0]["payload"]["registration_number"], + "INSP-RECORD", + ) + + def test_flat_source_records_endpoint_filters_by_source_group(self): + target = Organization.objects.create( + name='ООО "Flat Records"', + inn="7707083813", + ogrn="1027700132013", + ) + other = Organization.objects.create( + name='ООО "Other Records"', + inn="7707083814", + ogrn="1027700132014", + ) + extension = PlannedInspectionExtension.objects.create( + organization=target, + title="Плановые проверки Генпрокуратуры России", + ) + other_extension = PlannedInspectionExtension.objects.create( + organization=other, + title="Плановые проверки Генпрокуратуры России", + ) + OrganizationSourceRecord.objects.create( + extension=extension, + record_type="inspection", + source="inspections", + external_id="INSP-FLAT", + title="Проверка flat", + payload={"registration_number": "INSP-FLAT"}, + ) + OrganizationSourceRecord.objects.create( + extension=other_extension, + record_type="inspection", + source="inspections", + external_id="INSP-OTHER", + title="Другая проверка", + payload={"registration_number": "INSP-OTHER"}, + ) + + response = self.client.get( + reverse("api_v2:organizations:organization-source-records-list"), + { + "source_group": "planned_inspections", + "organization": str(target.uid), + }, + ) + + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.data["meta"]["pagination"]["total_count"], 1) + record = response.data["data"][0] + self.assertEqual(record["external_id"], "INSP-FLAT") + self.assertEqual(record["source_group"], "planned_inspections") + self.assertEqual(record["organization"]["uid"], str(target.uid)) + + def test_flat_source_records_endpoint_filters_by_has_registry(self): + with_registry = Organization.objects.create( + name='ООО "With Registry Source"', + inn="7707083815", + ogrn="1027700132015", + ) + without_registry = Organization.objects.create( + name='ООО "Without Registry Source"', + 
inn="7707083816", + ogrn="1027700132016", + ) + with_registry_extension = PlannedInspectionExtension.objects.create( + organization=with_registry, + title="Плановые проверки Генпрокуратуры России", + ) + without_registry_extension = PlannedInspectionExtension.objects.create( + organization=without_registry, + title="Плановые проверки Генпрокуратуры России", + ) + OrganizationSourceRecord.objects.create( + extension=with_registry_extension, + record_type="inspection", + source="inspections", + external_id="INSP-WITH-REGISTRY", + title="Проверка организации из реестра", + ) + OrganizationSourceRecord.objects.create( + extension=without_registry_extension, + record_type="inspection", + source="inspections", + external_id="INSP-WITHOUT-REGISTRY", + title="Проверка организации без реестра", + ) + registry = RegisterFactory(name="Реестр source records") + registry_organization = RegistryOrganizationFactory( + mn_inn=int(with_registry.inn), + mn_ogrn=int(with_registry.ogrn), + ) + RegistryMembershipPeriodFactory( + registry=registry, + organization=registry_organization, + ) + + only_registry = self.client.get( + reverse("api_v2:organizations:organization-source-records-list"), + { + "has_registry": "true", + "source_group": "planned_inspections", + }, + ) + without_registry_response = self.client.get( + reverse("api_v2:organizations:organization-source-records-list"), + { + "has_registry": "false", + "source_group": "planned_inspections", + "ordering": "extension__organization__inn", + }, + ) + + self.assertEqual(only_registry.status_code, status.HTTP_200_OK) + self.assertEqual(only_registry.data["meta"]["pagination"]["total_count"], 1) + self.assertEqual( + only_registry.data["data"][0]["external_id"], + "INSP-WITH-REGISTRY", + ) + + self.assertEqual(without_registry_response.status_code, status.HTTP_200_OK) + self.assertEqual( + without_registry_response.data["meta"]["pagination"]["total_count"], + 1, + ) + self.assertEqual( + 
without_registry_response.data["data"][0]["external_id"], + "INSP-WITHOUT-REGISTRY", + ) + + def test_source_record_organization_uses_active_registry_identity(self): + organization = Organization.objects.create( + name="1241800009703", + ogrn="1241800009703", + ) + extension = PlannedInspectionExtension.objects.create( + organization=organization, + title="Плановые проверки Генпрокуратуры России", + ) + OrganizationSourceRecord.objects.create( + extension=extension, + record_type="inspection", + source="inspections", + external_id="INSP-REGISTRY-IDENTITY", + title="Проверка организации из реестра", + ) + registry_organization = RegistryOrganizationFactory( + pn_name='ООО "Реестровое имя"', + mn_inn=1800020960, + mn_ogrn=int(organization.ogrn), + in_kpp=180001001, + ) + RegistryMembershipPeriodFactory(organization=registry_organization) + + response = self.client.get( + reverse("api_v2:organizations:organization-source-records-list"), + { + "has_registry": "true", + "source_group": "planned_inspections", + }, + ) + + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.data["meta"]["pagination"]["total_count"], 1) + response_organization = response.data["data"][0]["organization"] + self.assertEqual(response_organization["uid"], str(organization.uid)) + self.assertEqual(response_organization["name"], 'ООО "Реестровое имя"') + self.assertEqual(response_organization["inn"], "1800020960") + self.assertEqual(response_organization["kpp"], "180001001") + self.assertEqual(response_organization["ogrn"], organization.ogrn) + + def test_flat_source_records_searches_payload_values_displayed_in_tables(self): + target = Organization.objects.create( + name='ООО "Поиск по payload"', + inn="7707083817", + ogrn="1027700132017", + ) + other = Organization.objects.create( + name='ООО "Другая payload"', + inn="7707083818", + ogrn="1027700132018", + ) + extension = PlannedInspectionExtension.objects.create( + organization=target, + title="Плановые 
проверки Генпрокуратуры России", + ) + other_extension = PlannedInspectionExtension.objects.create( + organization=other, + title="Плановые проверки Генпрокуратуры России", + ) + OrganizationSourceRecord.objects.create( + extension=extension, + record_type="inspection", + source="inspections", + external_id="INSP-PAYLOAD", + title="Проверка payload", + payload={ + "registration_number": "INSPECTION-PAYLOAD-42", + "control_authority": "Северный Ростехнадзор", + "start_date_normalized": "2026-06-15", + }, + ) + OrganizationSourceRecord.objects.create( + extension=other_extension, + record_type="inspection", + source="inspections", + external_id="INSP-OTHER-PAYLOAD", + title="Другая проверка payload", + payload={ + "registration_number": "INSPECTION-OTHER-42", + "control_authority": "Другой контрольный орган", + }, + ) + + for search_value in ( + "INSPECTION-PAYLOAD-42", + "Северный Ростехнадзор", + "2026-06-15", + ): + with self.subTest(search_value=search_value): + response = self.client.get( + reverse("api_v2:organizations:organization-source-records-list"), + { + "has_registry": "false", + "source_group": "planned_inspections", + "search": search_value, + }, + ) + + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.data["meta"]["pagination"]["total_count"], 1) + self.assertEqual( + response.data["data"][0]["external_id"], + "INSP-PAYLOAD", + ) + + def test_flat_source_records_searches_registry_identity_displayed_in_tables(self): + organization = Organization.objects.create( + name="1241800009704", + ogrn="1241800009704", + ) + extension = PlannedInspectionExtension.objects.create( + organization=organization, + title="Плановые проверки Генпрокуратуры России", + ) + OrganizationSourceRecord.objects.create( + extension=extension, + record_type="inspection", + source="inspections", + external_id="INSP-REGISTRY-SEARCH", + title="Проверка поиска по реестру", + ) + registry_organization = RegistryOrganizationFactory( + pn_name='АО 
"Реестровый поиск"', + mn_inn=1800020961, + mn_ogrn=int(organization.ogrn), + in_kpp=180001002, + ) + RegistryMembershipPeriodFactory(organization=registry_organization) + + response = self.client.get( + reverse("api_v2:organizations:organization-source-records-list"), + { + "has_registry": "true", + "source_group": "planned_inspections", + "search": "Реестровый поиск", + }, + ) + + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.data["meta"]["pagination"]["total_count"], 1) + record = response.data["data"][0] + self.assertEqual(record["external_id"], "INSP-REGISTRY-SEARCH") + self.assertEqual(record["organization"]["name"], 'АО "Реестровый поиск"') diff --git a/tests/apps/organizations/test_source_backfill.py b/tests/apps/organizations/test_source_backfill.py new file mode 100644 index 0000000..17627fd --- /dev/null +++ b/tests/apps/organizations/test_source_backfill.py @@ -0,0 +1,219 @@ +"""Tests for organization source backfill from legacy parser tables.""" + +from apps.parsers.models import ( + FinancialReport, + FinancialReportLine, + GenericParserRecord, + ParserLoadLog, +) +from django.core.management import call_command +from django.test import TestCase +from organizations.models import ( + DefenseSupplierExtension, + FinancialIndicatorsExtension, + Organization, + OrganizationSourceFinancialLine, + OrganizationSourceRecord, + PlannedInspectionExtension, + SourceGroup, +) +from organizations.source_backfill import OrganizationSourceBackfillService + +from tests.apps.parsers.factories import ( + InspectionRecordFactory, + ManufacturerRecordFactory, +) +from tests.apps.registers.factories import ( + OrganizationFactory as RegistryOrganizationFactory, +) + + +class OrganizationSourceBackfillServiceTest(TestCase): + """Checks idempotent migration from legacy source records.""" + + def test_backfills_inspection_records_into_planned_inspection_extension(self): + organization = Organization.objects.create( + name='ООО 
"Проверка"', + inn="7707083801", + ogrn="1027700132001", + ) + inspection = InspectionRecordFactory( + load_batch=77, + inn=organization.inn, + ogrn=organization.ogrn, + organisation_name=organization.name, + registration_number="INSP-77", + ) + + result = OrganizationSourceBackfillService.backfill( + source=ParserLoadLog.Source.INSPECTIONS, + batch_id=77, + ) + + self.assertEqual(result.scanned, 1) + self.assertEqual(result.created_extensions, 1) + self.assertEqual(result.created_records, 1) + extension = PlannedInspectionExtension.objects.get(organization=organization) + self.assertEqual(extension.source_group, SourceGroup.PLANNED_INSPECTIONS) + self.assertEqual(extension.records_count, 1) + record = OrganizationSourceRecord.objects.get(extension=extension) + self.assertEqual(record.source, ParserLoadLog.Source.INSPECTIONS) + self.assertEqual(record.external_id, inspection.registration_number) + self.assertEqual(record.legacy_model, "apps.parsers.InspectionRecord") + self.assertEqual(record.legacy_pk, str(inspection.pk)) + self.assertEqual(record.payload["registration_number"], "INSP-77") + + second_result = OrganizationSourceBackfillService.backfill( + source=ParserLoadLog.Source.INSPECTIONS, + batch_id=77, + ) + + self.assertEqual(second_result.created_extensions, 0) + self.assertEqual(second_result.created_records, 0) + self.assertEqual(second_result.updated_records, 1) + self.assertEqual(OrganizationSourceRecord.objects.count(), 1) + + def test_backfill_payload_serializes_registry_organization_fk_as_id(self): + organization = Organization.objects.create( + name='ООО "FK Payload"', + inn="7707083815", + ogrn="1027700132015", + ) + registry_organization = RegistryOrganizationFactory( + mn_inn=int(organization.inn), + mn_ogrn=int(organization.ogrn), + ) + InspectionRecordFactory( + load_batch=78, + inn=organization.inn, + ogrn=organization.ogrn, + organisation_name=organization.name, + registration_number="INSP-FK", + registry_organization=registry_organization, 
+ ) + + OrganizationSourceBackfillService.backfill( + source=ParserLoadLog.Source.INSPECTIONS, + batch_id=78, + ) + + record = OrganizationSourceRecord.objects.get(external_id="INSP-FK") + self.assertEqual(record.payload["registry_organization"], registry_organization.pk) + + def test_backfills_financial_report_lines(self): + organization = Organization.objects.create( + name='ООО "Финансы"', + inn="7707083802", + ogrn="1027700132002", + ) + report = FinancialReport.objects.create( + external_id="fns-report-1", + ogrn=organization.ogrn, + file_name="fns_report.xlsx", + file_hash="a" * 64, + load_batch=88, + status=FinancialReport.Status.SUCCESS, + source=FinancialReport.SourceType.API, + ) + FinancialReportLine.objects.create( + report=report, + form_code="1", + line_code="1100", + line_name="Нематериальные активы", + year=2025, + period_start=100, + period_end=200, + ) + + result = OrganizationSourceBackfillService.backfill( + source=ParserLoadLog.Source.FNS_REPORTS, + batch_id=88, + ) + + self.assertEqual(result.created_extensions, 1) + self.assertEqual(result.created_records, 1) + extension = FinancialIndicatorsExtension.objects.get(organization=organization) + record = OrganizationSourceRecord.objects.get(extension=extension) + self.assertEqual(record.external_id, "fns-report-1") + self.assertEqual(OrganizationSourceFinancialLine.objects.count(), 1) + self.assertEqual(record.financial_lines.get().period_end, 200) + + def test_backfills_generic_defense_supplier_records(self): + organization = Organization.objects.create( + name='ООО "ГОЗ"', + inn="7707083803", + ) + generic_record = GenericParserRecord.objects.create( + load_batch=99, + source=ParserLoadLog.Source.UNFAIR_SUPPLIERS, + external_id="unfair-1", + inn=organization.inn, + ogrn="", + organisation_name=organization.name, + title="Реестр недобросовестных поставщиков", + record_date="2026-05-18", + status="active", + payload={"reason": "test"}, + ) + + result = OrganizationSourceBackfillService.backfill( 
+ source=ParserLoadLog.Source.UNFAIR_SUPPLIERS, + batch_id=99, + ) + + self.assertEqual(result.created_extensions, 1) + self.assertEqual(result.created_records, 1) + extension = DefenseSupplierExtension.objects.get(organization=organization) + record = OrganizationSourceRecord.objects.get(extension=extension) + self.assertEqual(record.source, ParserLoadLog.Source.UNFAIR_SUPPLIERS) + self.assertEqual(record.external_id, generic_record.external_id) + self.assertEqual(record.payload["reason"], "test") + + def test_backfill_drops_invalid_identity_values_when_creating_organization(self): + manufacturer = ManufacturerRecordFactory( + load_batch=100, + full_legal_name='ИП "Грязный ИНН"', + inn="396440000001000", + ogrn="5385025402942", + ) + + result = OrganizationSourceBackfillService.backfill( + source=ParserLoadLog.Source.MANUFACTURES, + batch_id=100, + ) + + self.assertEqual(result.scanned, 1) + self.assertEqual(result.created_organizations, 1) + self.assertEqual(result.created_records, 1) + self.assertEqual(result.unresolved, 0) + organization = Organization.objects.get(name=manufacturer.full_legal_name) + self.assertEqual(organization.inn, "") + self.assertEqual(organization.ogrn, manufacturer.ogrn) + record = OrganizationSourceRecord.objects.get(legacy_pk=str(manufacturer.pk)) + self.assertEqual(record.payload["inn"], "396440000001000") + + def test_management_command_runs_source_backfill(self): + organization = Organization.objects.create( + name='ООО "Команда"', + inn="7707083804", + ogrn="1027700132004", + ) + InspectionRecordFactory( + load_batch=101, + inn=organization.inn, + ogrn=organization.ogrn, + organisation_name=organization.name, + registration_number="INSP-101", + ) + + call_command( + "backfill_organization_sources", + source=ParserLoadLog.Source.INSPECTIONS, + batch_id=101, + verbosity=0, + ) + + self.assertTrue( + PlannedInspectionExtension.objects.filter(organization=organization).exists() + ) + 
self.assertEqual(OrganizationSourceRecord.objects.get().external_id, "INSP-101") diff --git a/tests/apps/organizations/test_source_extensions_models.py b/tests/apps/organizations/test_source_extensions_models.py new file mode 100644 index 0000000..ee555f6 --- /dev/null +++ b/tests/apps/organizations/test_source_extensions_models.py @@ -0,0 +1,150 @@ +"""Tests for organization source extension models.""" + +from django.db import IntegrityError, transaction +from django.test import TestCase +from organizations.models import ( + Organization, + OrganizationSourceExtension, + OrganizationSourceFinancialLine, + OrganizationSourceRecord, + PlannedInspectionExtension, + SourceGroup, +) + + +class OrganizationIdentityFieldsTest(TestCase): + """Checks derived organization identity quality fields.""" + + def test_name_only_organization_is_marked_as_missing_identity(self): + organization = Organization.objects.create(name="Без реквизитов") + + self.assertEqual(organization.identity_status, Organization.IdentityStatus.MISSING) + self.assertTrue(organization.primary_identity.startswith("name:")) + + def test_inn_and_ogrn_organization_is_marked_as_complete_identity(self): + organization = Organization.objects.create( + name='ООО "Полная"', + inn="7707083893", + kpp="770701001", + ogrn="1027700132195", + ) + + self.assertEqual(organization.identity_status, Organization.IdentityStatus.COMPLETE) + self.assertEqual(organization.primary_identity, "inn:7707083893:kpp:770701001") + + def test_single_identifier_organization_is_marked_as_partial_identity(self): + organization = Organization.objects.create( + name='ООО "Частичная"', + inn="7707083894", + ) + + self.assertEqual(organization.identity_status, Organization.IdentityStatus.PARTIAL) + self.assertEqual(organization.primary_identity, "inn:7707083894") + + +class OrganizationSourceExtensionModelTest(TestCase): + """Checks polymorphic source extension behavior and constraints.""" + + def 
test_polymorphic_query_returns_concrete_extension_instance(self): + organization = Organization.objects.create( + name='ООО "Проверки"', + inn="7707083895", + ogrn="1027700132196", + ) + extension = PlannedInspectionExtension.objects.create( + organization=organization, + title="Плановые проверки Генпрокуратуры России", + records_count=1, + last_load_batch=77, + ) + + loaded = OrganizationSourceExtension.objects.get(uid=extension.uid) + + self.assertIsInstance(loaded, PlannedInspectionExtension) + self.assertEqual(loaded.source_group, SourceGroup.PLANNED_INSPECTIONS) + self.assertEqual(loaded.organization, organization) + + def test_one_source_group_extension_per_organization(self): + organization = Organization.objects.create( + name='ООО "Дубликат"', + inn="7707083896", + ogrn="1027700132197", + ) + PlannedInspectionExtension.objects.create( + organization=organization, + title="Плановые проверки Генпрокуратуры России", + ) + + with self.assertRaises(IntegrityError), transaction.atomic(): + PlannedInspectionExtension.objects.create( + organization=organization, + title="Плановые проверки Генпрокуратуры России", + ) + + def test_source_records_are_unique_by_legacy_model_and_pk(self): + organization = Organization.objects.create( + name='ООО "Запись"', + inn="7707083897", + ogrn="1027700132198", + ) + extension = PlannedInspectionExtension.objects.create( + organization=organization, + title="Плановые проверки Генпрокуратуры России", + ) + OrganizationSourceRecord.objects.create( + extension=extension, + record_type="inspection", + source="inspections", + external_id="inspection-1", + title="Проверка 1", + legacy_model="apps.parsers.InspectionRecord", + legacy_pk="1", + payload={"registration_number": "inspection-1"}, + ) + + with self.assertRaises(IntegrityError), transaction.atomic(): + OrganizationSourceRecord.objects.create( + extension=extension, + record_type="inspection", + source="inspections", + external_id="inspection-2", + title="Проверка 2", + 
legacy_model="apps.parsers.InspectionRecord", + legacy_pk="1", + payload={"registration_number": "inspection-2"}, + ) + + def test_financial_lines_attach_to_source_record(self): + organization = Organization.objects.create( + name='ООО "Финансы"', + inn="7707083898", + ogrn="1027700132199", + ) + extension = OrganizationSourceExtension.objects.create( + organization=organization, + source_group=SourceGroup.FINANCIAL_INDICATORS, + title="Финансово-экономические показатели", + ) + record = OrganizationSourceRecord.objects.create( + extension=extension, + record_type="financial_report", + source="fns_reports", + external_id="report-1", + title="Отчетность ФНС", + legacy_model="apps.parsers.FinancialReport", + legacy_pk="1", + payload={"external_id": "report-1"}, + ) + + OrganizationSourceFinancialLine.objects.create( + source_record=record, + form_code="1", + line_code="1100", + line_name="Нематериальные активы", + year=2025, + period_start=100, + period_end=200, + ) + + self.assertEqual(record.financial_lines.count(), 1) + self.assertEqual(record.financial_lines.get().period_end, 200) diff --git a/tests/apps/organizations/test_source_ingestion.py b/tests/apps/organizations/test_source_ingestion.py new file mode 100644 index 0000000..326d03b --- /dev/null +++ b/tests/apps/organizations/test_source_ingestion.py @@ -0,0 +1,143 @@ +"""Tests for direct parser ingestion into organization source storage.""" + +from decimal import Decimal + +from apps.parsers.models import FinancialReport, GenericParserRecord, ParserLoadLog +from django.test import TestCase +from organizations.models import ( + DefenseSupplierExtension, + FinancialIndicatorsExtension, + Organization, + OrganizationSourceFinancialLine, + OrganizationSourceRecord, + SourceGroup, +) +from organizations.source_ingestion import ( + OrganizationSourceIngestionService, + SourceFinancialLineInput, + SourceRecordInput, +) + + +class OrganizationSourceIngestionServiceTest(TestCase): + """Checks runtime parser writes 
bypass legacy parser record tables.""" + + def test_save_generic_records_writes_source_records_without_legacy_rows(self): + result = OrganizationSourceIngestionService.save_records( + source=ParserLoadLog.Source.UNFAIR_SUPPLIERS, + load_batch=42, + records=[ + SourceRecordInput( + external_id="unfair-42", + title="Недобросовестный поставщик", + organization_name='ООО "ГОЗ"', + inn="7707083803", + ogrn="", + record_date="2026-05-18", + amount=Decimal("100.50"), + status="active", + url="https://example.test/unfair-42", + payload={"reason": "contract breach"}, + ) + ], + ) + + self.assertEqual(result.scanned, 1) + self.assertEqual(result.created_records, 1) + self.assertEqual(result.updated_records, 0) + self.assertEqual(result.created_extensions, 1) + self.assertEqual(result.unresolved, 0) + self.assertEqual(GenericParserRecord.objects.count(), 0) + + organization = Organization.objects.get(inn="7707083803") + extension = DefenseSupplierExtension.objects.get(organization=organization) + self.assertEqual(extension.source_group, SourceGroup.DEFENSE_SUPPLIERS) + self.assertEqual(extension.records_count, 1) + + record = OrganizationSourceRecord.objects.get(extension=extension) + self.assertEqual(record.source, ParserLoadLog.Source.UNFAIR_SUPPLIERS) + self.assertEqual(record.record_type, "unfair_supplier") + self.assertEqual(record.external_id, "unfair-42") + self.assertEqual(record.amount, Decimal("100.50")) + self.assertEqual(record.payload["reason"], "contract breach") + self.assertEqual(record.legacy_model, "") + self.assertEqual(record.legacy_pk, "") + + def test_save_records_is_idempotent_by_source_external_id(self): + first = SourceRecordInput( + external_id="unfair-idempotent", + title="Old title", + organization_name='ООО "Идемпотентность"', + inn="7707083810", + status="old", + payload={"version": 1}, + ) + second = SourceRecordInput( + external_id="unfair-idempotent", + title="New title", + organization_name='ООО "Идемпотентность"', + inn="7707083810", + 
status="new", + payload={"version": 2}, + ) + + OrganizationSourceIngestionService.save_records( + source=ParserLoadLog.Source.UNFAIR_SUPPLIERS, + load_batch=43, + records=[first], + ) + result = OrganizationSourceIngestionService.save_records( + source=ParserLoadLog.Source.UNFAIR_SUPPLIERS, + load_batch=44, + records=[second], + ) + + self.assertEqual(result.created_records, 0) + self.assertEqual(result.updated_records, 1) + self.assertEqual(OrganizationSourceRecord.objects.count(), 1) + record = OrganizationSourceRecord.objects.get() + self.assertEqual(record.title, "New title") + self.assertEqual(record.status, "new") + self.assertEqual(record.payload["version"], 2) + self.assertEqual(record.load_batch, 44) + + def test_save_financial_report_writes_financial_lines_without_legacy_report(self): + result = OrganizationSourceIngestionService.save_records( + source=ParserLoadLog.Source.FNS_REPORTS, + load_batch=88, + records=[ + SourceRecordInput( + external_id="fns-report-1", + title="fin_001_1027700132002.xlsx", + organization_name="", + ogrn="1027700132002", + status="success", + payload={ + "file_name": "fin_001_1027700132002.xlsx", + "file_hash": "a" * 64, + }, + financial_lines=[ + SourceFinancialLineInput( + form_code="1", + line_code="1100", + line_name="Нематериальные активы", + year=2025, + period_start=100, + period_end=200, + ) + ], + ) + ], + ) + + self.assertEqual(result.created_records, 1) + self.assertEqual(result.created_financial_lines, 1) + self.assertEqual(FinancialReport.objects.count(), 0) + + organization = Organization.objects.get(ogrn="1027700132002") + extension = FinancialIndicatorsExtension.objects.get(organization=organization) + record = OrganizationSourceRecord.objects.get(extension=extension) + self.assertEqual(record.source, ParserLoadLog.Source.FNS_REPORTS) + line = OrganizationSourceFinancialLine.objects.get(source_record=record) + self.assertEqual(line.line_code, "1100") + self.assertEqual(line.period_end, 200) diff --git 
a/tests/apps/organizations/test_tasks.py b/tests/apps/organizations/test_tasks.py index a60fe81..fee7a8e 100644 --- a/tests/apps/organizations/test_tasks.py +++ b/tests/apps/organizations/test_tasks.py @@ -1,54 +1,93 @@ -"""Tests for organization snapshot tasks and schedules.""" +"""Tests for organization source backfill tasks and schedules.""" from importlib import import_module +from apps.parsers.models import ParserLoadLog from django.apps import apps as django_apps from django.core.cache import cache from django.test import TestCase from django_celery_beat.models import PeriodicTask from organizations.cache import get_organization_api_cache_version -from organizations.models import Organization -from organizations.tasks import refresh_all_organization_data_snapshots +from organizations.models import ( + IndustrialProductionExtension, + Organization, + OrganizationSourceRecord, +) +from organizations.tasks import ( + backfill_all_organization_sources, + backfill_organization_sources_for_parser_batch, +) from tests.apps.parsers.factories import IndustrialCertificateRecordFactory -class OrganizationSnapshotTasksTest(TestCase): - """Checks Celery tasks that maintain API v2 organization snapshots.""" +class OrganizationSourceBackfillTasksTest(TestCase): + """Checks Celery tasks that maintain API v2 organization source extensions.""" - def test_refresh_all_task_rebuilds_snapshots_and_invalidates_api_cache(self): + def test_backfill_all_task_rebuilds_sources_and_invalidates_api_cache(self): organization = Organization.objects.create( - name='ООО "Снапшот"', + name='ООО "Источник"', inn="7800000401", ogrn="1027700144401", ) IndustrialCertificateRecordFactory( inn=organization.inn, ogrn=organization.ogrn, - certificate_number="FULL-SNAPSHOT-CERT", + certificate_number="FULL-SOURCE-CERT", ) cache.set("unrelated:test", {"keep": True}, timeout=60) cache_version_before = get_organization_api_cache_version() - result = refresh_all_organization_data_snapshots(batch_size=10) + 
result = backfill_all_organization_sources(batch_size=10) - self.assertEqual(result["processed"], 1) - self.assertEqual(result["created"], 1) - self.assertEqual(result["updated"], 0) + self.assertGreaterEqual(result["scanned"], 1) + self.assertEqual(result["created_records"], 1) self.assertNotEqual( get_organization_api_cache_version(), cache_version_before, ) self.assertEqual(cache.get("unrelated:test"), {"keep": True}) - snapshot = organization.data_snapshot - self.assertEqual( - snapshot.data["industrial"][0]["certificate_number"], - "FULL-SNAPSHOT-CERT", + extension = IndustrialProductionExtension.objects.get( + organization=organization, ) + record = OrganizationSourceRecord.objects.get(extension=extension) + self.assertEqual( + record.payload["certificate_number"], + "FULL-SOURCE-CERT", + ) + + def test_backfill_parser_batch_task_limits_source_and_batch(self): + organization = Organization.objects.create( + name='ООО "Пакет источника"', + inn="7800000402", + ogrn="1027700144402", + ) + IndustrialCertificateRecordFactory( + inn=organization.inn, + ogrn=organization.ogrn, + certificate_number="BATCH-SOURCE-CERT-1", + load_batch=1, + ) + IndustrialCertificateRecordFactory( + inn=organization.inn, + ogrn=organization.ogrn, + certificate_number="BATCH-SOURCE-CERT-2", + load_batch=2, + ) + + result = backfill_organization_sources_for_parser_batch( + source=ParserLoadLog.Source.INDUSTRIAL, + batch_id=2, + ) + + self.assertEqual(result["scanned"], 1) + self.assertEqual(result["created_records"], 1) + record = OrganizationSourceRecord.objects.get() + self.assertEqual(record.payload["certificate_number"], "BATCH-SOURCE-CERT-2") class OrganizationSnapshotScheduleMigrationTest(TestCase): - """Checks data migration that schedules full snapshot refresh.""" + """Checks legacy data migration that schedules the compatibility task.""" def test_migration_seeds_daily_snapshot_refresh_periodic_task(self): migration = import_module( diff --git 
a/tests/apps/parsers/test_direct_ingestion_services.py b/tests/apps/parsers/test_direct_ingestion_services.py new file mode 100644 index 0000000..f39279d --- /dev/null +++ b/tests/apps/parsers/test_direct_ingestion_services.py @@ -0,0 +1,278 @@ +"""Tests for parser services writing directly to organization source storage.""" + +from decimal import Decimal + +from apps.parsers.clients.common import GenericParserItem +from apps.parsers.clients.fns.schemas import ReportLine +from apps.parsers.clients.minpromtorg.schemas import ( + IndustrialCertificate, + IndustrialProduct, + Manufacturer, +) +from apps.parsers.clients.proverki.schemas import Inspection +from apps.parsers.clients.zakupki.schemas import Procurement +from apps.parsers.models import ( + FinancialReport, + GenericParserRecord, + IndustrialCertificateRecord, + IndustrialProductRecord, + InspectionRecord, + ManufacturerRecord, + ParserLoadLog, + ProcurementRecord, +) +from apps.parsers.services import ( + FNSReportService, + GenericParserRecordService, + IndustrialCertificateService, + IndustrialProductService, + InspectionService, + ManufacturerService, + ProcurementService, +) +from django.test import TestCase +from organizations.models import ( + OrganizationSourceFinancialLine, + OrganizationSourceRecord, +) + + +class DirectIngestionParserServicesTest(TestCase): + """Parser save services should not write legacy parser record rows.""" + + def test_industrial_certificate_save_records_writes_organization_source_records(self): + saved = IndustrialCertificateService.save_certificates( + [ + IndustrialCertificate( + issue_date="01.02.2026", + certificate_number="CERT-DIRECT-1", + expiry_date="2029-02-01", + certificate_file_url="https://example.test/cert.pdf", + organisation_name='ООО "Сертификат"', + inn="7707083801", + ogrn="1027700132001", + ) + ], + batch_id=47, + ) + + self.assertEqual(saved, 1) + self.assertEqual(IndustrialCertificateRecord.objects.count(), 0) + record = 
OrganizationSourceRecord.objects.get( + source=ParserLoadLog.Source.INDUSTRIAL, + external_id="CERT-DIRECT-1", + ) + self.assertEqual(record.record_type, "industrial_certificate") + self.assertEqual(record.payload["issue_date_normalized"], "2026-02-01") + self.assertEqual(record.payload["expiry_date_normalized"], "2029-02-01") + self.assertEqual(record.url, "https://example.test/cert.pdf") + + def test_manufacturer_save_records_writes_organization_source_records(self): + saved = ManufacturerService.save_manufacturers( + [ + Manufacturer( + full_legal_name='ООО "Производитель"', + inn="7707083802", + ogrn="1027700132002", + address="Москва", + ) + ], + batch_id=48, + ) + + self.assertEqual(saved, 1) + self.assertEqual(ManufacturerRecord.objects.count(), 0) + record = OrganizationSourceRecord.objects.get( + source=ParserLoadLog.Source.MANUFACTURES, + external_id="7707083802", + ) + self.assertEqual(record.record_type, "manufacturer") + self.assertEqual(record.title, 'ООО "Производитель"') + self.assertEqual(record.payload["address"], "Москва") + + def test_industrial_product_save_records_writes_organization_source_records(self): + saved = IndustrialProductService.save_products( + [ + IndustrialProduct( + full_organisation_name='ООО "Продукция"', + inn="7707083809", + ogrn="1027700132009", + registry_number="PROD-DIRECT-1", + product_name="Станок", + product_model="MODEL-1", + okpd2_code="28.41", + tnved_code="8457109000", + regulatory_document="ГОСТ", + ) + ], + batch_id=49, + ) + + self.assertEqual(saved, 1) + self.assertEqual(IndustrialProductRecord.objects.count(), 0) + record = OrganizationSourceRecord.objects.get( + source=ParserLoadLog.Source.INDUSTRIAL_PRODUCTS, + external_id="PROD-DIRECT-1", + ) + self.assertEqual(record.record_type, "industrial_product") + self.assertEqual(record.title, "Станок") + self.assertEqual(record.payload["okpd2_code"], "28.41") + + def test_procurement_save_records_writes_organization_source_records(self): + saved = 
ProcurementService.save_procurements( + [ + Procurement( + purchase_number="PROC-DIRECT-1", + purchase_name="Поставка оборудования", + customer_inn="7707083810", + customer_kpp="770701001", + customer_ogrn="1027700132010", + customer_name='ООО "Заказчик"', + max_price="1 234 567,89", + currency_code="RUB", + placement_method="Аукцион", + publish_date="01.03.2026", + end_date="2026-03-15", + status="published", + law_type="44-FZ", + purchase_object_info="Оборудование", + href="https://example.test/procurement", + ) + ], + batch_id=50, + region_code="77", + data_year=2026, + data_month=3, + ) + + self.assertEqual(saved, 1) + self.assertEqual(ProcurementRecord.objects.count(), 0) + record = OrganizationSourceRecord.objects.get( + source=ParserLoadLog.Source.PROCUREMENTS, + external_id="PROC-DIRECT-1", + ) + self.assertEqual(record.record_type, "procurement") + self.assertEqual(record.amount, Decimal("1234567.89")) + self.assertEqual(record.payload["publish_date_normalized"], "2026-03-01") + self.assertEqual(record.payload["region_code"], "77") + self.assertEqual(record.payload["data_month"], 3) + + def test_generic_save_records_writes_organization_source_records(self): + saved = GenericParserRecordService.save_records( + [ + GenericParserItem( + source=ParserLoadLog.Source.FAS_GOZ, + external_id="fas-goz-1", + inn="7707083803", + ogrn="", + organisation_name='ООО "ГОЗ"', + title="Уклонение от ГОЗ", + record_date="2026-05-18", + amount=Decimal("12.30"), + status="active", + url="https://example.test/fas-goz-1", + payload={"registry": "fas"}, + ) + ], + batch_id=51, + source=ParserLoadLog.Source.FAS_GOZ, + ) + + self.assertEqual(saved, 1) + self.assertEqual(GenericParserRecord.objects.count(), 0) + record = OrganizationSourceRecord.objects.get( + source=ParserLoadLog.Source.FAS_GOZ, + external_id="fas-goz-1", + ) + self.assertEqual(record.title, "Уклонение от ГОЗ") + self.assertEqual(record.payload["registry"], "fas") + self.assertEqual(record.load_batch, 51) + + def 
test_inspection_save_records_writes_organization_source_records(self): + saved = InspectionService.save_inspections( + [ + Inspection( + registration_number="INSP-DIRECT-1", + inn="7707083804", + ogrn="1027700132004", + organisation_name='ООО "Проверка"', + control_authority="Контроль", + inspection_type="Плановая", + inspection_form="Документарная", + start_date="01.03.2026", + end_date="2026-03-15", + status="planned", + legal_basis="ФЗ", + result="", + ) + ], + batch_id=52, + data_year=2026, + data_month=3, + ) + + self.assertEqual(saved, 1) + self.assertEqual(InspectionRecord.objects.count(), 0) + record = OrganizationSourceRecord.objects.get( + source=ParserLoadLog.Source.INSPECTIONS, + external_id="INSP-DIRECT-1", + ) + self.assertEqual(record.record_type, "inspection") + self.assertEqual(record.payload["control_authority"], "Контроль") + self.assertEqual(record.payload["start_date_normalized"], "2026-03-01") + self.assertEqual(record.payload["data_year"], 2026) + self.assertEqual(record.payload["data_month"], 3) + + def test_fns_save_report_writes_source_record_and_financial_lines(self): + report = FNSReportService.save_report( + external_id="fns-direct-1", + ogrn="1027700132005", + file_name="fin_001_1027700132005.xlsx", + file_hash="b" * 64, + source="file_watch", + batch_id=53, + lines_data=[ + { + "form_code": "1", + "line_code": "1600", + "line_name": "Баланс", + "year": 2025, + "period_start": 100, + "period_end": 200, + } + ], + ) + + self.assertEqual(FinancialReport.objects.count(), 0) + self.assertEqual(str(report.external_id), "fns-direct-1") + source_record = OrganizationSourceRecord.objects.get( + source=ParserLoadLog.Source.FNS_REPORTS, + external_id="fns-direct-1", + ) + self.assertEqual(report.uid, source_record.uid) + self.assertEqual(source_record.payload["file_hash"], "b" * 64) + line = OrganizationSourceFinancialLine.objects.get(source_record=source_record) + self.assertEqual(line.line_code, "1600") + self.assertEqual(line.period_end, 200) 
+ + def test_fns_exists_by_hash_reads_source_record_payload(self): + FNSReportService.save_report( + external_id="fns-direct-2", + ogrn="1027700132006", + file_name="fin_002_1027700132006.xlsx", + file_hash="c" * 64, + source="file_watch", + batch_id=54, + lines_data=[ + ReportLine( + form_code="2", + line_code="2110", + line_name="Выручка", + year=2025, + period_end=500, + ).__dict__ + ], + ) + + self.assertTrue(FNSReportService.exists_by_hash("c" * 64)) + self.assertTrue(FNSReportService.exists_by_external_id("fns-direct-2")) diff --git a/tests/apps/parsers/test_source_cards_service.py b/tests/apps/parsers/test_source_cards_service.py index 5bc6b2a..146813b 100644 --- a/tests/apps/parsers/test_source_cards_service.py +++ b/tests/apps/parsers/test_source_cards_service.py @@ -5,7 +5,7 @@ from types import SimpleNamespace from unittest.mock import MagicMock, patch from apps.core.models import BackgroundJob, JobStatus -from apps.parsers.models import GenericParserRecord, ParserLoadLog +from apps.parsers.models import ParserLoadLog from apps.parsers.source_cards import ( SOURCE_CARD_DEFINITIONS, SourceCardDefinition, @@ -15,9 +15,37 @@ from apps.parsers.source_cards import ( from django.http import Http404 from django.test import SimpleTestCase, TestCase, override_settings from django.utils import timezone +from organizations.source_ingestion import ( + OrganizationSourceIngestionService, + SourceRecordInput, +) from rest_framework.exceptions import ValidationError +def _save_source_record( + *, + source: str, + external_id: str, + inn: str = "", + organization_name: str = "", + title: str = "", + payload: dict | None = None, +) -> None: + OrganizationSourceIngestionService.save_records( + source=source, + load_batch=1, + records=[ + SourceRecordInput( + external_id=external_id, + title=title, + organization_name=organization_name or title or external_id, + inn=inn, + payload=payload or {}, + ) + ], + ) + + class SourceCardServiceUnitTest(SimpleTestCase): def 
test_list_cards_exposes_all_frontend_category_slugs_in_menu_order(self): self.assertEqual( @@ -363,19 +391,19 @@ class SourceCardServiceUnitTest(SimpleTestCase): @override_settings(PARSER_STALE_LOAD_MAX_AGE_MINUTES=90) class SourceCardServiceDatabaseTest(TestCase): def test_defense_unreliable_suppliers_counts_unique_generic_organizations(self): - GenericParserRecord.objects.create( + _save_source_record( source=ParserLoadLog.Source.UNFAIR_SUPPLIERS, - load_batch=1, external_id="unfair-1", inn="7701234567", + organization_name='ООО "Поставщик"', title="Недобросовестный поставщик", payload={"number": "unfair-1"}, ) - GenericParserRecord.objects.create( + _save_source_record( source=ParserLoadLog.Source.FAS_GOZ, - load_batch=1, external_id="goz-1", inn="7701234567", + organization_name='ООО "Поставщик"', title="Уклонение от ГОЗ", payload={"number": "goz-1"}, ) @@ -399,19 +427,19 @@ class SourceCardServiceDatabaseTest(TestCase): self.assertEqual(card["organizations_count"], 1) def test_public_procurements_counts_generic_eis_sources(self): - GenericParserRecord.objects.create( + _save_source_record( source=ParserLoadLog.Source.PROCUREMENTS_44FZ, - load_batch=1, external_id="notice-1", inn="7701234567", + organization_name="ГБУ Заказчик", title="Закупка 44-ФЗ", payload={"number": "notice-1"}, ) - GenericParserRecord.objects.create( + _save_source_record( source=ParserLoadLog.Source.CONTRACTS, - load_batch=1, external_id="contract-1", inn="7701234567", + organization_name="ГБУ Заказчик", title="Контракт ЕИС", payload={"number": "contract-1"}, ) @@ -435,30 +463,24 @@ class SourceCardServiceDatabaseTest(TestCase): self.assertEqual(card["organizations_count"], 1) def test_public_procurements_counts_generic_buyers_without_inn(self): - GenericParserRecord.objects.create( + _save_source_record( source=ParserLoadLog.Source.PROCUREMENTS_44FZ, - load_batch=1, external_id="notice-1", - inn="", - organisation_name="ГБУ Заказчик", + organization_name="ГБУ Заказчик", title="Закупка 
44-ФЗ", payload={"Заказчик": "ГБУ Заказчик"}, ) - GenericParserRecord.objects.create( + _save_source_record( source=ParserLoadLog.Source.CONTRACTS, - load_batch=1, external_id="contract-1", - inn="", - organisation_name="ГБУ Заказчик", + organization_name="ГБУ Заказчик", title="Контракт ЕИС", payload={"Заказчик": "ГБУ Заказчик"}, ) - GenericParserRecord.objects.create( + _save_source_record( source=ParserLoadLog.Source.PROCUREMENTS_223FZ, - load_batch=1, external_id="notice-2", - inn="", - organisation_name="АО Другой заказчик", + organization_name="АО Другой заказчик", title="Закупка 223-ФЗ", payload={"Наименование заказчика": "АО Другой заказчик"}, ) diff --git a/tests/apps/parsers/test_tasks.py b/tests/apps/parsers/test_tasks.py index bc23f6c..6f2f9e4 100644 --- a/tests/apps/parsers/test_tasks.py +++ b/tests/apps/parsers/test_tasks.py @@ -170,6 +170,27 @@ class ProxyResolutionTestCase(TestCase): self.assertIsNone(result) +class OrganizationSourceBackfillQueueTestCase(TestCase): + """Tests parser tasks queue organization source backfill after DB commit.""" + + def test_queue_organization_source_backfill_runs_after_commit(self): + with ( + patch( + "organizations.tasks.backfill_organization_sources_for_parser_batch.delay", + ) as delay_mock, + self.captureOnCommitCallbacks(execute=True), + ): + parser_tasks._queue_organization_source_backfill( + ParserLoadLog.Source.INDUSTRIAL, + 7, + ) + + delay_mock.assert_called_once_with( + source=ParserLoadLog.Source.INDUSTRIAL, + batch_id=7, + ) + + class SyncRuProxiesTaskTestCase(TestCase): """Tests for periodic RU proxy sync task.""" @@ -299,6 +320,52 @@ class GenericSourceFetchTestCase(TestCase): self.assertEqual(captured_inns, [str(organization.mn_inn)]) self.assertNotIn(str(no_membership.mn_inn), captured_inns) + def test_checko_bankruptcy_items_group_messages_by_procedure(self): + company = SimpleNamespace( + ogrn="1052452047450", + inn="2452031093", + short_name='АО "СИБПРОМПРОЕКТ"', + bankruptcy=( + SimpleNamespace( 
+ type="Сообщение ЕФРСБ", + date="2026-04-22", + case_number="", + ), + SimpleNamespace( + type="Сообщение ЕФРСБ", + date="2026-04-20", + case_number="", + ), + ), + ) + + records = parser_tasks._checko_bankruptcy_items( + company=company, + fallback_inn="2452031093", + fallback_ogrn="1052452047450", + fallback_name='АО "СИБПРОМПРОЕКТ"', + ) + + self.assertEqual(len(records), 1) + self.assertEqual(records[0].external_id, "checko-fedresurs:2452031093") + self.assertEqual(records[0].record_date, "2026-04-22") + self.assertEqual(records[0].payload["messages_count"], 2) + self.assertEqual( + records[0].payload["messages"], + [ + { + "case_number": "", + "date": "2026-04-22", + "type": "Сообщение ЕФРСБ", + }, + { + "case_number": "", + "date": "2026-04-20", + "type": "Сообщение ЕФРСБ", + }, + ], + ) + @override_settings(CHECKO_API_KEY="test-key", ARBITRATION_CHECKO_LIMIT=10) def test_arbitration_fetches_checko_legal_cases_for_active_registry_organizations( self, diff --git a/tests/apps/parsers/test_views.py b/tests/apps/parsers/test_views.py index 5b8d7a4..45f9e50 100644 --- a/tests/apps/parsers/test_views.py +++ b/tests/apps/parsers/test_views.py @@ -20,6 +20,10 @@ from django.core.files.uploadedfile import SimpleUploadedFile from django.urls import reverse from openpyxl import Workbook from organizations.models import Organization +from organizations.source_ingestion import ( + OrganizationSourceIngestionService, + SourceRecordInput, +) from rest_framework import status from rest_framework.test import APITestCase @@ -67,6 +71,32 @@ def _build_fns_zip_bytes(file_map: dict[str, bytes]) -> bytes: return buf.getvalue() +def _save_source_record( + *, + source: str, + external_id: str, + inn: str = "", + ogrn: str = "", + organization_name: str = "Test organization", + title: str = "Source record", + payload: dict | None = None, +) -> None: + OrganizationSourceIngestionService.save_records( + source=source, + load_batch=1, + records=[ + SourceRecordInput( + 
external_id=external_id, + title=title, + organization_name=organization_name, + inn=inn, + ogrn=ogrn, + payload=payload or {}, + ) + ], + ) + + def _create_procurement_record() -> ProcurementRecord: return ProcurementRecord.objects.create( load_batch=fake.random_int(min=1, max=1000), @@ -442,35 +472,33 @@ class ParsersViewSetTest(APITestCase): self.assertNotIn("kpp", detail) def test_dashboard_data_exposes_source_groups_for_page(self): - GenericParserRecord.objects.create( - load_batch=1, + _save_source_record( source=ParserLoadLog.Source.PROCUREMENTS_44FZ, external_id="eis-44fz-1", + organization_name="Customer 1", title="EIS 44-FZ notice 1", payload={"registry": "44fz"}, ) - GenericParserRecord.objects.create( - load_batch=1, + _save_source_record( source=ParserLoadLog.Source.PROCUREMENTS_44FZ, external_id="eis-44fz-2", + organization_name="Customer 2", title="EIS 44-FZ notice 2", payload={"registry": "44fz"}, ) - GenericParserRecord.objects.create( - load_batch=1, + _save_source_record( source=ParserLoadLog.Source.TRUDVSEM, external_id="trudvsem-1", + organization_name="Employer", title="Vacancy", payload={"registry": "trudvsem"}, ) - FinancialReport.objects.create( + _save_source_record( + source=ParserLoadLog.Source.FNS_REPORTS, external_id=_digits(5), ogrn=_digits(13), - file_name=f"fin_{_digits(5)}_{_digits(13)}.xlsx", - file_hash=fake.sha256(raw_output=False), - load_batch=1, - status=FinancialReport.Status.SUCCESS, - source=FinancialReport.SourceType.API, + organization_name="FNS organization", + title="FNS report", ) self.client.force_authenticate(self.user) @@ -513,17 +541,14 @@ class ParsersViewSetTest(APITestCase): mn_ogrn=1107746880031, ) RegistryMembershipPeriodFactory(organization=registry_organization) - FinancialReport.objects.create( + _save_source_record( + source=ParserLoadLog.Source.FNS_REPORTS, external_id=_digits(5), ogrn=str(registry_organization.mn_ogrn), - file_name=f"fin_{_digits(5)}_{registry_organization.mn_ogrn}.xlsx", - 
file_hash=fake.sha256(raw_output=False), - load_batch=1, - status=FinancialReport.Status.SUCCESS, - source=FinancialReport.SourceType.API, + organization_name="Registry FNS organization", + title="FNS report", ) - GenericParserRecord.objects.create( - load_batch=1, + _save_source_record( source=ParserLoadLog.Source.UNFAIR_SUPPLIERS, external_id="unfair-1", title="Unfair supplier record", @@ -561,36 +586,40 @@ class ParsersViewSetTest(APITestCase): roscosmos_membership = RegistryMembershipPeriodFactory( organization=roscosmos_organization ) - FinancialReport.objects.create( + _save_source_record( + source=ParserLoadLog.Source.FNS_REPORTS, external_id=_digits(5), ogrn=str(rosatom_organization.mn_ogrn), - file_name=f"fin_{_digits(5)}_{rosatom_organization.mn_ogrn}.xlsx", - file_hash=fake.sha256(raw_output=False), - load_batch=1, - status=FinancialReport.Status.SUCCESS, - source=FinancialReport.SourceType.API, + organization_name="Rosatom", + title="FNS report", ) - IndustrialCertificateRecordFactory( + _save_source_record( + source=ParserLoadLog.Source.INDUSTRIAL, + external_id="industrial-registry-analytics", + title="Industrial certificate", + organization_name="Rosatom", inn=str(rosatom_organization.mn_inn), ogrn=str(rosatom_organization.mn_ogrn), ) - GenericParserRecord.objects.create( - load_batch=1, + _save_source_record( source=ParserLoadLog.Source.UNFAIR_SUPPLIERS, external_id="unfair-registry-analytics", title="Risk signal", inn=str(roscosmos_organization.mn_inn), ogrn=str(roscosmos_organization.mn_ogrn), ) - GenericParserRecord.objects.create( - load_batch=1, + _save_source_record( source=ParserLoadLog.Source.FEDRESURS_BANKRUPTCY, external_id="bankruptcy-registry-analytics", title="Bankruptcy risk signal", inn=str(roscosmos_organization.mn_inn), ogrn=str(roscosmos_organization.mn_ogrn), ) - InspectionRecordFactory( + _save_source_record( + source=ParserLoadLog.Source.INSPECTIONS, + external_id="inspection-registry-analytics", + title="Inspection risk 
signal", + organization_name="Rosatom", inn=str(rosatom_organization.mn_inn), ogrn=str(rosatom_organization.mn_ogrn), ) diff --git a/ts_client.zip b/ts_client.zip new file mode 100644 index 0000000..906f03f Binary files /dev/null and b/ts_client.zip differ diff --git a/ts_client/.gitignore b/ts_client/.gitignore new file mode 100644 index 0000000..1eae0cf --- /dev/null +++ b/ts_client/.gitignore @@ -0,0 +1,2 @@ +dist/ +node_modules/ diff --git a/ts_client/README.md b/ts_client/README.md new file mode 100644 index 0000000..9119e31 --- /dev/null +++ b/ts_client/README.md @@ -0,0 +1,192 @@ +# TypeScript-клиент API организаций Mostovik + +Клиент покрывает два endpoint API v2: + +- `GET /api/v2/organizations/` — список организаций. +- `GET /api/v2/organizations/{uid}/` — карточка одной организации. + +Пакет не имеет runtime-зависимостей и использует `fetch`. В `settings.dev` backend открывает эти endpoint без JWT, в остальных окружениях можно передать `accessToken` или свои headers. + +```ts +import { MostovikOrganizationsClient } from "@mostovik/organizations-api-client"; + +const client = new MostovikOrganizationsClient({ + baseUrl: "https://backend.example.com", + accessToken: "", +}); + +const page = await client.listOrganizations({ + page: 1, + page_size: 20, + search: "мост", + has_registry: true, + has_industrial: true, + data: ["industrial", "fns_reports"], +}); + +const item = await client.getOrganization(page.data[0].uid, { + data_sources: ["industrial", "fns_reports"], +}); +``` + +## Методы + +### `listOrganizations(params?, options?)` + +Вызывает `GET /api/v2/organizations/`. 
+ +Возвращает пагинированный ответ: + +```ts +{ + success: true, + data: Organization[], + errors: null, + meta: { + pagination: { + page: number, + page_size: number, + total_count: number, + total_pages: number, + has_next: boolean, + has_previous: boolean, + }, + }, +} +``` + +Важно: backend по умолчанию применяет `has_registry=true` для list endpoint, если параметр `has_registry` не передан. Чтобы получить организации без активного участия в реестрах, передайте `has_registry: false`. + +### `getOrganization(uid, params?, options?)` + +Вызывает `GET /api/v2/organizations/{uid}/`. + +Возвращает сам объект `Organization`, без wrapper `success/data/meta`. + +## Параметры списка + +| Параметр | Тип | Назначение | +| --- | --- | --- | +| `page` | `number` | Номер страницы пагинации. Backend default: `1`. | +| `page_size` | `number` | Размер страницы. Backend default: `20`, максимум `100`. | +| `search` | `string` | Поиск по наименованию, ИНН, КПП, ОГРН и ОГРИП. | +| `ordering` | `OrganizationOrdering` | Сортировка по `uid`, `name`, `inn`, `kpp`, `ogrn`, `ogrip`; префикс `-` включает обратный порядок, например `-name`. | +| `name` | `string` | Фильтр по части полного наименования организации. | +| `inn` | `string` | Точный фильтр по ИНН. | +| `kpp` | `string` | Точный фильтр по КПП. | +| `ogrn` | `string` | Точный фильтр по ОГРН. | +| `ogrip` | `string` | Точный фильтр по ОГРИП. | +| `registry` | `string` | UUID реестра. Возвращает организации с активным участием в этом реестре. | +| `registry_name` | `string` | Фильтр по части наименования реестра. | +| `has_registry` | `boolean` | Фильтр наличия активного участия в любом реестре. | +| `has_industrial` | `boolean` | Наличие данных источника `industrial`. | +| `has_industrial_products` | `boolean` | Наличие данных источника `industrial_products`. | +| `has_manufactures` | `boolean` | Наличие данных источника `manufactures`. | +| `has_inspections` | `boolean` | Наличие данных источника `inspections`. 
| +| `has_procurements` | `boolean` | Наличие данных источника `procurements`. | +| `has_procurements_44fz` | `boolean` | Наличие данных источника `procurements_44fz`. | +| `has_procurements_223fz` | `boolean` | Наличие данных источника `procurements_223fz`. | +| `has_contracts` | `boolean` | Наличие данных источника `contracts`. | +| `has_unfair_suppliers` | `boolean` | Наличие данных источника `unfair_suppliers`. | +| `has_fas_goz` | `boolean` | Наличие данных источника `fas_goz`. | +| `has_arbitration` | `boolean` | Наличие данных источника `arbitration`. | +| `has_fedresurs_bankruptcy` | `boolean` | Наличие данных источника `fedresurs_bankruptcy`. | +| `has_fstec` | `boolean` | Наличие данных источника `fstec`. | +| `has_vacancies` | `boolean` | Наличие данных публичного источника API v2 `vacancies`. Внутренний backend source — `trudvsem`. | +| `has_trudvsem` | `boolean` | Deprecated alias backend для `has_vacancies`; оставлен в типах, но новый код должен использовать `has_vacancies`. | +| `has_fns_reports` | `boolean` | Наличие данных источника `fns_reports`. | +| `data` | `OrganizationDataSource \| OrganizationDataSource[]` | Вернуть в блоке `data` только указанные источники. | +| `data_sources` | `OrganizationDataSource \| OrganizationDataSource[]` | Alias для `data`. | +| `exclude_data` | `OrganizationDataSource \| OrganizationDataSource[]` | Исключить указанные источники из блока `data`. | +| `exclude_data_sources` | `OrganizationDataSource \| OrganizationDataSource[]` | Alias для `exclude_data`. | + +## Параметры карточки + +`getOrganization(uid, params)` принимает path-параметр `uid` и параметры управления блоком `data`: + +| Параметр | Тип | Назначение | +| --- | --- | --- | +| `uid` | `string` | UID организации в path: `/api/v2/organizations/{uid}/`. | +| `data` | `OrganizationDataSource \| OrganizationDataSource[]` | Вернуть в блоке `data` только указанные источники. 
| +| `data_sources` | `OrganizationDataSource \| OrganizationDataSource[]` | Alias для `data`. | +| `exclude_data` | `OrganizationDataSource \| OrganizationDataSource[]` | Исключить указанные источники из блока `data`. | +| `exclude_data_sources` | `OrganizationDataSource \| OrganizationDataSource[]` | Alias для `exclude_data`. | + +Backend также принимает CSV-строки для `data`/`exclude_data`, но в клиенте лучше передавать массив: так TypeScript проверит допустимые source keys. + +## Источники данных + +Допустимые публичные source keys API v2: + +- `arbitration` +- `contracts` +- `fas_goz` +- `fedresurs_bankruptcy` +- `fns_reports` +- `fstec` +- `industrial` +- `industrial_products` +- `inspections` +- `manufactures` +- `procurements` +- `procurements_44fz` +- `procurements_223fz` +- `unfair_suppliers` +- `vacancies` + +Публичный ключ источника «Работа России» — `vacancies`. Параметры `data=trudvsem` и `data_sources=trudvsem` backend API v2 отклоняет. + +## Структуры ответа + +`Organization` содержит: + +- `uid`, `name`, `normalized_name`, `inn`, `kpp`, `ogrn`, `ogrip` +- `registries` — активные реестры организации: `{ id, name }[]` +- `data_sources` — краткая сводка непустых источников: `{ source, count }[]` +- `data` — объект, где ключи являются source keys, а значения — массивы записей источников + +В `types.ts` описаны структуры всех возвращаемых блоков `data`: + +- `IndustrialCertificateRecord` для `industrial` +- `IndustrialProductRecord` для `industrial_products` +- `ManufacturerRecord` для `manufactures` +- `InspectionRecord` для `inspections` +- `ProcurementRecord` для `procurements` +- `GenericOrganizationSourceRecord` для `procurements_44fz`, `procurements_223fz`, `contracts`, `unfair_suppliers`, `fas_goz`, `arbitration`, `fedresurs_bankruptcy`, `fstec`, `vacancies` +- `FinancialReportRecord` и `FinancialReportLineValue` для `fns_reports` + +Для `GenericOrganizationSourceRecord.payload` используется отдельный тип `GenericSourcePayload`.
Это JSON-object, потому что backend сохраняет в этом поле исходный нормализованный документ внешнего generic-источника как `dict`. + +## Ошибки + +Для HTTP-ответов вне диапазона `2xx` клиент бросает `ApiClientError`: + +```ts +import { ApiClientError } from "@mostovik/organizations-api-client"; + +try { + await client.getOrganization(uid, { data: ["industrial"] }); +} catch (error) { + if (error instanceof ApiClientError) { + console.log(error.status); + console.log(error.payload); + } +} +``` + +`ApiClientError.payload` типизирован как `ApiErrorPayload`: + +```ts +{ + success: false, + data: null, + errors: Array<{ + code: string, + message: string, + details?: JsonObject, + }>, + meta: { + request_id?: string, + } | null, +} +``` diff --git a/ts_client/package.json b/ts_client/package.json new file mode 100644 index 0000000..90a5803 --- /dev/null +++ b/ts_client/package.json @@ -0,0 +1,26 @@ +{ + "name": "@mostovik/organizations-api-client", + "version": "0.1.0", + "description": "TypeScript client for Mostovik organizations API v2", + "type": "module", + "main": "./dist/src/index.js", + "types": "./dist/src/index.d.ts", + "exports": { + ".": { + "types": "./dist/src/index.d.ts", + "import": "./dist/src/index.js" + } + }, + "files": [ + "dist/src", + "README.md" + ], + "scripts": { + "build": "tsc -p tsconfig.json", + "test": "tsc -p tsconfig.json && node --test dist/test/*.test.js" + }, + "devDependencies": { + "@types/node": "^25.0.0", + "typescript": "^5.9.3" + } +} diff --git a/ts_client/src/client.ts b/ts_client/src/client.ts new file mode 100644 index 0000000..d426289 --- /dev/null +++ b/ts_client/src/client.ts @@ -0,0 +1,257 @@ +import type { + ApiErrorPayload, + Organization, + OrganizationDetailParams, + OrganizationListParams, + OrganizationListResponse, + QueryListValue, + JsonValue, +} from "./types.js"; + +export type FetchLike = ( + input: RequestInfo, + init?: RequestInit, +) => Promise; + +export interface MostovikOrganizationsClientOptions { + 
baseUrl: string | URL; + fetch?: FetchLike; + headers?: HeadersInit; + accessToken?: string; +} + +export interface ApiRequestOptions { + headers?: HeadersInit; + signal?: AbortSignal; +} + +export class ApiClientError extends Error { + readonly status: number; + readonly payload: ApiErrorPayload; + + constructor(message: string, status: number, payload: ApiErrorPayload) { + super(message); + this.name = "ApiClientError"; + this.status = status; + this.payload = payload; + } +} + +type QueryParamValue = string | number | boolean | null | undefined; +type QueryParam = QueryParamValue | readonly QueryParamValue[]; +type QueryParams = Record<string, QueryParam>; + +export class MostovikOrganizationsClient { + private readonly baseUrl: string; + private readonly fetchImpl: FetchLike; + private readonly defaultHeaders: HeadersInit | undefined; + private readonly accessToken: string | undefined; + + constructor(options: MostovikOrganizationsClientOptions) { + this.baseUrl = normalizeBaseUrl(options.baseUrl); + this.fetchImpl = options.fetch ??
defaultFetch(); + this.defaultHeaders = options.headers; + this.accessToken = options.accessToken; + } + + listOrganizations( + params: OrganizationListParams = {}, + options: ApiRequestOptions = {}, + ): Promise<OrganizationListResponse> { + return this.request<OrganizationListResponse>( + "api/v2/organizations/", + params as QueryParams, + options, + ); + } + + getOrganization( + uid: string, + params: OrganizationDetailParams = {}, + options: ApiRequestOptions = {}, + ): Promise<Organization> { + return this.request<Organization>( + `api/v2/organizations/${encodeURIComponent(uid)}/`, + params as QueryParams, + options, + ); + } + + private async request<T>( + path: string, + params: QueryParams, + options: ApiRequestOptions, + ): Promise<T> { + const url = this.buildUrl(path, params); + const init: RequestInit = { + method: "GET", + headers: this.buildHeaders(options.headers), + }; + if (options.signal !== undefined) { + init.signal = options.signal; + } + + const response = await this.fetchImpl(url.toString(), init); + const payload = await parseResponsePayload(response); + + if (!response.ok) { + const errorPayload = toApiErrorPayload(payload, response.status); + throw new ApiClientError( + `GET ${url.pathname} failed with HTTP ${response.status}`, + response.status, + errorPayload, + ); + } + + return payload as T; + } + + private buildUrl(path: string, params: QueryParams): URL { + const url = new URL(path, this.baseUrl); + appendQueryParams(url, params); + return url; + } + + private buildHeaders(requestHeaders?: HeadersInit): Headers { + const headers = new Headers(this.defaultHeaders); + headers.set("Accept", "application/json"); + + if (this.accessToken !== undefined && !headers.has("Authorization")) { + headers.set("Authorization", `Bearer ${this.accessToken}`); + } + + if (requestHeaders !== undefined) { + new Headers(requestHeaders).forEach((value, key) => { + headers.set(key, value); + }); + } + + return headers; + } +} + +function normalizeBaseUrl(baseUrl: string | URL): string { + const value = String(baseUrl); + return
value.endsWith("/") ? value : `${value}/`; +} + +function defaultFetch(): FetchLike { + if (typeof globalThis.fetch !== "function") { + throw new Error( + "No fetch implementation is available. Pass fetch in client options.", + ); + } + + return (input, init) => globalThis.fetch(input, init); +} + +function appendQueryParams(url: URL, params: QueryParams): void { + Object.entries(params).forEach(([key, value]) => { + if (isQueryParamArray(value)) { + value.forEach((item) => appendQueryParam(url, key, item)); + return; + } + + appendQueryParam(url, key, value); + }); +} + +function isQueryParamArray(value: QueryParam): value is readonly QueryParamValue[] { + return Array.isArray(value); +} + +function appendQueryParam(url: URL, key: string, value: QueryParamValue): void { + if (value === undefined || value === null) { + return; + } + + url.searchParams.append(key, String(value)); +} + +async function parseResponsePayload(response: Response): Promise<JsonValue | null> { + const contentType = response.headers.get("content-type") ?? ""; + const body = await response.text(); + if (body === "") { + return null; + } + + if (contentType.includes("application/json")) { + return JSON.parse(body) as JsonValue; + } + + try { + return JSON.parse(body) as JsonValue; + } catch { + return body; + } +} + +function toApiErrorPayload( + payload: JsonValue | null, + status: number, +): ApiErrorPayload { + if ( + isJsonObject(payload) && + payload.success === false && + payload.data === null && + Array.isArray(payload.errors) + ) { + const meta = payload.meta ?? null; + return { + success: false, + data: null, + errors: payload.errors.map(toApiErrorDetail), + meta: isJsonObject(meta) ?
meta : null, + }; + } + + return { + success: false, + data: null, + errors: [ + { + code: `http_${status}`, + message: "HTTP request failed", + details: { + payload, + }, + }, + ], + meta: null, + }; +} + +function toApiErrorDetail(value: JsonValue): { + code: string; + message: string; + details?: { readonly [key: string]: JsonValue }; +} { + if (isJsonObject(value)) { + const details = value.details ?? null; + const code = typeof value.code === "string" ? value.code : "error"; + const message = + typeof value.message === "string" ? value.message : JSON.stringify(value); + if (isJsonObject(details)) { + return { + code, + message, + details, + }; + } + return { + code, + message, + }; + } + + return { + code: "error", + message: String(value), + }; +} + +function isJsonObject(value: JsonValue | null): value is { + readonly [key: string]: JsonValue; +} { + return typeof value === "object" && value !== null && !Array.isArray(value); +} diff --git a/ts_client/src/index.ts b/ts_client/src/index.ts new file mode 100644 index 0000000..27b43c2 --- /dev/null +++ b/ts_client/src/index.ts @@ -0,0 +1,8 @@ +export { + ApiClientError, + MostovikOrganizationsClient, + type ApiRequestOptions, + type FetchLike, + type MostovikOrganizationsClientOptions, +} from "./client.js"; +export * from "./types.js"; diff --git a/ts_client/src/types.ts b/ts_client/src/types.ts new file mode 100644 index 0000000..d041933 --- /dev/null +++ b/ts_client/src/types.ts @@ -0,0 +1,343 @@ +export type UuidString = string; +export type IsoDateString = string; +export type IsoDateTimeString = string; +export type DecimalString = string; + +export type JsonPrimitive = string | number | boolean | null; +export type JsonValue = + | JsonPrimitive + | JsonValue[] + | JsonObject; +export type JsonObject = { readonly [key: string]: JsonValue }; +export type GenericSourcePayload = JsonObject; + +export interface ApiErrorPayload { + success: false; + data: null; + errors: ApiErrorDetail[]; + meta: ApiErrorMeta 
| null; +} + +export interface ApiErrorDetail { + code: string; + message: string; + details?: JsonObject; +} + +export interface ApiErrorMeta { + request_id?: string; +} + +export const ORGANIZATION_DATA_SOURCES = [ + "arbitration", + "contracts", + "fas_goz", + "fedresurs_bankruptcy", + "fns_reports", + "fstec", + "industrial", + "industrial_products", + "inspections", + "manufactures", + "procurements", + "procurements_44fz", + "procurements_223fz", + "unfair_suppliers", + "vacancies", +] as const; + +export type OrganizationDataSource = (typeof ORGANIZATION_DATA_SOURCES)[number]; + +export type OrganizationGenericInternalSource = + | "arbitration" + | "contracts" + | "fas_goz" + | "fedresurs_bankruptcy" + | "fstec" + | "procurements_44fz" + | "procurements_223fz" + | "trudvsem" + | "unfair_suppliers"; + +export type OrganizationOrderingField = + | "uid" + | "name" + | "inn" + | "kpp" + | "ogrn" + | "ogrip"; + +export type OrganizationOrdering = + | OrganizationOrderingField + | `-${OrganizationOrderingField}`; + +export type QueryListValue = T | readonly T[]; + +export interface OrganizationDataSelectionParams { + /** Ограничивает блок `data` указанными источниками. В клиенте лучше передавать массив вместо CSV-строки. */ + data?: QueryListValue; + /** Alias для `data`: явно задает набор источников, которые нужно вернуть в `data`. */ + data_sources?: QueryListValue; + /** Исключает указанные источники из блока `data`. */ + exclude_data?: QueryListValue; + /** Alias для `exclude_data`: явно задает набор источников, которые нужно исключить из `data`. */ + exclude_data_sources?: QueryListValue; +} + +export interface OrganizationListParams extends OrganizationDataSelectionParams { + /** Номер страницы пагинации. Backend default: 1. */ + page?: number; + /** Размер страницы. Backend default: 20, максимум 100. */ + page_size?: number; + /** Поиск по наименованию, ИНН, КПП, ОГРН и ОГРИП. 
*/ + search?: string; + /** Сортировка по `uid`, `name`, `inn`, `kpp`, `ogrn`, `ogrip`; префикс `-` включает обратный порядок. */ + ordering?: OrganizationOrdering; + /** Фильтр по части полного наименования организации. */ + name?: string; + /** Точный фильтр по ИНН. */ + inn?: string; + /** Точный фильтр по КПП. */ + kpp?: string; + /** Точный фильтр по ОГРН. */ + ogrn?: string; + /** Точный фильтр по ОГРИП. */ + ogrip?: string; + /** UUID реестра: возвращает организации с активным участием в этом реестре. */ + registry?: UuidString; + /** Фильтр по части наименования реестра. */ + registry_name?: string; + /** Наличие активного участия в любом реестре. Для list backend по умолчанию применяет `true`, если параметр не передан. */ + has_registry?: boolean; + /** Наличие данных источника `industrial`. */ + has_industrial?: boolean; + /** Наличие данных источника `industrial_products`. */ + has_industrial_products?: boolean; + /** Наличие данных источника `manufactures`. */ + has_manufactures?: boolean; + /** Наличие данных источника `inspections`. */ + has_inspections?: boolean; + /** Наличие данных источника `procurements`. */ + has_procurements?: boolean; + /** Наличие данных источника `procurements_44fz`. */ + has_procurements_44fz?: boolean; + /** Наличие данных источника `procurements_223fz`. */ + has_procurements_223fz?: boolean; + /** Наличие данных источника `contracts`. */ + has_contracts?: boolean; + /** Наличие данных источника `unfair_suppliers`. */ + has_unfair_suppliers?: boolean; + /** Наличие данных источника `fas_goz`. */ + has_fas_goz?: boolean; + /** Наличие данных источника `arbitration`. */ + has_arbitration?: boolean; + /** Наличие данных источника `fedresurs_bankruptcy`. */ + has_fedresurs_bankruptcy?: boolean; + /** Наличие данных источника `fstec`. */ + has_fstec?: boolean; + /** Наличие данных публичного API v2 источника `vacancies` (внутренний backend source: `trudvsem`). 
*/ + has_vacancies?: boolean; + /** @deprecated Внутренний alias backend для `has_vacancies`; используйте `has_vacancies`. */ + has_trudvsem?: boolean; + /** Наличие данных источника `fns_reports`. */ + has_fns_reports?: boolean; +} + +export interface OrganizationDetailParams extends OrganizationDataSelectionParams {} + +export interface OrganizationListResponse { + success: true; + data: Organization[]; + errors: null; + meta: { + pagination: PagePagination; + }; +} + +export interface PagePagination { + page: number; + page_size: number; + total_count: number; + total_pages: number; + has_next: boolean; + has_previous: boolean; +} + +export interface Organization { + uid: UuidString; + name: string; + normalized_name: string; + inn: string; + kpp: string; + ogrn: string; + ogrip: string; + data: OrganizationData; + data_sources: OrganizationDataSourceSummary[]; + registries: OrganizationRegistry[]; +} + +export interface OrganizationRegistry { + id: UuidString; + name: string; +} + +export interface OrganizationDataSourceSummary { + source: OrganizationDataSource; + count: number; +} + +export type OrganizationData = Partial; + +export interface OrganizationDataBySource { + arbitration: GenericOrganizationSourceRecord[]; + contracts: GenericOrganizationSourceRecord[]; + fas_goz: GenericOrganizationSourceRecord[]; + fedresurs_bankruptcy: GenericOrganizationSourceRecord[]; + fns_reports: FinancialReportRecord[]; + fstec: GenericOrganizationSourceRecord[]; + industrial: IndustrialCertificateRecord[]; + industrial_products: IndustrialProductRecord[]; + inspections: InspectionRecord[]; + manufactures: ManufacturerRecord[]; + procurements: ProcurementRecord[]; + procurements_44fz: GenericOrganizationSourceRecord[]; + procurements_223fz: GenericOrganizationSourceRecord[]; + unfair_suppliers: GenericOrganizationSourceRecord[]; + vacancies: GenericOrganizationSourceRecord[]; +} + +export interface OrganizationTimedRecord { + id: number; + load_batch: number; + 
registry_organization: number | null; + created_at: IsoDateTimeString; + updated_at: IsoDateTimeString; +} + +export interface IndustrialCertificateRecord extends OrganizationTimedRecord { + issue_date: string; + issue_date_normalized: IsoDateString | null; + certificate_number: string; + expiry_date: string; + expiry_date_normalized: IsoDateString | null; + certificate_file_url: string; + organisation_name: string; + inn: string; + ogrn: string; +} + +export interface IndustrialProductRecord extends OrganizationTimedRecord { + full_organisation_name: string; + ogrn: string; + inn: string; + registry_number: string; + product_name: string; + product_model: string; + okpd2_code: string; + tnved_code: string; + regulatory_document: string; +} + +export interface ManufacturerRecord extends OrganizationTimedRecord { + full_legal_name: string; + inn: string; + ogrn: string; + address: string; +} + +export interface InspectionRecord extends OrganizationTimedRecord { + registration_number: string; + inn: string; + ogrn: string; + organisation_name: string; + control_authority: string; + inspection_type: string; + inspection_form: string; + start_date: string; + start_date_normalized: IsoDateString | null; + end_date: string; + end_date_normalized: IsoDateString | null; + status: string; + legal_basis: string; + result: string; + is_federal_law_248: boolean; + data_year: number | null; + data_month: number | null; +} + +export interface ProcurementRecord extends OrganizationTimedRecord { + purchase_number: string; + purchase_name: string; + customer_inn: string; + customer_kpp: string; + customer_ogrn: string; + customer_name: string; + max_price: string; + max_price_amount: DecimalString | null; + currency_code: string; + placement_method: string; + publish_date: string; + publish_date_normalized: IsoDateString | null; + end_date: string; + end_date_normalized: IsoDateString | null; + status: string; + law_type: string; + purchase_object_info: string; + href: string; + 
region_code: string; + data_year: number | null; + data_month: number | null; +} + +export interface GenericOrganizationSourceRecord extends OrganizationTimedRecord { + source: OrganizationGenericInternalSource; + external_id: string; + inn: string; + ogrn: string; + organisation_name: string; + title: string; + record_date: string; + amount: DecimalString | null; + status: string; + url: string; + payload: GenericSourcePayload; +} + +export type FinancialReportStatus = + | "failed" + | "pending" + | "processing" + | "success"; + +export type FinancialReportSource = "api" | "file_watch"; + +export interface FinancialReportRecord { + id: number; + external_id: string; + ogrn: string; + registry_organization: number | null; + file_name: string; + file_hash: string; + load_batch: number; + status: FinancialReportStatus; + source: FinancialReportSource; + error_message: string; + created_at: IsoDateTimeString; + updated_at: IsoDateTimeString; + lines_count: number; + lines: FinancialReportLinesByYear; +} + +export type FinancialReportSection = "active" | "balance" | "passive" | `form_${string}`; + +export type FinancialReportLinesByYear = Record< + string, + Partial>> +>; + +export interface FinancialReportLineValue { + form_code: string; + name: string; + period_start: number | null; + period_end: number | null; +} diff --git a/ts_client/test/client.test.ts b/ts_client/test/client.test.ts new file mode 100644 index 0000000..f9eb7db --- /dev/null +++ b/ts_client/test/client.test.ts @@ -0,0 +1,257 @@ +import assert from "node:assert/strict"; +import test from "node:test"; + +import { + MostovikOrganizationsClient, + type ApiErrorPayload, + type FetchLike, + type GenericSourcePayload, + type Organization, + type OrganizationListResponse, +} from "../src/index.js"; + +function jsonResponse(payload: unknown, init: ResponseInit = {}): Response { + return new Response(JSON.stringify(payload), { + status: init.status ?? 
200, + headers: { + "content-type": "application/json", + ...init.headers, + }, + }); +} + +test("listOrganizations serializes all supported organization filters", async () => { + const calls: RequestInfo[] = []; + const fetchImpl: FetchLike = async (input) => { + calls.push(input); + const payload: OrganizationListResponse = { + success: true, + data: [], + errors: null, + meta: { + pagination: { + page: 2, + page_size: 50, + total_count: 0, + total_pages: 0, + has_next: false, + has_previous: true, + }, + }, + }; + return jsonResponse(payload); + }; + const client = new MostovikOrganizationsClient({ + baseUrl: "https://api.example.test/", + fetch: fetchImpl, + headers: { Authorization: "Bearer static-token" }, + }); + + const response = await client.listOrganizations({ + page: 2, + page_size: 50, + search: "мост", + ordering: "-name", + name: "Северный", + inn: "7711111111", + kpp: "771101001", + ogrn: "1027700132111", + ogrip: "304500116000157", + registry: "8e7a3cb8-6fb2-43a8-b847-62d84ea9f34f", + registry_name: "Росатом", + has_registry: false, + has_industrial: true, + has_industrial_products: false, + has_manufactures: true, + has_inspections: false, + has_procurements: true, + has_procurements_44fz: false, + has_procurements_223fz: true, + has_contracts: false, + has_unfair_suppliers: true, + has_fas_goz: false, + has_arbitration: true, + has_fedresurs_bankruptcy: false, + has_fstec: true, + has_vacancies: true, + has_trudvsem: false, + has_fns_reports: false, + data: ["industrial", "fns_reports"], + exclude_data_sources: "vacancies", + }); + + assert.equal(response.success, true); + assert.equal(calls.length, 1); + + const url = new URL(String(calls[0])); + assert.equal(url.origin, "https://api.example.test"); + assert.equal(url.pathname, "/api/v2/organizations/"); + assert.equal(url.searchParams.get("page"), "2"); + assert.equal(url.searchParams.get("page_size"), "50"); + assert.equal(url.searchParams.get("ordering"), "-name"); + 
assert.equal(url.searchParams.get("has_registry"), "false"); + assert.deepEqual(url.searchParams.getAll("data"), ["industrial", "fns_reports"]); + assert.equal(url.searchParams.get("exclude_data_sources"), "vacancies"); + assert.equal(url.searchParams.get("has_vacancies"), "true"); + assert.equal(url.searchParams.get("has_trudvsem"), "false"); +}); + +test("getOrganization requests detail endpoint and returns typed source data", async () => { + const calls: RequestInfo[] = []; + const uid = "5dc142b3-dcf4-4b90-807a-a80e883dd05c"; + const payload: Organization = { + uid, + name: "ООО \"Данные\"", + normalized_name: "ООО \"Данные\"", + inn: "7777777777", + kpp: "777701001", + ogrn: "1027700132777", + ogrip: "", + registries: [ + { + id: "0b85bf08-c6d9-4c07-98c3-a057630e3a35", + name: "Росатом ГОЗ", + }, + ], + data_sources: [ + { + source: "industrial", + count: 1, + }, + { + source: "fns_reports", + count: 1, + }, + ], + data: { + industrial: [ + { + id: 1, + load_batch: 10, + issue_date: "01.01.2025", + issue_date_normalized: "2025-01-01", + certificate_number: "CERT-1", + expiry_date: "", + expiry_date_normalized: null, + certificate_file_url: "https://example.test/cert.pdf", + organisation_name: "ООО \"Данные\"", + inn: "7777777777", + ogrn: "1027700132777", + registry_organization: null, + created_at: "2026-05-01T10:00:00Z", + updated_at: "2026-05-01T10:00:00Z", + }, + ], + fns_reports: [ + { + id: 2, + external_id: "fin-1", + ogrn: "1027700132777", + registry_organization: null, + file_name: "fin.xlsx", + file_hash: "a".repeat(64), + load_batch: 11, + status: "success", + source: "api", + error_message: "", + created_at: "2026-05-01T10:00:00Z", + updated_at: "2026-05-01T10:00:00Z", + lines_count: 1, + lines: { + "2024": { + active: { + "1100": { + form_code: "1", + name: "Внеоборотные активы", + period_start: 100, + period_end: 200, + }, + }, + }, + }, + }, + ], + }, + }; + const fetchImpl: FetchLike = async (input) => { + calls.push(input); + return 
jsonResponse(payload); + }; + const client = new MostovikOrganizationsClient({ + baseUrl: "https://api.example.test", + fetch: fetchImpl, + }); + + const organization = await client.getOrganization(uid, { + data_sources: ["industrial", "fns_reports"], + }); + + assert.equal(organization.uid, uid); + assert.equal(organization.data.industrial?.[0]?.certificate_number, "CERT-1"); + assert.equal( + organization.data.fns_reports?.[0]?.lines["2024"]?.active?.["1100"]?.period_end, + 200, + ); + const url = new URL(String(calls[0])); + assert.equal(url.pathname, `/api/v2/organizations/${uid}/`); + assert.deepEqual(url.searchParams.getAll("data_sources"), [ + "industrial", + "fns_reports", + ]); +}); + +test("throws ApiClientError with status and payload for non-2xx responses", async () => { + const errorPayload: ApiErrorPayload = { + success: false, + data: null, + errors: [ + { + code: "validation_error", + message: "Validation failed", + details: { + fields: { + data: ["Unknown data source(s): unknown"], + }, + }, + }, + ], + meta: { + request_id: "test-request-id", + }, + }; + const fetchImpl: FetchLike = async () => + jsonResponse(errorPayload, { status: 400 }); + const client = new MostovikOrganizationsClient({ + baseUrl: "https://api.example.test", + fetch: fetchImpl, + }); + + await assert.rejects( + () => client.getOrganization("uid", { data: "industrial" }), + (error: unknown) => { + assert.equal(error instanceof Error, true); + assert.equal((error as { name?: string }).name, "ApiClientError"); + assert.equal((error as { status?: number }).status, 400); + assert.deepEqual((error as { payload?: ApiErrorPayload }).payload, errorPayload); + return true; + }, + ); +}); + +test("generic source payload is represented as a typed JSON object", () => { + const payload: GenericSourcePayload = { + provider: "checko", + target: { + inn: "7711111199", + ogrn: "1027700132199", + }, + suppliers: [ + { + inn: "7722222299", + amount: 1000, + }, + ], + }; + + 
assert.equal(payload.provider, "checko"); +}); diff --git a/ts_client/tsconfig.json b/ts_client/tsconfig.json new file mode 100644 index 0000000..0053ae6 --- /dev/null +++ b/ts_client/tsconfig.json @@ -0,0 +1,24 @@ +{ + "compilerOptions": { + "target": "ES2022", + "module": "NodeNext", + "moduleResolution": "NodeNext", + "lib": [ + "ES2022", + "DOM" + ], + "rootDir": ".", + "outDir": "dist", + "declaration": true, + "strict": true, + "noUncheckedIndexedAccess": true, + "exactOptionalPropertyTypes": true, + "esModuleInterop": true, + "forceConsistentCasingInFileNames": true, + "skipLibCheck": true + }, + "include": [ + "src/**/*.ts", + "test/**/*.ts" + ] +} diff --git a/uv.lock b/uv.lock index 851707c..2ecb86e 100644 --- a/uv.lock +++ b/uv.lock @@ -632,6 +632,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/95/40/fba02261d88173a2e27b058afe4cbdecde35caa53c22f0a2e4c1a194ee02/django_jazzmin-2.6.2-py3-none-any.whl", hash = "sha256:f7eb509b8a2e92260c2e836dc856fc9ca8888e572094201f9021d7b620bb96b2", size = 2832680, upload-time = "2024-04-18T19:19:34.954Z" }, ] +[[package]] +name = "django-polymorphic" +version = "3.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "django" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/4e/06/f43d3d7e690a6bd90c0e300d824f5aad0e9840cfd8d5bb164fd06ef6bcfc/django-polymorphic-3.1.0.tar.gz", hash = "sha256:d6955b5308bf6e41dcb22ba7c96f00b51dfa497a8a5ab1e9c06c7951bf417bf8", size = 50106, upload-time = "2021-11-18T11:55:40.825Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/86/f7/e87f43be83760793fc3baeed499839b1580fda863dd30441c1661ff2fd96/django_polymorphic-3.1.0-py3-none-any.whl", hash = "sha256:08bc4f4f4a773a19b2deced5a56deddd1ef56ebd15207bf4052e2901c25ef57e", size = 63415, upload-time = "2021-11-18T11:55:38.663Z" }, +] + [[package]] name = "django-redis" version = "5.4.0" @@ -1376,6 +1388,7 @@ dependencies = [ { name = "django-cors-headers" }, { name = 
"django-filter" }, { name = "django-jazzmin" }, + { name = "django-polymorphic" }, { name = "django-redis" }, { name = "django-rest-swagger" }, { name = "djangorestframework" }, @@ -1500,6 +1513,7 @@ requires-dist = [ { name = "django-extensions", marker = "extra == 'dev'", specifier = "==3.2.3" }, { name = "django-filter", specifier = "==23.5" }, { name = "django-jazzmin", specifier = ">=2.6.2" }, + { name = "django-polymorphic", specifier = ">=3.1,<4.0" }, { name = "django-redis", specifier = "==5.4.0" }, { name = "django-rest-swagger", specifier = ">=2.2.0" }, { name = "djangorestframework", specifier = "==3.14.0" },