From c0957c5f5c95c198f5a7ed0e721a2274dd47cd7f Mon Sep 17 00:00:00 2001 From: Aleksandr Meshchriakov Date: Mon, 27 Apr 2026 23:36:28 +0200 Subject: [PATCH] feat: add parser source dashboard and scheduling --- .env.example | 10 +- .gitea/workflows/ci-cd.yml | 19 +- deploy/scripts/deploy.sh | 14 +- deploy/systemd/celery-beat.service | 3 +- deploy/systemd/celery-worker.service | 10 +- deploy/systemd/gunicorn.service | 4 +- docker-compose.yml | 3 + pyproject.toml | 4 +- src/apps/backups/__init__.py | 1 + src/apps/backups/admin.py | 34 + src/apps/backups/apps.py | 9 + src/apps/backups/migrations/0001_initial.py | 168 ++ src/apps/backups/migrations/__init__.py | 1 + src/apps/backups/models.py | 81 + src/apps/backups/serializers.py | 15 + src/apps/backups/services.py | 504 +++++ src/apps/backups/tasks.py | 117 + src/apps/backups/urls.py | 10 + src/apps/backups/views.py | 93 + src/apps/core/openapi.py | 2 +- src/apps/core/serializers.py | 10 +- src/apps/core/services.py | 11 +- src/apps/core/views.py | 209 +- src/apps/exchange/__init__.py | 1 + src/apps/exchange/admin.py | 22 + src/apps/exchange/apps.py | 9 + src/apps/exchange/migrations/0001_initial.py | 100 + src/apps/exchange/migrations/__init__.py | 1 + src/apps/exchange/models.py | 94 + src/apps/exchange/serializers.py | 255 +++ src/apps/exchange/services.py | 711 ++++++ src/apps/exchange/tasks.py | 71 + src/apps/exchange/urls.py | 36 + src/apps/exchange/views.py | 250 +++ src/apps/parsers/admin.py | 79 + src/apps/parsers/api_result_urls.py | 159 ++ src/apps/parsers/clients/__init__.py | 5 + src/apps/parsers/clients/base.py | 152 +- src/apps/parsers/clients/common/__init__.py | 13 + src/apps/parsers/clients/common/schemas.py | 22 + src/apps/parsers/clients/common/structured.py | 1018 +++++++++ src/apps/parsers/clients/proverki/client.py | 15 +- src/apps/parsers/clients/trudvsem/__init__.py | 5 + src/apps/parsers/clients/trudvsem/client.py | 199 ++ src/apps/parsers/frontend_compat.py | 790 +++++++ 
.../0006_add_generic_parser_record.py | 64 + .../migrations/0007_parserbatchsequence.py | 29 + .../0008_seed_weekly_parser_schedules.py | 83 + src/apps/parsers/models.py | 150 ++ src/apps/parsers/serializers.py | 331 ++- src/apps/parsers/services.py | 392 +++- src/apps/parsers/source_registry.py | 294 +++ src/apps/parsers/tasks.py | 649 +++++- src/apps/parsers/urls.py | 28 +- src/apps/parsers/views.py | 1341 ++++++++++- src/apps/registers/__init__.py | 1 + src/apps/registers/admin.py | 64 + src/apps/registers/apps.py | 10 + src/apps/registers/migrations/0001_initial.py | 378 ++++ src/apps/registers/migrations/__init__.py | 1 + src/apps/registers/models.py | 195 ++ src/apps/registers/serializers.py | 150 ++ src/apps/registers/services.py | 452 ++++ src/apps/registers/tasks.py | 69 + src/apps/registers/urls.py | 28 + src/apps/registers/views.py | 286 +++ src/apps/user/models.py | 8 +- src/apps/user/serializers.py | 32 +- src/apps/user/urls.py | 3 +- src/apps/user/views.py | 162 +- src/config/api_v1_urls.py | 61 +- src/config/celery.py | 6 +- src/config/settings/base.py | 37 + src/config/settings/production.py | 49 +- src/config/settings/test.py | 4 + src/config/urls.py | 23 + src/templates/dashboard.html | 1985 +++++++++++++++++ tests/apps/backups/__init__.py | 1 + tests/apps/backups/test_services_views.py | 131 ++ tests/apps/core/test_openapi.py | 94 + tests/apps/core/test_services.py | 33 +- tests/apps/core/test_views.py | 127 +- tests/apps/exchange/__init__.py | 1 + tests/apps/exchange/factories.py | 19 + tests/apps/exchange/test_views.py | 167 ++ tests/apps/parsers/factories.py | 41 +- tests/apps/parsers/test_clients.py | 469 +++- tests/apps/parsers/test_models.py | 28 +- tests/apps/parsers/test_services.py | 196 +- tests/apps/parsers/test_tasks.py | 204 ++ tests/apps/parsers/test_views.py | 753 +++++++ tests/apps/registers/__init__.py | 1 + tests/apps/registers/factories.py | 61 + tests/apps/registers/test_views.py | 155 ++ tests/apps/user/test_serializers.py | 
30 +- tests/apps/user/test_views.py | 63 +- 96 files changed, 15012 insertions(+), 266 deletions(-) create mode 100644 src/apps/backups/__init__.py create mode 100644 src/apps/backups/admin.py create mode 100644 src/apps/backups/apps.py create mode 100644 src/apps/backups/migrations/0001_initial.py create mode 100644 src/apps/backups/migrations/__init__.py create mode 100644 src/apps/backups/models.py create mode 100644 src/apps/backups/serializers.py create mode 100644 src/apps/backups/services.py create mode 100644 src/apps/backups/tasks.py create mode 100644 src/apps/backups/urls.py create mode 100644 src/apps/backups/views.py create mode 100644 src/apps/exchange/__init__.py create mode 100644 src/apps/exchange/admin.py create mode 100644 src/apps/exchange/apps.py create mode 100644 src/apps/exchange/migrations/0001_initial.py create mode 100644 src/apps/exchange/migrations/__init__.py create mode 100644 src/apps/exchange/models.py create mode 100644 src/apps/exchange/serializers.py create mode 100644 src/apps/exchange/services.py create mode 100644 src/apps/exchange/tasks.py create mode 100644 src/apps/exchange/urls.py create mode 100644 src/apps/exchange/views.py create mode 100644 src/apps/parsers/api_result_urls.py create mode 100644 src/apps/parsers/clients/common/__init__.py create mode 100644 src/apps/parsers/clients/common/schemas.py create mode 100644 src/apps/parsers/clients/common/structured.py create mode 100644 src/apps/parsers/clients/trudvsem/__init__.py create mode 100644 src/apps/parsers/clients/trudvsem/client.py create mode 100644 src/apps/parsers/frontend_compat.py create mode 100644 src/apps/parsers/migrations/0006_add_generic_parser_record.py create mode 100644 src/apps/parsers/migrations/0007_parserbatchsequence.py create mode 100644 src/apps/parsers/migrations/0008_seed_weekly_parser_schedules.py create mode 100644 src/apps/parsers/source_registry.py create mode 100644 src/apps/registers/__init__.py create mode 100644 
src/apps/registers/admin.py create mode 100644 src/apps/registers/apps.py create mode 100644 src/apps/registers/migrations/0001_initial.py create mode 100644 src/apps/registers/migrations/__init__.py create mode 100644 src/apps/registers/models.py create mode 100644 src/apps/registers/serializers.py create mode 100644 src/apps/registers/services.py create mode 100644 src/apps/registers/tasks.py create mode 100644 src/apps/registers/urls.py create mode 100644 src/apps/registers/views.py create mode 100644 src/templates/dashboard.html create mode 100644 tests/apps/backups/__init__.py create mode 100644 tests/apps/backups/test_services_views.py create mode 100644 tests/apps/exchange/__init__.py create mode 100644 tests/apps/exchange/factories.py create mode 100644 tests/apps/exchange/test_views.py create mode 100644 tests/apps/parsers/test_tasks.py create mode 100644 tests/apps/parsers/test_views.py create mode 100644 tests/apps/registers/__init__.py create mode 100644 tests/apps/registers/factories.py create mode 100644 tests/apps/registers/test_views.py diff --git a/.env.example b/.env.example index f3fe2f2..341f9f6 100644 --- a/.env.example +++ b/.env.example @@ -2,6 +2,7 @@ # Скопируйте этот файл в .env и измените значения по необходимости # Django Settings +DJANGO_SETTINGS_MODULE=config.settings.development DEBUG=True SECRET_KEY=django-insecure-development-key-change-in-production ALLOWED_HOSTS=localhost,127.0.0.1,0.0.0.0 @@ -21,6 +22,13 @@ REDIS_CACHE_URL=redis://localhost:6379/1 CELERY_BROKER_URL=redis://localhost:6379/0 CELERY_RESULT_BACKEND=redis://localhost:6379/0 +# Exchange / backup export +EXCHANGE_CREDENTIALS_ENCRYPTION_KEY=change-me-for-exchange-passwords +# 32 bytes encoded with base64-url; generate and keep stable for decrypting .bin exports. 
+BACKUP_ENCRYPTION_KEY= +BACKUP_KEY_ID=default +BACKUP_EXPORT_DIRECTORY=/tmp/mostovik-backups + # CORS Settings CORS_ALLOWED_ORIGINS=http://localhost:3000,http://127.0.0.1:3000 @@ -28,4 +36,4 @@ CORS_ALLOWED_ORIGINS=http://localhost:3000,http://127.0.0.1:3000 LOG_LEVEL=INFO # Scrapy Settings -SCRAPY_LOG_LEVEL=INFO \ No newline at end of file +SCRAPY_LOG_LEVEL=INFO diff --git a/.gitea/workflows/ci-cd.yml b/.gitea/workflows/ci-cd.yml index 971d6f7..b1fc3ba 100644 --- a/.gitea/workflows/ci-cd.yml +++ b/.gitea/workflows/ci-cd.yml @@ -2,14 +2,15 @@ name: CI/CD Pipeline on: push: - branches: [ main, develop ] + branches: [ main, dev ] pull_request: - branches: [ main, develop ] + branches: [ main, dev ] jobs: lint: name: Code Quality Checks runs-on: ubuntu-latest + timeout-minutes: 15 steps: - name: Checkout code @@ -21,9 +22,7 @@ jobs: python-version: '3.11' - name: Install uv - run: | - curl -LsSf https://astral.sh/uv/install.sh | sh - echo "$HOME/.local/bin" >> $GITHUB_PATH + run: python -m pip install --upgrade pip uv - name: Create virtual environment run: uv venv @@ -46,6 +45,7 @@ jobs: test: name: Run Tests runs-on: ubuntu-latest + timeout-minutes: 20 services: postgres: image: postgres:15.10 @@ -81,9 +81,7 @@ jobs: python-version: '3.11' - name: Install uv - run: | - curl -LsSf https://astral.sh/uv/install.sh | sh - echo "$HOME/.local/bin" >> $GITHUB_PATH + run: python -m pip install --upgrade pip uv - name: Create virtual environment run: uv venv @@ -113,7 +111,7 @@ jobs: cd src python manage.py test --verbosity=2 env: - DJANGO_SETTINGS_MODULE: config.settings.development + DJANGO_SETTINGS_MODULE: config.settings.test DATABASE_URL: postgres://postgres:postgres@localhost:5432/test_db REDIS_URL: redis://localhost:6379/0 CELERY_BROKER_URL: redis://localhost:6379/0 @@ -122,6 +120,7 @@ jobs: build: name: Build Docker Images runs-on: ubuntu-latest + timeout-minutes: 20 needs: [lint, test] steps: @@ -181,7 +180,7 @@ jobs: name: Push to Gitea Registry runs-on: 
ubuntu-latest needs: [build] - if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/develop' + if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/dev' steps: - name: Checkout code diff --git a/deploy/scripts/deploy.sh b/deploy/scripts/deploy.sh index 9403f3f..b009359 100644 --- a/deploy/scripts/deploy.sh +++ b/deploy/scripts/deploy.sh @@ -73,7 +73,10 @@ uv pip install -r requirements-dev.txt # Настройка переменных окружения echo "Настройка переменных окружения..." cp .env.example .env -# Здесь можно автоматически заполнить .env файл или запросить ввод +sed -i 's/^DJANGO_SETTINGS_MODULE=.*/DJANGO_SETTINGS_MODULE=config.settings.production/' .env +sed -i 's/^DEBUG=.*/DEBUG=False/' .env +export DJANGO_SETTINGS_MODULE=config.settings.production +# Здесь нужно заполнить SECRET_KEY, ALLOWED_HOSTS и доступы к БД перед запуском # Настройка базы данных echo "Настройка базы данных..." @@ -88,9 +91,10 @@ python manage.py makemigrations python manage.py migrate python manage.py collectstatic --noinput -# Создание суперпользователя (опционально) -echo "Создание суперпользователя..." -echo "from django.contrib.auth import get_user_model; User = get_user_model(); User.objects.create_superuser('admin', 'admin@example.com', 'adminpass') if not User.objects.filter(username='admin').exists() else None" | python manage.py shell +# Создание суперпользователя выполняется только через переменные окружения Django. +if [ "${CREATE_DJANGO_SUPERUSER:-false}" = "true" ]; then + python manage.py createsuperuser --noinput +fi # Настройка systemd сервисов echo "Настройка systemd сервисов..." 
@@ -124,4 +128,4 @@ systemctl restart apache2 echo "=== Развертывание завершено успешно ===" echo "Проект доступен по адресу: https://ваш-ip-адрес" echo "Админка Django: https://ваш-ip-адрес/admin/" -echo "API документация: https://ваш-ip-адрес/api/" \ No newline at end of file +echo "API документация: https://ваш-ip-адрес/api/" diff --git a/deploy/systemd/celery-beat.service b/deploy/systemd/celery-beat.service index f54f84b..2ccbc36 100644 --- a/deploy/systemd/celery-beat.service +++ b/deploy/systemd/celery-beat.service @@ -7,9 +7,10 @@ Type=simple User=www-data Group=www-data EnvironmentFile=/var/www/project/.env +Environment=DJANGO_SETTINGS_MODULE=config.settings.production WorkingDirectory=/var/www/project/src ExecStart=/var/www/project/venv/bin/celery -A config beat --loglevel=INFO --scheduler django_celery_beat.schedulers:DatabaseScheduler Restart=always [Install] -WantedBy=multi-user.target \ No newline at end of file +WantedBy=multi-user.target diff --git a/deploy/systemd/celery-worker.service b/deploy/systemd/celery-worker.service index a7cabb0..7dc37ed 100644 --- a/deploy/systemd/celery-worker.service +++ b/deploy/systemd/celery-worker.service @@ -3,14 +3,16 @@ Description=Celery Worker for Django project After=network.target redis.service postgresql.service [Service] -Type=forking +Type=simple User=www-data Group=www-data EnvironmentFile=/var/www/project/.env +Environment=DJANGO_SETTINGS_MODULE=config.settings.production +RuntimeDirectory=celery WorkingDirectory=/var/www/project/src -ExecStart=/var/www/project/venv/bin/celery -A config worker --loglevel=INFO --pidfile=/run/celery/worker.pid +ExecStart=/var/www/project/venv/bin/celery -A config worker --loglevel=INFO ExecReload=/bin/kill -HUP $MAINPID -PIDFile=/run/celery/worker.pid +Restart=always [Install] -WantedBy=multi-user.target \ No newline at end of file +WantedBy=multi-user.target diff --git a/deploy/systemd/gunicorn.service b/deploy/systemd/gunicorn.service index 20b059f..5fb8831 100644 --- 
a/deploy/systemd/gunicorn.service +++ b/deploy/systemd/gunicorn.service @@ -6,6 +6,8 @@ After=network.target Type=notify User=www-data Group=www-data +EnvironmentFile=/var/www/project/.env +Environment=DJANGO_SETTINGS_MODULE=config.settings.production RuntimeDirectory=gunicorn WorkingDirectory=/var/www/project/src ExecStart=/var/www/project/venv/bin/gunicorn config.wsgi:application \ @@ -24,4 +26,4 @@ TimeoutStopSec=5 PrivateTmp=true [Install] -WantedBy=multi-user.target \ No newline at end of file +WantedBy=multi-user.target diff --git a/docker-compose.yml b/docker-compose.yml index a76721b..fdef50a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -48,6 +48,7 @@ services: redis: condition: service_healthy environment: + - DJANGO_SETTINGS_MODULE=config.settings.development - DEBUG=${DEBUG:-True} - SECRET_KEY=${SECRET_KEY:-django-insecure-development-key} - POSTGRES_HOST=db @@ -84,6 +85,7 @@ services: redis: condition: service_healthy environment: + - DJANGO_SETTINGS_MODULE=config.settings.development - DEBUG=${DEBUG:-True} - POSTGRES_HOST=db - POSTGRES_PORT=5432 @@ -112,6 +114,7 @@ services: redis: condition: service_healthy environment: + - DJANGO_SETTINGS_MODULE=config.settings.development - DEBUG=${DEBUG:-True} - POSTGRES_HOST=db - POSTGRES_PORT=5432 diff --git a/pyproject.toml b/pyproject.toml index 5ed9f1a..06674e4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -106,7 +106,7 @@ packages = ["src"] # ================================================================================== [tool.pytest.ini_options] DJANGO_SETTINGS_MODULE = "config.settings.test" -python_paths = ["src"] +pythonpath = ["src"] testpaths = ["tests"] addopts = [ "--verbose", @@ -127,6 +127,8 @@ markers = [ "serializers: marks tests for serializers", "services: marks tests for services", "factories: marks tests for factories", + "network: marks tests that require network access", + "e2e: marks end-to-end tests", ] filterwarnings = [ diff --git a/src/apps/backups/__init__.py 
b/src/apps/backups/__init__.py new file mode 100644 index 0000000..7ba1906 --- /dev/null +++ b/src/apps/backups/__init__.py @@ -0,0 +1 @@ +"""Приложение защищённых backup-экспортов.""" diff --git a/src/apps/backups/admin.py b/src/apps/backups/admin.py new file mode 100644 index 0000000..de0d8ab --- /dev/null +++ b/src/apps/backups/admin.py @@ -0,0 +1,34 @@ +"""Admin для приложения backups.""" + +from apps.backups.models import BackupExportJob +from django.contrib import admin + + +@admin.register(BackupExportJob) +class BackupExportJobAdmin(admin.ModelAdmin): + """Admin для backup-задач.""" + + list_display = ( + "actual_date", + "registry", + "status", + "task_id", + "organizations_count", + "archive_size", + "created_at", + ) + list_filter = ("status", "registry", "actual_date") + search_fields = ("task_id", "archive_filename", "checksum_sha256", "registry__name") + readonly_fields = ( + "archive_path", + "archive_filename", + "checksum_filename", + "checksum_sha256", + "archive_size", + "organizations_count", + "error", + "started_at", + "completed_at", + "created_at", + "updated_at", + ) diff --git a/src/apps/backups/apps.py b/src/apps/backups/apps.py new file mode 100644 index 0000000..90070e7 --- /dev/null +++ b/src/apps/backups/apps.py @@ -0,0 +1,9 @@ +from django.apps import AppConfig + + +class BackupsConfig(AppConfig): + """Конфигурация приложения backup-экспорта.""" + + default_auto_field = "django.db.models.BigAutoField" + name = "apps.backups" + verbose_name = "Backup-экспорт" diff --git a/src/apps/backups/migrations/0001_initial.py b/src/apps/backups/migrations/0001_initial.py new file mode 100644 index 0000000..04cd6ea --- /dev/null +++ b/src/apps/backups/migrations/0001_initial.py @@ -0,0 +1,168 @@ +from django.conf import settings +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + initial = True + + dependencies = [ + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + 
("registers", "0001_initial"), + ] + + operations = [ + migrations.CreateModel( + name="BackupExportJob", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "created_at", + models.DateTimeField( + auto_now_add=True, + db_index=True, + help_text="Дата и время создания записи", + verbose_name="создано", + ), + ), + ( + "updated_at", + models.DateTimeField( + auto_now=True, + help_text="Дата и время последнего обновления", + verbose_name="обновлено", + ), + ), + ( + "actual_date", + models.DateField(db_index=True, verbose_name="дата актуальности"), + ), + ( + "status", + models.CharField( + choices=[ + ("pending", "Ожидает"), + ("started", "Выполняется"), + ("success", "Успешно"), + ("failure", "Ошибка"), + ], + db_index=True, + default="pending", + max_length=20, + verbose_name="статус", + ), + ), + ( + "task_id", + models.CharField( + blank=True, + max_length=255, + verbose_name="ID задачи Celery", + ), + ), + ( + "archive_path", + models.TextField(blank=True, verbose_name="путь к архиву"), + ), + ( + "archive_filename", + models.CharField( + blank=True, + max_length=255, + verbose_name="имя архива", + ), + ), + ( + "checksum_filename", + models.CharField( + blank=True, + max_length=255, + verbose_name="имя файла контрольной суммы", + ), + ), + ( + "checksum_sha256", + models.CharField( + blank=True, + max_length=64, + verbose_name="контрольная сумма SHA256", + ), + ), + ( + "archive_size", + models.PositiveIntegerField( + blank=True, + null=True, + verbose_name="размер архива", + ), + ), + ( + "organizations_count", + models.PositiveIntegerField( + default=0, + verbose_name="количество организаций", + ), + ), + ("error", models.TextField(blank=True, verbose_name="ошибка")), + ( + "started_at", + models.DateTimeField( + blank=True, + null=True, + verbose_name="время начала", + ), + ), + ( + "completed_at", + models.DateTimeField( + blank=True, + null=True, + 
verbose_name="время завершения", + ), + ), + ( + "registry", + models.ForeignKey( + blank=True, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="backup_export_jobs", + to="registers.register", + verbose_name="реестр", + ), + ), + ( + "requested_by", + models.ForeignKey( + blank=True, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="backup_export_jobs", + to=settings.AUTH_USER_MODEL, + verbose_name="запрошено пользователем", + ), + ), + ], + options={ + "verbose_name": "задача backup-экспорта", + "verbose_name_plural": "задачи backup-экспорта", + "db_table": "backups_export_job", + "ordering": ["-actual_date", "-created_at"], + }, + ), + migrations.AddConstraint( + model_name="backupexportjob", + constraint=models.UniqueConstraint( + fields=("actual_date", "registry"), + name="unique_backup_date_registry", + ), + ), + ] diff --git a/src/apps/backups/migrations/__init__.py b/src/apps/backups/migrations/__init__.py new file mode 100644 index 0000000..8ddb310 --- /dev/null +++ b/src/apps/backups/migrations/__init__.py @@ -0,0 +1 @@ +"""Миграции приложения backup-экспорта.""" diff --git a/src/apps/backups/models.py b/src/apps/backups/models.py new file mode 100644 index 0000000..fe38d2b --- /dev/null +++ b/src/apps/backups/models.py @@ -0,0 +1,81 @@ +"""Модели приложения backups.""" + +from apps.core.mixins import TimestampMixin +from apps.registers.models import Register +from django.conf import settings +from django.db import models +from django.utils.translation import gettext_lazy as _ + + +class BackupExportJob(TimestampMixin, models.Model): + """Задача формирования экспортного backup-архива.""" + + class Status(models.TextChoices): + PENDING = "pending", _("Ожидает") + STARTED = "started", _("Выполняется") + SUCCESS = "success", _("Успешно") + FAILURE = "failure", _("Ошибка") + + actual_date = models.DateField(_("дата актуальности"), db_index=True) + registry = models.ForeignKey( + Register, + 
on_delete=models.SET_NULL, + null=True, + blank=True, + related_name="backup_export_jobs", + verbose_name=_("реестр"), + ) + status = models.CharField( + _("статус"), + max_length=20, + choices=Status.choices, + default=Status.PENDING, + db_index=True, + ) + task_id = models.CharField(_("ID задачи Celery"), max_length=255, blank=True) + requested_by = models.ForeignKey( + settings.AUTH_USER_MODEL, + on_delete=models.SET_NULL, + null=True, + blank=True, + related_name="backup_export_jobs", + verbose_name=_("запрошено пользователем"), + ) + archive_path = models.TextField(_("путь к архиву"), blank=True) + archive_filename = models.CharField(_("имя архива"), max_length=255, blank=True) + checksum_filename = models.CharField( + _("имя файла контрольной суммы"), + max_length=255, + blank=True, + ) + checksum_sha256 = models.CharField( + _("контрольная сумма SHA256"), + max_length=64, + blank=True, + ) + archive_size = models.PositiveIntegerField( + _("размер архива"), null=True, blank=True + ) + organizations_count = models.PositiveIntegerField( + _("количество организаций"), + default=0, + ) + error = models.TextField(_("ошибка"), blank=True) + started_at = models.DateTimeField(_("время начала"), null=True, blank=True) + completed_at = models.DateTimeField(_("время завершения"), null=True, blank=True) + + class Meta: + db_table = "backups_export_job" + verbose_name = _("задача backup-экспорта") + verbose_name_plural = _("задачи backup-экспорта") + ordering = ["-actual_date", "-created_at"] + constraints = [ + models.UniqueConstraint( + fields=["actual_date", "registry"], + name="unique_backup_date_registry", + ) + ] + + def __str__(self) -> str: + registry = self.registry.name if self.registry_id else "all" + return f"Backup {registry} {self.actual_date} [{self.status}]" diff --git a/src/apps/backups/serializers.py b/src/apps/backups/serializers.py new file mode 100644 index 0000000..bef9b21 --- /dev/null +++ b/src/apps/backups/serializers.py @@ -0,0 +1,15 @@ 
+"""Сериализаторы API backup-экспорта.""" + +from apps.registers.models import Register +from rest_framework import serializers + + +class BackupExportRequestSerializer(serializers.Serializer): + """Параметры экспорта защищённого backup архива.""" + + actual_date = serializers.DateField(required=False) + registry = serializers.PrimaryKeyRelatedField( + queryset=Register.objects.all(), + required=False, + allow_null=True, + ) diff --git a/src/apps/backups/services.py b/src/apps/backups/services.py new file mode 100644 index 0000000..f9b162f --- /dev/null +++ b/src/apps/backups/services.py @@ -0,0 +1,504 @@ +"""Сервисы создания защищённых backup-архивов.""" + +from __future__ import annotations + +import base64 +import hashlib +import json +import os +import struct +import uuid +import zlib +from contextlib import suppress +from dataclasses import dataclass +from datetime import date, datetime +from decimal import Decimal +from io import BytesIO +from pathlib import Path +from uuid import UUID +from zipfile import ZIP_DEFLATED, ZipFile + +from apps.backups.models import BackupExportJob +from apps.parsers.models import ( + GenericParserRecord, + IndustrialCertificateRecord, + InspectionRecord, + ManufacturerRecord, + ParserLoadLog, +) +from apps.registers.models import ( + Organization, + Register, + RegisterUpload, + RegistryMembershipPeriod, +) +from cryptography.hazmat.primitives.ciphers.aead import AESGCM +from django.conf import settings +from django.db import IntegrityError, transaction +from django.db.models import Model, Q +from django.utils import timezone + + +class BackupExportError(ValueError): + """Ошибка формирования backup-архива.""" + + +@dataclass(frozen=True) +class BackupArtifact: + """Итоговый артефакт backup-экспорта.""" + + archive_bytes: bytes + archive_filename: str + bin_filename: str + checksum_filename: str + checksum_sha256: str + organizations_count: int + actual_date: date + + +@dataclass(frozen=True) +class BackupRequestResult: + 
"""Результат обработки запроса на экспорт backup.""" + + action: str + message: str + actual_date: date + task_id: str + + +class BackupExportService: + """Экспорт организаций реестра и связанных parser-данных в bin-архив.""" + + MAGIC = b"MSBK" + BIN_FORMAT_VERSION = 1 + AAD = b"mostovik-backup-v1" + + @classmethod + def build_backup_archive( + cls, + *, + actual_date: date | None = None, + registry: Register | None = None, + ) -> BackupArtifact: + snapshot_date = actual_date or timezone.localdate() + active_org_ids = cls._get_active_organization_ids(registry=registry) + if not active_org_ids: + raise BackupExportError("Нет организаций для экспорта") + + payload = cls._build_export_payload( + actual_date=snapshot_date, + active_org_ids=active_org_ids, + registry=registry, + ) + payload_bytes = cls._serialize_payload(payload) + compressed_payload = zlib.compress(payload_bytes, level=9) + encrypted_payload, crypto_header = cls._encrypt_payload(compressed_payload) + bin_bytes = cls._build_bin_container( + encrypted_payload=encrypted_payload, + header_payload=crypto_header + | { + "actual_date": snapshot_date.isoformat(), + "registry_id": str(registry.id) if registry else "", + "registry_name": registry.name if registry else "", + "organizations_count": len(active_org_ids), + "plaintext_sha256": hashlib.sha256(payload_bytes).hexdigest(), + "compressed_sha256": hashlib.sha256(compressed_payload).hexdigest(), + "ciphertext_sha256": hashlib.sha256(encrypted_payload).hexdigest(), + }, + ) + checksum_sha256 = hashlib.sha256(bin_bytes).hexdigest() + timestamp = timezone.now().strftime("%Y%m%d_%H%M%S") + prefix = "mostovik_registry_backup" + bin_filename = f"{prefix}_{timestamp}.bin" + checksum_filename = f"{bin_filename}.sha256" + archive_filename = f"{prefix}_{timestamp}.zip" + archive_bytes = cls._build_zip_archive( + bin_filename=bin_filename, + bin_bytes=bin_bytes, + checksum_filename=checksum_filename, + checksum_sha256=checksum_sha256, + ) + return BackupArtifact( + 
archive_bytes=archive_bytes, + archive_filename=archive_filename, + bin_filename=bin_filename, + checksum_filename=checksum_filename, + checksum_sha256=checksum_sha256, + organizations_count=len(active_org_ids), + actual_date=snapshot_date, + ) + + @classmethod + def _get_active_organization_ids(cls, *, registry: Register | None) -> list[int]: + queryset = RegistryMembershipPeriod.objects.all() + if registry: + queryset = queryset.filter(registry=registry) + return list(queryset.values_list("organization_id", flat=True).distinct()) + + @classmethod + def _build_export_payload( + cls, + *, + actual_date: date, + active_org_ids: list[int], + registry: Register | None, + ) -> dict: + organizations = Organization.objects.filter(id__in=active_org_ids).order_by( + "id" + ) + org_inns = [ + str(item) for item in organizations.values_list("mn_inn", flat=True) + ] + org_ogrns = [ + str(item) for item in organizations.values_list("mn_ogrn", flat=True) + ] + + memberships = RegistryMembershipPeriod.objects.filter( + organization_id__in=active_org_ids + ) + if registry: + memberships = memberships.filter(registry=registry) + register_ids = list( + memberships.values_list("registry_id", flat=True).distinct() + ) + + industrial = IndustrialCertificateRecord.objects.filter( + Q(inn__in=org_inns) | Q(ogrn__in=org_ogrns) + ).order_by("id") + manufacturers = ManufacturerRecord.objects.filter( + Q(inn__in=org_inns) | Q(ogrn__in=org_ogrns) + ).order_by("id") + inspections = InspectionRecord.objects.filter( + Q(inn__in=org_inns) | Q(ogrn__in=org_ogrns) + ).order_by("id") + generic = GenericParserRecord.objects.filter( + Q(inn__in=org_inns) | Q(ogrn__in=org_ogrns) + ).order_by("id") + load_logs = cls._related_load_logs( + industrial=industrial, + manufacturers=manufacturers, + inspections=inspections, + generic=generic, + ) + + export_map = { + Organization: organizations, + Register: Register.objects.filter(id__in=register_ids).order_by("name"), + RegisterUpload: 
RegisterUpload.objects.filter( + registry_id__in=register_ids + ).order_by("id"), + RegistryMembershipPeriod: memberships.order_by( + "registry_id", "organization_id" + ), + ParserLoadLog: load_logs, + IndustrialCertificateRecord: industrial, + ManufacturerRecord: manufacturers, + InspectionRecord: inspections, + GenericParserRecord: generic, + } + + schema: dict[str, dict] = {} + data: dict[str, list[dict]] = {} + for model, queryset in export_map.items(): + schema[model._meta.label] = cls._build_model_schema(model) + data[model._meta.label] = cls._serialize_queryset( + model=model, queryset=queryset + ) + + return { + "format": "mostovik-backup-payload", + "version": cls.BIN_FORMAT_VERSION, + "generated_at": timezone.now().isoformat(), + "actual_date": actual_date.isoformat(), + "registry_id": str(registry.id) if registry else "", + "registry_name": registry.name if registry else "", + "organizations_count": len(active_org_ids), + "schema": schema, + "data": data, + } + + @classmethod + def _related_load_logs(cls, *, industrial, manufacturers, inspections, generic): + query = Q(pk__isnull=True) + batches = [ + (ParserLoadLog.Source.INDUSTRIAL, industrial), + (ParserLoadLog.Source.MANUFACTURES, manufacturers), + (ParserLoadLog.Source.INSPECTIONS, inspections), + ] + for source, queryset in batches: + batch_ids = list(queryset.values_list("load_batch", flat=True).distinct()) + if batch_ids: + query |= Q(source=source, batch_id__in=batch_ids) + generic_pairs = list(generic.values_list("source", "load_batch").distinct()) + for source, batch_id in generic_pairs: + query |= Q(source=source, batch_id=batch_id) + return ParserLoadLog.objects.filter(query).order_by("source", "batch_id") + + @classmethod + def _build_model_schema(cls, model: type[Model]) -> dict: + return { + "app_label": model._meta.app_label, + "model_name": model._meta.model_name, + "db_table": model._meta.db_table, + "fields": [ + { + "name": field.name, + "attname": field.attname, + "column": 
field.column, + "type": field.get_internal_type(), + "null": field.null, + "primary_key": field.primary_key, + "is_relation": field.is_relation, + } + for field in model._meta.local_fields + ], + } + + @classmethod + def _serialize_queryset(cls, *, model: type[Model], queryset) -> list[dict]: + field_names = [field.attname for field in model._meta.local_fields] + return [ + {key: cls._normalize_value(value) for key, value in row.items()} + for row in queryset.values(*field_names).iterator(chunk_size=1000) + ] + + @classmethod + def _normalize_value(cls, value): + if isinstance(value, datetime | date): + return value.isoformat() + if isinstance(value, Decimal): + return str(value) + if isinstance(value, UUID): + return str(value) + if isinstance(value, bytes): + return { + "__type__": "bytes", + "base64": base64.b64encode(value).decode("ascii"), + } + return value + + @classmethod + def _serialize_payload(cls, payload: dict) -> bytes: + return json.dumps( + payload, + ensure_ascii=False, + sort_keys=True, + separators=(",", ":"), + ).encode("utf-8") + + @classmethod + def _encrypt_payload(cls, payload_bytes: bytes) -> tuple[bytes, dict]: + raw_key = cls._read_encryption_key() + nonce = os.urandom(12) + encrypted_payload = AESGCM(raw_key).encrypt(nonce, payload_bytes, cls.AAD) + return encrypted_payload, { + "algorithm": "AES-256-GCM", + "key_id": getattr(settings, "BACKUP_KEY_ID", "default"), + "nonce": base64.urlsafe_b64encode(nonce).decode("ascii"), + "aad": base64.urlsafe_b64encode(cls.AAD).decode("ascii"), + } + + @classmethod + def _read_encryption_key(cls) -> bytes: + key_raw = getattr(settings, "BACKUP_ENCRYPTION_KEY", "") + if not key_raw: + if getattr(settings, "DEBUG", False): + return hashlib.sha256(settings.SECRET_KEY.encode("utf-8")).digest() + raise BackupExportError("Не задан BACKUP_ENCRYPTION_KEY в настройках") + try: + normalized_key = key_raw + ("=" * (-len(key_raw) % 4)) + decoded_key = base64.urlsafe_b64decode(normalized_key) + except Exception 
@classmethod
@transaction.atomic
def check_or_start_job(
    cls,
    *,
    actual_date: date,
    registry: Register | None,
    requested_by_id: int | None,
) -> BackupRequestResult:
    """Report the backup job state for a date/registry, starting a job if needed.

    The returned ``BackupRequestResult.action`` is ``"wait"`` while a job is
    pending or started, ``"download"`` when a successful job still has its
    archive on disk, and ``"started"`` when a new job row was created here.
    A leftover job in any other state is deleted together with its archive
    file before the new row is inserted.  The Celery task is enqueued only
    after the surrounding transaction commits.

    Raises:
        IntegrityError: the insert conflicted with a concurrent request and
            the competing row is not in a state that can be reported.
    """
    job = cls._get_job_for_update(actual_date=actual_date, registry=registry)
    existing = cls._result_for_existing_job(actual_date=actual_date, job=job)
    if existing is not None:
        return existing
    if job:
        cls._cleanup_job_artifact(job)
        job.delete()
    try:
        # The nested atomic() is a savepoint.  If the INSERT violates a
        # constraint only the savepoint is rolled back, so the outer
        # transaction is still usable for the lookup in the handler below.
        # Without it, any query after the caught IntegrityError fails
        # because the enclosing atomic block is already marked broken.
        with transaction.atomic():
            new_job = BackupExportJob.objects.create(
                actual_date=actual_date,
                registry=registry,
                requested_by_id=requested_by_id,
                status=BackupExportJob.Status.PENDING,
            )
    except IntegrityError:
        # Another request created the row first: report that job instead.
        concurrent = cls._get_job_for_update(
            actual_date=actual_date, registry=registry
        )
        concurrent_result = cls._result_for_existing_job(
            actual_date=actual_date,
            job=concurrent,
        )
        if concurrent_result is not None:
            return concurrent_result
        raise
    # The task id is generated up front so it can be stored on the row and
    # returned to the caller before the task is actually published.
    task_id = str(uuid.uuid4())
    new_job.task_id = task_id
    new_job.save(update_fields=["task_id", "updated_at"])
    transaction.on_commit(
        lambda: cls._enqueue_backup_task(job_id=new_job.id, task_id=task_id)
    )
    return BackupRequestResult(
        action="started",
        message="Формирование backup запущено.",
        actual_date=actual_date,
        task_id=task_id,
    )
еще не готов") + if not cls._archive_exists(job): + job.delete() + raise BackupExportError( + "Файл backup отсутствует, запустите формирование снова" + ) + archive_path = Path(job.archive_path) + archive_bytes = archive_path.read_bytes() + archive_filename = job.archive_filename or archive_path.name + with suppress(Exception): + archive_path.unlink(missing_ok=True) + artifact = BackupArtifact( + archive_bytes=archive_bytes, + archive_filename=archive_filename, + bin_filename="", + checksum_filename=job.checksum_filename, + checksum_sha256=job.checksum_sha256 + or hashlib.sha256(archive_bytes).hexdigest(), + organizations_count=job.organizations_count, + actual_date=job.actual_date, + ) + job.delete() + return artifact + + @classmethod + def _get_job_for_update( + cls, + *, + actual_date: date, + registry: Register | None, + ) -> BackupExportJob | None: + return ( + BackupExportJob.objects.select_for_update() + .filter(actual_date=actual_date, registry=registry) + .first() + ) + + @staticmethod + def _archive_exists(job: BackupExportJob) -> bool: + return bool(job.archive_path) and Path(job.archive_path).is_file() + + @staticmethod + def _cleanup_job_artifact(job: BackupExportJob) -> None: + if job.archive_path: + with suppress(Exception): + Path(job.archive_path).unlink(missing_ok=True) diff --git a/src/apps/backups/tasks.py b/src/apps/backups/tasks.py new file mode 100644 index 0000000..b354c05 --- /dev/null +++ b/src/apps/backups/tasks.py @@ -0,0 +1,117 @@ +"""Celery-задачи приложения backups.""" + +from __future__ import annotations + +import logging +import traceback +import uuid +from pathlib import Path + +from apps.backups.models import BackupExportJob +from apps.backups.services import BackupExportService +from apps.core.services import BackgroundJobService +from celery import shared_task +from django.conf import settings +from django.utils import timezone + +logger = logging.getLogger(__name__) + + +def _resolve_backup_target_path(file_name: str) -> Path: 
@shared_task(bind=True)
def generate_backup_for_date(self, job_id: int) -> dict:
    """Build the backup archive described by a ``BackupExportJob`` row.

    Marks the job (and the matching ``BackgroundJob`` progress record, which
    is created when none exists for this task id) as started, builds the
    archive, writes it to the export directory and stores the file metadata
    on the job.

    Returns:
        ``{"status": "skipped", "reason": "job_not_found"}`` when the row
        does not exist, otherwise a summary dict with ``status="success"``.

    Raises:
        Exception: any failure is logged, recorded on both job records as a
            failure, and re-raised so Celery also sees the task as failed.
    """
    task_id = self.request.id
    job = BackupExportJob.objects.select_related("registry").filter(id=job_id).first()
    if job is None:
        return {"status": "skipped", "reason": "job_not_found"}

    background_job = BackgroundJobService.get_by_task_id_or_none(task_id)
    if background_job is None:
        background_job = BackgroundJobService.create_job(
            task_id=task_id,
            task_name="apps.backups.tasks.generate_backup_for_date",
            user_id=job.requested_by_id,
            meta={
                "actual_date": job.actual_date.isoformat(),
                "registry_id": str(job.registry_id) if job.registry_id else "",
            },
        )

    job.task_id = task_id
    job.status = BackupExportJob.Status.STARTED
    job.error = ""
    job.started_at = timezone.now()
    job.completed_at = None
    job.save(
        update_fields=[
            "task_id",
            "status",
            "error",
            "started_at",
            "completed_at",
            "updated_at",
        ]
    )
    background_job.mark_started()
    background_job.update_progress(10, "Подготовка backup-данных")

    # Remembered outside the try block so the failure path can remove a file
    # that was written (fully or partially) before the error occurred.
    archive_path: Path | None = None
    try:
        artifact = BackupExportService.build_backup_archive(
            actual_date=job.actual_date,
            registry=job.registry,
        )
        background_job.update_progress(70, "Запись архива")
        archive_path = _resolve_backup_target_path(artifact.archive_filename)
        archive_path.write_bytes(artifact.archive_bytes)

        job.status = BackupExportJob.Status.SUCCESS
        job.archive_path = str(archive_path)
        job.archive_filename = artifact.archive_filename
        job.checksum_filename = artifact.checksum_filename
        job.checksum_sha256 = artifact.checksum_sha256
        job.archive_size = len(artifact.archive_bytes)
        job.organizations_count = artifact.organizations_count
        job.completed_at = timezone.now()
        job.error = ""
        job.save(
            update_fields=[
                "status",
                "archive_path",
                "archive_filename",
                "checksum_filename",
                "checksum_sha256",
                "archive_size",
                "organizations_count",
                "completed_at",
                "error",
                "updated_at",
            ]
        )
        result = {
            "status": "success",
            "actual_date": job.actual_date.isoformat(),
            "registry_id": str(job.registry_id) if job.registry_id else "",
            "archive_filename": job.archive_filename,
            "checksum_sha256": job.checksum_sha256,
            "organizations_count": job.organizations_count,
        }
        background_job.complete(result=result)
        return result
    except Exception as exc:
        logger.exception("Backup export failed for job_id=%s", job.id)
        if archive_path is not None:
            # The job is about to be marked as failed, so nothing will ever
            # serve this file; the failure save below does not persist
            # ``archive_path``, so without this the file could be orphaned.
            try:
                archive_path.unlink(missing_ok=True)
            except OSError:
                logger.warning(
                    "Could not remove incomplete backup archive %s",
                    archive_path,
                    exc_info=True,
                )
        job.status = BackupExportJob.Status.FAILURE
        job.error = str(exc)
        job.completed_at = timezone.now()
        job.save(update_fields=["status", "error", "completed_at", "updated_at"])
        background_job.fail(error=str(exc), traceback_str=traceback.format_exc())
        raise
class BackupExportView(APIView):
    """Asynchronous export of an encrypted backup archive (admin only).

    A POST either starts/polls the export job (HTTP 202) or, once the job has
    finished, hands the ready ZIP to the caller (HTTP 200).
    """

    permission_classes = [IsAdminUser]

    @swagger_auto_schema(
        tags=[BACKUPS_TAG],
        operation_summary="Export encrypted registry backup",
        operation_description=(
            "Starts encrypted `.bin` archive generation for organizations in a "
            "registry and returns the ready ZIP on a repeated call after the "
            "Celery task has completed."
        ),
        request_body=BackupExportRequestSerializer,
        responses={
            200: openapi.Response(
                description="Ready backup archive",
                schema=openapi.Schema(type=openapi.TYPE_FILE),
            ),
            202: openapi.Response(
                description="Backup task queued or still running",
                schema=openapi.Schema(type=openapi.TYPE_OBJECT),
            ),
            400: "Validation error",
            403: "Admin permissions required",
        },
    )
    def post(self, request):
        """Start, poll, or download the backup for the requested date/registry."""
        serializer = BackupExportRequestSerializer(data=request.data)
        serializer.is_valid(raise_exception=True)
        params = serializer.validated_data
        # A missing/empty date means "today" in the active time zone.
        actual_date = params.get("actual_date") or timezone.localdate()
        registry = params.get("registry")
        requester_id = request.user.id if request.user.is_authenticated else None

        try:
            result = BackupExportJobService.check_or_start_job(
                actual_date=actual_date,
                registry=registry,
                requested_by_id=requester_id,
            )
        except BackupExportError as exc:
            raise ValidationError({"backup": str(exc)}) from exc

        if result.action in ("started", "wait"):
            pending_body = {
                "status": result.action,
                "message": result.message,
                "actual_date": result.actual_date.isoformat(),
                "registry_id": str(registry.id) if registry else "",
                "task_id": result.task_id,
            }
            return api_response(pending_body, status_code=status.HTTP_202_ACCEPTED)

        # Any other action means the archive is ready to be handed out.
        try:
            artifact = BackupExportJobService.consume_ready_archive(
                actual_date=result.actual_date,
                registry=registry,
            )
        except BackupExportError as exc:
            raise ValidationError({"backup": str(exc)}) from exc

        response = HttpResponse(
            artifact.archive_bytes,
            content_type="application/zip",
            status=status.HTTP_200_OK,
        )
        download_headers = {
            "Content-Disposition": f'attachment; filename="{artifact.archive_filename}"',
            "X-Backup-SHA256": artifact.checksum_sha256,
            "X-Backup-Checksum-File": artifact.checksum_filename,
            "X-Backup-Organizations": str(artifact.organizations_count),
            "X-Backup-Actual-Date": artifact.actual_date.isoformat(),
        }
        for header_name, header_value in download_headers.items():
            response[header_name] = header_value
        return response
diff --git a/src/apps/core/serializers.py b/src/apps/core/serializers.py index e4a13c9..3ed1a85 100644 --- a/src/apps/core/serializers.py +++ b/src/apps/core/serializers.py @@ -26,7 +26,7 @@ class BackgroundJobSerializer(serializers.Serializer): started_at = serializers.DateTimeField(read_only=True) completed_at = serializers.DateTimeField(read_only=True) created_at = serializers.DateTimeField(read_only=True) - duration = serializers.FloatField(read_only=True, source="duration") + duration = serializers.FloatField(read_only=True) # Вычисляемые поля is_finished = serializers.BooleanField(read_only=True) @@ -45,3 +45,11 @@ class BackgroundJobListSerializer(serializers.Serializer): progress = serializers.IntegerField(read_only=True) created_at = serializers.DateTimeField(read_only=True) is_finished = serializers.BooleanField(read_only=True) + + +class BackgroundJobListQuerySerializer(serializers.Serializer): + """Query-параметры списка фоновых задач.""" + + limit = serializers.IntegerField( + required=False, default=50, min_value=1, max_value=100 + ) diff --git a/src/apps/core/services.py b/src/apps/core/services.py index 48aeedf..1631419 100644 --- a/src/apps/core/services.py +++ b/src/apps/core/services.py @@ -262,6 +262,11 @@ class BulkOperationsMixin: Количество созданных записей """ total_created = 0 + if update_conflicts: + raise NotImplementedError( + "bulk_create(update_conflicts=...) 
@staticmethod
def _check_access(request: Request, job) -> Response | None:
    """Return a 403 response unless the caller is staff or owns the job.

    Returns ``None`` when access is allowed, so callers can simply return
    the value when it is not ``None``.
    """
    viewer = request.user
    if viewer.is_staff or job.user_id == viewer.id:
        return None
    return Response(
        {"detail": "Нет доступа к этой задаче"},
        status=status.HTTP_403_FORBIDDEN,
    )
class BackgroundJobStreamView(BackgroundJobStatusView):
    """Server-sent-events stream of a background job's status (legacy frontend API).

    The job is re-read every ``poll_interval_seconds``; an event is emitted
    only when status, progress, progress message or error changed since the
    previous poll.
    """

    poll_interval_seconds = 1.0

    @staticmethod
    def _build_sse_message(*, event: str, payload: dict[str, Any]) -> str:
        """Format one SSE frame: an ``event:`` line, a JSON ``data:`` line, blank line."""
        encoded = json.dumps(payload, ensure_ascii=False)
        return f"event: {event}\ndata: {encoded}\n\n"

    def _build_progress_payload(self, job) -> dict[str, Any]:
        """Payload of a ``progress`` event; the status is always reported as running."""
        payload: dict[str, Any] = {"task_id": job.task_id, "status": "running"}
        payload["progress"] = job.progress
        payload["message"] = job.progress_message
        return payload

    def _build_final_payload(self, job) -> tuple[str, dict[str, Any]]:
        """Return the (event name, payload) pair that closes the stream."""
        if not job.is_successful:
            failure_text = (
                job.error or job.progress_message or "Задача завершилась с ошибкой"
            )
            failure = {
                "task_id": job.task_id,
                "status": "error",
                "progress": job.progress,
                "message": failure_text,
            }
            return "failed", failure

        success = {
            "task_id": job.task_id,
            "status": "success",
            "progress": job.progress,
            "result": job.result,
        }
        return "completed", success

    def _event_stream(self, task_id: str):
        """Yield SSE frames until the job reports ``is_finished``."""
        from apps.core.services import BackgroundJobService

        previous: tuple[str, int, str, str] | None = None
        # NOTE(review): the loop has no upper time bound; a job that never
        # finishes presumably keeps this response open until the client
        # disconnects - confirm that is acceptable for the deployment.
        while True:
            job = BackgroundJobService.get_by_task_id(task_id)
            current = (job.status, job.progress, job.progress_message, job.error)

            if current != previous:
                previous = current
                if job.is_finished:
                    kind, body = self._build_final_payload(job)
                    yield self._build_sse_message(event=kind, payload=body)
                    return

                progress_body = self._build_progress_payload(job)
                yield self._build_sse_message(event="progress", payload=progress_body)

            time.sleep(self.poll_interval_seconds)

    @swagger_auto_schema(
        operation_summary="Поток статуса фоновой задачи",
        operation_description="SSE stream до завершения BackgroundJob.",
        tags=[JOBS_TAG],
        responses={200: "SSE stream"},
    )
    def get(self, request: Request, task_id: str) -> StreamingHttpResponse | Response:
        """Open the SSE stream for ``task_id`` after the access check passes."""
        from apps.core.services import BackgroundJobService

        job = BackgroundJobService.get_by_task_id(task_id)
        denied = self._check_access(request, job)
        if denied is not None:
            return denied

        stream = StreamingHttpResponse(
            self._event_stream(task_id),
            content_type="text/event-stream",
        )
        # Ask caches and buffering reverse proxies to pass events through
        # immediately instead of holding them back.
        stream["Cache-Control"] = "no-cache"
        stream["X-Accel-Buffering"] = "no"
        return stream
class BackgroundJobControlView(APIView):
    """Manual control of a background job; only ``action=revoke`` is supported."""

    from rest_framework.permissions import IsAuthenticated

    permission_classes = [IsAuthenticated]

    @swagger_auto_schema(
        operation_summary="Управление фоновой задачей",
        operation_description="Сейчас поддерживается только action=revoke.",
        request_body=openapi.Schema(
            type=openapi.TYPE_OBJECT,
            properties={
                "action": openapi.Schema(
                    type=openapi.TYPE_STRING,
                    enum=["revoke"],
                    default="revoke",
                )
            },
        ),
        tags=[JOBS_TAG],
        responses={200: "BackgroundJob", 400: "Задачу нельзя отменить"},
    )
    def post(self, request: Request, task_id: str) -> Response:
        """Revoke the Celery task and update the local job record.

        Responds with 400 for an unsupported action or an already finished
        job, 403 when the caller is neither staff nor the job owner, and
        otherwise with the serialized job after it has been revoked.
        """
        from apps.core.models import JobStatus
        from apps.core.serializers import BackgroundJobSerializer
        from apps.core.services import BackgroundJobService

        action = request.data.get("action", "revoke")
        if action != "revoke":
            return Response(
                {"detail": "Поддерживается только action=revoke"},
                status=status.HTTP_400_BAD_REQUEST,
            )

        job = BackgroundJobService.get_by_task_id(task_id)
        # Reuse the owner/staff rule of the status endpoints so the access
        # policy is defined in exactly one place in this module.
        access_error = BackgroundJobStatusView._check_access(request, job)
        if access_error is not None:
            return access_error
        if job.status in (JobStatus.SUCCESS, JobStatus.FAILURE, JobStatus.REVOKED):
            return Response(
                {"detail": "Задача уже завершена"},
                status=status.HTTP_400_BAD_REQUEST,
            )

        current_app.control.revoke(job.task_id, terminate=True)
        job.revoke()
        serializer = BackgroundJobSerializer(job)
        return Response(serializer.data)
"created_at", "updated_at") diff --git a/src/apps/exchange/apps.py b/src/apps/exchange/apps.py new file mode 100644 index 0000000..ef4649c --- /dev/null +++ b/src/apps/exchange/apps.py @@ -0,0 +1,9 @@ +from django.apps import AppConfig + + +class ExchangeConfig(AppConfig): + """Конфигурация приложения обмена.""" + + default_auto_field = "django.db.models.BigAutoField" + name = "apps.exchange" + verbose_name = "Обмен с внешней БД" diff --git a/src/apps/exchange/migrations/0001_initial.py b/src/apps/exchange/migrations/0001_initial.py new file mode 100644 index 0000000..40f44d5 --- /dev/null +++ b/src/apps/exchange/migrations/0001_initial.py @@ -0,0 +1,100 @@ +from django.db import migrations, models + + +class Migration(migrations.Migration): + initial = True + + dependencies = [] + + operations = [ + migrations.CreateModel( + name="ExchangeConnection", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "created_at", + models.DateTimeField( + auto_now_add=True, + db_index=True, + help_text="Дата и время создания записи", + verbose_name="создано", + ), + ), + ( + "updated_at", + models.DateTimeField( + auto_now=True, + help_text="Дата и время последнего обновления", + verbose_name="обновлено", + ), + ), + ("server", models.CharField(max_length=255, verbose_name="сервер")), + ("port", models.PositiveIntegerField(default=5432, verbose_name="порт")), + ( + "username", + models.CharField(max_length=255, verbose_name="пользователь"), + ), + ( + "password", + models.TextField( + help_text="Хранится зашифрованным", + verbose_name="пароль", + ), + ), + ( + "database_name", + models.CharField(max_length=255, verbose_name="имя БД"), + ), + ( + "schema_name", + models.CharField( + default="public", + max_length=255, + verbose_name="имя схемы", + ), + ), + ( + "is_active", + models.BooleanField( + db_index=True, + default=False, + verbose_name="активное", + ), + ), + ( + 
"last_checked_at", + models.DateTimeField( + blank=True, + null=True, + verbose_name="последняя проверка", + ), + ), + ( + "last_error", + models.TextField(blank=True, verbose_name="последняя ошибка"), + ), + ], + options={ + "verbose_name": "подключение обмена", + "verbose_name_plural": "подключения обмена", + "db_table": "exchange_connection", + "ordering": ["-is_active", "-created_at"], + }, + ), + migrations.AddConstraint( + model_name="exchangeconnection", + constraint=models.UniqueConstraint( + condition=models.Q(is_active=True), + fields=("is_active",), + name="unique_active_exchange_connection", + ), + ), + ] diff --git a/src/apps/exchange/migrations/__init__.py b/src/apps/exchange/migrations/__init__.py new file mode 100644 index 0000000..abe9c56 --- /dev/null +++ b/src/apps/exchange/migrations/__init__.py @@ -0,0 +1 @@ +"""Миграции приложения обмена.""" diff --git a/src/apps/exchange/models.py b/src/apps/exchange/models.py new file mode 100644 index 0000000..5cf3874 --- /dev/null +++ b/src/apps/exchange/models.py @@ -0,0 +1,94 @@ +"""Модели приложения обмена данными.""" + +import base64 +import hashlib + +from apps.core.mixins import TimestampMixin +from cryptography.fernet import Fernet, InvalidToken +from django.conf import settings +from django.db import models +from django.db.models import Q +from django.utils.translation import gettext_lazy as _ + + +class ExchangeConnection(TimestampMixin, models.Model): + """Подключение к целевой БД для обмена данными.""" + + PASSWORD_PREFIX = "enc:v1:" # noqa: S105 + + server = models.CharField(_("сервер"), max_length=255) + port = models.PositiveIntegerField(_("порт"), default=5432) + username = models.CharField(_("пользователь"), max_length=255) + password = models.TextField(_("пароль"), help_text=_("Хранится зашифрованным")) + database_name = models.CharField(_("имя БД"), max_length=255) + schema_name = models.CharField(_("имя схемы"), max_length=255, default="public") + is_active = 
models.BooleanField(_("активное"), default=False, db_index=True) + last_checked_at = models.DateTimeField( + _("последняя проверка"), null=True, blank=True + ) + last_error = models.TextField(_("последняя ошибка"), blank=True) + + class Meta: + db_table = "exchange_connection" + verbose_name = _("подключение обмена") + verbose_name_plural = _("подключения обмена") + ordering = ["-is_active", "-created_at"] + constraints = [ + models.UniqueConstraint( + fields=["is_active"], + condition=Q(is_active=True), + name="unique_active_exchange_connection", + ) + ] + + def __str__(self) -> str: + return ( + f"{self.username}@{self.server}:{self.port}/" + f"{self.database_name}[{self.schema_name}]" + ) + + @classmethod + def _get_cipher(cls) -> Fernet: + secret_material = ( + getattr(settings, "EXCHANGE_CREDENTIALS_ENCRYPTION_KEY", "") + or settings.SECRET_KEY + ) + digest = hashlib.sha256(secret_material.encode("utf-8")).digest() + return Fernet(base64.urlsafe_b64encode(digest)) + + @classmethod + def is_password_encrypted(cls, value: str) -> bool: + return bool(value) and value.startswith(cls.PASSWORD_PREFIX) + + @classmethod + def encrypt_password(cls, raw_password: str) -> str: + encrypted = ( + cls._get_cipher().encrypt(raw_password.encode("utf-8")).decode("ascii") + ) + return f"{cls.PASSWORD_PREFIX}{encrypted}" + + @classmethod + def decrypt_password(cls, stored_password: str) -> str: + if not cls.is_password_encrypted(stored_password): + return stored_password + token = stored_password[len(cls.PASSWORD_PREFIX) :].encode("ascii") + try: + return cls._get_cipher().decrypt(token).decode("utf-8") + except InvalidToken as exc: + raise ValueError( + "Не удалось расшифровать пароль exchange connection" + ) from exc + + def get_decrypted_password(self) -> str: + return self.decrypt_password(self.password) + + def save(self, *args, **kwargs): + password_was_encrypted = False + if self.password and not self.is_password_encrypted(self.password): + self.password = 
def get_periodic_task_payload(task: "PeriodicTask") -> dict[str, Any]:
    """Extract the exchange payload stored in a periodic task's JSON kwargs.

    Returns the ``payload`` object from ``task.kwargs``.  An empty dict is
    returned when ``kwargs`` is empty, is not valid JSON, is valid JSON but
    not an object (for example ``[]`` or ``null``), or when ``payload`` is
    missing or not an object.
    """
    try:
        kwargs = json.loads(task.kwargs or "{}")
    except json.JSONDecodeError:
        return {}
    # Valid JSON is not necessarily an object; only a dict has ``.get``.
    if not isinstance(kwargs, dict):
        return {}
    payload = kwargs.get("payload")
    return payload if isinstance(payload, dict) else {}
class ExchangeConnectionCreateSerializer(serializers.Serializer):
    """Input data for creating the active exchange connection."""

    server = serializers.CharField(max_length=255)
    port = serializers.IntegerField(min_value=1, max_value=65535, default=5432)
    username = serializers.CharField(max_length=255)
    # Passwords must reach the database driver exactly as typed, so the
    # default whitespace trimming of CharField is switched off here.
    # NOTE(review): consider write_only=True as well if nothing reads the
    # password back from ``.data`` - the callers are not visible here.
    password = serializers.CharField(
        trim_whitespace=False,
        style={"input_type": "password"},
    )
    database_name = serializers.CharField(max_length=255)
    # The regex only admits plain identifiers (letter or underscore first,
    # then letters, digits, underscores).
    schema_name = serializers.RegexField(
        regex=r"^[A-Za-z_][A-Za-z0-9_]*$",
        max_length=255,
        default="public",
        error_messages={
            "invalid": "Имя схемы должно начинаться с буквы/_ и содержать буквы, цифры, _"
        },
    )
"crontab_hour", + "payload", + ] + read_only_fields = fields + + def get_schedule_type(self, obj: PeriodicTask) -> str | None: + if obj.interval_id: + return "interval" + if obj.crontab_id: + return "daily" + return None + + def get_interval_every(self, obj: PeriodicTask) -> int | None: + return obj.interval.every if obj.interval_id else None + + def get_interval_period(self, obj: PeriodicTask) -> str | None: + return obj.interval.period if obj.interval_id else None + + @staticmethod + def _coerce_crontab_number(value: str | None) -> int | None: + if value is None: + return None + return int(value) if str(value).isdigit() else None + + def get_crontab_minute(self, obj: PeriodicTask) -> int | None: + return ( + self._coerce_crontab_number(obj.crontab.minute) if obj.crontab_id else None + ) + + def get_crontab_hour(self, obj: PeriodicTask) -> int | None: + return self._coerce_crontab_number(obj.crontab.hour) if obj.crontab_id else None + + def get_payload(self, obj: PeriodicTask) -> dict: + return get_periodic_task_payload(obj) + + +class ExchangePeriodicTaskUpsertSerializer(serializers.Serializer): + """Создание/изменение периодической задачи обмена.""" + + schedule_type = serializers.ChoiceField( + choices=["interval", "daily"], required=False + ) + mode = serializers.ChoiceField( + choices=["all", "single", "selected"], required=False + ) + table = serializers.CharField(required=False, allow_blank=True) + tables = serializers.ListField( + child=serializers.CharField(), + required=False, + allow_empty=False, + ) + interval_every = serializers.IntegerField(min_value=1, required=False) + interval_period = serializers.ChoiceField( + choices=[choice[0] for choice in IntervalSchedule.PERIOD_CHOICES], + required=False, + ) + crontab_minute = serializers.IntegerField(min_value=0, max_value=59, required=False) + crontab_hour = serializers.IntegerField(min_value=0, max_value=23, required=False) + truncate_before_copy = serializers.BooleanField(required=False) + enabled = 
serializers.BooleanField(required=False) + + def validate(self, attrs: dict[str, Any]) -> dict[str, Any]: + schedule_type = attrs.get("schedule_type") + if not schedule_type and self.instance: + schedule_type = "interval" if self.instance.interval_id else "daily" + if not schedule_type: + raise serializers.ValidationError({"schedule_type": "Нужно указать тип."}) + + attrs["name"] = ( + self.instance.name if self.instance else f"exchange-{uuid.uuid4()}" + ) + attrs["payload"] = validate_exchange_copy_payload(self._build_payload(attrs)) + attrs["schedule"] = self._build_schedule(attrs, schedule_type) + return attrs + + def _build_payload(self, attrs: dict[str, Any]) -> dict[str, Any]: + current_payload = ( + get_periodic_task_payload(self.instance) if self.instance else {} + ) + table = attrs.get("table", current_payload.get("table")) + if table == "": + table = None + return { + "mode": attrs.get("mode", current_payload.get("mode", "all")), + "table": table, + "tables": attrs.get("tables", current_payload.get("tables")), + "truncate_before_copy": attrs.get( + "truncate_before_copy", + current_payload.get("truncate_before_copy", True), + ), + } + + def _build_schedule( + self, attrs: dict[str, Any], schedule_type: str + ) -> dict[str, Any]: + if schedule_type == "interval": + current = ( + self.instance.interval + if self.instance and self.instance.interval_id + else None + ) + every = attrs.get("interval_every", current.every if current else None) + period = attrs.get("interval_period", current.period if current else None) + errors = {} + if every is None: + errors["interval_every"] = "Обязательное поле для interval." + if period is None: + errors["interval_period"] = "Обязательное поле для interval." 
+ if errors: + raise serializers.ValidationError(errors) + return {"type": "interval", "every": every, "period": period} + + current = ( + self.instance.crontab + if self.instance and self.instance.crontab_id + else None + ) + minute = attrs.get("crontab_minute", int(current.minute) if current else None) + hour = attrs.get("crontab_hour", int(current.hour) if current else None) + errors = {} + if minute is None: + errors["crontab_minute"] = "Обязательное поле для daily." + if hour is None: + errors["crontab_hour"] = "Обязательное поле для daily." + if errors: + raise serializers.ValidationError(errors) + return { + "type": "crontab", + "minute": str(minute), + "hour": str(hour), + "day_of_week": "*", + "day_of_month": "*", + "month_of_year": "*", + } diff --git a/src/apps/exchange/services.py b/src/apps/exchange/services.py new file mode 100644 index 0000000..d2f062f --- /dev/null +++ b/src/apps/exchange/services.py @@ -0,0 +1,711 @@ +"""Сервисы приложения обмена данными.""" + +from __future__ import annotations + +import json +from contextlib import suppress +from typing import Any + +from apps.exchange.models import ExchangeConnection +from django.apps import apps as django_apps +from django.conf import settings +from django.core.exceptions import ValidationError as DjangoValidationError +from django.db import IntegrityError, connections, transaction +from django.utils import timezone +from django_celery_beat.models import CrontabSchedule, IntervalSchedule, PeriodicTask + + +class ExchangeServiceError(ValueError): + """Ошибка операций приложения обмена.""" + + +class ExchangeConnectionService: + """Сервис управления подключениями и синхронизацией данных.""" + + MODEL_LABELS = [ + "registers.Register", + "registers.Organization", + "registers.RegisterUpload", + "registers.RegistryMembershipPeriod", + "parsers.ParserBatchSequence", + "parsers.ParserLoadLog", + "parsers.IndustrialCertificateRecord", + "parsers.ManufacturerRecord", + "parsers.InspectionRecord", + 
@classmethod
@transaction.atomic
def create_active_connection_and_prepare(cls, **payload) -> ExchangeConnection:
    """Create a new active connection and prepare the target database.

    All currently active connections are deactivated, a new active
    connection is created from ``payload``, and it is then validated with
    ``prepare_target=True``.

    Args:
        **payload: Field values passed to ``ExchangeConnection.objects.create``.

    Returns:
        The newly created active connection.

    Raises:
        ExchangeServiceError: Validation or preparation failed; the original
            exception is chained as the cause.
    """
    # Remember which connections were active so they can be re-activated if
    # the new connection turns out to be unusable.
    old_active_ids = list(
        ExchangeConnection.objects.filter(is_active=True).values_list(
            "id", flat=True
        )
    )
    ExchangeConnection.objects.filter(is_active=True).update(is_active=False)
    connection = ExchangeConnection.objects.create(is_active=True, **payload)
    try:
        cls.validate_saved_connection(connection, prepare_target=True)
    except Exception as exc:
        # Compensate manually: drop the new row and restore the old flags.
        # NOTE(review): the method runs under transaction.atomic and re-raises,
        # so these writes are presumably rolled back anyway - confirm whether
        # the manual compensation is still needed.
        connection.delete()
        ExchangeConnection.objects.filter(id__in=old_active_ids).update(
            is_active=True
        )
        raise ExchangeServiceError(str(exc)) from exc
    return connection
@classmethod
def validate_saved_connection(
    cls,
    connection: ExchangeConnection,
    *,
    models_to_copy: list | None = None,
    prepare_target: bool = False,
) -> ExchangeConnection:
    """Check connectivity and target structure for a stored connection.

    Optionally prepares the target structure first, then validates it. On
    success the connection's check timestamp is refreshed and its last error
    is cleared; the change is persisted only when the connection has a
    primary key.
    """
    # test_connection either returns a registered alias or raises after
    # cleaning up after itself, so cleanup here is needed only past this point.
    alias = cls.test_connection(connection)
    try:
        structure_kwargs = {
            "connection": connection,
            "alias": alias,
            "schema_name": connection.schema_name,
            "models_to_copy": models_to_copy,
        }
        if prepare_target:
            cls.prepare_target_structure(**structure_kwargs)
        cls.validate_target_structure(**structure_kwargs)
    finally:
        cls._cleanup_alias(alias)

    connection.last_checked_at = timezone.now()
    connection.last_error = ""
    if not connection.pk:
        return connection
    connection.save(update_fields=["last_checked_at", "last_error", "updated_at"])
    return connection
@classmethod
def copy_data(
    cls,
    *,
    connection: ExchangeConnection,
    mode: str,
    table: str | None = None,
    tables: list[str] | None = None,
    truncate_before_copy: bool = True,
) -> dict[str, Any]:
    """Copy data from the local database into the target database.

    The target structure is prepared and validated for the selected models,
    the target tables are optionally truncated, and each model is then
    copied in turn.

    Args:
        connection: Connection describing the target database.
        mode: Selection mode passed to ``_resolve_models``.
        table: Table name used when ``mode`` is ``"single"``.
        tables: Table names used for the other non-``"all"`` modes.
        truncate_before_copy: Truncate the target tables before copying.

    Returns:
        A summary dict with ``mode``, ``tables``, ``rows_by_table``,
        ``total_rows`` and ``truncate_before_copy``.

    Raises:
        ExchangeServiceError: A requested table is unknown, or preparing or
            validating the target structure failed.
    """
    # Resolve the models first: an unknown table name must fail before an
    # alias is registered, otherwise that alias would never be cleaned up.
    selected_models = cls._resolve_models(mode=mode, table=table, tables=tables)
    alias = cls._configure_alias(connection)
    try:
        connections[alias].ensure_connection()
        cls.prepare_target_structure(
            connection=connection,
            alias=alias,
            schema_name=connection.schema_name,
            models_to_copy=selected_models,
        )
        cls.validate_target_structure(
            connection=connection,
            alias=alias,
            schema_name=connection.schema_name,
            models_to_copy=selected_models,
        )
        if truncate_before_copy:
            cls._truncate_tables(
                alias=alias,
                schema_name=connection.schema_name,
                models_to_copy=selected_models,
            )
        rows_by_table = {
            model._meta.db_table: cls._copy_model_data(
                model=model,
                alias=alias,
                truncate_before_copy=truncate_before_copy,
            )
            for model in selected_models
        }
        connection.last_checked_at = timezone.now()
        connection.last_error = ""
        connection.save(
            update_fields=["last_checked_at", "last_error", "updated_at"]
        )
        return {
            "mode": mode,
            "tables": list(rows_by_table.keys()),
            "rows_by_table": rows_by_table,
            "total_rows": sum(rows_by_table.values()),
            "truncate_before_copy": truncate_before_copy,
        }
    except Exception as exc:
        # prepare/validate already record ExchangeServiceError on the
        # connection; record only the remaining errors here.
        if not isinstance(exc, ExchangeServiceError):
            cls._mark_connection_error(connection, str(exc))
        raise
    finally:
        cls._cleanup_alias(alias)
@classmethod
def _create_schema_if_missing(cls, *, alias: str, schema_name: str) -> None:
    """Create ``schema_name`` in the database behind ``alias`` if it is absent."""
    target = connections[alias]
    quoted_schema = target.ops.quote_name(schema_name)
    with target.cursor() as cursor:
        cursor.execute(f"CREATE SCHEMA IF NOT EXISTS {quoted_schema}")
connection.cursor() as cursor: + cursor.execute( + """ + SELECT column_name + FROM information_schema.columns + WHERE table_schema = %s AND table_name = %s + """, + [schema_name, model._meta.db_table], + ) + existing = {row[0] for row in cursor.fetchall()} + qualified_table = cls._qualified_table_name( + connection=connection, + schema_name=schema_name, + table_name=model._meta.db_table, + ) + for field in model._meta.local_fields: + if field.column in existing: + continue + definition = cls._column_definition( + connection=connection, + field=field, + include_constraints=False, + ) + if definition: + cursor.execute( + f"ALTER TABLE {qualified_table} ADD COLUMN {definition}" + ) + + @staticmethod + def _column_definition(*, connection, field, include_constraints: bool) -> str: + db_type = field.db_type(connection) + if not db_type: + return "" + definition = f"{connection.ops.quote_name(field.column)} {db_type}" + if not include_constraints: + return definition + if field.primary_key: + definition += " PRIMARY KEY" + elif not field.null: + definition += " NOT NULL" + return definition + + @staticmethod + def _qualified_table_name(*, connection, schema_name: str, table_name: str) -> str: + quote = connection.ops.quote_name + return f"{quote(schema_name)}.{quote(table_name)}" + + @classmethod + def _validate_schema_exists(cls, *, alias: str, schema_name: str) -> None: + with connections[alias].cursor() as cursor: + cursor.execute( + "SELECT 1 FROM information_schema.schemata WHERE schema_name = %s", + [schema_name], + ) + if cursor.fetchone() is None: + raise ExchangeServiceError( + f"Схема '{schema_name}' отсутствует в целевой БД" + ) + + @classmethod + def _validate_tables_exist( + cls, *, alias: str, schema_name: str, models_to_copy: list + ): + expected = {model._meta.db_table for model in models_to_copy} + existing = cls._get_existing_tables(alias=alias, schema_name=schema_name) + missing = sorted(expected - existing) + if missing: + raise ExchangeServiceError( 
@classmethod
def _truncate_tables(
    cls,
    *,
    alias: str,
    schema_name: str,
    models_to_copy: list,
) -> None:
    """Truncate the target tables of ``models_to_copy`` in reverse order."""
    target = connections[alias]
    qualified_names = (
        cls._qualified_table_name(
            connection=target,
            schema_name=schema_name,
            table_name=model._meta.db_table,
        )
        for model in reversed(models_to_copy)
    )
    with target.cursor() as cursor:
        for qualified_name in qualified_names:
            cursor.execute(
                f"TRUNCATE TABLE {qualified_name} RESTART IDENTITY CASCADE"
            )
@classmethod
def _cleanup_alias(cls, alias: str) -> None:
    """Close and unregister the temporary database alias (best effort).

    Every step is attempted independently and failures are ignored, so the
    method is safe to call from ``finally`` blocks and for aliases that were
    only partially set up.
    """
    with suppress(Exception):
        connections[alias].close()
    with suppress(Exception):
        connections.databases.pop(alias, None)
    # Drop the cached connection wrapper through the handler's own
    # ``__delitem__``. The wrapper keeps the settings it was created with, so
    # a stale one would be reused if the same alias is configured again with
    # different credentials.
    with suppress(Exception):
        del connections[alias]
    # Fallback for handlers whose per-thread storage is a plain object.
    storage = getattr(connections, "_connections", None)
    if storage is not None and hasattr(storage, "__dict__"):
        storage.__dict__.pop(alias, None)
@classmethod
def _get_or_create_interval(cls, schedule: dict[str, Any]) -> IntervalSchedule:
    """Return an interval schedule matching ``schedule``, creating it if needed.

    Args:
        schedule: Mapping with the ``"every"`` and ``"period"`` keys.

    Returns:
        The matching interval with the lowest id, or a newly created one.
    """
    lookup = {"every": schedule["every"], "period": schedule["period"]}
    # Several rows may share the same (every, period) pair, in which case
    # get_or_create would raise MultipleObjectsReturned. Reuse the oldest
    # match instead.
    interval = IntervalSchedule.objects.filter(**lookup).order_by("id").first()
    if interval is None:
        interval = IntervalSchedule.objects.create(**lookup)
    return interval
ExchangeServiceError( + "Периодическая задача с таким именем уже существует" + ) from exc + return task + + @staticmethod + def _format_validation_error(exc: DjangoValidationError) -> str: + if hasattr(exc, "message_dict"): + messages = [] + for field, errors in exc.message_dict.items(): + messages.extend(f"{field}: {error}" for error in errors) + return "; ".join(messages) + return "; ".join(exc.messages) + + @staticmethod + def _cleanup_unused_interval(interval_id: int | None) -> None: + if ( + interval_id + and not PeriodicTask.objects.filter(interval_id=interval_id).exists() + ): + IntervalSchedule.objects.filter(id=interval_id).delete() + + @staticmethod + def _cleanup_unused_crontab(crontab_id: int | None) -> None: + if ( + crontab_id + and not PeriodicTask.objects.filter(crontab_id=crontab_id).exists() + ): + CrontabSchedule.objects.filter(id=crontab_id).delete() diff --git a/src/apps/exchange/tasks.py b/src/apps/exchange/tasks.py new file mode 100644 index 0000000..09d756e --- /dev/null +++ b/src/apps/exchange/tasks.py @@ -0,0 +1,71 @@ +"""Celery-задачи приложения exchange.""" + +from __future__ import annotations + +import logging +import uuid +from typing import Any + +from apps.core.services import BackgroundJobService +from apps.core.tasks import PeriodicTask as CorePeriodicTask +from apps.exchange.models import ExchangeConnection +from apps.exchange.services import ExchangeConnectionService +from celery import shared_task + +logger = logging.getLogger(__name__) + + +@shared_task(bind=True, base=CorePeriodicTask) +def dispatch_periodic_exchange_copy(self, *, payload: dict[str, Any]) -> dict[str, Any]: + """Поставить в очередь периодическое копирование через активное подключение.""" + active_connection = ExchangeConnectionService.get_active_connection() + task = copy_exchange_data_async.delay( + connection_id=active_connection.id, + payload=payload, + requested_by_id=None, + ) + return { + "status": "queued", + "task_id": task.id, + "connection_id": 
# Routes of the exchange app; names are resolved under the "exchange" namespace.
urlpatterns = [
    path(
        "connections/", ExchangeConnectionListCreateView.as_view(), name="connections"
    ),
    path(
        "connections/test/",
        ExchangeConnectionTestView.as_view(),
        name="connections-test",
    ),
    path("tables/", ExchangeTableListView.as_view(), name="tables"),
    path("copy/", ExchangeCopyDataView.as_view(), name="copy"),
    path(
        "periodic-tasks/",
        ExchangePeriodicTaskListCreateView.as_view(),
        name="periodic-tasks",
    ),
    # The detail view handlers take ``task_id: int``, so the route has to
    # capture it; without the converter the view is called without task_id.
    path(
        "periodic-tasks/<int:task_id>/",
        ExchangePeriodicTaskDetailView.as_view(),
        name="periodic-task-detail",
    ),
]
class ExchangeConnectionTestView(APIView):
    """API for checking a connection to the external DB without saving it."""

    permission_classes = [IsAdminUser]

    @swagger_auto_schema(
        tags=[EXCHANGE_TAG],
        operation_summary="Test exchange connection and prepare target schema",
        request_body=ExchangeConnectionCreateSerializer,
        responses={200: "Connection status", 400: "Validation error"},
    )
    def post(self, request):
        """Validate the submitted parameters and run the connection check."""
        payload_serializer = ExchangeConnectionCreateSerializer(data=request.data)
        payload_serializer.is_valid(raise_exception=True)
        connection_params = payload_serializer.validated_data
        try:
            outcome = ExchangeConnectionService.test_connection_payload(
                **connection_params
            )
        except ExchangeServiceError as exc:
            # Service failures are reported as a validation error on "connection".
            raise ValidationError({"connection": str(exc)}) from exc
        return api_response(outcome)
class ExchangeCopyDataView(APIView):
    """API for starting a data copy into the target database."""

    permission_classes = [IsAdminUser]

    @swagger_auto_schema(
        tags=[EXCHANGE_TAG],
        operation_summary="Copy data to target DB",
        request_body=ExchangeCopyRequestSerializer,
        responses={
            202: openapi.Response(
                description="Copy queued",
                schema=openapi.Schema(type=openapi.TYPE_OBJECT),
            ),
            400: "Validation error",
        },
    )
    def post(self, request):
        """Queue a copy task for the active connection and return HTTP 202.

        An ``ExchangeServiceError`` raised inside the ``try`` block is
        converted to a validation error under the ``copy`` key.
        """
        serializer = ExchangeCopyRequestSerializer(data=request.data)
        serializer.is_valid(raise_exception=True)
        try:
            active_connection = ExchangeConnectionService.get_active_connection()
            task = copy_exchange_data_async.delay(
                connection_id=active_connection.id,
                payload=serializer.validated_data,
                requested_by_id=request.user.id,
            )
            # The job row is registered after the task is queued; an
            # IntegrityError here is deliberately ignored.
            # NOTE(review): presumably this covers the worker having already
            # created the row for the same task_id - confirm.
            with suppress(IntegrityError):
                BackgroundJobService.create_job(
                    task_id=task.id,
                    task_name="apps.exchange.tasks.copy_exchange_data_async",
                    user_id=request.user.id,
                    meta={
                        "connection_id": active_connection.id,
                        **serializer.validated_data,
                    },
                )
        except ExchangeServiceError as exc:
            raise ValidationError({"copy": str(exc)}) from exc

        return api_response(
            {
                "status": "started",
                "task_id": task.id,
                "connection_id": active_connection.id,
                "mode": serializer.validated_data["mode"],
                "truncate_before_copy": serializer.validated_data[
                    "truncate_before_copy"
                ],
            },
            status_code=status.HTTP_202_ACCEPTED,
        )
class ExchangePeriodicTaskDetailView(APIView):
    """API endpoint for reading, updating and deleting one exchange schedule."""

    permission_classes = [IsAdminUser]

    @swagger_auto_schema(
        tags=[EXCHANGE_TAG],
        operation_summary="Get exchange schedule",
        responses={200: ExchangePeriodicTaskSerializer, 404: "Not found"},
    )
    def get(self, request, task_id: int):
        """Return the serialized periodic task or respond with 404."""
        task = get_object_or_404(ExchangePeriodicTaskService.get_queryset(), id=task_id)
        return api_response(ExchangePeriodicTaskSerializer(task).data)

    @swagger_auto_schema(
        tags=[EXCHANGE_TAG],
        operation_summary="Update exchange schedule",
        request_body=ExchangePeriodicTaskUpsertSerializer,
        responses={
            200: ExchangePeriodicTaskSerializer,
            400: "Validation error",
            404: "Not found",
        },
    )
    def patch(self, request, task_id: int):
        """Update the periodic task.

        The serializer runs with ``partial=True``, so ``payload`` and
        ``schedule`` may be absent from ``validated_data``. The update call
        below needs both; a request that omits either one is rejected with a
        400 validation error instead of failing with ``KeyError`` (HTTP 500).

        Raises:
            ValidationError: a required field is missing, or the service
                reported an ``ExchangeServiceError``.
        """
        task = get_object_or_404(ExchangePeriodicTaskService.get_queryset(), id=task_id)
        serializer = ExchangePeriodicTaskUpsertSerializer(
            task,
            data=request.data,
            partial=True,
        )
        serializer.is_valid(raise_exception=True)
        validated = serializer.validated_data

        # NOTE(review): how the service treats omitted payload/schedule is not
        # visible from this view, so missing values are refused, not guessed.
        missing = [name for name in ("payload", "schedule") if name not in validated]
        if missing:
            raise ValidationError(
                {name: ["This field is required."] for name in missing}
            )

        try:
            updated = ExchangePeriodicTaskService.update_periodic_task(
                task=task,
                enabled=validated.get("enabled"),
                payload=validated["payload"],
                schedule=validated["schedule"],
            )
        except ExchangeServiceError as exc:
            raise ValidationError({"periodic_task": str(exc)}) from exc
        return api_response(ExchangePeriodicTaskSerializer(updated).data)

    @swagger_auto_schema(
        tags=[EXCHANGE_TAG],
        operation_summary="Delete exchange schedule",
        responses={204: "Deleted", 404: "Not found"},
    )
    def delete(self, request, task_id: int):
        """Delete the periodic task or respond with 404."""
        task = get_object_or_404(ExchangePeriodicTaskService.get_queryset(), id=task_id)
        task.delete()
        return api_response(None, status_code=status.HTTP_204_NO_CONTENT)
"load_batch", + "created_at", + ] + list_filter = ["source", "load_batch", "created_at"] + search_fields = [ + "external_id", + "organisation_name", + "title", + "inn", + "ogrn", + ] + readonly_fields = ["created_at", "updated_at", "load_batch", "payload"] + ordering = ["-created_at"] + list_per_page = 100 + date_hierarchy = "created_at" + + fieldsets = ( + ( + "Источник", + {"fields": ("source", "external_id", "load_batch")}, + ), + ( + "Организация", + {"fields": ("organisation_name", "inn", "ogrn")}, + ), + ( + "Запись", + {"fields": ("title", "record_date", "amount", "status", "url")}, + ), + ( + "Исходные данные", + {"fields": ("payload",), "classes": ("collapse",)}, + ), + ( + "Даты", + {"fields": ("created_at", "updated_at"), "classes": ("collapse",)}, + ), + ) + + def organisation_name_short(self, obj): + """Сокращённое название организации.""" + name = obj.organisation_name or obj.title or "" + return name[:60] + "..." if len(name) > 60 else name + + organisation_name_short.short_description = "Организация/заголовок" + organisation_name_short.admin_order_field = "organisation_name" + + def has_add_permission(self, request): + """Запретить создание записей вручную.""" + return False + + def has_change_permission(self, request, obj=None): + """Запретить редактирование записей.""" + return False diff --git a/src/apps/parsers/api_result_urls.py b/src/apps/parsers/api_result_urls.py new file mode 100644 index 0000000..2c6b7db --- /dev/null +++ b/src/apps/parsers/api_result_urls.py @@ -0,0 +1,159 @@ +"""Frontend-oriented parser result API v1 routes.""" + +from apps.parsers.source_registry import PARSER_SOURCES, ParserSourceDescriptor +from apps.parsers.views import ( + RESULT_DETAIL_PARAMS, + RESULT_LIST_PARAMS, + UPLOAD_FILE_PARAM, + ParserResultRecordSerializer, + ParserRunResponseSerializer, + ParserUploadView, + SourceResultDetailView, + SourceResultListView, + source_result_swagger_tag, +) +from django.urls import path +from drf_yasg.utils import 
def _route_description(descriptor: ParserSourceDescriptor) -> str:
    """Describe the data published on the descriptor's API route.

    Collects ``data_scope`` of every registered source that shares the same
    ``api_route`` and joins the distinct values with ``"; "``, keeping the
    order of first appearance.
    """
    ordered_scopes = {}
    for candidate in PARSER_SOURCES.values():
        if candidate.api_route != descriptor.api_route:
            continue
        # A dict keeps insertion order and drops repeated scopes.
        ordered_scopes.setdefault(candidate.data_scope, None)
    return "; ".join(ordered_scopes)
def _detail_view(descriptor: ParserSourceDescriptor):
    """Build the single-record view callable for one published source route."""
    swagger_tag = source_result_swagger_tag(descriptor.key)
    route_title = _route_title(descriptor)
    scope_text = _route_description(descriptor)

    # Build the schema decorator up front so the class body stays small.
    document_get = swagger_auto_schema(
        operation_summary=f"{route_title}: get",
        operation_description=(
            f"Одна запись источника: {scope_text}. "
            "Query-параметры дополнительно сужают выборку."
        ),
        manual_parameters=RESULT_DETAIL_PARAMS,
        tags=[swagger_tag],
        responses={200: ParserResultRecordSerializer, 404: "Запись не найдена"},
    )

    class SourceDetailView(SourceResultDetailView):
        source_key = descriptor.key

        @document_get
        def get(self, request, pk: int):
            return super().get(request, pk=pk)

    return SourceDetailView.as_view()
# Routes are generated from the source registry: a list route and a detail
# route per unique result API route, plus one upload route per unique upload
# API route.
urlpatterns = []
for source_descriptor in _result_source_descriptors():
    route_name = source_descriptor.api_route.replace("/", "-")
    urlpatterns.extend(
        [
            path(
                f"{source_descriptor.api_route}/",
                _list_view(source_descriptor),
                name=f"{route_name}-get-list",
            ),
            path(
                # The detail view's ``get`` takes ``pk: int``, so the route
                # must capture it; a bare "//" segment passes no ``pk``.
                f"{source_descriptor.api_route}/<int:pk>/",
                _detail_view(source_descriptor),
                name=f"{route_name}-get",
            ),
        ]
    )

for source_descriptor in _upload_source_descriptors():
    upload_route = source_descriptor.upload_api_route
    route_name = upload_route.replace("/", "-")
    urlpatterns.append(
        path(
            f"{upload_route}/",
            _upload_view(source_descriptor),
            name=f"{route_name}-upload",
        )
    )
= 30 headers: dict[str, str] = field(default_factory=dict) + verify_ssl: bool = True def __post_init__(self) -> None: """Инициализация после создания dataclass.""" @@ -167,7 +168,12 @@ class BaseHTTPClient: logger.info("GET %s (proxy: %s)", url, self._current_proxy) try: - response = self.session.get(url, params=params, timeout=self.timeout) + response = self.session.get( + url, + params=params, + timeout=self.timeout, + verify=self.verify_ssl, + ) except requests.exceptions.ConnectionError as e: logger.error("Connection error: %s - %s", url, e) raise ConnectionError(f"Failed to connect to {url}", url=url) from e @@ -203,12 +209,75 @@ class BaseHTTPClient: response = self.get(endpoint, params=params) return response.json() - def download_file(self, endpoint: str) -> bytes: + def post_json( + self, + endpoint: str, + *, + payload: dict[str, Any] | None = None, + params: dict[str, Any] | None = None, + ) -> dict: + """ + Выполнить POST запрос с JSON body и вернуть JSON. + + Нужен для официальных web API, которые используют POST для поиска. 
def post(
    self,
    endpoint: str,
    *,
    json_payload: dict[str, Any] | None = None,
    params: dict[str, Any] | None = None,
) -> requests.Response:
    """Send a POST request with an optional JSON body.

    Args:
        endpoint: Path or full URL; resolved through ``_build_url``.
        json_payload: Object sent as the JSON request body.
        params: Query-string parameters.

    Returns:
        The response object when the status is successful.

    Raises:
        ConnectionError: on connection failure or timeout.
        HTTPClientError: on any other ``requests`` failure.
        HTTPError: when the response status is not OK.
    """
    target = self._build_url(endpoint)
    logger.info("POST %s (proxy: %s)", target, self._current_proxy)

    request_options = {
        "params": params,
        "json": json_payload,
        "timeout": self.timeout,
        "verify": self.verify_ssl,
    }
    try:
        reply = self.session.post(target, **request_options)
    except requests.exceptions.ConnectionError as e:
        logger.error("Connection error: %s - %s", target, e)
        raise ConnectionError(f"Failed to connect to {target}", url=target) from e
    except requests.exceptions.Timeout as e:
        logger.error("Timeout: %s", target)
        raise ConnectionError(f"Request timeout for {target}", url=target) from e
    except requests.exceptions.RequestException as e:
        logger.error("Request error: %s - %s", target, e)
        raise HTTPClientError(f"Request failed: {e}", url=target) from e

    if reply.ok:
        logger.debug("Response %d from %s", reply.status_code, target)
        return reply

    logger.error("HTTP error %d: %s", reply.status_code, target)
    raise HTTPError(
        f"HTTP {reply.status_code} for {target}",
        status_code=reply.status_code,
        url=target,
    )
def _validate_response(self, response: requests.Response, url: str) -> None:
    """Raise ``HTTPError`` unless the response status is successful.

    Args:
        response: Response whose ``ok`` flag and status code are inspected.
        url: Request URL, used in the log record and the raised error.

    Raises:
        HTTPError: when ``response.ok`` is false.
    """
    if not response.ok:
        code = response.status_code
        logger.error("HTTP error %d: %s", code, url)
        raise HTTPError(
            f"HTTP {code} for {url}",
            status_code=code,
            url=url,
        )
def _read_stream(
    self,
    response: requests.Response,
    max_size_bytes: int | None,
    chunk_size: int,
    url: str,
) -> bytes:
    """Read the response body chunk by chunk, enforcing a size limit.

    Args:
        response: Streaming response to read from.
        max_size_bytes: Maximum allowed body size; ``None`` disables the check.
        chunk_size: Chunk size passed to ``iter_content``.
        url: Request URL, used in error messages.

    Returns:
        The complete body as ``bytes``.

    Raises:
        HTTPClientError: when the accumulated body exceeds ``max_size_bytes``
            or when the transfer fails part-way through the stream.
    """
    buffer = bytearray()
    try:
        for chunk in response.iter_content(chunk_size=chunk_size):
            if not chunk:
                # Empty chunks carry no payload; skip them.
                continue
            buffer.extend(chunk)
            if max_size_bytes is not None and len(buffer) > max_size_bytes:
                raise HTTPClientError(
                    f"File exceeds size limit: {len(buffer)} bytes",
                    url=url,
                )
    except requests.exceptions.RequestException as e:
        # With stream=True the body is fetched lazily, so network failures
        # can surface here; translate them like the other request paths do.
        logger.error("Request error: %s - %s", url, e)
        raise HTTPClientError(f"Request failed: {e}", url=url) from e
    return bytes(buffer)
class StructuredDataClientError(HTTPClientError):
    """Error raised by the structured data client."""
@property
def http_client(self) -> BaseHTTPClient:
    """Return the HTTP client, creating it on first access.

    The client is built with an empty base URL and this object's proxy,
    timeout and SSL verification settings, then cached in ``_http_client``.
    """
    client = self._http_client
    if client is None:
        client = BaseHTTPClient(
            base_url="",
            proxies=self.proxies,
            timeout=self.timeout,
            verify_ssl=self.verify_ssl,
        )
        self._http_client = client
    return client
def _parse_content(  # noqa: C901
    self, content: bytes, *, file_name: str = ""
) -> list[dict]:
    """Detect the payload format and dispatch to the matching parser.

    The file extension is tried first. Without a recognised extension the
    content is sniffed: a ZIP signature is tried as a workbook and then as a
    plain archive, a leading ``{``/``[`` means JSON, a leading ``<`` means
    HTML or XML, and anything else is treated as CSV.

    Args:
        content: Raw file bytes.
        file_name: Optional file name used for extension-based detection.

    Returns:
        Parsed rows as dictionaries.

    Raises:
        StructuredDataClientError: propagated unchanged from the parsers,
            including from the workbook attempt on ZIP-signature content.
    """
    name = file_name.lower()

    if name.endswith(SUPPORTED_EXCEL_EXTENSIONS):
        return self._parse_xlsx(content)
    if name.endswith(".zip"):
        return self._parse_zip(content)
    if name.endswith(".json"):
        return self._parse_json(content)
    if name.endswith(".csv"):
        return self._parse_csv(content)
    if name.endswith(".xml"):
        return self._parse_xml(content)
    if name.endswith((".html", ".htm")):
        return self._parse_html(content)

    if content.startswith(b"PK\x03\x04"):
        try:
            return self._parse_xlsx(content)
        except StructuredDataClientError:
            # A client-level error (presumably including limit violations -
            # TODO confirm what _validate_record_count raises) must not be
            # masked by re-reading the workbook as a ZIP of XML parts.
            raise
        except Exception:
            # Not a readable workbook: fall back to a plain archive.
            logger.debug(
                "Content with ZIP signature is not a workbook; parsing as ZIP",
                exc_info=True,
            )
            return self._parse_zip(content)

    text = self._decode(content).lstrip()
    if text.startswith(("{", "[")):
        return self._parse_json(content)
    if text.startswith("<"):
        if self._looks_like_html(text):
            return self._parse_html(content)
        return self._parse_xml(content)
    return self._parse_csv(content)
rows.extend( + self._parse_content(archive.read(info.filename), file_name=name) + ) + self._validate_record_count(len(rows)) + return rows + + def _parse_json(self, content: bytes) -> list[dict]: + """Распарсить JSON и найти список записей.""" + data = json.loads(self._decode(content)) + node = self._extract_list_node(data) + if isinstance(node, list): + self._validate_record_count(len(node)) + return [self._as_dict(item) for item in node] + return [self._as_dict(node)] + + def _extract_list_node(self, data: Any) -> Any: + """Найти вероятный список записей в JSON-ответе.""" + if isinstance(data, list): + return data + if not isinstance(data, dict): + return data + + for key in LIST_KEYS: + value = data.get(key) + if isinstance(value, list): + return value + if isinstance(value, dict): + nested = self._extract_list_node(value) + if isinstance(nested, list): + return nested + return data + + def _parse_csv(self, content: bytes) -> list[dict]: + """Распарсить CSV с автоопределением разделителя.""" + text = self._decode(content) + sample = text[:4096] + try: + dialect = csv.Sniffer().sniff(sample, delimiters=",;\t|") + reader = csv.DictReader(io.StringIO(text), dialect=dialect) + except csv.Error: + reader = csv.DictReader(io.StringIO(text), delimiter=";") + result = [] + for row in reader: + result.append(dict(row)) + self._validate_record_count(len(result)) + return result + + def _parse_xlsx(self, content: bytes) -> list[dict]: + """Распарсить первый лист XLSX.""" + workbook = load_workbook( + filename=io.BytesIO(content), + read_only=True, + data_only=True, + ) + try: + sheet = workbook.active + row_iterator = sheet.iter_rows(values_only=True) + headers_row = next(row_iterator, None) + if not headers_row: + return [] + + headers = [str(value or "").strip() for value in headers_row] + result = [] + for row in row_iterator: + if not row or not any(row): + continue + result.append( + { + headers[index] + if index < len(headers) + else f"field_{index}": value + for 
index, value in enumerate(row) + } + ) + self._validate_record_count(len(result)) + return result + finally: + workbook.close() + + def _parse_xml(self, content: bytes) -> list[dict]: + """Распарсить XML с поиском повторяющихся элементов-записей.""" + text = self._clean_xml(content) + selector = self._select_xml_records(text) + if selector is None: + return [] + return self._collect_xml_records(text, selector) + + def _select_xml_records(self, text: str) -> tuple[str, int, str | None] | None: + """Выбрать XML-элементы, которые вероятнее всего являются записями.""" + ( + candidates, + direct_children_count, + first_candidate, + ) = self._count_xml_record_candidates(text) + repeated = [ + (depth, tag, count) + for (depth, tag), count in candidates.items() + if count > 1 + ] + if repeated: + depth, tag, _ = min(repeated, key=lambda item: (item[0], -item[2], item[1])) + return "tag", depth, tag + if direct_children_count: + return "depth", 2, None + if first_candidate: + depth, tag = first_candidate + return "tag", depth, tag + return None + + def _count_xml_record_candidates( + self, + text: str, + ) -> tuple[Counter[tuple[int, str]], int, tuple[int, str] | None]: + """Посчитать XML-элементы, похожие на записи.""" + candidates: Counter[tuple[int, str]] = Counter() + direct_children_count = 0 + first_candidate: tuple[int, str] | None = None + stack: list[Any] = [] + root_tag = "" + + for event, element in ET.iterparse( + io.StringIO(text), + events=("start", "end"), + ): + if event == "start": + stack.append(element) + if not root_tag: + root_tag = element.tag + continue + + depth = len(stack) + is_root = element.tag == root_tag and depth == 1 + has_record_shape = bool(element.attrib) or len(list(element)) > 0 + if not is_root and has_record_shape: + key = (depth, self._strip_namespace(element.tag)) + candidates[key] += 1 + if depth == 2: + direct_children_count += 1 + if first_candidate is None: + first_candidate = key + + element.clear() + if stack: + stack.pop() + + 
def _collect_xml_records(
    self,
    text: str,
    selector: tuple[str, int, str | None],
) -> list[dict]:
    """Stream through the XML and collect records matching ``selector``.

    Args:
        text: XML document text.
        selector: ``(mode, depth, tag)``; in ``"tag"`` mode an element must
            sit at ``depth`` and have the given namespace-stripped tag,
            otherwise only the depth has to match. The root is never a record.

    Returns:
        Matching elements converted with ``_xml_to_dict``, in document order.
    """
    mode, target_depth, target_tag = selector
    records: list[dict] = []
    open_elements = 0  # number of currently open elements; the root is 1
    root_tag = ""

    events = ET.iterparse(io.StringIO(text), events=("start", "end"))
    for event, element in events:
        if event == "start":
            open_elements += 1
            root_tag = root_tag or element.tag
            continue

        depth = open_elements
        at_root = element.tag == root_tag and depth == 1
        local_name = self._strip_namespace(element.tag)

        matches = not at_root and depth == target_depth
        if mode == "tag":
            matches = matches and local_name == target_tag

        if matches:
            records.append(self._xml_to_dict(element))
            self._validate_record_count(len(records))
            # Clear the element only after it has been converted.
            element.clear()
        elif at_root:
            element.clear()

        if open_elements:
            open_elements -= 1

    return records
"Размещено", + "Обновлено", + "Окончание подачи заявок", + "Включено", + "Исключено", + "Утверждение", + "Вступление в силу", + "Способы закупки", + } + for card in soup.select(".search-registry-entry-block"): + lines = self._extract_text_lines(card) + if not lines: + continue + + row: dict[str, Any] = {"raw_text": "\n".join(lines)} + number_index = self._fill_zakupki_number(row, lines) + self._fill_label_pairs(row, lines, labels) + self._fill_zakupki_status(row, lines, number_index, labels) + + link = card.find("a", href=True) + if link: + row["url"] = urljoin(ZAKUPKI_BASE_URL, link["href"]) + if lines[0].endswith("-ФЗ"): + row["law"] = lines[0] + + result.append(row) + self._validate_record_count(len(result)) + return result + + def _parse_html_table(self, table: Any) -> list[dict]: + """Распарсить HTML-таблицу с th или строкой-заголовком в td.""" + rows = table.find_all("tr") + if not rows: + return [] + if self.source == FAS_GOZ_SOURCE: + return self._parse_fas_goz_table_rows(rows) + + headers, data_rows = self._extract_table_headers(rows) + if not headers: + return [] + + result = [] + for row in data_rows: + cells = row.find_all(["td", "th"]) + if not cells: + continue + values = [cell.get_text(" ", strip=True) for cell in cells] + if not any(values): + continue + result.append( + { + headers[index] if index < len(headers) else f"field_{index}": value + for index, value in enumerate(values) + } + ) + self._validate_record_count(len(result)) + return result + + def _parse_fas_goz_table_rows(self, rows: list[Any]) -> list[dict]: + """Распарсить таблицу ФАС ГОЗ с многострочным заголовком.""" + headers = [ + "Номер реестровой записи", + "Уполномоченный орган", + "Постановление", + "Дата вступления постановления", + "Исполнение постановления", + "Полное наименование лица", + "Фирменное наименование лица", + "Адрес лица", + "ИНН", + ] + result = [] + for row in rows: + values = [ + cell.get_text(" ", strip=True) for cell in row.find_all(["td", "th"]) + ] + if 
len(values) < 8 or self._is_fas_goz_header_number_row(values): + continue + result.append( + { + headers[index] if index < len(headers) else f"field_{index}": value + for index, value in enumerate(values) + } + ) + self._validate_record_count(len(result)) + return result + + def _is_fas_goz_header_number_row(self, values: list[str]) -> bool: + """Определить служебную строку ФАС с номерами колонок 1..8.""" + return all(value.isdigit() for value in values) and values[:3] == [ + "1", + "2", + "3", + ] + + def _extract_table_headers(self, rows: list[Any]) -> tuple[list[str], list[Any]]: + """Выделить заголовки таблицы, включая госстраницы без th.""" + first_header = rows[0].find_all("th") + if first_header: + headers = [cell.get_text(" ", strip=True) for cell in first_header] + return headers, rows[1:] + + first_cells = rows[0].find_all(["th", "td"]) + headers = [cell.get_text(" ", strip=True) for cell in first_cells] + if self._looks_like_table_header(headers): + return headers, rows[1:] + return [], [] + + def _looks_like_table_header(self, headers: list[str]) -> bool: + """Отсеять layout-таблицы без th, но принять реестровые таблицы ФАС.""" + normalized = " ".join(headers).lower() + if len(headers) < 2 or not any(headers): + return False + markers = ( + "номер", + "реестр", + "наименование", + "инн", + "огрн", + "дата", + "информация", + "лиценз", + "постановлен", + ) + return sum(marker in normalized for marker in markers) >= 2 + + def _extract_text_lines(self, node: Any) -> list[str]: + """Получить непустые строки видимого текста.""" + return [ + line.strip() + for line in node.get_text("\n", strip=True).splitlines() + if line.strip() + ] + + def _fill_zakupki_number(self, row: dict, lines: list[str]) -> int | None: + """Найти номер карточки ЕИС.""" + for index, line in enumerate(lines): + match = re.search(r"№\s*([0-9A-Za-zА-Яа-яЁё/_-]+)", line) + if match: + row["number"] = match.group(1) + row["registry_number"] = match.group(1) + return index + return None + + 
def _fill_label_pairs( + self, + row: dict, + lines: list[str], + labels: set[str], + ) -> None: + """Заполнить пары label -> следующая строка из карточки.""" + for index, line in enumerate(lines[:-1]): + if line in labels: + row[line] = lines[index + 1] + + def _fill_zakupki_status( + self, + row: dict, + lines: list[str], + number_index: int | None, + labels: set[str], + ) -> None: + """Выделить статус карточки ЕИС, если он расположен после номера.""" + if number_index is None: + return + status_index = number_index + 1 + if status_index < len(lines) and lines[status_index] not in labels: + row["status"] = lines[status_index] + + def _normalize_row(self, row: dict, index: int) -> GenericParserItem: + """Привести строку источника к общей DTO.""" + payload = self._json_safe(row) + external_id = self._lookup( + payload, + [ + "external_id", + "id", + "uid", + "guid", + "number", + "registry_number", + "registration_number", + "purchase_number", + "contract_number", + "case_number", + "vacancy_id", + "product_reg_number_2023", + "product_reg_number_2022", + "res_number", + "product_gisp_url", + "регистрационный номер лицензии", + "№ сертификата", + "номер реестровой записи", + "номер реестровой записи в еруз", + "номер", + "реестровый номер", + "номер дела", + "идентификатор", + ], + ) + if not external_id: + external_id = self._make_external_id(payload) + + return GenericParserItem( + source=self.source, + external_id=external_id, + inn=self._lookup( + payload, + [ + "inn", + "supplier_inn", + "employer_inn", + "org_inn", + "инн", + "инн лицензиата", + "инн аналог инн", + ], + ), + ogrn=self._lookup( + payload, + [ + "ogrn", + "supplier_ogrn", + "ogrn_id", + "org_ogrn", + "огрн", + "огрн или огрип лицензиата", + ], + ), + organisation_name=self._lookup( + payload, + [ + "organisation_name", + "organization_name", + "company_name", + "supplier_name", + "employer_name", + "short_name", + "shortName", + "org_name", + "name", + "наименование", + "организация", + 
"работодатель", + "заказчик", + "заявитель", + "наименование заказчика", + "наименование фио недобросовестного поставщика", + "полное сокращенное наименование лицензиата", + "информация о лице", + "полное наименование лица", + "фирменное наименование лица", + ], + ), + title=self._lookup( + payload, + [ + "title", + "subject", + "description", + "purchase_object", + "vacancy_name", + "product_name", + "product_spec", + "наименование средства шифр", + "объект закупки", + "объекты закупки", + "наименование документа", + "наименование закупки", + "предмет", + "описание", + "должность", + "продукция", + ], + ), + record_date=self._lookup( + payload, + [ + "date", + "publish_date", + "publication_date", + "created_at", + "actualBfoDate", + "period", + "res_date", + "дата", + "дата предоставления лицензии", + "дата внесения в реестр", + "дата вступления постановления", + "размещено", + "обновлено", + "включено", + "утверждение", + ], + ), + amount=self._to_decimal( + self._lookup( + payload, + [ + "amount", + "price", + "sum", + "max_price", + "salary", + "gainSum", + "product_score_value", + "сумма", + "цена", + "начальная цена", + "цена контракта", + ], + ) + ), + status=self._lookup( + payload, + [ + "status", + "state", + "statusCode", + "res_valid_till", + "срок действия сертификата", + "исполнение постановления", + "статус", + "состояние", + ], + ), + url=self._lookup( + payload, + [ + "url", + "href", + "link", + "source_url", + "product_gisp_url", + "gisp_url", + "ссылка", + ], + ), + payload=payload, + ) + + def _lookup(self, row: dict, candidates: list[str]) -> str: + """Найти значение по списку возможных ключей.""" + normalized: dict[str, Any] = {} + for key, value in self._iter_lookup_values(row): + normalized.setdefault(self._normalize_key(key), value) + + for candidate in candidates: + value = normalized.get(self._normalize_key(candidate)) + if value not in (None, ""): + return self._clean_lookup_value(value) + return "" + + def _iter_lookup_values(self, 
row: dict, prefix: str = ""): + """Итерировать значения, включая вложенные JSON-объекты.""" + for key, value in row.items(): + key_str = str(key) + full_key = f"{prefix}.{key_str}" if prefix else key_str + yield key_str, value + yield full_key, value + if isinstance(value, dict): + yield from self._iter_lookup_values(value, full_key) + + def _clean_lookup_value(self, value: Any) -> str: + """Привести значение lookup к чистому тексту.""" + if isinstance(value, dict | list | tuple): + return "" + text = str(value).strip() + if "<" in text and ">" in text: + text = BeautifulSoup(text, "html.parser").get_text(" ", strip=True) + return re.sub(r"\s+", " ", text).strip() + + def _make_external_id(self, payload: dict) -> str: + """Сформировать стабильный внешний ID, если источник его не дал.""" + raw = json.dumps(payload, ensure_ascii=False, sort_keys=True, default=str) + digest = hashlib.sha256(raw.encode("utf-8")).hexdigest()[:24] + return f"{self.source}:{digest}" + + def _validate_file_size(self, size: int, file_name: str) -> None: + """Проверить размер одиночного файла до разбора.""" + if size > self.max_file_size_bytes: + raise StructuredDataClientError( + f"File {file_name} exceeds size limit: {size} bytes" + ) + + def _validate_record_count(self, count: int) -> None: + """Ограничить количество строк, удерживаемых в памяти.""" + if count > self.max_records: + raise StructuredDataClientError(f"Record count exceeds limit: {count}") + + def _validate_zip_member(self, info: zipfile.ZipInfo) -> None: + """Проверить ZIP-элемент перед чтением в память.""" + self._validate_file_size(info.file_size, info.filename) + if info.compress_size <= 0: + return + compression_ratio = info.file_size / info.compress_size + if compression_ratio > MAX_ZIP_COMPRESSION_RATIO: + raise StructuredDataClientError( + f"ZIP member compression ratio is too high: {info.filename}" + ) + + def _decode(self, content: bytes) -> str: + """Декодировать выгрузку с учётом частой cp1251 в госданных.""" + 
for encoding in ("utf-8-sig", "utf-8", "cp1251"): + try: + return content.decode(encoding) + except UnicodeDecodeError: + continue + return content.decode("utf-8", errors="replace") + + def _extract_preferred_html_download_url( + self, + content: bytes, + *, + base_url: str, + ) -> str: + """Найти официальный XLSX download на HTML-странице источника.""" + if self.source == FSTEC_SOURCE: + return self._extract_fstec_download_url(content, base_url=base_url) + if self.source != MPT_PRODUCTS_SOURCE: + return "" + text = self._decode(content).lstrip() + if not self._looks_like_html(text): + return "" + + soup = BeautifulSoup(text, "html.parser") + candidates = [] + for link in soup.find_all("a", href=True): + label = link.get_text(" ", strip=True) + href = link["href"] + if "xlsx" not in label.lower() and "xls" not in href.lower(): + continue + candidates.append((label, urljoin(base_url, href))) + + for label, url in candidates: + if GISP_PRODUCTS_DOWNLOAD_LABEL.lower() in label.lower(): + return url + return candidates[0][1] if candidates else "" + + def _extract_fstec_download_url(self, content: bytes, *, base_url: str) -> str: + """Найти CSV-выгрузку реестра ФСТЭК на HTML-странице.""" + text = self._decode(content).lstrip() + if not self._looks_like_html(text): + return "" + + soup = BeautifulSoup(text, "html.parser") + candidates = [ + urljoin(base_url, link["href"]) + for link in soup.find_all("a", href=True) + if "module=rfiles" in link["href"] or "/uploads/reg" in link["href"] + ] + for url in candidates: + if "file=1" in url or url.lower().endswith(".csv"): + return url + return candidates[0] if candidates else "" + + def _clean_xml(self, content: bytes) -> str: + """Удалить управляющие символы, которые часто ломают XML выгрузки.""" + text = self._decode(content) + return re.sub(r"[\x00-\x08\x0b\x0c\x0e-\x1f]", "", text) + + def _xml_to_dict(self, element: Any) -> dict: + """Преобразовать XML-элемент в плоский словарь.""" + data = dict(element.attrib) + children 
= list(element) + if children: + for child in children: + key = self._strip_namespace(child.tag) + if len(list(child)) > 0: + value = self._xml_to_dict(child) + else: + value = child.text or "" + self._set_xml_value(data, key, value) + elif element.text: + data[self._strip_namespace(element.tag)] = element.text + return data + + def _set_xml_value(self, data: dict, key: str, value: Any) -> None: + """Сохранить повторяющиеся XML-теги списком, не перетирая значения.""" + if key not in data: + data[key] = value + return + current = data[key] + if isinstance(current, list): + current.append(value) + else: + data[key] = [current, value] + + def _strip_namespace(self, tag: str) -> str: + """Убрать XML namespace из имени тега.""" + return tag.rsplit("}", 1)[-1] + + def _as_dict(self, item: Any) -> dict: + """Привести JSON-элемент к словарю.""" + if isinstance(item, dict): + return item + return {"value": item} + + def _json_safe(self, row: dict) -> dict: + """Сделать payload сериализуемым в JSON.""" + result = {} + for key, value in row.items(): + result[str(key)] = self._json_safe_value(value) + return result + + def _json_safe_value(self, value: Any) -> Any: + """Рекурсивно привести значение к JSON-совместимому виду.""" + if isinstance(value, str | int | float | bool) or value is None: + return value + if isinstance(value, Decimal): + return str(value) + if isinstance(value, dict): + return { + str(key): self._json_safe_value(item) for key, item in value.items() + } + if isinstance(value, list | tuple): + return [self._json_safe_value(item) for item in value] + return str(value) + + def _normalize_key(self, key: str) -> str: + """Нормализовать ключ для сопоставления русских и английских колонок.""" + return re.sub(r"[^0-9a-zа-яё]+", "", str(key).lower()) + + def _to_decimal(self, value: str) -> Decimal | None: + """Преобразовать строковое число в Decimal.""" + if not value: + return None + cleaned = value.replace(" ", "").replace("\xa0", "").replace(",", ".") + cleaned 
= re.sub(r"[^0-9.\-]", "", cleaned) + if not cleaned: + return None + try: + return Decimal(cleaned) + except InvalidOperation: + return None + + def close(self) -> None: + """Закрыть HTTP клиент.""" + if self._http_client is not None: + self._http_client.close() + self._http_client = None + + def __enter__(self) -> "StructuredDataClient": + """Поддержка context manager.""" + return self + + def __exit__(self, exc_type, exc_val, exc_tb) -> None: + """Закрытие при выходе из context manager.""" + self.close() diff --git a/src/apps/parsers/clients/proverki/client.py b/src/apps/parsers/clients/proverki/client.py index f74ade2..e8151d4 100644 --- a/src/apps/parsers/clients/proverki/client.py +++ b/src/apps/parsers/clients/proverki/client.py @@ -790,11 +790,13 @@ class ProverkiClient: "regnum", "id", "number", + "УчетныйНомер", + "УчётныйНомер", ] ) - inn = get_attr_value(["INN", "inn", "ORG_INN", "I_INN"]) - ogrn = get_attr_value(["OGRN", "ogrn", "ORG_OGRN", "I_OGRN"]) + inn = get_attr_value(["INN", "inn", "ORG_INN", "I_INN", "ИНН"]) + ogrn = get_attr_value(["OGRN", "ogrn", "ORG_OGRN", "I_OGRN", "ОГРН"]) organisation_name = get_attr_value( [ "ORG_NAME", @@ -804,6 +806,7 @@ class ProverkiClient: "organisation_name", "org_name", "name", + "Наименование", ] ) control_authority = get_attr_value( @@ -814,6 +817,7 @@ class ProverkiClient: "ORGAN_NAME", "control_authority", "authority", + "КонтрольныйОрган", ] ) inspection_type = get_attr_value( @@ -823,6 +827,7 @@ class ProverkiClient: "I_TYPE", "inspection_type", "type", + "ТипПроверки", ] ) inspection_form = get_attr_value( @@ -832,6 +837,7 @@ class ProverkiClient: "I_FORM", "inspection_form", "form", + "ФормаПроверки", ] ) start_date = get_attr_value( @@ -842,6 +848,7 @@ class ProverkiClient: "start_date", "date_start", "date", + "ДатаНачала", ] ) end_date = get_attr_value( @@ -851,6 +858,7 @@ class ProverkiClient: "DATE_END", "end_date", "date_end", + "ДатаОкончания", ] ) status = get_attr_value( @@ -859,6 +867,7 @@ class 
ProverkiClient: "I_STATUS", "status", "state", + "Статус", ] ) legal_basis = get_attr_value( @@ -870,6 +879,7 @@ class ProverkiClient: "legal_basis", "basis", "law", + "ПравовоеОснование", ] ) result = get_attr_value( @@ -878,6 +888,7 @@ class ProverkiClient: "I_RESULT", "result", "outcome", + "Результат", ] ) diff --git a/src/apps/parsers/clients/trudvsem/__init__.py b/src/apps/parsers/clients/trudvsem/__init__.py new file mode 100644 index 0000000..2671a95 --- /dev/null +++ b/src/apps/parsers/clients/trudvsem/__init__.py @@ -0,0 +1,5 @@ +"""Клиент портала Работа России.""" + +from apps.parsers.clients.trudvsem.client import TrudvsemClient, TrudvsemClientError + +__all__ = ["TrudvsemClient", "TrudvsemClientError"] diff --git a/src/apps/parsers/clients/trudvsem/client.py b/src/apps/parsers/clients/trudvsem/client.py new file mode 100644 index 0000000..3987547 --- /dev/null +++ b/src/apps/parsers/clients/trudvsem/client.py @@ -0,0 +1,199 @@ +"""Клиент API вакансий портала Работа России.""" + +import hashlib +import json +import logging +from dataclasses import dataclass, field +from decimal import Decimal, InvalidOperation +from typing import Any + +from apps.parsers.clients.base import BaseHTTPClient, HTTPClientError +from apps.parsers.clients.common.schemas import GenericParserItem +from apps.parsers.models import ParserLoadLog + +logger = logging.getLogger(__name__) + +DEFAULT_BASE_URL = "http://opendata.trudvsem.ru/api/v1" +VACANCIES_ENDPOINT = "/vacancies" + + +class TrudvsemClientError(HTTPClientError): + """Ошибка клиента Работа России.""" + + pass + + +@dataclass +class TrudvsemClient: + """Клиент открытого API вакансий Работа России.""" + + proxies: list[str] | None = None + base_url: str = DEFAULT_BASE_URL + timeout: int = 120 + company_search_max_pages: int = 20 + _http_client: BaseHTTPClient | None = field(default=None, repr=False) + + @property + def http_client(self) -> BaseHTTPClient: + """Ленивая инициализация HTTP клиента.""" + if self._http_client 
is None: + self._http_client = BaseHTTPClient( + base_url=self.base_url, + proxies=self.proxies, + timeout=self.timeout, + ) + return self._http_client + + def fetch_vacancies( + self, + *, + limit: int = 100, + offset: int = 0, + region_code: str | None = None, + company_inn: str | None = None, + text: str | None = None, + ) -> list[GenericParserItem]: + """ + Получить вакансии из открытого API. + + Args: + limit: Размер страницы. + offset: Смещение. + region_code: Код региона в API Работа России. + company_inn: ИНН работодателя для локальной фильтрации результата. + text: Текстовый фильтр API. + """ + if company_inn: + return self._fetch_vacancies_by_company_inn( + limit=limit, + offset=offset, + region_code=region_code, + company_inn=company_inn, + text=text, + ) + + params: dict[str, Any] = {"limit": limit, "offset": offset} + if region_code: + params["region"] = region_code + if text: + params["text"] = text + + try: + response = self.http_client.get_json(VACANCIES_ENDPOINT, params=params) + except HTTPClientError: + raise + except Exception as e: + raise TrudvsemClientError(f"Failed to fetch vacancies: {e}") from e + + vacancies = self._extract_vacancies(response) + records = [self._map_vacancy(vacancy) for vacancy in vacancies] + logger.info("Fetched %d Trudvsem vacancies", len(records)) + return records + + def _fetch_vacancies_by_company_inn( + self, + *, + limit: int, + offset: int, + region_code: str | None, + company_inn: str, + text: str | None, + ) -> list[GenericParserItem]: + """Искать вакансии работодателя по страницам, чтобы не дать ложный ноль.""" + records: list[GenericParserItem] = [] + current_offset = offset + page_size = max(limit, 1) + + for _ in range(self.company_search_max_pages): + params: dict[str, Any] = {"limit": page_size, "offset": current_offset} + if region_code: + params["region"] = region_code + if text: + params["text"] = text + + try: + response = self.http_client.get_json(VACANCIES_ENDPOINT, params=params) + except 
HTTPClientError: + raise + except Exception as e: + raise TrudvsemClientError(f"Failed to fetch vacancies: {e}") from e + + vacancies = self._extract_vacancies(response) + page_records = [self._map_vacancy(vacancy) for vacancy in vacancies] + records.extend( + record for record in page_records if record.inn == company_inn + ) + if len(records) >= limit: + result = records[:limit] + logger.info("Fetched %d Trudvsem vacancies by INN", len(result)) + return result + if len(vacancies) < page_size: + logger.info("Fetched %d Trudvsem vacancies by INN", len(records)) + return records + current_offset += page_size + + raise TrudvsemClientError( + "Company INN search reached page limit before exhausting vacancies" + ) + + def _extract_vacancies(self, response: dict) -> list[dict]: + """Достать список вакансий из ответа API.""" + results = response.get("results", response) + vacancies = results.get("vacancies", []) if isinstance(results, dict) else [] + + normalized = [] + for item in vacancies: + if isinstance(item, dict) and isinstance(item.get("vacancy"), dict): + normalized.append(item["vacancy"]) + elif isinstance(item, dict): + normalized.append(item) + return normalized + + def _map_vacancy(self, vacancy: dict) -> GenericParserItem: + """Преобразовать вакансию API в общую DTO.""" + company = vacancy.get("company") or {} + salary = vacancy.get("salary") or "" + external_id = str(vacancy.get("id") or vacancy.get("source_id") or "") + if not external_id: + raw = json.dumps(vacancy, ensure_ascii=False, sort_keys=True, default=str) + external_id = hashlib.sha256(raw.encode("utf-8")).hexdigest()[:24] + return GenericParserItem( + source=ParserLoadLog.Source.TRUDVSEM, + external_id=external_id, + inn=str(company.get("inn") or vacancy.get("company_inn") or ""), + ogrn=str(company.get("ogrn") or ""), + organisation_name=str(company.get("name") or vacancy.get("company") or ""), + title=str(vacancy.get("job-name") or vacancy.get("name") or ""), + 
record_date=str(vacancy.get("creation-date") or vacancy.get("date") or ""), + amount=self._parse_salary(salary), + status=str(vacancy.get("state") or ""), + url=str(vacancy.get("vac_url") or vacancy.get("url") or ""), + payload=vacancy, + ) + + def _parse_salary(self, salary: Any) -> Decimal | None: + """Достать числовую зарплату из ответа API.""" + if isinstance(salary, dict): + value = salary.get("from") or salary.get("to") or salary.get("salary") + else: + value = salary + if value in (None, ""): + return None + try: + return Decimal(str(value).replace(" ", "").replace(",", ".")) + except InvalidOperation: + return None + + def close(self) -> None: + """Закрыть HTTP клиент.""" + if self._http_client is not None: + self._http_client.close() + self._http_client = None + + def __enter__(self) -> "TrudvsemClient": + """Поддержка context manager.""" + return self + + def __exit__(self, exc_type, exc_val, exc_tb) -> None: + """Закрытие при выходе из context manager.""" + self.close() diff --git a/src/apps/parsers/frontend_compat.py b/src/apps/parsers/frontend_compat.py new file mode 100644 index 0000000..ad078b4 --- /dev/null +++ b/src/apps/parsers/frontend_compat.py @@ -0,0 +1,790 @@ +"""Compatibility API for frontend contracts that existed on the dev branch.""" + +from __future__ import annotations + +import csv +from collections.abc import Iterable +from dataclasses import dataclass +from datetime import timedelta +from typing import Any + +from apps.core.response import api_error_response, api_response +from apps.core.services import BackgroundJobService +from apps.parsers.models import ( + GenericParserRecord, + ParserLoadLog, +) +from apps.parsers.serializers import ( + ParserLoadLogSerializer, + ParserRunRequestSerializer, +) +from apps.parsers.source_registry import PARSER_SOURCES +from apps.parsers.views import ( + NATIVE_RECORD_MODELS, + TASKS_BY_NAME, + build_task_kwargs, +) +from django.core.cache import cache +from django.core.paginator import Paginator 
+from django.db.models import CharField, Max, Q +from django.db.models.functions import Cast +from django.http import Http404, HttpResponse +from drf_yasg import openapi +from drf_yasg.utils import swagger_auto_schema +from rest_framework import status +from rest_framework.exceptions import ValidationError +from rest_framework.permissions import IsAuthenticated +from rest_framework.response import Response +from rest_framework.views import APIView + +FRONTEND_SOURCES_TAG = "Frontend Sources" +PARSING_SETTINGS_TAG = "Parsing Settings" +SYSTEM_LOGS_TAG = "System Logs" + +ACTIVE_JOB_STATUSES = {"pending", "started", "retry"} +SUCCESS_LOAD_STATUSES = {"success", "skipped"} +ERROR_LOAD_STATUSES = {"failed", "failure", "error"} + +PARSING_SETTINGS_CACHE_KEY = "parsers:frontend_compat:parsing_settings" +PARSING_SETTINGS_FIELDS = { + "manufacturers_and_products", + "public_procurements", + "defense_unreliable_suppliers", + "planned_inspections", + "arbitration_cases", + "bankruptcy_procedures", + "information_security_registries", +} +PARSING_SETTINGS_DEFAULTS = { + "manufacturers_and_products": "daily", + "public_procurements": "daily", + "defense_unreliable_suppliers": "weekly", + "planned_inspections": "monthly", + "arbitration_cases": "daily", + "bankruptcy_procedures": "daily", + "information_security_registries": "yearly", +} +PARSING_SETTINGS_FREQUENCIES = {"daily", "weekly", "monthly", "yearly"} +LOG_STATUS_LABELS = { + "success": "Успешно", + "failed": "Ошибка", + "failure": "Ошибка", + "error": "Ошибка", + "in_progress": "В процессе", + "pending": "В очереди", + "started": "В процессе", + "retry": "Повтор", + "skipped": "Пропущено", +} + + +@dataclass(frozen=True) +class FrontendSourceCardDefinition: + """Aggregated source card expected by the old frontend API.""" + + slug: str + title: str + description: str + order: int + source_keys: tuple[str, ...] 
+ refresh_interval: timedelta | None = None + is_available: bool = True + refresh_params: tuple[dict[str, Any], ...] = () + + +SOURCE_CARD_DEFINITIONS = ( + FrontendSourceCardDefinition( + slug="financial-indicators", + title="Финансово-экономические показатели", + description="Финансовая отчетность и показатели ФНС.", + order=10, + source_keys=("fns_financial",), + refresh_interval=timedelta(days=1), + ), + FrontendSourceCardDefinition( + slug="public-procurements", + title="Государственные закупки по 44-ФЗ и 223-ФЗ", + description="Извещения, закупочные процедуры и контракты ЕИС.", + order=20, + source_keys=("procurements_44fz", "procurements_223fz", "contracts"), + refresh_interval=timedelta(hours=6), + ), + FrontendSourceCardDefinition( + slug="manufacturers-and-products", + title="Производители и продукция России", + description="Данные Минпромторга о производителях и промышленной продукции.", + order=30, + source_keys=("industrial", "manufactures", "mpt_products"), + refresh_interval=timedelta(days=1), + ), + FrontendSourceCardDefinition( + slug="planned-inspections", + title="Плановые проверки Генпрокуратуры России", + description="Плановые и внеплановые проверки из открытых данных.", + order=40, + source_keys=("inspections", "sync_inspections"), + refresh_params=( + { + "name": "max_months_per_law", + "label": "Месяцев на закон", + "description": "Ограничение объема синхронизации проверок.", + "required": False, + "type": "integer", + "default": 1, + }, + ), + ), + FrontendSourceCardDefinition( + slug="defense-unreliable-suppliers", + title="Недобросовестные поставщики и ГОЗ", + description="Реестры ФАС и ЕИС по поставщикам и уклонению от ГОЗ.", + order=50, + source_keys=("unfair_suppliers", "fas_goz"), + refresh_interval=timedelta(days=1), + ), + FrontendSourceCardDefinition( + slug="arbitration-cases", + title="Арбитражные дела", + description="Арбитражные дела по организациям.", + order=60, + source_keys=("arbitration",), + 
refresh_interval=timedelta(days=1), + ), + FrontendSourceCardDefinition( + slug="bankruptcy-procedures", + title="Банкротства Федресурс", + description="Сведения о процедурах банкротства.", + order=70, + source_keys=("fedresurs_bankruptcy",), + refresh_interval=timedelta(days=1), + ), + FrontendSourceCardDefinition( + slug="information-security-registries", + title="Реестры ФСТЭК", + description="Реестры по информационной безопасности.", + order=80, + source_keys=("fstec",), + refresh_interval=timedelta(days=30), + ), + FrontendSourceCardDefinition( + slug="labor-vacancies", + title="Вакансии Работа России", + description="Вакансии работодателей из ЕЦП Работа в России.", + order=90, + source_keys=("trudvsem",), + refresh_params=( + { + "name": "company_inn", + "label": "ИНН работодателя", + "description": "Фильтр вакансий по ИНН работодателя.", + "required": False, + "type": "string", + "default": None, + }, + { + "name": "text", + "label": "Текст", + "description": "Поисковая строка вакансии.", + "required": False, + "type": "string", + "default": None, + }, + { + "name": "limit", + "label": "Лимит", + "description": "Размер страницы API Работа России.", + "required": False, + "type": "integer", + "default": 100, + }, + ), + ), +) +SOURCE_CARD_BY_SLUG = {item.slug: item for item in SOURCE_CARD_DEFINITIONS} + + +def _source_keys_to_model_sources(source_keys: Iterable[str]) -> list[str]: + """Map source registry keys to ParserLoadLog source values without duplicates.""" + sources = [] + for source_key in source_keys: + descriptor = PARSER_SOURCES.get(source_key) + if descriptor and descriptor.source not in sources: + sources.append(descriptor.source) + return sources + + +def _card_slug_for_parser_source(parser_source: str) -> str | None: + for definition in SOURCE_CARD_DEFINITIONS: + if parser_source in _source_keys_to_model_sources(definition.source_keys): + return definition.slug + return None + + +def _card_title_for_parser_source(parser_source: str) -> str | 
None: + slug = _card_slug_for_parser_source(parser_source) + if not slug: + return None + return SOURCE_CARD_BY_SLUG[slug].title + + +def _get_card_definition(slug: str) -> FrontendSourceCardDefinition: + definition = SOURCE_CARD_BY_SLUG.get(slug) + if definition is None: + raise Http404("Карточка источника не найдена") + return definition + + +def _record_queryset_for_source(source: str): + if source in NATIVE_RECORD_MODELS: + return NATIVE_RECORD_MODELS[source].objects.all() + return GenericParserRecord.objects.filter(source=source) + + +def _records_count_for_source(source: str) -> int: + return _record_queryset_for_source(source).count() + + +def _organizations_count_for_source(source: str) -> int: + queryset = _record_queryset_for_source(source) + field = "inn" + return queryset.exclude(**{field: ""}).values(field).distinct().count() + + +def _data_timestamp_for_source(source: str): + return _record_queryset_for_source(source).aggregate( + last_updated=Max("updated_at") + )["last_updated"] + + +def _latest_load_for_sources( + sources: list[str], + *, + statuses: set[str] | None = None, +) -> ParserLoadLog | None: + queryset = ParserLoadLog.objects.filter(source__in=sources) + if statuses: + queryset = queryset.filter(status__in=statuses) + return queryset.order_by("-updated_at", "-created_at").first() + + +def _serialize_load(load_log: ParserLoadLog | None) -> dict[str, Any] | None: + if load_log is None: + return None + return { + "batch_id": load_log.batch_id, + "source": load_log.source, + "source_display": load_log.get_source_display(), + "records_count": load_log.records_count, + "status": load_log.status, + "error_message": load_log.error_message, + "created_at": load_log.created_at, + "updated_at": load_log.updated_at, + } + + +def _serialize_active_job(job) -> dict[str, Any]: + return { + "task_id": job.task_id, + "task_name": job.task_name, + "status": job.status, + "progress": job.progress, + "progress_message": job.progress_message, + "started_at": 
job.started_at, + "created_at": job.created_at, + "meta": job.meta, + } + + +def _active_tasks_for_definition( + definition: FrontendSourceCardDefinition, +) -> list[dict]: + task_names = [ + PARSER_SOURCES[source_key].task_name + for source_key in definition.source_keys + if source_key in PARSER_SOURCES + ] + queryset = BackgroundJobService.get_queryset().filter( + task_name__in=task_names, + status__in=ACTIVE_JOB_STATUSES, + ) + return [_serialize_active_job(job) for job in queryset.order_by("-created_at")[:10]] + + +def _status_label(status_value: str) -> str: + labels = { + "success": "Обновлено", + "in_progress": "В процессе", + "error": "Ошибка", + "idle": "Нет данных", + "unavailable": "Не подключено", + } + return labels.get(status_value, status_value) + + +def _status_for_card( + definition: FrontendSourceCardDefinition, + *, + active_tasks: list[dict], + latest_load: ParserLoadLog | None, + last_updated_at, +) -> str: + if not definition.is_available: + return "unavailable" + if active_tasks or (latest_load and latest_load.status == "in_progress"): + return "in_progress" + if latest_load and latest_load.status in ERROR_LOAD_STATUSES: + return "error" + if last_updated_at: + return "success" + return "idle" + + +def _build_source_item(source_key: str) -> dict[str, Any]: + descriptor = PARSER_SOURCES[source_key] + source = descriptor.source + latest_load = _latest_load_for_sources([source]) + latest_success_load = _latest_load_for_sources( + [source], statuses=SUCCESS_LOAD_STATUSES + ) + last_updated_at = ( + latest_success_load.updated_at + if latest_success_load + else _data_timestamp_for_source(source) + ) + return { + "code": descriptor.key, + "title": descriptor.title, + "description": descriptor.data_scope, + "parser_source": source, + "parser_source_display": descriptor.title, + "records_count": _records_count_for_source(source), + "organizations_count": _organizations_count_for_source(source), + "last_updated_at": last_updated_at, + "latest_load": 
_serialize_load(latest_load), + "latest_success_load": _serialize_load(latest_success_load), + } + + +def _build_source_card(definition: FrontendSourceCardDefinition) -> dict[str, Any]: + source_items = [ + _build_source_item(source_key) + for source_key in definition.source_keys + if source_key in PARSER_SOURCES + ] + sources = _source_keys_to_model_sources(definition.source_keys) + latest_load = _latest_load_for_sources(sources) + latest_success_load = _latest_load_for_sources( + sources, statuses=SUCCESS_LOAD_STATUSES + ) + timestamps = [ + item["last_updated_at"] for item in source_items if item["last_updated_at"] + ] + last_updated_at = latest_success_load.updated_at if latest_success_load else None + if last_updated_at is None and timestamps: + last_updated_at = max(timestamps) + + active_tasks = _active_tasks_for_definition(definition) + progress = ( + round(sum(int(task["progress"]) for task in active_tasks) / len(active_tasks)) + if active_tasks + else 0 + ) + status_value = _status_for_card( + definition, + active_tasks=active_tasks, + latest_load=latest_load, + last_updated_at=last_updated_at, + ) + next_update_at = ( + last_updated_at + definition.refresh_interval + if last_updated_at and definition.refresh_interval + else None + ) + + return { + "slug": definition.slug, + "title": definition.title, + "description": definition.description, + "order": definition.order, + "is_available": definition.is_available, + "status": status_value, + "status_label": _status_label(status_value), + "progress": progress, + "records_count": sum(item["records_count"] for item in source_items), + "organizations_count": sum( + item["organizations_count"] for item in source_items + ), + "last_updated_at": last_updated_at, + "next_update_at": next_update_at, + "error_message": latest_load.error_message if latest_load else "", + "task_names": [ + PARSER_SOURCES[source_key].task_name + for source_key in definition.source_keys + if source_key in PARSER_SOURCES + ], + 
"refresh_requires_params": any( + item.get("required") for item in definition.refresh_params + ), + "refresh_params": list(definition.refresh_params), + "active_tasks": active_tasks, + "source_items": source_items, + "latest_load": _serialize_load(latest_load), + "latest_success_load": _serialize_load(latest_success_load), + } + + +def _source_status_rows() -> list[dict[str, Any]]: + cards = sorted( + (_build_source_card(definition) for definition in SOURCE_CARD_DEFINITIONS), + key=lambda item: ( + item["last_updated_at"] is None, + -(item["last_updated_at"].timestamp()) if item["last_updated_at"] else 0, + item["title"], + ), + ) + return [ + { + "row_number": index, + "slug": card["slug"], + "source": card["title"], + "status": card["status"], + "status_label": card["status_label"], + "actualized_at": card["last_updated_at"], + "next_update_at": card["next_update_at"], + "records_count": card["records_count"], + "organizations_count": card["organizations_count"], + "progress": card["progress"], + "error_message": card["error_message"], + "active_tasks": card["active_tasks"], + } + for index, card in enumerate(cards, start=1) + ] + + +def _refresh_params_from_request(request) -> dict[str, Any]: + raw_params = request.data.get("params", request.data) + return dict(raw_params) if isinstance(raw_params, dict) else {} + + +def _start_source_refresh( + source_key: str, params: dict[str, Any], user_id: int +) -> dict[str, str]: + descriptor = PARSER_SOURCES[source_key] + serializer = ParserRunRequestSerializer(data=params) + serializer.is_valid(raise_exception=True) + validated = { + key: value + for key, value in serializer.validated_data.items() + if value not in ("", None) + } + task = TASKS_BY_NAME[descriptor.task_name] + task_kwargs = build_task_kwargs(source_key, validated, user_id) + async_result = task.delay(**task_kwargs) + return {"task_id": async_result.id, "task_name": descriptor.task_name} + + +class SourceCardListCompatView(APIView): + """Old frontend 
class SourceCardDetailCompatView(APIView):
    """Old frontend alias: GET /api/v1/sources/{slug}/."""

    permission_classes = [IsAuthenticated]

    @swagger_auto_schema(
        operation_summary="Frontend source card detail",
        tags=[FRONTEND_SOURCES_TAG],
        responses={200: "Source card detail", 404: "Not found"},
    )
    def get(self, request, slug: str):
        # Resolve the card definition for the slug, build the full card
        # payload from it and wrap the payload in the standard API envelope.
        card_definition = _get_card_definition(slug)
        card_payload = _build_source_card(card_definition)
        return api_response(card_payload)
class ParsingSettingsCompatView(APIView):
    """Old frontend alias: GET/PATCH /api/v1/parsing/settings/."""

    permission_classes = [IsAuthenticated]

    @staticmethod
    def _get_settings() -> dict[str, str]:
        # Values stored in the cache override the defaults; a missing cache
        # entry yields the defaults alone.
        cached = cache.get(PARSING_SETTINGS_CACHE_KEY) or {}
        return {**PARSING_SETTINGS_DEFAULTS, **cached}

    @swagger_auto_schema(
        operation_summary="Get parsing settings",
        tags=[PARSING_SETTINGS_TAG],
        responses={200: "Parsing settings"},
    )
    def get(self, request):
        return Response(self._get_settings(), status=status.HTTP_200_OK)

    @swagger_auto_schema(
        operation_summary="Update parsing settings",
        request_body=openapi.Schema(type=openapi.TYPE_OBJECT),
        tags=[PARSING_SETTINGS_TAG],
        responses={200: "Parsing settings"},
    )
    def patch(self, request):
        # Only a JSON object (or form data) can carry "setting -> frequency"
        # pairs. A list/scalar body has no items() and previously surfaced
        # as an unhandled exception instead of a 400 response.
        if not isinstance(request.data, dict):
            raise ValidationError(
                {"detail": "Ожидается JSON-объект с настройками"}
            )
        updates = dict(request.data.items())

        unknown_fields = set(updates) - PARSING_SETTINGS_FIELDS
        if unknown_fields:
            raise ValidationError(
                {
                    "detail": "Неизвестные настройки: "
                    + ", ".join(sorted(unknown_fields))
                }
            )

        # Non-string values (lists, dicts, numbers) can never be a valid
        # frequency. Checking the type first also avoids a TypeError from
        # the membership test when the value is unhashable.
        invalid = [
            key
            for key, value in updates.items()
            if not isinstance(value, str)
            or value not in PARSING_SETTINGS_FREQUENCIES
        ]
        if invalid:
            raise ValidationError(
                {
                    key: "Значение должно быть daily, weekly, monthly или yearly"
                    for key in invalid
                }
            )

        # NOTE(review): the settings live only in the cache (timeout=None);
        # confirm the configured cache backend is persistent enough for this.
        settings_payload = {**self._get_settings(), **updates}
        cache.set(PARSING_SETTINGS_CACHE_KEY, settings_payload, timeout=None)
        return Response(settings_payload, status=status.HTTP_200_OK)
@swagger_auto_schema(
    operation_summary="Parser load logs",
    manual_parameters=[
        openapi.Parameter("source", openapi.IN_QUERY, type=openapi.TYPE_STRING),
        openapi.Parameter("status", openapi.IN_QUERY, type=openapi.TYPE_STRING),
        openapi.Parameter("batch_id", openapi.IN_QUERY, type=openapi.TYPE_INTEGER),
        openapi.Parameter("search", openapi.IN_QUERY, type=openapi.TYPE_STRING),
        openapi.Parameter("page", openapi.IN_QUERY, type=openapi.TYPE_INTEGER),
        openapi.Parameter("page_size", openapi.IN_QUERY, type=openapi.TYPE_INTEGER),
    ],
    tags=[SYSTEM_LOGS_TAG],
    responses={200: "Paginated parser logs"},
)
def get(self, request):
    # Clamp page_size to 1..100 and page to >= 1; non-integer input is a
    # client error (400).
    try:
        page_size = max(1, min(int(request.query_params.get("page_size", 20)), 100))
        page = max(1, int(request.query_params.get("page", 1)))
    except (TypeError, ValueError) as exc:
        raise ValidationError(
            {"detail": "Параметры page и page_size должны быть целыми числами"}
        ) from exc

    # Paginate the queryset itself and serialize only the requested page.
    # Serializing first (as before) loaded and converted every matching
    # log row on each request just to return at most 100 of them.
    paginator = Paginator(self._get_queryset(request), page_size)
    page_obj = paginator.get_page(page)
    return Response(
        {
            "count": paginator.count,
            "next": None,
            "previous": None,
            "results": self._serialize_rows(page_obj.object_list),
        },
        status=status.HTTP_200_OK,
    )
def _serialize_log_row(log: ParserLoadLog) -> dict[str, Any]:
    """Convert one parser load log into the row shape used by the log views.

    The ``source`` and ``source_label`` values prefer the frontend card slug
    and title for the log's source; when no card value is returned they fall
    back to the raw source code and its choice display name.
    """
    parser_source = log.source
    card_slug = _card_slug_for_parser_source(parser_source)
    card_title = _card_title_for_parser_source(parser_source)

    row: dict[str, Any] = {}
    row["id"] = log.id
    row["batch_id"] = log.batch_id
    row["source"] = card_slug or parser_source
    row["source_label"] = card_title or log.get_source_display()
    row["records_count"] = log.records_count
    row["organizations_count"] = _organizations_count_for_source(parser_source)
    row["status"] = log.status
    # Unknown statuses are shown as-is rather than dropped.
    row["status_label"] = LOG_STATUS_LABELS.get(log.status, log.status)
    row["error_message"] = log.error_message
    row["created_at"] = log.created_at
    row["updated_at"] = log.updated_at
    return row
models.CharField(choices=[('industrial', 'Промышленное производство'), ('manufactures', 'Реестр производителей'), ('inspections', 'Единый реестр проверок'), ('mpt_products', 'Продукция Минпромторга'), ('procurements_44fz', 'Закупки 44-ФЗ'), ('procurements_223fz', 'Закупки 223-ФЗ'), ('contracts', 'Контракты ЕИС'), ('unfair_suppliers', 'Недобросовестные поставщики'), ('fas_goz', 'Уклонение от ГОЗ'), ('fns_financial', 'Финансово-экономические показатели ФНС'), ('arbitration', 'Арбитражные дела'), ('fedresurs_bankruptcy', 'Банкротства Федресурс'), ('fstec', 'Реестры ФСТЭК'), ('trudvsem', 'Вакансии Работа России')], db_index=True, help_text='Источник данных', max_length=50, verbose_name='источник')), + ('external_id', models.CharField(db_index=True, help_text='Стабильный идентификатор записи во внешнем источнике', max_length=255, verbose_name='внешний ID')), + ('inn', models.CharField(blank=True, db_index=True, help_text='ИНН организации, если есть в источнике', max_length=20, verbose_name='ИНН')), + ('ogrn', models.CharField(blank=True, db_index=True, help_text='ОГРН организации, если есть в источнике', max_length=20, verbose_name='ОГРН')), + ('organisation_name', models.TextField(blank=True, help_text='Наименование организации из источника', verbose_name='наименование организации')), + ('title', models.TextField(blank=True, help_text='Краткое описание записи', verbose_name='заголовок')), + ('record_date', models.CharField(blank=True, db_index=True, help_text='Дата записи в формате источника', max_length=30, verbose_name='дата записи')), + ('amount', models.DecimalField(blank=True, decimal_places=2, help_text='Сумма, если источник её содержит', max_digits=20, null=True, verbose_name='сумма')), + ('status', models.CharField(blank=True, help_text='Статус записи во внешнем источнике', max_length=255, verbose_name='статус')), + ('url', models.TextField(blank=True, help_text='Ссылка на карточку/документ во внешнем источнике', verbose_name='URL')), + ('payload', 
models.JSONField(blank=True, default=dict, help_text='Нормализованный исходный документ', verbose_name='исходные данные')), + ], + options={ + 'verbose_name': 'запись внешнего источника', + 'verbose_name_plural': 'записи внешних источников', + 'db_table': 'parsers_generic_record', + 'ordering': ['-created_at'], + }, + ), + migrations.AlterField( + model_name='parserloadlog', + name='source', + field=models.CharField(choices=[('industrial', 'Промышленное производство'), ('manufactures', 'Реестр производителей'), ('inspections', 'Единый реестр проверок'), ('mpt_products', 'Продукция Минпромторга'), ('procurements_44fz', 'Закупки 44-ФЗ'), ('procurements_223fz', 'Закупки 223-ФЗ'), ('contracts', 'Контракты ЕИС'), ('unfair_suppliers', 'Недобросовестные поставщики'), ('fas_goz', 'Уклонение от ГОЗ'), ('fns_financial', 'Финансово-экономические показатели ФНС'), ('arbitration', 'Арбитражные дела'), ('fedresurs_bankruptcy', 'Банкротства Федресурс'), ('fstec', 'Реестры ФСТЭК'), ('trudvsem', 'Вакансии Работа России')], db_index=True, help_text='Источник данных', max_length=50, verbose_name='источник'), + ), + migrations.AddIndex( + model_name='genericparserrecord', + index=models.Index(fields=['source', 'load_batch'], name='parsers_gen_source_a854d0_idx'), + ), + migrations.AddIndex( + model_name='genericparserrecord', + index=models.Index(fields=['source', 'inn'], name='parsers_gen_source_b29dc4_idx'), + ), + migrations.AddIndex( + model_name='genericparserrecord', + index=models.Index(fields=['source', 'ogrn'], name='parsers_gen_source_937e39_idx'), + ), + migrations.AddIndex( + model_name='genericparserrecord', + index=models.Index(fields=['source', 'record_date'], name='parsers_gen_source_aaa60a_idx'), + ), + migrations.AddConstraint( + model_name='genericparserrecord', + constraint=models.UniqueConstraint(fields=('source', 'external_id'), name='unique_generic_source_external_id'), + ), + ] diff --git a/src/apps/parsers/migrations/0007_parserbatchsequence.py 
class Migration(migrations.Migration):
    """Create the ``parsers_batch_sequence`` table (model ParserBatchSequence).

    Auto-generated by Django; the field definitions below must stay in sync
    with the model state and should not be edited by hand.
    """

    # Must run after the migration that introduced GenericParserRecord.
    dependencies = [
        ('parsers', '0006_add_generic_parser_record'),
    ]

    operations = [
        migrations.CreateModel(
            name='ParserBatchSequence',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('created_at', models.DateTimeField(auto_now_add=True, db_index=True, help_text='Дата и время создания записи', verbose_name='создано')),
                ('updated_at', models.DateTimeField(auto_now=True, help_text='Дата и время последнего обновления', verbose_name='обновлено')),
                # unique=True: at most one counter row per source.
                ('source', models.CharField(choices=[('industrial', 'Промышленное производство'), ('manufactures', 'Реестр производителей'), ('inspections', 'Единый реестр проверок'), ('mpt_products', 'Продукция Минпромторга'), ('procurements_44fz', 'Закупки 44-ФЗ'), ('procurements_223fz', 'Закупки 223-ФЗ'), ('contracts', 'Контракты ЕИС'), ('unfair_suppliers', 'Недобросовестные поставщики'), ('fas_goz', 'Уклонение от ГОЗ'), ('fns_financial', 'Финансово-экономические показатели ФНС'), ('arbitration', 'Арбитражные дела'), ('fedresurs_bankruptcy', 'Банкротства Федресурс'), ('fstec', 'Реестры ФСТЭК'), ('trudvsem', 'Вакансии Работа России')], help_text='Источник данных', max_length=50, unique=True, verbose_name='источник')),
                # Counter value; new rows start at 1.
                ('next_batch_id', models.PositiveIntegerField(default=1, help_text='Следующий batch_id для источника', verbose_name='следующий ID пакета')),
            ],
            options={
                'verbose_name': 'счётчик пакетов парсера',
                'verbose_name_plural': 'счётчики пакетов парсеров',
                'db_table': 'parsers_batch_sequence',
                'ordering': ['source'],
            },
        ),
    ]
def seed_weekly_parser_schedules(apps, schema_editor):
    """Create or refresh the default weekly periodic task for every parser.

    For each ``(source_key, task_name)`` pair in ``PARSER_WEEKLY_TASKS`` a
    ``PeriodicTask`` named ``parser:<source_key>:weekly-saturday-msk`` is
    created, or updated if it already exists, and bound to the crontab
    described by ``WEEKLY_MSK_CRON``.

    Args:
        apps: Historical app registry supplied by the migration framework.
        schema_editor: Unused; part of the ``RunPython`` callable signature.
    """
    CrontabSchedule = apps.get_model("django_celery_beat", "CrontabSchedule")
    PeriodicTask = apps.get_model("django_celery_beat", "PeriodicTask")

    # Reuse an existing matching crontab when there is one. get_or_create()
    # would raise MultipleObjectsReturned and abort the migration if
    # identical crontab rows already exist, so pick the oldest match.
    crontab = CrontabSchedule.objects.filter(**WEEKLY_MSK_CRON).order_by("pk").first()
    if crontab is None:
        crontab = CrontabSchedule.objects.create(**WEEKLY_MSK_CRON)

    # A task must point at the crontab only: clear the other schedule
    # relations, but only those that exist in the historical model state.
    field_names = {field.name for field in PeriodicTask._meta.fields}
    schedule_fields = {"crontab": crontab}
    for field_name in ("interval", "solar", "clocked"):
        if field_name in field_names:
            schedule_fields[field_name] = None

    for source_key, task_name in PARSER_WEEKLY_TASKS:
        PeriodicTask.objects.update_or_create(
            name=f"parser:{source_key}:weekly-saturday-msk",
            defaults={
                "task": task_name,
                "args": json.dumps([]),
                "kwargs": json.dumps({}),
                "enabled": True,
                "description": (
                    "Default parser schedule: weekly on Saturday 00:00 MSK."
                ),
                **schedule_fields,
            },
        )
class ParserBatchSequence(TimestampMixin, models.Model):
    """
    Per-source batch_id counter.

    A dedicated row per source gives Celery a stable locking point so that
    parallel runs of the same parser do not receive the same batch_id.
    """

    # Source code, restricted to ParserLoadLog.Source choices; unique=True
    # means at most one counter row exists per source.
    source = models.CharField(
        _("источник"),
        max_length=50,
        choices=ParserLoadLog.Source.choices,
        unique=True,
        help_text=_("Источник данных"),
    )
    # Counter value for the source; a freshly created row starts at 1.
    next_batch_id = models.PositiveIntegerField(
        _("следующий ID пакета"),
        default=1,
        help_text=_("Следующий batch_id для источника"),
    )

    class Meta:
        db_table = "parsers_batch_sequence"
        verbose_name = _("счётчик пакетов парсера")
        verbose_name_plural = _("счётчики пакетов парсеров")
        ordering = ["source"]

    def __str__(self) -> str:
        return f"{self.source}: next batch {self.next_batch_id}"
+ """ + + load_batch = models.PositiveIntegerField( + _("ID пакета загрузки"), + db_index=True, + help_text=_("Идентификатор пакета загрузки"), + ) + source = models.CharField( + _("источник"), + max_length=50, + choices=ParserLoadLog.Source.choices, + db_index=True, + help_text=_("Источник данных"), + ) + external_id = models.CharField( + _("внешний ID"), + max_length=255, + db_index=True, + help_text=_("Стабильный идентификатор записи во внешнем источнике"), + ) + inn = models.CharField( + _("ИНН"), + max_length=20, + blank=True, + db_index=True, + help_text=_("ИНН организации, если есть в источнике"), + ) + ogrn = models.CharField( + _("ОГРН"), + max_length=20, + blank=True, + db_index=True, + help_text=_("ОГРН организации, если есть в источнике"), + ) + organisation_name = models.TextField( + _("наименование организации"), + blank=True, + help_text=_("Наименование организации из источника"), + ) + title = models.TextField( + _("заголовок"), + blank=True, + help_text=_("Краткое описание записи"), + ) + record_date = models.CharField( + _("дата записи"), + max_length=30, + blank=True, + db_index=True, + help_text=_("Дата записи в формате источника"), + ) + amount = models.DecimalField( + _("сумма"), + max_digits=20, + decimal_places=2, + null=True, + blank=True, + help_text=_("Сумма, если источник её содержит"), + ) + status = models.CharField( + _("статус"), + max_length=255, + blank=True, + help_text=_("Статус записи во внешнем источнике"), + ) + url = models.TextField( + _("URL"), + blank=True, + help_text=_("Ссылка на карточку/документ во внешнем источнике"), + ) + payload = models.JSONField( + _("исходные данные"), + default=dict, + blank=True, + help_text=_("Нормализованный исходный документ"), + ) + + class Meta: + db_table = "parsers_generic_record" + verbose_name = _("запись внешнего источника") + verbose_name_plural = _("записи внешних источников") + ordering = ["-created_at"] + indexes = [ + models.Index(fields=["source", "load_batch"]), + 
models.Index(fields=["source", "inn"]), + models.Index(fields=["source", "ogrn"]), + models.Index(fields=["source", "record_date"]), + ] + constraints = [ + models.UniqueConstraint( + fields=["source", "external_id"], + name="unique_generic_source_external_id", + ), + ] + + def __str__(self) -> str: + label = self.organisation_name or self.title or self.external_id + return f"{self.source}: {label[:80]}" + + class Proxy(TimestampMixin, models.Model): """ Прокси-сервер для парсеров. diff --git a/src/apps/parsers/serializers.py b/src/apps/parsers/serializers.py index 0d53405..c7383c8 100644 --- a/src/apps/parsers/serializers.py +++ b/src/apps/parsers/serializers.py @@ -1,7 +1,328 @@ -""" -Сериализаторы для приложения парсеров. +"""Сериализаторы для приложения парсеров.""" -TODO: Добавить сериализаторы по мере необходимости. -""" +import ipaddress +import socket +from urllib.parse import urlsplit -# Сериализаторы будут добавлены по мере разработки конкретных парсеров +from apps.parsers.clients.common.structured import ( + MAX_FILE_SIZE_BYTES, + SUPPORTED_EXCEL_EXTENSIONS, + SUPPORTED_ZIP_EXTENSIONS, +) +from apps.parsers.models import GenericParserRecord, ParserLoadLog +from rest_framework import serializers + +BLOCKED_FILE_HOSTS = {"localhost", "localhost.localdomain"} +BLOCKED_FILE_HOST_SUFFIXES = (".localhost", ".local", ".internal") +SUPPORTED_UPLOAD_EXTENSIONS = tuple( + sorted({*SUPPORTED_EXCEL_EXTENSIONS, *SUPPORTED_ZIP_EXTENSIONS, ".zip"}) +) + + +def _is_blocked_ip(address: str) -> bool: + """Проверить, что адрес не указывает во внутреннюю сеть worker'а.""" + ip = ipaddress.ip_address(address) + return ( + ip.is_loopback + or ip.is_private + or ip.is_link_local + or ip.is_multicast + or ip.is_reserved + or ip.is_unspecified + ) + + +def _validate_public_file_host(host: str) -> None: + """Запретить очевидные SSRF-цели и DNS-имена, резолвящиеся внутрь сети.""" + normalized_host = host.strip("[]").rstrip(".").lower() + if normalized_host in BLOCKED_FILE_HOSTS or 
class ParserRunRequestSerializer(serializers.Serializer):
    """Launch parameters for a parser Celery task."""

    file_url = serializers.URLField(required=False, allow_blank=True)
    proxies = serializers.ListField(
        child=serializers.CharField(), required=False, allow_empty=True
    )
    year = serializers.IntegerField(required=False, min_value=2000, max_value=2100)
    month = serializers.IntegerField(required=False, min_value=1, max_value=12)
    limit = serializers.IntegerField(required=False, min_value=1, max_value=1000)
    offset = serializers.IntegerField(required=False, min_value=0)
    max_months_per_law = serializers.IntegerField(
        required=False, min_value=1, max_value=36
    )
    start_year = serializers.IntegerField(
        required=False,
        min_value=2000,
        max_value=2100,
    )
    start_month = serializers.IntegerField(required=False, min_value=1, max_value=12)
    include_fz294 = serializers.BooleanField(required=False)
    include_fz248 = serializers.BooleanField(required=False)
    region_code = serializers.CharField(required=False, allow_blank=True)
    company_inn = serializers.CharField(required=False, allow_blank=True)
    text = serializers.CharField(required=False, allow_blank=True)

    def validate_file_url(self, value: str) -> str:
        """Allow only public HTTPS URLs without embedded credentials."""
        if value:
            url_parts = urlsplit(value)
            if url_parts.scheme != "https":
                raise serializers.ValidationError("file_url must use https")
            if url_parts.username or url_parts.password:
                raise serializers.ValidationError(
                    "file_url credentials are not allowed"
                )
            host = url_parts.hostname
            if not host:
                raise serializers.ValidationError("file_url host is required")
            # Rejects hosts that are, or resolve to, internal addresses.
            _validate_public_file_host(host)
        # A blank value passes through unchanged.
        return value

    def validate(self, attrs):
        """Require start_year and start_month to be supplied as a pair."""
        attrs = super().validate(attrs)
        has_start_year = "start_year" in attrs
        has_start_month = "start_month" in attrs
        if has_start_year != has_start_month:
            message = "start_year and start_month must be provided together"
            raise serializers.ValidationError({"start_month": (message)})
        return attrs
class ParserScheduleRequestSerializer(ParserRunRequestSerializer):
    """Parameters for creating/updating a periodic parser Celery task."""

    SCHEDULE_TYPES = ("interval", "crontab")
    PERIODS = ("seconds", "minutes", "hours", "days")

    source_key = serializers.CharField(required=False)
    name = serializers.CharField(required=False, allow_blank=True, max_length=200)
    enabled = serializers.BooleanField(required=False, default=True)
    schedule_type = serializers.ChoiceField(
        choices=SCHEDULE_TYPES, required=False, default="interval"
    )
    # Interval schedule: run every <every> <period>.
    every = serializers.IntegerField(required=False, min_value=1)
    period = serializers.ChoiceField(choices=PERIODS, required=False, default="hours")
    # Crontab schedule fields.
    minute = serializers.CharField(required=False, allow_blank=True, default="0")
    hour = serializers.CharField(required=False, allow_blank=True, default="*")
    day_of_week = serializers.CharField(required=False, allow_blank=True, default="*")
    day_of_month = serializers.CharField(required=False, allow_blank=True, default="*")
    month_of_year = serializers.CharField(required=False, allow_blank=True, default="*")

    def validate(self, attrs):
        # Run the parent checks first, then require a non-zero "every" when
        # the schedule is (explicitly or by default) an interval schedule.
        attrs = super().validate(attrs)
        is_interval = attrs.get("schedule_type", "interval") == "interval"
        if is_interval and not attrs.get("every"):
            errors = {"every": "Required for interval schedule"}
            raise serializers.ValidationError(errors)
        return attrs
class ParserResultQuerySerializer(serializers.Serializer):
    """Query parameters accepted by the per-source result endpoints."""

    page = serializers.IntegerField(required=False, default=1, min_value=1)
    page_size = serializers.IntegerField(
        required=False,
        default=20,
        min_value=1,
        max_value=100,
    )
    limit = serializers.IntegerField(required=False, min_value=1, max_value=100)
    id = serializers.IntegerField(required=False, min_value=1)
    source = serializers.CharField(required=False, allow_blank=True)
    external_id = serializers.CharField(required=False, allow_blank=True)
    inn = serializers.CharField(required=False, allow_blank=True)
    ogrn = serializers.CharField(required=False, allow_blank=True)
    load_batch = serializers.IntegerField(required=False, min_value=1)
    batch_id = serializers.IntegerField(required=False, min_value=1)
    status = serializers.CharField(required=False, allow_blank=True)
    record_date = serializers.CharField(required=False, allow_blank=True)
    search = serializers.CharField(required=False, allow_blank=True)
    ordering = serializers.CharField(required=False, allow_blank=True)
    include_payload = serializers.BooleanField(required=False, default=True)

    def validate(self, attrs):
        attrs = super().validate(attrs)

        # "limit" is shorthand for "first page of that size".
        limit = attrs.get("limit")
        if limit:
            attrs["page"] = 1
            attrs["page_size"] = limit

        # "batch_id" is an alias used only when "load_batch" is not given.
        batch_id = attrs.get("batch_id")
        if batch_id and not attrs.get("load_batch"):
            attrs["load_batch"] = batch_id

        return attrs
100644 --- a/src/apps/parsers/services.py +++ b/src/apps/parsers/services.py @@ -7,21 +7,45 @@ import logging from apps.core.services import BaseService, BulkOperationsMixin +from apps.parsers.clients.common.schemas import GenericParserItem from apps.parsers.clients.minpromtorg.schemas import IndustrialCertificate, Manufacturer from apps.parsers.clients.proverki.schemas import Inspection from apps.parsers.models import ( + GenericParserRecord, IndustrialCertificateRecord, InspectionRecord, ManufacturerRecord, + ParserBatchSequence, ParserLoadLog, Proxy, ) -from django.db import transaction +from django.db import IntegrityError, transaction from django.utils import timezone logger = logging.getLogger(__name__) +def _dedupe_by_key(items: list, key_getter) -> dict: + """Вернуть первые элементы по уникальному ключу, сохранив порядок.""" + unique = {} + for item in items: + key = key_getter(item) + if key not in unique: + unique[key] = item + return unique + + +def _model_defaults(instance, lookup_fields: list[str]) -> dict: + """Собрать defaults для get_or_create из Django model instance.""" + lookup = set(lookup_fields) + defaults = {} + for field in instance._meta.concrete_fields: + if field.primary_key or field.name in lookup: + continue + defaults[field.name] = getattr(instance, field.name) + return defaults + + class ParserLoadLogService(BaseService[ParserLoadLog]): """ Сервис для управления логами загрузок парсеров. @@ -35,19 +59,83 @@ class ParserLoadLogService(BaseService[ParserLoadLog]): model = ParserLoadLog @classmethod - def get_next_batch_id(cls, source: str) -> int: + def _get_next_batch_id_from_logs(cls, source: str) -> int: """ - Получить следующий batch_id для источника. + Рассчитать следующий batch_id из фактических логов. - Args: - source: Код источника (industrial, manufactures) - - Returns: - Следующий batch_id + Используется для первичной инициализации ParserBatchSequence и + read-only совместимости get_next_batch_id. 
""" last_log = cls.model.objects.filter(source=source).order_by("-batch_id").first() return (last_log.batch_id + 1) if last_log else 1 + @classmethod + def get_next_batch_id(cls, source: str) -> int: + """ + Получить следующий batch_id для источника без резервирования. + + Для запуска парсеров использовать create_next_load_log, который + атомарно резервирует batch_id и сразу создаёт лог загрузки. + """ + sequence = ParserBatchSequence.objects.filter(source=source).first() + if sequence is None: + return cls._get_next_batch_id_from_logs(source) + return max(sequence.next_batch_id, cls._get_next_batch_id_from_logs(source)) + + @classmethod + def create_next_load_log( + cls, + *, + source: str, + records_count: int = 0, + status: str = "success", + error_message: str = "", + ) -> ParserLoadLog: + """ + Атомарно зарезервировать следующий batch_id и создать лог загрузки. + + ParserBatchSequence даёт строку для select_for_update. При первом + конкурентном создании источника возможен IntegrityError по unique(source), + поэтому операция коротко повторяется. 
@classmethod
def _create_with_exact_count(
    cls,
    instances: list[IndustrialCertificateRecord],
    *,
    unique_fields: list[str],
    chunk_size: int,
) -> int:
    """
    Bulk-insert records and report exactly how many rows were created.

    The fast path is a chunked bulk insert. ``ignore_conflicts`` is not used
    because it does not give a reliable count of inserted rows, and a
    concurrent import may still create a conflicting row after the caller's
    pre-check. On such a conflict the bulk insert is rolled back and the
    records are inserted one by one with ``get_or_create``.

    Args:
        instances: Unsaved model instances to insert.
        unique_fields: Field names used as the ``get_or_create`` lookup.
        chunk_size: Chunk size passed to ``bulk_create_chunked``.

    Returns:
        Number of rows actually created.
    """
    try:
        # Nested atomic block: a conflict only undoes the bulk insert.
        with transaction.atomic():
            cls.bulk_create_chunked(instances, chunk_size=chunk_size)
    except IntegrityError:
        logger.info("Falling back to get_or_create after bulk insert conflict")
    else:
        return len(instances)

    inserted = 0
    for row in instances:
        identity = {name: getattr(row, name) for name in unique_fields}
        _, was_created = cls.model.objects.get_or_create(
            defaults=_model_defaults(row, unique_fields),
            **identity,
        )
        inserted += 1 if was_created else 0
    return inserted
chunk_size: int, + ) -> int: + """Создать записи и точно вернуть количество новых строк.""" + try: + with transaction.atomic(): + cls.bulk_create_chunked(instances, chunk_size=chunk_size) + return len(instances) + except IntegrityError: + logger.info("Falling back to get_or_create after bulk insert conflict") + + created_count = 0 + for instance in instances: + lookup = {field: getattr(instance, field) for field in unique_fields} + _, created = cls.model.objects.get_or_create( + defaults=_model_defaults(instance, unique_fields), + **lookup, + ) + if created: + created_count += 1 + return created_count + @classmethod @transaction.atomic def save_manufacturers( @@ -218,6 +383,19 @@ class ManufacturerService(BulkOperationsMixin, BaseService[ManufacturerRecord]): "Saving %d manufacturers (batch_id=%d)", len(manufacturers), batch_id ) + unique_manufacturers = _dedupe_by_key( + manufacturers, + lambda manufacturer: manufacturer.inn, + ) + inns = list(unique_manufacturers.keys()) + existing_inns = set( + cls.model.objects.filter(inn__in=inns).values_list("inn", flat=True) + ) + manufacturers_to_create = [ + manufacturer + for inn, manufacturer in unique_manufacturers.items() + if inn not in existing_inns + ] instances = [ cls.model( load_batch=batch_id, @@ -226,13 +404,16 @@ class ManufacturerService(BulkOperationsMixin, BaseService[ManufacturerRecord]): ogrn=m.ogrn, address=m.address, ) - for m in manufacturers + for m in manufacturers_to_create ] + if not instances: + logger.info("No new manufacturers to save") + return 0 - saved_count = cls.bulk_create_chunked( + saved_count = cls._create_with_exact_count( instances, + unique_fields=["inn"], chunk_size=chunk_size, - ignore_conflicts=True, # Skip duplicates by INN ) logger.info("Saved %d manufacturers", saved_count) @@ -258,6 +439,138 @@ class ManufacturerService(BulkOperationsMixin, BaseService[ManufacturerRecord]): return cls.filter(ogrn=ogrn) +class GenericParserRecordService(BulkOperationsMixin, 
@classmethod
@transaction.atomic
def save_records(
    cls,
    records: list[GenericParserItem],
    batch_id: int,
    *,
    source: str,
    chunk_size: int = 500,
) -> int:
    """
    Persist normalised records of a generic source.

    Records are de-duplicated by ``external_id`` (first occurrence wins),
    and ids already stored for the same ``source`` are skipped.

    Args:
        records: DTOs produced by the source client.
        batch_id: Load batch id stamped on every new row.
        source: Source code (a ``ParserLoadLog.Source`` value).
        chunk_size: Chunk size used for the bulk insert.

    Returns:
        Number of newly created records.
    """
    if not records:
        logger.warning("No generic parser records to save (source=%s)", source)
        return 0

    logger.info(
        "Saving %d generic records (source=%s, batch_id=%d)",
        len(records),
        source,
        batch_id,
    )

    by_external_id = _dedupe_by_key(records, lambda record: record.external_id)
    already_stored = set(
        cls.model.objects.filter(
            source=source,
            external_id__in=list(by_external_id),
        ).values_list("external_id", flat=True)
    )

    new_rows = [
        cls.model(
            load_batch=batch_id,
            source=source,
            external_id=item.external_id,
            inn=item.inn,
            ogrn=item.ogrn,
            organisation_name=item.organisation_name,
            title=item.title,
            record_date=item.record_date,
            amount=item.amount,
            status=item.status,
            url=item.url,
            payload=item.payload,
        )
        for external_id, item in by_external_id.items()
        if external_id not in already_stored
    ]
    if not new_rows:
        logger.info("No new generic records to save (source=%s)", source)
        return 0

    saved_count = cls._create_with_exact_count(
        new_rows,
        unique_fields=["source", "external_id"],
        chunk_size=chunk_size,
    )
    logger.info("Saved %d generic records (source=%s)", saved_count, source)
    return saved_count
@classmethod
def _create_with_exact_count(
    cls,
    instances: list[InspectionRecord],
    *,
    unique_fields: list[str],
    chunk_size: int,
) -> int:
    """Insert inspections and return the exact number of new rows.

    A chunked bulk insert is attempted first inside its own atomic block.
    If it hits an ``IntegrityError``, every instance is retried individually
    through ``get_or_create`` keyed by *unique_fields*, and only rows that
    were really created are counted.
    """
    try:
        with transaction.atomic():
            cls.bulk_create_chunked(instances, chunk_size=chunk_size)
    except IntegrityError:
        logger.info("Falling back to get_or_create after bulk insert conflict")
    else:
        # No conflict: every instance was inserted.
        return len(instances)

    fallback_created = 0
    for record in instances:
        key_values = {name: getattr(record, name) for name in unique_fields}
        created = cls.model.objects.get_or_create(
            defaults=_model_defaults(record, unique_fields),
            **key_values,
        )[1]
        if created:
            fallback_created += 1
    return fallback_created
@dataclass(frozen=True)
class ParserSourceDescriptor:
    """Immutable description of a parser source, used by the API and task launch.

    The URL properties are derived from ``api_route`` / ``upload_route`` and
    return an empty string when the corresponding route is not configured.
    """

    key: str  # registry key of the descriptor
    source: str  # source code stored on load logs / records
    title: str  # human-readable title
    agency: str  # publishing agency
    data_scope: str  # short description of the data covered
    task_name: str  # dotted name of the Celery task that runs the parser
    is_existing: bool = False
    requires_file_url: bool = False
    mode: str = "native_api"
    status: str = "implemented"
    owner: str = ""
    upstream_url: str = ""  # upstream endpoint the parser reads from
    access_method: str = "api"
    parser_strategy: str = "native"
    source_notes: str = ""
    supports_file_upload: bool = False  # whether manual file upload is allowed
    api_route: str = ""  # result route, without the /api/v1 prefix
    upload_route: str = ""  # explicit upload route, without the /api/v1 prefix

    @property
    def result_list_url(self) -> str:
        """Frontend/API URL of the source's result list ("" if no route)."""
        return f"/api/v1/{self.api_route}/" if self.api_route else ""

    @property
    def result_detail_url(self) -> str:
        """Frontend/API URL template of a single result ("" if no route)."""
        return f"/api/v1/{self.api_route}/{{id}}/" if self.api_route else ""

    @property
    def upload_url(self) -> str:
        """Frontend/API URL for manual file upload.

        Empty when uploads are not supported or no upload route can be
        derived. An explicit ``upload_route`` is honoured even when
        ``api_route`` is not set.
        """
        route = self.upload_api_route
        if not self.supports_file_upload or not route:
            return ""
        return f"/api/v1/{route}/"

    @property
    def upload_api_route(self) -> str:
        """Upload route without the ``/api/v1`` prefix.

        Prefers the explicit ``upload_route``; otherwise falls back to
        ``<api_route>/upload``. Returns "" when neither route is configured,
        instead of the meaningless "/upload".
        """
        if self.upload_route:
            return self.upload_route
        return f"{self.api_route}/upload" if self.api_route else ""
key="industrial", + source=ParserLoadLog.Source.INDUSTRIAL, + title="Сертификаты промышленного производства", + agency="Минпромторг России", + data_scope="Заключения о подтверждении производства промышленной продукции", + task_name="apps.parsers.tasks.parse_industrial_production", + is_existing=True, + upstream_url="https://minpromtorg.gov.ru/api/kss-document-preview", + parser_strategy="minpromtorg_excel_discovery", + source_notes=( + "Параметры discovery: types[]=668d4f2a-966a-4b65-9fb9-2f1ad19a3d1f, " + "fragment=Заключения о подтверждении производства промышленной продукции." + ), + api_route="minpromtorg/certificates", + ), + "manufactures": ParserSourceDescriptor( + key="manufactures", + source=ParserLoadLog.Source.MANUFACTURES, + title="Реестр производителей", + agency="Минпромторг России", + data_scope="Производители промышленной продукции", + task_name="apps.parsers.tasks.parse_manufactures", + is_existing=True, + upstream_url="https://minpromtorg.gov.ru/api/kss-document-preview", + parser_strategy="minpromtorg_excel_discovery", + source_notes=( + "Параметры discovery: types[]=668d4f2a-966a-4b65-9fb9-2f1ad19a3d1f, " + "fragment=Производители промышленной продукции." 
+ ), + api_route="minpromtorg/manufacturers", + ), + "inspections": ParserSourceDescriptor( + key="inspections", + source=ParserLoadLog.Source.INSPECTIONS, + title="Проверки Генпрокуратуры", + agency="Генпрокуратура РФ", + data_scope="Плановые и внеплановые проверки", + task_name="apps.parsers.tasks.parse_inspections", + is_existing=True, + upstream_url="https://proverki.gov.ru/portal/public-open-data", + access_method="open_data_portal", + parser_strategy="proverki_open_data", + api_route="proverki", + ), + "sync_inspections": ParserSourceDescriptor( + key="sync_inspections", + source=ParserLoadLog.Source.INSPECTIONS, + title="Синхронизация проверок", + agency="Генпрокуратура РФ", + data_scope="Помесячная синхронизация проверок", + task_name="apps.parsers.tasks.sync_inspections", + is_existing=True, + upstream_url="https://proverki.gov.ru/portal/public-open-data/check", + access_method="open_data_portal", + parser_strategy="proverki_open_data_sync", + api_route="proverki", + ), + "mpt_products": ParserSourceDescriptor( + key="mpt_products", + source=ParserLoadLog.Source.MPT_PRODUCTS, + title="Продукция Минпромторга", + agency="Минпромторг России", + data_scope="Продукция российских производителей", + task_name="apps.parsers.tasks.parse_mpt_products", + mode="official_api", + status="implemented", + upstream_url="https://gisp.gov.ru/pp719v2/pub/prod/", + access_method="official_registry_api", + parser_strategy="gisp_product_registry", + source_notes="Реестр промышленной продукции РФ опубликован в ГИСП.", + api_route="minpromtorg/products", + ), + "procurements_44fz": ParserSourceDescriptor( + key="procurements_44fz", + source=ParserLoadLog.Source.PROCUREMENTS_44FZ, + title="Закупки 44-ФЗ", + agency="Федеральное казначейство", + data_scope="Извещения и закупочные процедуры 44-ФЗ", + task_name="apps.parsers.tasks.parse_procurements_44fz", + mode="official_api", + status="implemented", + upstream_url="https://zakupki.gov.ru/epz/order/extendedsearch/results.html", + 
access_method="eis_official_api", + parser_strategy="eis_44fz_search", + source_notes="Официальный поиск ЕИС; XML-выгрузки ЕИС требуют отдельного discovery по реестрам.", + api_route="zakupki", + ), + "procurements_223fz": ParserSourceDescriptor( + key="procurements_223fz", + source=ParserLoadLog.Source.PROCUREMENTS_223FZ, + title="Закупки 223-ФЗ", + agency="Федеральное казначейство", + data_scope="Извещения и закупочные процедуры 223-ФЗ", + task_name="apps.parsers.tasks.parse_procurements_223fz", + mode="official_api", + status="implemented", + upstream_url="https://zakupki.gov.ru/epz/orderclause/search/results.html", + access_method="eis_official_api", + parser_strategy="eis_223fz_search", + source_notes="Официальный реестр положений о закупках 223-ФЗ в ЕИС.", + api_route="zakupki", + ), + "contracts": ParserSourceDescriptor( + key="contracts", + source=ParserLoadLog.Source.CONTRACTS, + title="Контракты ЕИС", + agency="Федеральное казначейство", + data_scope="Государственные и корпоративные контракты", + task_name="apps.parsers.tasks.parse_contracts", + mode="official_api", + status="implemented", + upstream_url="https://zakupki.gov.ru/epz/contract/search/results.html", + access_method="eis_official_api", + parser_strategy="eis_contract_registry", + api_route="zakupki", + ), + "unfair_suppliers": ParserSourceDescriptor( + key="unfair_suppliers", + source=ParserLoadLog.Source.UNFAIR_SUPPLIERS, + title="Недобросовестные поставщики", + agency="ФАС России / ЕИС Закупки", + data_scope="Реестр недобросовестных поставщиков", + task_name="apps.parsers.tasks.parse_unfair_suppliers", + mode="official_api", + status="implemented", + upstream_url="https://zakupki.gov.ru/epz/dishonestsupplier/search/results.html", + access_method="eis_official_api", + parser_strategy="eis_unfair_supplier_registry", + api_route="fas/unfair-suppliers", + ), + "fas_goz": ParserSourceDescriptor( + key="fas_goz", + source=ParserLoadLog.Source.FAS_GOZ, + title="Уклонение от ГОЗ", + agency="ФАС 
России", + data_scope="Юрлица, привлеченные за отказ или уклонение от ГОЗ", + task_name="apps.parsers.tasks.parse_fas_goz_evasion", + mode="official_api", + status="implemented", + upstream_url="https://fas.gov.ru/pages/activity/reestr-uridicheskih-lic", + access_method="official_registry_api", + parser_strategy="fas_goz_registry", + api_route="fas/goz", + ), + "fns_financial": ParserSourceDescriptor( + key="fns_financial", + source=ParserLoadLog.Source.FNS_FINANCIAL, + title="Финансово-экономические показатели", + agency="ФНС России", + data_scope="Финансово-экономическая выгрузка", + task_name="apps.parsers.tasks.parse_fns_financial_indicators", + mode="official_api", + status="implemented", + owner="Сергей", + upstream_url=( + "https://bo.nalog.gov.ru/advanced-search/organizations/search" + "?query=%D0%9E%D0%9E%D0%9E&page=0&size=100" + ), + access_method="public_web_api", + parser_strategy="fns_bfo_search_and_download", + source_notes=( + "ГИР БО: поиск организаций и скачивание отчетности с ЭП ФНС. " + "Ручная загрузка разрешена для финансовых выгрузок от Сергея." 
+ ), + supports_file_upload=True, + api_route="fns/reports", + upload_route="fns/upload", + ), + "arbitration": ParserSourceDescriptor( + key="arbitration", + source=ParserLoadLog.Source.ARBITRATION, + title="Арбитражные дела", + agency="Верховный суд РФ / КАД Арбитр", + data_scope="Арбитражные дела по организациям", + task_name="apps.parsers.tasks.parse_arbitration_cases", + mode="official_api", + status="implemented", + upstream_url="https://kad.arbitr.ru/", + access_method="official_search_api", + parser_strategy="kad_arbitr_search", + api_route="arbitration/cases", + ), + "fedresurs_bankruptcy": ParserSourceDescriptor( + key="fedresurs_bankruptcy", + source=ParserLoadLog.Source.FEDRESURS_BANKRUPTCY, + title="Банкротства Федресурс", + agency="Федресурс", + data_scope="Сведения о процедурах банкротства", + task_name="apps.parsers.tasks.parse_fedresurs_bankruptcy", + mode="official_api", + status="implemented", + owner="Сергей", + upstream_url="https://bankrot.fedresurs.ru/", + access_method="official_registry_api", + parser_strategy="fedresurs_bankruptcy_search", + source_notes=( + "Официальный ЕФРСБ; может отдавать anti-bot challenge worker'ам. " + "Ручная загрузка разрешена только для выгрузок, переданных Сергеем." 
def get_source_by_model_source(source: str) -> ParserSourceDescriptor | None:
    """Look up a parser descriptor by its ``ParserLoadLog.Source`` value.

    When several descriptors share the same source value, the one registered
    first in ``PARSER_SOURCES`` is returned. Returns ``None`` if nothing
    matches.
    """
    matching = (
        candidate
        for candidate in PARSER_SOURCES.values()
        if candidate.source == source
    )
    return next(matching, None)
""" import logging +from collections.abc import Callable from datetime import datetime from apps.core.services import BackgroundJobService +from apps.parsers.clients.common import GenericParserItem, StructuredDataClient from apps.parsers.clients.minpromtorg import ( IndustrialProductionClient, ManufacturesClient, ) from apps.parsers.clients.proverki import ProverkiClient +from apps.parsers.clients.trudvsem import TrudvsemClient from apps.parsers.models import ParserLoadLog from apps.parsers.services import ( + GenericParserRecordService, IndustrialCertificateService, InspectionService, ManufacturerService, ParserLoadLogService, ProxyService, ) +from apps.parsers.source_registry import PARSER_SOURCES, get_source_by_model_source from celery import shared_task +from django.core.files.storage import default_storage logger = logging.getLogger(__name__) @@ -31,20 +37,208 @@ DEFAULT_START_YEAR = 2025 DEFAULT_START_MONTH = 1 +def _run_generic_parser( + celery_task, + *, + source: str, + task_name: str, + fetch_records: Callable[[], list[GenericParserItem]], + user_id: int | None = None, + meta: dict | None = None, +) -> dict: + """ + Общий запуск нового парсера. + + Создаёт BackgroundJob и ParserLoadLog, вызывает клиент, сохраняет записи + через GenericParserRecordService. Сами Celery-задачи остаются тонкими + обёртками с настройкой источника. 
+ """ + load_log = ParserLoadLogService.create_next_load_log( + source=source, + status="in_progress", + ) + batch_id = load_log.batch_id + task_id = celery_task.request.id + job_meta = {"source": source, "batch_id": batch_id, **(meta or {})} + + logger.info( + "Starting generic parser (task_id=%s, source=%s, batch_id=%d)", + task_id, + source, + batch_id, + ) + + job = BackgroundJobService.create_job( + task_id=task_id, + task_name=task_name, + user_id=user_id, + meta=job_meta, + ) + job.mark_started() + job.update_progress(0, "Инициализация парсера...") + + try: + job.update_progress(20, "Загрузка данных источника...") + records = fetch_records() + + job.update_progress(80, f"Сохранение {len(records)} записей...") + saved_count = GenericParserRecordService.save_records( + records, + batch_id=batch_id, + source=source, + ) + + ParserLoadLogService.update( + load_log, + status="success", + records_count=saved_count, + ) + result = {"batch_id": batch_id, "saved": saved_count, "status": "success"} + job.complete(result=result) + + logger.info( + "Generic parser completed (source=%s, batch_id=%d, saved=%d)", + source, + batch_id, + saved_count, + ) + return result + + except Exception as e: + logger.error( + "Generic parser failed (source=%s, batch_id=%d): %s", + source, + batch_id, + e, + exc_info=True, + ) + ParserLoadLogService.mark_failed(load_log, str(e)) + job.fail(error=str(e)) + return { + "batch_id": batch_id, + "saved": 0, + "status": "failed", + "error": str(e), + } + + +def _parse_structured_file_source( + celery_task, + *, + source: str, + task_name: str, + file_url: str | None = None, + proxies: list[str] | None = None, + user_id: int | None = None, +) -> dict: + """Запустить generic-парсер по официальному upstream URL источника.""" + if proxies is None: + proxies = ProxyService.get_active_proxies_or_none() + source_descriptor = get_source_by_model_source(source) + source_url = file_url or ( + source_descriptor.upstream_url if source_descriptor else "" + 
@shared_task(bind=True)
def import_parser_upload(
    self,
    *,
    source_key: str,
    storage_path: str,
    file_name: str = "",
    user_id: int | None = None,
) -> dict:
    """Import a user-uploaded registry file through Celery.

    The uploaded file is always removed from storage when the task finishes,
    including when the source is rejected before parsing starts, so rejected
    uploads do not pile up in storage.

    Args:
        source_key: Key of the descriptor in ``PARSER_SOURCES``.
        storage_path: Path of the uploaded file in ``default_storage``.
        file_name: Original file name; falls back to ``storage_path``.
        user_id: ID of the user who started the import.

    Returns:
        The result dict produced by ``_run_generic_parser``.

    Raises:
        ValueError: The source is unknown or does not allow manual upload.
    """
    try:
        # Validation lives inside the try so that the finally block also
        # cleans up files uploaded for an unknown or unsupported source.
        source_descriptor = PARSER_SOURCES.get(source_key)
        if source_descriptor is None:
            raise ValueError(f"Unknown parser source: {source_key}")
        if not source_descriptor.supports_file_upload:
            raise ValueError(
                f"Manual file upload is not supported for source={source_key}"
            )

        def fetch_records() -> list[GenericParserItem]:
            with default_storage.open(storage_path, "rb") as uploaded_file:
                content = uploaded_file.read()
            with StructuredDataClient(source=source_descriptor.source) as client:
                return client.fetch_records(
                    content=content,
                    file_name=file_name or storage_path,
                )

        return _run_generic_parser(
            self,
            source=source_descriptor.source,
            task_name="apps.parsers.tasks.import_parser_upload",
            fetch_records=fetch_records,
            user_id=user_id,
            meta={
                "source_key": source_key,
                "upload_path": storage_path,
                "file_name": file_name,
                "parser_strategy": source_descriptor.parser_strategy,
            },
        )
    finally:
        # Best-effort cleanup: a failed delete must not mask the task result.
        try:
            default_storage.delete(storage_path)
        except Exception:
            logger.warning(
                "Failed to delete uploaded parser file: %s",
                storage_path,
                exc_info=True,
            )
Returns: Результат: batch_id, saved, status """ source = ParserLoadLog.Source.MANUFACTURES - batch_id = ParserLoadLogService.get_next_batch_id(source) + load_log = ParserLoadLogService.create_next_load_log( + source=source, + status="in_progress", + ) + batch_id = load_log.batch_id task_id = self.request.id # Если прокси не переданы, берём из БД @@ -153,18 +350,12 @@ def parse_manufactures(self, proxies: list[str] | None = None) -> dict: job = BackgroundJobService.create_job( task_id=task_id, task_name="apps.parsers.tasks.parse_manufactures", + user_id=user_id, meta={"source": source, "batch_id": batch_id}, ) job.mark_started() job.update_progress(0, "Инициализация парсера...") - # Создаём запись лога - load_log = ParserLoadLogService.create_load_log( - source=source, - batch_id=batch_id, - status="in_progress", - ) - try: # Парсинг данных job.update_progress(10, "Загрузка данных с API Минпромторга...") @@ -214,13 +405,17 @@ def parse_manufactures(self, proxies: list[str] | None = None) -> dict: @shared_task -def parse_all_minpromtorg(proxies: list[str] | None = None) -> dict: +def parse_all_minpromtorg( + proxies: list[str] | None = None, + user_id: int | None = None, +) -> dict: """ Запустить все парсеры Минпромторга. Args: proxies: Список прокси-серверов (опционально). Если не передан, каждая задача возьмёт прокси из БД. + user_id: ID пользователя, запустившего группу задач через API. 
Returns: Результаты всех парсеров @@ -228,8 +423,14 @@ def parse_all_minpromtorg(proxies: list[str] | None = None) -> dict: logger.info("Starting all Minpromtorg parsers") results = { - "industrial": parse_industrial_production.delay(proxies=proxies).id, - "manufactures": parse_manufactures.delay(proxies=proxies).id, + "industrial": parse_industrial_production.delay( + proxies=proxies, + user_id=user_id, + ).id, + "manufactures": parse_manufactures.delay( + proxies=proxies, + user_id=user_id, + ).id, } return results @@ -243,6 +444,7 @@ def parse_inspections( month: int | None = None, file_url: str | None = None, proxies: list[str] | None = None, + user_id: int | None = None, ) -> dict: """ Задача парсинга данных о проверках с proverki.gov.ru. @@ -253,12 +455,17 @@ def parse_inspections( file_url: Прямая ссылка на файл данных (опционально) proxies: Список прокси-серверов (опционально). Если не передан, берётся из БД. + user_id: ID пользователя, запустившего задачу через API. Returns: Результат: batch_id, saved, status """ source = ParserLoadLog.Source.INSPECTIONS - batch_id = ParserLoadLogService.get_next_batch_id(source) + load_log = ParserLoadLogService.create_next_load_log( + source=source, + status="in_progress", + ) + batch_id = load_log.batch_id task_id = self.request.id # Если прокси не переданы, берём из БД @@ -278,18 +485,12 @@ def parse_inspections( job = BackgroundJobService.create_job( task_id=task_id, task_name="apps.parsers.tasks.parse_inspections", + user_id=user_id, meta={"source": source, "batch_id": batch_id, "year": year, "month": month}, ) job.mark_started() job.update_progress(0, "Инициализация парсера...") - # Создаём запись лога - load_log = ParserLoadLogService.create_load_log( - source=source, - batch_id=batch_id, - status="in_progress", - ) - def progress_callback(percent: int, message: str) -> None: """Callback для обновления прогресса.""" job.update_progress(percent, message) @@ -348,28 +549,299 @@ def parse_inspections( @shared_task 
def parse_all_sources(
    proxies: list[str] | None = None,
    user_id: int | None = None,
    file_urls: dict[str, str] | None = None,
) -> dict:
    """
    Enqueue every parser task for every known source.

    Args:
        proxies: Optional proxy list forwarded to each task unchanged.
        user_id: ID of the user who started the group of tasks via the API.
        file_urls: Optional debug override URLs keyed by source key; only the
            file-based upstream tasks receive them.

    Returns:
        Mapping of source key to the Celery task ID that was enqueued.
    """
    logger.info("Starting all parsers from all sources")

    overrides = file_urls if file_urls else {}
    shared_kwargs = {"proxies": proxies, "user_id": user_id}

    # Tasks that do not accept a ``file_url`` override.
    plain_tasks = (
        ("industrial", parse_industrial_production),
        ("manufactures", parse_manufactures),
        ("inspections", parse_inspections),
        ("trudvsem", parse_trudvsem_vacancies),
    )
    task_ids = {key: task.delay(**shared_kwargs).id for key, task in plain_tasks}

    # File-based upstream tasks: each may get its own override URL.
    file_based_tasks = (
        ("mpt_products", parse_mpt_products),
        ("procurements_44fz", parse_procurements_44fz),
        ("procurements_223fz", parse_procurements_223fz),
        ("contracts", parse_contracts),
        ("unfair_suppliers", parse_unfair_suppliers),
        ("fas_goz", parse_fas_goz_evasion),
        ("fns_financial", parse_fns_financial_indicators),
        ("arbitration", parse_arbitration_cases),
        ("fedresurs_bankruptcy", parse_fedresurs_bankruptcy),
        ("fstec", parse_fstec_registers),
    )
    for key, task in file_based_tasks:
        async_result = task.delay(file_url=overrides.get(key), **shared_kwargs)
        task_ids[key] = async_result.id

    return task_ids
@shared_task(bind=True)
def parse_procurements_44fz(
    self,
    *,
    file_url: str | None = None,
    proxies: list[str] | None = None,
    user_id: int | None = None,
) -> dict:
    """Parse 44-FZ procurements from an EIS export.

    Thin Celery entry point: all work is delegated to
    ``_parse_structured_file_source`` with the ``PROCUREMENTS_44FZ`` source.

    Args:
        file_url: Optional override URL, forwarded unchanged.
        proxies: Optional proxy list, forwarded unchanged.
        user_id: ID of the user who started the task, forwarded unchanged.

    Returns:
        Whatever ``_parse_structured_file_source`` returns.
    """
    return _parse_structured_file_source(
        self,
        source=ParserLoadLog.Source.PROCUREMENTS_44FZ,
        task_name="apps.parsers.tasks.parse_procurements_44fz",
        file_url=file_url,
        proxies=proxies,
        user_id=user_id,
    )
@shared_task(bind=True)
def parse_fas_goz_evasion(
    self,
    *,
    file_url: str | None = None,
    proxies: list[str] | None = None,
    user_id: int | None = None,
) -> dict:
    """Parse the FAS register of GOZ contract evasion.

    Thin Celery entry point: all work is delegated to
    ``_parse_structured_file_source`` with the ``FAS_GOZ`` source.

    Args:
        file_url: Optional override URL, forwarded unchanged.
        proxies: Optional proxy list, forwarded unchanged.
        user_id: ID of the user who started the task, forwarded unchanged.

    Returns:
        Whatever ``_parse_structured_file_source`` returns.
    """
    return _parse_structured_file_source(
        self,
        source=ParserLoadLog.Source.FAS_GOZ,
        task_name="apps.parsers.tasks.parse_fas_goz_evasion",
        file_url=file_url,
        proxies=proxies,
        user_id=user_id,
    )
@shared_task(bind=True)
def parse_trudvsem_vacancies(
    self,
    *,
    limit: int = 100,
    offset: int = 0,
    region_code: str | None = None,
    company_inn: str | None = None,
    text: str | None = None,
    proxies: list[str] | None = None,
    user_id: int | None = None,
) -> dict:
    """Parse vacancies from the Rabota Rossii portal through its open API.

    The query parameters are passed to ``TrudvsemClient.fetch_vacancies`` and
    are also stored as the job metadata handed to ``_run_generic_parser``.
    """
    # Fall back to the proxies configured in the DB when none were given.
    active_proxies = (
        ProxyService.get_active_proxies_or_none() if proxies is None else proxies
    )

    query = {
        "limit": limit,
        "offset": offset,
        "region_code": region_code,
        "company_inn": company_inn,
        "text": text,
    }

    def fetch_records() -> list[GenericParserItem]:
        with TrudvsemClient(proxies=active_proxies) as client:
            return client.fetch_vacancies(**query)

    return _run_generic_parser(
        self,
        source=ParserLoadLog.Source.TRUDVSEM,
        task_name="apps.parsers.tasks.parse_trudvsem_vacancies",
        fetch_records=fetch_records,
        user_id=user_id,
        meta=dict(query),
    )
@@ -395,12 +873,22 @@ def sync_inspections( # noqa: C901 Args: proxies: Список прокси-серверов (опционально) + user_id: ID пользователя, запустившего задачу через API. + max_months_per_law: Ограничить количество месяцев на каждый тип проверок. + start_year: Принудительный год старта для ручного ограниченного sync. + start_month: Принудительный месяц старта для ручного ограниченного sync. + include_fz294: Загружать проверки ФЗ-294. + include_fz248: Загружать проверки ФЗ-248. Returns: Результат синхронизации """ source = ParserLoadLog.Source.INSPECTIONS - batch_id = ParserLoadLogService.get_next_batch_id(source) + load_log = ParserLoadLogService.create_next_load_log( + source=source, + status="in_progress", + ) + batch_id = load_log.batch_id task_id = self.request.id # Если прокси не переданы, берём из БД @@ -415,27 +903,29 @@ def sync_inspections( # noqa: C901 job = BackgroundJobService.create_job( task_id=task_id, task_name="apps.parsers.tasks.sync_inspections", + user_id=user_id, meta={"source": source, "batch_id": batch_id}, ) job.mark_started() job.update_progress(0, "Инициализация синхронизации...") - # Создаём запись лога - load_log = ParserLoadLogService.create_load_log( - source=source, - batch_id=batch_id, - status="in_progress", - ) - current_year = datetime.now().year current_month = datetime.now().month total_saved = 0 results = {"fz294": [], "fz248": []} + period_errors = [] + explicit_start_year = start_year + explicit_start_month = start_month + law_flags = [] + if include_fz294: + law_flags.append(False) + if include_fz248: + law_flags.append(True) try: with ProverkiClient(proxies=proxies) as client: # Обрабатываем оба типа проверок - for is_fz248 in [False, True]: + for is_fz248 in law_flags: fz_key = "fz248" if is_fz248 else "fz294" fz_name = "ФЗ-248" if is_fz248 else "ФЗ-294" @@ -444,34 +934,59 @@ def sync_inspections( # noqa: C901 is_federal_law_248=is_fz248 ) - if last_year and last_month: + if explicit_start_year and explicit_start_month: + 
law_start_year = explicit_start_year + law_start_month = explicit_start_month + logger.info( + "%s: using explicit start period %d/%d", + fz_name, + law_start_year, + law_start_month, + ) + elif last_year and last_month: # Начинаем со следующего месяца после последнего загруженного - start_year, start_month = _get_next_month(last_year, last_month) + law_start_year, law_start_month = _get_next_month( + last_year, + last_month, + ) logger.info( "%s: continuing from %d/%d (last loaded: %d/%d)", fz_name, - start_year, - start_month, + law_start_year, + law_start_month, last_year, last_month, ) else: # Начинаем с дефолтной даты - start_year, start_month = DEFAULT_START_YEAR, DEFAULT_START_MONTH + law_start_year = DEFAULT_START_YEAR + law_start_month = DEFAULT_START_MONTH logger.info( "%s: no data in DB, starting from %d/%d", fz_name, - start_year, - start_month, + law_start_year, + law_start_month, ) # Загружаем месяц за месяцем - year, month = start_year, start_month + year, month = law_start_year, law_start_month empty_months_count = 0 + attempted_months = 0 while year < current_year or ( year == current_year and month <= current_month ): + if ( + max_months_per_law is not None + and attempted_months >= max_months_per_law + ): + logger.info( + "%s: stopping after max_months_per_law=%d", + fz_name, + max_months_per_law, + ) + break + # Прекращаем если 2 месяца подряд нет данных if empty_months_count >= 2: logger.info( @@ -530,6 +1045,13 @@ def sync_inspections( # noqa: C901 ) except Exception as e: + error_info = { + "law": fz_key, + "year": year, + "month": month, + "error": str(e), + } + period_errors.append(error_info) logger.warning( "%s %d/%d: error - %s", fz_name, @@ -538,10 +1060,37 @@ def sync_inspections( # noqa: C901 str(e), ) empty_months_count += 1 + attempted_months += 1 + break # Переходим к следующему месяцу + attempted_months += 1 year, month = _get_next_month(year, month) + if period_errors: + error_message = ( + "Inspection sync finished with " 
f"{len(period_errors)} period errors" + ) + ParserLoadLogService.update( + load_log, + status="failed", + records_count=total_saved, + error_message=error_message, + ) + job.fail(error=error_message) + logger.warning( + "Inspections sync failed with period errors (total_saved=%d, errors=%d)", + total_saved, + len(period_errors), + ) + return { + "batch_id": batch_id, + "total_saved": total_saved, + "status": "failed", + "results": results, + "errors": period_errors, + } + # Обновляем лог ParserLoadLogService.update( load_log, diff --git a/src/apps/parsers/urls.py b/src/apps/parsers/urls.py index 051ad2a..8cc0c45 100644 --- a/src/apps/parsers/urls.py +++ b/src/apps/parsers/urls.py @@ -1,5 +1,31 @@ +from apps.parsers import views +from django.urls import path + app_name = "parsers" urlpatterns = [ - # URL-маршруты будут добавлены по мере разработки + path("sources/", views.ParserSourceListView.as_view(), name="source-list"), + path("dashboard/", views.ParserDashboardDataView.as_view(), name="dashboard-data"), + path("run//", views.ParserRunView.as_view(), name="run-parser"), + path( + "upload//", + views.ParserUploadView.as_view(), + name="upload-parser-data", + ), + path( + "schedules/", + views.ParserScheduleListCreateView.as_view(), + name="schedule-list", + ), + path( + "schedules//", + views.ParserScheduleDetailView.as_view(), + name="schedule-detail", + ), + path("load-logs/", views.ParserLoadLogListView.as_view(), name="load-log-list"), + path( + "records/", + views.GenericParserRecordListView.as_view(), + name="generic-record-list", + ), ] diff --git a/src/apps/parsers/views.py b/src/apps/parsers/views.py index a890579..dddddf9 100644 --- a/src/apps/parsers/views.py +++ b/src/apps/parsers/views.py @@ -1,7 +1,1338 @@ -""" -Views для приложения парсеров. +"""Views для приложения парсеров.""" -TODO: Добавить views по мере необходимости. 
-""" +import json +import uuid -# Views будут добавлены по мере разработки конкретных парсеров +from apps.core.response import api_error_response, api_response +from apps.core.serializers import BackgroundJobListSerializer +from apps.core.services import BackgroundJobService +from apps.parsers import tasks +from apps.parsers.models import ( + GenericParserRecord, + IndustrialCertificateRecord, + InspectionRecord, + ManufacturerRecord, + ParserLoadLog, +) +from apps.parsers.serializers import ( + GenericParserRecordSerializer, + ParserListQuerySerializer, + ParserLoadLogSerializer, + ParserResultQuerySerializer, + ParserResultRecordSerializer, + ParserRunRequestSerializer, + ParserRunResponseSerializer, + ParserScheduleRequestSerializer, + ParserScheduleSerializer, + ParserSourceSerializer, + ParserUploadRequestSerializer, +) +from apps.parsers.source_registry import PARSER_SOURCES +from django.conf import settings +from django.core.files.storage import default_storage +from django.core.paginator import Paginator +from django.db.models import Count, Q +from django.utils.text import get_valid_filename +from django.views.generic import TemplateView +from django_celery_beat.models import CrontabSchedule, IntervalSchedule, PeriodicTask +from drf_yasg import openapi +from drf_yasg.utils import swagger_auto_schema +from rest_framework import status +from rest_framework.parsers import FormParser, MultiPartParser +from rest_framework.permissions import IsAuthenticated +from rest_framework.request import Request +from rest_framework.views import APIView + +TASKS_BY_NAME = { + "apps.parsers.tasks.parse_industrial_production": tasks.parse_industrial_production, + "apps.parsers.tasks.parse_manufactures": tasks.parse_manufactures, + "apps.parsers.tasks.parse_inspections": tasks.parse_inspections, + "apps.parsers.tasks.sync_inspections": tasks.sync_inspections, + "apps.parsers.tasks.parse_mpt_products": tasks.parse_mpt_products, + "apps.parsers.tasks.parse_procurements_44fz": 
tasks.parse_procurements_44fz, + "apps.parsers.tasks.parse_procurements_223fz": tasks.parse_procurements_223fz, + "apps.parsers.tasks.parse_contracts": tasks.parse_contracts, + "apps.parsers.tasks.parse_unfair_suppliers": tasks.parse_unfair_suppliers, + "apps.parsers.tasks.parse_fas_goz_evasion": tasks.parse_fas_goz_evasion, + "apps.parsers.tasks.parse_fns_financial_indicators": ( + tasks.parse_fns_financial_indicators + ), + "apps.parsers.tasks.parse_arbitration_cases": tasks.parse_arbitration_cases, + "apps.parsers.tasks.parse_fedresurs_bankruptcy": tasks.parse_fedresurs_bankruptcy, + "apps.parsers.tasks.parse_fstec_registers": tasks.parse_fstec_registers, + "apps.parsers.tasks.parse_trudvsem_vacancies": tasks.parse_trudvsem_vacancies, +} + +EXISTING_TASK_PARAMS = { + "industrial": {"proxies", "user_id"}, + "manufactures": {"proxies", "user_id"}, + "inspections": {"year", "month", "file_url", "proxies", "user_id"}, + "sync_inspections": { + "proxies", + "user_id", + "max_months_per_law", + "start_year", + "start_month", + "include_fz294", + "include_fz248", + }, +} + +TRUDVSEM_PARAMS = { + "limit", + "offset", + "region_code", + "company_inn", + "text", + "proxies", + "user_id", +} + +GENERIC_FILE_PARAMS = {"file_url", "proxies", "user_id"} +PARSER_TASK_NAMES = set(TASKS_BY_NAME) +NATIVE_RECORD_MODELS = { + ParserLoadLog.Source.INDUSTRIAL: IndustrialCertificateRecord, + ParserLoadLog.Source.MANUFACTURES: ManufacturerRecord, + ParserLoadLog.Source.INSPECTIONS: InspectionRecord, +} +PARSERS_TAG = "Parser Management" +MINPROMTORG_TAG = "Minpromtorg" +PROVERKI_TAG = "Prosecutor General Inspections" +ZAKUPKI_TAG = "EIS Zakupki" +FAS_TAG = "FAS" +FNS_TAG = "FNS" +ARBITRATION_TAG = "Arbitration" +FEDRESURS_TAG = "Fedresurs" +FSTEC_TAG = "FSTEC" +TRUDVSEM_TAG = "Trudvsem" +SOURCE_RESULT_TAGS = { + "industrial": MINPROMTORG_TAG, + "manufactures": MINPROMTORG_TAG, + "mpt_products": MINPROMTORG_TAG, + "inspections": PROVERKI_TAG, + "sync_inspections": PROVERKI_TAG, + 
"procurements_44fz": ZAKUPKI_TAG, + "procurements_223fz": ZAKUPKI_TAG, + "contracts": ZAKUPKI_TAG, + "unfair_suppliers": FAS_TAG, + "fas_goz": FAS_TAG, + "fns_financial": FNS_TAG, + "arbitration": ARBITRATION_TAG, + "fedresurs_bankruptcy": FEDRESURS_TAG, + "fstec": FSTEC_TAG, + "trudvsem": TRUDVSEM_TAG, +} +LIMIT_PARAM = openapi.Parameter( + "limit", + openapi.IN_QUERY, + description="Максимальное количество записей", + type=openapi.TYPE_INTEGER, +) +SOURCE_PARAM = openapi.Parameter( + "source", + openapi.IN_QUERY, + description="Источник данных: industrial, manufactures, inspections, trudvsem и т.д.", + type=openapi.TYPE_STRING, +) +STATUS_PARAM = openapi.Parameter( + "status", + openapi.IN_QUERY, + description="Фильтр по статусу", + type=openapi.TYPE_STRING, +) +RECORD_ID_PARAM = openapi.Parameter( + "id", + openapi.IN_QUERY, + description="ID записи для detail-view dashboard", + type=openapi.TYPE_INTEGER, +) +INN_PARAM = openapi.Parameter( + "inn", + openapi.IN_QUERY, + description="Фильтр по ИНН", + type=openapi.TYPE_STRING, +) +OGRN_PARAM = openapi.Parameter( + "ogrn", + openapi.IN_QUERY, + description="Фильтр по ОГРН", + type=openapi.TYPE_STRING, +) +UPLOAD_FILE_PARAM = openapi.Parameter( + "file", + openapi.IN_FORM, + description="JSON, CSV, XML, HTML, XLSX/XLSM или ZIP с файлами реестра", + type=openapi.TYPE_FILE, + required=True, +) +PAGE_PARAM = openapi.Parameter( + "page", + openapi.IN_QUERY, + description="Номер страницы", + type=openapi.TYPE_INTEGER, +) +PAGE_SIZE_PARAM = openapi.Parameter( + "page_size", + openapi.IN_QUERY, + description="Размер страницы, максимум 100", + type=openapi.TYPE_INTEGER, +) +EXTERNAL_ID_PARAM = openapi.Parameter( + "external_id", + openapi.IN_QUERY, + description="Стабильный ID записи во внешнем источнике", + type=openapi.TYPE_STRING, +) +LOAD_BATCH_PARAM = openapi.Parameter( + "load_batch", + openapi.IN_QUERY, + description="Batch загрузки", + type=openapi.TYPE_INTEGER, +) +SEARCH_PARAM = openapi.Parameter( + "search", + 
def build_task_kwargs(source_key: str, params: dict, user_id: int) -> dict:
    """Keep only the kwargs the task accepts and pin the run owner.

    Any ``user_id`` supplied in ``params`` is overwritten with the ``user_id``
    argument whenever the task accepts that parameter.
    """
    accepted = _allowed_task_params(source_key)

    filtered = {}
    for name in params:
        if name in accepted:
            filtered[name] = params[name]

    if "user_id" in accepted:
        filtered["user_id"] = user_id
    return filtered
def _native_record_to_result(
    source: str,
    record,
    *,
    include_payload: bool = True,
) -> dict:
    """Map a record of a legacy native model onto the shared result DTO.

    The source decides which model attributes feed the generic
    ``external_id`` / ``organisation_name`` / ``title`` / ``record_date`` /
    ``status`` / ``url`` slots; any source other than INDUSTRIAL and
    MANUFACTURES is treated as an inspection record.
    """
    if source == ParserLoadLog.Source.INDUSTRIAL:
        external_id, organisation_name, title, record_date, status_value, url = (
            record.certificate_number,
            record.organisation_name,
            record.certificate_number,
            record.issue_date,
            "",
            record.certificate_file_url,
        )
    elif source == ParserLoadLog.Source.MANUFACTURES:
        external_id, organisation_name, title, record_date, status_value, url = (
            record.inn,
            record.full_legal_name,
            record.full_legal_name,
            "",
            "",
            "",
        )
    else:
        external_id, organisation_name, title, record_date, status_value, url = (
            record.registration_number,
            record.organisation_name,
            record.control_authority,
            record.start_date,
            record.status,
            "",
        )

    # The full field dump is only built when the caller asks for it.
    payload = {}
    if include_payload:
        payload = _model_payload(record)

    result = {
        "id": record.id,
        "load_batch": record.load_batch,
        "source": source,
        "external_id": external_id,
        "inn": record.inn,
        "ogrn": record.ogrn,
        "organisation_name": organisation_name,
        "title": title,
        "record_date": record_date,
        "amount": None,
        "status": status_value,
        "url": url,
        "payload": payload,
        "created_at": record.created_at,
        "updated_at": record.updated_at,
    }
    return result
"created_at": record.created_at, + "updated_at": record.updated_at, + } + + +def _source_key_by_task_name(task_name: str) -> str: + for source_key, descriptor in PARSER_SOURCES.items(): + if descriptor.task_name == task_name: + return source_key + return "" + + +def _parse_periodic_task_kwargs(periodic_task: PeriodicTask) -> dict: + try: + return json.loads(periodic_task.kwargs or "{}") + except (TypeError, ValueError): + return {} + + +def _periodic_task_to_dict(periodic_task: PeriodicTask) -> dict: + """Преобразовать PeriodicTask django-celery-beat в API DTO.""" + source_key = _source_key_by_task_name(periodic_task.task) + descriptor = PARSER_SOURCES.get(source_key) + params = _parse_periodic_task_kwargs(periodic_task) + if periodic_task.interval_id: + schedule_type = "interval" + schedule = { + "every": periodic_task.interval.every, + "period": periodic_task.interval.period, + } + elif periodic_task.crontab_id: + schedule_type = "crontab" + schedule = { + "minute": periodic_task.crontab.minute, + "hour": periodic_task.crontab.hour, + "day_of_week": periodic_task.crontab.day_of_week, + "day_of_month": periodic_task.crontab.day_of_month, + "month_of_year": periodic_task.crontab.month_of_year, + } + else: + schedule_type = "unsupported" + schedule = {} + + return { + "id": periodic_task.id, + "name": periodic_task.name, + "source_key": source_key, + "source": descriptor.source if descriptor else "", + "title": descriptor.title if descriptor else periodic_task.task, + "task_name": periodic_task.task, + "enabled": periodic_task.enabled, + "schedule_type": schedule_type, + "schedule": schedule, + "params": params, + "last_run_at": periodic_task.last_run_at, + "total_run_count": periodic_task.total_run_count, + "date_changed": periodic_task.date_changed, + } + + +def _parser_periodic_tasks_for_user(user) -> list[PeriodicTask]: + """Вернуть периодические задачи парсеров с фильтром владения для non-staff.""" + queryset = ( + 
PeriodicTask.objects.filter(task__in=PARSER_TASK_NAMES) + .select_related("interval", "crontab") + .order_by("name") + ) + if user.is_staff: + return list(queryset) + + result = [] + for periodic_task in queryset: + params = _parse_periodic_task_kwargs(periodic_task) + if params.get("user_id") == user.id: + result.append(periodic_task) + return result + + +def _get_parser_periodic_task_for_user(pk: int, user) -> PeriodicTask | None: + for periodic_task in _parser_periodic_tasks_for_user(user): + if periodic_task.pk == pk: + return periodic_task + return None + + +def source_result_swagger_tag(source_key: str) -> str: + """Вернуть Swagger tag для результата источника.""" + return SOURCE_RESULT_TAGS.get(source_key, PARSERS_TAG) + + +def _safe_ordering(ordering: str, field_map: dict[str, str]) -> list[str]: + """Вернуть только разрешенные поля сортировки для модели источника.""" + result = [] + for raw_field in (item.strip() for item in ordering.split(",") if item.strip()): + desc = raw_field.startswith("-") + api_field = raw_field[1:] if desc else raw_field + model_field = field_map.get(api_field) + if not model_field: + continue + result.append(f"-{model_field}" if desc else model_field) + return result + + +def _native_field_map(source: str) -> dict[str, str]: + """Маппинг DTO-фильтров на поля старых native-моделей.""" + common = { + "id": "id", + "load_batch": "load_batch", + "inn": "inn", + "ogrn": "ogrn", + "created_at": "created_at", + "updated_at": "updated_at", + } + if source == ParserLoadLog.Source.INDUSTRIAL: + return { + **common, + "external_id": "certificate_number", + "organisation_name": "organisation_name", + "title": "certificate_number", + "record_date": "issue_date", + } + if source == ParserLoadLog.Source.MANUFACTURES: + return { + **common, + "external_id": "inn", + "organisation_name": "full_legal_name", + "title": "full_legal_name", + } + return { + **common, + "external_id": "registration_number", + "organisation_name": "organisation_name", + 
def _result_sources_for_request(descriptor, params: dict) -> set[str]:
    """Narrow the route's source group by the optional ``source`` query value.

    Args:
        descriptor: Source descriptor whose ``api_route`` defines the group.
        params: Validated query parameters; only ``source`` is read.

    Returns:
        Every source published on the route when no ``source`` is given, a
        single-element set when the requested value belongs to the route, and
        an empty set otherwise.
    """
    allowed = _route_model_sources(descriptor)
    wanted = params.get("source")
    if not wanted:
        return allowed
    if wanted in allowed:
        return {wanted}

    # The value did not match a model source directly; try it as a key of
    # PARSER_SOURCES and accept it only if that entry shares the same route.
    candidate = PARSER_SOURCES.get(wanted)
    same_route = candidate is not None and candidate.api_route == descriptor.api_route
    return {candidate.source} if same_route else set()
def _filter_generic_result_queryset(sources: set[str], params: dict):
    """Build the GenericParserRecord queryset for a result endpoint.

    Args:
        sources: Model source values the records must belong to.
        params: Validated query parameters (exact-match filters, ``search``
            and ``ordering``).

    Returns:
        A queryset filtered by the supplied parameters and ordered by the
        whitelisted ``ordering`` fields, or by ``-created_at`` when none apply.
    """
    exact_match_fields = (
        "id",
        "external_id",
        "inn",
        "ogrn",
        "load_batch",
        "status",
        "record_date",
    )
    # API names and model names coincide for the generic model, so the
    # ordering whitelist is an identity mapping.
    sortable_fields = (
        "id",
        "load_batch",
        "external_id",
        "inn",
        "ogrn",
        "organisation_name",
        "title",
        "record_date",
        "status",
        "created_at",
        "updated_at",
    )

    records = GenericParserRecord.objects.filter(source__in=sources)
    for name in exact_match_fields:
        wanted = params.get(name)
        if wanted not in ("", None):
            records = records.filter(**{name: wanted})

    search = params.get("search")
    if search:
        records = records.filter(_generic_search_q(params["search"]))

    order_fields = _safe_ordering(
        params.get("ordering", ""),
        {name: name for name in sortable_fields},
    )
    if not order_fields:
        order_fields = ["-created_at"]
    return records.order_by(*order_fields)
def _result_record_to_dict(
    source: str,
    record,
    *,
    include_payload: bool,
) -> dict:
    """Convert a result record to the response DTO for its source.

    Sources present in ``NATIVE_RECORD_MODELS`` go through the native
    converter; every other source is treated as a generic record.
    """
    if source not in NATIVE_RECORD_MODELS:
        return _generic_record_to_result(record, include_payload=include_payload)
    return _native_record_to_result(
        source,
        record,
        include_payload=include_payload,
    )
def get(self, request: Request, source_key: str | None = None):
    """Return one page of result records for the source.

    The source comes from the URL kwarg or, when absent, from the view's
    ``source_key`` attribute. Query parameters are validated by
    ``ParserResultQuerySerializer``; an unknown source yields a 404 response.
    """
    effective_key = source_key or self.source_key
    query = ParserResultQuerySerializer(data=request.query_params)
    query.is_valid(raise_exception=True)
    filters = query.validated_data

    descriptor, records = _filter_result_queryset(effective_key, filters)
    if descriptor is None:
        return _source_not_found_response(effective_key)

    page_size = filters["page_size"]
    paginator = Paginator(records, page_size)
    page = paginator.get_page(filters["page"])

    items = []
    for record in page.object_list:
        items.append(
            _result_record_to_dict(
                descriptor.source,
                record,
                include_payload=filters["include_payload"],
            )
        )

    pagination = {
        "page": page.number,
        "page_size": page_size,
        "total_count": paginator.count,
        "total_pages": paginator.num_pages,
        "has_next": page.has_next(),
        "has_previous": page.has_previous(),
    }
    return api_response(items, pagination=pagination)
def get(self, request: Request, pk: int, source_key: str | None = None):
    """Return a single result record of the source by primary key.

    Query parameters narrow the lookup in the same way as the list endpoint;
    a record that exists but does not match them is reported as not found.
    """
    effective_key = source_key or self.source_key
    query = ParserResultQuerySerializer(data=request.query_params)
    query.is_valid(raise_exception=True)
    filters = query.validated_data
    # The primary key is taken from the URL, so an ``id`` query value is dropped.
    filters.pop("id", None)

    descriptor, records = _filter_result_queryset(effective_key, filters)
    if descriptor is None:
        return _source_not_found_response(effective_key)

    match = records.filter(pk=pk).first()
    if match is not None:
        return api_response(
            _result_record_to_dict(
                descriptor.source,
                match,
                include_payload=filters["include_payload"],
            )
        )

    return api_error_response(
        [{"code": "record_not_found", "message": "Запись не найдена"}],
        status_code=status.HTTP_404_NOT_FOUND,
    )
def post(self, request: Request, source_key: str):
    """Validate the run request and enqueue the parser's Celery task.

    Responds with 404 for an unknown source, 403 when a non-staff user
    supplies ``proxies``, an error response when a required ``file_url`` is
    missing, and 202 with the Celery task id on success.
    """
    descriptor = PARSER_SOURCES.get(source_key)
    if descriptor is None:
        unknown_source = {
            "code": "unknown_parser_source",
            "message": f"Неизвестный парсер: {source_key}",
        }
        return api_error_response(
            [unknown_source],
            status_code=status.HTTP_404_NOT_FOUND,
        )

    serializer = ParserRunRequestSerializer(data=request.data)
    serializer.is_valid(raise_exception=True)
    # Empty strings and nulls are treated as "not provided".
    params = {}
    for key, value in serializer.validated_data.items():
        if value not in ("", None):
            params[key] = value

    if params.get("proxies") and not request.user.is_staff:
        proxy_forbidden = {
            "code": "proxy_override_forbidden",
            "message": "Прокси для запуска парсера может задавать только staff",
        }
        return api_error_response(
            [proxy_forbidden],
            status_code=status.HTTP_403_FORBIDDEN,
        )

    if descriptor.requires_file_url and not params.get("file_url"):
        file_url_missing = {
            "code": "file_url_required",
            "message": "Для этого парсера нужен параметр file_url",
        }
        return api_error_response([file_url_missing])

    celery_task = TASKS_BY_NAME[descriptor.task_name]
    task_kwargs = build_task_kwargs(source_key, params, request.user.id)
    async_result = celery_task.delay(**task_kwargs)

    accepted = {
        "task_id": async_result.id,
        "source": descriptor.source,
        "task_name": descriptor.task_name,
    }
    return api_response(accepted, status_code=status.HTTP_202_ACCEPTED)
def post(self, request: Request, source_key: str):
    """Store the uploaded file and enqueue the Celery import for it.

    Responds with 404 for an unknown source, 400 when the source does not
    support manual uploads, and 202 with the Celery task id on success. If
    the task cannot be enqueued, the stored file is deleted before the
    exception propagates.
    """
    descriptor = PARSER_SOURCES.get(source_key)
    if descriptor is None:
        unknown_source = {
            "code": "unknown_parser_source",
            "message": f"Неизвестный парсер: {source_key}",
        }
        return api_error_response(
            [unknown_source],
            status_code=status.HTTP_404_NOT_FOUND,
        )

    if not descriptor.supports_file_upload:
        upload_unsupported = {
            "code": "upload_not_supported",
            "message": "Для этого источника ручная загрузка файла не поддерживается",
        }
        return api_error_response(
            [upload_unsupported],
            status_code=status.HTTP_400_BAD_REQUEST,
        )

    serializer = ParserUploadRequestSerializer(data=request.data)
    serializer.is_valid(raise_exception=True)
    uploaded_file = serializer.validated_data["file"]
    storage_path = _save_uploaded_parser_file(uploaded_file)

    import_kwargs = {
        "source_key": source_key,
        "storage_path": storage_path,
        "file_name": uploaded_file.name,
        "user_id": request.user.id,
    }
    try:
        async_result = tasks.import_parser_upload.delay(**import_kwargs)
    except Exception:
        # Do not leave an orphaned file behind when enqueueing fails.
        default_storage.delete(storage_path)
        raise

    accepted = {
        "task_id": async_result.id,
        "source": descriptor.source,
        "task_name": "apps.parsers.tasks.import_parser_upload",
    }
    return api_response(accepted, status_code=status.HTTP_202_ACCEPTED)
def get(self, request: Request):
    """Return records filtered by source and organisation requisites.

    Query parameters:
        limit: Maximum number of records (validated by
            ``ParserListQuerySerializer``).
        id: Optional record id; must be a non-negative decimal integer.
        source: Optional source value. Sources listed in
            ``NATIVE_RECORD_MODELS`` are read from their own model and
            converted to the generic DTO.
        inn, ogrn: Optional exact-match filters.

    Returns:
        An API response with the list of records, or an error response with
        code ``invalid_record_id`` when ``id`` is not a valid integer.
    """
    query_serializer = ParserListQuerySerializer(data=request.query_params)
    query_serializer.is_valid(raise_exception=True)
    limit = query_serializer.validated_data["limit"]

    record_id = request.query_params.get("id")
    source = request.query_params.get("source")
    inn = request.query_params.get("inn")
    ogrn = request.query_params.get("ogrn")
    if record_id:
        # str.isdigit() also accepts characters such as "²" that int() cannot
        # parse, which used to surface as an unhandled ValueError. isdecimal()
        # matches exactly the characters int() understands.
        try:
            parsed_id = int(record_id) if record_id.isdecimal() else None
        except ValueError:
            # Digit strings longer than the interpreter's int conversion limit.
            parsed_id = None
        if parsed_id is None:
            return api_error_response(
                [{"code": "invalid_record_id", "message": "id должен быть числом"}]
            )
        record_id = parsed_id

    if source in NATIVE_RECORD_MODELS:
        data = self._get_native_records(
            source=source,
            record_id=record_id,
            inn=inn,
            ogrn=ogrn,
            limit=limit,
        )
        return api_response(data)

    queryset = GenericParserRecord.objects.all()
    if record_id:
        queryset = queryset.filter(id=record_id)
    if source:
        queryset = queryset.filter(source=source)
    if inn:
        queryset = queryset.filter(inn=inn)
    if ogrn:
        queryset = queryset.filter(ogrn=ogrn)

    serializer = GenericParserRecordSerializer(queryset[:limit], many=True)
    return api_response(serializer.data)
def post(self, request: Request):
    """Create or update a parser schedule.

    The schedule is stored as a PeriodicTask and upserted by name, so the
    target name is checked against the caller's access before saving.

    Responses:
        201: the saved schedule.
        error response (default status): ``source_key`` or a required
            ``file_url`` is missing.
        403: a non-staff user supplied ``proxies``.
        404: unknown ``source_key``.
        409: the target name belongs to a periodic task the caller may not
            modify.
    """
    serializer = ParserScheduleRequestSerializer(data=request.data)
    serializer.is_valid(raise_exception=True)
    # Empty strings and nulls are treated as "not provided".
    params = {
        key: value
        for key, value in serializer.validated_data.items()
        if value not in ("", None)
    }
    source_key = params.get("source_key")
    if not source_key:
        return api_error_response(
            [{"code": "source_required", "message": "source_key обязателен"}]
        )

    descriptor = PARSER_SOURCES.get(source_key)
    if descriptor is None:
        return api_error_response(
            [
                {
                    "code": "unknown_parser_source",
                    "message": f"Неизвестный парсер: {source_key}",
                }
            ],
            status_code=status.HTTP_404_NOT_FOUND,
        )

    if params.get("proxies") and not request.user.is_staff:
        return api_error_response(
            [
                {
                    "code": "proxy_override_forbidden",
                    "message": "Прокси для запуска парсера может задавать только staff",
                }
            ],
            status_code=status.HTTP_403_FORBIDDEN,
        )
    if descriptor.requires_file_url and not params.get("file_url"):
        return api_error_response(
            [
                {
                    "code": "file_url_required",
                    "message": "Для этого расписания нужен параметр file_url",
                }
            ]
        )

    # _save_periodic_task upserts by name only. Without this guard a caller
    # could pass the name of a periodic task they cannot access (another
    # user's schedule or a non-parser task) and silently overwrite it.
    # The name is derived the same way _save_periodic_task derives it.
    target_name = params.get("name") or f"parser:{source_key}:user:{request.user.id}"
    existing = PeriodicTask.objects.filter(name=target_name).first()
    if (
        existing is not None
        and _get_parser_periodic_task_for_user(existing.pk, request.user) is None
    ):
        return api_error_response(
            [
                {
                    "code": "schedule_name_conflict",
                    "message": "Расписание с таким именем уже существует и недоступно для изменения",
                }
            ],
            status_code=status.HTTP_409_CONFLICT,
        )

    periodic_task = self._save_periodic_task(
        source_key=source_key,
        params=params,
        user_id=request.user.id,
    )
    response_serializer = ParserScheduleSerializer(
        _periodic_task_to_dict(periodic_task)
    )
    return api_response(
        response_serializer.data,
        status_code=status.HTTP_201_CREATED,
    )
def patch(self, request: Request, pk: int):
    """Update the schedule or the parameters of an existing periodic task.

    The request body is merged over the task's current settings, validated
    with ``ParserScheduleRequestSerializer`` and saved under the task's
    existing name.

    Responses:
        200: the updated schedule.
        error response (default status): a required ``file_url`` is missing.
        403: a non-staff user supplied ``proxies``.
        404: the schedule is not accessible to the user, or the resulting
            ``source_key`` is unknown.
    """
    periodic_task = _get_parser_periodic_task_for_user(pk, request.user)
    if periodic_task is None:
        return api_error_response(
            [{"code": "schedule_not_found", "message": "Расписание не найдено"}],
            status_code=status.HTTP_404_NOT_FOUND,
        )

    # Same rule as schedule creation: only staff may override proxies. Only
    # the incoming body is checked so already-stored settings do not block
    # unrelated edits.
    if request.data.get("proxies") and not request.user.is_staff:
        return api_error_response(
            [
                {
                    "code": "proxy_override_forbidden",
                    "message": "Прокси для запуска парсера может задавать только staff",
                }
            ],
            status_code=status.HTTP_403_FORBIDDEN,
        )

    current = _periodic_task_to_dict(periodic_task)
    data = {
        "source_key": current["source_key"],
        "name": periodic_task.name,
        "enabled": periodic_task.enabled,
        "schedule_type": current["schedule_type"],
        **current["schedule"],
        **current["params"],
        **request.data,
    }
    serializer = ParserScheduleRequestSerializer(data=data)
    serializer.is_valid(raise_exception=True)
    params = {
        key: value
        for key, value in serializer.validated_data.items()
        if value not in ("", None)
    }
    # The task is addressed by pk; it is never renamed through PATCH.
    params["name"] = periodic_task.name

    source_key = params.get("source_key")
    descriptor = PARSER_SOURCES.get(source_key) if source_key else None
    if descriptor is None:
        # Previously an unknown or missing source raised KeyError (HTTP 500).
        return api_error_response(
            [
                {
                    "code": "unknown_parser_source",
                    "message": f"Неизвестный парсер: {source_key}",
                }
            ],
            status_code=status.HTTP_404_NOT_FOUND,
        )
    if descriptor.requires_file_url and not params.get("file_url"):
        return api_error_response(
            [
                {
                    "code": "file_url_required",
                    "message": "Для этого расписания нужен параметр file_url",
                }
            ]
        )

    # Keep the schedule attached to its original owner. Visibility for
    # non-staff users is decided by the ``user_id`` stored in the task kwargs,
    # so saving with the editor's id would reassign the schedule when staff
    # edit someone else's task.
    # NOTE(review): assumes build_task_kwargs stores the given user_id in the
    # task kwargs - confirm.
    owner_id = _parse_periodic_task_kwargs(periodic_task).get("user_id")
    if not owner_id:
        owner_id = request.user.id

    updated = ParserScheduleListCreateView()._save_periodic_task(
        source_key=source_key,
        params=params,
        user_id=owner_id,
    )
    response_serializer = ParserScheduleSerializer(_periodic_task_to_dict(updated))
    return api_response(response_serializer.data)
class ParserDashboardDataView(APIView):
    """Aggregated data for the parser management page."""

    permission_classes = [IsAuthenticated]

    @swagger_auto_schema(
        operation_summary="Данные dashboard парсеров",
        operation_description="Источники, расписания, фоновые задачи, load logs и счетчики.",
        tags=[PARSERS_TAG],
        responses={200: "Dashboard payload"},
    )
    def get(self, request: Request):
        """Return sources, schedules, jobs, recent load logs and record counts."""
        sources = ParserSourceSerializer(PARSER_SOURCES.values(), many=True).data
        schedules = []
        for periodic_task in _parser_periodic_tasks_for_user(request.user):
            schedules.append(_periodic_task_to_dict(periodic_task))
        jobs = BackgroundJobService.get_user_jobs(user_id=request.user.id, limit=20)
        load_logs = ParserLoadLog.objects.all()[:20]

        # Per-source counts of generic records; native models are counted
        # separately and merged in under their own source keys.
        generic_counts = (
            GenericParserRecord.objects.values("source")
            .annotate(count=Count("id"))
            .values_list("source", "count")
        )
        source_counts = dict(generic_counts)
        for native_source, native_model in NATIVE_RECORD_MODELS.items():
            source_counts[native_source] = native_model.objects.count()

        registry_modes = {
            "native_api",
            "official_api",
            "official_search",
            "official_source",
        }
        financial_source = ParserLoadLog.Source.FNS_FINANCIAL
        registries = []
        financial_reports = []
        uploads = []
        for item in sources:
            if item["source"] != financial_source and item["mode"] in registry_modes:
                registries.append(item)
        for item in sources:
            if item["source"] == financial_source:
                financial_reports.append(item)
        for item in sources:
            if item["supports_file_upload"]:
                uploads.append(item)

        api_urls = {
            "login": "/api/v1/users/login/",
            "sources": "/api/v1/parsers/sources/",
            "run": "/api/v1/parsers/run/{source_key}/",
            "upload": "/api/v1/parsers/upload/{source_key}/",
            "schedules": "/api/v1/parsers/schedules/",
            "jobs": "/api/v1/jobs/",
            "job_stream": "/api/v1/jobs/{task_id}/stream/",
            "load_logs": "/api/v1/parsers/load-logs/",
            "records": "/api/v1/parsers/records/",
            "frontend_sources": "/api/v1/sources/",
            "frontend_source_statuses": "/api/v1/sources/statuses/",
            "frontend_source_detail": "/api/v1/sources/{slug}/",
            "frontend_source_refresh": "/api/v1/sources/{slug}/refresh/",
            "parsing_settings": "/api/v1/parsing/settings/",
            "system_logs": "/api/v1/system/logs/",
            "system_logs_export": "/api/v1/system/logs/export/",
        }

        data = {
            "sources": sources,
            "groups": {
                "registries": registries,
                "financial_reports": financial_reports,
                "uploads": uploads,
            },
            "schedules": ParserScheduleSerializer(schedules, many=True).data,
            "jobs": BackgroundJobListSerializer(jobs, many=True).data,
            "load_logs": ParserLoadLogSerializer(load_logs, many=True).data,
            "source_counts": source_counts,
            "api": api_urls,
        }
        return api_response(data)
@admin.register(RegistryMembershipPeriod)
class RegistryMembershipPeriodAdmin(admin.ModelAdmin):
    """Admin list of registry-to-organization membership rows."""

    # Both columns are foreign keys, so they are joined into the list query.
    list_select_related = ("registry", "organization")
    list_display = ("registry", "organization")
    list_filter = ("registry",)
    search_fields = (
        "registry__name",
        "organization__pn_name",
        "organization__mn_ogrn",
        "organization__mn_inn",
    )
def seed_default_registers(apps, schema_editor):
    """Ensure every name in DEFAULT_REGISTER_NAMES exists as a Register row.

    Uses the historical model from ``apps`` and the database alias of the
    running migration; existing rows are left untouched.
    """
    registers = apps.get_model("registers", "Register").objects.using(
        schema_editor.connection.alias
    )
    for register_name in DEFAULT_REGISTER_NAMES:
        registers.get_or_create(name=register_name)
help_text="Дата и время последнего обновления", + verbose_name="обновлено", + ), + ), + ( + "pn_name", + models.TextField( + help_text="Полное наименование организации", + verbose_name="наименование", + ), + ), + ( + "mn_ogrn", + models.BigIntegerField( + db_index=True, + help_text="ОГРН организации", + verbose_name="ОГРН", + ), + ), + ( + "mn_inn", + models.BigIntegerField( + db_index=True, + help_text="ИНН организации", + verbose_name="ИНН", + ), + ), + ( + "in_kpp", + models.BigIntegerField( + blank=True, + db_index=True, + help_text="КПП организации", + null=True, + verbose_name="КПП", + ), + ), + ( + "mn_okpo", + models.TextField( + db_index=True, + help_text="ОКПО организации", + validators=[ + django.core.validators.RegexValidator( + message="ОКПО должен содержать только цифры", + regex="^\\d+$", + ) + ], + verbose_name="ОКПО", + ), + ), + ], + options={ + "verbose_name": "организация", + "verbose_name_plural": "организации", + "db_table": "registers_organization", + "ordering": ["pn_name"], + }, + ), + migrations.CreateModel( + name="RegisterUpload", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "created_at", + models.DateTimeField( + auto_now_add=True, + db_index=True, + help_text="Дата и время создания записи", + verbose_name="создано", + ), + ), + ( + "updated_at", + models.DateTimeField( + auto_now=True, + help_text="Дата и время последнего обновления", + verbose_name="обновлено", + ), + ), + ( + "actual_date", + models.DateField( + db_index=True, + help_text="Дата среза реестра, к которой относится загрузка", + verbose_name="дата актуальности", + ), + ), + ( + "file_name", + models.CharField( + help_text="Оригинальное имя загруженного файла", + max_length=255, + verbose_name="имя файла", + ), + ), + ( + "file_hash", + models.CharField( + db_index=True, + help_text="SHA-256 хеш загруженного файла", + max_length=64, + verbose_name="хеш файла", + ), + ), + ( + 
"import_status", + models.CharField( + choices=[("success", "Успешная"), ("failed", "Ошибка")], + db_index=True, + default="success", + max_length=16, + verbose_name="статус импорта", + ), + ), + ( + "import_error", + models.TextField( + blank=True, + default="", + verbose_name="текст ошибки импорта", + ), + ), + ( + "rows_count", + models.PositiveIntegerField( + default=0, + verbose_name="количество строк", + ), + ), + ( + "registry", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="uploads", + to="registers.register", + verbose_name="реестр", + ), + ), + ( + "uploaded_by", + models.ForeignKey( + blank=True, + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="register_uploads", + to=settings.AUTH_USER_MODEL, + verbose_name="загружено пользователем", + ), + ), + ], + options={ + "verbose_name": "загрузка реестра", + "verbose_name_plural": "загрузки реестров", + "db_table": "registers_upload", + "ordering": ["-actual_date", "-created_at"], + }, + ), + migrations.CreateModel( + name="RegistryMembershipPeriod", + fields=[ + ( + "id", + models.BigAutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ( + "created_at", + models.DateTimeField( + auto_now_add=True, + db_index=True, + help_text="Дата и время создания записи", + verbose_name="создано", + ), + ), + ( + "updated_at", + models.DateTimeField( + auto_now=True, + help_text="Дата и время последнего обновления", + verbose_name="обновлено", + ), + ), + ( + "organization", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="membership_periods", + to="registers.organization", + verbose_name="организация", + ), + ), + ( + "registry", + models.ForeignKey( + on_delete=django.db.models.deletion.CASCADE, + related_name="membership_periods", + to="registers.register", + verbose_name="реестр", + ), + ), + ], + options={ + "verbose_name": "участие в реестре", + "verbose_name_plural": 
"участия в реестрах", + "db_table": "registers_membership_period", + "ordering": ["registry_id", "organization_id"], + }, + ), + migrations.AddIndex( + model_name="organization", + index=models.Index(fields=["mn_ogrn", "mn_inn"], name="registers__mn_ogrn_idx"), + ), + migrations.AddIndex( + model_name="organization", + index=models.Index(fields=["mn_okpo"], name="registers__mn_okpo_idx"), + ), + migrations.AddConstraint( + model_name="organization", + constraint=models.UniqueConstraint( + fields=("mn_ogrn", "mn_inn"), + name="unique_registers_organization_identity", + ), + ), + migrations.AddIndex( + model_name="registerupload", + index=models.Index( + fields=["registry", "actual_date"], + name="registers_upload_registry_date_idx", + ), + ), + migrations.AddIndex( + model_name="registerupload", + index=models.Index( + fields=["registry", "file_hash"], + name="registers_upload_registry_hash_idx", + ), + ), + migrations.AddIndex( + model_name="registrymembershipperiod", + index=models.Index(fields=["registry"], name="registers_members_registry_idx"), + ), + migrations.AddIndex( + model_name="registrymembershipperiod", + index=models.Index( + fields=["organization"], + name="registers_members_organization_idx", + ), + ), + migrations.AddConstraint( + model_name="registrymembershipperiod", + constraint=models.UniqueConstraint( + fields=("registry", "organization"), + name="unique_membership", + ), + ), + migrations.RunPython(seed_default_registers, migrations.RunPython.noop), + ] diff --git a/src/apps/registers/migrations/__init__.py b/src/apps/registers/migrations/__init__.py new file mode 100644 index 0000000..4cd7d51 --- /dev/null +++ b/src/apps/registers/migrations/__init__.py @@ -0,0 +1 @@ +"""Миграции приложения реестров организаций.""" diff --git a/src/apps/registers/models.py b/src/apps/registers/models.py new file mode 100644 index 0000000..9af910e --- /dev/null +++ b/src/apps/registers/models.py @@ -0,0 +1,195 @@ +"""Модели приложения реестров организаций.""" 
class Organization(TimestampMixin, models.Model):
    """Canonical organization record shared by all registries.

    A row is identified by the ``(mn_ogrn, mn_inn)`` pair, which is enforced
    by the ``unique_registers_organization_identity`` constraint.
    """

    pn_name = models.TextField(
        _("наименование"),
        help_text=_("Полное наименование организации"),
    )
    mn_ogrn = models.BigIntegerField(
        _("ОГРН"),
        db_index=True,
        help_text=_("ОГРН организации"),
    )
    mn_inn = models.BigIntegerField(
        _("ИНН"),
        db_index=True,
        help_text=_("ИНН организации"),
    )
    in_kpp = models.BigIntegerField(
        _("КПП"),
        db_index=True,
        null=True,
        blank=True,
        help_text=_("КПП организации"),
    )
    # Stored as text (digits only) rather than as an integer.
    mn_okpo = models.TextField(
        _("ОКПО"),
        db_index=True,
        validators=[
            RegexValidator(
                regex=r"^\d+$",
                message=_("ОКПО должен содержать только цифры"),
            )
        ],
        help_text=_("ОКПО организации"),
    )

    class Meta:
        db_table = "registers_organization"
        verbose_name = _("организация")
        verbose_name_plural = _("организации")
        ordering = ["pn_name"]
        indexes = [
            # Index names are pinned to the ones created by the initial
            # migration. Without an explicit name Django derives a hashed
            # one, which no longer matches the migration state and makes
            # ``makemigrations`` report a pending change.
            models.Index(
                fields=["mn_ogrn", "mn_inn"],
                name="registers__mn_ogrn_idx",
            ),
            models.Index(fields=["mn_okpo"], name="registers__mn_okpo_idx"),
        ]
        constraints = [
            models.UniqueConstraint(
                fields=["mn_ogrn", "mn_inn"],
                name="unique_registers_organization_identity",
            ),
        ]

    def __str__(self) -> str:
        """Return the name truncated to 60 characters plus the OGRN/INN pair."""
        return f"{self.pn_name[:60]} ({self.mn_ogrn}/{self.mn_inn})"
class RegistryMembershipPeriod(TimestampMixin, models.Model):
    """Current membership of an organization in a registry.

    Each ``(registry, organization)`` pair may appear at most once; this is
    enforced by the ``unique_membership`` constraint. Rows are removed when
    either the registry or the organization is deleted (``CASCADE``).
    """

    # Registry the organization belongs to; reverse accessor is
    # ``Register.membership_periods``.
    registry = models.ForeignKey(
        Register,
        on_delete=models.CASCADE,
        related_name="membership_periods",
        verbose_name=_("реестр"),
    )
    # Member organization; reverse accessor is
    # ``Organization.membership_periods``.
    organization = models.ForeignKey(
        Organization,
        on_delete=models.CASCADE,
        related_name="membership_periods",
        verbose_name=_("организация"),
    )

    class Meta:
        db_table = "registers_membership_period"
        verbose_name = _("участие в реестре")
        verbose_name_plural = _("участия в реестрах")
        ordering = ["registry_id", "organization_id"]
        indexes = [
            models.Index(fields=["registry"]),
            models.Index(fields=["organization"]),
        ]
        constraints = [
            models.UniqueConstraint(
                fields=["registry", "organization"],
                name="unique_membership",
            ),
        ]

    def __str__(self) -> str:
        # Dereferences both foreign keys, so this may hit the database when
        # the related objects were not loaded with the row.
        return f"{self.registry.name}: {self.organization.pn_name[:40]}"
class RegisterFileUploadSerializer(serializers.Serializer):
    """Validate the multipart payload of a registry file upload.

    Fields:
        registry: primary key of an existing ``Register``.
        actual_date: optional snapshot date of the uploaded file.
        file: the uploaded workbook; only ``.xlsx`` names are accepted.
    """

    registry = serializers.PrimaryKeyRelatedField(queryset=Register.objects.all())
    actual_date = serializers.DateField(required=False)
    file = serializers.FileField()

    def validate_file(self, value):
        """Accept the file only when its name ends with ``.xlsx`` (any case)."""
        has_xlsx_suffix = value.name.lower().endswith(".xlsx")
        if has_xlsx_suffix:
            return value
        raise serializers.ValidationError("Поддерживаются только файлы .xlsx")
class RegistryOrganizationListQuerySerializer(serializers.Serializer):
    """Query parameters accepted by the per-registry organization list.

    Every field is optional. Numeric identifiers must be non-negative
    integers; ``mn_okpo`` is a non-empty string of digits.
    """

    search = serializers.CharField(required=False, allow_blank=True)
    mn_ogrn = serializers.IntegerField(required=False, min_value=0)
    mn_inn = serializers.IntegerField(required=False, min_value=0)
    in_kpp = serializers.IntegerField(required=False, min_value=0)
    mn_okpo = serializers.CharField(required=False, allow_blank=False)

    def validate_mn_okpo(self, value):
        """Return ``value`` unchanged when it consists of digits only."""
        if value.isdigit():
            return value
        raise serializers.ValidationError("mn_okpo должен содержать только цифры")
class RegisterImportService:
    """Import organizations from an Excel workbook into a selected registry.

    The public entry points are :meth:`sync_registry_memberships` (full
    snapshot import), :meth:`parse_xlsx` (workbook parsing only) and the two
    queryset builders used by the list endpoints. Everything prefixed with an
    underscore is an implementation detail.
    """

    # Columns that must be present in the header row of the workbook.
    REQUIRED_HEADERS = {"pn_name", "mn_ogrn", "mn_inn", "mn_okpo"}

    # Read size used while hashing the uploaded file (1 MiB).
    _HASH_CHUNK_SIZE = 1024 * 1024

    @classmethod
    def sync_registry_memberships(
        cls,
        *,
        registry: Register,
        uploaded_file,
        file_name: str = "",
        actual_date: date | None = None,
        uploaded_by=None,
        uploaded_by_id: int | None = None,
    ) -> dict[str, int | str]:
        """Replace the current state of ``registry`` with the uploaded snapshot.

        The file is treated as a complete snapshot: organizations missing
        from it lose their membership in this registry, new ones gain it.

        Args:
            registry: registry being refreshed.
            uploaded_file: seekable binary file object with the workbook.
            file_name: name recorded on the upload; falls back to
                ``uploaded_file.name`` and then to ``"registry.xlsx"``.
            actual_date: snapshot date; defaults to the current local date.
            uploaded_by: user instance recorded on the upload, if any.
            uploaded_by_id: user id recorded when ``uploaded_by`` is not given.

        Returns:
            A summary dict with the upload id, registry id/name, snapshot
            date and the created/updated/added/removed counters.

        Raises:
            RegisterImportError: on any validation or import failure. When
                the failure happens after the upload record was created, the
                record is marked as failed before the error is re-raised.
        """
        snapshot_date = actual_date or timezone.localdate()
        # Runs before the upload record exists, so a rejected date leaves no
        # RegisterUpload row behind.
        cls._validate_snapshot_date(registry=registry, snapshot_date=snapshot_date)
        file_hash = cls._calculate_file_hash(uploaded_file)
        upload_kwargs = {
            "registry": registry,
            "actual_date": snapshot_date,
            "file_name": file_name
            or getattr(uploaded_file, "name", "")
            or "registry.xlsx",
            "file_hash": file_hash,
            "rows_count": 0,
        }
        if uploaded_by is not None:
            upload_kwargs["uploaded_by"] = uploaded_by
        elif uploaded_by_id is not None:
            upload_kwargs["uploaded_by_id"] = uploaded_by_id
        upload = RegisterUpload.objects.create(**upload_kwargs)

        try:
            rows = cls._ensure_unique_identities(cls.parse_xlsx(uploaded_file))
            with transaction.atomic():
                (
                    snapshot_org_ids,
                    created_count,
                    updated_count,
                ) = cls._upsert_organizations(rows)
                existing_org_ids = set(
                    RegistryMembershipPeriod.objects.filter(
                        registry=registry
                    ).values_list(
                        "organization_id",
                        flat=True,
                    )
                )
                snapshot_org_ids_set = set(snapshot_org_ids)
                removed_count = RegistryMembershipPeriod.objects.filter(
                    registry=registry,
                    organization_id__in=existing_org_ids - snapshot_org_ids_set,
                ).delete()[0]
                new_org_ids = snapshot_org_ids_set - existing_org_ids
                RegistryMembershipPeriod.objects.bulk_create(
                    [
                        RegistryMembershipPeriod(
                            registry=registry,
                            organization_id=organization_id,
                        )
                        for organization_id in new_org_ids
                    ],
                    batch_size=1000,
                    ignore_conflicts=True,
                )
                active_count = RegistryMembershipPeriod.objects.filter(
                    registry=registry
                ).count()

                # Saved inside the transaction so the membership changes and
                # the upload status are committed (or rolled back) together.
                upload.rows_count = len(rows)
                upload.import_status = RegisterUpload.ImportStatus.SUCCESS
                upload.import_error = ""
                upload.save(
                    update_fields=[
                        "rows_count",
                        "import_status",
                        "import_error",
                        "updated_at",
                    ]
                )

            return {
                "upload_id": upload.id,
                "registry_id": str(registry.id),
                "registry_name": registry.name,
                "actual_date": snapshot_date.isoformat(),
                "rows_in_file": len(rows),
                "organizations_created": created_count,
                "organizations_updated": updated_count,
                "memberships_added": len(new_org_ids),
                "memberships_removed": removed_count,
                "active_memberships": active_count,
            }
        except Exception as exc:
            # Boundary handler: record the failure on the upload row, then
            # surface every error to callers as RegisterImportError.
            upload.import_status = RegisterUpload.ImportStatus.FAILED
            upload.import_error = str(exc)
            upload.save(update_fields=["import_status", "import_error", "updated_at"])
            if isinstance(exc, RegisterImportError):
                raise
            raise RegisterImportError(str(exc)) from exc

    @classmethod
    def get_organizations_queryset(
        cls,
        *,
        registry: Register | None = None,
        search: str = "",
        mn_ogrn: int | None = None,
        mn_inn: int | None = None,
        in_kpp: int | None = None,
        mn_okpo: str | None = None,
    ):
        """Build the organization queryset for the given filters.

        Args:
            registry: when given, only members of this registry are returned.
            search: free-text query matched against name and identifiers.
            mn_ogrn, mn_inn, in_kpp, mn_okpo: exact-match filters.

        Returns:
            A queryset of ``Organization`` ordered by ``pn_name``.
        """
        queryset = Organization.objects.all().order_by("pn_name")
        if registry:
            queryset = queryset.filter(membership_periods__registry=registry)
            queryset = queryset.distinct()
        queryset = cls._apply_exact_filters(
            queryset,
            mn_ogrn=mn_ogrn,
            mn_inn=mn_inn,
            in_kpp=in_kpp,
            mn_okpo=mn_okpo,
        )
        # ``search`` may arrive as None from callers that pass it through
        # unvalidated; treat that the same as an empty query.
        return cls._apply_search(queryset, (search or "").strip())

    @classmethod
    def get_registry_organizations_queryset(
        cls,
        *,
        registry: Register,
        search: str = "",
        mn_ogrn: int | None = None,
        mn_inn: int | None = None,
        in_kpp: int | None = None,
        mn_okpo: str | None = None,
    ):
        """Build the organization queryset restricted to one registry.

        Thin wrapper around :meth:`get_organizations_queryset` where
        ``registry`` is mandatory.
        """
        return cls.get_organizations_queryset(
            registry=registry,
            search=search,
            mn_ogrn=mn_ogrn,
            mn_inn=mn_inn,
            in_kpp=in_kpp,
            mn_okpo=mn_okpo,
        )

    @classmethod
    def _upsert_organizations(
        cls,
        rows: list[ParsedOrganization],
    ) -> tuple[set[int], int, int]:
        """Create or refresh an ``Organization`` for every parsed row.

        Returns:
            ``(organization ids in the snapshot, created count, updated
            count)``. An existing organization counts as updated only when
            one of its fields actually changed.
        """
        snapshot_org_ids: set[int] = set()
        created_count = 0
        updated_count = 0
        # NOTE: issues at least one query per row.
        for row in rows:
            organization, created = Organization.objects.get_or_create(
                mn_ogrn=row.mn_ogrn,
                mn_inn=row.mn_inn,
                defaults={
                    "pn_name": row.pn_name,
                    "in_kpp": row.in_kpp,
                    "mn_okpo": row.mn_okpo,
                },
            )
            if created:
                created_count += 1
            elif cls._update_organization_fields(organization=organization, row=row):
                updated_count += 1
            snapshot_org_ids.add(organization.id)
        return snapshot_org_ids, created_count, updated_count

    @classmethod
    def _update_organization_fields(
        cls,
        *,
        organization: Organization,
        row: ParsedOrganization,
    ) -> bool:
        """Copy changed non-key fields from ``row`` onto ``organization``.

        Returns:
            True when at least one field differed and the row was saved.
        """
        update_fields: list[str] = []
        for field in ("pn_name", "in_kpp", "mn_okpo"):
            value = getattr(row, field)
            if getattr(organization, field) != value:
                setattr(organization, field, value)
                update_fields.append(field)
        if not update_fields:
            return False
        organization.save(update_fields=update_fields + ["updated_at"])
        return True

    @classmethod
    def parse_xlsx(cls, uploaded_file) -> list[ParsedOrganization]:
        """Read the active worksheet and return one entry per non-empty row.

        The first row is the header; data rows are numbered from 2 in error
        messages. The ``in_kpp`` column is optional.

        Raises:
            RegisterImportError: when the workbook cannot be opened, has no
                header row, lacks a required column, contains an invalid
                cell, or has no data rows.
        """
        try:
            uploaded_file.seek(0)
            workbook = load_workbook(uploaded_file, read_only=True, data_only=True)
        except Exception as exc:
            raise RegisterImportError("Не удалось прочитать Excel файл") from exc

        try:
            worksheet = workbook.active
            if worksheet is None:
                # A workbook without an active sheet has no header either.
                raise RegisterImportError("Файл не содержит заголовков")
            row_iter = worksheet.iter_rows(min_row=1, values_only=True)
            header_row = next(row_iter, None)
            if not header_row:
                raise RegisterImportError("Файл не содержит заголовков")

            header_index_map = cls._build_header_index_map(header_row)
            cls._validate_headers(header_index_map)
            kpp_index = header_index_map.get("in_kpp")

            organizations: list[ParsedOrganization] = []
            for row_number, row_values in enumerate(row_iter, start=2):
                if cls._is_empty_row(row_values):
                    continue
                # Cells are read through _cell() because a data row can be
                # shorter than the header row.
                organizations.append(
                    ParsedOrganization(
                        pn_name=cls._as_required_text(
                            cls._cell(row_values, header_index_map["pn_name"]),
                            field_name="pn_name",
                            row_number=row_number,
                        ),
                        mn_ogrn=cls._as_required_int(
                            cls._cell(row_values, header_index_map["mn_ogrn"]),
                            field_name="mn_ogrn",
                            row_number=row_number,
                        ),
                        mn_inn=cls._as_required_int(
                            cls._cell(row_values, header_index_map["mn_inn"]),
                            field_name="mn_inn",
                            row_number=row_number,
                        ),
                        in_kpp=(
                            None
                            if kpp_index is None
                            else cls._as_optional_int(
                                cls._cell(row_values, kpp_index),
                                field_name="in_kpp",
                                row_number=row_number,
                            )
                        ),
                        mn_okpo=cls._as_numeric_text(
                            cls._cell(row_values, header_index_map["mn_okpo"]),
                            field_name="mn_okpo",
                            row_number=row_number,
                        ),
                    )
                )

            if not organizations:
                raise RegisterImportError("Файл не содержит строк с организациями")
            return organizations
        finally:
            workbook.close()

    @classmethod
    def _calculate_file_hash(cls, uploaded_file) -> str:
        """Return the SHA-256 hex digest of the file, leaving it rewound."""
        uploaded_file.seek(0)
        hasher = hashlib.sha256()
        while chunk := uploaded_file.read(cls._HASH_CHUNK_SIZE):
            hasher.update(chunk)
        uploaded_file.seek(0)
        return hasher.hexdigest()

    @classmethod
    def _validate_snapshot_date(
        cls, *, registry: Register, snapshot_date: date
    ) -> None:
        """Reject a snapshot older than the latest successful upload.

        Raises:
            RegisterImportError: when ``snapshot_date`` precedes the
                ``actual_date`` of the newest successful upload.
        """
        latest = (
            RegisterUpload.objects.filter(
                registry=registry,
                import_status=RegisterUpload.ImportStatus.SUCCESS,
            )
            .order_by("-actual_date")
            .first()
        )
        if latest and snapshot_date < latest.actual_date:
            raise RegisterImportError(
                "Дата актуальности не может быть раньше последней загрузки"
            )

    @classmethod
    def _ensure_unique_identities(
        cls,
        rows: list[ParsedOrganization],
    ) -> list[ParsedOrganization]:
        """Return ``rows`` unchanged, failing on a repeated (OGRN, INN) key.

        Raises:
            RegisterImportError: on the first duplicated key.
        """
        seen_keys: set[tuple[int, int]] = set()
        for row in rows:
            key = (row.mn_ogrn, row.mn_inn)
            if key in seen_keys:
                raise RegisterImportError(
                    "Файл содержит дубли по ключу (mn_ogrn, mn_inn): "
                    f"{row.mn_ogrn}, {row.mn_inn}"
                )
            seen_keys.add(key)
        return rows

    @classmethod
    def _apply_exact_filters(
        cls,
        queryset,
        *,
        mn_ogrn: int | None,
        mn_inn: int | None,
        in_kpp: int | None,
        mn_okpo: str | None,
    ):
        """Narrow ``queryset`` by every identifier filter that was supplied."""
        if mn_ogrn is not None:
            queryset = queryset.filter(mn_ogrn=mn_ogrn)
        if mn_inn is not None:
            queryset = queryset.filter(mn_inn=mn_inn)
        if in_kpp is not None:
            queryset = queryset.filter(in_kpp=in_kpp)
        if mn_okpo:
            queryset = queryset.filter(mn_okpo=mn_okpo)
        return queryset

    @classmethod
    def _apply_search(cls, queryset, search_query: str):
        """Filter by a case-insensitive substring over name and identifiers.

        Integer identifiers are cast to text so that partial numbers match.
        An empty query returns ``queryset`` untouched.
        """
        if not search_query:
            return queryset
        return queryset.annotate(
            mn_ogrn_text=Cast("mn_ogrn", output_field=CharField()),
            mn_inn_text=Cast("mn_inn", output_field=CharField()),
            in_kpp_text=Cast("in_kpp", output_field=CharField()),
        ).filter(
            Q(pn_name__icontains=search_query)
            | Q(mn_ogrn_text__icontains=search_query)
            | Q(mn_inn_text__icontains=search_query)
            | Q(in_kpp_text__icontains=search_query)
            | Q(mn_okpo__icontains=search_query)
        )

    @classmethod
    def _build_header_index_map(cls, header_row: tuple) -> dict[str, int]:
        """Map each normalized, non-empty header name to its column index.

        When a header name repeats, the right-most column wins.
        """
        header_index_map: dict[str, int] = {}
        for index, value in enumerate(header_row):
            normalized = cls._normalize_header(value)
            if normalized:
                header_index_map[normalized] = index
        return header_index_map

    @classmethod
    def _validate_headers(cls, header_index_map: dict[str, int]) -> None:
        """Raise ``RegisterImportError`` listing any missing required column."""
        missing = sorted(cls.REQUIRED_HEADERS - set(header_index_map))
        if missing:
            raise RegisterImportError(
                f"Отсутствуют обязательные колонки: {', '.join(missing)}"
            )

    @staticmethod
    def _normalize_header(value) -> str:
        """Return the header cell as trimmed lower-case text (``""`` if empty)."""
        return str(value or "").strip().lower()

    @staticmethod
    def _is_empty_row(row_values: tuple) -> bool:
        """Return True when every cell is ``None`` or whitespace-only."""
        return all(value is None or str(value).strip() == "" for value in row_values)

    @staticmethod
    def _cell(row_values: tuple, index: int):
        """Return the cell at ``index``, or ``None`` when the row is shorter."""
        return row_values[index] if index < len(row_values) else None

    @staticmethod
    def _to_text(value) -> str:
        """Convert a cell to trimmed text; only ``None`` counts as missing.

        ``str(value or "")`` would also discard a legitimate numeric ``0``.
        """
        return "" if value is None else str(value).strip()

    @classmethod
    def _as_required_text(cls, value, *, field_name: str, row_number: int) -> str:
        """Return the cell as non-empty trimmed text.

        Raises:
            RegisterImportError: when the cell is empty.
        """
        text_value = cls._to_text(value)
        if not text_value:
            raise RegisterImportError(
                f"Строка {row_number}: поле {field_name} обязательно"
            )
        return text_value

    @classmethod
    def _as_required_int(cls, value, *, field_name: str, row_number: int) -> int:
        """Return the cell as an integer.

        Raises:
            RegisterImportError: when the cell is empty or not a number.
        """
        parsed = cls._as_optional_int(
            value,
            field_name=field_name,
            row_number=row_number,
        )
        if parsed is None:
            raise RegisterImportError(
                f"Строка {row_number}: поле {field_name} обязательно"
            )
        return parsed

    @classmethod
    def _as_optional_int(cls, value, *, field_name: str, row_number: int) -> int | None:
        """Return the cell as an integer, or ``None`` when it is empty.

        Whitespace of any kind inside the number (including the non-breaking
        spaces Excel uses as thousands separators) is ignored, and a trailing
        ``.0`` produced by float cells is dropped.

        Raises:
            RegisterImportError: when the remaining text is not a number.
        """
        if value is None:
            return None
        text_value = "".join(str(value).split())
        if not text_value:
            return None
        if text_value.endswith(".0"):
            text_value = text_value[:-2]
        # isdecimal() matches exactly what int() can parse; isdigit() also
        # accepts characters such as superscripts, which make int() raise.
        if not text_value.isdecimal():
            raise RegisterImportError(
                f"Строка {row_number}: поле {field_name} должно быть числом"
            )
        return int(text_value)

    @classmethod
    def _as_numeric_text(cls, value, *, field_name: str, row_number: int) -> str:
        """Return the cell as a digits-only string, keeping leading zeros.

        Raises:
            RegisterImportError: when the cell is empty or contains anything
                other than digits.
        """
        text_value = "".join(cls._to_text(value).split())
        if text_value.endswith(".0") and text_value[:-2].isdecimal():
            text_value = text_value[:-2]
        if not text_value:
            raise RegisterImportError(
                f"Строка {row_number}: поле {field_name} обязательно"
            )
        if not text_value.isdecimal():
            raise RegisterImportError(
                f"Строка {row_number}: поле {field_name} должно содержать только цифры"
            )
        return text_value
@shared_task(bind=True)
def import_register_upload(
    self,
    *,
    registry_id: str,
    storage_path: str,
    file_name: str = "",
    actual_date: str | None = None,
    user_id: int | None = None,
) -> dict:
    """Import a stored registry workbook as a Celery task.

    Args:
        registry_id: primary key of the target ``Register``.
        storage_path: path of the workbook inside ``default_storage``.
        file_name: original file name recorded on the upload.
        actual_date: snapshot date in ISO format; empty means "today".
        user_id: id of the user who started the upload, if any.

    Returns:
        ``{"status": "success", ...import summary}`` on success, or
        ``{"status": "failed", "error": ..., "registry_id": ...}`` when the
        import itself fails.

    Raises:
        Register.DoesNotExist: when ``registry_id`` matches no registry.
            Errors raised while creating the background job also propagate.
    """
    # The outer try/finally covers the registry lookup and job creation too,
    # so the stored file is removed even when the task fails before the
    # import starts.
    try:
        task_id = self.request.id
        registry = Register.objects.get(id=registry_id)
        job = BackgroundJobService.create_job(
            task_id=task_id,
            task_name="apps.registers.tasks.import_register_upload",
            user_id=user_id,
            meta={
                "registry_id": str(registry.id),
                "registry_name": registry.name,
                "storage_path": storage_path,
                "file_name": file_name,
                "actual_date": actual_date or "",
            },
        )
        job.mark_started()
        job.update_progress(10, "Файл реестра получен...")

        try:
            parsed_actual_date = (
                date.fromisoformat(actual_date) if actual_date else None
            )
            job.update_progress(35, "Импорт организаций из Excel...")
            with default_storage.open(storage_path, "rb") as uploaded_file:
                result = RegisterImportService.sync_registry_memberships(
                    registry=registry,
                    uploaded_file=uploaded_file,
                    file_name=file_name,
                    actual_date=parsed_actual_date,
                    uploaded_by_id=user_id,
                )
            job.update_progress(90, "Фиксация результата загрузки...")
            job.complete(result=result)
            return {"status": "success", **result}
        except Exception as exc:
            # Task boundary: report the failure through the job record and
            # the return value instead of failing the Celery task.
            logger.error("Register upload import failed: %s", exc, exc_info=True)
            job.fail(error=str(exc))
            return {"status": "failed", "error": str(exc), "registry_id": registry_id}
    finally:
        # Best-effort cleanup; a leftover file must not mask the task result.
        try:
            default_storage.delete(storage_path)
        except Exception:
            logger.warning(
                "Failed to delete uploaded registry file: %s",
                storage_path,
                exc_info=True,
            )
import ( + OrganizationViewSet, + RegisterUploadListView, + RegisterUploadView, + RegisterViewSet, + RegistryOrganizationListView, +) +from django.urls import include, path +from rest_framework.routers import DefaultRouter + +app_name = "registers" + +router = DefaultRouter() +router.register("registries", RegisterViewSet, basename="registries") +router.register("organizations", OrganizationViewSet, basename="organizations") + +urlpatterns = [ + path("upload/", RegisterUploadView.as_view(), name="register-upload"), + path("uploads/", RegisterUploadListView.as_view(), name="register-upload-list"), + path( + "registries//organizations/", + RegistryOrganizationListView.as_view(), + name="registry-organizations-list", + ), + path("", include(router.urls)), +] diff --git a/src/apps/registers/views.py b/src/apps/registers/views.py new file mode 100644 index 0000000..c354303 --- /dev/null +++ b/src/apps/registers/views.py @@ -0,0 +1,286 @@ +"""Views для работы с реестрами организаций.""" + +import uuid + +from apps.core.response import api_error_response, api_response +from apps.registers import tasks +from apps.registers.models import Organization, Register, RegisterUpload +from apps.registers.serializers import ( + OrganizationDetailSerializer, + OrganizationListQuerySerializer, + OrganizationSerializer, + RegisterFileUploadSerializer, + RegisterSerializer, + RegisterUploadSerializer, + RegisterUploadTaskSerializer, + RegistryOrganizationListQuerySerializer, +) +from apps.registers.services import RegisterImportService +from django.core.files.storage import default_storage +from django.db.models import Count, Max, Q +from django.shortcuts import get_object_or_404 +from django.utils.text import get_valid_filename +from drf_yasg import openapi +from drf_yasg.utils import swagger_auto_schema +from rest_framework import status +from rest_framework.generics import ListAPIView +from rest_framework.parsers import FormParser, MultiPartParser +from rest_framework.permissions 
import IsAdminUser, IsAuthenticated +from rest_framework.views import APIView +from rest_framework.viewsets import ReadOnlyModelViewSet + +REGISTERS_TAG = "Registers" + + +def _save_uploaded_register_file(uploaded_file) -> str: + """Сохранить Excel во временное storage для Celery worker.""" + safe_name = get_valid_filename(uploaded_file.name or "registry.xlsx") + return default_storage.save( + f"register_uploads/{uuid.uuid4()}-{safe_name}", + uploaded_file, + ) + + +class RegisterViewSet(ReadOnlyModelViewSet): + """API для просмотра списка реестров.""" + + serializer_class = RegisterSerializer + permission_classes = [IsAuthenticated] + search_fields = ["name"] + ordering_fields = ["name", "created_at", "updated_at"] + ordering = ["name"] + + def get_queryset(self): + return Register.objects.annotate( + uploads_count=Count("uploads", distinct=True), + active_organizations=Count( + "membership_periods__organization", distinct=True + ), + last_upload_at=Max("uploads__created_at"), + ).order_by("name") + + @swagger_auto_schema( + tags=[REGISTERS_TAG], + operation_summary="List organization registries", + operation_description="Returns available organization registries.", + responses={200: RegisterSerializer(many=True)}, + ) + def list(self, request, *args, **kwargs): + return super().list(request, *args, **kwargs) + + @swagger_auto_schema( + tags=[REGISTERS_TAG], + operation_summary="Get organization registry", + responses={200: RegisterSerializer, 404: "Registry not found"}, + ) + def retrieve(self, request, *args, **kwargs): + return super().retrieve(request, *args, **kwargs) + + +class OrganizationViewSet(ReadOnlyModelViewSet): + """API для просмотра организаций из реестров.""" + + serializer_class = OrganizationSerializer + permission_classes = [IsAuthenticated] + search_fields = ["pn_name", "mn_okpo"] + ordering_fields = ["id", "pn_name", "mn_ogrn", "mn_inn", "in_kpp", "mn_okpo"] + ordering = ["pn_name"] + + def get_serializer_class(self): + if self.action == 
"retrieve": + return OrganizationDetailSerializer + return OrganizationSerializer + + def get_queryset(self): + if self.action == "retrieve": + return Organization.objects.prefetch_related( + "membership_periods__registry" + ).order_by("pn_name") + + serializer = OrganizationListQuerySerializer(data=self.request.query_params) + serializer.is_valid(raise_exception=True) + return RegisterImportService.get_organizations_queryset( + **serializer.validated_data + ) + + @swagger_auto_schema( + tags=[REGISTERS_TAG], + operation_summary="List registry organizations", + operation_description=( + "Returns organizations with filters by registry, search, mn_ogrn, " + "mn_inn, in_kpp and mn_okpo." + ), + manual_parameters=[ + openapi.Parameter( + "registry", + openapi.IN_QUERY, + type=openapi.TYPE_STRING, + format=openapi.FORMAT_UUID, + description="Registry UUID", + ), + openapi.Parameter( + "search", + openapi.IN_QUERY, + type=openapi.TYPE_STRING, + description="Search by name, OGRN, INN, KPP or OKPO", + ), + openapi.Parameter("mn_ogrn", openapi.IN_QUERY, type=openapi.TYPE_INTEGER), + openapi.Parameter("mn_inn", openapi.IN_QUERY, type=openapi.TYPE_INTEGER), + openapi.Parameter("in_kpp", openapi.IN_QUERY, type=openapi.TYPE_INTEGER), + openapi.Parameter("mn_okpo", openapi.IN_QUERY, type=openapi.TYPE_STRING), + ], + responses={200: OrganizationSerializer(many=True)}, + ) + def list(self, request, *args, **kwargs): + return super().list(request, *args, **kwargs) + + @swagger_auto_schema( + tags=[REGISTERS_TAG], + operation_summary="Get registry organization", + responses={200: OrganizationDetailSerializer, 404: "Organization not found"}, + ) + def retrieve(self, request, *args, **kwargs): + return super().retrieve(request, *args, **kwargs) + + +class RegistryOrganizationListView(ListAPIView): + """API списка организаций конкретного реестра.""" + + serializer_class = OrganizationSerializer + permission_classes = [IsAuthenticated] + + def _get_registry(self) -> Register: + return 
get_object_or_404(Register, id=self.kwargs["registry_id"]) + + def get_queryset(self): + if getattr(self, "swagger_fake_view", False): + return Organization.objects.none() + serializer = RegistryOrganizationListQuerySerializer( + data=self.request.query_params + ) + serializer.is_valid(raise_exception=True) + return RegisterImportService.get_registry_organizations_queryset( + registry=self._get_registry(), + **serializer.validated_data, + ) + + @swagger_auto_schema( + tags=[REGISTERS_TAG], + operation_summary="List organizations in a registry", + responses={200: OrganizationSerializer(many=True), 404: "Registry not found"}, + ) + def get(self, request, *args, **kwargs): + return super().get(request, *args, **kwargs) + + +class RegisterUploadListView(ListAPIView): + """Список загрузок реестров.""" + + serializer_class = RegisterUploadSerializer + permission_classes = [IsAuthenticated] + + def get_queryset(self): + queryset = RegisterUpload.objects.select_related("registry").order_by( + "-created_at" + ) + registry = self.request.query_params.get("registry") + status_filter = self.request.query_params.get("status") + if registry: + queryset = queryset.filter( + Q(registry_id=registry) | Q(registry__name=registry) + ) + if status_filter: + queryset = queryset.filter(import_status=status_filter) + return queryset + + @swagger_auto_schema( + tags=[REGISTERS_TAG], + operation_summary="List registry uploads", + manual_parameters=[ + openapi.Parameter("registry", openapi.IN_QUERY, type=openapi.TYPE_STRING), + openapi.Parameter("status", openapi.IN_QUERY, type=openapi.TYPE_STRING), + ], + responses={200: RegisterUploadSerializer(many=True)}, + ) + def get(self, request, *args, **kwargs): + return super().get(request, *args, **kwargs) + + +class RegisterUploadView(APIView): + """API загрузки Excel файла организаций в выбранный реестр.""" + + parser_classes = [MultiPartParser, FormParser] + permission_classes = [IsAdminUser] + + @swagger_auto_schema( + tags=[REGISTERS_TAG], + 
operation_summary="Upload registry organization file", + operation_description=( + "Accepts .xlsx with columns pn_name, mn_ogrn, mn_inn, mn_okpo and " + "optional in_kpp, then starts Celery import_register_upload." + ), + manual_parameters=[ + openapi.Parameter( + "registry", + openapi.IN_FORM, + type=openapi.TYPE_STRING, + format=openapi.FORMAT_UUID, + required=True, + description="Registry UUID", + ), + openapi.Parameter( + "actual_date", + openapi.IN_FORM, + type=openapi.TYPE_STRING, + format=openapi.FORMAT_DATE, + required=False, + description="Snapshot date, defaults to today", + ), + openapi.Parameter( + "file", + openapi.IN_FORM, + type=openapi.TYPE_FILE, + required=True, + description="Excel .xlsx file", + ), + ], + consumes=["multipart/form-data"], + responses={ + 202: RegisterUploadTaskSerializer, + 400: "Validation error", + 403: "Admin permissions required", + }, + ) + def post(self, request): + serializer = RegisterFileUploadSerializer(data=request.data) + serializer.is_valid(raise_exception=True) + + registry = serializer.validated_data["registry"] + uploaded_file = serializer.validated_data["file"] + actual_date = serializer.validated_data.get("actual_date") + storage_path = _save_uploaded_register_file(uploaded_file) + + try: + async_result = tasks.import_register_upload.delay( + registry_id=str(registry.id), + storage_path=storage_path, + file_name=uploaded_file.name, + actual_date=actual_date.isoformat() if actual_date else None, + user_id=request.user.id, + ) + except Exception as exc: + default_storage.delete(storage_path) + return api_error_response( + [{"code": "task_start_failed", "message": str(exc)}], + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + ) + + return api_response( + { + "task_id": async_result.id, + "registry_id": registry.id, + "registry_name": registry.name, + "task_name": "apps.registers.tasks.import_register_upload", + }, + status_code=status.HTTP_202_ACCEPTED, + ) diff --git a/src/apps/user/models.py 
b/src/apps/user/models.py index ef84f96..892d892 100644 --- a/src/apps/user/models.py +++ b/src/apps/user/models.py @@ -13,17 +13,17 @@ class User(AbstractUser): # Переопределяем группы и разрешения для избежания конфликта groups = models.ManyToManyField( "auth.Group", - verbose_name=_("groups"), + verbose_name="groups", blank=True, - help_text=_(""), + help_text="", related_name="custom_user_set", related_query_name="custom_user", ) user_permissions = models.ManyToManyField( "auth.Permission", - verbose_name=_("user permissions"), + verbose_name="user permissions", blank=True, - help_text=_("Specific permissions for this user."), + help_text="Specific permissions for this user.", related_name="custom_user_set", related_query_name="custom_user", ) diff --git a/src/apps/user/serializers.py b/src/apps/user/serializers.py index a0f13f4..56e5ef9 100644 --- a/src/apps/user/serializers.py +++ b/src/apps/user/serializers.py @@ -104,9 +104,39 @@ class ProfileUpdateSerializer(serializers.ModelSerializer): class LoginSerializer(serializers.Serializer): """Сериализатор для входа""" - email = serializers.EmailField(help_text="Email пользователя") + username = serializers.CharField( + required=False, + allow_blank=False, + trim_whitespace=True, + help_text="Логин пользователя", + ) + login = serializers.CharField( + required=False, + allow_blank=False, + trim_whitespace=True, + write_only=True, + help_text="Логин пользователя", + ) + email = serializers.EmailField(required=False, help_text="Email пользователя") password = serializers.CharField(help_text="Пароль") + def validate(self, attrs): + identity = attrs.get("username") or attrs.get("login") or attrs.get("email") + if not identity: + raise serializers.ValidationError( + {"username": "Укажите логин пользователя"} + ) + + attrs["identity"] = identity + attrs["identity_field"] = ( + "email" + if attrs.get("email") + and not attrs.get("username") + and not attrs.get("login") + else "username" + ) + return attrs + class 
TokenSerializer(serializers.Serializer): """Сериализатор для токенов""" diff --git a/src/apps/user/urls.py b/src/apps/user/urls.py index 8de1a0a..df9b5ca 100644 --- a/src/apps/user/urls.py +++ b/src/apps/user/urls.py @@ -1,5 +1,4 @@ from django.urls import path -from rest_framework_simplejwt.views import TokenVerifyView from . import views @@ -11,7 +10,7 @@ urlpatterns = [ path("login/", views.LoginView.as_view(), name="login"), path("logout/", views.LogoutView.as_view(), name="logout"), path("token/refresh/", views.TokenRefreshView.as_view(), name="token_refresh"), - path("token/verify/", TokenVerifyView.as_view(), name="token_verify"), + path("token/verify/", views.TokenVerifyView.as_view(), name="token_verify"), # Пользовательские данные path("me/", views.CurrentUserView.as_view(), name="current_user"), path("me/update/", views.UserUpdateView.as_view(), name="user_update"), diff --git a/src/apps/user/views.py b/src/apps/user/views.py index 0259014..7e4223b 100644 --- a/src/apps/user/views.py +++ b/src/apps/user/views.py @@ -1,4 +1,4 @@ -from django.contrib.auth import authenticate +from django.contrib.auth import authenticate, get_user_model from django.contrib.auth.hashers import check_password from drf_yasg import openapi from drf_yasg.utils import swagger_auto_schema @@ -7,7 +7,13 @@ from rest_framework.decorators import api_view, permission_classes from rest_framework.permissions import AllowAny, IsAuthenticated from rest_framework.response import Response from rest_framework.views import APIView +from rest_framework_simplejwt.exceptions import InvalidToken, TokenError +from rest_framework_simplejwt.serializers import ( + TokenRefreshSerializer, + TokenVerifySerializer, +) from rest_framework_simplejwt.tokens import RefreshToken +from rest_framework_simplejwt.views import TokenVerifyView as SimpleJWTTokenVerifyView from .serializers import ( LoginSerializer, @@ -20,6 +26,16 @@ from .serializers import ( ) from .services import ProfileService, UserService 
+User = get_user_model() +USERS_TAG = "Users" +AUTHORIZATION_PARAM = openapi.Parameter( + "Authorization", + openapi.IN_HEADER, + description="Bearer ", + type=openapi.TYPE_STRING, + required=True, +) + class RegisterView(APIView): """Регистрация нового пользователя""" @@ -27,7 +43,10 @@ class RegisterView(APIView): permission_classes = [AllowAny] @swagger_auto_schema( - request_body=UserRegistrationSerializer, responses={201: UserSerializer} + request_body=UserRegistrationSerializer, + tags=[USERS_TAG], + security=[], + responses={201: UserSerializer}, ) def post(self, request): serializer = UserRegistrationSerializer(data=request.data) @@ -53,14 +72,33 @@ class LoginView(APIView): permission_classes = [AllowAny] - @swagger_auto_schema(request_body=LoginSerializer, responses={200: TokenSerializer}) + def _authenticate_by_identity(self, identity, identity_field, password): + if identity_field == "email": + return authenticate(email=identity, password=password) + + try: + user = User.objects.get(username=identity) + except User.DoesNotExist: + if "@" in identity: + return authenticate(email=identity, password=password) + return None + + return authenticate(email=user.email, password=password) + + @swagger_auto_schema( + request_body=LoginSerializer, + tags=[USERS_TAG], + security=[], + responses={200: TokenSerializer}, + ) def post(self, request): serializer = LoginSerializer(data=request.data) if serializer.is_valid(): - email = serializer.validated_data["email"] + identity = serializer.validated_data["identity"] + identity_field = serializer.validated_data["identity_field"] password = serializer.validated_data["password"] - user = authenticate(email=email, password=password) + user = self._authenticate_by_identity(identity, identity_field, password) if user: tokens = UserService.get_tokens_for_user(user) return Response(tokens, status=status.HTTP_200_OK) @@ -79,25 +117,23 @@ class LogoutView(APIView): permission_classes = [IsAuthenticated] @swagger_auto_schema( - 
manual_parameters=[ - openapi.Parameter( - "Authorization", - openapi.IN_HEADER, - description="Bearer ", - type=openapi.TYPE_STRING, - required=True, - ) - ], + manual_parameters=[AUTHORIZATION_PARAM], + tags=[USERS_TAG], responses={200: "Успешный выход"}, ) def post(self, request): + refresh_token = request.data.get("refresh") + if not refresh_token: + return Response( + {"error": "Refresh token обязателен"}, + status=status.HTTP_400_BAD_REQUEST, + ) + try: - refresh_token = request.data.get("refresh") - if refresh_token: - token = RefreshToken(refresh_token) - token.blacklist() + token = RefreshToken(refresh_token) + token.blacklist() return Response({"message": "Успешный выход"}, status=status.HTTP_200_OK) - except Exception: + except (AttributeError, TokenError): return Response( {"error": "Неверный токен"}, status=status.HTTP_400_BAD_REQUEST ) @@ -109,15 +145,8 @@ class CurrentUserView(APIView): permission_classes = [IsAuthenticated] @swagger_auto_schema( - manual_parameters=[ - openapi.Parameter( - "Authorization", - openapi.IN_HEADER, - description="Bearer ", - type=openapi.TYPE_STRING, - required=True, - ) - ], + manual_parameters=[AUTHORIZATION_PARAM], + tags=[USERS_TAG], responses={200: UserSerializer}, ) def get(self, request): @@ -132,15 +161,8 @@ class UserUpdateView(APIView): @swagger_auto_schema( request_body=UserUpdateSerializer, - manual_parameters=[ - openapi.Parameter( - "Authorization", - openapi.IN_HEADER, - description="Bearer ", - type=openapi.TYPE_STRING, - required=True, - ) - ], + manual_parameters=[AUTHORIZATION_PARAM], + tags=[USERS_TAG], responses={200: UserSerializer}, ) def patch(self, request): @@ -168,15 +190,8 @@ class ProfileDetailView(generics.RetrieveUpdateAPIView): return profile @swagger_auto_schema( - manual_parameters=[ - openapi.Parameter( - "Authorization", - openapi.IN_HEADER, - description="Bearer ", - type=openapi.TYPE_STRING, - required=True, - ) - ] + manual_parameters=[AUTHORIZATION_PARAM], + tags=[USERS_TAG], ) def 
get(self, request, *args, **kwargs): profile = self.get_object() @@ -185,15 +200,16 @@ class ProfileDetailView(generics.RetrieveUpdateAPIView): @swagger_auto_schema( request_body=ProfileUpdateSerializer, - manual_parameters=[ - openapi.Parameter( - "Authorization", - openapi.IN_HEADER, - description="Bearer ", - type=openapi.TYPE_STRING, - required=True, - ) - ], + manual_parameters=[AUTHORIZATION_PARAM], + tags=[USERS_TAG], + ) + def put(self, request, *args, **kwargs): + return self.update(request, *args, **kwargs) + + @swagger_auto_schema( + request_body=ProfileUpdateSerializer, + manual_parameters=[AUTHORIZATION_PARAM], + tags=[USERS_TAG], ) def patch(self, request, *args, **kwargs): profile = self.get_object() @@ -213,15 +229,8 @@ class PasswordChangeView(APIView): @swagger_auto_schema( request_body=PasswordChangeSerializer, - manual_parameters=[ - openapi.Parameter( - "Authorization", - openapi.IN_HEADER, - description="Bearer ", - type=openapi.TYPE_STRING, - required=True, - ) - ], + manual_parameters=[AUTHORIZATION_PARAM], + tags=[USERS_TAG], responses={200: "Пароль успешно изменен"}, ) def post(self, request): @@ -246,6 +255,7 @@ class PasswordChangeView(APIView): return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST) +@swagger_auto_schema(method="get", tags=[USERS_TAG], responses={200: "Profile data"}) @api_view(["GET"]) @permission_classes([IsAuthenticated]) def user_profile_detail(request): @@ -269,6 +279,8 @@ class TokenRefreshView(APIView): }, required=["refresh"], ), + tags=[USERS_TAG], + security=[], responses={200: TokenSerializer}, ) def post(self, request): @@ -279,12 +291,24 @@ class TokenRefreshView(APIView): status=status.HTTP_400_BAD_REQUEST, ) + serializer = TokenRefreshSerializer(data={"refresh": refresh_token}) try: - refresh = RefreshToken(refresh_token) - return Response( - {"access": str(refresh.access_token), "refresh": str(refresh)} - ) - except Exception: + serializer.is_valid(raise_exception=True) + return 
Response(serializer.validated_data) + except (InvalidToken, TokenError): return Response( {"error": "Неверный refresh token"}, status=status.HTTP_401_UNAUTHORIZED ) + + +class TokenVerifyView(SimpleJWTTokenVerifyView): + """Проверка refresh/access токена.""" + + @swagger_auto_schema( + request_body=TokenVerifySerializer, + tags=[USERS_TAG], + security=[], + responses={200: "Token is valid"}, + ) + def post(self, request, *args, **kwargs): + return super().post(request, *args, **kwargs) diff --git a/src/config/api_v1_urls.py b/src/config/api_v1_urls.py index cdf8730..e6f5823 100644 --- a/src/config/api_v1_urls.py +++ b/src/config/api_v1_urls.py @@ -4,18 +4,77 @@ API v1 URL configuration. All API endpoints are versioned under /api/v1/ """ -from apps.core.views import BackgroundJobListView, BackgroundJobStatusView +from apps.core.views import ( + BackgroundJobControlView, + BackgroundJobListView, + BackgroundJobStatusView, + BackgroundJobStreamView, +) +from apps.parsers.frontend_compat import ( + ParserLoadLogDetailCompatView, + ParserLoadLogExportCompatView, + ParserLoadLogListCompatView, + ParsingSettingsCompatView, + SourceCardDetailCompatView, + SourceCardListCompatView, + SourceCardRefreshCompatView, + SourceTaskStatusListCompatView, +) from django.urls import include, path app_name = "api_v1" jobs_urlpatterns = [ path("", BackgroundJobListView.as_view(), name="job-list"), + path("/stream/", BackgroundJobStreamView.as_view(), name="job-stream"), path("/", BackgroundJobStatusView.as_view(), name="job-status"), + path( + "/control/", + BackgroundJobControlView.as_view(), + name="job-control", + ), +] + +sources_urlpatterns = [ + path("", SourceCardListCompatView.as_view(), name="source-cards-list"), + path("statuses/", SourceTaskStatusListCompatView.as_view(), name="source-statuses"), + path( + "/", SourceCardDetailCompatView.as_view(), name="source-card-detail" + ), + path( + "/refresh/", + SourceCardRefreshCompatView.as_view(), + name="source-card-refresh", + ), 
+] + +parsing_urlpatterns = [ + path("settings/", ParsingSettingsCompatView.as_view(), name="parsing-settings"), +] + +system_urlpatterns = [ + path("logs/", ParserLoadLogListCompatView.as_view(), name="parser-logs-list"), + path( + "logs/export/", + ParserLoadLogExportCompatView.as_view(), + name="parser-logs-export", + ), + path( + "logs//", + ParserLoadLogDetailCompatView.as_view(), + name="parser-logs-detail", + ), ] urlpatterns = [ path("users/", include("apps.user.urls")), + path("sources/", include((sources_urlpatterns, "sources"))), + path("parsing/", include((parsing_urlpatterns, "parsing"))), + path("system/", include((system_urlpatterns, "system"))), + path("registers/", include("apps.registers.urls")), + path("exchange/", include("apps.exchange.urls")), + path("backups/", include("apps.backups.urls")), + path("", include("apps.parsers.api_result_urls", namespace="parser_results")), path("parsers/", include("apps.parsers.urls")), path("jobs/", include((jobs_urlpatterns, "jobs"))), ] diff --git a/src/config/celery.py b/src/config/celery.py index 0eecb09..056f03e 100644 --- a/src/config/celery.py +++ b/src/config/celery.py @@ -9,7 +9,11 @@ import os from celery import Celery # Set the default Django settings module for the 'celery' program. -os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings.development") +# +# config.__init__ imports this module before config.wsgi/config.asgi module bodies +# run. Keep the implicit default production-safe; local entrypoints such as +# manage.py and docker-compose set development explicitly. 
+os.environ.setdefault("DJANGO_SETTINGS_MODULE", "config.settings.production") app = Celery("project") diff --git a/src/config/settings/base.py b/src/config/settings/base.py index 17cfd8b..3e1026e 100644 --- a/src/config/settings/base.py +++ b/src/config/settings/base.py @@ -59,9 +59,13 @@ INSTALLED_APPS = [ "django_celery_beat", "django_celery_results", "drf_yasg", + "rest_framework_simplejwt.token_blacklist", # Local apps "apps.core", "apps.user", + "apps.registers", + "apps.exchange", + "apps.backups", "apps.parsers", ] @@ -93,6 +97,9 @@ JAZZMIN_SETTINGS = { "hide_models": [], "order_with_respect_to": [ "user", + "registers", + "exchange", + "backups", "parsers", "core", "django_celery_beat", @@ -103,6 +110,12 @@ JAZZMIN_SETTINGS = { "auth.Group": "fas fa-users", "user.User": "fas fa-user", "user.Profile": "fas fa-id-card", + "registers.Register": "fas fa-book", + "registers.Organization": "fas fa-building", + "registers.RegisterUpload": "fas fa-file-upload", + "registers.RegistryMembershipPeriod": "fas fa-link", + "exchange.ExchangeConnection": "fas fa-database", + "backups.BackupExportJob": "fas fa-file-archive", "parsers.Proxy": "fas fa-shield-alt", "parsers.ParserLoadLog": "fas fa-history", "parsers.IndustrialCertificateRecord": "fas fa-certificate", @@ -253,6 +266,18 @@ STATICFILES_DIRS = [BASE_DIR / "static"] MEDIA_URL = "/media/" MEDIA_ROOT = BASE_DIR / "media" +# External DB exchange and encrypted backup export +EXCHANGE_CREDENTIALS_ENCRYPTION_KEY = get_env( + "EXCHANGE_CREDENTIALS_ENCRYPTION_KEY", + SECRET_KEY, +) +BACKUP_ENCRYPTION_KEY = get_env("BACKUP_ENCRYPTION_KEY", "") +BACKUP_KEY_ID = get_env("BACKUP_KEY_ID", "default") +BACKUP_EXPORT_DIRECTORY = get_env( + "BACKUP_EXPORT_DIRECTORY", + str(MEDIA_ROOT / "backups"), +) + # Default primary key field type DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField" @@ -291,6 +316,18 @@ REST_FRAMEWORK = { }, } +SWAGGER_SETTINGS = { + "SECURITY_DEFINITIONS": { + "Bearer": { + "type": "apiKey", + "name": 
"Authorization", + "in": "header", + "description": "JWT header: Bearer ", + }, + "Basic": {"type": "basic"}, + }, +} + # JWT settings from datetime import timedelta diff --git a/src/config/settings/production.py b/src/config/settings/production.py index a85e324..a5b9ba8 100644 --- a/src/config/settings/production.py +++ b/src/config/settings/production.py @@ -1,3 +1,5 @@ +import os + from .base import * # Production settings @@ -62,6 +64,30 @@ CACHES = { } } +LOG_FILE = os.getenv("DJANGO_LOG_FILE") +LOG_HANDLERS = { + "console": { + "level": "INFO", + "class": "logging.StreamHandler", + "formatter": "verbose", + }, + "mail_admins": { + "level": "ERROR", + "class": "django.utils.log.AdminEmailHandler", + }, +} +ROOT_LOG_HANDLERS = ["console"] +if LOG_FILE: + LOG_HANDLERS["file"] = { + "level": "INFO", + "class": "logging.handlers.RotatingFileHandler", + "filename": LOG_FILE, + "maxBytes": 1024 * 1024 * 15, # 15MB + "backupCount": 10, + "formatter": "verbose", + } + ROOT_LOG_HANDLERS.append("file") + # Logging for production LOGGING = { "version": 1, @@ -72,37 +98,24 @@ LOGGING = { "style": "{", }, }, - "handlers": { - "file": { - "level": "INFO", - "class": "logging.handlers.RotatingFileHandler", - "filename": "/var/log/django/app.log", - "maxBytes": 1024 * 1024 * 15, # 15MB - "backupCount": 10, - "formatter": "verbose", - }, - "mail_admins": { - "level": "ERROR", - "class": "django.utils.log.AdminEmailHandler", - }, - }, + "handlers": LOG_HANDLERS, "root": { - "handlers": ["file"], + "handlers": ROOT_LOG_HANDLERS, "level": "INFO", }, "loggers": { "django": { - "handlers": ["file"], + "handlers": ROOT_LOG_HANDLERS, "level": "INFO", "propagate": False, }, "apps.data_processor": { - "handlers": ["file"], + "handlers": ROOT_LOG_HANDLERS, "level": "INFO", "propagate": False, }, "apps.scraping": { - "handlers": ["file"], + "handlers": ROOT_LOG_HANDLERS, "level": "INFO", "propagate": False, }, diff --git a/src/config/settings/test.py b/src/config/settings/test.py index 
2e68d19..45334e1 100644 --- a/src/config/settings/test.py +++ b/src/config/settings/test.py @@ -77,6 +77,10 @@ LOGGING = { # Media files for tests MEDIA_ROOT = "/tmp/test_media" # noqa: S108 +BACKUP_EXPORT_DIRECTORY = "/tmp/test_media/backups" # noqa: S108 +BACKUP_ENCRYPTION_KEY = "MDEyMzQ1Njc4OWFiY2RlZjAxMjM0NTY3ODlhYmNkZWY=" +BACKUP_KEY_ID = "test" +EXCHANGE_CREDENTIALS_ENCRYPTION_KEY = "test-exchange-credential-key" # Static files for tests STATICFILES_STORAGE = "django.contrib.staticfiles.storage.StaticFilesStorage" diff --git a/src/config/urls.py b/src/config/urls.py index aca36e0..8650225 100644 --- a/src/config/urls.py +++ b/src/config/urls.py @@ -4,6 +4,7 @@ URL Configuration for the project. The `urlpatterns` list routes URLs to views. """ +from apps.parsers.views import ParserDashboardPageView from django.conf import settings from django.conf.urls.static import static from django.contrib import admin @@ -32,6 +33,28 @@ urlpatterns = [ schema_view.with_ui("swagger", cache_timeout=0), name="schema-swagger-ui", ), + path("dashboard", ParserDashboardPageView.as_view(), name="dashboard"), + path("dashboard/", ParserDashboardPageView.as_view(), name="dashboard-slash"), + path( + "dashboard/", + ParserDashboardPageView.as_view(), + name="dashboard-source", + ), + path( + "dashboard//", + ParserDashboardPageView.as_view(), + name="dashboard-source-slash", + ), + path( + "dashboard//", + ParserDashboardPageView.as_view(), + name="dashboard-source-item", + ), + path( + "dashboard///", + ParserDashboardPageView.as_view(), + name="dashboard-source-item-slash", + ), path("admin/", admin.site.urls), path("health/", include("apps.core.urls")), path("api/v1/", include("config.api_v1_urls", namespace="api_v1")), diff --git a/src/templates/dashboard.html b/src/templates/dashboard.html new file mode 100644 index 0000000..93f0de8 --- /dev/null +++ b/src/templates/dashboard.html @@ -0,0 +1,1985 @@ + + + + + + Mostovik Parser Dashboard + + + +
+
+

Parser Dashboard

+
Управление источниками, загрузками, расписаниями Celery и внешней выгрузкой
+
+
+ + + + нет токена + + +
+
+ +
+
+

Вход

+
+
+ + +
+
+ +
+
+
+
+ + + + +
+ + + + + + + + + + + + + + + + diff --git a/tests/apps/backups/__init__.py b/tests/apps/backups/__init__.py new file mode 100644 index 0000000..1de819f --- /dev/null +++ b/tests/apps/backups/__init__.py @@ -0,0 +1 @@ +"""Tests for backups app.""" diff --git a/tests/apps/backups/test_services_views.py b/tests/apps/backups/test_services_views.py new file mode 100644 index 0000000..171977f --- /dev/null +++ b/tests/apps/backups/test_services_views.py @@ -0,0 +1,131 @@ +"""Tests for encrypted backup export.""" + +import json +import struct +from datetime import date +from io import BytesIO +from zipfile import ZipFile + +from apps.backups.models import BackupExportJob +from apps.backups.services import BackupExportService +from apps.core.models import BackgroundJob, JobStatus +from apps.parsers.models import ParserLoadLog +from apps.registers.models import Register +from apps.user.services import UserService +from django.test import override_settings +from django.urls import reverse +from rest_framework import status +from rest_framework.test import APITestCase + +from tests.apps.parsers.factories import GenericParserRecordFactory +from tests.apps.registers.factories import ( + OrganizationFactory, + RegistryMembershipPeriodFactory, +) +from tests.apps.user.factories import UserFactory + + +@override_settings( + BACKUP_ENCRYPTION_KEY="MDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDA=", + CELERY_TASK_ALWAYS_EAGER=True, + CELERY_TASK_EAGER_PROPAGATES=True, +) +class BackupExportTest(APITestCase): + """Tests for registry backup service and API.""" + + def setUp(self): + self.admin = UserFactory.create_user(is_staff=True) + self.user = UserFactory.create_user() + self.registry, _ = Register.objects.get_or_create( + name="Реестр предприятий ОПК" + ) + self.organization = OrganizationFactory( + pn_name='АО "ОПК"', + mn_ogrn=1027600980990, + mn_inn=7601000086, + mn_okpo="07506197", + ) + RegistryMembershipPeriodFactory( + registry=self.registry, + organization=self.organization, 
+ ) + + def authenticate(self, user): + tokens = UserService.get_tokens_for_user(user) + self.client.credentials(HTTP_AUTHORIZATION=f"Bearer {tokens['access']}") + + def test_backup_archive_contains_registry_payload_header(self): + """Test service creates ZIP with encrypted bin and metadata header.""" + GenericParserRecordFactory( + source=ParserLoadLog.Source.FNS_FINANCIAL, + inn=str(self.organization.mn_inn), + ogrn=str(self.organization.mn_ogrn), + ) + + artifact = BackupExportService.build_backup_archive( + actual_date=date(2026, 4, 27), + registry=self.registry, + ) + + with ZipFile(BytesIO(artifact.archive_bytes)) as archive: + names = archive.namelist() + bin_name = next(name for name in names if name.endswith(".bin")) + self.assertTrue(any(name.endswith(".sha256") for name in names)) + bin_payload = archive.read(bin_name) + + self.assertEqual(bin_payload[:4], BackupExportService.MAGIC) + header_length = struct.unpack(">I", bin_payload[5:9])[0] + header = json.loads(bin_payload[9 : 9 + header_length]) + self.assertEqual(header["actual_date"], "2026-04-27") + self.assertEqual(header["registry_name"], "Реестр предприятий ОПК") + self.assertEqual(header["organizations_count"], 1) + self.assertEqual(artifact.organizations_count, 1) + + def test_backup_export_requires_admin(self): + """Test regular users cannot start backup export.""" + self.authenticate(self.user) + + response = self.client.post( + reverse("api_v1:backups:export"), + { + "actual_date": "2026-04-27", + "registry": str(self.registry.id), + }, + format="json", + ) + + self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN) + + def test_backup_export_starts_task_then_downloads_ready_archive(self): + """Test endpoint starts Celery task and returns archive on repeated call.""" + self.authenticate(self.admin) + payload = { + "actual_date": "2026-04-27", + "registry": str(self.registry.id), + } + + with self.captureOnCommitCallbacks(execute=True): + first_response = self.client.post( + 
reverse("api_v1:backups:export"), + payload, + format="json", + ) + second_response = self.client.post( + reverse("api_v1:backups:export"), + payload, + format="json", + ) + + self.assertEqual(first_response.status_code, status.HTTP_202_ACCEPTED) + self.assertEqual(first_response.data["data"]["status"], "started") + self.assertEqual(second_response.status_code, status.HTTP_200_OK) + self.assertEqual(second_response["Content-Type"], "application/zip") + self.assertEqual( + second_response["X-Backup-Actual-Date"], + "2026-04-27", + ) + self.assertFalse(BackupExportJob.objects.exists()) + background_job = BackgroundJob.objects.get( + task_id=first_response.data["data"]["task_id"] + ) + self.assertEqual(background_job.status, JobStatus.SUCCESS) diff --git a/tests/apps/core/test_openapi.py b/tests/apps/core/test_openapi.py index 9bc164f..5f01850 100644 --- a/tests/apps/core/test_openapi.py +++ b/tests/apps/core/test_openapi.py @@ -1,5 +1,7 @@ """Tests for core OpenAPI utilities""" +import json + from apps.core.openapi import ( CommonParameters, CommonResponses, @@ -41,6 +43,98 @@ class ApiDocsDecoratorTest(TestCase): self.assertEqual(my_view.__name__, "my_view") +class OpenApiSchemaViewTest(TestCase): + """Tests for generated Swagger schema.""" + + def _operation_tags(self, paths: dict) -> list[str]: + tags = [] + for operations in paths.values(): + for method, operation in operations.items(): + if method == "parameters": + continue + tags.extend(operation.get("tags", [])) + return tags + + def test_schema_exposes_parser_and_job_tags(self): + """Test dashboard/parser APIs are visible as separate Swagger groups.""" + response = self.client.get("/?format=openapi") + + self.assertEqual(response.status_code, 200) + schema = json.loads(response.content) + paths = schema["paths"] + self.assertIn("/api/v1/parsers/sources/", paths) + self.assertIn("/api/v1/parsers/records/", paths) + self.assertIn("/api/v1/parsers/run/{source_key}/", paths) + 
self.assertIn("/api/v1/parsers/upload/{source_key}/", paths) + self.assertIn("/api/v1/users/register/", paths) + self.assertIn("/api/v1/users/login/", paths) + self.assertIn("/api/v1/users/token/refresh/", paths) + self.assertIn("/api/v1/users/token/verify/", paths) + self.assertIn("/api/v1/users/me/", paths) + self.assertIn("/api/v1/fns/reports/", paths) + self.assertIn("/api/v1/fns/reports/{id}/", paths) + self.assertIn("/api/v1/fns/upload/", paths) + self.assertIn("/api/v1/minpromtorg/manufacturers/", paths) + self.assertIn("/api/v1/zakupki/", paths) + self.assertIn("/api/v1/zakupki/{id}/", paths) + self.assertIn("/api/v1/jobs/", paths) + self.assertNotIn("/api/v1/zakupki/procurements-44fz/", paths) + self.assertNotIn("/api/v1/zakupki/procurements-223fz/", paths) + self.assertNotIn("/api/v1/zakupki/contracts/", paths) + self.assertNotIn("/api/v1/zakupki/upload/", paths) + self.assertNotIn("/api/v2/fns/reports/", paths) + self.assertEqual( + paths["/api/v1/parsers/sources/"]["get"]["tags"], + ["Parser Management"], + ) + self.assertEqual( + paths["/api/v1/parsers/upload/{source_key}/"]["post"]["tags"], + ["Parser Management"], + ) + self.assertEqual( + paths["/api/v1/fns/reports/"]["get"]["tags"], + ["FNS"], + ) + self.assertEqual( + paths["/api/v1/fns/reports/{id}/"]["get"]["tags"], + ["FNS"], + ) + self.assertEqual( + paths["/api/v1/fns/upload/"]["post"]["tags"], + ["FNS"], + ) + self.assertEqual( + paths["/api/v1/minpromtorg/manufacturers/"]["get"]["tags"], + ["Minpromtorg"], + ) + self.assertEqual( + paths["/api/v1/zakupki/"]["get"]["tags"], + ["EIS Zakupki"], + ) + for path in ( + "/api/v1/users/register/", + "/api/v1/users/login/", + "/api/v1/users/token/refresh/", + "/api/v1/users/token/verify/", + "/api/v1/users/me/", + ): + operation = ( + paths[path]["post"] if "post" in paths[path] else paths[path]["get"] + ) + self.assertEqual(operation["tags"], ["Users"]) + for path in ( + "/api/v1/users/register/", + "/api/v1/users/login/", + 
"/api/v1/users/token/refresh/", + "/api/v1/users/token/verify/", + ): + self.assertEqual(paths[path]["post"]["security"], []) + self.assertEqual(paths["/api/v1/jobs/"]["get"]["tags"], ["Jobs"]) + for tag in self._operation_tags(paths): + self.assertFalse(any("\u0400" <= char <= "\u04ff" for char in tag), tag) + self.assertIn("Bearer", schema["securityDefinitions"]) + + class GetStatusDescriptionTest(TestCase): """Tests for _get_status_description function""" diff --git a/tests/apps/core/test_services.py b/tests/apps/core/test_services.py index f4fe1a6..6758305 100644 --- a/tests/apps/core/test_services.py +++ b/tests/apps/core/test_services.py @@ -1,7 +1,7 @@ """Tests for core services""" from apps.core.exceptions import NotFoundError -from apps.core.services import BaseService +from apps.core.services import BaseService, BulkOperationsMixin from django.contrib.auth import get_user_model from django.test import TestCase @@ -14,6 +14,12 @@ class UserTestService(BaseService[User]): model = User +class UserBulkTestService(BulkOperationsMixin, BaseService[User]): + """Test bulk service using User model""" + + model = User + + class BaseServiceTest(TestCase): """Tests for BaseService""" @@ -21,7 +27,7 @@ class BaseServiceTest(TestCase): self.user = User.objects.create_user( username="testuser", email="test@example.com", - password="testpass123", + password="testpass123", # noqa: S106 ) def test_get_by_id_success(self): @@ -53,7 +59,7 @@ class BaseServiceTest(TestCase): User.objects.create_user( username="testuser2", email="test2@example.com", - password="testpass123", + password="testpass123", # noqa: S106 ) result = UserTestService.get_all() @@ -101,3 +107,24 @@ class BaseServiceTest(TestCase): UserTestService.delete(self.user) self.assertFalse(User.objects.filter(pk=user_pk).exists()) + + +class BulkOperationsMixinTest(TestCase): + """Tests for bulk operation helpers.""" + + def test_bulk_create_rejects_update_conflicts_on_django_3(self): + """Test Django 4-only bulk 
upsert API is not exposed as working.""" + instances = [ + User( + username="bulk-user", + email="bulk@example.com", + ) + ] + + with self.assertRaises(NotImplementedError): + UserBulkTestService.bulk_create_chunked( + instances, + update_conflicts=True, + update_fields=["email"], + unique_fields=["username"], + ) diff --git a/tests/apps/core/test_views.py b/tests/apps/core/test_views.py index cbf2400..ba84cc4 100644 --- a/tests/apps/core/test_views.py +++ b/tests/apps/core/test_views.py @@ -1,9 +1,16 @@ -"""Tests for core views (health checks)""" +"""Tests for core views and API endpoints.""" +from unittest.mock import patch + +from apps.core.models import JobStatus +from apps.core.services import BackgroundJobService +from apps.user.services import UserService from django.urls import reverse from rest_framework import status from rest_framework.test import APITestCase +from tests.apps.user.factories import UserFactory + class HealthCheckViewTest(APITestCase): """Tests for HealthCheckView""" @@ -99,3 +106,121 @@ class APIVersioningURLTest(APITestCase): """Test reverse URL for password change""" url = reverse("api_v1:user:password_change") self.assertEqual(url, "/api/v1/users/password/change/") + + +class BackgroundJobApiTest(APITestCase): + """Tests for background job API access rules.""" + + def setUp(self): + self.user = UserFactory.create_user() + tokens = UserService.get_tokens_for_user(self.user) + self.client.credentials(HTTP_AUTHORIZATION=f"Bearer {tokens['access']}") + + def test_job_status_denies_other_user_job(self): + """Test users cannot read jobs owned by another user.""" + other_user = UserFactory.create_user() + job = BackgroundJobService.create_job( + task_id="task-other-user", + task_name="test.task", + user_id=other_user.id, + ) + + response = self.client.get( + reverse("api_v1:jobs:job-status", kwargs={"task_id": job.task_id}) + ) + + self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN) + + def 
test_job_status_denies_system_job_for_regular_user(self): + """Test user_id=NULL jobs are not visible to regular users.""" + job = BackgroundJobService.create_job( + task_id="task-system", + task_name="test.task", + ) + + response = self.client.get( + reverse("api_v1:jobs:job-status", kwargs={"task_id": job.task_id}) + ) + + self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN) + + def test_job_status_allows_system_job_for_staff(self): + """Test staff can inspect system jobs.""" + staff = UserFactory.create_user(is_staff=True) + tokens = UserService.get_tokens_for_user(staff) + self.client.credentials(HTTP_AUTHORIZATION=f"Bearer {tokens['access']}") + job = BackgroundJobService.create_job( + task_id="task-system-staff", + task_name="test.task", + ) + + response = self.client.get( + reverse("api_v1:jobs:job-status", kwargs={"task_id": job.task_id}) + ) + + self.assertEqual(response.status_code, status.HTTP_200_OK) + + def test_job_list_rejects_invalid_limit(self): + """Test job list validates limit query parameter.""" + response = self.client.get( + reverse("api_v1:jobs:job-list"), + {"limit": "bad"}, + ) + + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + + @patch("apps.core.views.current_app.control.revoke") + def test_job_control_revoke_marks_user_job_revoked(self, mock_revoke): + """Test owner can revoke a running background job.""" + job = BackgroundJobService.create_job( + task_id="task-revoke", + task_name="test.task", + user_id=self.user.id, + ) + job.mark_started() + + response = self.client.post( + reverse("api_v1:jobs:job-control", kwargs={"task_id": job.task_id}), + {"action": "revoke"}, + format="json", + ) + + self.assertEqual(response.status_code, status.HTTP_200_OK) + mock_revoke.assert_called_once_with(job.task_id, terminate=True) + job.refresh_from_db() + self.assertEqual(job.status, JobStatus.REVOKED) + + def test_job_control_denies_finished_job(self): + """Test finished jobs cannot be revoked again.""" + job = 
BackgroundJobService.create_job( + task_id="task-finished", + task_name="test.task", + user_id=self.user.id, + ) + job.complete() + + response = self.client.post( + reverse("api_v1:jobs:job-control", kwargs={"task_id": job.task_id}), + {"action": "revoke"}, + format="json", + ) + + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + + def test_job_stream_returns_completed_sse_event(self): + """Test dev-compatible SSE job stream endpoint.""" + job = BackgroundJobService.create_job( + task_id="task-stream", + task_name="test.task", + user_id=self.user.id, + ) + job.complete(result={"ok": True}) + + response = self.client.get( + reverse("api_v1:jobs:job-stream", kwargs={"task_id": job.task_id}) + ) + + self.assertEqual(response.status_code, status.HTTP_200_OK) + content = b"".join(response.streaming_content).decode("utf-8") + self.assertIn("event: completed", content) + self.assertIn('"task_id": "task-stream"', content) diff --git a/tests/apps/exchange/__init__.py b/tests/apps/exchange/__init__.py new file mode 100644 index 0000000..671fe7a --- /dev/null +++ b/tests/apps/exchange/__init__.py @@ -0,0 +1 @@ +"""Tests for exchange app.""" diff --git a/tests/apps/exchange/factories.py b/tests/apps/exchange/factories.py new file mode 100644 index 0000000..14f304a --- /dev/null +++ b/tests/apps/exchange/factories.py @@ -0,0 +1,19 @@ +"""Фабрики для приложения exchange.""" + +import factory +from apps.exchange.models import ExchangeConnection + + +class ExchangeConnectionFactory(factory.django.DjangoModelFactory): + """Фабрика подключения exchange.""" + + class Meta: + model = ExchangeConnection + + server = "127.0.0.1" + port = 5432 + username = factory.Sequence(lambda n: f"exchange_user_{n}") + password = "secret" # noqa: S105 + database_name = "exchange_db" + schema_name = "public" + is_active = False diff --git a/tests/apps/exchange/test_views.py b/tests/apps/exchange/test_views.py new file mode 100644 index 0000000..ae0cf8d --- /dev/null +++ 
b/tests/apps/exchange/test_views.py @@ -0,0 +1,167 @@ +"""Tests for exchange API views.""" + +from unittest.mock import Mock, patch + +from apps.core.models import BackgroundJob +from apps.user.services import UserService +from django.urls import reverse +from django_celery_beat.models import PeriodicTask +from rest_framework import status +from rest_framework.test import APITestCase + +from tests.apps.exchange.factories import ExchangeConnectionFactory +from tests.apps.user.factories import UserFactory + + +class ExchangeApiTest(APITestCase): + """Tests for external DB exchange API.""" + + def setUp(self): + self.user = UserFactory.create_user() + self.admin = UserFactory.create_user(is_staff=True) + self.user_tokens = UserService.get_tokens_for_user(self.user) + self.admin_tokens = UserService.get_tokens_for_user(self.admin) + + def authenticate(self, user): + tokens = UserService.get_tokens_for_user(user) + self.client.credentials(HTTP_AUTHORIZATION=f"Bearer {tokens['access']}") + + def test_exchange_connections_require_admin(self): + """Test regular users cannot manage exchange connections.""" + self.authenticate(self.user) + + response = self.client.get(reverse("api_v1:exchange:connections")) + + self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN) + + @patch( + "apps.exchange.views.ExchangeConnectionService.create_active_connection_and_prepare" + ) + def test_create_exchange_connection_uses_service(self, mock_create): + """Test admin can create active exchange connection through service.""" + self.authenticate(self.admin) + connection = ExchangeConnectionFactory.build( + id=1, + server="db.example.local", + is_active=True, + ) + mock_create.return_value = connection + + response = self.client.post( + reverse("api_v1:exchange:connections"), + { + "server": "db.example.local", + "port": 5432, + "username": "user", + "password": "secret", + "database_name": "target", + "schema_name": "public", + }, + format="json", + ) + + 
self.assertEqual(response.status_code, status.HTTP_201_CREATED) + self.assertEqual(response.data["data"]["server"], "db.example.local") + mock_create.assert_called_once() + + def test_exchange_tables_include_registers_and_parser_data(self): + """Test table catalog contains registry and parser models.""" + self.authenticate(self.admin) + + response = self.client.get(reverse("api_v1:exchange:tables")) + + self.assertEqual(response.status_code, status.HTTP_200_OK) + tables = {item["table"] for item in response.data["data"]} + self.assertIn("registers_organization", tables) + self.assertIn("registers_membership_period", tables) + self.assertIn("parsers_generic_record", tables) + + @patch("apps.exchange.views.copy_exchange_data_async.delay") + def test_exchange_copy_starts_celery_task_and_job(self, mock_delay): + """Test copy endpoint queues Celery task with active connection.""" + self.authenticate(self.admin) + connection = ExchangeConnectionFactory.create(is_active=True) + mock_delay.return_value = Mock(id="exchange-task-1") + + response = self.client.post( + reverse("api_v1:exchange:copy"), + {"mode": "all", "truncate_before_copy": True}, + format="json", + ) + + self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED) + self.assertEqual(response.data["data"]["task_id"], "exchange-task-1") + mock_delay.assert_called_once_with( + connection_id=connection.id, + payload={"mode": "all", "truncate_before_copy": True}, + requested_by_id=self.admin.id, + ) + self.assertTrue( + BackgroundJob.objects.filter(task_id="exchange-task-1").exists() + ) + + def test_exchange_copy_requires_active_connection(self): + """Test copy endpoint reports missing active connection.""" + self.authenticate(self.admin) + + response = self.client.post( + reverse("api_v1:exchange:copy"), + {"mode": "all", "truncate_before_copy": True}, + format="json", + ) + + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + + def test_exchange_schedule_create_and_list(self): + """Test 
exchange schedules are stored in django-celery-beat.""" + self.authenticate(self.admin) + + response = self.client.post( + reverse("api_v1:exchange:periodic-tasks"), + { + "schedule_type": "interval", + "interval_every": 12, + "interval_period": "hours", + "mode": "single", + "table": "registers_organization", + "truncate_before_copy": False, + }, + format="json", + ) + list_response = self.client.get(reverse("api_v1:exchange:periodic-tasks")) + + self.assertEqual(response.status_code, status.HTTP_201_CREATED) + self.assertEqual(list_response.status_code, status.HTTP_200_OK) + self.assertTrue( + PeriodicTask.objects.filter(id=response.data["data"]["id"]).exists() + ) + created_task = next( + item + for item in list_response.data["data"] + if item["id"] == response.data["data"]["id"] + ) + payload = created_task["payload"] + self.assertEqual(payload["mode"], "single") + self.assertEqual(payload["table"], "registers_organization") + + def test_exchange_schedule_delete(self): + """Test exchange schedule can be deleted.""" + self.authenticate(self.admin) + create_response = self.client.post( + reverse("api_v1:exchange:periodic-tasks"), + { + "schedule_type": "daily", + "crontab_hour": 2, + "crontab_minute": 0, + "mode": "all", + }, + format="json", + ) + task_id = create_response.data["data"]["id"] + + response = self.client.delete( + reverse("api_v1:exchange:periodic-task-detail", args=[task_id]) + ) + + self.assertEqual(response.status_code, status.HTTP_204_NO_CONTENT) + self.assertFalse(PeriodicTask.objects.filter(id=task_id).exists()) diff --git a/tests/apps/parsers/factories.py b/tests/apps/parsers/factories.py index 4ff41ca..6da24f9 100644 --- a/tests/apps/parsers/factories.py +++ b/tests/apps/parsers/factories.py @@ -3,17 +3,16 @@ import random from datetime import timedelta -from django.utils import timezone - import factory - from apps.parsers.models import ( + GenericParserRecord, IndustrialCertificateRecord, InspectionRecord, ManufacturerRecord, 
ParserLoadLog, Proxy, ) +from django.utils import timezone # === Хелперы для генерации реалистичных данных === @@ -320,7 +319,9 @@ class InspectionRecordFactory(factory.django.DjangoModelFactory): lambda _: random.choice(["плановая", "внеплановая"]) ) inspection_form = factory.LazyAttribute( - lambda _: random.choice(["документарная", "выездная", "документарная и выездная"]) + lambda _: random.choice( + ["документарная", "выездная", "документарная и выездная"] + ) ) start_date = factory.LazyAttribute( lambda _: (timezone.now() - timedelta(days=random.randint(1, 180))).strftime( @@ -339,9 +340,35 @@ class InspectionRecordFactory(factory.django.DjangoModelFactory): lambda _: random.choice(["294-ФЗ", "248-ФЗ", "184-ФЗ"]) ) result = factory.LazyAttribute( - lambda _: random.choice( - ["нарушения не выявлены", "выявлены нарушения", ""] - ) + lambda _: random.choice(["нарушения не выявлены", "выявлены нарушения", ""]) if random.random() > 0.3 else "" ) + + +class GenericParserRecordFactory(factory.django.DjangoModelFactory): + """Factory for GenericParserRecord model.""" + + class Meta: + model = GenericParserRecord + + load_batch = factory.Sequence(lambda n: n + 1) + source = ParserLoadLog.Source.FNS_FINANCIAL + external_id = factory.Sequence(lambda n: f"GEN-{n:06d}") + inn = factory.LazyFunction(generate_inn_legal) + ogrn = factory.LazyFunction(generate_ogrn) + organisation_name = factory.LazyFunction(generate_company_name) + title = factory.LazyAttribute(lambda obj: f"Запись {obj.external_id}") + record_date = factory.LazyAttribute(lambda _: timezone.now().strftime("%Y-%m-%d")) + amount = "1000000.00" + status = "active" + url = factory.LazyAttribute( + lambda obj: f"https://example.com/records/{obj.external_id}" + ) + payload = factory.LazyAttribute( + lambda obj: { + "external_id": obj.external_id, + "inn": obj.inn, + "organisation_name": obj.organisation_name, + } + ) diff --git a/tests/apps/parsers/test_clients.py b/tests/apps/parsers/test_clients.py index 
e16594c..aa67ad2 100644 --- a/tests/apps/parsers/test_clients.py +++ b/tests/apps/parsers/test_clients.py @@ -1,19 +1,26 @@ """Tests for parsers clients.""" +import json +import zipfile from io import BytesIO -from unittest.mock import patch - -from django.test import TestCase, tag - -from faker import Faker -from openpyxl import Workbook +from unittest.mock import Mock, patch from apps.parsers.clients.base import BaseHTTPClient, HTTPClientError +from apps.parsers.clients.common import ( + GenericParserItem, + StructuredDataClient, + StructuredDataClientError, +) from apps.parsers.clients.minpromtorg.industrial import IndustrialProductionClient from apps.parsers.clients.minpromtorg.manufactures import ManufacturesClient from apps.parsers.clients.minpromtorg.schemas import IndustrialCertificate, Manufacturer from apps.parsers.clients.proverki import ProverkiClient from apps.parsers.clients.proverki.schemas import Inspection +from apps.parsers.clients.trudvsem import TrudvsemClient +from apps.parsers.models import ParserLoadLog +from django.test import TestCase, tag +from faker import Faker +from openpyxl import Workbook fake = Faker("ru_RU") @@ -63,6 +70,428 @@ class BaseHTTPClientTest(TestCase): proxy = client.current_proxy self.assertEqual(proxy, "http://proxy:8080") + def test_download_file_rejects_large_content_length_before_body_read(self): + """Test download_file checks Content-Length before reading response body.""" + client = BaseHTTPClient(base_url="https://example.com") + response = Mock() + response.ok = True + response.headers = {"Content-Length": "10"} + response.iter_content.return_value = [b"too-large"] + response.close = Mock() + client.session.get = Mock(return_value=response) + + with self.assertRaises(HTTPClientError): + client.download_file("/data.csv", max_size_bytes=5) + + response.iter_content.assert_not_called() + response.close.assert_called_once() + + def test_download_file_passes_ssl_verification_flag(self): + """Test download_file can 
disable SSL verification for broken upstream TLS.""" + client = BaseHTTPClient(base_url="https://example.com", verify_ssl=False) + response = Mock() + response.ok = True + response.headers = {"Content-Length": "4"} + response.iter_content.return_value = [b"data"] + response.close = Mock() + client.session.get = Mock(return_value=response) + + content = client.download_file("/data.csv") + + self.assertEqual(content, b"data") + client.session.get.assert_called_once_with( + "https://example.com/data.csv", + stream=True, + timeout=30, + verify=False, + ) + + +class StructuredDataClientTest(TestCase): + """Tests for StructuredDataClient.""" + + def test_parse_json_records(self): + """Test JSON parsing and normalization.""" + payload = { + "data": [ + { + "id": "FIN-1", + "inn": "1234567890", + "ogrn": "1234567890123", + "name": "Test Company", + "amount": "10 500,50", + "date": "2024", + } + ] + } + client = StructuredDataClient(source=ParserLoadLog.Source.FNS_FINANCIAL) + + records = client.fetch_records( + content=json.dumps(payload).encode("utf-8"), + file_name="data.json", + ) + + self.assertEqual(len(records), 1) + self.assertIsInstance(records[0], GenericParserItem) + self.assertEqual(records[0].external_id, "FIN-1") + self.assertEqual(records[0].inn, "1234567890") + self.assertEqual(str(records[0].amount), "10500.50") + + def test_parse_csv_records(self): + """Test CSV parsing with Russian headers.""" + content = ( + "реестровый номер;ИНН;наименование;сумма\n" + "RN-1;1234567890;ООО Тест;1000.00\n" + ).encode("cp1251") + client = StructuredDataClient(source=ParserLoadLog.Source.UNFAIR_SUPPLIERS) + + records = client.fetch_records(content=content, file_name="data.csv") + + self.assertEqual(len(records), 1) + self.assertEqual(records[0].external_id, "RN-1") + self.assertEqual(records[0].organisation_name, "ООО Тест") + + def test_parse_xml_records_under_wrapper(self): + """Test XML parser selects repeated nested record elements.""" + content = ( + "" + "XML-1123ООО 
А" + "XML-2456ООО Б" + "" + ).encode() + client = StructuredDataClient(source=ParserLoadLog.Source.FNS_FINANCIAL) + + records = client.fetch_records(content=content, file_name="data.xml") + + self.assertEqual(len(records), 2) + self.assertEqual(records[0].external_id, "XML-1") + self.assertEqual(records[1].external_id, "XML-2") + + def test_json_payload_preserves_nested_objects(self): + """Test payload keeps nested JSON structures machine-readable.""" + payload = { + "data": [ + { + "id": "NESTED-1", + "company": {"inn": "123", "name": "ООО А"}, + "amounts": [1, 2], + } + ] + } + client = StructuredDataClient(source=ParserLoadLog.Source.FNS_FINANCIAL) + + record = client.fetch_records( + content=json.dumps(payload).encode("utf-8"), + file_name="data.json", + )[0] + + self.assertEqual(record.payload["company"], {"inn": "123", "name": "ООО А"}) + self.assertEqual(record.payload["amounts"], [1, 2]) + + def test_fallback_external_id_is_stable_after_reordering(self): + """Test generated external_id does not depend on row position.""" + client = StructuredDataClient(source=ParserLoadLog.Source.FNS_FINANCIAL) + first = client.fetch_records( + content=("name;amount\nООО А;10\nООО Б;20\n").encode(), + file_name="data.csv", + ) + second = client.fetch_records( + content=("name;amount\nООО Б;20\nООО А;10\n").encode(), + file_name="data.csv", + ) + + self.assertEqual(first[0].external_id, second[1].external_id) + self.assertEqual(first[1].external_id, second[0].external_id) + + def test_zip_rejects_too_many_supported_files(self): + """Test ZIP parser refuses archives with too many supported files.""" + archive_content = BytesIO() + with zipfile.ZipFile(archive_content, "w") as archive: + archive.writestr("one.csv", "id\n1\n") + archive.writestr("two.csv", "id\n2\n") + + client = StructuredDataClient( + source=ParserLoadLog.Source.FNS_FINANCIAL, + max_zip_entries=1, + ) + + with self.assertRaises(StructuredDataClientError): + client.fetch_records( + 
content=archive_content.getvalue(), + file_name="data.zip", + ) + + def test_html_without_table_returns_empty_records(self): + """Test generic parser does not treat HTML pages as malformed XML.""" + content = b"
No table
" + client = StructuredDataClient(source=ParserLoadLog.Source.ARBITRATION) + + records = client.fetch_records(content=content, file_name="") + + self.assertEqual(records, []) + + def test_html_table_after_long_head_is_detected(self): + """Test HTML detection scans beyond the first kilobyte.""" + content = ( + "" + (" " * 1500) + "" + "" + "" + "
idinn
HTML-11234567890
" + ).encode() + client = StructuredDataClient(source=ParserLoadLog.Source.ARBITRATION) + + records = client.fetch_records(content=content, file_name="") + + self.assertEqual(len(records), 1) + self.assertEqual(records[0].external_id, "HTML-1") + self.assertEqual(records[0].inn, "1234567890") + + def test_html_layout_table_without_headers_is_ignored(self): + """Test layout/navigation tables are not imported as records.""" + content = ( + "" + "" + "" + "
КартотекаСтраж
КалендарьМой Арбитр
" + ).encode() + client = StructuredDataClient(source=ParserLoadLog.Source.ARBITRATION) + + records = client.fetch_records(content=content, file_name="") + + self.assertEqual(records, []) + + @patch.object(BaseHTTPClient, "post_json") + def test_mpt_products_page_uses_official_search_api(self, mock_post_json): + """Test GISP product page uses the official paginated UI API.""" + mock_post_json.return_value = { + "ok": True, + "total_count": 1, + "items": [ + { + "org_name": "ООО Производитель", + "org_inn": "7701000000", + "org_ogrn": "1027700000000", + "product_reg_number_2023": "10165413", + "product_name": "Средство дезинфицирующее", + "res_date": "2026-04-25", + "product_gisp_url": "https://gisp.gov.ru/goods/#/product/1", + } + ], + } + client = StructuredDataClient(source=ParserLoadLog.Source.MPT_PRODUCTS) + + records = client.fetch_records(file_url="https://gisp.gov.ru/pp719v2/pub/prod/") + + self.assertEqual(len(records), 1) + self.assertEqual(records[0].external_id, "10165413") + self.assertEqual(records[0].inn, "7701000000") + self.assertEqual(records[0].organisation_name, "ООО Производитель") + self.assertEqual(records[0].title, "Средство дезинфицирующее") + self.assertEqual( + mock_post_json.call_args.args[0], + "https://gisp.gov.ru/pp719v2/pub/prod/b/", + ) + + def test_zakupki_cards_are_parsed_as_records(self): + """Test ЕИС search cards are parsed when there is no HTML table.""" + content = """ + +
+ № 0331 +
Работа комиссии
+
Объект закупки
Поставка оборудования
+
Заказчик
ГКУ Тест
+
Начальная цена
649 989,52 ₽
+
Размещено
20.04.2026
+
+ + """.encode() + client = StructuredDataClient(source=ParserLoadLog.Source.PROCUREMENTS_44FZ) + + records = client.fetch_records(content=content, file_name="search.html") + + self.assertEqual(len(records), 1) + self.assertEqual(records[0].external_id, "0331") + self.assertEqual(records[0].title, "Поставка оборудования") + self.assertEqual(records[0].organisation_name, "ГКУ Тест") + self.assertEqual(str(records[0].amount), "649989.52") + self.assertEqual(records[0].record_date, "20.04.2026") + + def test_html_table_with_td_header_row_is_parsed(self): + """Test registry tables without th still parse when first row is a header.""" + content = """ + + + +
Номер реестровой записиИнформация о лицеИНН
ГОЗ-1ООО Оборона7701000000
+ """.encode() + client = StructuredDataClient(source=ParserLoadLog.Source.ARBITRATION) + + records = client.fetch_records(content=content, file_name="fas.html") + + self.assertEqual(len(records), 1) + self.assertEqual(records[0].external_id, "ГОЗ-1") + self.assertEqual(records[0].inn, "7701000000") + self.assertEqual(records[0].organisation_name, "ООО Оборона") + + def test_fas_goz_multirow_header_table_is_parsed(self): + """Test FAS GOZ table skips multirow headers and column-number rows.""" + content = """ + + + + + + + + + + +
Номер реестровой записиОрганПостановлениеЛицо
номердатаисполнениеполное наименованиеадресИНН
12345678
1Нижегородское УФАС России№ 052/04/7.29.2-2965/2023 от 22.01.202428.10.2025В стадии исполненияАО УАПОАО УАПОг. Уфа0275074279
+ """.encode() + client = StructuredDataClient(source=ParserLoadLog.Source.FAS_GOZ) + + records = client.fetch_records(content=content, file_name="fas.html") + + self.assertEqual(len(records), 1) + self.assertEqual(records[0].external_id, "1") + self.assertEqual(records[0].inn, "0275074279") + self.assertEqual(records[0].organisation_name, "АО УАПО") + self.assertEqual(records[0].record_date, "28.10.2025") + self.assertEqual(records[0].status, "В стадии исполнения") + + def test_fns_nested_bfo_fields_are_normalized(self): + """Test FNS JSON keeps nested payload and maps useful BFO fields.""" + payload = { + "content": [ + { + "id": 6622458, + "inn": "7736050003", + "shortName": 'ПАО "ГАЗПРОМ"', + "ogrn": "1027700070518", + "statusCode": "ACTIVE", + "bfo": { + "period": "2025", + "actualBfoDate": "2026-03-16", + "gainSum": 5846351786, + }, + } + ] + } + client = StructuredDataClient(source=ParserLoadLog.Source.FNS_FINANCIAL) + + records = client.fetch_records( + content=json.dumps(payload).encode("utf-8"), + file_name="fns.json", + ) + + self.assertEqual(records[0].external_id, "6622458") + self.assertEqual(records[0].inn, "7736050003") + self.assertEqual(records[0].organisation_name, 'ПАО "ГАЗПРОМ"') + self.assertEqual(records[0].record_date, "2026-03-16") + self.assertEqual(str(records[0].amount), "5846351786") + self.assertEqual(records[0].status, "ACTIVE") + + @patch.object(BaseHTTPClient, "download_file") + def test_fstec_page_discovers_csv_download(self, mock_download): + """Test FSTEC registry page follows the official CSV download link.""" + mock_download.side_effect = [ + ( + 'Государственный ' + "реестр ССЗИ" + ).encode(), + ( + '"№ сертификата","Дата внесения в реестр","Срок действия сертификата",' + '"Наименование средства (шифр)","Заявитель"\n' + '"17/1","2002-07-26","2020-08-01","ФСПК-100","ООО НПП ЭЛКОМ"\n' + ).encode(), + ] + client = StructuredDataClient(source=ParserLoadLog.Source.FSTEC) + + records = 
client.fetch_records(file_url="https://reestr.fstec.ru/reg3") + + self.assertEqual(len(records), 1) + self.assertEqual(records[0].external_id, "17/1") + self.assertEqual(records[0].organisation_name, "ООО НПП ЭЛКОМ") + self.assertEqual(records[0].title, "ФСПК-100") + self.assertEqual(records[0].record_date, "2002-07-26") + self.assertEqual(records[0].status, "2020-08-01") + + +class TrudvsemClientTest(TestCase): + """Tests for TrudvsemClient.""" + + @patch.object(BaseHTTPClient, "get_json") + def test_fetch_vacancies_success(self, mock_get_json): + """Test successful vacancies fetching.""" + mock_get_json.return_value = { + "results": { + "vacancies": [ + { + "vacancy": { + "id": "VAC-1", + "job-name": "Инженер", + "creation-date": "2026-01-01", + "salary": {"from": 120000}, + "company": { + "name": "ООО Тест", + "inn": "1234567890", + "ogrn": "1234567890123", + }, + "vac_url": "https://trudvsem.ru/vacancy/VAC-1", + } + } + ] + } + } + + with TrudvsemClient() as client: + records = client.fetch_vacancies(limit=1) + + self.assertEqual(len(records), 1) + self.assertEqual(records[0].external_id, "VAC-1") + self.assertEqual(records[0].source, ParserLoadLog.Source.TRUDVSEM) + self.assertEqual(records[0].inn, "1234567890") + + @patch.object(BaseHTTPClient, "get_json") + def test_fetch_vacancies_by_company_inn_scans_pages(self, mock_get_json): + """Test company_inn search scans next pages instead of false empty result.""" + mock_get_json.side_effect = [ + { + "results": { + "vacancies": [ + { + "vacancy": { + "id": "VAC-OTHER", + "company": {"inn": "0000000000"}, + } + } + ] + } + }, + { + "results": { + "vacancies": [ + { + "vacancy": { + "id": "VAC-MATCH", + "company": {"inn": "1234567890"}, + } + } + ] + } + }, + ] + + with TrudvsemClient(company_search_max_pages=2) as client: + records = client.fetch_vacancies(limit=1, company_inn="1234567890") + + self.assertEqual(len(records), 1) + self.assertEqual(records[0].external_id, "VAC-MATCH") + 
self.assertEqual(mock_get_json.call_args_list[0].kwargs["params"]["offset"], 0) + self.assertEqual(mock_get_json.call_args_list[1].kwargs["params"]["offset"], 1) + def _create_test_excel_certificates() -> bytes: """Create test Excel file with certificate data.""" @@ -159,7 +588,10 @@ class IndustrialProductionClientTest(TestCase): { "name": "Заключения о подтверждении производства промышленной продукции на территории Российской Федерации", "files": [ - {"name": "data_resolutions_20240101.xlsx", "url": "/files/test.xlsx"}, + { + "name": "data_resolutions_20240101.xlsx", + "url": "/files/test.xlsx", + }, ], } ] @@ -193,9 +625,18 @@ class IndustrialProductionClientTest(TestCase): { "name": "Заключения о подтверждении производства промышленной продукции на территории Российской Федерации", "files": [ - {"name": "data_resolutions_20240101.xlsx", "url": "/files/old.xlsx"}, - {"name": "data_resolutions_20240315.xlsx", "url": "/files/new.xlsx"}, - {"name": "data_resolutions_20240201.xlsx", "url": "/files/mid.xlsx"}, + { + "name": "data_resolutions_20240101.xlsx", + "url": "/files/old.xlsx", + }, + { + "name": "data_resolutions_20240315.xlsx", + "url": "/files/new.xlsx", + }, + { + "name": "data_resolutions_20240201.xlsx", + "url": "/files/mid.xlsx", + }, ], } ] @@ -539,7 +980,7 @@ class ProverkiClientTest(TestCase): client = ProverkiClient() xml_str = '' - element = ET.fromstring(xml_str) + element = ET.fromstring(xml_str) # noqa: S314 result = client._parse_xml_record(element) @@ -553,7 +994,7 @@ class ProverkiClientTest(TestCase): client = ProverkiClient() xml_str = "" - element = ET.fromstring(xml_str) + element = ET.fromstring(xml_str) # noqa: S314 result = client._parse_xml_record(element) @@ -569,9 +1010,7 @@ class ProverkiClientTest(TestCase): TEST001 Компания -""".encode( - "windows-1251" - ) +""".encode("windows-1251") inspections = client._parse_xml_content(xml_content, None) diff --git a/tests/apps/parsers/test_models.py b/tests/apps/parsers/test_models.py index 
56c0e68..63215e0 100644 --- a/tests/apps/parsers/test_models.py +++ b/tests/apps/parsers/test_models.py @@ -1,15 +1,16 @@ """Tests for parsers models.""" -from django.test import TestCase - from apps.parsers.models import ( + GenericParserRecord, IndustrialCertificateRecord, ManufacturerRecord, ParserLoadLog, Proxy, ) +from django.test import TestCase from .factories import ( + GenericParserRecordFactory, IndustrialCertificateRecordFactory, ManufacturerRecordFactory, ParserLoadLogFactory, @@ -139,3 +140,26 @@ class ManufacturerRecordModelTest(TestCase): self.assertIsNotNone(manufacturer.created_at) self.assertIsNotNone(manufacturer.updated_at) + + +class GenericParserRecordModelTest(TestCase): + """Tests for GenericParserRecord model.""" + + def test_create_generic_record(self): + """Test creating generic parser record.""" + record = GenericParserRecordFactory() + + self.assertIsInstance(record, GenericParserRecord) + self.assertEqual(record.source, ParserLoadLog.Source.FNS_FINANCIAL) + self.assertIsNotNone(record.external_id) + self.assertIsInstance(record.payload, dict) + + def test_generic_record_str(self): + """Test generic record string representation.""" + record = GenericParserRecordFactory( + source=ParserLoadLog.Source.TRUDVSEM, + organisation_name='ООО "Тест"', + ) + + self.assertIn(ParserLoadLog.Source.TRUDVSEM, str(record)) + self.assertIn("Тест", str(record)) diff --git a/tests/apps/parsers/test_services.py b/tests/apps/parsers/test_services.py index 0d41de9..715a5b2 100644 --- a/tests/apps/parsers/test_services.py +++ b/tests/apps/parsers/test_services.py @@ -1,12 +1,12 @@ """Tests for parsers services.""" -from django.test import TestCase - -from faker import Faker +from decimal import Decimal +from apps.parsers.clients.common.schemas import GenericParserItem from apps.parsers.clients.minpromtorg.schemas import IndustrialCertificate, Manufacturer from apps.parsers.clients.proverki.schemas import Inspection from apps.parsers.models import ( + 
GenericParserRecord, IndustrialCertificateRecord, InspectionRecord, ManufacturerRecord, @@ -14,14 +14,18 @@ from apps.parsers.models import ( Proxy, ) from apps.parsers.services import ( + GenericParserRecordService, IndustrialCertificateService, InspectionService, ManufacturerService, ParserLoadLogService, ProxyService, ) +from django.test import TestCase +from faker import Faker from .factories import ( + GenericParserRecordFactory, IndustrialCertificateRecordFactory, InspectionRecordFactory, ManufacturerRecordFactory, @@ -147,7 +151,9 @@ class ParserLoadLogServiceTest(TestCase): def test_get_next_batch_id_first(self): """Test getting first batch_id for new source.""" - batch_id = ParserLoadLogService.get_next_batch_id(ParserLoadLog.Source.INDUSTRIAL) + batch_id = ParserLoadLogService.get_next_batch_id( + ParserLoadLog.Source.INDUSTRIAL + ) self.assertEqual(batch_id, 1) def test_get_next_batch_id_increment(self): @@ -155,7 +161,9 @@ class ParserLoadLogServiceTest(TestCase): ParserLoadLogFactory(batch_id=5, source=ParserLoadLog.Source.INDUSTRIAL) ParserLoadLogFactory(batch_id=3, source=ParserLoadLog.Source.INDUSTRIAL) - batch_id = ParserLoadLogService.get_next_batch_id(ParserLoadLog.Source.INDUSTRIAL) + batch_id = ParserLoadLogService.get_next_batch_id( + ParserLoadLog.Source.INDUSTRIAL + ) self.assertEqual(batch_id, 6) def test_get_next_batch_id_per_source(self): @@ -173,6 +181,32 @@ class ParserLoadLogServiceTest(TestCase): self.assertEqual(industrial_batch, 11) self.assertEqual(manufactures_batch, 6) + def test_create_next_load_log_reserves_batch_ids(self): + """Test atomic batch allocator creates sequential load logs.""" + first = ParserLoadLogService.create_next_load_log( + source=ParserLoadLog.Source.INDUSTRIAL, + status="in_progress", + ) + second = ParserLoadLogService.create_next_load_log( + source=ParserLoadLog.Source.INDUSTRIAL, + status="in_progress", + ) + + self.assertEqual(first.batch_id, 1) + self.assertEqual(second.batch_id, 2) + 
self.assertEqual(ParserLoadLog.objects.count(), 2) + + def test_create_next_load_log_honors_existing_logs(self): + """Test batch allocator starts after existing manual logs.""" + ParserLoadLogFactory(batch_id=7, source=ParserLoadLog.Source.INDUSTRIAL) + + log = ParserLoadLogService.create_next_load_log( + source=ParserLoadLog.Source.INDUSTRIAL, + status="in_progress", + ) + + self.assertEqual(log.batch_id, 8) + def test_create_load_log(self): """Test creating load log.""" log = ParserLoadLogService.create_load_log( @@ -272,7 +306,9 @@ class IndustrialCertificateServiceTest(TestCase): results = IndustrialCertificateService.find_by_inn("1111111111") self.assertEqual(results.count(), 2) - results_batch1 = IndustrialCertificateService.find_by_inn("1111111111", batch_id=1) + results_batch1 = IndustrialCertificateService.find_by_inn( + "1111111111", batch_id=1 + ) self.assertEqual(results_batch1.count(), 1) def test_find_by_certificate_number(self): @@ -314,6 +350,7 @@ class IndustrialCertificateServiceTest(TestCase): ) ] count2 = IndustrialCertificateService.save_certificates(duplicate, batch_id=2) + self.assertEqual(count2, 0) # Should still be 1 record (duplicate skipped) self.assertEqual(IndustrialCertificateRecord.objects.count(), 1) @@ -421,6 +458,7 @@ class ManufacturerServiceTest(TestCase): ) ] count2 = ManufacturerService.save_manufacturers(duplicate, batch_id=2) + self.assertEqual(count2, 0) # Should still be 1 record (duplicate skipped) self.assertEqual(ManufacturerRecord.objects.count(), 1) @@ -433,6 +471,147 @@ class ManufacturerServiceTest(TestCase): self.assertEqual(record.load_batch, 1) # Original batch +class GenericParserRecordServiceTest(TestCase): + """Tests for GenericParserRecordService.""" + + def test_save_records_empty(self): + """Test saving empty generic records returns 0.""" + count = GenericParserRecordService.save_records( + [], + batch_id=1, + source=ParserLoadLog.Source.FNS_FINANCIAL, + ) + self.assertEqual(count, 0) + + def 
test_save_records(self): + """Test saving generic records from dataclass.""" + records = [ + GenericParserItem( + source=ParserLoadLog.Source.FNS_FINANCIAL, + external_id=f"FIN-{i}", + inn=f"123456789{i}", + ogrn=f"123456789012{i}", + organisation_name=f"Company {i}", + title="Выручка", + record_date="2024", + amount=Decimal("1000.00"), + status="active", + url=f"https://example.com/{i}", + payload={"external_id": f"FIN-{i}"}, + ) + for i in range(5) + ] + + count = GenericParserRecordService.save_records( + records, + batch_id=1, + source=ParserLoadLog.Source.FNS_FINANCIAL, + ) + + self.assertEqual(count, 5) + self.assertEqual(GenericParserRecord.objects.count(), 5) + + def test_save_records_deduplication(self): + """Test saving generic records skips duplicates by source and external_id.""" + record = GenericParserItem( + source=ParserLoadLog.Source.TRUDVSEM, + external_id="VAC-1", + inn="1234567890", + organisation_name="Old Company", + payload={"name": "old"}, + ) + + count1 = GenericParserRecordService.save_records( + [record], + batch_id=1, + source=ParserLoadLog.Source.TRUDVSEM, + ) + count2 = GenericParserRecordService.save_records( + [record], + batch_id=2, + source=ParserLoadLog.Source.TRUDVSEM, + ) + + self.assertEqual(count1, 1) + self.assertEqual(count2, 0) + self.assertEqual(GenericParserRecord.objects.count(), 1) + + def test_save_records_deduplicates_incoming_batch(self): + """Test duplicate external IDs inside one payload count as one save.""" + records = [ + GenericParserItem( + source=ParserLoadLog.Source.TRUDVSEM, + external_id="VAC-SAME", + inn="1234567890", + organisation_name="First Company", + payload={"name": "first"}, + ), + GenericParserItem( + source=ParserLoadLog.Source.TRUDVSEM, + external_id="VAC-SAME", + inn="9999999999", + organisation_name="Second Company", + payload={"name": "second"}, + ), + ] + + count = GenericParserRecordService.save_records( + records, + batch_id=1, + source=ParserLoadLog.Source.TRUDVSEM, + ) + + 
self.assertEqual(count, 1) + self.assertEqual(GenericParserRecord.objects.count(), 1) + self.assertEqual( + GenericParserRecord.objects.get().organisation_name, + "First Company", + ) + + def test_create_with_exact_count_returns_zero_for_conflict(self): + """Test fallback count is exact when DB unique constraint rejects insert.""" + GenericParserRecordFactory( + source=ParserLoadLog.Source.TRUDVSEM, + external_id="VAC-RACE", + ) + instance = GenericParserRecord( + load_batch=2, + source=ParserLoadLog.Source.TRUDVSEM, + external_id="VAC-RACE", + inn="1234567890", + organisation_name="Concurrent Company", + payload={"external_id": "VAC-RACE"}, + ) + + count = GenericParserRecordService._create_with_exact_count( + [instance], + unique_fields=["source", "external_id"], + chunk_size=500, + ) + + self.assertEqual(count, 0) + self.assertEqual(GenericParserRecord.objects.count(), 1) + + def test_find_by_inn_with_source(self): + """Test finding generic records by INN and source.""" + GenericParserRecordFactory( + source=ParserLoadLog.Source.TRUDVSEM, + inn="1234567890", + ) + GenericParserRecordFactory( + source=ParserLoadLog.Source.FNS_FINANCIAL, + inn="1234567890", + ) + + result = GenericParserRecordService.find_by_inn( + "1234567890", + source=ParserLoadLog.Source.TRUDVSEM, + ) + + self.assertEqual(result.count(), 1) + self.assertEqual(result.first().source, ParserLoadLog.Source.TRUDVSEM) + + class InspectionServiceTest(TestCase): """Tests for InspectionService.""" @@ -568,6 +747,7 @@ class InspectionServiceTest(TestCase): ) ] count2 = InspectionService.save_inspections(duplicate, batch_id=2) + self.assertEqual(count2, 0) # Should still be 1 record (duplicate skipped) self.assertEqual(InspectionRecord.objects.count(), 1) @@ -581,10 +761,9 @@ class InspectionServiceTest(TestCase): self.assertEqual(record.load_batch, 1) # Original batch -from django.test import tag - from apps.parsers.clients.base import HTTPClientError from apps.parsers.clients.minpromtorg.industrial 
import IndustrialProductionClient +from django.test import tag @tag("integration", "slow", "network", "e2e") @@ -674,4 +853,3 @@ class EndToEndIntegrationTest(TestCase): except HTTPClientError as e: self.skipTest(f"External API unavailable: {e}") - diff --git a/tests/apps/parsers/test_tasks.py b/tests/apps/parsers/test_tasks.py new file mode 100644 index 0000000..1ae1d20 --- /dev/null +++ b/tests/apps/parsers/test_tasks.py @@ -0,0 +1,204 @@ +"""Tests for parser Celery tasks.""" + +from contextlib import ExitStack +from decimal import Decimal +from unittest.mock import patch + +from apps.core.models import BackgroundJob, JobStatus +from apps.parsers.clients.common.schemas import GenericParserItem +from apps.parsers.models import GenericParserRecord, ParserLoadLog +from apps.parsers.source_registry import PARSER_SOURCES +from apps.parsers.tasks import ( + import_parser_upload, + parse_all_sources, + parse_fns_financial_indicators, + parse_industrial_production, + parse_trudvsem_vacancies, + sync_inspections, +) +from django.core.files.base import ContentFile +from django.core.files.storage import default_storage +from django.test import TestCase + + +class GenericParserTasksTest(TestCase): + """Tests for new generic parser tasks.""" + + @patch("apps.parsers.tasks.IndustrialProductionClient.fetch_certificates") + def test_parse_industrial_production_sets_user_id(self, mock_fetch_certificates): + """Test existing parser task keeps BackgroundJob ownership.""" + mock_fetch_certificates.return_value = [] + + result = parse_industrial_production.delay(user_id=99).get() + + self.assertEqual(result["status"], "success") + self.assertEqual(result["saved"], 0) + self.assertEqual(BackgroundJob.objects.get().user_id, 99) + + @patch("apps.parsers.tasks.StructuredDataClient.fetch_records") + def test_parse_fns_financial_indicators_saves_records(self, mock_fetch_records): + """Test FNS financial task stores generic records.""" + mock_fetch_records.return_value = [ + 
GenericParserItem( + source=ParserLoadLog.Source.FNS_FINANCIAL, + external_id="FIN-1", + inn="1234567890", + organisation_name="ООО Тест", + title="Выручка", + amount=Decimal("1000.00"), + payload={"id": "FIN-1"}, + ) + ] + + result = parse_fns_financial_indicators.delay( + file_url="https://example.com/fns.json", + user_id=42, + ).get() + + self.assertEqual(result["status"], "success") + self.assertEqual(result["saved"], 1) + self.assertEqual(GenericParserRecord.objects.count(), 1) + self.assertEqual(ParserLoadLog.objects.count(), 1) + + job = BackgroundJob.objects.get() + self.assertEqual(job.status, JobStatus.SUCCESS) + self.assertEqual(job.user_id, 42) + + @patch("apps.parsers.tasks.StructuredDataClient.fetch_records") + def test_generic_parser_uses_registry_upstream_url(self, mock_fetch_records): + """Test generic parser resolves official source URL without manual file.""" + mock_fetch_records.return_value = [] + + result = parse_fns_financial_indicators.delay(user_id=42).get() + + self.assertEqual(result["status"], "success") + self.assertEqual( + mock_fetch_records.call_args.kwargs["file_url"], + PARSER_SOURCES["fns_financial"].upstream_url, + ) + + def test_import_parser_upload_saves_records_and_removes_file(self): + """Test uploaded registry files are imported through Celery.""" + storage_path = default_storage.save( + "parser_uploads/test-fns.csv", + ContentFile(b"inn,ogrn,name\n1234567890,1234567890123,Test Org\n"), + ) + + result = import_parser_upload.delay( + source_key="fns_financial", + storage_path=storage_path, + file_name="test-fns.csv", + user_id=77, + ).get() + + self.assertEqual(result["status"], "success") + self.assertEqual(result["saved"], 1) + self.assertFalse(default_storage.exists(storage_path)) + record = GenericParserRecord.objects.get() + self.assertEqual(record.source, ParserLoadLog.Source.FNS_FINANCIAL) + self.assertEqual(record.inn, "1234567890") + self.assertEqual(BackgroundJob.objects.get().user_id, 77) + + 
@patch("apps.parsers.tasks.TrudvsemClient.fetch_vacancies") + def test_parse_trudvsem_vacancies_saves_records(self, mock_fetch_vacancies): + """Test Trudvsem task stores vacancy records.""" + mock_fetch_vacancies.return_value = [ + GenericParserItem( + source=ParserLoadLog.Source.TRUDVSEM, + external_id="VAC-1", + inn="1234567890", + organisation_name="ООО Тест", + title="Инженер", + payload={"id": "VAC-1"}, + ) + ] + + result = parse_trudvsem_vacancies.delay(limit=1, user_id=7).get() + + self.assertEqual(result["status"], "success") + self.assertEqual(result["saved"], 1) + self.assertTrue( + GenericParserRecord.objects.filter( + source=ParserLoadLog.Source.TRUDVSEM, + external_id="VAC-1", + ).exists() + ) + self.assertEqual(BackgroundJob.objects.get().user_id, 7) + + def test_parse_all_sources_reports_all_sources(self): + """Test parse_all_sources covers native and upstream parser entries.""" + task_names = [ + "parse_industrial_production", + "parse_manufactures", + "parse_inspections", + "parse_trudvsem_vacancies", + "parse_mpt_products", + "parse_procurements_44fz", + "parse_procurements_223fz", + "parse_contracts", + "parse_unfair_suppliers", + "parse_fas_goz_evasion", + "parse_fns_financial_indicators", + "parse_arbitration_cases", + "parse_fedresurs_bankruptcy", + "parse_fstec_registers", + ] + + with ExitStack() as stack: + mocks = { + task_name: stack.enter_context( + patch(f"apps.parsers.tasks.{task_name}.delay") + ) + for task_name in task_names + } + for task_name, mock_delay in mocks.items(): + mock_delay.return_value.id = f"{task_name}-task" + + result = parse_all_sources.run(user_id=5, file_urls={}) + + self.assertEqual(result["industrial"], "parse_industrial_production-task") + self.assertEqual(result["trudvsem"], "parse_trudvsem_vacancies-task") + self.assertEqual( + result["fns_financial"], + "parse_fns_financial_indicators-task", + ) + self.assertEqual( + result["fedresurs_bankruptcy"], + "parse_fedresurs_bankruptcy-task", + ) + + 
@patch("apps.parsers.tasks.ProverkiClient.fetch_inspections") + def test_sync_inspections_marks_period_errors_as_failed(self, mock_fetch): + """Test inspection sync does not persist upstream errors as success.""" + mock_fetch.side_effect = RuntimeError("upstream unavailable") + + result = sync_inspections.delay(user_id=11).get() + + self.assertEqual(result["status"], "failed") + self.assertGreaterEqual(len(result["errors"]), 1) + + load_log = ParserLoadLog.objects.get() + self.assertEqual(load_log.status, "failed") + + job = BackgroundJob.objects.get() + self.assertEqual(job.status, JobStatus.FAILURE) + self.assertEqual(job.user_id, 11) + + @patch("apps.parsers.tasks.ProverkiClient.fetch_inspections") + def test_sync_inspections_can_limit_real_sync_slice(self, mock_fetch): + """Test inspection sync can be bounded for manual/API runs.""" + mock_fetch.return_value = [] + + result = sync_inspections.delay( + user_id=11, + max_months_per_law=1, + start_year=2026, + start_month=1, + include_fz248=False, + ).get() + + self.assertEqual(result["status"], "success") + self.assertEqual(mock_fetch.call_count, 1) + self.assertEqual(mock_fetch.call_args.kwargs["year"], 2026) + self.assertEqual(mock_fetch.call_args.kwargs["month"], 1) + self.assertFalse(mock_fetch.call_args.kwargs["is_federal_law_248"]) diff --git a/tests/apps/parsers/test_views.py b/tests/apps/parsers/test_views.py new file mode 100644 index 0000000..c614825 --- /dev/null +++ b/tests/apps/parsers/test_views.py @@ -0,0 +1,753 @@ +"""Tests for parsers API views.""" + +import json +from unittest.mock import Mock, patch + +from apps.parsers.models import ParserLoadLog +from apps.user.services import UserService +from django.core.files.uploadedfile import SimpleUploadedFile +from django.urls import reverse +from django_celery_beat.models import PeriodicTask +from rest_framework import status +from rest_framework.test import APITestCase + +from tests.apps.parsers.factories import ( + GenericParserRecordFactory, + 
IndustrialCertificateRecordFactory, + InspectionRecordFactory, + ManufacturerRecordFactory, + ParserLoadLogFactory, +) +from tests.apps.user.factories import UserFactory + + +class ParserApiTest(APITestCase): + """Tests for parsers API.""" + + def setUp(self): + self.user = UserFactory.create_user() + self.tokens = UserService.get_tokens_for_user(self.user) + self.client.credentials(HTTP_AUTHORIZATION=f"Bearer {self.tokens['access']}") + + def test_source_list_contains_existing_and_new_parsers(self): + """Test source catalog exposes existing and new parser slices.""" + response = self.client.get(reverse("api_v1:parsers:source-list")) + + self.assertEqual(response.status_code, status.HTTP_200_OK) + keys = {item["key"] for item in response.data["data"]} + self.assertIn("industrial", keys) + self.assertIn("trudvsem", keys) + self.assertIn("fedresurs_bankruptcy", keys) + sources = {item["key"]: item for item in response.data["data"]} + self.assertEqual(sources["industrial"]["mode"], "native_api") + self.assertEqual(sources["fedresurs_bankruptcy"]["mode"], "official_api") + self.assertEqual( + sources["fedresurs_bankruptcy"]["status"], + "implemented", + ) + self.assertEqual( + sources["fedresurs_bankruptcy"]["upstream_url"], + "https://bankrot.fedresurs.ru/", + ) + self.assertFalse( + any(item["mode"] == "file_import" for item in sources.values()) + ) + self.assertEqual(sources["fns_financial"]["owner"], "Сергей") + self.assertTrue(sources["fns_financial"]["supports_file_upload"]) + self.assertTrue(sources["fedresurs_bankruptcy"]["supports_file_upload"]) + upload_sources = { + key for key, source in sources.items() if source["supports_file_upload"] + } + self.assertEqual(upload_sources, {"fns_financial", "fedresurs_bankruptcy"}) + self.assertFalse(sources["trudvsem"]["supports_file_upload"]) + self.assertFalse(sources["mpt_products"]["supports_file_upload"]) + self.assertEqual(sources["mpt_products"]["upload_url"], "") + self.assertEqual( + 
sources["fns_financial"]["result_list_url"], + "/api/v1/fns/reports/", + ) + self.assertEqual( + sources["fns_financial"]["result_detail_url"], + "/api/v1/fns/reports/{id}/", + ) + self.assertEqual( + sources["fns_financial"]["upload_url"], + "/api/v1/fns/upload/", + ) + self.assertEqual( + sources["procurements_44fz"]["result_list_url"], + "/api/v1/zakupki/", + ) + self.assertEqual( + sources["procurements_223fz"]["result_list_url"], + "/api/v1/zakupki/", + ) + self.assertEqual(sources["contracts"]["result_list_url"], "/api/v1/zakupki/") + + def test_dashboard_page_is_available_outside_admin(self): + """Test dashboard HTML shell is served outside Django admin.""" + response = self.client.get(reverse("dashboard")) + + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertContains(response, "Parser Dashboard") + self.assertContains(response, 'name="username"') + self.assertContains(response, "color-scheme: dark") + self.assertContains(response, "refreshTokenButton") + self.assertContains(response, "sourceRoutePanel") + self.assertContains(response, "uploadPanel") + self.assertContains(response, "uploadSources") + self.assertContains(response, "registryUploadPanel") + self.assertContains(response, "Загрузка организаций ОПК") + self.assertContains(response, "/api/v1/registers/upload/") + self.assertContains(response, "/api/v1/registers/registries/") + self.assertContains(response, "Внешняя БД и выгрузка данных") + self.assertContains(response, "/api/v1/exchange/copy/") + self.assertContains(response, "Выгрузка организаций реестра в .bin") + self.assertContains(response, "/api/v1/backups/export/") + self.assertContains(response, "Источники API") + self.assertContains(response, "Загрузить реестр") + self.assertContains(response, "/api/v1/users/token/refresh/") + + def test_dashboard_nested_routes_render_same_html_shell(self): + """Test direct source and item dashboard URLs render the HTML app.""" + source_response = 
self.client.get("/dashboard/trudvsem") + item_response = self.client.get("/dashboard/trudvsem/1") + + self.assertEqual(source_response.status_code, status.HTTP_200_OK) + self.assertEqual(item_response.status_code, status.HTTP_200_OK) + self.assertContains(source_response, "Parser Dashboard") + self.assertContains(item_response, "Parser Dashboard") + + @patch("apps.parsers.views.tasks.parse_trudvsem_vacancies.delay") + def test_run_parser_starts_celery_task(self, mock_delay): + """Test parser run endpoint starts task through Celery.""" + mock_delay.return_value = Mock(id="task-123") + + response = self.client.post( + reverse("api_v1:parsers:run-parser", kwargs={"source_key": "trudvsem"}), + {"limit": 10, "text": "инженер"}, + format="json", + ) + + self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED) + self.assertEqual(response.data["data"]["task_id"], "task-123") + mock_delay.assert_called_once() + self.assertEqual(mock_delay.call_args.kwargs["limit"], 10) + self.assertEqual(mock_delay.call_args.kwargs["user_id"], self.user.id) + + @patch("apps.parsers.views.tasks.parse_industrial_production.delay") + def test_run_existing_parser_passes_user_id(self, mock_delay): + """Test existing parser tasks keep API job ownership.""" + mock_delay.return_value = Mock(id="task-existing") + + response = self.client.post( + reverse("api_v1:parsers:run-parser", kwargs={"source_key": "industrial"}), + {}, + format="json", + ) + + self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED) + self.assertEqual(response.data["data"]["task_id"], "task-existing") + self.assertEqual(mock_delay.call_args.kwargs["user_id"], self.user.id) + + @patch("apps.parsers.views.tasks.sync_inspections.delay") + def test_run_sync_inspections_passes_control_params(self, mock_delay): + """Test sync parser accepts bounded run controls from API.""" + mock_delay.return_value = Mock(id="task-sync") + + response = self.client.post( + reverse( + "api_v1:parsers:run-parser", + 
kwargs={"source_key": "sync_inspections"}, + ), + { + "max_months_per_law": 1, + "start_year": 2026, + "start_month": 1, + "include_fz294": True, + "include_fz248": False, + }, + format="json", + ) + + self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED) + self.assertEqual(response.data["data"]["task_id"], "task-sync") + self.assertEqual(mock_delay.call_args.kwargs["max_months_per_law"], 1) + self.assertEqual(mock_delay.call_args.kwargs["start_year"], 2026) + self.assertEqual(mock_delay.call_args.kwargs["start_month"], 1) + self.assertTrue(mock_delay.call_args.kwargs["include_fz294"]) + self.assertFalse(mock_delay.call_args.kwargs["include_fz248"]) + + @patch("apps.parsers.views.tasks.parse_fedresurs_bankruptcy.delay") + def test_run_upstream_parser_uses_catalog_source_without_file_url(self, mock_delay): + """Test upstream parsers do not require manual file URLs.""" + mock_delay.return_value = Mock(id="fedresurs-task") + + response = self.client.post( + reverse( + "api_v1:parsers:run-parser", + kwargs={"source_key": "fedresurs_bankruptcy"}, + ), + {}, + format="json", + ) + + self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED) + self.assertEqual(response.data["data"]["task_id"], "fedresurs-task") + self.assertNotIn("file_url", mock_delay.call_args.kwargs) + + def test_run_file_parser_rejects_private_file_url(self): + """Test parser run endpoint blocks private worker-side URLs.""" + response = self.client.post( + reverse( + "api_v1:parsers:run-parser", + kwargs={"source_key": "fedresurs_bankruptcy"}, + ), + {"file_url": "https://127.0.0.1/private.xml"}, + format="json", + ) + + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + + def test_run_parser_rejects_proxy_override_for_regular_user(self): + """Test non-staff users cannot route parser traffic through own proxy.""" + response = self.client.post( + reverse("api_v1:parsers:run-parser", kwargs={"source_key": "industrial"}), + {"proxies": ["http://proxy.example:8080"]}, + 
format="json", + ) + + self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN) + self.assertEqual( + response.data["errors"][0]["code"], + "proxy_override_forbidden", + ) + + @patch("apps.parsers.views._save_uploaded_parser_file") + @patch("apps.parsers.views.tasks.import_parser_upload.delay") + def test_upload_registry_file_starts_celery_task( + self, + mock_delay, + mock_save_upload, + ): + """Test manual registry upload starts Celery import task.""" + mock_delay.return_value = Mock(id="upload-task") + mock_save_upload.return_value = "parser_uploads/source.csv" + uploaded_file = SimpleUploadedFile( + "source.csv", + b"inn,ogrn,name\n1234567890,1234567890123,Test Org\n", + content_type="text/csv", + ) + + response = self.client.post( + reverse( + "api_v1:parsers:upload-parser-data", + kwargs={"source_key": "fns_financial"}, + ), + {"file": uploaded_file}, + format="multipart", + ) + + self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED) + self.assertEqual(response.data["data"]["task_id"], "upload-task") + mock_delay.assert_called_once_with( + source_key="fns_financial", + storage_path="parser_uploads/source.csv", + file_name="source.csv", + user_id=self.user.id, + ) + + @patch("apps.parsers.views._save_uploaded_parser_file") + @patch("apps.parsers.views.tasks.import_parser_upload.delay") + def test_source_upload_registry_file_starts_celery_task( + self, + mock_delay, + mock_save_upload, + ): + """Test source upload endpoint follows dev-compatible FNS route.""" + mock_delay.return_value = Mock(id="upload-task") + mock_save_upload.return_value = "parser_uploads/source.csv" + uploaded_file = SimpleUploadedFile( + "source.csv", + b"inn,ogrn,name\n1234567890,1234567890123,Test Org\n", + content_type="text/csv", + ) + + response = self.client.post( + "/api/v1/fns/upload/", + {"file": uploaded_file}, + format="multipart", + ) + + self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED) + self.assertEqual(response.data["data"]["task_id"], 
"upload-task") + mock_delay.assert_called_once_with( + source_key="fns_financial", + storage_path="parser_uploads/source.csv", + file_name="source.csv", + user_id=self.user.id, + ) + + def test_source_upload_absent_for_non_upload_source(self): + """Test source API does not expose upload where source has no manual import.""" + response = self.client.post("/api/v1/trudvsem/vacancies/upload/", {}) + + self.assertEqual(response.status_code, status.HTTP_404_NOT_FOUND) + + def test_upload_registry_file_rejects_non_upload_source(self): + """Test manual upload is limited to sources that support file import.""" + uploaded_file = SimpleUploadedFile( + "source.csv", + b"inn,name\n1234567890,Test Org\n", + content_type="text/csv", + ) + + response = self.client.post( + reverse( + "api_v1:parsers:upload-parser-data", + kwargs={"source_key": "trudvsem"}, + ), + {"file": uploaded_file}, + format="multipart", + ) + + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + self.assertEqual(response.data["errors"][0]["code"], "upload_not_supported") + + def test_upload_registry_file_rejects_api_only_registry_source(self): + """Test API/upstream sources do not expose manual file imports.""" + uploaded_file = SimpleUploadedFile( + "source.csv", + b"inn,name\n1234567890,Test Org\n", + content_type="text/csv", + ) + + response = self.client.post( + reverse( + "api_v1:parsers:upload-parser-data", + kwargs={"source_key": "mpt_products"}, + ), + {"file": uploaded_file}, + format="multipart", + ) + + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + self.assertEqual(response.data["errors"][0]["code"], "upload_not_supported") + + def test_load_logs_list(self): + """Test load logs endpoint.""" + ParserLoadLogFactory(source=ParserLoadLog.Source.FNS_FINANCIAL, batch_id=1) + + response = self.client.get(reverse("api_v1:parsers:load-log-list")) + + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(len(response.data["data"]), 1) + + def 
test_load_logs_list_rejects_invalid_limit(self): + """Test load logs endpoint validates limit query parameter.""" + response = self.client.get( + reverse("api_v1:parsers:load-log-list"), + {"limit": "bad"}, + ) + + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + + def test_frontend_source_card_aliases(self): + """Test dev frontend source-card API aliases are present.""" + GenericParserRecordFactory(source=ParserLoadLog.Source.FNS_FINANCIAL) + ParserLoadLogFactory(source=ParserLoadLog.Source.FNS_FINANCIAL, batch_id=1) + + list_response = self.client.get(reverse("api_v1:sources:source-cards-list")) + statuses_response = self.client.get(reverse("api_v1:sources:source-statuses")) + detail_response = self.client.get( + reverse( + "api_v1:sources:source-card-detail", + kwargs={"slug": "financial-indicators"}, + ) + ) + + self.assertEqual(list_response.status_code, status.HTTP_200_OK) + self.assertEqual(statuses_response.status_code, status.HTTP_200_OK) + self.assertEqual(detail_response.status_code, status.HTTP_200_OK) + slugs = {item["slug"] for item in list_response.data["data"]} + self.assertIn("financial-indicators", slugs) + self.assertIn("manufacturers-and-products", slugs) + self.assertIn("public-procurements", slugs) + self.assertEqual(detail_response.data["data"]["slug"], "financial-indicators") + self.assertIn("source_items", detail_response.data["data"]) + + def test_frontend_parsing_settings_alias_get_and_patch(self): + """Test dev-compatible parsing settings endpoint.""" + url = reverse("api_v1:parsing:parsing-settings") + + get_response = self.client.get(url) + patch_response = self.client.patch( + url, + {"planned_inspections": "weekly"}, + format="json", + ) + + self.assertEqual(get_response.status_code, status.HTTP_200_OK) + self.assertEqual(patch_response.status_code, status.HTTP_200_OK) + self.assertEqual(patch_response.data["planned_inspections"], "weekly") + + def test_frontend_source_refresh_alias_starts_task(self): + """Test 
dev-compatible source refresh endpoint starts mapped Celery task.""" + task = Mock() + task.delay.return_value = Mock(id="compat-refresh-task") + + with patch.dict( + "apps.parsers.frontend_compat.TASKS_BY_NAME", + {"apps.parsers.tasks.parse_trudvsem_vacancies": task}, + ): + response = self.client.post( + reverse( + "api_v1:sources:source-card-refresh", + kwargs={"slug": "labor-vacancies"}, + ), + {"params": {"limit": 5, "text": "engineer"}}, + format="json", + ) + + self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED) + self.assertEqual(response.data["task_id"], "compat-refresh-task") + task.delay.assert_called_once_with( + limit=5, + text="engineer", + user_id=self.user.id, + ) + + def test_frontend_source_refresh_rejects_nested_proxy_override(self): + """Test source refresh alias keeps proxy override restricted.""" + response = self.client.post( + reverse( + "api_v1:sources:source-card-refresh", + kwargs={"slug": "labor-vacancies"}, + ), + {"params": {"proxies": ["http://proxy.example"]}}, + format="json", + ) + + self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN) + self.assertEqual(response.data["errors"][0]["code"], "proxy_override_forbidden") + + def test_frontend_system_logs_aliases(self): + """Test dev-compatible system logs list/detail/export endpoints.""" + log = ParserLoadLogFactory( + source=ParserLoadLog.Source.FNS_FINANCIAL, + batch_id=777, + records_count=3, + ) + + list_response = self.client.get( + reverse("api_v1:system:parser-logs-list"), + {"search": "777"}, + ) + detail_response = self.client.get( + reverse("api_v1:system:parser-logs-detail", kwargs={"pk": log.id}) + ) + export_response = self.client.get( + reverse("api_v1:system:parser-logs-export"), + {"search": "777"}, + ) + + self.assertEqual(list_response.status_code, status.HTTP_200_OK) + self.assertEqual(list_response.data["count"], 1) + self.assertEqual( + list_response.data["results"][0]["source"], "financial-indicators" + ) + 
self.assertEqual(detail_response.status_code, status.HTTP_200_OK) + self.assertEqual(detail_response.data["id"], log.id) + self.assertEqual(export_response.status_code, status.HTTP_200_OK) + self.assertIn("text/csv", export_response["Content-Type"]) + + def test_generic_records_list_filters_by_source(self): + """Test generic records endpoint filters by source.""" + GenericParserRecordFactory(source=ParserLoadLog.Source.TRUDVSEM) + GenericParserRecordFactory(source=ParserLoadLog.Source.FNS_FINANCIAL) + + response = self.client.get( + reverse("api_v1:parsers:generic-record-list"), + {"source": ParserLoadLog.Source.TRUDVSEM}, + ) + + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(len(response.data["data"]), 1) + self.assertEqual( + response.data["data"][0]["source"], ParserLoadLog.Source.TRUDVSEM + ) + + def test_generic_records_list_filters_by_id(self): + """Test generic records endpoint can support dashboard detail view.""" + target = GenericParserRecordFactory(source=ParserLoadLog.Source.TRUDVSEM) + GenericParserRecordFactory(source=ParserLoadLog.Source.TRUDVSEM) + + response = self.client.get( + reverse("api_v1:parsers:generic-record-list"), + {"source": ParserLoadLog.Source.TRUDVSEM, "id": target.id}, + ) + + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(len(response.data["data"]), 1) + self.assertEqual(response.data["data"][0]["id"], target.id) + + def test_records_list_returns_old_industrial_records(self): + """Test records endpoint still exposes old industrial parser data.""" + target = IndustrialCertificateRecordFactory() + + response = self.client.get( + reverse("api_v1:parsers:generic-record-list"), + {"source": ParserLoadLog.Source.INDUSTRIAL, "id": target.id}, + ) + + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(len(response.data["data"]), 1) + record = response.data["data"][0] + self.assertEqual(record["source"], ParserLoadLog.Source.INDUSTRIAL) + 
self.assertEqual(record["external_id"], target.certificate_number) + self.assertEqual(record["organisation_name"], target.organisation_name) + self.assertEqual( + record["payload"]["certificate_number"], target.certificate_number + ) + + def test_records_list_returns_old_manufacturer_records(self): + """Test records endpoint still exposes old manufacturer parser data.""" + target = ManufacturerRecordFactory() + + response = self.client.get( + reverse("api_v1:parsers:generic-record-list"), + {"source": ParserLoadLog.Source.MANUFACTURES, "id": target.id}, + ) + + self.assertEqual(response.status_code, status.HTTP_200_OK) + record = response.data["data"][0] + self.assertEqual(record["source"], ParserLoadLog.Source.MANUFACTURES) + self.assertEqual(record["external_id"], target.inn) + self.assertEqual(record["organisation_name"], target.full_legal_name) + + def test_records_list_returns_old_inspection_records(self): + """Test records endpoint still exposes old inspections parser data.""" + target = InspectionRecordFactory() + + response = self.client.get( + reverse("api_v1:parsers:generic-record-list"), + {"source": ParserLoadLog.Source.INSPECTIONS, "id": target.id}, + ) + + self.assertEqual(response.status_code, status.HTTP_200_OK) + record = response.data["data"][0] + self.assertEqual(record["source"], ParserLoadLog.Source.INSPECTIONS) + self.assertEqual(record["external_id"], target.registration_number) + self.assertEqual(record["organisation_name"], target.organisation_name) + self.assertEqual(record["status"], target.status) + + def test_generic_records_list_rejects_invalid_id(self): + """Test generic records endpoint validates id filter.""" + response = self.client.get( + reverse("api_v1:parsers:generic-record-list"), + {"id": "bad"}, + ) + + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + self.assertEqual(response.data["errors"][0]["code"], "invalid_record_id") + + def test_generic_records_list_rejects_invalid_limit(self): + """Test generic 
records endpoint validates limit query parameter.""" + response = self.client.get( + reverse("api_v1:parsers:generic-record-list"), + {"limit": "bad"}, + ) + + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + + def test_source_result_list_filters_generic_records(self): + """Test source result API exposes get_list with query params.""" + target = GenericParserRecordFactory( + source=ParserLoadLog.Source.FNS_FINANCIAL, + inn="7701000001", + organisation_name="Target Org", + ) + GenericParserRecordFactory( + source=ParserLoadLog.Source.FNS_FINANCIAL, + inn="7701000002", + ) + GenericParserRecordFactory(source=ParserLoadLog.Source.TRUDVSEM) + + response = self.client.get( + "/api/v1/fns/reports/", + {"inn": "7701000001", "page_size": 10, "include_payload": "false"}, + ) + + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(len(response.data["data"]), 1) + self.assertEqual(response.data["data"][0]["id"], target.id) + self.assertEqual(response.data["data"][0]["payload"], {}) + self.assertEqual(response.data["meta"]["pagination"]["total_count"], 1) + + def test_source_result_get_filters_generic_record(self): + """Test source result API exposes get with query params.""" + target = GenericParserRecordFactory( + source=ParserLoadLog.Source.FNS_FINANCIAL, + inn="7701000001", + ) + + response = self.client.get( + f"/api/v1/fns/reports/{target.id}/", + {"inn": "7701000001"}, + ) + missing_response = self.client.get( + f"/api/v1/fns/reports/{target.id}/", + {"inn": "7701000002"}, + ) + + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(response.data["data"]["id"], target.id) + self.assertEqual(missing_response.status_code, status.HTTP_404_NOT_FOUND) + + def test_source_result_list_filters_native_records(self): + """Test source result API exposes old native records with the same result DTO.""" + target = ManufacturerRecordFactory(inn="7701000001") + ManufacturerRecordFactory(inn="7701000002") + + 
response = self.client.get( + "/api/v1/minpromtorg/manufacturers/", + {"inn": "7701000001", "page_size": 10}, + ) + + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertEqual(len(response.data["data"]), 1) + self.assertEqual(response.data["data"][0]["id"], target.id) + self.assertEqual( + response.data["data"][0]["organisation_name"], + target.full_legal_name, + ) + + def test_zakupki_result_route_groups_eis_sources(self): + """Test EIS data is exposed through one dev-compatible zakupki route.""" + target = GenericParserRecordFactory( + source=ParserLoadLog.Source.PROCUREMENTS_44FZ, + inn="7701000001", + organisation_name="44FZ Org", + ) + GenericParserRecordFactory( + source=ParserLoadLog.Source.PROCUREMENTS_223FZ, + inn="7701000002", + organisation_name="223FZ Org", + ) + GenericParserRecordFactory( + source=ParserLoadLog.Source.CONTRACTS, + inn="7701000003", + organisation_name="Contract Org", + ) + GenericParserRecordFactory(source=ParserLoadLog.Source.FNS_FINANCIAL) + + grouped_response = self.client.get("/api/v1/zakupki/", {"page_size": 10}) + filtered_response = self.client.get( + "/api/v1/zakupki/", + {"source": ParserLoadLog.Source.PROCUREMENTS_44FZ, "page_size": 10}, + ) + detail_response = self.client.get(f"/api/v1/zakupki/{target.id}/") + + self.assertEqual(grouped_response.status_code, status.HTTP_200_OK) + self.assertEqual( + {record["source"] for record in grouped_response.data["data"]}, + { + ParserLoadLog.Source.PROCUREMENTS_44FZ, + ParserLoadLog.Source.PROCUREMENTS_223FZ, + ParserLoadLog.Source.CONTRACTS, + }, + ) + self.assertEqual(filtered_response.status_code, status.HTTP_200_OK) + self.assertEqual(len(filtered_response.data["data"]), 1) + self.assertEqual(filtered_response.data["data"][0]["id"], target.id) + self.assertEqual(detail_response.status_code, status.HTTP_200_OK) + self.assertEqual(detail_response.data["data"]["id"], target.id) + + def test_invented_eis_subroutes_are_not_exposed(self): + """Test public API does 
not expose invented EIS subresource routes.""" + self.assertEqual( + self.client.get("/api/v1/zakupki/procurements-44fz/").status_code, + status.HTTP_404_NOT_FOUND, + ) + self.assertEqual( + self.client.get("/api/v1/zakupki/procurements-223fz/").status_code, + status.HTTP_404_NOT_FOUND, + ) + self.assertEqual( + self.client.get("/api/v1/zakupki/contracts/").status_code, + status.HTTP_404_NOT_FOUND, + ) + self.assertEqual( + self.client.post("/api/v1/zakupki/upload/", {}).status_code, + status.HTTP_404_NOT_FOUND, + ) + + def test_dashboard_data_contains_sources_jobs_and_groups(self): + """Test dashboard data endpoint returns management payload.""" + ParserLoadLogFactory(source=ParserLoadLog.Source.FNS_FINANCIAL, batch_id=1) + + response = self.client.get(reverse("api_v1:parsers:dashboard-data")) + + self.assertEqual(response.status_code, status.HTTP_200_OK) + payload = response.data["data"] + self.assertIn("sources", payload) + self.assertIn("jobs", payload) + self.assertIn("schedules", payload) + self.assertEqual(payload["api"]["frontend_sources"], "/api/v1/sources/") + self.assertEqual(payload["api"]["system_logs"], "/api/v1/system/logs/") + financial_keys = { + item["key"] for item in payload["groups"]["financial_reports"] + } + self.assertIn("fns_financial", financial_keys) + upload_keys = {item["key"] for item in payload["groups"]["uploads"]} + self.assertEqual(upload_keys, {"fns_financial", "fedresurs_bankruptcy"}) + + def test_schedule_create_configures_celery_beat_task(self): + """Test parser schedule endpoint creates django-celery-beat task.""" + response = self.client.post( + reverse("api_v1:parsers:schedule-list"), + { + "source_key": "trudvsem", + "every": 6, + "period": "hours", + "limit": 25, + "text": "инженер", + }, + format="json", + ) + + self.assertEqual(response.status_code, status.HTTP_201_CREATED) + self.assertEqual(response.data["data"]["source_key"], "trudvsem") + + periodic_task = PeriodicTask.objects.get( + 
name=f"parser:trudvsem:user:{self.user.id}" + ) + kwargs = json.loads(periodic_task.kwargs) + self.assertEqual(kwargs["user_id"], self.user.id) + self.assertEqual(kwargs["limit"], 25) + self.assertEqual(periodic_task.interval.every, 6) + + def test_schedule_update_and_delete(self): + """Test parser schedule can be paused and removed.""" + create_response = self.client.post( + reverse("api_v1:parsers:schedule-list"), + {"source_key": "industrial", "every": 1, "period": "days"}, + format="json", + ) + schedule_id = create_response.data["data"]["id"] + + update_response = self.client.patch( + reverse("api_v1:parsers:schedule-detail", kwargs={"pk": schedule_id}), + {"enabled": False}, + format="json", + ) + + self.assertEqual(update_response.status_code, status.HTTP_200_OK) + self.assertFalse(update_response.data["data"]["enabled"]) + + delete_response = self.client.delete( + reverse("api_v1:parsers:schedule-detail", kwargs={"pk": schedule_id}) + ) + + self.assertEqual(delete_response.status_code, status.HTTP_200_OK) + self.assertFalse(PeriodicTask.objects.filter(pk=schedule_id).exists()) diff --git a/tests/apps/registers/__init__.py b/tests/apps/registers/__init__.py new file mode 100644 index 0000000..3d9b6e1 --- /dev/null +++ b/tests/apps/registers/__init__.py @@ -0,0 +1 @@ +"""Tests for registers app.""" diff --git a/tests/apps/registers/factories.py b/tests/apps/registers/factories.py new file mode 100644 index 0000000..3a4890c --- /dev/null +++ b/tests/apps/registers/factories.py @@ -0,0 +1,61 @@ +"""Фабрики для приложения реестров.""" + +import factory +from apps.registers.models import ( + Organization, + Register, + RegisterUpload, + RegistryMembershipPeriod, +) +from django.utils import timezone +from faker import Faker + +from tests.apps.user.factories import UserFactory + +fake = Faker("ru_RU") + + +class RegisterFactory(factory.django.DjangoModelFactory): + """Фабрика реестра.""" + + class Meta: + model = Register + + name = factory.Sequence(lambda n: 
f"Реестр {n}") + + +class OrganizationFactory(factory.django.DjangoModelFactory): + """Фабрика организации.""" + + class Meta: + model = Organization + + pn_name = factory.LazyFunction(lambda: fake.company()) + mn_ogrn = factory.Sequence(lambda n: 1027700000000 + n) + mn_inn = factory.Sequence(lambda n: 7700000000 + n) + in_kpp = factory.Sequence(lambda n: 770000000 + n) + mn_okpo = factory.Sequence(lambda n: f"{10000000 + n}") + + +class RegisterUploadFactory(factory.django.DjangoModelFactory): + """Фабрика загрузки реестра.""" + + class Meta: + model = RegisterUpload + + registry = factory.SubFactory(RegisterFactory) + actual_date = factory.LazyFunction(timezone.localdate) + file_name = "registry.xlsx" + file_hash = factory.Sequence(lambda n: f"{n:064x}") + rows_count = 1 + uploaded_by = factory.SubFactory(UserFactory) + + +class RegistryMembershipPeriodFactory(factory.django.DjangoModelFactory): + """Фабрика участия организации в реестре.""" + + class Meta: + model = RegistryMembershipPeriod + + registry = factory.SubFactory(RegisterFactory) + organization = factory.SubFactory(OrganizationFactory) diff --git a/tests/apps/registers/test_views.py b/tests/apps/registers/test_views.py new file mode 100644 index 0000000..093a027 --- /dev/null +++ b/tests/apps/registers/test_views.py @@ -0,0 +1,155 @@ +"""Tests for registers API views.""" + +import io +from datetime import date + +from apps.core.models import BackgroundJob, JobStatus +from apps.registers.models import Organization, RegisterUpload, RegistryMembershipPeriod +from apps.user.services import UserService +from django.core.files.uploadedfile import SimpleUploadedFile +from django.urls import reverse +from openpyxl import Workbook +from rest_framework import status +from rest_framework.test import APITestCase + +from tests.apps.registers.factories import ( + OrganizationFactory, + RegisterFactory, + RegistryMembershipPeriodFactory, +) +from tests.apps.user.factories import UserFactory + + +def 
build_register_excel(rows: list[dict], *, with_kpp: bool = True) -> bytes: + """Собрать тестовый Excel реестра.""" + workbook = Workbook() + worksheet = workbook.active + headers = ["pn_name", "mn_ogrn", "mn_inn"] + if with_kpp: + headers.append("in_kpp") + headers.append("mn_okpo") + worksheet.append(headers) + for row in rows: + values = [row["pn_name"], row["mn_ogrn"], row["mn_inn"]] + if with_kpp: + values.append(row.get("in_kpp")) + values.append(row["mn_okpo"]) + worksheet.append(values) + buffer = io.BytesIO() + workbook.save(buffer) + workbook.close() + return buffer.getvalue() + + +class RegistersApiTest(APITestCase): + """Tests for registers API.""" + + def setUp(self): + self.user = UserFactory.create_user() + self.admin = UserFactory.create_user(is_staff=True) + self.user_tokens = UserService.get_tokens_for_user(self.user) + self.admin_tokens = UserService.get_tokens_for_user(self.admin) + self.client.credentials( + HTTP_AUTHORIZATION=f"Bearer {self.user_tokens['access']}" + ) + + def authenticate_admin(self): + self.client.credentials( + HTTP_AUTHORIZATION=f"Bearer {self.admin_tokens['access']}" + ) + + def test_registries_and_organizations_are_readable(self): + """Test list/detail endpoints for registry organizations.""" + registry = RegisterFactory(name="Реестр предприятий ОПК") + organization = OrganizationFactory( + pn_name='АО "ОПК"', + mn_ogrn=1027600980990, + mn_inn=7601000086, + mn_okpo="07506197", + ) + RegistryMembershipPeriodFactory(registry=registry, organization=organization) + + registries_response = self.client.get( + reverse("api_v1:registers:registries-list"), + {"page_size": 100}, + ) + organizations_response = self.client.get( + reverse("api_v1:registers:organizations-list"), + {"registry": str(registry.id), "search": "ОПК", "page_size": 100}, + ) + detail_response = self.client.get( + reverse("api_v1:registers:organizations-detail", args=[organization.id]) + ) + registry_specific_response = self.client.get( + reverse( + 
"api_v1:registers:registry-organizations-list", + args=[registry.id], + ), + {"page_size": 100}, + ) + + self.assertEqual(registries_response.status_code, status.HTTP_200_OK) + self.assertEqual(organizations_response.status_code, status.HTTP_200_OK) + self.assertEqual(detail_response.status_code, status.HTTP_200_OK) + self.assertEqual(registry_specific_response.status_code, status.HTTP_200_OK) + self.assertEqual(organizations_response.data["data"][0]["id"], organization.id) + self.assertEqual( + detail_response.data["registries"][0]["registry_name"], registry.name + ) + self.assertEqual( + registry_specific_response.data["data"][0]["id"], + organization.id, + ) + + def test_register_upload_requires_staff_user(self): + """Test non-staff users cannot upload registry files.""" + registry = RegisterFactory(name="Реестр предприятий ОПК") + upload = SimpleUploadedFile("opk.xlsx", build_register_excel([])) + + response = self.client.post( + reverse("api_v1:registers:register-upload"), + {"registry": str(registry.id), "file": upload}, + format="multipart", + ) + + self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN) + + def test_register_upload_starts_celery_import_and_saves_data(self): + """Test registry upload goes through Celery and updates DB.""" + registry = RegisterFactory(name="Реестр предприятий ОПК") + self.authenticate_admin() + upload = SimpleUploadedFile( + "opk.xlsx", + build_register_excel( + [ + { + "pn_name": 'АО "ОПК"', + "mn_ogrn": "1027600980990", + "mn_inn": "7601000086", + "in_kpp": "760401001", + "mn_okpo": "07506197", + } + ] + ), + content_type="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ) + + response = self.client.post( + reverse("api_v1:registers:register-upload"), + { + "registry": str(registry.id), + "actual_date": date(2026, 4, 27).isoformat(), + "file": upload, + }, + format="multipart", + ) + + self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED) + self.assertEqual( + 
response.data["data"]["registry_name"], "Реестр предприятий ОПК" + ) + self.assertEqual(Organization.objects.count(), 1) + self.assertEqual(RegistryMembershipPeriod.objects.count(), 1) + self.assertEqual(RegisterUpload.objects.get().rows_count, 1) + job = BackgroundJob.objects.get(task_id=response.data["data"]["task_id"]) + self.assertEqual(job.status, JobStatus.SUCCESS) diff --git a/tests/apps/user/test_serializers.py b/tests/apps/user/test_serializers.py index e3ada24..5486c06 100644 --- a/tests/apps/user/test_serializers.py +++ b/tests/apps/user/test_serializers.py @@ -191,7 +191,7 @@ class LoginSerializerTest(TestCase): def setUp(self): self.login_data = { - "email": fake.email(), + "username": fake.user_name(), "password": fake.password(length=12, special_chars=False), } @@ -200,16 +200,36 @@ class LoginSerializerTest(TestCase): serializer = LoginSerializer(data=self.login_data) self.assertTrue(serializer.is_valid()) - def test_missing_email(self): - """Test validation fails without email""" + def test_email_login_data_is_still_supported(self): + """Test legacy email login payload is still valid.""" + serializer = LoginSerializer( + data={ + "email": fake.email(), + "password": fake.password(length=12, special_chars=False), + } + ) + self.assertTrue(serializer.is_valid()) + + def test_login_alias_is_supported(self): + """Test login alias can be used instead of username.""" + serializer = LoginSerializer( + data={ + "login": fake.user_name(), + "password": fake.password(length=12, special_chars=False), + } + ) + self.assertTrue(serializer.is_valid()) + + def test_missing_identity(self): + """Test validation fails without username, login or email.""" data = {"password": fake.password(length=12, special_chars=False)} serializer = LoginSerializer(data=data) self.assertFalse(serializer.is_valid()) - self.assertIn("email", serializer.errors) + self.assertIn("username", serializer.errors) def test_missing_password(self): """Test validation fails without password""" - 
data = {"email": fake.email()} + data = {"username": fake.user_name()} serializer = LoginSerializer(data=data) self.assertFalse(serializer.is_valid()) self.assertIn("password", serializer.errors) diff --git a/tests/apps/user/test_views.py b/tests/apps/user/test_views.py index 4e3aa44..5ced57a 100644 --- a/tests/apps/user/test_views.py +++ b/tests/apps/user/test_views.py @@ -7,6 +7,8 @@ from django.urls import reverse from faker import Faker from rest_framework import status from rest_framework.test import APITestCase +from rest_framework_simplejwt.token_blacklist.models import BlacklistedToken +from rest_framework_simplejwt.tokens import RefreshToken from .factories import ProfileFactory, UserFactory @@ -86,7 +88,7 @@ class LoginViewTest(APITestCase): self.password = fake.password(length=12, special_chars=False) self.user = UserFactory.create_user(password=self.password) - self.login_data = {"email": self.user.email, "password": self.password} + self.login_data = {"username": self.user.username, "password": self.password} def test_login_success(self): """Test successful login""" @@ -96,6 +98,30 @@ class LoginViewTest(APITestCase): self.assertIn("refresh", response.data) self.assertIn("access", response.data) + def test_login_success_with_email_for_legacy_clients(self): + """Test legacy email login is still supported.""" + response = self.client.post( + self.login_url, + {"email": self.user.email, "password": self.password}, + format="json", + ) + + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertIn("refresh", response.data) + self.assertIn("access", response.data) + + def test_login_success_with_login_alias(self): + """Test login alias can authenticate by username.""" + response = self.client.post( + self.login_url, + {"login": self.user.username, "password": self.password}, + format="json", + ) + + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertIn("refresh", response.data) + self.assertIn("access", response.data) + 
def test_login_invalid_credentials(self): """Test login fails with invalid credentials""" data = self.login_data.copy() @@ -109,7 +135,7 @@ class LoginViewTest(APITestCase): def test_login_nonexistent_user(self): """Test login fails for nonexistent user""" data = { - "email": fake.unique.email(), + "username": fake.unique.user_name(), "password": fake.password(length=12, special_chars=False), } @@ -118,6 +144,36 @@ class LoginViewTest(APITestCase): self.assertEqual(response.status_code, status.HTTP_401_UNAUTHORIZED) +class LogoutViewTest(APITestCase): + """Tests for LogoutView""" + + def setUp(self): + self.user = UserFactory.create_user() + self.logout_url = reverse("api_v1:user:logout") + self.tokens = UserService.get_tokens_for_user(self.user) + self.client.credentials(HTTP_AUTHORIZATION=f"Bearer {self.tokens['access']}") + + def test_logout_blacklists_refresh_token(self): + """Test logout revokes refresh token when blacklist app is enabled.""" + refresh = RefreshToken(self.tokens["refresh"]) + response = self.client.post( + self.logout_url, + {"refresh": self.tokens["refresh"]}, + format="json", + ) + + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertTrue( + BlacklistedToken.objects.filter(token__jti=refresh["jti"]).exists() + ) + + def test_logout_requires_refresh_token(self): + """Test logout without refresh token fails explicitly.""" + response = self.client.post(self.logout_url, {}, format="json") + + self.assertEqual(response.status_code, status.HTTP_400_BAD_REQUEST) + + class CurrentUserViewTest(APITestCase): """Tests for CurrentUserView""" @@ -293,8 +349,7 @@ class TokenRefreshViewTest(APITestCase): self.assertEqual(response.status_code, status.HTTP_200_OK) self.assertIn("access", response.data) self.assertIn("refresh", response.data) - # New refresh token should be different - # Refresh token may be the same or different depending on implementation + self.assertNotEqual(response.data["refresh"], self.tokens["refresh"]) def 
test_refresh_token_invalid(self): """Test token refresh fails with invalid refresh token"""