feat(parsers): add proverki.gov.ru parser with sync_inspections task
Some checks failed
CI/CD Pipeline / Build Docker Images (push) Blocked by required conditions
CI/CD Pipeline / Push to Gitea Registry (push) Blocked by required conditions
CI/CD Pipeline / Code Quality Checks (push) Failing after 3m55s
CI/CD Pipeline / Run Tests (push) Failing after 3h11m38s

- Add InspectionRecord model with is_federal_law_248, data_year, data_month fields
- Add ProverkiClient with Playwright support for JS-rendered portal
- Add streaming XML parser for large files (>50MB)
- Add sync_inspections task with incremental loading logic
  - Starts from 01.01.2025 if DB is empty
  - Loads both FZ-294 and FZ-248 inspections
  - Stops after 2 consecutive empty months
- Add InspectionService methods: get_last_loaded_period, has_data_for_period
- Add Minpromtorg parsers (certificates, manufacturers)
- Add Django Admin for parser models
- Update README with parsers documentation and changelog
This commit is contained in:
2026-01-21 20:16:25 +01:00
parent f121445313
commit 199d871923
45 changed files with 6810 additions and 97 deletions

View File

@@ -0,0 +1,677 @@
"""Tests for parsers services."""
from django.test import TestCase
from faker import Faker
from apps.parsers.clients.minpromtorg.schemas import IndustrialCertificate, Manufacturer
from apps.parsers.clients.proverki.schemas import Inspection
from apps.parsers.models import (
IndustrialCertificateRecord,
InspectionRecord,
ManufacturerRecord,
ParserLoadLog,
Proxy,
)
from apps.parsers.services import (
IndustrialCertificateService,
InspectionService,
ManufacturerService,
ParserLoadLogService,
ProxyService,
)
from .factories import (
IndustrialCertificateRecordFactory,
InspectionRecordFactory,
ManufacturerRecordFactory,
ParserLoadLogFactory,
ProxyFactory,
)
# Module-level Faker instance configured with the Russian ("ru_RU") locale.
# NOTE(review): no test visible in this file references `fake` — confirm it is
# still needed before keeping it.
fake = Faker("ru_RU")
class ProxyServiceTest(TestCase):
    """Exercise ProxyService lookups, status updates and proxy creation."""

    def test_get_active_proxies_empty(self):
        """With no proxies stored, the active list is empty."""
        self.assertEqual(ProxyService.get_active_proxies(), [])

    def test_get_active_proxies_with_data(self):
        """Only addresses of active proxies are returned."""
        first_active = ProxyFactory(is_active=True)
        second_active = ProxyFactory(is_active=True)
        ProxyFactory(is_active=False)  # inactive entry must be filtered out

        active_addresses = ProxyService.get_active_proxies()

        self.assertEqual(len(active_addresses), 2)
        for expected in (first_active, second_active):
            self.assertIn(expected.address, active_addresses)

    def test_get_active_proxies_or_none_empty(self):
        """The *_or_none variant yields None when nothing is stored."""
        self.assertIsNone(ProxyService.get_active_proxies_or_none())

    def test_get_active_proxies_or_none_with_data(self):
        """The *_or_none variant yields a one-element list for one active proxy."""
        ProxyFactory(is_active=True)

        found = ProxyService.get_active_proxies_or_none()

        self.assertIsNotNone(found)
        self.assertEqual(len(found), 1)

    def test_mark_used(self):
        """mark_used fills in last_used_at on a proxy that had none."""
        record = ProxyFactory()
        self.assertIsNone(record.last_used_at)

        ProxyService.mark_used(record.address)

        record.refresh_from_db()
        self.assertIsNotNone(record.last_used_at)

    def test_mark_failed(self):
        """mark_failed moves fail_count from 0 to 1."""
        record = ProxyFactory(fail_count=0)

        ProxyService.mark_failed(record.address)

        record.refresh_from_db()
        self.assertEqual(record.fail_count, 1)

    def test_deactivate(self):
        """deactivate turns an active proxy inactive."""
        record = ProxyFactory(is_active=True)

        ProxyService.deactivate(record.address)

        record.refresh_from_db()
        self.assertFalse(record.is_active)

    def test_add_proxy(self):
        """add_proxy stores the address and description and marks it active."""
        new_address, new_description = "http://new-proxy:8080", "Test proxy"

        created = ProxyService.add_proxy(new_address, new_description)

        self.assertEqual(created.address, new_address)
        self.assertEqual(created.description, new_description)
        self.assertTrue(created.is_active)

    def test_add_proxy_idempotent(self):
        """Re-adding a known address returns the stored row unchanged."""
        known_address = "http://existing:8080"
        stored = ProxyFactory(address=known_address, description="Original")

        returned = ProxyService.add_proxy(known_address, "New description")

        self.assertEqual(returned.id, stored.id)
        # The description passed on the second call must not overwrite the first.
        self.assertEqual(returned.description, "Original")

    def test_add_proxies(self):
        """Bulk add reports and persists every new address."""
        batch = [f"http://proxy{n}:8080" for n in (1, 2, 3)]

        created_total = ProxyService.add_proxies(batch)

        self.assertEqual(created_total, 3)
        self.assertEqual(Proxy.objects.count(), 3)

    def test_add_proxies_skips_existing(self):
        """Bulk add counts only addresses that were not stored before."""
        ProxyFactory(address="http://existing:8080")
        batch = [
            "http://existing:8080",  # already present
            "http://new:8080",
        ]

        created_total = ProxyService.add_proxies(batch)

        self.assertEqual(created_total, 1)
        self.assertEqual(Proxy.objects.count(), 2)
class ParserLoadLogServiceTest(TestCase):
    """Exercise batch numbering and load-log bookkeeping in ParserLoadLogService."""

    def test_get_next_batch_id_first(self):
        """A source without any logs starts at batch 1."""
        self.assertEqual(
            ParserLoadLogService.get_next_batch_id(ParserLoadLog.Source.INDUSTRIAL),
            1,
        )

    def test_get_next_batch_id_increment(self):
        """The next batch id is one above the highest stored id."""
        source = ParserLoadLog.Source.INDUSTRIAL
        for stored_batch in (5, 3):
            ParserLoadLogFactory(batch_id=stored_batch, source=source)

        next_batch = ParserLoadLogService.get_next_batch_id(source)

        self.assertEqual(next_batch, 6)

    def test_get_next_batch_id_per_source(self):
        """Each source keeps its own batch counter."""
        industrial = ParserLoadLog.Source.INDUSTRIAL
        manufactures = ParserLoadLog.Source.MANUFACTURES
        ParserLoadLogFactory(batch_id=10, source=industrial)
        ParserLoadLogFactory(batch_id=5, source=manufactures)

        next_industrial = ParserLoadLogService.get_next_batch_id(industrial)
        next_manufactures = ParserLoadLogService.get_next_batch_id(manufactures)

        self.assertEqual(next_industrial, 11)
        self.assertEqual(next_manufactures, 6)

    def test_create_load_log(self):
        """create_load_log returns a ParserLoadLog carrying the given values."""
        params = {
            "source": ParserLoadLog.Source.INDUSTRIAL,
            "batch_id": 1,
            "records_count": 100,
            "status": "success",
        }

        created = ParserLoadLogService.create_load_log(**params)

        self.assertIsInstance(created, ParserLoadLog)
        self.assertEqual(created.source, params["source"])
        self.assertEqual(created.batch_id, params["batch_id"])
        self.assertEqual(created.records_count, params["records_count"])
        self.assertEqual(created.status, params["status"])

    def test_mark_failed(self):
        """mark_failed persists the failed status and the error text."""
        entry = ParserLoadLogFactory(status="success")

        ParserLoadLogService.mark_failed(entry, "Connection error")

        entry.refresh_from_db()
        self.assertEqual(entry.status, "failed")
        self.assertEqual(entry.error_message, "Connection error")

    def test_update_records_count(self):
        """update_records_count persists the new count."""
        entry = ParserLoadLogFactory(records_count=0)

        ParserLoadLogService.update_records_count(entry, 250)

        entry.refresh_from_db()
        self.assertEqual(entry.records_count, 250)
class IndustrialCertificateServiceTest(TestCase):
    """Tests for IndustrialCertificateService (saving and lookup of certificates)."""

    def test_save_certificates_empty(self):
        """Saving an empty list returns 0."""
        count = IndustrialCertificateService.save_certificates([], batch_id=1)
        self.assertEqual(count, 0)

    def test_save_certificates(self):
        """Saving five distinct certificates reports and persists five rows."""
        certificates = [
            IndustrialCertificate(
                issue_date="2024-01-01",
                certificate_number=f"CERT-{i}",
                expiry_date="2025-01-01",
                certificate_file_url=f"https://example.com/cert{i}.pdf",
                organisation_name=f"Company {i}",
                inn=f"123456789{i}",
                ogrn=f"123456789012{i}",
            )
            for i in range(5)
        ]
        count = IndustrialCertificateService.save_certificates(certificates, batch_id=1)
        self.assertEqual(count, 5)
        self.assertEqual(IndustrialCertificateRecord.objects.count(), 5)

    def test_save_certificates_with_chunk_size(self):
        """Saving with chunk_size smaller than the input still reports every row."""
        certificates = [
            IndustrialCertificate(
                issue_date="2024-01-01",
                certificate_number=f"CERT-{i}",
                expiry_date="2025-01-01",
                certificate_file_url=f"https://example.com/cert{i}.pdf",
                organisation_name=f"Company {i}",
                inn=f"12345678{i:02d}",
                ogrn=f"1234567890{i:03d}",
            )
            for i in range(10)
        ]
        # 10 items with chunk_size=3 forces several chunks, the last one partial.
        count = IndustrialCertificateService.save_certificates(
            certificates, batch_id=1, chunk_size=3
        )
        self.assertEqual(count, 10)

    def test_find_by_inn(self):
        """find_by_inn matches on INN and optionally narrows by batch_id."""
        IndustrialCertificateRecordFactory(
            inn="1111111111", certificate_number="CERT-A1", load_batch=1
        )
        IndustrialCertificateRecordFactory(
            inn="1111111111", certificate_number="CERT-A2", load_batch=2
        )
        IndustrialCertificateRecordFactory(
            inn="2222222222", certificate_number="CERT-B1", load_batch=1
        )
        results = IndustrialCertificateService.find_by_inn("1111111111")
        self.assertEqual(results.count(), 2)
        results_batch1 = IndustrialCertificateService.find_by_inn(
            "1111111111", batch_id=1
        )
        self.assertEqual(results_batch1.count(), 1)

    def test_find_by_certificate_number(self):
        """find_by_certificate_number returns only the matching record."""
        IndustrialCertificateRecordFactory(certificate_number="CERT-UNIQUE")
        IndustrialCertificateRecordFactory(certificate_number="CERT-OTHER")
        results = IndustrialCertificateService.find_by_certificate_number("CERT-UNIQUE")
        self.assertEqual(results.count(), 1)

    def test_save_certificates_deduplication(self):
        """A second save with an existing certificate_number leaves the row untouched."""
        # Create the initial certificate.
        initial = [
            IndustrialCertificate(
                issue_date="2024-01-01",
                certificate_number="CERT-DEDUP-001",
                expiry_date="2025-01-01",
                certificate_file_url="https://example.com/old.pdf",
                organisation_name="Old Company Name",
                inn="1234567890",
                ogrn="1234567890123",
            )
        ]
        count1 = IndustrialCertificateService.save_certificates(initial, batch_id=1)
        self.assertEqual(count1, 1)
        self.assertEqual(IndustrialCertificateRecord.objects.count(), 1)
        # Save again with the same certificate_number - it should be skipped.
        duplicate = [
            IndustrialCertificate(
                issue_date="2024-06-01",
                certificate_number="CERT-DEDUP-001",  # Same number - will be skipped
                expiry_date="2026-01-01",
                certificate_file_url="https://example.com/new.pdf",
                organisation_name="New Company Name",
                inn="9999999999",
                ogrn="9999999999999",
            )
        ]
        # The return value is deliberately not bound: the test checks the table
        # contents, not the reported count for the skipped duplicate.
        IndustrialCertificateService.save_certificates(duplicate, batch_id=2)
        # Should still be 1 record (duplicate skipped).
        self.assertEqual(IndustrialCertificateRecord.objects.count(), 1)
        # Verify the original data was preserved.
        record = IndustrialCertificateRecord.objects.first()
        self.assertEqual(record.organisation_name, "Old Company Name")
        self.assertEqual(record.inn, "1234567890")
        self.assertEqual(record.load_batch, 1)  # Original batch
class ManufacturerServiceTest(TestCase):
    """Tests for ManufacturerService (saving and lookup of manufacturers)."""

    def test_save_manufacturers_empty(self):
        """Saving an empty list returns 0."""
        count = ManufacturerService.save_manufacturers([], batch_id=1)
        self.assertEqual(count, 0)

    def test_save_manufacturers(self):
        """Saving five distinct manufacturers reports and persists five rows."""
        manufacturers = [
            Manufacturer(
                full_legal_name=f"Company {i} LLC",
                inn=f"123456789{i}",
                ogrn=f"123456789012{i}",
                address=f"Address {i}",
            )
            for i in range(5)
        ]
        count = ManufacturerService.save_manufacturers(manufacturers, batch_id=1)
        self.assertEqual(count, 5)
        self.assertEqual(ManufacturerRecord.objects.count(), 5)

    def test_save_manufacturers_with_chunk_size(self):
        """Saving with chunk_size smaller than the input still reports every row."""
        manufacturers = [
            Manufacturer(
                full_legal_name=f"Company {i}",
                inn=f"12345678{i:02d}",
                ogrn=f"1234567890{i:03d}",
                address=f"Address {i}",
            )
            for i in range(10)
        ]
        # 10 items with chunk_size=3 forces several chunks, the last one partial.
        count = ManufacturerService.save_manufacturers(
            manufacturers, batch_id=1, chunk_size=3
        )
        self.assertEqual(count, 10)

    def test_find_by_inn(self):
        """find_by_inn returns only records with the requested INN."""
        ManufacturerRecordFactory(inn="1111111111", load_batch=1)
        ManufacturerRecordFactory(inn="2222222222", load_batch=1)
        ManufacturerRecordFactory(inn="3333333333", load_batch=2)
        results = ManufacturerService.find_by_inn("1111111111")
        self.assertEqual(results.count(), 1)

    def test_find_by_inn_with_batch_filter(self):
        """find_by_inn with batch_id excludes records from other batches."""
        ManufacturerRecordFactory(inn="4444444444", load_batch=1)
        ManufacturerRecordFactory(inn="5555555555", load_batch=2)
        results_batch1 = ManufacturerService.find_by_inn("4444444444", batch_id=1)
        self.assertEqual(results_batch1.count(), 1)
        # The INN exists only in batch 1, so filtering by batch 2 finds nothing.
        results_batch2 = ManufacturerService.find_by_inn("4444444444", batch_id=2)
        self.assertEqual(results_batch2.count(), 0)

    def test_find_by_ogrn(self):
        """find_by_ogrn returns only the record with the requested OGRN."""
        ManufacturerRecordFactory(ogrn="1234567890123")
        ManufacturerRecordFactory(ogrn="9999999999999")
        results = ManufacturerService.find_by_ogrn("1234567890123")
        self.assertEqual(results.count(), 1)

    def test_save_manufacturers_deduplication(self):
        """A second save with an existing INN leaves the stored row untouched."""
        # Create the initial manufacturer.
        initial = [
            Manufacturer(
                full_legal_name="Old Company Name LLC",
                inn="7777777777",
                ogrn="1234567890123",
                address="Old Address",
            )
        ]
        count1 = ManufacturerService.save_manufacturers(initial, batch_id=1)
        self.assertEqual(count1, 1)
        self.assertEqual(ManufacturerRecord.objects.count(), 1)
        # Save again with the same INN - it should be skipped.
        duplicate = [
            Manufacturer(
                full_legal_name="New Company Name LLC",
                inn="7777777777",  # Same INN - will be skipped
                ogrn="9999999999999",
                address="New Address",
            )
        ]
        # The return value is deliberately not bound: the test checks the table
        # contents, not the reported count for the skipped duplicate.
        ManufacturerService.save_manufacturers(duplicate, batch_id=2)
        # Should still be 1 record (duplicate skipped).
        self.assertEqual(ManufacturerRecord.objects.count(), 1)
        # Verify the original data was preserved.
        record = ManufacturerRecord.objects.first()
        self.assertEqual(record.full_legal_name, "Old Company Name LLC")
        self.assertEqual(record.ogrn, "1234567890123")
        self.assertEqual(record.address, "Old Address")
        self.assertEqual(record.load_batch, 1)  # Original batch
class InspectionServiceTest(TestCase):
    """Tests for InspectionService (saving and lookup of inspections)."""

    def test_save_inspections_empty(self):
        """Saving an empty list returns 0."""
        count = InspectionService.save_inspections([], batch_id=1)
        self.assertEqual(count, 0)

    def test_save_inspections(self):
        """Saving five distinct inspections reports and persists five rows."""
        inspections = [
            Inspection(
                registration_number=f"77202400000{i}",
                inn=f"770{i}234567",
                ogrn=f"102770000000{i}",
                organisation_name=f"Компания {i}",
                control_authority="Роспотребнадзор",
                inspection_type="плановая",
                inspection_form="документарная",
                start_date="2024-01-15",
                end_date="2024-01-30",
                status="завершена",
                legal_basis="294-ФЗ",
                result="нарушения не выявлены",
            )
            for i in range(5)
        ]
        count = InspectionService.save_inspections(inspections, batch_id=1)
        self.assertEqual(count, 5)
        self.assertEqual(InspectionRecord.objects.count(), 5)

    def test_save_inspections_with_chunk_size(self):
        """Saving with chunk_size smaller than the input still reports every row."""
        inspections = [
            Inspection(
                registration_number=f"7720240000{i:02d}",
                inn=f"770{i:02d}34567",
                ogrn=f"10277000000{i:02d}",
                organisation_name=f"Компания {i}",
                control_authority="Ростехнадзор",
                inspection_type="внеплановая",
                inspection_form="выездная",
                start_date="2024-02-01",
                end_date="2024-02-15",
                status="завершена",
                legal_basis="248-ФЗ",
            )
            for i in range(10)
        ]
        # 10 items with chunk_size=3 forces several chunks, the last one partial.
        count = InspectionService.save_inspections(
            inspections, batch_id=1, chunk_size=3
        )
        self.assertEqual(count, 10)

    def test_find_by_inn(self):
        """find_by_inn matches on INN and optionally narrows by batch_id."""
        InspectionRecordFactory(inn="1111111111", load_batch=1)
        InspectionRecordFactory(inn="1111111111", load_batch=2)
        InspectionRecordFactory(inn="2222222222", load_batch=1)
        results = InspectionService.find_by_inn("1111111111")
        self.assertEqual(results.count(), 2)
        results_batch1 = InspectionService.find_by_inn("1111111111", batch_id=1)
        self.assertEqual(results_batch1.count(), 1)

    def test_find_by_registration_number(self):
        """find_by_registration_number returns only the matching record."""
        InspectionRecordFactory(registration_number="772024000001")
        InspectionRecordFactory(registration_number="772024000002")
        results = InspectionService.find_by_registration_number("772024000001")
        self.assertEqual(results.count(), 1)

    def test_find_by_control_authority(self):
        """find_by_control_authority also matches authorities whose name embeds the term."""
        InspectionRecordFactory(control_authority="Роспотребнадзор", load_batch=1)
        # This longer authority name is expected to match the same search term
        # (the test counts it among the two results below).
        InspectionRecordFactory(
            control_authority="Управление Роспотребнадзора по г. Москве", load_batch=1
        )
        InspectionRecordFactory(control_authority="Ростехнадзор", load_batch=1)
        results = InspectionService.find_by_control_authority("Роспотребнадзор")
        self.assertEqual(results.count(), 2)
        results_batch1 = InspectionService.find_by_control_authority(
            "Роспотребнадзор", batch_id=1
        )
        self.assertEqual(results_batch1.count(), 2)

    def test_save_inspections_deduplication(self):
        """A second save with an existing registration_number leaves the row untouched."""
        # Create the initial inspection.
        initial = [
            Inspection(
                registration_number="DEDUP-REG-001",
                inn="1234567890",
                ogrn="1234567890123",
                organisation_name="Old Organisation",
                control_authority="Роспотребнадзор",
                inspection_type="плановая",
                inspection_form="документарная",
                start_date="2024-01-01",
                end_date="2024-01-15",
                status="завершена",
                legal_basis="294-ФЗ",
                result="нарушения не выявлены",
            )
        ]
        count1 = InspectionService.save_inspections(initial, batch_id=1)
        self.assertEqual(count1, 1)
        self.assertEqual(InspectionRecord.objects.count(), 1)
        # Save again with the same registration_number - it should be skipped.
        duplicate = [
            Inspection(
                registration_number="DEDUP-REG-001",  # Same number - will be skipped
                inn="9999999999",
                ogrn="9999999999999",
                organisation_name="New Organisation",
                control_authority="Ростехнадзор",
                inspection_type="внеплановая",
                inspection_form="выездная",
                start_date="2024-06-01",
                end_date="2024-06-30",
                status="в процессе",
                legal_basis="248-ФЗ",
                result="выявлены нарушения",
            )
        ]
        # The return value is deliberately not bound: the test checks the table
        # contents, not the reported count for the skipped duplicate.
        InspectionService.save_inspections(duplicate, batch_id=2)
        # Should still be 1 record (duplicate skipped).
        self.assertEqual(InspectionRecord.objects.count(), 1)
        # Verify the original data was preserved.
        record = InspectionRecord.objects.first()
        self.assertEqual(record.organisation_name, "Old Organisation")
        self.assertEqual(record.inn, "1234567890")
        self.assertEqual(record.control_authority, "Роспотребнадзор")
        self.assertEqual(record.status, "завершена")
        self.assertEqual(record.load_batch, 1)  # Original batch
from django.test import tag
from apps.parsers.clients.base import HTTPClientError
from apps.parsers.clients.minpromtorg.industrial import IndustrialProductionClient
@tag("integration", "slow", "network", "e2e")
class EndToEndIntegrationTest(TestCase):
    """
    End-to-end integration tests of the complete flow.

    Covers: fetch from the API -> parse -> save to the DB -> verify.
    Run with: uv run python run_tests.py tests.apps.parsers.test_services.EndToEndIntegrationTest
    """

    def test_full_flow_fetch_and_save_certificates(self):
        """
        Full E2E test: fetch certificates and store them in the database.

        1. Fetch data from the real API
        2. Create a load log
        3. Save the first N records to the DB
        4. Verify the data was stored correctly

        The test is skipped when the API raises HTTPClientError or returns
        no certificates.
        """
        # 1. Fetch data from the API. Only the network call is guarded, so an
        # HTTPClientError raised by any later step fails the test rather than
        # being reported as a skip.
        print("\n[E2E] Step 1: Fetching certificates from API...")
        try:
            with IndustrialProductionClient(timeout=120) as client:
                all_certificates = client.fetch_certificates()
        except HTTPClientError as e:
            self.skipTest(f"External API unavailable: {e}")
        if not all_certificates:
            self.skipTest("No certificates returned from API")
        print(f"[E2E] Loaded {len(all_certificates)} certificates from API")
        # Only the first 100 records are used for the test.
        certificates = all_certificates[:100]

        # 2. Allocate a batch_id and create the load log.
        print("[E2E] Step 2: Creating load log...")
        batch_id = ParserLoadLogService.get_next_batch_id(
            ParserLoadLog.Source.INDUSTRIAL
        )
        log = ParserLoadLogService.create_load_log(
            source=ParserLoadLog.Source.INDUSTRIAL,
            batch_id=batch_id,
            records_count=0,
        )
        print(f"[E2E] Created batch_id={batch_id}")

        # 3. Save to the DB.
        print("[E2E] Step 3: Saving certificates to database...")
        saved_count = IndustrialCertificateService.save_certificates(
            certificates, batch_id=batch_id
        )
        ParserLoadLogService.update_records_count(log, saved_count)
        print(f"[E2E] Saved {saved_count} certificates")

        # 4. Verify the result.
        print("[E2E] Step 4: Verifying saved data...")
        # Check the row count for this batch.
        db_count = IndustrialCertificateRecord.objects.filter(
            load_batch=batch_id
        ).count()
        self.assertEqual(db_count, saved_count)
        self.assertEqual(db_count, len(certificates))
        # Check the first record field by field.
        first_cert = certificates[0]
        db_record = IndustrialCertificateRecord.objects.filter(
            load_batch=batch_id,
            certificate_number=first_cert.certificate_number,
        ).first()
        self.assertIsNotNone(db_record)
        self.assertEqual(db_record.inn, first_cert.inn)
        self.assertEqual(db_record.ogrn, first_cert.ogrn)
        self.assertEqual(db_record.organisation_name, first_cert.organisation_name)
        # Check the load log.
        log.refresh_from_db()
        self.assertEqual(log.records_count, saved_count)
        self.assertEqual(log.status, "success")
        print("[E2E] ✅ All checks passed!")
        print(f"[E2E] Sample record: {db_record.certificate_number}")
        print(f"[E2E] Organisation: {db_record.organisation_name}")
        print(f"[E2E] INN: {db_record.inn}, OGRN: {db_record.ogrn}")