feat(parsers): add proverki.gov.ru parser with sync_inspections task
Some checks failed
CI/CD Pipeline / Build Docker Images (push) Blocked by required conditions
CI/CD Pipeline / Push to Gitea Registry (push) Blocked by required conditions
CI/CD Pipeline / Code Quality Checks (push) Failing after 3m55s
CI/CD Pipeline / Run Tests (push) Failing after 3h11m38s

- Add InspectionRecord model with is_federal_law_248, data_year, data_month fields
- Add ProverkiClient with Playwright support for JS-rendered portal
- Add streaming XML parser for large files (>50MB)
- Add sync_inspections task with incremental loading logic
  - Starts from 01.01.2025 if DB is empty
  - Loads both FZ-294 and FZ-248 inspections
  - Stops after 2 consecutive empty months
- Add InspectionService methods: get_last_loaded_period, has_data_for_period
- Add Minpromtorg parsers (certificates, manufacturers)
- Add Django Admin for parser models
- Update README with parsers documentation and changelog
This commit is contained in:
2026-01-21 20:16:25 +01:00
parent f121445313
commit 199d871923
45 changed files with 6810 additions and 97 deletions

View File

@@ -0,0 +1,347 @@
"""Factories for parsers tests."""
import random
from datetime import timedelta
from django.utils import timezone
import factory
from apps.parsers.models import (
IndustrialCertificateRecord,
InspectionRecord,
ManufacturerRecord,
ParserLoadLog,
Proxy,
)
# === Хелперы для генерации реалистичных данных ===
def generate_inn_legal() -> str:
    """Generate a 10-digit INN (taxpayer number) of a legal entity.

    Layout is ``RRIIXXXXXC``: ``RR`` is a region code drawn from a fixed list,
    ``II`` a two-digit tax inspection code, ``XXXXX`` a serial number and ``C``
    the control digit.

    The control digit uses the standard checksum for 10-digit INNs: the first
    nine digits are multiplied by the weights 2, 4, 10, 3, 5, 9, 4, 6, 8, the
    products are summed, and the sum is reduced modulo 11 and then modulo 10.
    This keeps generated values acceptable to ordinary INN validators, in line
    with how ``generate_ogrn`` computes a real control digit.

    Returns:
        A string of exactly 10 decimal digits.
    """
    weights = (2, 4, 10, 3, 5, 9, 4, 6, 8)
    region = random.choice(["77", "78", "50", "52", "63", "16", "66", "74", "54", "61"])
    inspection = str(random.randint(1, 99)).zfill(2)
    number = str(random.randint(1, 99999)).zfill(5)
    base = region + inspection + number
    # zip() stops after the nine base digits, one weight per digit.
    control = sum(w * int(d) for w, d in zip(weights, base)) % 11 % 10
    return f"{base}{control}"
def generate_ogrn() -> str:
    """Generate a 13-digit OGRN (state registration number) of a legal entity.

    Layout is ``SYYRRIIXXXXXC``:

    * ``S`` - entity sign, always ``1`` here;
    * ``YY`` - two-digit registration year, ``02`` to ``24``;
    * ``RR`` - region code drawn from a fixed list;
    * ``II`` - two-digit inspection code;
    * ``XXXXX`` - five-digit record number;
    * ``C`` - control digit: the 12-digit prefix modulo 11, then modulo 10
      (so a remainder of 10 becomes 0).

    Returns:
        A string of exactly 13 decimal digits.
    """
    region_codes = ("77", "78", "50", "52", "63", "16", "66", "74", "54", "61")

    year_part = f"{random.randint(2, 24):02d}"
    region_part = random.choice(region_codes)
    inspection_part = f"{random.randint(1, 99):02d}"
    record_part = f"{random.randint(1, 99999):05d}"

    prefix = "".join(("1", year_part, region_part, inspection_part, record_part))
    check_digit = int(prefix) % 11 % 10
    return f"{prefix}{check_digit}"
def generate_certificate_number() -> str:
    """Generate an industrial-production certificate number.

    The result has the form ``<prefix>-<year>-<NNNNN>``: a prefix drawn from a
    fixed set, a year between 2020 and 2025 inclusive, and a serial number
    zero-padded to five digits.

    Returns:
        The certificate number, e.g. ``"ПП-2023-00042"``.
    """
    series = random.choice(("ПП", "СПП", "ЗППП"))
    issued_in = random.randint(2020, 2025)
    serial = random.randint(1, 99999)
    return "-".join((series, str(issued_in), str(serial).zfill(5)))
def generate_company_name() -> str:
    """Generate a plausible Russian company name.

    The name is a legal form followed by a quoted title. The title is an
    industry stem plus an optional suffix; in roughly 30% of cases a city name
    is appended after a hyphen.

    Returns:
        A string such as ``'ООО "Промтех Групп-Казань"'``.
    """
    legal_forms = ("ООО", "АО", "ПАО", "ЗАО", "ОАО")
    stems = (
        "Металлург",
        "Промтех",
        "Машстрой",
        "Агропром",
        "Нефтегаз",
        "Химпром",
        "Электроника",
        "Автоком",
        "Стройинвест",
        "Техносервис",
        "Приборостроение",
        "Энергомаш",
        "Станкопром",
        "Спецсталь",
        "Трубопрокат",
    )
    endings = ("", " Групп", " Холдинг", " Инвест", " Трейд", " Индустрия", " Про")
    city_names = (
        "Москва",
        "Санкт-Петербург",
        "Новосибирск",
        "Екатеринбург",
        "Казань",
        "Челябинск",
    )

    chosen_form = random.choice(legal_forms)
    title = random.choice(stems) + random.choice(endings)
    # The probability roll comes first; a city is only drawn when it succeeds.
    if random.random() > 0.7:
        title = f"{title}-{random.choice(city_names)}"
    return f'{chosen_form} "{title}"'
def generate_legal_address() -> str:
    """Generate a legal (registered) address as a comma-separated string.

    Parts, in order: a six-digit postal code, a region, an optional city
    (absent for the two federal cities in the list), a street with a building
    number, and, about half of the time, an office number.

    Returns:
        The address, e.g. ``"123045, Московская обл., г. Подольск, ул. Мира, д. 7"``.
    """
    region_options = (
        ("г. Москва", ""),
        ("г. Санкт-Петербург", ""),
        ("Московская обл.", "г. Подольск"),
        ("Свердловская обл.", "г. Екатеринбург"),
        ("Республика Татарстан", "г. Казань"),
        ("Челябинская обл.", "г. Челябинск"),
        ("Новосибирская обл.", "г. Новосибирск"),
        ("Нижегородская обл.", "г. Нижний Новгород"),
    )
    street_kinds = ("ул.", "пр-т", "пер.", "наб.", "ш.")
    street_titles = (
        "Ленина",
        "Мира",
        "Советская",
        "Промышленная",
        "Заводская",
        "Первомайская",
        "Октябрьская",
        "Гагарина",
        "Кирова",
        "Строителей",
    )

    region_name, city_name = random.choice(region_options)
    street_line = " ".join((random.choice(street_kinds), random.choice(street_titles)))
    house_no = random.randint(1, 150)

    office_no = None
    if random.random() > 0.5:
        office_no = random.randint(1, 500)

    # Three digits in 100..199, a literal "0", then two digits in 10..99.
    postal_code = "{}0{}".format(random.randint(100, 199), random.randint(10, 99))

    segments = [postal_code, region_name]
    if city_name:
        segments.append(city_name)
    segments.append(f"{street_line}, д. {house_no}")
    if office_no is not None:
        segments.append(f"оф. {office_no}")
    return ", ".join(segments)
def generate_proxy_address() -> str:
    """Generate a proxy server URL of the form ``<protocol>://<host>:<port>``.

    The host is either a random dotted IPv4-style address or one of two
    templated host names; protocol and port come from fixed lists.

    Returns:
        The proxy URL, e.g. ``"socks5://proxy7.example.com:1080"``.
    """
    # All three host candidates are generated up front; one is picked below.
    ip_host = ".".join(str(random.randint(1, 255)) for _ in range(4))
    named_host = f"proxy{random.randint(1, 50)}.example.com"
    regional_host = f"ru{random.randint(1, 20)}.proxy-service.net"

    scheme = random.choice(("http", "https", "socks5"))
    chosen_host = random.choice((ip_host, named_host, regional_host))
    chosen_port = random.choice((8080, 3128, 8888, 1080, 8000, 9050))
    return f"{scheme}://{chosen_host}:{chosen_port}"
# === Фабрики ===
class ProxyFactory(factory.django.DjangoModelFactory):
    """Test factory for the ``Proxy`` model.

    Every instance gets a freshly generated proxy URL, is marked active with a
    zero failure counter, and carries a description drawn at random from a
    fixed set of labels.
    """

    class Meta:
        model = Proxy

    address = factory.LazyFunction(generate_proxy_address)
    is_active = True
    fail_count = 0
    # The label does not depend on the object being built, so a zero-argument
    # callable is sufficient.
    description = factory.LazyFunction(
        lambda: random.choice(
            (
                "Datacenter RU",
                "Residential RU",
                "Mobile RU",
                "Datacenter EU",
                "Premium proxy",
                "Backup proxy",
            )
        )
    )
class ParserLoadLogFactory(factory.django.DjangoModelFactory):
    """Test factory for the ``ParserLoadLog`` model.

    Produces a log entry with status ``"success"`` and an empty error message.
    The source is picked at random from the INDUSTRIAL and MANUFACTURES
    members of ``ParserLoadLog.Source``, and the record count is a random
    integer between 100 and 5000.
    """

    class Meta:
        model = ParserLoadLog

    # Batch ids are the factory sequence counter shifted by one.
    batch_id = factory.Sequence(lambda counter: counter + 1)
    source = factory.LazyFunction(
        lambda: random.choice(
            (
                ParserLoadLog.Source.INDUSTRIAL,
                ParserLoadLog.Source.MANUFACTURES,
            )
        )
    )
    records_count = factory.LazyFunction(lambda: random.randint(100, 5000))
    status = "success"
    error_message = ""
class IndustrialCertificateRecordFactory(factory.django.DjangoModelFactory):
    """Test factory for the ``IndustrialCertificateRecord`` model.

    Dates are stored as ``DD.MM.YYYY`` strings: the issue date lies 30 to 365
    days in the past and the expiry date 180 to 730 days in the future, both
    relative to the moment the instance is built. The file URL is derived from
    the generated certificate number.
    """

    class Meta:
        model = IndustrialCertificateRecord

    # Load batches are the factory sequence counter shifted by one.
    load_batch = factory.Sequence(lambda counter: counter + 1)
    issue_date = factory.LazyFunction(
        lambda: (timezone.now() - timedelta(days=random.randint(30, 365))).strftime(
            "%d.%m.%Y"
        )
    )
    certificate_number = factory.LazyFunction(generate_certificate_number)
    expiry_date = factory.LazyFunction(
        lambda: (timezone.now() + timedelta(days=random.randint(180, 730))).strftime(
            "%d.%m.%Y"
        )
    )
    # Hyphens in the certificate number become underscores in the file name.
    certificate_file_url = factory.LazyAttribute(
        lambda record: f"https://minpromtorg.gov.ru/docs/certificates/"
        f"{record.certificate_number.replace('-', '_')}.pdf"
    )
    organisation_name = factory.LazyFunction(generate_company_name)
    inn = factory.LazyFunction(generate_inn_legal)
    ogrn = factory.LazyFunction(generate_ogrn)
class ManufacturerRecordFactory(factory.django.DjangoModelFactory):
    """Test factory for the ``ManufacturerRecord`` model.

    Fills the legal name, INN, OGRN and address from this module's generator
    helpers, so every built instance gets freshly randomised values.
    """

    class Meta:
        model = ManufacturerRecord

    # Load batches are the factory sequence counter shifted by one.
    load_batch = factory.Sequence(lambda counter: counter + 1)
    full_legal_name = factory.LazyFunction(generate_company_name)
    inn = factory.LazyFunction(generate_inn_legal)
    ogrn = factory.LazyFunction(generate_ogrn)
    address = factory.LazyFunction(generate_legal_address)
def generate_registration_number() -> str:
    """Generate an inspection registration number.

    The value is a two-digit region code, a four-digit year between 2020 and
    2025 inclusive, and a serial number zero-padded to six digits,
    concatenated without separators (for example ``772020123456``).

    Returns:
        A 12-character string of decimal digits.
    """
    region_code = random.choice(
        ("77", "78", "50", "52", "63", "16", "66", "74", "54", "61")
    )
    year_part = str(random.randint(2020, 2025))
    serial_part = str(random.randint(1, 999999)).zfill(6)
    return region_code + year_part + serial_part
def generate_control_authority() -> str:
    """Generate the name of a supervisory (control) authority.

    The authority name is always present. A department prefix is put in front
    of it with probability of about 0.7, and a regional qualifier is appended
    with probability of about 0.6. The parts are joined with single spaces.

    Returns:
        A string such as ``"Управление Роструд по г. Москве"`` or just
        ``"Роструд"``.
    """
    agency_names = (
        "Роспотребнадзор",
        "Ростехнадзор",
        "Росприроднадзор",
        "МЧС России",
        "Роструд",
        "ФНС России",
        "ФАС России",
        "Россельхознадзор",
        "Роскомнадзор",
        "Росздравнадзор",
    )
    department_prefixes = (
        "Управление",
        "Территориальное управление",
        "Межрегиональное управление",
        "Отдел",
    )
    regional_qualifiers = (
        "по г. Москве",
        "по Санкт-Петербургу",
        "по Московской области",
        "по Свердловской области",
        "по Республике Татарстан",
        "по Челябинской области",
        "по Новосибирской области",
    )

    agency = random.choice(agency_names)

    # Each optional part rolls its probability first and draws only on success.
    department = ""
    if random.random() > 0.3:
        department = random.choice(department_prefixes)

    qualifier = ""
    if random.random() > 0.4:
        qualifier = random.choice(regional_qualifiers)

    # Skipping empty parts yields the same four shapes as an if/elif chain.
    return " ".join(part for part in (department, agency, qualifier) if part)
class InspectionRecordFactory(factory.django.DjangoModelFactory):
    """Test factory for the ``InspectionRecord`` model.

    Identifiers, organisation name and control authority come from this
    module's generator helpers. Categorical fields are drawn at random from
    small fixed lists. Dates are ``YYYY-MM-DD`` strings: the start date lies
    1 to 180 days in the past and the end date 1 to 30 days in the future,
    both relative to the moment the instance is built.
    """

    class Meta:
        model = InspectionRecord

    # Load batches are the factory sequence counter shifted by one.
    load_batch = factory.Sequence(lambda counter: counter + 1)
    registration_number = factory.LazyFunction(generate_registration_number)
    inn = factory.LazyFunction(generate_inn_legal)
    ogrn = factory.LazyFunction(generate_ogrn)
    organisation_name = factory.LazyFunction(generate_company_name)
    control_authority = factory.LazyFunction(generate_control_authority)
    inspection_type = factory.LazyFunction(
        lambda: random.choice(("плановая", "внеплановая"))
    )
    inspection_form = factory.LazyFunction(
        lambda: random.choice(
            ("документарная", "выездная", "документарная и выездная")
        )
    )
    start_date = factory.LazyFunction(
        lambda: (timezone.now() - timedelta(days=random.randint(1, 180))).strftime(
            "%Y-%m-%d"
        )
    )
    end_date = factory.LazyFunction(
        lambda: (timezone.now() + timedelta(days=random.randint(1, 30))).strftime(
            "%Y-%m-%d"
        )
    )
    status = factory.LazyFunction(
        lambda: random.choice(("завершена", "в процессе", "запланирована"))
    )
    legal_basis = factory.LazyFunction(
        lambda: random.choice(("294-ФЗ", "248-ФЗ", "184-ФЗ"))
    )
    # About 30% of records get an empty result outright; the rest draw from a
    # list that itself contains an empty option.
    result = factory.LazyFunction(
        lambda: ""
        if random.random() <= 0.3
        else random.choice(("нарушения не выявлены", "выявлены нарушения", ""))
    )