feat: обновления парсеров, тестов и миграций
Some checks failed
CI/CD Pipeline / Run Tests (push) Failing after 37s
CI/CD Pipeline / Code Quality Checks (push) Failing after 43s
CI/CD Pipeline / Build & Push Images (push) Has been skipped
CI/CD Pipeline / Deploy (dev) (push) Has been skipped
CI/CD Pipeline / Deploy (prod) (push) Has been skipped
CI/CD Pipeline / Code Quality Checks (pull_request) Failing after 0s
CI/CD Pipeline / Run Tests (pull_request) Failing after 0s
CI/CD Pipeline / Build & Push Images (pull_request) Has been skipped
CI/CD Pipeline / Deploy (dev) (pull_request) Has been skipped
CI/CD Pipeline / Deploy (prod) (pull_request) Has been skipped

- Обновлены клиенты парсеров (checko, fns, minpromtorg, proverki, zakupki)
- Добавлены новые миграции для моделей
- Расширено покрытие тестами
- Обновлены конфигурации и настройки проекта
- Добавлены утилиты для тестирования

Co-Authored-By: Warp <agent@warp.dev>
This commit is contained in:
2026-02-10 10:17:47 +01:00
parent 975d019ba5
commit ee95628a0a
59 changed files with 7292 additions and 2876 deletions

View File

@@ -21,6 +21,7 @@ from xml.etree import ( # noqa: S314 - XML parsing with proper error handling
from apps.parsers.clients.base import BaseHTTPClient, HTTPClientError
from apps.parsers.clients.proverki.schemas import Inspection, InspectionPlan
from requests.adapters import BaseAdapter
logger = logging.getLogger(__name__)
@@ -65,9 +66,12 @@ class ProverkiClient:
proxies: list[str] | None = None
host: str = DEFAULT_HOST
scheme: str = "https"
timeout: int = 120
temp_dir: str | None = None
use_playwright: bool = True # Использовать Playwright как fallback
http_adapter: BaseAdapter | None = None
STREAMING_THRESHOLD_BYTES = 50 * 1024 * 1024
_http_client: BaseHTTPClient | None = field(default=None, repr=False)
_playwright: object | None = field(default=None, repr=False)
_browser: object | None = field(default=None, repr=False)
@@ -84,9 +88,10 @@ class ProverkiClient:
"""Ленивая инициализация HTTP клиента."""
if self._http_client is None:
self._http_client = BaseHTTPClient(
base_url=f"https://{self.host}",
base_url=f"{self.scheme}://{self.host}",
proxies=self.proxies,
timeout=self.timeout,
adapter=self.http_adapter,
headers={
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
@@ -243,18 +248,25 @@ class ProverkiClient:
"""Скачать файл и распарсить его содержимое."""
logger.info("Downloading: %s (format=%s)", file_url, file_format)
# Если это портал - сразу используем Playwright
# Если это портал - используем Playwright или делаем прямую попытку скачивания
if file_format == "portal" or "/portal/" in file_url:
if not self.use_playwright:
raise ProverkiClientError(
"Портал proverki.gov.ru требует JavaScript. "
"Включите use_playwright=True.",
url=file_url,
)
if progress_callback:
progress_callback(20, "Навигация по порталу...")
content = self._download_from_portal(file_url, progress_callback)
self._close_playwright()
if self.use_playwright:
if progress_callback:
progress_callback(20, "Навигация по порталу...")
content = self._download_from_portal(file_url, progress_callback)
self._close_playwright()
else:
if progress_callback:
progress_callback(20, f"Скачивание {file_url}...")
content = self.http_client.download_file(file_url)
logger.info("Downloaded %d bytes", len(content))
if content[:15].lower().startswith((b"<!doctype html", b"<html")):
raise ProverkiClientError(
"Сервер вернул HTML вместо данных. "
"API proverki.gov.ru требует JavaScript. "
"Включите use_playwright=True или получите данные вручную.",
url=file_url,
)
else:
if progress_callback:
progress_callback(20, f"Скачивание {file_url}...")
@@ -508,7 +520,7 @@ class ProverkiClient:
inspections = []
# Для больших файлов используем iterparse (потоковый парсинг)
if len(content) > 50 * 1024 * 1024: # > 50 MB
if len(content) > self.STREAMING_THRESHOLD_BYTES:
logger.info(
"Large file detected (%d MB), using streaming parser",
len(content) // (1024 * 1024),