feat: обновления парсеров, тестов и миграций
Some checks failed
CI/CD Pipeline / Run Tests (push) Failing after 37s
CI/CD Pipeline / Code Quality Checks (push) Failing after 43s
CI/CD Pipeline / Build & Push Images (push) Has been skipped
CI/CD Pipeline / Deploy (dev) (push) Has been skipped
CI/CD Pipeline / Deploy (prod) (push) Has been skipped
CI/CD Pipeline / Code Quality Checks (pull_request) Failing after 0s
CI/CD Pipeline / Run Tests (pull_request) Failing after 0s
CI/CD Pipeline / Build & Push Images (pull_request) Has been skipped
CI/CD Pipeline / Deploy (dev) (pull_request) Has been skipped
CI/CD Pipeline / Deploy (prod) (pull_request) Has been skipped
Some checks failed
CI/CD Pipeline / Run Tests (push) Failing after 37s
CI/CD Pipeline / Code Quality Checks (push) Failing after 43s
CI/CD Pipeline / Build & Push Images (push) Has been skipped
CI/CD Pipeline / Deploy (dev) (push) Has been skipped
CI/CD Pipeline / Deploy (prod) (push) Has been skipped
CI/CD Pipeline / Code Quality Checks (pull_request) Failing after 0s
CI/CD Pipeline / Run Tests (pull_request) Failing after 0s
CI/CD Pipeline / Build & Push Images (pull_request) Has been skipped
CI/CD Pipeline / Deploy (dev) (pull_request) Has been skipped
CI/CD Pipeline / Deploy (prod) (pull_request) Has been skipped
- Обновлены клиенты парсеров (checko, fns, minpromtorg, proverki, zakupki)
- Добавлены новые миграции для моделей
- Расширено покрытие тестами
- Обновлены конфигурации и настройки проекта
- Добавлены утилиты для тестирования

Co-Authored-By: Warp <agent@warp.dev>
This commit is contained in:
@@ -21,6 +21,7 @@ from xml.etree import ( # noqa: S314 - XML parsing with proper error handling
|
||||
|
||||
from apps.parsers.clients.base import BaseHTTPClient, HTTPClientError
|
||||
from apps.parsers.clients.proverki.schemas import Inspection, InspectionPlan
|
||||
from requests.adapters import BaseAdapter
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -65,9 +66,12 @@ class ProverkiClient:
|
||||
|
||||
proxies: list[str] | None = None
|
||||
host: str = DEFAULT_HOST
|
||||
scheme: str = "https"
|
||||
timeout: int = 120
|
||||
temp_dir: str | None = None
|
||||
use_playwright: bool = True # Использовать Playwright как fallback
|
||||
http_adapter: BaseAdapter | None = None
|
||||
STREAMING_THRESHOLD_BYTES = 50 * 1024 * 1024
|
||||
_http_client: BaseHTTPClient | None = field(default=None, repr=False)
|
||||
_playwright: object | None = field(default=None, repr=False)
|
||||
_browser: object | None = field(default=None, repr=False)
|
||||
@@ -84,9 +88,10 @@ class ProverkiClient:
|
||||
"""Ленивая инициализация HTTP клиента."""
|
||||
if self._http_client is None:
|
||||
self._http_client = BaseHTTPClient(
|
||||
base_url=f"https://{self.host}",
|
||||
base_url=f"{self.scheme}://{self.host}",
|
||||
proxies=self.proxies,
|
||||
timeout=self.timeout,
|
||||
adapter=self.http_adapter,
|
||||
headers={
|
||||
"User-Agent": (
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||||
@@ -243,18 +248,25 @@ class ProverkiClient:
|
||||
"""Скачать файл и распарсить его содержимое."""
|
||||
logger.info("Downloading: %s (format=%s)", file_url, file_format)
|
||||
|
||||
# Если это портал - сразу используем Playwright
|
||||
# Если это портал - используем Playwright или делаем прямую попытку скачивания
|
||||
if file_format == "portal" or "/portal/" in file_url:
|
||||
if not self.use_playwright:
|
||||
raise ProverkiClientError(
|
||||
"Портал proverki.gov.ru требует JavaScript. "
|
||||
"Включите use_playwright=True.",
|
||||
url=file_url,
|
||||
)
|
||||
if progress_callback:
|
||||
progress_callback(20, "Навигация по порталу...")
|
||||
content = self._download_from_portal(file_url, progress_callback)
|
||||
self._close_playwright()
|
||||
if self.use_playwright:
|
||||
if progress_callback:
|
||||
progress_callback(20, "Навигация по порталу...")
|
||||
content = self._download_from_portal(file_url, progress_callback)
|
||||
self._close_playwright()
|
||||
else:
|
||||
if progress_callback:
|
||||
progress_callback(20, f"Скачивание {file_url}...")
|
||||
content = self.http_client.download_file(file_url)
|
||||
logger.info("Downloaded %d bytes", len(content))
|
||||
if content[:15].lower().startswith((b"<!doctype html", b"<html")):
|
||||
raise ProverkiClientError(
|
||||
"Сервер вернул HTML вместо данных. "
|
||||
"API proverki.gov.ru требует JavaScript. "
|
||||
"Включите use_playwright=True или получите данные вручную.",
|
||||
url=file_url,
|
||||
)
|
||||
else:
|
||||
if progress_callback:
|
||||
progress_callback(20, f"Скачивание {file_url}...")
|
||||
@@ -508,7 +520,7 @@ class ProverkiClient:
|
||||
inspections = []
|
||||
|
||||
# Для больших файлов используем iterparse (потоковый парсинг)
|
||||
if len(content) > 50 * 1024 * 1024: # > 50 MB
|
||||
if len(content) > self.STREAMING_THRESHOLD_BYTES:
|
||||
logger.info(
|
||||
"Large file detected (%d MB), using streaming parser",
|
||||
len(content) // (1024 * 1024),
|
||||
|
||||
Reference in New Issue
Block a user