feat: обновления парсеров, тестов и миграций
Some checks failed
CI/CD Pipeline / Run Tests (push) Failing after 37s
CI/CD Pipeline / Code Quality Checks (push) Failing after 43s
CI/CD Pipeline / Build & Push Images (push) Has been skipped
CI/CD Pipeline / Deploy (dev) (push) Has been skipped
CI/CD Pipeline / Deploy (prod) (push) Has been skipped
CI/CD Pipeline / Code Quality Checks (pull_request) Failing after 0s
CI/CD Pipeline / Run Tests (pull_request) Failing after 0s
CI/CD Pipeline / Build & Push Images (pull_request) Has been skipped
CI/CD Pipeline / Deploy (dev) (pull_request) Has been skipped
CI/CD Pipeline / Deploy (prod) (pull_request) Has been skipped

- Обновлены клиенты парсеров (checko, fns, minpromtorg, proverki, zakupki)
- Добавлены новые миграции для моделей
- Расширено покрытие тестами
- Обновлены конфигурации и настройки проекта
- Добавлены утилиты для тестирования

Co-Authored-By: Warp <agent@warp.dev>
This commit is contained in:
2026-02-10 10:17:47 +01:00
parent 975d019ba5
commit ee95628a0a
59 changed files with 7292 additions and 2876 deletions

View File

@@ -10,6 +10,7 @@ from dataclasses import dataclass, field
from typing import Any
import requests
from requests.adapters import BaseAdapter
logger = logging.getLogger(__name__)
@@ -68,6 +69,7 @@ class BaseHTTPClient:
proxies: list[str] | None = None
timeout: int = 30
headers: dict[str, str] = field(default_factory=dict)
adapter: BaseAdapter | None = None
def __post_init__(self) -> None:
"""Инициализация после создания dataclass."""
@@ -115,6 +117,17 @@ class BaseHTTPClient:
default_headers.update(self.headers)
session.headers.update(default_headers)
if self.adapter is not None:
session.mount(self.base_url, self.adapter)
if self.base_url.startswith("http://"):
session.mount(
self.base_url.replace("http://", "https://", 1), self.adapter
)
elif self.base_url.startswith("https://"):
session.mount(
self.base_url.replace("https://", "http://", 1), self.adapter
)
return session
def rotate_proxy(self) -> str | None:

View File

@@ -96,6 +96,7 @@ from apps.parsers.clients.checko.schemas.responses import (
TaxDebt,
TaxPenalty,
)
from requests.adapters import BaseAdapter
logger = logging.getLogger(__name__)
@@ -274,6 +275,9 @@ class CheckoClient:
proxies: list[str] | None = None
"""Список прокси (опционально)."""
http_adapter: BaseAdapter | None = None
"""Опциональный HTTP адаптер (для тестов)."""
_http_client: BaseHTTPClient = field(init=False, repr=False)
def __post_init__(self) -> None:
@@ -282,6 +286,7 @@ class CheckoClient:
base_url=self.base_url,
proxies=self.proxies,
timeout=self.timeout,
adapter=self.http_adapter,
# Don't request Brotli compression (br), as it requires an extra dependency
headers={"Accept-Encoding": "gzip, deflate"},
)

View File

@@ -195,7 +195,7 @@ class FNSExcelParser:
"""Преобразует значение ячейки в int или None."""
if value is None:
return None
if isinstance(value, int | float):
if isinstance(value, (int, float)):
return int(value)
if isinstance(value, str):
value = value.strip()

View File

@@ -13,6 +13,7 @@ from io import BytesIO
from apps.parsers.clients.base import BaseHTTPClient, HTTPClientError
from apps.parsers.clients.minpromtorg.schemas import IndustrialCertificate
from openpyxl import load_workbook
from requests.adapters import BaseAdapter
logger = logging.getLogger(__name__)
@@ -51,10 +52,12 @@ class IndustrialProductionClient:
proxies: list[str] | None = None
host: str = DEFAULT_HOST
scheme: str = "https"
api_path: str = DEFAULT_API_PATH
doc_type: str = DEFAULT_DOC_TYPE
query: str = DEFAULT_QUERY
timeout: int = 120
http_adapter: BaseAdapter | None = None
_http_client: BaseHTTPClient | None = field(default=None, repr=False)
def __post_init__(self) -> None:
@@ -66,9 +69,10 @@ class IndustrialProductionClient:
"""Ленивая инициализация HTTP клиента."""
if self._http_client is None:
self._http_client = BaseHTTPClient(
base_url=f"https://{self.host}",
base_url=f"{self.scheme}://{self.host}",
proxies=self.proxies,
timeout=self.timeout,
adapter=self.http_adapter,
headers={
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0",
"Accept": "application/json",
@@ -162,7 +166,7 @@ class IndustrialProductionClient:
)
# URL может быть относительным
if url and not url.startswith("http"):
return f"https://{self.host}{url}"
return f"{self.scheme}://{self.host}{url}"
return url
return None

View File

@@ -13,6 +13,7 @@ from io import BytesIO
from apps.parsers.clients.base import BaseHTTPClient, HTTPClientError
from apps.parsers.clients.minpromtorg.schemas import Manufacturer
from openpyxl import load_workbook
from requests.adapters import BaseAdapter
logger = logging.getLogger(__name__)
@@ -51,10 +52,12 @@ class ManufacturesClient:
proxies: list[str] | None = None
host: str = DEFAULT_HOST
scheme: str = "https"
api_path: str = DEFAULT_API_PATH
doc_type: str = DEFAULT_DOC_TYPE
query: str = DEFAULT_QUERY
timeout: int = 120
http_adapter: BaseAdapter | None = None
_http_client: BaseHTTPClient | None = field(default=None, repr=False)
def __post_init__(self) -> None:
@@ -66,9 +69,10 @@ class ManufacturesClient:
"""Ленивая инициализация HTTP клиента."""
if self._http_client is None:
self._http_client = BaseHTTPClient(
base_url=f"https://{self.host}",
base_url=f"{self.scheme}://{self.host}",
proxies=self.proxies,
timeout=self.timeout,
adapter=self.http_adapter,
headers={
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0",
"Accept": "application/json",
@@ -159,7 +163,7 @@ class ManufacturesClient:
"Latest file: %s (date: %s)", latest_file.get("name"), latest_date
)
if url and not url.startswith("http"):
return f"https://{self.host}{url}"
return f"{self.scheme}://{self.host}{url}"
return url
return None

View File

@@ -21,6 +21,7 @@ from xml.etree import ( # noqa: S314 - XML parsing with proper error handling
from apps.parsers.clients.base import BaseHTTPClient, HTTPClientError
from apps.parsers.clients.proverki.schemas import Inspection, InspectionPlan
from requests.adapters import BaseAdapter
logger = logging.getLogger(__name__)
@@ -65,9 +66,12 @@ class ProverkiClient:
proxies: list[str] | None = None
host: str = DEFAULT_HOST
scheme: str = "https"
timeout: int = 120
temp_dir: str | None = None
use_playwright: bool = True # Использовать Playwright как fallback
http_adapter: BaseAdapter | None = None
STREAMING_THRESHOLD_BYTES = 50 * 1024 * 1024
_http_client: BaseHTTPClient | None = field(default=None, repr=False)
_playwright: object | None = field(default=None, repr=False)
_browser: object | None = field(default=None, repr=False)
@@ -84,9 +88,10 @@ class ProverkiClient:
"""Ленивая инициализация HTTP клиента."""
if self._http_client is None:
self._http_client = BaseHTTPClient(
base_url=f"https://{self.host}",
base_url=f"{self.scheme}://{self.host}",
proxies=self.proxies,
timeout=self.timeout,
adapter=self.http_adapter,
headers={
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
@@ -243,18 +248,25 @@ class ProverkiClient:
"""Скачать файл и распарсить его содержимое."""
logger.info("Downloading: %s (format=%s)", file_url, file_format)
# Если это портал - сразу используем Playwright
# Если это портал - используем Playwright или делаем прямую попытку скачивания
if file_format == "portal" or "/portal/" in file_url:
if not self.use_playwright:
raise ProverkiClientError(
"Портал proverki.gov.ru требует JavaScript. "
"Включите use_playwright=True.",
url=file_url,
)
if progress_callback:
progress_callback(20, "Навигация по порталу...")
content = self._download_from_portal(file_url, progress_callback)
self._close_playwright()
if self.use_playwright:
if progress_callback:
progress_callback(20, "Навигация по порталу...")
content = self._download_from_portal(file_url, progress_callback)
self._close_playwright()
else:
if progress_callback:
progress_callback(20, f"Скачивание {file_url}...")
content = self.http_client.download_file(file_url)
logger.info("Downloaded %d bytes", len(content))
if content[:15].lower().startswith((b"<!doctype html", b"<html")):
raise ProverkiClientError(
"Сервер вернул HTML вместо данных. "
"API proverki.gov.ru требует JavaScript. "
"Включите use_playwright=True или получите данные вручную.",
url=file_url,
)
else:
if progress_callback:
progress_callback(20, f"Скачивание {file_url}...")
@@ -508,7 +520,7 @@ class ProverkiClient:
inspections = []
# Для больших файлов используем iterparse (потоковый парсинг)
if len(content) > 50 * 1024 * 1024: # > 50 MB
if len(content) > self.STREAMING_THRESHOLD_BYTES:
logger.info(
"Large file detected (%d MB), using streaming parser",
len(content) // (1024 * 1024),

View File

@@ -26,6 +26,7 @@ from xml.etree import ( # noqa: S314 - XML parsing with proper error handling
from apps.parsers.clients.base import BaseHTTPClient, HTTPClientError
from apps.parsers.clients.zakupki.schemas import Procurement, ProcurementPlan
from requests.adapters import BaseAdapter
logger = logging.getLogger(__name__)
@@ -85,7 +86,10 @@ class ZakupkiClient:
token: str | None = None # Токен для SOAP API (обязателен для работы)
proxies: list[str] | None = None
host: str = DEFAULT_HOST
scheme: str = "https"
soap_url: str = SOAP_API_URL
timeout: int = 120
http_adapter: BaseAdapter | None = None
_http_client: BaseHTTPClient | None = field(default=None, repr=False)
def __post_init__(self) -> None:
@@ -97,9 +101,10 @@ class ZakupkiClient:
"""Ленивая инициализация HTTP клиента."""
if self._http_client is None:
self._http_client = BaseHTTPClient(
base_url=f"https://{self.host}",
base_url=f"{self.scheme}://{self.host}",
proxies=self.proxies,
timeout=self.timeout,
adapter=self.http_adapter,
headers={
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
@@ -233,7 +238,7 @@ class ZakupkiClient:
# Отправляем SOAP запрос
try:
response = self.http_client.post(
SOAP_API_URL,
self.soap_url,
data=soap_request.encode("utf-8"),
headers={
"Content-Type": "text/xml; charset=utf-8",
@@ -479,13 +484,13 @@ class ZakupkiClient:
if month:
file_name = f"notifications_{region_code}_{year}{month:02d}_{fz_suffix}.zip"
file_url = (
f"https://{self.host}/opendata/download/"
f"{self.scheme}://{self.host}/opendata/download/"
f"notifications/{region_code}/{year}/{month:02d}/{fz_suffix}.zip"
)
else:
file_name = f"notifications_{region_code}_{year}_{fz_suffix}.zip"
file_url = (
f"https://{self.host}/opendata/download/"
f"{self.scheme}://{self.host}/opendata/download/"
f"notifications/{region_code}/{year}/{fz_suffix}.zip"
)