feat: обновления парсеров, тестов и миграций
Some checks failed
CI/CD Pipeline / Run Tests (push) Failing after 37s
CI/CD Pipeline / Code Quality Checks (push) Failing after 43s
CI/CD Pipeline / Build & Push Images (push) Has been skipped
CI/CD Pipeline / Deploy (dev) (push) Has been skipped
CI/CD Pipeline / Deploy (prod) (push) Has been skipped
CI/CD Pipeline / Code Quality Checks (pull_request) Failing after 0s
CI/CD Pipeline / Run Tests (pull_request) Failing after 0s
CI/CD Pipeline / Build & Push Images (pull_request) Has been skipped
CI/CD Pipeline / Deploy (dev) (pull_request) Has been skipped
CI/CD Pipeline / Deploy (prod) (pull_request) Has been skipped
Some checks failed
CI/CD Pipeline / Run Tests (push) Failing after 37s
CI/CD Pipeline / Code Quality Checks (push) Failing after 43s
CI/CD Pipeline / Build & Push Images (push) Has been skipped
CI/CD Pipeline / Deploy (dev) (push) Has been skipped
CI/CD Pipeline / Deploy (prod) (push) Has been skipped
CI/CD Pipeline / Code Quality Checks (pull_request) Failing after 0s
CI/CD Pipeline / Run Tests (pull_request) Failing after 0s
CI/CD Pipeline / Build & Push Images (pull_request) Has been skipped
CI/CD Pipeline / Deploy (dev) (pull_request) Has been skipped
CI/CD Pipeline / Deploy (prod) (pull_request) Has been skipped
- Обновлены клиенты парсеров (checko, fns, minpromtorg, proverki, zakupki)
- Добавлены новые миграции для моделей
- Расширено покрытие тестами
- Обновлены конфигурации и настройки проекта
- Добавлены утилиты для тестирования

Co-Authored-By: Warp <agent@warp.dev>
This commit is contained in:
@@ -10,6 +10,7 @@ from dataclasses import dataclass, field
|
||||
from typing import Any
|
||||
|
||||
import requests
|
||||
from requests.adapters import BaseAdapter
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -68,6 +69,7 @@ class BaseHTTPClient:
|
||||
proxies: list[str] | None = None
|
||||
timeout: int = 30
|
||||
headers: dict[str, str] = field(default_factory=dict)
|
||||
adapter: BaseAdapter | None = None
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
"""Инициализация после создания dataclass."""
|
||||
@@ -115,6 +117,17 @@ class BaseHTTPClient:
|
||||
default_headers.update(self.headers)
|
||||
session.headers.update(default_headers)
|
||||
|
||||
if self.adapter is not None:
|
||||
session.mount(self.base_url, self.adapter)
|
||||
if self.base_url.startswith("http://"):
|
||||
session.mount(
|
||||
self.base_url.replace("http://", "https://", 1), self.adapter
|
||||
)
|
||||
elif self.base_url.startswith("https://"):
|
||||
session.mount(
|
||||
self.base_url.replace("https://", "http://", 1), self.adapter
|
||||
)
|
||||
|
||||
return session
|
||||
|
||||
def rotate_proxy(self) -> str | None:
|
||||
|
||||
@@ -96,6 +96,7 @@ from apps.parsers.clients.checko.schemas.responses import (
|
||||
TaxDebt,
|
||||
TaxPenalty,
|
||||
)
|
||||
from requests.adapters import BaseAdapter
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -274,6 +275,9 @@ class CheckoClient:
|
||||
proxies: list[str] | None = None
|
||||
"""Список прокси (опционально)."""
|
||||
|
||||
http_adapter: BaseAdapter | None = None
|
||||
"""Опциональный HTTP адаптер (для тестов)."""
|
||||
|
||||
_http_client: BaseHTTPClient = field(init=False, repr=False)
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
@@ -282,6 +286,7 @@ class CheckoClient:
|
||||
base_url=self.base_url,
|
||||
proxies=self.proxies,
|
||||
timeout=self.timeout,
|
||||
adapter=self.http_adapter,
|
||||
# Don't request Brotli compression (br) as it requires extra dependency
|
||||
headers={"Accept-Encoding": "gzip, deflate"},
|
||||
)
|
||||
|
||||
@@ -195,7 +195,7 @@ class FNSExcelParser:
|
||||
"""Преобразует значение ячейки в int или None."""
|
||||
if value is None:
|
||||
return None
|
||||
if isinstance(value, int | float):
|
||||
if isinstance(value, (int, float)):
|
||||
return int(value)
|
||||
if isinstance(value, str):
|
||||
value = value.strip()
|
||||
|
||||
@@ -13,6 +13,7 @@ from io import BytesIO
|
||||
from apps.parsers.clients.base import BaseHTTPClient, HTTPClientError
|
||||
from apps.parsers.clients.minpromtorg.schemas import IndustrialCertificate
|
||||
from openpyxl import load_workbook
|
||||
from requests.adapters import BaseAdapter
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -51,10 +52,12 @@ class IndustrialProductionClient:
|
||||
|
||||
proxies: list[str] | None = None
|
||||
host: str = DEFAULT_HOST
|
||||
scheme: str = "https"
|
||||
api_path: str = DEFAULT_API_PATH
|
||||
doc_type: str = DEFAULT_DOC_TYPE
|
||||
query: str = DEFAULT_QUERY
|
||||
timeout: int = 120
|
||||
http_adapter: BaseAdapter | None = None
|
||||
_http_client: BaseHTTPClient | None = field(default=None, repr=False)
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
@@ -66,9 +69,10 @@ class IndustrialProductionClient:
|
||||
"""Ленивая инициализация HTTP клиента."""
|
||||
if self._http_client is None:
|
||||
self._http_client = BaseHTTPClient(
|
||||
base_url=f"https://{self.host}",
|
||||
base_url=f"{self.scheme}://{self.host}",
|
||||
proxies=self.proxies,
|
||||
timeout=self.timeout,
|
||||
adapter=self.http_adapter,
|
||||
headers={
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0",
|
||||
"Accept": "application/json",
|
||||
@@ -162,7 +166,7 @@ class IndustrialProductionClient:
|
||||
)
|
||||
# URL может быть относительным
|
||||
if url and not url.startswith("http"):
|
||||
return f"https://{self.host}{url}"
|
||||
return f"{self.scheme}://{self.host}{url}"
|
||||
return url
|
||||
|
||||
return None
|
||||
|
||||
@@ -13,6 +13,7 @@ from io import BytesIO
|
||||
from apps.parsers.clients.base import BaseHTTPClient, HTTPClientError
|
||||
from apps.parsers.clients.minpromtorg.schemas import Manufacturer
|
||||
from openpyxl import load_workbook
|
||||
from requests.adapters import BaseAdapter
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -51,10 +52,12 @@ class ManufacturesClient:
|
||||
|
||||
proxies: list[str] | None = None
|
||||
host: str = DEFAULT_HOST
|
||||
scheme: str = "https"
|
||||
api_path: str = DEFAULT_API_PATH
|
||||
doc_type: str = DEFAULT_DOC_TYPE
|
||||
query: str = DEFAULT_QUERY
|
||||
timeout: int = 120
|
||||
http_adapter: BaseAdapter | None = None
|
||||
_http_client: BaseHTTPClient | None = field(default=None, repr=False)
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
@@ -66,9 +69,10 @@ class ManufacturesClient:
|
||||
"""Ленивая инициализация HTTP клиента."""
|
||||
if self._http_client is None:
|
||||
self._http_client = BaseHTTPClient(
|
||||
base_url=f"https://{self.host}",
|
||||
base_url=f"{self.scheme}://{self.host}",
|
||||
proxies=self.proxies,
|
||||
timeout=self.timeout,
|
||||
adapter=self.http_adapter,
|
||||
headers={
|
||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/120.0.0.0",
|
||||
"Accept": "application/json",
|
||||
@@ -159,7 +163,7 @@ class ManufacturesClient:
|
||||
"Latest file: %s (date: %s)", latest_file.get("name"), latest_date
|
||||
)
|
||||
if url and not url.startswith("http"):
|
||||
return f"https://{self.host}{url}"
|
||||
return f"{self.scheme}://{self.host}{url}"
|
||||
return url
|
||||
|
||||
return None
|
||||
|
||||
@@ -21,6 +21,7 @@ from xml.etree import ( # noqa: S314 - XML parsing with proper error handling
|
||||
|
||||
from apps.parsers.clients.base import BaseHTTPClient, HTTPClientError
|
||||
from apps.parsers.clients.proverki.schemas import Inspection, InspectionPlan
|
||||
from requests.adapters import BaseAdapter
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -65,9 +66,12 @@ class ProverkiClient:
|
||||
|
||||
proxies: list[str] | None = None
|
||||
host: str = DEFAULT_HOST
|
||||
scheme: str = "https"
|
||||
timeout: int = 120
|
||||
temp_dir: str | None = None
|
||||
use_playwright: bool = True # Использовать Playwright как fallback
|
||||
http_adapter: BaseAdapter | None = None
|
||||
STREAMING_THRESHOLD_BYTES = 50 * 1024 * 1024
|
||||
_http_client: BaseHTTPClient | None = field(default=None, repr=False)
|
||||
_playwright: object | None = field(default=None, repr=False)
|
||||
_browser: object | None = field(default=None, repr=False)
|
||||
@@ -84,9 +88,10 @@ class ProverkiClient:
|
||||
"""Ленивая инициализация HTTP клиента."""
|
||||
if self._http_client is None:
|
||||
self._http_client = BaseHTTPClient(
|
||||
base_url=f"https://{self.host}",
|
||||
base_url=f"{self.scheme}://{self.host}",
|
||||
proxies=self.proxies,
|
||||
timeout=self.timeout,
|
||||
adapter=self.http_adapter,
|
||||
headers={
|
||||
"User-Agent": (
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||||
@@ -243,18 +248,25 @@ class ProverkiClient:
|
||||
"""Скачать файл и распарсить его содержимое."""
|
||||
logger.info("Downloading: %s (format=%s)", file_url, file_format)
|
||||
|
||||
# Если это портал - сразу используем Playwright
|
||||
# Если это портал - используем Playwright или делаем прямую попытку скачивания
|
||||
if file_format == "portal" or "/portal/" in file_url:
|
||||
if not self.use_playwright:
|
||||
raise ProverkiClientError(
|
||||
"Портал proverki.gov.ru требует JavaScript. "
|
||||
"Включите use_playwright=True.",
|
||||
url=file_url,
|
||||
)
|
||||
if progress_callback:
|
||||
progress_callback(20, "Навигация по порталу...")
|
||||
content = self._download_from_portal(file_url, progress_callback)
|
||||
self._close_playwright()
|
||||
if self.use_playwright:
|
||||
if progress_callback:
|
||||
progress_callback(20, "Навигация по порталу...")
|
||||
content = self._download_from_portal(file_url, progress_callback)
|
||||
self._close_playwright()
|
||||
else:
|
||||
if progress_callback:
|
||||
progress_callback(20, f"Скачивание {file_url}...")
|
||||
content = self.http_client.download_file(file_url)
|
||||
logger.info("Downloaded %d bytes", len(content))
|
||||
if content[:15].lower().startswith((b"<!doctype html", b"<html")):
|
||||
raise ProverkiClientError(
|
||||
"Сервер вернул HTML вместо данных. "
|
||||
"API proverki.gov.ru требует JavaScript. "
|
||||
"Включите use_playwright=True или получите данные вручную.",
|
||||
url=file_url,
|
||||
)
|
||||
else:
|
||||
if progress_callback:
|
||||
progress_callback(20, f"Скачивание {file_url}...")
|
||||
@@ -508,7 +520,7 @@ class ProverkiClient:
|
||||
inspections = []
|
||||
|
||||
# Для больших файлов используем iterparse (потоковый парсинг)
|
||||
if len(content) > 50 * 1024 * 1024: # > 50 MB
|
||||
if len(content) > self.STREAMING_THRESHOLD_BYTES:
|
||||
logger.info(
|
||||
"Large file detected (%d MB), using streaming parser",
|
||||
len(content) // (1024 * 1024),
|
||||
|
||||
@@ -26,6 +26,7 @@ from xml.etree import ( # noqa: S314 - XML parsing with proper error handling
|
||||
|
||||
from apps.parsers.clients.base import BaseHTTPClient, HTTPClientError
|
||||
from apps.parsers.clients.zakupki.schemas import Procurement, ProcurementPlan
|
||||
from requests.adapters import BaseAdapter
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -85,7 +86,10 @@ class ZakupkiClient:
|
||||
token: str | None = None # Токен для SOAP API (обязателен для работы)
|
||||
proxies: list[str] | None = None
|
||||
host: str = DEFAULT_HOST
|
||||
scheme: str = "https"
|
||||
soap_url: str = SOAP_API_URL
|
||||
timeout: int = 120
|
||||
http_adapter: BaseAdapter | None = None
|
||||
_http_client: BaseHTTPClient | None = field(default=None, repr=False)
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
@@ -97,9 +101,10 @@ class ZakupkiClient:
|
||||
"""Ленивая инициализация HTTP клиента."""
|
||||
if self._http_client is None:
|
||||
self._http_client = BaseHTTPClient(
|
||||
base_url=f"https://{self.host}",
|
||||
base_url=f"{self.scheme}://{self.host}",
|
||||
proxies=self.proxies,
|
||||
timeout=self.timeout,
|
||||
adapter=self.http_adapter,
|
||||
headers={
|
||||
"User-Agent": (
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||||
@@ -233,7 +238,7 @@ class ZakupkiClient:
|
||||
# Отправляем SOAP запрос
|
||||
try:
|
||||
response = self.http_client.post(
|
||||
SOAP_API_URL,
|
||||
self.soap_url,
|
||||
data=soap_request.encode("utf-8"),
|
||||
headers={
|
||||
"Content-Type": "text/xml; charset=utf-8",
|
||||
@@ -479,13 +484,13 @@ class ZakupkiClient:
|
||||
if month:
|
||||
file_name = f"notifications_{region_code}_{year}{month:02d}_{fz_suffix}.zip"
|
||||
file_url = (
|
||||
f"https://{self.host}/opendata/download/"
|
||||
f"{self.scheme}://{self.host}/opendata/download/"
|
||||
f"notifications/{region_code}/{year}/{month:02d}/{fz_suffix}.zip"
|
||||
)
|
||||
else:
|
||||
file_name = f"notifications_{region_code}_{year}_{fz_suffix}.zip"
|
||||
file_url = (
|
||||
f"https://{self.host}/opendata/download/"
|
||||
f"{self.scheme}://{self.host}/opendata/download/"
|
||||
f"notifications/{region_code}/{year}/{fz_suffix}.zip"
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user