feat: add parser source dashboard and scheduling
All checks were successful
CI/CD Pipeline / Code Quality Checks (pull_request) Successful in 1m6s
CI/CD Pipeline / Run Tests (pull_request) Successful in 1m18s
CI/CD Pipeline / Build Docker Images (pull_request) Has been skipped
CI/CD Pipeline / Push to Gitea Registry (pull_request) Has been skipped

This commit is contained in:
2026-04-27 23:36:28 +02:00
parent 199d871923
commit 44355deeb3
96 changed files with 15015 additions and 309 deletions

View File

@@ -1,19 +1,26 @@
"""Tests for parsers clients."""
import json
import zipfile
from io import BytesIO
from unittest.mock import patch
from django.test import TestCase, tag
from faker import Faker
from openpyxl import Workbook
from unittest.mock import Mock, patch
from apps.parsers.clients.base import BaseHTTPClient, HTTPClientError
from apps.parsers.clients.common import (
GenericParserItem,
StructuredDataClient,
StructuredDataClientError,
)
from apps.parsers.clients.minpromtorg.industrial import IndustrialProductionClient
from apps.parsers.clients.minpromtorg.manufactures import ManufacturesClient
from apps.parsers.clients.minpromtorg.schemas import IndustrialCertificate, Manufacturer
from apps.parsers.clients.proverki import ProverkiClient
from apps.parsers.clients.proverki.schemas import Inspection
from apps.parsers.clients.trudvsem import TrudvsemClient
from apps.parsers.models import ParserLoadLog
from django.test import TestCase, tag
from faker import Faker
from openpyxl import Workbook
fake = Faker("ru_RU")
@@ -63,6 +70,428 @@ class BaseHTTPClientTest(TestCase):
proxy = client.current_proxy
self.assertEqual(proxy, "http://proxy:8080")
def test_download_file_rejects_large_content_length_before_body_read(self):
    """Reject an oversized download based on its Content-Length header.

    The stubbed response declares 10 bytes while the caller allows 5, so
    ``download_file`` is expected to raise ``HTTPClientError`` without
    iterating the body, and to close the response exactly once.
    """
    http_client = BaseHTTPClient(base_url="https://example.com")
    # Constructor kwargs set the same attributes as one-by-one assignment.
    oversized = Mock(
        ok=True,
        headers={"Content-Length": "10"},
        close=Mock(),
    )
    oversized.iter_content.return_value = [b"too-large"]
    http_client.session.get = Mock(return_value=oversized)

    with self.assertRaises(HTTPClientError):
        http_client.download_file("/data.csv", max_size_bytes=5)

    # The body must never be streamed once the declared size is too big.
    oversized.iter_content.assert_not_called()
    oversized.close.assert_called_once()
def test_download_file_passes_ssl_verification_flag(self):
    """Forward ``verify_ssl=False`` to the session call as ``verify=False``.

    Also checks that the single streamed chunk is returned as the file
    content and that the session is called exactly once with the full URL.
    """
    http_client = BaseHTTPClient(base_url="https://example.com", verify_ssl=False)
    stub_response = Mock(
        ok=True,
        headers={"Content-Length": "4"},
        close=Mock(),
    )
    stub_response.iter_content.return_value = [b"data"]
    http_client.session.get = Mock(return_value=stub_response)

    downloaded = http_client.download_file("/data.csv")

    self.assertEqual(downloaded, b"data")
    # timeout=30 is presumably the client's default timeout; confirm in BaseHTTPClient.
    expected_kwargs = {"stream": True, "timeout": 30, "verify": False}
    http_client.session.get.assert_called_once_with(
        "https://example.com/data.csv",
        **expected_kwargs,
    )
class StructuredDataClientTest(TestCase):
    """Exercise ``StructuredDataClient.fetch_records`` on varied inputs.

    The cases feed JSON, CSV, XML, ZIP and HTML content (or a stubbed
    download/POST) into the client and assert on the normalized fields of
    the returned records.
    """

    def test_parse_json_records(self):
        """Parse one JSON record and check its normalized fields."""
        document = {
            "data": [
                {
                    "id": "FIN-1",
                    "inn": "1234567890",
                    "ogrn": "1234567890123",
                    "name": "Test Company",
                    "amount": "10 500,50",
                    "date": "2024",
                }
            ]
        }
        parser = StructuredDataClient(source=ParserLoadLog.Source.FNS_FINANCIAL)
        raw = json.dumps(document).encode("utf-8")

        parsed = parser.fetch_records(content=raw, file_name="data.json")

        self.assertEqual(len(parsed), 1)
        item = parsed[0]
        self.assertIsInstance(item, GenericParserItem)
        self.assertEqual(item.external_id, "FIN-1")
        self.assertEqual(item.inn, "1234567890")
        # Space-grouped, comma-decimal input is expected as a plain decimal.
        self.assertEqual(str(item.amount), "10500.50")

    def test_parse_csv_records(self):
        """Parse a cp1251 CSV whose header row uses Russian column names."""
        header_line = "реестровый номер;ИНН;наименование;сумма\n"
        data_line = "RN-1;1234567890;ООО Тест;1000.00\n"
        raw = (header_line + data_line).encode("cp1251")
        parser = StructuredDataClient(source=ParserLoadLog.Source.UNFAIR_SUPPLIERS)

        parsed = parser.fetch_records(content=raw, file_name="data.csv")

        self.assertEqual(len(parsed), 1)
        item = parsed[0]
        self.assertEqual(item.external_id, "RN-1")
        self.assertEqual(item.organisation_name, "ООО Тест")

    def test_parse_xml_records_under_wrapper(self):
        """Find repeated ``<item>`` records nested under a wrapper element."""
        item_nodes = [
            "<item><id>XML-1</id><inn>123</inn><name>ООО А</name></item>",
            "<item><id>XML-2</id><inn>456</inn><name>ООО Б</name></item>",
        ]
        raw = ("<root><items>" + "".join(item_nodes) + "</items></root>").encode()
        parser = StructuredDataClient(source=ParserLoadLog.Source.FNS_FINANCIAL)

        parsed = parser.fetch_records(content=raw, file_name="data.xml")

        self.assertEqual(len(parsed), 2)
        self.assertEqual(parsed[0].external_id, "XML-1")
        self.assertEqual(parsed[1].external_id, "XML-2")

    def test_json_payload_preserves_nested_objects(self):
        """Keep nested dicts and lists intact inside the record payload."""
        document = {
            "data": [
                {
                    "id": "NESTED-1",
                    "company": {"inn": "123", "name": "ООО А"},
                    "amounts": [1, 2],
                }
            ]
        }
        parser = StructuredDataClient(source=ParserLoadLog.Source.FNS_FINANCIAL)
        raw = json.dumps(document).encode("utf-8")

        parsed = parser.fetch_records(content=raw, file_name="data.json")
        item = parsed[0]

        self.assertEqual(item.payload["company"], {"inn": "123", "name": "ООО А"})
        self.assertEqual(item.payload["amounts"], [1, 2])

    def test_fallback_external_id_is_stable_after_reordering(self):
        """Generated external_id must not depend on a row's position.

        The same two id-less rows are parsed in both orders; each row has
        to receive the same external_id in either run.
        """
        parser = StructuredDataClient(source=ParserLoadLog.Source.FNS_FINANCIAL)
        original_order = "name;amount\nООО А;10\nООО Б;20\n".encode()
        swapped_order = "name;amount\nООО Б;20\nООО А;10\n".encode()

        first = parser.fetch_records(content=original_order, file_name="data.csv")
        second = parser.fetch_records(content=swapped_order, file_name="data.csv")

        self.assertEqual(first[0].external_id, second[1].external_id)
        self.assertEqual(first[1].external_id, second[0].external_id)

    def test_zip_rejects_too_many_supported_files(self):
        """Raise when a ZIP holds more supported files than ``max_zip_entries``."""
        buffer = BytesIO()
        members = (
            ("one.csv", "id\n1\n"),
            ("two.csv", "id\n2\n"),
        )
        with zipfile.ZipFile(buffer, "w") as archive:
            for member_name, member_body in members:
                archive.writestr(member_name, member_body)
        parser = StructuredDataClient(
            source=ParserLoadLog.Source.FNS_FINANCIAL,
            max_zip_entries=1,
        )
        zip_bytes = buffer.getvalue()

        # Two CSV members against a limit of one must be refused.
        with self.assertRaises(StructuredDataClientError):
            parser.fetch_records(content=zip_bytes, file_name="data.zip")

    def test_html_without_table_returns_empty_records(self):
        """Return no records for an HTML page without a table."""
        raw = b"<!doctype html><html><body><main>No table</main></body></html>"
        parser = StructuredDataClient(source=ParserLoadLog.Source.ARBITRATION)

        parsed = parser.fetch_records(content=raw, file_name="")

        self.assertEqual(parsed, [])

    def test_html_table_after_long_head_is_detected(self):
        """Detect an HTML table that follows 1500 characters of head padding."""
        raw = "".join(
            [
                "<!doctype html><html><head>",
                " " * 1500,
                "</head><body><table>",
                "<tr><th>id</th><th>inn</th></tr>",
                "<tr><td>HTML-1</td><td>1234567890</td></tr>",
                "</table></body></html>",
            ]
        ).encode()
        parser = StructuredDataClient(source=ParserLoadLog.Source.ARBITRATION)

        parsed = parser.fetch_records(content=raw, file_name="")

        self.assertEqual(len(parsed), 1)
        item = parsed[0]
        self.assertEqual(item.external_id, "HTML-1")
        self.assertEqual(item.inn, "1234567890")

    def test_html_layout_table_without_headers_is_ignored(self):
        """Ignore a table whose cells carry no header row (layout/navigation)."""
        raw = (
            "<html><body><table>"
            "<tr><td>Картотека</td><td>Страж</td></tr>"
            "<tr><td>Календарь</td><td>Мой Арбитр</td></tr>"
            "</table></body></html>"
        ).encode()
        parser = StructuredDataClient(source=ParserLoadLog.Source.ARBITRATION)

        parsed = parser.fetch_records(content=raw, file_name="")

        self.assertEqual(parsed, [])

    @patch.object(BaseHTTPClient, "post_json")
    def test_mpt_products_page_uses_official_search_api(self, mock_post_json):
        """Fetch GISP products through the stubbed ``post_json`` search call.

        Verifies both the field mapping of the returned item and that the
        first positional argument of the POST is the ``/b/`` endpoint URL.
        """
        api_item = {
            "org_name": "ООО Производитель",
            "org_inn": "7701000000",
            "org_ogrn": "1027700000000",
            "product_reg_number_2023": "10165413",
            "product_name": "Средство дезинфицирующее",
            "res_date": "2026-04-25",
            "product_gisp_url": "https://gisp.gov.ru/goods/#/product/1",
        }
        mock_post_json.return_value = {
            "ok": True,
            "total_count": 1,
            "items": [api_item],
        }
        parser = StructuredDataClient(source=ParserLoadLog.Source.MPT_PRODUCTS)

        parsed = parser.fetch_records(file_url="https://gisp.gov.ru/pp719v2/pub/prod/")

        self.assertEqual(len(parsed), 1)
        item = parsed[0]
        self.assertEqual(item.external_id, "10165413")
        self.assertEqual(item.inn, "7701000000")
        self.assertEqual(item.organisation_name, "ООО Производитель")
        self.assertEqual(item.title, "Средство дезинфицирующее")
        requested_url = mock_post_json.call_args.args[0]
        self.assertEqual(
            requested_url,
            "https://gisp.gov.ru/pp719v2/pub/prod/b/",
        )

    def test_zakupki_cards_are_parsed_as_records(self):
        """Parse a search-result card (no HTML table) into one record."""
        # Fixture markup is kept flush-left so the literal's bytes stay as given.
        raw = """
<html><body>
<div class="search-registry-entry-block">
<a href="/epz/order/notice/ea20/view/common-info.html?regNumber=0331">№ 0331</a>
<div>Работа комиссии</div>
<div>Объект закупки</div><div>Поставка оборудования</div>
<div>Заказчик</div><div>ГКУ Тест</div>
<div>Начальная цена</div><div>649 989,52 ₽</div>
<div>Размещено</div><div>20.04.2026</div>
</div>
</body></html>
""".encode()
        parser = StructuredDataClient(source=ParserLoadLog.Source.PROCUREMENTS_44FZ)

        parsed = parser.fetch_records(content=raw, file_name="search.html")

        self.assertEqual(len(parsed), 1)
        item = parsed[0]
        self.assertEqual(item.external_id, "0331")
        self.assertEqual(item.title, "Поставка оборудования")
        self.assertEqual(item.organisation_name, "ГКУ Тест")
        self.assertEqual(str(item.amount), "649989.52")
        self.assertEqual(item.record_date, "20.04.2026")

    def test_html_table_with_td_header_row_is_parsed(self):
        """Parse a table whose header row is built from ``td`` cells, not ``th``."""
        # Fixture markup is kept flush-left so the literal's bytes stay as given.
        raw = """
<html><body><table>
<tr><td>Номер реестровой записи</td><td>Информация о лице</td><td>ИНН</td></tr>
<tr><td>ГОЗ-1</td><td>ООО Оборона</td><td>7701000000</td></tr>
</table></body></html>
""".encode()
        parser = StructuredDataClient(source=ParserLoadLog.Source.ARBITRATION)

        parsed = parser.fetch_records(content=raw, file_name="fas.html")

        self.assertEqual(len(parsed), 1)
        item = parsed[0]
        self.assertEqual(item.external_id, "ГОЗ-1")
        self.assertEqual(item.inn, "7701000000")
        self.assertEqual(item.organisation_name, "ООО Оборона")

    def test_fas_goz_multirow_header_table_is_parsed(self):
        """Parse the FAS GOZ table past its two header rows and numbering row.

        Only the final data row is expected to become a record.
        """
        # Fixture markup is kept flush-left so the literal's bytes stay as given.
        raw = """
<html><body><table>
<tr><td rowspan="2">Номер реестровой записи</td><td rowspan="2">Орган</td>
<td colspan="3">Постановление</td><td colspan="3">Лицо</td></tr>
<tr><td>номер</td><td>дата</td><td>исполнение</td>
<td>полное наименование</td><td>адрес</td><td>ИНН</td></tr>
<tr><td>1</td><td>2</td><td>3</td><td>4</td><td>5</td><td>6</td><td>7</td><td>8</td></tr>
<tr><td>1</td><td>Нижегородское УФАС России</td>
<td>№ 052/04/7.29.2-2965/2023 от 22.01.2024</td>
<td>28.10.2025</td><td>В стадии исполнения</td>
<td>АО УАПО</td><td>АО УАПО</td><td>г. Уфа</td><td>0275074279</td></tr>
</table></body></html>
""".encode()
        parser = StructuredDataClient(source=ParserLoadLog.Source.FAS_GOZ)

        parsed = parser.fetch_records(content=raw, file_name="fas.html")

        self.assertEqual(len(parsed), 1)
        item = parsed[0]
        self.assertEqual(item.external_id, "1")
        self.assertEqual(item.inn, "0275074279")
        self.assertEqual(item.organisation_name, "АО УАПО")
        self.assertEqual(item.record_date, "28.10.2025")
        self.assertEqual(item.status, "В стадии исполнения")

    def test_fns_nested_bfo_fields_are_normalized(self):
        """Map FNS JSON fields, including values nested under ``bfo``.

        The ``inn`` value arrives wrapped in ``<strong>`` markup and is
        expected back as the bare number.
        """
        organisation = {
            "id": 6622458,
            "inn": "<strong>7736050003</strong>",
            "shortName": 'ПАО "ГАЗПРОМ"',
            "ogrn": "1027700070518",
            "statusCode": "ACTIVE",
            "bfo": {
                "period": "2025",
                "actualBfoDate": "2026-03-16",
                "gainSum": 5846351786,
            },
        }
        document = {"content": [organisation]}
        parser = StructuredDataClient(source=ParserLoadLog.Source.FNS_FINANCIAL)
        raw = json.dumps(document).encode("utf-8")

        parsed = parser.fetch_records(content=raw, file_name="fns.json")

        item = parsed[0]
        self.assertEqual(item.external_id, "6622458")
        self.assertEqual(item.inn, "7736050003")
        self.assertEqual(item.organisation_name, 'ПАО "ГАЗПРОМ"')
        self.assertEqual(item.record_date, "2026-03-16")
        self.assertEqual(str(item.amount), "5846351786")
        self.assertEqual(item.status, "ACTIVE")

    @patch.object(BaseHTTPClient, "download_file")
    def test_fstec_page_discovers_csv_download(self, mock_download):
        """Follow the download link on the FSTEC page to its CSV export.

        ``download_file`` is stubbed to return the landing page first and
        the CSV body second.
        """
        landing_page = (
            '<html><a href="/reg3?option=com_rajax&module=rfiles&'
            'method=download&format=file&mod=209&file=1">Государственный '
            "реестр ССЗИ</a></html>"
        ).encode()
        csv_export = (
            '"№ сертификата","Дата внесения в реестр","Срок действия сертификата",'
            '"Наименование средства (шифр)","Заявитель"\n'
            '"17/1","2002-07-26","2020-08-01","ФСПК-100","ООО НПП ЭЛКОМ"\n'
        ).encode()
        mock_download.side_effect = [landing_page, csv_export]
        parser = StructuredDataClient(source=ParserLoadLog.Source.FSTEC)

        parsed = parser.fetch_records(file_url="https://reestr.fstec.ru/reg3")

        self.assertEqual(len(parsed), 1)
        item = parsed[0]
        self.assertEqual(item.external_id, "17/1")
        self.assertEqual(item.organisation_name, "ООО НПП ЭЛКОМ")
        self.assertEqual(item.title, "ФСПК-100")
        self.assertEqual(item.record_date, "2002-07-26")
        # The certificate expiry column is asserted to land in ``status``.
        self.assertEqual(item.status, "2020-08-01")
class TrudvsemClientTest(TestCase):
    """Exercise ``TrudvsemClient.fetch_vacancies`` with ``get_json`` stubbed."""

    @patch.object(BaseHTTPClient, "get_json")
    def test_fetch_vacancies_success(self, mock_get_json):
        """Return one normalized record for a single-vacancy response."""
        vacancy = {
            "id": "VAC-1",
            "job-name": "Инженер",
            "creation-date": "2026-01-01",
            "salary": {"from": 120000},
            "company": {
                "name": "ООО Тест",
                "inn": "1234567890",
                "ogrn": "1234567890123",
            },
            "vac_url": "https://trudvsem.ru/vacancy/VAC-1",
        }
        mock_get_json.return_value = {
            "results": {
                "vacancies": [
                    {"vacancy": vacancy},
                ]
            }
        }

        with TrudvsemClient() as api:
            found = api.fetch_vacancies(limit=1)

        self.assertEqual(len(found), 1)
        item = found[0]
        self.assertEqual(item.external_id, "VAC-1")
        self.assertEqual(item.source, ParserLoadLog.Source.TRUDVSEM)
        self.assertEqual(item.inn, "1234567890")

    @patch.object(BaseHTTPClient, "get_json")
    def test_fetch_vacancies_by_company_inn_scans_pages(self, mock_get_json):
        """Keep paging when the first page has no vacancy for the INN.

        The first stubbed page holds another company's vacancy and the
        second holds the match; the two requests must use offsets 0 and 1.
        """

        def single_vacancy_page(vacancy_id, company_inn):
            # Build one response page containing exactly one vacancy.
            return {
                "results": {
                    "vacancies": [
                        {
                            "vacancy": {
                                "id": vacancy_id,
                                "company": {"inn": company_inn},
                            }
                        }
                    ]
                }
            }

        mock_get_json.side_effect = [
            single_vacancy_page("VAC-OTHER", "0000000000"),
            single_vacancy_page("VAC-MATCH", "1234567890"),
        ]

        with TrudvsemClient(company_search_max_pages=2) as api:
            found = api.fetch_vacancies(limit=1, company_inn="1234567890")

        self.assertEqual(len(found), 1)
        self.assertEqual(found[0].external_id, "VAC-MATCH")
        calls = mock_get_json.call_args_list
        self.assertEqual(calls[0].kwargs["params"]["offset"], 0)
        self.assertEqual(calls[1].kwargs["params"]["offset"], 1)
def _create_test_excel_certificates() -> bytes:
"""Create test Excel file with certificate data."""
@@ -159,7 +588,10 @@ class IndustrialProductionClientTest(TestCase):
{
"name": "Заключения о подтверждении производства промышленной продукции на территории Российской Федерации",
"files": [
{"name": "data_resolutions_20240101.xlsx", "url": "/files/test.xlsx"},
{
"name": "data_resolutions_20240101.xlsx",
"url": "/files/test.xlsx",
},
],
}
]
@@ -193,9 +625,18 @@ class IndustrialProductionClientTest(TestCase):
{
"name": "Заключения о подтверждении производства промышленной продукции на территории Российской Федерации",
"files": [
{"name": "data_resolutions_20240101.xlsx", "url": "/files/old.xlsx"},
{"name": "data_resolutions_20240315.xlsx", "url": "/files/new.xlsx"},
{"name": "data_resolutions_20240201.xlsx", "url": "/files/mid.xlsx"},
{
"name": "data_resolutions_20240101.xlsx",
"url": "/files/old.xlsx",
},
{
"name": "data_resolutions_20240315.xlsx",
"url": "/files/new.xlsx",
},
{
"name": "data_resolutions_20240201.xlsx",
"url": "/files/mid.xlsx",
},
],
}
]
@@ -539,7 +980,7 @@ class ProverkiClientTest(TestCase):
client = ProverkiClient()
xml_str = '<inspection inn="1234567890" registration_number="TEST123" organisation_name="Test Co"/>'
element = ET.fromstring(xml_str)
element = ET.fromstring(xml_str) # noqa: S314
result = client._parse_xml_record(element)
@@ -553,7 +994,7 @@ class ProverkiClientTest(TestCase):
client = ProverkiClient()
xml_str = "<empty_record></empty_record>"
element = ET.fromstring(xml_str)
element = ET.fromstring(xml_str) # noqa: S314
result = client._parse_xml_record(element)
@@ -569,9 +1010,7 @@ class ProverkiClientTest(TestCase):
<registration_number>TEST001</registration_number>
<organisation_name>Компания</organisation_name>
</inspection>
</inspections>""".encode(
"windows-1251"
)
</inspections>""".encode("windows-1251")
inspections = client._parse_xml_content(xml_content, None)