fix(parsers): search registry vacancies across job boards

This commit is contained in:
2026-05-14 17:07:58 +02:00
parent 5fdd23ecc0
commit df89e498cc
3 changed files with 327 additions and 18 deletions

View File

@@ -306,6 +306,20 @@ class VacanciesClient:
) )
return list(dict.fromkeys(selected)) return list(dict.fromkeys(selected))
def iter_source_clients(self) -> list[tuple[str, VacancyProvider]]:
    """Return ``(source, provider)`` pairs for every usable selected source.

    Sources are visited in the order produced by ``_selected_sources()``.
    A source without a built provider is logged and skipped.

    Raises:
        VacanciesClientError: if ``self.sources`` is non-empty and the
            SuperJob source has no provider built for it.
    """
    available = self._build_source_clients()
    resolved: list[tuple[str, VacancyProvider]] = []
    for name in self._selected_sources():
        provider = available.get(name)
        if provider is not None:
            resolved.append((name, provider))
            continue
        # A missing SuperJob provider is fatal only when ``self.sources`` is
        # set; otherwise the source is silently dropped like any other.
        if self.sources and name == SUPERJOB_SOURCE:
            raise VacanciesClientError("SUPERJOB_APP_ID is required")
        logger.info("Vacancy source %s is skipped: not configured", name)
    return resolved
def fetch_vacancies( def fetch_vacancies(
self, self,
*, *,
@@ -316,18 +330,11 @@ class VacanciesClient:
text: str | None = None, text: str | None = None,
) -> list[GenericParserItem]: ) -> list[GenericParserItem]:
"""Получить вакансии из включённых источников.""" """Получить вакансии из включённых источников."""
clients = self._build_source_clients()
records: list[GenericParserItem] = [] records: list[GenericParserItem] = []
errors: list[str] = [] errors: list[str] = []
attempts = 0 attempts = 0
for source in self._selected_sources(): for source, client in self.iter_source_clients():
client = clients.get(source)
if client is None:
if self.sources and source == SUPERJOB_SOURCE:
raise VacanciesClientError("SUPERJOB_APP_ID is required")
logger.info("Vacancy source %s is skipped: not configured", source)
continue
if company_inn and not getattr(client, "supports_company_inn", False): if company_inn and not getattr(client, "supports_company_inn", False):
logger.info( logger.info(
"Vacancy source %s is skipped: company_inn is not supported", "Vacancy source %s is skipped: company_inn is not supported",

View File

@@ -7,6 +7,7 @@ Celery задачи для приложения парсеров.
import hashlib import hashlib
import logging import logging
import re
import shutil import shutil
import time import time
import uuid import uuid
@@ -91,6 +92,30 @@ class RegistryLookupTarget:
VACANCY_REGISTRY_MAX_PAGES_PER_ORGANIZATION = 100 VACANCY_REGISTRY_MAX_PAGES_PER_ORGANIZATION = 100
# Page cap used when a vacancy source is queried by text instead of by INN.
VACANCY_REGISTRY_TEXT_SEARCH_MAX_PAGES_PER_ORGANIZATION = 1

# A "word" in an employer name: a run of digits, Latin or Cyrillic letters.
VACANCY_EMPLOYER_WORD_RE = re.compile(r"[0-9A-Za-zА-Яа-яЁё]+")

# Lower-case words dropped from employer names before they are compared.
VACANCY_EMPLOYER_IGNORED_WORDS = {
    # Abbreviated legal forms.
    "ао",
    "зао",
    "ип",
    "нао",
    "оао",
    "ооо",
    "пао",
    "фгуп",
    # Spelled-out legal forms.
    "акционерное",
    "государственное",
    "индивидуальный",
    "муниципальное",
    "некоммерческая",
    "общество",
    "ограниченной",
    "ответственностью",
    "предприниматель",
    "публичное",
    "унитарное",
    # Preposition from "общество с ограниченной ответственностью".
    "с",
}
def _resolve_lookup_limit( def _resolve_lookup_limit(
@@ -2929,27 +2954,151 @@ def _fetch_registry_target_vacancy_records(
*, *,
page_size: int, page_size: int,
) -> list[GenericParserItem]: ) -> list[GenericParserItem]:
records: list[GenericParserItem] = [] iter_source_clients = getattr(client, "iter_source_clients", None)
offset = 0 if iter_source_clients is None:
for _ in range(VACANCY_REGISTRY_MAX_PAGES_PER_ORGANIZATION): return _fetch_registry_target_source_vacancy_records(
page_records = client.fetch_vacancies( client,
limit=page_size, target,
offset=offset, page_size=page_size,
company_inn=target.inn, company_inn=target.inn,
) )
records.extend(
_attach_registry_vacancy_target(record, target) for record in page_records records: list[GenericParserItem] = []
errors: list[str] = []
attempts = 0
for source, source_client in iter_source_clients():
if getattr(source_client, "supports_company_inn", False):
kwargs = {"company_inn": target.inn}
else:
if not target.name:
logger.info(
"Vacancy source %s is skipped for registry organization %s: "
"empty organization name",
source,
target.organization_id,
) )
continue
kwargs = {"text": _vacancy_registry_text_query(target)}
attempts += 1
try:
source_records = _fetch_registry_target_source_vacancy_records(
source_client,
target,
page_size=page_size,
**kwargs,
)
except Exception as exc:
logger.warning(
"Vacancy source %s failed for registry organization %s (%s): %s",
source,
target.organization_id,
target.inn,
exc,
)
errors.append(f"{source}: {exc}")
continue
records.extend(source_records)
if errors and not records and attempts:
raise RuntimeError(
"All vacancy sources failed for registry organization "
f"{target.organization_id} ({target.inn}); first error: {errors[0]}"
)
return records
def _fetch_registry_target_source_vacancy_records(
    source_client,
    target: RegistryLookupTarget,
    *,
    page_size: int,
    company_inn: str | None = None,
    text: str | None = None,
) -> list[GenericParserItem]:
    """Page through one vacancy source for a single registry organization.

    When ``company_inn`` is given, every page is kept as-is and up to
    ``VACANCY_REGISTRY_MAX_PAGES_PER_ORGANIZATION`` pages are read. When it is
    ``None`` the lookup is treated as a text search: at most
    ``VACANCY_REGISTRY_TEXT_SEARCH_MAX_PAGES_PER_ORGANIZATION`` pages are read
    and only records whose employer name matches ``target`` are kept.

    Paging stops as soon as a page comes back shorter than ``page_size``.

    Raises:
        RuntimeError: in INN mode, when the page cap is exhausted without
            seeing a short page.
    """
    # NOTE(review): records are returned as fetched; nothing in this function
    # links them to ``target``. Confirm the caller attaches the registry target.
    text_search_mode = company_inn is None
    if text_search_mode:
        page_budget = VACANCY_REGISTRY_TEXT_SEARCH_MAX_PAGES_PER_ORGANIZATION
    else:
        page_budget = VACANCY_REGISTRY_MAX_PAGES_PER_ORGANIZATION

    collected: list[GenericParserItem] = []
    for page_index in range(page_budget):
        page = source_client.fetch_vacancies(
            limit=page_size,
            offset=page_index * page_size,
            company_inn=company_inn,
            text=text,
        )
        if text_search_mode:
            collected.extend(
                [
                    item
                    for item in page
                    if _vacancy_record_matches_registry_target(item, target)
                ]
            )
        else:
            collected.extend(page)
        # A short page means the source has no further results.
        if len(page) < page_size:
            return collected

    if not text_search_mode:
        raise RuntimeError(
            "Vacancy registry organization page limit reached "
            f"for organization {target.organization_id} ({target.inn})"
        )
    # Text search stops silently once its page cap is exhausted.
    return collected
def _vacancy_record_matches_registry_target(
    record: GenericParserItem,
    target: RegistryLookupTarget,
) -> bool:
    """Tell whether a vacancy's employer name matches the registry target.

    Both names are reduced with ``_vacancy_employer_match_key``. Equal keys
    match. Otherwise the shorter key must be at least 8 characters long and be
    contained in the longer one. An empty key on either side never matches.
    """
    expected = _vacancy_employer_match_key(target.name)
    actual = _vacancy_employer_match_key(_vacancy_record_employer_name(record))
    if not (expected and actual):
        return False
    if expected == actual:
        return True
    shorter, longer = sorted((expected, actual), key=len)
    # NOTE(review): containment is a plain substring test, so it can match in
    # the middle of a word; confirm that is acceptable for employer matching.
    return len(shorter) >= 8 and shorter in longer
def _vacancy_registry_text_query(target: RegistryLookupTarget) -> str:
    """Build the text-search query for a registry organization.

    Returns the normalized employer key of ``target.name``; if that key is
    empty, returns ``target.name`` unchanged.
    """
    normalized = _vacancy_employer_match_key(target.name)
    if normalized:
        return normalized
    return target.name
def _vacancy_record_employer_name(record: GenericParserItem) -> str:
if record.organisation_name:
return record.organisation_name
payload = record.payload if isinstance(record.payload, dict) else {}
for key in ("employer", "company"):
nested = payload.get(key)
if isinstance(nested, dict) and nested.get("name"):
return str(nested["name"])
for key in ("firm_name", "company_name", "organisation_name"):
value = payload.get(key)
if value:
return str(value)
return ""
def _vacancy_employer_match_key(name: str) -> str:
words = []
for match in VACANCY_EMPLOYER_WORD_RE.finditer(name.casefold().replace("ё", "е")):
word = match.group(0)
if word not in VACANCY_EMPLOYER_IGNORED_WORDS:
words.append(word)
return " ".join(words)
def _fetch_registry_organization_vacancy_records( def _fetch_registry_organization_vacancy_records(
*, *,
proxies: list[str] | None, proxies: list[str] | None,
@@ -2966,11 +3115,10 @@ def _fetch_registry_organization_vacancy_records(
records: list[GenericParserItem] = [] records: list[GenericParserItem] = []
errors: list[str] = [] errors: list[str] = []
successful_fetches = 0 successful_fetches = 0
sources = vacancy_sources or ["trudvsem"]
with VacanciesClient( with VacanciesClient(
proxies=proxies, proxies=proxies,
superjob_app_id=getattr(settings, "SUPERJOB_APP_ID", ""), superjob_app_id=getattr(settings, "SUPERJOB_APP_ID", ""),
sources=sources, sources=vacancy_sources,
) as client: ) as client:
for target in targets: for target in targets:
try: try:

View File

@@ -2191,6 +2191,160 @@ class ParseVacanciesTaskTestCase(TestCase):
{"trudvsem:7701000102"}, {"trudvsem:7701000102"},
) )
@override_settings(SUPERJOB_APP_ID="test-superjob-app-id")
def test_parse_trudvsem_vacancies_matches_job_boards_by_employer_name(self):
    """Job-board records are matched to the organization by employer name.

    The fake ``trudvsem`` provider supports INN lookups; the fake ``hh`` and
    ``superjob`` providers do not and each return one matching and one
    unrelated employer. The test expects both text queries to be
    ``"ромашка"`` and exactly three records to be saved for the organization.
    """
    organization = OrganizationFactory(
        pn_name='Общество с ограниченной ответственностью "Ромашка"',
        mn_inn=7701000301,
        mn_ogrn=1027700000301,
    )
    RegistryMembershipPeriodFactory(organization=organization, ended_at=None)
    captured_client_kwargs = {}
    captured_text_queries = {}

    def _trudvsem_item(company_inn):
        # Record returned by the INN-capable fake source.
        return GenericParserItem(
            source=ParserLoadLog.Source.TRUDVSEM,
            external_id="trudvsem:romashka",
            inn=company_inn,
            title="Работа России",
            payload={"vacancy_source": "trudvsem"},
        )

    def _job_board_item(source_name, *, external_id, employer, title):
        # Record returned by a text-search fake source.
        return GenericParserItem(
            source=ParserLoadLog.Source.TRUDVSEM,
            external_id=external_id,
            organisation_name=employer,
            title=title,
            payload={"vacancy_source": source_name},
        )

    class _Provider:
        def __init__(self, source_name, *, supports_company_inn):
            self.source_name = source_name
            self.supports_company_inn = supports_company_inn

        def fetch_vacancies(self, **kwargs):
            if self.source_name == "trudvsem":
                return [_trudvsem_item(kwargs["company_inn"])]
            captured_text_queries[self.source_name] = kwargs["text"]
            return [
                _job_board_item(
                    self.source_name,
                    external_id=f"{self.source_name}:romashka",
                    employer='ООО "Ромашка"',
                    title=f"{self.source_name} matching vacancy",
                ),
                _job_board_item(
                    self.source_name,
                    external_id=f"{self.source_name}:other",
                    employer='ООО "Лютик"',
                    title=f"{self.source_name} unrelated vacancy",
                ),
            ]

    class _VacanciesClient:
        def __init__(self, **kwargs):
            captured_client_kwargs.update(kwargs)

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            return None

        def fetch_vacancies(self, **kwargs):
            return [_trudvsem_item(kwargs["company_inn"])]

        def iter_source_clients(self):
            return [
                ("trudvsem", _Provider("trudvsem", supports_company_inn=True)),
                ("hh", _Provider("hh", supports_company_inn=False)),
                ("superjob", _Provider("superjob", supports_company_inn=False)),
            ]

    # Swap the real client for the fake and always restore it afterwards.
    original_client = parser_tasks.VacanciesClient
    parser_tasks.VacanciesClient = _VacanciesClient
    try:
        result = parse_trudvsem_vacancies(limit=50, proxies=[])
    finally:
        parser_tasks.VacanciesClient = original_client

    self.assertEqual(result["status"], "success")
    self.assertIsNone(captured_client_kwargs["sources"])
    expected_queries = {
        "hh": "ромашка",
        "superjob": "ромашка",
    }
    self.assertEqual(captured_text_queries, expected_queries)
    self.assertEqual(result["saved"], 3)
    saved_rows = set(
        GenericParserRecord.objects.values_list(
            "external_id",
            "source",
            "registry_organization_id",
        )
    )
    self.assertEqual(
        saved_rows,
        {
            ("trudvsem:romashka", "trudvsem", organization.id),
            ("hh:romashka", "hh", organization.id),
            ("superjob:romashka", "superjob", organization.id),
        },
    )
def test_registry_job_board_matching_fetches_only_first_text_search_page(self):
    """A text-search source is asked for a single page only.

    The fake ``hh`` provider does not support INN lookups and returns one
    matching record per call. The test expects exactly one call, with
    offset 0, and one saved record.
    """
    organization = OrganizationFactory(
        pn_name='Общество с ограниченной ответственностью "Ромашка"',
        mn_inn=7701000302,
        mn_ogrn=1027700000302,
    )
    RegistryMembershipPeriodFactory(organization=organization, ended_at=None)
    captured_offsets = []

    class _Provider:
        supports_company_inn = False

        def fetch_vacancies(self, **kwargs):
            captured_offsets.append(kwargs["offset"])
            matching_item = GenericParserItem(
                source=ParserLoadLog.Source.TRUDVSEM,
                external_id=f"hh:romashka:{kwargs['offset']}",
                organisation_name='ООО "Ромашка"',
                title="HeadHunter",
                payload={"vacancy_source": "hh"},
            )
            return [matching_item]

    class _VacanciesClient:
        def __init__(self, **kwargs):
            pass

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            return None

        def iter_source_clients(self):
            return [("hh", _Provider())]

    # Swap the real client for the fake and always restore it afterwards.
    original_client = parser_tasks.VacanciesClient
    parser_tasks.VacanciesClient = _VacanciesClient
    try:
        result = parse_trudvsem_vacancies(limit=1, proxies=[])
    finally:
        parser_tasks.VacanciesClient = original_client

    self.assertEqual(result["status"], "success")
    self.assertEqual(result["saved"], 1)
    self.assertEqual(captured_offsets, [0])
@override_settings(SUPERJOB_APP_ID="test-superjob-app-id") @override_settings(SUPERJOB_APP_ID="test-superjob-app-id")
def test_parse_trudvsem_vacancies_uses_combined_vacancies_client(self): def test_parse_trudvsem_vacancies_uses_combined_vacancies_client(self):
captured_kwargs = {} captured_kwargs = {}