fix(parsers): search registry vacancies across job boards
This commit is contained in:
@@ -306,6 +306,20 @@ class VacanciesClient:
|
||||
)
|
||||
return list(dict.fromkeys(selected))
|
||||
|
||||
def iter_source_clients(self) -> list[tuple[str, VacancyProvider]]:
    """Pair every selected source that has a built client with that client.

    Sources are visited in the order produced by ``_selected_sources()``.
    A source without a client is logged and left out of the result, except
    for SuperJob when ``self.sources`` is non-empty: that case is treated as
    a configuration error.

    Returns:
        ``(source, client)`` tuples for the sources that have a client.

    Raises:
        VacanciesClientError: ``self.sources`` is truthy, the source is
            ``SUPERJOB_SOURCE`` and no client was built for it.
    """
    available = self._build_source_clients()
    pairs: list[tuple[str, VacancyProvider]] = []
    for name in self._selected_sources():
        provider = available.get(name)
        if provider is not None:
            pairs.append((name, provider))
            continue
        # No client for this source: fail loudly only for SuperJob when
        # self.sources is set, otherwise just skip it.
        if self.sources and name == SUPERJOB_SOURCE:
            raise VacanciesClientError("SUPERJOB_APP_ID is required")
        logger.info("Vacancy source %s is skipped: not configured", name)
    return pairs
|
||||
|
||||
def fetch_vacancies(
|
||||
self,
|
||||
*,
|
||||
@@ -316,18 +330,11 @@ class VacanciesClient:
|
||||
text: str | None = None,
|
||||
) -> list[GenericParserItem]:
|
||||
"""Получить вакансии из включённых источников."""
|
||||
clients = self._build_source_clients()
|
||||
records: list[GenericParserItem] = []
|
||||
errors: list[str] = []
|
||||
attempts = 0
|
||||
|
||||
for source in self._selected_sources():
|
||||
client = clients.get(source)
|
||||
if client is None:
|
||||
if self.sources and source == SUPERJOB_SOURCE:
|
||||
raise VacanciesClientError("SUPERJOB_APP_ID is required")
|
||||
logger.info("Vacancy source %s is skipped: not configured", source)
|
||||
continue
|
||||
for source, client in self.iter_source_clients():
|
||||
if company_inn and not getattr(client, "supports_company_inn", False):
|
||||
logger.info(
|
||||
"Vacancy source %s is skipped: company_inn is not supported",
|
||||
|
||||
@@ -7,6 +7,7 @@ Celery задачи для приложения парсеров.
|
||||
|
||||
import hashlib
|
||||
import logging
|
||||
import re
|
||||
import shutil
|
||||
import time
|
||||
import uuid
|
||||
@@ -91,6 +92,30 @@ class RegistryLookupTarget:
|
||||
|
||||
|
||||
# Page budget per organization when a source is queried by INN;
# _fetch_registry_target_source_vacancy_records raises once it is exhausted.
VACANCY_REGISTRY_MAX_PAGES_PER_ORGANIZATION = 100
# Page budget per organization for free-text searches (no INN filter), whose
# results are additionally filtered by employer name on our side.
VACANCY_REGISTRY_TEXT_SEARCH_MAX_PAGES_PER_ORGANIZATION = 1
# One "word" of an employer name: a run of digits, Latin or Cyrillic letters.
VACANCY_EMPLOYER_WORD_RE = re.compile(r"[0-9A-Za-zА-Яа-яЁё]+")
# Legal-form words (casefolded) removed from employer names before comparison.
# Every abbreviation listed here must also have all words of its full form
# listed, otherwise 'ЗАО "X"' and 'Закрытое акционерное общество "X"' would
# normalize to different keys and never match.
VACANCY_EMPLOYER_IGNORED_WORDS = {
    "ао",
    "акционерное",
    "государственное",
    "закрытое",
    "зао",
    "индивидуальный",
    "ип",
    "муниципальное",
    "нао",
    "некоммерческая",
    "непубличное",
    "оао",
    "общество",
    "ограниченной",
    "ооо",
    "организация",
    "ответственностью",
    "открытое",
    "пао",
    "предприниматель",
    "предприятие",
    "публичное",
    "с",
    "унитарное",
    "федеральное",
    "фгуп",
}
|
||||
|
||||
|
||||
def _resolve_lookup_limit(
|
||||
@@ -2929,27 +2954,151 @@ def _fetch_registry_target_vacancy_records(
|
||||
*,
|
||||
page_size: int,
|
||||
) -> list[GenericParserItem]:
|
||||
records: list[GenericParserItem] = []
|
||||
offset = 0
|
||||
for _ in range(VACANCY_REGISTRY_MAX_PAGES_PER_ORGANIZATION):
|
||||
page_records = client.fetch_vacancies(
|
||||
limit=page_size,
|
||||
offset=offset,
|
||||
iter_source_clients = getattr(client, "iter_source_clients", None)
|
||||
if iter_source_clients is None:
|
||||
return _fetch_registry_target_source_vacancy_records(
|
||||
client,
|
||||
target,
|
||||
page_size=page_size,
|
||||
company_inn=target.inn,
|
||||
)
|
||||
records.extend(
|
||||
_attach_registry_vacancy_target(record, target) for record in page_records
|
||||
|
||||
records: list[GenericParserItem] = []
|
||||
errors: list[str] = []
|
||||
attempts = 0
|
||||
|
||||
for source, source_client in iter_source_clients():
|
||||
if getattr(source_client, "supports_company_inn", False):
|
||||
kwargs = {"company_inn": target.inn}
|
||||
else:
|
||||
if not target.name:
|
||||
logger.info(
|
||||
"Vacancy source %s is skipped for registry organization %s: "
|
||||
"empty organization name",
|
||||
source,
|
||||
target.organization_id,
|
||||
)
|
||||
continue
|
||||
kwargs = {"text": _vacancy_registry_text_query(target)}
|
||||
|
||||
attempts += 1
|
||||
try:
|
||||
source_records = _fetch_registry_target_source_vacancy_records(
|
||||
source_client,
|
||||
target,
|
||||
page_size=page_size,
|
||||
**kwargs,
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.warning(
|
||||
"Vacancy source %s failed for registry organization %s (%s): %s",
|
||||
source,
|
||||
target.organization_id,
|
||||
target.inn,
|
||||
exc,
|
||||
)
|
||||
errors.append(f"{source}: {exc}")
|
||||
continue
|
||||
|
||||
records.extend(source_records)
|
||||
|
||||
if errors and not records and attempts:
|
||||
raise RuntimeError(
|
||||
"All vacancy sources failed for registry organization "
|
||||
f"{target.organization_id} ({target.inn}); first error: {errors[0]}"
|
||||
)
|
||||
return records
|
||||
|
||||
|
||||
def _fetch_registry_target_source_vacancy_records(
    source_client,
    target: RegistryLookupTarget,
    *,
    page_size: int,
    company_inn: str | None = None,
    text: str | None = None,
) -> list[GenericParserItem]:
    """Page through one vacancy source for a single registry organization.

    Two modes are selected by ``company_inn``:

    * ``company_inn`` given: every returned record is kept and up to
      ``VACANCY_REGISTRY_MAX_PAGES_PER_ORGANIZATION`` pages are read.
    * ``company_inn`` is ``None``: only records accepted by
      ``_vacancy_record_matches_registry_target`` are kept and at most
      ``VACANCY_REGISTRY_TEXT_SEARCH_MAX_PAGES_PER_ORGANIZATION`` pages are read.

    Paging stops as soon as a page shorter than ``page_size`` arrives.

    Args:
        source_client: Object exposing ``fetch_vacancies(limit=, offset=,
            company_inn=, text=)``.
        target: Registry organization the records are collected for.
        page_size: Number of records requested per page.
        company_inn: INN filter forwarded to the source, if any.
        text: Free-text query forwarded to the source, if any.

    Returns:
        The collected records, in the order the source returned them.

    Raises:
        RuntimeError: In INN mode, when the page budget is used up without
            ever receiving a short page.
    """
    by_employer_name = company_inn is None
    if by_employer_name:
        page_budget = VACANCY_REGISTRY_TEXT_SEARCH_MAX_PAGES_PER_ORGANIZATION
    else:
        page_budget = VACANCY_REGISTRY_MAX_PAGES_PER_ORGANIZATION

    collected: list[GenericParserItem] = []
    for page_index in range(page_budget):
        page = source_client.fetch_vacancies(
            limit=page_size,
            offset=page_index * page_size,
            company_inn=company_inn,
            text=text,
        )
        if by_employer_name:
            # Without an INN filter the results are narrowed down locally
            # by comparing employer names.
            collected.extend(
                item
                for item in page
                if _vacancy_record_matches_registry_target(item, target)
            )
        else:
            collected.extend(page)

        # A short page means the source has nothing more to give.
        if len(page) < page_size:
            return collected

    if not by_employer_name:
        raise RuntimeError(
            "Vacancy registry organization page limit reached "
            f"for organization {target.organization_id} ({target.inn})"
        )
    # Running out of the text-search page budget is not an error.
    return collected
|
||||
|
||||
|
||||
def _vacancy_record_matches_registry_target(
    record: GenericParserItem,
    target: RegistryLookupTarget,
) -> bool:
    """Tell whether a vacancy's employer name refers to the registry organization.

    Both names are reduced to match keys with ``_vacancy_employer_match_key``
    and compared as follows:

    * an empty key on either side never matches;
    * identical keys match;
    * otherwise the shorter key must be at least 8 characters long and must
      occur inside the longer key as a run of whole words.

    Containment is checked on word boundaries: a plain substring test would
    let "стройинвест" match "уралстройинвест" and attach another employer's
    vacancies to this organization.

    Args:
        record: Vacancy record whose employer name is examined.
        target: Registry organization to compare against.

    Returns:
        ``True`` when the names are considered the same employer.
    """
    # target.name may be missing; treat that as "no key" instead of crashing.
    target_key = _vacancy_employer_match_key(target.name or "")
    employer_key = _vacancy_employer_match_key(_vacancy_record_employer_name(record))
    if not target_key or not employer_key:
        return False
    if target_key == employer_key:
        return True

    shorter, longer = sorted((target_key, employer_key), key=len)
    # Short keys are too ambiguous for partial matching.
    if len(shorter) < 8:
        return False
    # Keys are single-space-joined words, so padding both sides with spaces
    # turns substring search into whole-word sequence search.
    return f" {shorter} " in f" {longer} "
|
||||
|
||||
|
||||
def _vacancy_registry_text_query(target: RegistryLookupTarget) -> str:
    """Build the free-text search query for a registry organization.

    Uses the organization's employer match key when it is non-empty and
    falls back to the raw ``target.name`` otherwise.
    """
    normalized = _vacancy_employer_match_key(target.name)
    if normalized:
        return normalized
    return target.name
|
||||
|
||||
|
||||
def _vacancy_record_employer_name(record: GenericParserItem) -> str:
    """Extract the employer name from a vacancy record.

    Lookup order, first truthy value wins:

    1. ``record.organisation_name`` (returned as is);
    2. ``payload["employer"]["name"]`` then ``payload["company"]["name"]``
       (only when the nested value is a dict);
    3. ``payload["firm_name"]``, ``payload["company_name"]``,
       ``payload["organisation_name"]``.

    Payload values are converted with ``str``. A payload that is not a dict
    is ignored.

    Returns:
        The employer name, or ``""`` when none is found.
    """
    direct_name = record.organisation_name
    if direct_name:
        return direct_name

    raw = record.payload
    if not isinstance(raw, dict):
        return ""

    for container_key in ("employer", "company"):
        container = raw.get(container_key)
        if not isinstance(container, dict):
            continue
        nested_name = container.get("name")
        if nested_name:
            return str(nested_name)

    flat_keys = ("firm_name", "company_name", "organisation_name")
    return next((str(raw[key]) for key in flat_keys if raw.get(key)), "")
|
||||
|
||||
|
||||
def _vacancy_employer_match_key(name: str) -> str:
|
||||
words = []
|
||||
for match in VACANCY_EMPLOYER_WORD_RE.finditer(name.casefold().replace("ё", "е")):
|
||||
word = match.group(0)
|
||||
if word not in VACANCY_EMPLOYER_IGNORED_WORDS:
|
||||
words.append(word)
|
||||
return " ".join(words)
|
||||
|
||||
|
||||
def _fetch_registry_organization_vacancy_records(
|
||||
*,
|
||||
proxies: list[str] | None,
|
||||
@@ -2966,11 +3115,10 @@ def _fetch_registry_organization_vacancy_records(
|
||||
records: list[GenericParserItem] = []
|
||||
errors: list[str] = []
|
||||
successful_fetches = 0
|
||||
sources = vacancy_sources or ["trudvsem"]
|
||||
with VacanciesClient(
|
||||
proxies=proxies,
|
||||
superjob_app_id=getattr(settings, "SUPERJOB_APP_ID", ""),
|
||||
sources=sources,
|
||||
sources=vacancy_sources,
|
||||
) as client:
|
||||
for target in targets:
|
||||
try:
|
||||
|
||||
@@ -2191,6 +2191,160 @@ class ParseVacanciesTaskTestCase(TestCase):
|
||||
{"trudvsem:7701000102"},
|
||||
)
|
||||
|
||||
@override_settings(SUPERJOB_APP_ID="test-superjob-app-id")
def test_parse_trudvsem_vacancies_matches_job_boards_by_employer_name(self):
    # Scenario: one registry organization and three stub providers. The
    # "trudvsem" stub declares INN support; "hh" and "superjob" do not and
    # each return one vacancy of the right employer plus one unrelated one.
    # Expected: text queries use the normalized name and only the matching
    # vacancies are saved and linked to the organization.
    registry_org = OrganizationFactory(
        pn_name='Общество с ограниченной ответственностью "Ромашка"',
        mn_inn=7701000301,
        mn_ogrn=1027700000301,
    )
    RegistryMembershipPeriodFactory(organization=registry_org, ended_at=None)
    client_init_kwargs = {}
    text_query_by_source = {}

    def build_item(external_id, title, vacancy_source, **fields):
        # Every stub item shares the same source enum and payload layout.
        return GenericParserItem(
            source=ParserLoadLog.Source.TRUDVSEM,
            external_id=external_id,
            title=title,
            payload={"vacancy_source": vacancy_source},
            **fields,
        )

    def trudvsem_page(request):
        return [
            build_item(
                "trudvsem:romashka",
                "Работа России",
                "trudvsem",
                inn=request["company_inn"],
            )
        ]

    class _Provider:
        def __init__(self, source_name, *, supports_company_inn):
            self.source_name = source_name
            self.supports_company_inn = supports_company_inn

        def fetch_vacancies(self, **kwargs):
            board = self.source_name
            if board == "trudvsem":
                return trudvsem_page(kwargs)

            # Remember the text query each job board was asked with.
            text_query_by_source[board] = kwargs["text"]
            return [
                build_item(
                    f"{board}:romashka",
                    f"{board} matching vacancy",
                    board,
                    organisation_name='ООО "Ромашка"',
                ),
                build_item(
                    f"{board}:other",
                    f"{board} unrelated vacancy",
                    board,
                    organisation_name='ООО "Лютик"',
                ),
            ]

    class _VacanciesClient:
        def __init__(self, **kwargs):
            client_init_kwargs.update(kwargs)

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            return None

        def fetch_vacancies(self, **kwargs):
            return trudvsem_page(kwargs)

        def iter_source_clients(self):
            inn_support = {"trudvsem": True, "hh": False, "superjob": False}
            return [
                (name, _Provider(name, supports_company_inn=supported))
                for name, supported in inn_support.items()
            ]

    # Swap the real client for the stub only for the duration of the task.
    real_client_cls = parser_tasks.VacanciesClient
    parser_tasks.VacanciesClient = _VacanciesClient
    try:
        result = parse_trudvsem_vacancies(limit=50, proxies=[])
    finally:
        parser_tasks.VacanciesClient = real_client_cls

    self.assertEqual(result["status"], "success")
    self.assertIsNone(client_init_kwargs["sources"])
    self.assertEqual(
        text_query_by_source,
        {"hh": "ромашка", "superjob": "ромашка"},
    )
    self.assertEqual(result["saved"], 3)

    saved_rows = set(
        GenericParserRecord.objects.values_list(
            "external_id",
            "source",
            "registry_organization_id",
        )
    )
    expected_rows = {
        (f"{name}:romashka", name, registry_org.id)
        for name in ("trudvsem", "hh", "superjob")
    }
    self.assertEqual(saved_rows, expected_rows)
|
||||
|
||||
def test_registry_job_board_matching_fetches_only_first_text_search_page(self):
    # Scenario: a provider without INN support always returns a full page
    # (one item for limit=1). Expected: only offset 0 is requested and the
    # single matching vacancy is saved.
    registry_org = OrganizationFactory(
        pn_name='Общество с ограниченной ответственностью "Ромашка"',
        mn_inn=7701000302,
        mn_ogrn=1027700000302,
    )
    RegistryMembershipPeriodFactory(organization=registry_org, ended_at=None)
    requested_offsets = []

    class _Provider:
        supports_company_inn = False

        def fetch_vacancies(self, **kwargs):
            page_offset = kwargs["offset"]
            requested_offsets.append(page_offset)
            vacancy = GenericParserItem(
                source=ParserLoadLog.Source.TRUDVSEM,
                external_id=f"hh:romashka:{page_offset}",
                organisation_name='ООО "Ромашка"',
                title="HeadHunter",
                payload={"vacancy_source": "hh"},
            )
            return [vacancy]

    class _VacanciesClient:
        def __init__(self, **kwargs):
            # Constructor arguments are irrelevant for this scenario.
            pass

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            return None

        def iter_source_clients(self):
            return [("hh", _Provider())]

    # Swap the real client for the stub only for the duration of the task.
    real_client_cls = parser_tasks.VacanciesClient
    parser_tasks.VacanciesClient = _VacanciesClient
    try:
        result = parse_trudvsem_vacancies(limit=1, proxies=[])
    finally:
        parser_tasks.VacanciesClient = real_client_cls

    self.assertEqual(result["status"], "success")
    self.assertEqual(result["saved"], 1)
    self.assertEqual(requested_offsets, [0])
|
||||
|
||||
@override_settings(SUPERJOB_APP_ID="test-superjob-app-id")
|
||||
def test_parse_trudvsem_vacancies_uses_combined_vacancies_client(self):
|
||||
captured_kwargs = {}
|
||||
|
||||
Reference in New Issue
Block a user