fix(parsers): search registry vacancies across job boards
This commit is contained in:
@@ -306,6 +306,20 @@ class VacanciesClient:
|
|||||||
)
|
)
|
||||||
return list(dict.fromkeys(selected))
|
return list(dict.fromkeys(selected))
|
||||||
|
|
||||||
|
def iter_source_clients(self) -> list[tuple[str, VacancyProvider]]:
    """Return ``(source, client)`` pairs for the selected sources that have a client.

    Sources are visited in the order produced by ``_selected_sources()``.
    A source with no entry in the mapping built by ``_build_source_clients()``
    is logged and left out of the result.

    Raises:
        VacanciesClientError: when ``self.sources`` is truthy and the
            ``SUPERJOB_SOURCE`` source has no client.
    """
    available = self._build_source_clients()
    pairs: list[tuple[str, VacancyProvider]] = []
    for source_name in self._selected_sources():
        provider = available.get(source_name)
        if provider is not None:
            pairs.append((source_name, provider))
        elif self.sources and source_name == SUPERJOB_SOURCE:
            # NOTE(review): presumably `self.sources` holds an explicit caller
            # selection, so a missing SuperJob client is an error, not a skip.
            raise VacanciesClientError("SUPERJOB_APP_ID is required")
        else:
            logger.info("Vacancy source %s is skipped: not configured", source_name)
    return pairs
|
||||||
|
|
||||||
def fetch_vacancies(
|
def fetch_vacancies(
|
||||||
self,
|
self,
|
||||||
*,
|
*,
|
||||||
@@ -316,18 +330,11 @@ class VacanciesClient:
|
|||||||
text: str | None = None,
|
text: str | None = None,
|
||||||
) -> list[GenericParserItem]:
|
) -> list[GenericParserItem]:
|
||||||
"""Получить вакансии из включённых источников."""
|
"""Получить вакансии из включённых источников."""
|
||||||
clients = self._build_source_clients()
|
|
||||||
records: list[GenericParserItem] = []
|
records: list[GenericParserItem] = []
|
||||||
errors: list[str] = []
|
errors: list[str] = []
|
||||||
attempts = 0
|
attempts = 0
|
||||||
|
|
||||||
for source in self._selected_sources():
|
for source, client in self.iter_source_clients():
|
||||||
client = clients.get(source)
|
|
||||||
if client is None:
|
|
||||||
if self.sources and source == SUPERJOB_SOURCE:
|
|
||||||
raise VacanciesClientError("SUPERJOB_APP_ID is required")
|
|
||||||
logger.info("Vacancy source %s is skipped: not configured", source)
|
|
||||||
continue
|
|
||||||
if company_inn and not getattr(client, "supports_company_inn", False):
|
if company_inn and not getattr(client, "supports_company_inn", False):
|
||||||
logger.info(
|
logger.info(
|
||||||
"Vacancy source %s is skipped: company_inn is not supported",
|
"Vacancy source %s is skipped: company_inn is not supported",
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ Celery задачи для приложения парсеров.
|
|||||||
|
|
||||||
import hashlib
|
import hashlib
|
||||||
import logging
|
import logging
|
||||||
|
import re
|
||||||
import shutil
|
import shutil
|
||||||
import time
|
import time
|
||||||
import uuid
|
import uuid
|
||||||
@@ -91,6 +92,30 @@ class RegistryLookupTarget:
|
|||||||
|
|
||||||
|
|
||||||
VACANCY_REGISTRY_MAX_PAGES_PER_ORGANIZATION = 100
|
VACANCY_REGISTRY_MAX_PAGES_PER_ORGANIZATION = 100
|
||||||
|
# Page budget used when a source is queried by employer name (text search)
# rather than by INN.
VACANCY_REGISTRY_TEXT_SEARCH_MAX_PAGES_PER_ORGANIZATION = 1

# A "word" of an employer name: a run of digits, Latin or Cyrillic letters.
VACANCY_EMPLOYER_WORD_RE = re.compile(r"[0-9A-Za-zА-Яа-яЁё]+")

# Words dropped from employer names before two names are compared.
VACANCY_EMPLOYER_IGNORED_WORDS = {
    # Abbreviations.
    "ао",
    "зао",
    "ип",
    "нао",
    "оао",
    "ооо",
    "пао",
    "фгуп",
    # Spelled-out words.
    "акционерное",
    "государственное",
    "индивидуальный",
    "муниципальное",
    "некоммерческая",
    "общество",
    "ограниченной",
    "ответственностью",
    "предприниматель",
    "публичное",
    "с",
    "унитарное",
}
|
||||||
|
|
||||||
|
|
||||||
def _resolve_lookup_limit(
|
def _resolve_lookup_limit(
|
||||||
@@ -2929,27 +2954,151 @@ def _fetch_registry_target_vacancy_records(
|
|||||||
*,
|
*,
|
||||||
page_size: int,
|
page_size: int,
|
||||||
) -> list[GenericParserItem]:
|
) -> list[GenericParserItem]:
|
||||||
records: list[GenericParserItem] = []
|
iter_source_clients = getattr(client, "iter_source_clients", None)
|
||||||
offset = 0
|
if iter_source_clients is None:
|
||||||
for _ in range(VACANCY_REGISTRY_MAX_PAGES_PER_ORGANIZATION):
|
return _fetch_registry_target_source_vacancy_records(
|
||||||
page_records = client.fetch_vacancies(
|
client,
|
||||||
limit=page_size,
|
target,
|
||||||
offset=offset,
|
page_size=page_size,
|
||||||
company_inn=target.inn,
|
company_inn=target.inn,
|
||||||
)
|
)
|
||||||
records.extend(
|
|
||||||
_attach_registry_vacancy_target(record, target) for record in page_records
|
records: list[GenericParserItem] = []
|
||||||
|
errors: list[str] = []
|
||||||
|
attempts = 0
|
||||||
|
|
||||||
|
for source, source_client in iter_source_clients():
|
||||||
|
if getattr(source_client, "supports_company_inn", False):
|
||||||
|
kwargs = {"company_inn": target.inn}
|
||||||
|
else:
|
||||||
|
if not target.name:
|
||||||
|
logger.info(
|
||||||
|
"Vacancy source %s is skipped for registry organization %s: "
|
||||||
|
"empty organization name",
|
||||||
|
source,
|
||||||
|
target.organization_id,
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
kwargs = {"text": _vacancy_registry_text_query(target)}
|
||||||
|
|
||||||
|
attempts += 1
|
||||||
|
try:
|
||||||
|
source_records = _fetch_registry_target_source_vacancy_records(
|
||||||
|
source_client,
|
||||||
|
target,
|
||||||
|
page_size=page_size,
|
||||||
|
**kwargs,
|
||||||
|
)
|
||||||
|
except Exception as exc:
|
||||||
|
logger.warning(
|
||||||
|
"Vacancy source %s failed for registry organization %s (%s): %s",
|
||||||
|
source,
|
||||||
|
target.organization_id,
|
||||||
|
target.inn,
|
||||||
|
exc,
|
||||||
|
)
|
||||||
|
errors.append(f"{source}: {exc}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
records.extend(source_records)
|
||||||
|
|
||||||
|
if errors and not records and attempts:
|
||||||
|
raise RuntimeError(
|
||||||
|
"All vacancy sources failed for registry organization "
|
||||||
|
f"{target.organization_id} ({target.inn}); first error: {errors[0]}"
|
||||||
)
|
)
|
||||||
|
return records
|
||||||
|
|
||||||
|
|
||||||
|
def _fetch_registry_target_source_vacancy_records(
    source_client,
    target: RegistryLookupTarget,
    *,
    page_size: int,
    company_inn: str | None = None,
    text: str | None = None,
) -> list[GenericParserItem]:
    """Page through one vacancy source for a single registry organization.

    Two modes, chosen by ``company_inn``:

    * ``company_inn`` given: every returned record is kept, and up to
      ``VACANCY_REGISTRY_MAX_PAGES_PER_ORGANIZATION`` pages are requested.
    * ``company_inn`` is ``None``: only records accepted by
      ``_vacancy_record_matches_registry_target`` are kept, and up to
      ``VACANCY_REGISTRY_TEXT_SEARCH_MAX_PAGES_PER_ORGANIZATION`` pages are
      requested.

    Paging stops early as soon as a page is shorter than ``page_size``.

    Raises:
        RuntimeError: in INN mode, when the page budget is used up and the
            last page was still full.
    """
    by_employer_name = company_inn is None
    if by_employer_name:
        page_budget = VACANCY_REGISTRY_TEXT_SEARCH_MAX_PAGES_PER_ORGANIZATION
    else:
        page_budget = VACANCY_REGISTRY_MAX_PAGES_PER_ORGANIZATION

    collected: list[GenericParserItem] = []
    for page_number in range(page_budget):
        page = source_client.fetch_vacancies(
            limit=page_size,
            offset=page_number * page_size,
            company_inn=company_inn,
            text=text,
        )
        if by_employer_name:
            collected.extend(
                item
                for item in page
                if _vacancy_record_matches_registry_target(item, target)
            )
        else:
            collected.extend(page)

        # A short page means the source has nothing more to return.
        if len(page) < page_size:
            return collected

    # Running out of the text-search budget is not an error: return what matched.
    if by_employer_name:
        return collected

    raise RuntimeError(
        "Vacancy registry organization page limit reached "
        f"for organization {target.organization_id} ({target.inn})"
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _vacancy_record_matches_registry_target(
    record: GenericParserItem,
    target: RegistryLookupTarget,
) -> bool:
    """Tell whether a vacancy's employer name matches the registry organization.

    Both names are reduced with ``_vacancy_employer_match_key``. The record
    matches when the keys are equal, or when both keys are at least 8
    characters long and one contains the other. An empty key on either side
    never matches.
    """
    wanted = _vacancy_employer_match_key(target.name)
    found = _vacancy_employer_match_key(_vacancy_record_employer_name(record))
    if not (wanted and found):
        return False
    if wanted == found:
        return True

    # Only the shorter key can be contained in the longer one; unequal keys
    # of the same length can never contain each other.
    shorter, longer = sorted((wanted, found), key=len)
    return len(shorter) >= 8 and shorter in longer
|
||||||
|
|
||||||
|
|
||||||
|
def _vacancy_registry_text_query(target: RegistryLookupTarget) -> str:
    """Build the text-search query for an organization.

    Uses the key produced by ``_vacancy_employer_match_key`` and falls back
    to the raw ``target.name`` when that key is empty.
    """
    normalized = _vacancy_employer_match_key(target.name)
    if normalized:
        return normalized
    return target.name
|
||||||
|
|
||||||
|
|
||||||
|
def _vacancy_record_employer_name(record: GenericParserItem) -> str:
    """Return the employer name carried by a vacancy record, or ``""``.

    Candidates are tried in this order and the first non-blank one wins:

    1. ``record.organisation_name``;
    2. ``payload["employer"]["name"]``, then ``payload["company"]["name"]``
       (only when the nested value is a dict);
    3. ``payload["firm_name"]``, ``payload["company_name"]``,
       ``payload["organisation_name"]``.

    A payload that is not a dict is treated as empty. Every candidate is
    coerced to ``str`` and stripped; a whitespace-only candidate counts as
    missing, so the later fallbacks are still consulted.
    """

    def _candidates():
        # Lazy, so the payload is only inspected when the direct attribute
        # does not provide a usable name.
        yield record.organisation_name
        payload = record.payload if isinstance(record.payload, dict) else {}
        for key in ("employer", "company"):
            nested = payload.get(key)
            if isinstance(nested, dict):
                yield nested.get("name")
        for key in ("firm_name", "company_name", "organisation_name"):
            yield payload.get(key)

    for candidate in _candidates():
        if not candidate:
            continue
        name = str(candidate).strip()
        if name:
            return name
    return ""
|
||||||
|
|
||||||
|
|
||||||
|
def _vacancy_employer_match_key(name: str | None) -> str:
    """Reduce an employer name to a comparison key.

    The name is case-folded and "ё" is replaced with "е". It is then split
    into words with ``VACANCY_EMPLOYER_WORD_RE``, words listed in
    ``VACANCY_EMPLOYER_IGNORED_WORDS`` are dropped, and the rest are joined
    with single spaces.

    Returns ``""`` for a missing or empty name, or when every word is
    ignored.
    """
    if not name:
        # A missing name has no key; avoids AttributeError on None.
        return ""
    normalized = str(name).casefold().replace("ё", "е")
    return " ".join(
        word
        for word in VACANCY_EMPLOYER_WORD_RE.findall(normalized)
        if word not in VACANCY_EMPLOYER_IGNORED_WORDS
    )
|
||||||
|
|
||||||
|
|
||||||
def _fetch_registry_organization_vacancy_records(
|
def _fetch_registry_organization_vacancy_records(
|
||||||
*,
|
*,
|
||||||
proxies: list[str] | None,
|
proxies: list[str] | None,
|
||||||
@@ -2966,11 +3115,10 @@ def _fetch_registry_organization_vacancy_records(
|
|||||||
records: list[GenericParserItem] = []
|
records: list[GenericParserItem] = []
|
||||||
errors: list[str] = []
|
errors: list[str] = []
|
||||||
successful_fetches = 0
|
successful_fetches = 0
|
||||||
sources = vacancy_sources or ["trudvsem"]
|
|
||||||
with VacanciesClient(
|
with VacanciesClient(
|
||||||
proxies=proxies,
|
proxies=proxies,
|
||||||
superjob_app_id=getattr(settings, "SUPERJOB_APP_ID", ""),
|
superjob_app_id=getattr(settings, "SUPERJOB_APP_ID", ""),
|
||||||
sources=sources,
|
sources=vacancy_sources,
|
||||||
) as client:
|
) as client:
|
||||||
for target in targets:
|
for target in targets:
|
||||||
try:
|
try:
|
||||||
|
|||||||
@@ -2191,6 +2191,160 @@ class ParseVacanciesTaskTestCase(TestCase):
|
|||||||
{"trudvsem:7701000102"},
|
{"trudvsem:7701000102"},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@override_settings(SUPERJOB_APP_ID="test-superjob-app-id")
def test_parse_trudvsem_vacancies_matches_job_boards_by_employer_name(self):
    """Sources without INN support are searched by name and filtered by employer.

    The stub client exposes three providers. ``trudvsem`` is queried by INN;
    ``hh`` and ``superjob`` receive a text query and each return one vacancy
    of the target employer plus one unrelated vacancy. Only the matching
    vacancies must be saved.
    """
    organization = OrganizationFactory(
        pn_name='Общество с ограниченной ответственностью "Ромашка"',
        mn_inn=7701000301,
        mn_ogrn=1027700000301,
    )
    RegistryMembershipPeriodFactory(organization=organization, ended_at=None)
    client_init_kwargs = {}
    text_queries_by_source = {}

    def _item(external_id, title, vacancy_source, **extra):
        # Every stub record shares the same source enum and payload shape.
        return GenericParserItem(
            source=ParserLoadLog.Source.TRUDVSEM,
            external_id=external_id,
            title=title,
            payload={"vacancy_source": vacancy_source},
            **extra,
        )

    def _trudvsem_item(company_inn):
        return _item(
            "trudvsem:romashka",
            "Работа России",
            "trudvsem",
            inn=company_inn,
        )

    class _Provider:
        def __init__(self, source_name, *, supports_company_inn):
            self.source_name = source_name
            self.supports_company_inn = supports_company_inn

        def fetch_vacancies(self, **kwargs):
            if self.source_name == "trudvsem":
                return [_trudvsem_item(kwargs["company_inn"])]

            # Name-based providers record the query they were given.
            text_queries_by_source[self.source_name] = kwargs["text"]
            return [
                _item(
                    f"{self.source_name}:romashka",
                    f"{self.source_name} matching vacancy",
                    self.source_name,
                    organisation_name='ООО "Ромашка"',
                ),
                _item(
                    f"{self.source_name}:other",
                    f"{self.source_name} unrelated vacancy",
                    self.source_name,
                    organisation_name='ООО "Лютик"',
                ),
            ]

    class _VacanciesClient:
        def __init__(self, **kwargs):
            client_init_kwargs.update(kwargs)

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            return None

        def fetch_vacancies(self, **kwargs):
            return [_trudvsem_item(kwargs["company_inn"])]

        def iter_source_clients(self):
            return [
                ("trudvsem", _Provider("trudvsem", supports_company_inn=True)),
                ("hh", _Provider("hh", supports_company_inn=False)),
                ("superjob", _Provider("superjob", supports_company_inn=False)),
            ]

    # Swap the client class on the tasks module for the duration of the call.
    real_client_cls = parser_tasks.VacanciesClient
    parser_tasks.VacanciesClient = _VacanciesClient
    try:
        result = parse_trudvsem_vacancies(limit=50, proxies=[])
    finally:
        parser_tasks.VacanciesClient = real_client_cls

    self.assertEqual(result["status"], "success")
    self.assertIsNone(client_init_kwargs["sources"])
    self.assertEqual(
        text_queries_by_source,
        {
            "hh": "ромашка",
            "superjob": "ромашка",
        },
    )
    self.assertEqual(result["saved"], 3)

    saved_rows = set(
        GenericParserRecord.objects.values_list(
            "external_id",
            "source",
            "registry_organization_id",
        )
    )
    self.assertEqual(
        saved_rows,
        {
            ("trudvsem:romashka", "trudvsem", organization.id),
            ("hh:romashka", "hh", organization.id),
            ("superjob:romashka", "superjob", organization.id),
        },
    )
|
||||||
|
|
||||||
|
def test_registry_job_board_matching_fetches_only_first_text_search_page(self):
    """A name-based source is asked for a single page even when that page is full.

    With ``limit=1`` the stub provider always returns one record, so only
    the text-search page budget can stop paging. Exactly one request, at
    offset 0, is expected.
    """
    organization = OrganizationFactory(
        pn_name='Общество с ограниченной ответственностью "Ромашка"',
        mn_inn=7701000302,
        mn_ogrn=1027700000302,
    )
    RegistryMembershipPeriodFactory(organization=organization, ended_at=None)
    requested_offsets = []

    class _Provider:
        supports_company_inn = False

        def fetch_vacancies(self, **kwargs):
            requested_offsets.append(kwargs["offset"])
            matching_item = GenericParserItem(
                source=ParserLoadLog.Source.TRUDVSEM,
                external_id=f"hh:romashka:{kwargs['offset']}",
                organisation_name='ООО "Ромашка"',
                title="HeadHunter",
                payload={"vacancy_source": "hh"},
            )
            return [matching_item]

    class _VacanciesClient:
        def __init__(self, **kwargs):
            pass

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            return None

        def iter_source_clients(self):
            return [("hh", _Provider())]

    # Swap the client class on the tasks module for the duration of the call.
    real_client_cls = parser_tasks.VacanciesClient
    parser_tasks.VacanciesClient = _VacanciesClient
    try:
        result = parse_trudvsem_vacancies(limit=1, proxies=[])
    finally:
        parser_tasks.VacanciesClient = real_client_cls

    self.assertEqual(result["status"], "success")
    self.assertEqual(result["saved"], 1)
    self.assertEqual(requested_offsets, [0])
|
||||||
|
|
||||||
@override_settings(SUPERJOB_APP_ID="test-superjob-app-id")
|
@override_settings(SUPERJOB_APP_ID="test-superjob-app-id")
|
||||||
def test_parse_trudvsem_vacancies_uses_combined_vacancies_client(self):
|
def test_parse_trudvsem_vacancies_uses_combined_vacancies_client(self):
|
||||||
captured_kwargs = {}
|
captured_kwargs = {}
|
||||||
|
|||||||
Reference in New Issue
Block a user