

openml.testing #

CustomImputer #

Bases: Imputer

Duplicate class alias for sklearn's SimpleImputer.

Helps bypass the sklearn extension's duplicate-operation check.
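
For illustration, a minimal sketch of how the alias might be used: because CustomImputer has its own class identity, a pipeline can contain both an original and an aliased imputer without the extension's duplicate check flagging it. The pipeline layout and step names below are assumptions, not from the source.

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

from openml.testing import CustomImputer

# Hypothetical pipeline with two imputation steps: CustomImputer is a
# distinct class from SimpleImputer, so the sklearn extension does not
# treat the second step as a duplicate of the first.
pipe = Pipeline(
    steps=[
        ("impute_mean", SimpleImputer(strategy="mean")),
        ("impute_median", CustomImputer(strategy="median")),
    ]
)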

TestBase #

Bases: TestCase

Base class for tests

Note#

Currently hard-codes a read-write API key. Hopefully it will soon allow using a test server instead of the production server.
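
For illustration, a minimal sketch of a test case built on this class; the subclass and test names are hypothetical. setUp and tearDown from TestBase handle the server configuration and temporary working directory automatically.

import unittest

from openml.testing import TestBase


class TestExample(TestBase):  # hypothetical test case
    def test_static_cache_dir_exists(self) -> None:
        # By this point setUp has pointed openml.config at the test server,
        # created a temporary working directory, and resolved the static
        # files directory into self.static_cache_dir.
        self.assertTrue(self.static_cache_dir.exists())


if __name__ == "__main__":
    unittest.main()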

setUp #

setUp(n_levels: int = 1, tmpdir_suffix: str = '') -> None

Set up variables and temporary directories.

In particular, this method:

  • creates a temporary working directory
  • figures out a path to a few static test files
  • sets the default server to be the test server
  • sets a static API key for the test server
  • increases the maximal number of retries
Parameters#

n_levels : int
    Number of nested directories the test is in. Necessary to resolve the path to the files directory, which is located directly under the tests directory.
tmpdir_suffix : str
    Optional suffix appended to the name of the temporary working directory, which is derived from the test id.

Source code in openml/testing.py
def setUp(self, n_levels: int = 1, tmpdir_suffix: str = "") -> None:
    """Setup variables and temporary directories.

    In particular, this methods:

    * creates a temporary working directory
    * figures out a path to a few static test files
    * set the default server to be the test server
    * set a static API key for the test server
    * increases the maximal number of retries

    Parameters
    ----------
    n_levels : int
        Number of nested directories the test is in. Necessary to resolve the path to the
        ``files`` directory, which is located directly under the ``tests`` directory.
    tmpdir_suffix : str
        Optional suffix appended to the name of the temporary working directory,
        which is derived from the test id.
    """
    # This cache directory is checked in to git to simulate a populated
    # cache
    self.maxDiff = None
    abspath_this_file = Path(inspect.getfile(self.__class__)).absolute()
    static_cache_dir = abspath_this_file.parent
    for _ in range(n_levels):
        static_cache_dir = static_cache_dir.parent.absolute()

    content = os.listdir(static_cache_dir)
    if "files" in content:
        static_cache_dir = static_cache_dir / "files"
    else:
        raise ValueError(
            f"Cannot find test cache dir, expected it to be {static_cache_dir}!",
        )

    self.static_cache_dir = static_cache_dir
    self.cwd = Path.cwd()
    workdir = Path(__file__).parent.absolute()
    tmp_dir_name = self.id() + tmpdir_suffix
    self.workdir = workdir / tmp_dir_name
    shutil.rmtree(self.workdir, ignore_errors=True)

    self.workdir.mkdir(exist_ok=True)
    os.chdir(self.workdir)

    self.cached = True
    openml.config.apikey = TestBase.apikey
    self.production_server = "https://www.openml.org/api/v1/xml"
    openml.config.set_root_cache_directory(str(self.workdir))

    # Increase the number of retries to avoid spurious server failures
    self.retry_policy = openml.config.retry_policy
    self.connection_n_retries = openml.config.connection_n_retries
    openml.config.set_retry_policy("robot", n_retries=20)
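
For a test module nested more deeply than the default, setUp can be re-invoked with a larger n_levels. A minimal sketch, assuming a hypothetical module two directories below tests:

from openml.testing import TestBase


class TestNested(TestBase):  # hypothetical, e.g. tests/sub/test_nested.py
    def setUp(self) -> None:
        # two parent hops are needed before the files directory is found
        super().setUp(n_levels=2, tmpdir_suffix="-nested")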

tearDown #

tearDown() -> None

Tear down the test

Source code in openml/testing.py
def tearDown(self) -> None:
    """Tear down the test"""
    os.chdir(self.cwd)
    try:
        shutil.rmtree(self.workdir)
    except PermissionError as e:
        # on Windows ("nt") one of the files may still be held open by
        # another process, so the error is only re-raised on other platforms
        if os.name != "nt":
            raise e

    openml.config.connection_n_retries = self.connection_n_retries
    openml.config.retry_policy = self.retry_policy

check_task_existence #

check_task_existence(task_type: TaskType, dataset_id: int, target_name: str, **kwargs: dict[str, str | int | dict[str, str | int | TaskType]]) -> int | None

Checks if any task exists on the test server that matches the given metadata.

Parameters#

task_type : openml.tasks.TaskType
dataset_id : int
target_name : str

Returns#

int | None
    The id of the first matching task, or None if no task matches.

Source code in openml/testing.py
def check_task_existence(
    task_type: TaskType,
    dataset_id: int,
    target_name: str,
    **kwargs: dict[str, str | int | dict[str, str | int | openml.tasks.TaskType]],
) -> int | None:
    """Checks if any task with exists on test server that matches the meta data.

    Parameter
    ---------
    task_type : openml.tasks.TaskType
    dataset_id : int
    target_name : str

    Return
    ------
    int, None
    """
    return_val = None
    tasks = openml.tasks.list_tasks(task_type=task_type)
    if len(tasks) == 0:
        return None
    tasks = tasks.loc[tasks["did"] == dataset_id]
    if len(tasks) == 0:
        return None
    tasks = tasks.loc[tasks["target_feature"] == target_name]
    if len(tasks) == 0:
        return None
    task_match = []
    for task_id in tasks["tid"].to_list():
        task_match.append(task_id)
        try:
            task = openml.tasks.get_task(task_id)
        except OpenMLServerException:
            # can fail if the task was deleted by another unit test running in parallel
            task_match.pop(-1)
            return_val = None
            continue
        for k, v in kwargs.items():
            if getattr(task, k) != v:
                # even if one of the meta-data key mismatches, then task_id is not a match
                task_match.pop(-1)
                break
        # if task_id is retained in the task_match list, it passed all meta key-value matches
        if len(task_match) == 1:
            return_val = task_id
            break
    if len(task_match) == 0:
        return_val = None
    return return_val
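
A hedged usage sketch, assuming the function is importable at module level (consistent with the self-less signature above). The dataset id, target name, and the extra keyword are illustrative; each keyword is compared against the task attribute of the same name via the **kwargs loop shown in the source.

from openml.tasks import TaskType
from openml.testing import check_task_existence

# Hypothetical lookup: a supervised classification task on dataset 20 with
# target "class"; "evaluation_measure" is an assumed attribute name here.
task_id = check_task_existence(
    task_type=TaskType.SUPERVISED_CLASSIFICATION,
    dataset_id=20,
    target_name="class",
    evaluation_measure="predictive_accuracy",
)
if task_id is None:
    print("no matching task found; a test would typically create one here")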