

openml.tasks.functions #

__list_tasks #

__list_tasks(api_call: str) -> DataFrame

Returns a Pandas DataFrame with information about OpenML tasks.

PARAMETER DESCRIPTION
api_call

The API call specifying which tasks to return.

TYPE: str

RETURNS DESCRIPTION
A Pandas DataFrame with information about OpenML tasks.
RAISES DESCRIPTION
ValueError

If the XML returned by the OpenML API does not contain 'oml:tasks', '@xmlns:oml', or has an incorrect value for '@xmlns:oml'.

KeyError

If an invalid key is found in the XML for a task.
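A minimal sketch of the XML shape this helper expects, with element names taken from the keys accessed in the source below; the real server response contains additional elements (task inputs, qualities), and the values here are purely illustrative:

import xmltodict

xml_string = """
<oml:tasks xmlns:oml="http://openml.org/openml">
  <oml:task>
    <oml:task_id>1</oml:task_id>
    <oml:task_type_id>1</oml:task_type_id>
    <oml:task_type>Supervised Classification</oml:task_type>
    <oml:did>1</oml:did>
    <oml:name>example-dataset</oml:name>
    <oml:status>active</oml:status>
  </oml:task>
</oml:tasks>
"""

# force_list guarantees lists even when only a single task or input is
# present, matching the call made in __list_tasks below.
tasks_dict = xmltodict.parse(xml_string, force_list=("oml:task", "oml:input"))
print(tasks_dict["oml:tasks"]["oml:task"][0]["oml:name"])  # example-dataset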

Source code in openml/tasks/functions.py
def __list_tasks(api_call: str) -> pd.DataFrame:  # noqa: C901, PLR0912
    """Returns a Pandas DataFrame with information about OpenML tasks.

    Parameters
    ----------
    api_call : str
        The API call specifying which tasks to return.

    Returns
    -------
        A Pandas DataFrame with information about OpenML tasks.

    Raises
    ------
    ValueError
        If the XML returned by the OpenML API does not contain 'oml:tasks', '@xmlns:oml',
        or has an incorrect value for '@xmlns:oml'.
    KeyError
        If an invalid key is found in the XML for a task.
    """
    xml_string = openml._api_calls._perform_api_call(api_call, "get")
    tasks_dict = xmltodict.parse(xml_string, force_list=("oml:task", "oml:input"))
    # Minimalistic check if the XML is useful
    if "oml:tasks" not in tasks_dict:
        raise ValueError(f'Error in return XML, does not contain "oml:runs": {tasks_dict}')

    if "@xmlns:oml" not in tasks_dict["oml:tasks"]:
        raise ValueError(
            f'Error in return XML, does not contain "oml:tasks"/@xmlns:oml: {tasks_dict}'
        )

    if tasks_dict["oml:tasks"]["@xmlns:oml"] != "http://openml.org/openml":
        raise ValueError(
            "Error in return XML, value of  "
            '"oml:runs"/@xmlns:oml is not '
            f'"http://openml.org/openml": {tasks_dict!s}',
        )

    assert isinstance(tasks_dict["oml:tasks"]["oml:task"], list), type(tasks_dict["oml:tasks"])

    tasks = {}
    procs = _get_estimation_procedure_list()
    proc_dict = {x["id"]: x for x in procs}

    for task_ in tasks_dict["oml:tasks"]["oml:task"]:
        tid = None
        try:
            tid = int(task_["oml:task_id"])
            task_type_int = int(task_["oml:task_type_id"])
            try:
                task_type_id = TaskType(task_type_int)
            except ValueError as e:
                warnings.warn(
                    f"Could not create task type id for {task_type_int} due to error {e}",
                    RuntimeWarning,
                    stacklevel=2,
                )
                continue

            task = {
                "tid": tid,
                "ttid": task_type_id,
                "did": int(task_["oml:did"]),
                "name": task_["oml:name"],
                "task_type": task_["oml:task_type"],
                "status": task_["oml:status"],
            }

            # Other task inputs
            for _input in task_.get("oml:input", []):
                if _input["@name"] == "estimation_procedure":
                    task[_input["@name"]] = proc_dict[int(_input["#text"])]["name"]
                else:
                    value = _input.get("#text")
                    task[_input["@name"]] = value

            # The number of qualities can range from 0 to infinity
            for quality in task_.get("oml:quality", []):
                if "#text" not in quality:
                    quality_value = 0.0
                else:
                    quality["#text"] = float(quality["#text"])
                    if abs(int(quality["#text"]) - quality["#text"]) < 0.0000001:
                        quality["#text"] = int(quality["#text"])
                    quality_value = quality["#text"]
                task[quality["@name"]] = quality_value
            tasks[tid] = task
        except KeyError as e:
            if tid is not None:
                warnings.warn(
                    f"Invalid xml for task {tid}: {e}\nFrom {task_}",
                    RuntimeWarning,
                    stacklevel=2,
                )
            else:
                warnings.warn(f"Could not find key {e} in {task_}!", RuntimeWarning, stacklevel=2)

    return pd.DataFrame.from_dict(tasks, orient="index")

create_task #

create_task(task_type: TaskType, dataset_id: int, estimation_procedure_id: int, target_name: str | None = None, evaluation_measure: str | None = None, **kwargs: Any) -> OpenMLClassificationTask | OpenMLRegressionTask | OpenMLLearningCurveTask | OpenMLClusteringTask

Create a task based on different given attributes.

Builds a task object with the function arguments as attributes. The type of the task object built is determined by the task type id. More information on how the arguments (task attributes) relate to the different possible tasks can be found in the individual task classes in the openml.tasks.task module.

PARAMETER DESCRIPTION
task_type

Id of the task type.

TYPE: TaskType

dataset_id

The id of the dataset for the task.

TYPE: int

target_name

The name of the feature used as the target. Currently only optional for clustering tasks.

TYPE: str DEFAULT: None

estimation_procedure_id

The id of the estimation procedure.

TYPE: int

evaluation_measure

The name of the evaluation measure.

TYPE: str DEFAULT: None

kwargs

Other task attributes that are not mandatory for task upload.

TYPE: dict DEFAULT: {}

RETURNS DESCRIPTION
OpenMLClassificationTask | OpenMLRegressionTask | OpenMLLearningCurveTask | OpenMLClusteringTask
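A minimal usage sketch: the dataset id, target name, estimation procedure id, and evaluation measure below are illustrative values, not recommendations; check the dataset and the available estimation procedures on the OpenML server first.

import openml
from openml.tasks import TaskType

# Sketch: build a supervised classification task object locally.
task = openml.tasks.create_task(
    task_type=TaskType.SUPERVISED_CLASSIFICATION,
    dataset_id=31,                       # illustrative dataset id
    target_name="class",                 # illustrative target column
    estimation_procedure_id=1,           # illustrative estimation procedure id
    evaluation_measure="predictive_accuracy",
)

# Uploading the task to the server requires an API key and is optional here.
# task.publish()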
Source code in openml/tasks/functions.py
def create_task(
    task_type: TaskType,
    dataset_id: int,
    estimation_procedure_id: int,
    target_name: str | None = None,
    evaluation_measure: str | None = None,
    **kwargs: Any,
) -> (
    OpenMLClassificationTask | OpenMLRegressionTask | OpenMLLearningCurveTask | OpenMLClusteringTask
):
    """Create a task based on different given attributes.

    Builds a task object with the function arguments as
    attributes. The type of the task object built is
    determined from the task type id.
    More information on how the arguments (task attributes)
    relate to the different possible tasks can be found in
    the individual task objects in the openml.tasks.task
    module.

    Parameters
    ----------
    task_type : TaskType
        Id of the task type.
    dataset_id : int
        The id of the dataset for the task.
    target_name : str, optional
        The name of the feature used as a target.
        At the moment, only optional for the clustering tasks.
    estimation_procedure_id : int
        The id of the estimation procedure.
    evaluation_measure : str, optional
        The name of the evaluation measure.
    kwargs : dict, optional
        Other task attributes that are not mandatory
        for task upload.

    Returns
    -------
    OpenMLClassificationTask, OpenMLRegressionTask,
    OpenMLLearningCurveTask, OpenMLClusteringTask
    """
    if task_type == TaskType.CLUSTERING:
        task_cls = OpenMLClusteringTask
    elif task_type == TaskType.LEARNING_CURVE:
        task_cls = OpenMLLearningCurveTask  # type: ignore
    elif task_type == TaskType.SUPERVISED_CLASSIFICATION:
        task_cls = OpenMLClassificationTask  # type: ignore
    elif task_type == TaskType.SUPERVISED_REGRESSION:
        task_cls = OpenMLRegressionTask  # type: ignore
    else:
        raise NotImplementedError(
            f"Task type ID {task_type:d} is not supported. "
            f"Supported task type IDs: {TaskType.SUPERVISED_CLASSIFICATION.value},"
            f"{TaskType.SUPERVISED_REGRESSION.value}, "
            f"{TaskType.CLUSTERING.value}, {TaskType.LEARNING_CURVE.value}. "
            f"Please refer to the TaskType enum for valid task type identifiers."
        )

    return task_cls(
        task_type_id=task_type,
        task_type="None",  # TODO: refactor to get task type string from ID.
        data_set_id=dataset_id,
        target_name=target_name,  # type: ignore
        estimation_procedure_id=estimation_procedure_id,
        evaluation_measure=evaluation_measure,
        **kwargs,
    )

delete_task #

delete_task(task_id: int) -> bool

Delete task with id task_id from the OpenML server.

You can only delete tasks that you created and that have no runs associated with them.

PARAMETER DESCRIPTION
task_id

OpenML id of the task

TYPE: int

RETURNS DESCRIPTION
bool

True if the deletion was successful. False otherwise.
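A short sketch, assuming you own the task and have configured an API key; the task id is illustrative.

import openml

# Deletion requires authentication and only works for tasks you created
# that have no associated runs; 12345 is an illustrative id.
openml.config.apikey = "YOUR_API_KEY"
if openml.tasks.delete_task(12345):
    print("Task deleted.")
else:
    print("Deletion failed.")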

Source code in openml/tasks/functions.py
def delete_task(task_id: int) -> bool:
    """Delete task with id `task_id` from the OpenML server.

    You can only delete tasks which you created and have
    no runs associated with them.

    Parameters
    ----------
    task_id : int
        OpenML id of the task

    Returns
    -------
    bool
        True if the deletion was successful. False otherwise.
    """
    return openml.utils._delete_entity("task", task_id)

get_task #

get_task(task_id: int, download_splits: bool = False, **get_dataset_kwargs: Any) -> OpenMLTask

Download OpenML task for a given task ID.

Downloads the task representation.

Use the download_splits parameter to control whether the splits are downloaded. Moreover, you may pass additional keyword arguments that are forwarded to openml.datasets.get_dataset.

PARAMETER DESCRIPTION
task_id

The OpenML task id of the task to download.

TYPE: int

download_splits

Whether to download the splits as well.

TYPE: bool DEFAULT: False

get_dataset_kwargs

Keyword arguments used to pass optional parameters to openml.datasets.get_dataset.

TYPE: Any DEFAULT: {}

RETURNS DESCRIPTION
task

TYPE: OpenMLTask
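A brief usage sketch; the task id is illustrative, and the split-index helper shown at the end is only available for supervised tasks.

import openml

# Fetch the task description; request the splits as well and skip the
# dataset qualities (download_qualities is forwarded to openml.datasets.get_dataset).
task = openml.tasks.get_task(31, download_splits=True, download_qualities=False)
print(task.task_type, task.dataset_id)

# For supervised tasks, the splits provide train/test indices per fold.
train_idx, test_idx = task.get_train_test_split_indices(fold=0)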

Source code in openml/tasks/functions.py
@openml.utils.thread_safe_if_oslo_installed
def get_task(
    task_id: int,
    download_splits: bool = False,  # noqa: FBT002
    **get_dataset_kwargs: Any,
) -> OpenMLTask:
    """Download OpenML task for a given task ID.

    Downloads the task representation.

    Use the `download_splits` parameter to control whether the splits are downloaded.
    Moreover, you may pass additional keyword arguments that are forwarded to
    :meth:`openml.datasets.get_dataset`.

    Parameters
    ----------
    task_id : int
        The OpenML task id of the task to download.
    download_splits: bool (default=False)
        Whether to download the splits as well.
    get_dataset_kwargs :
        Keyword arguments can be used to pass optional parameters to :meth:`openml.datasets.get_dataset`.

    Returns
    -------
    task: OpenMLTask
    """
    if not isinstance(task_id, int):
        raise TypeError(f"Task id should be integer, is {type(task_id)}")

    cache_key_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, task_id)
    tid_cache_dir = cache_key_dir / str(task_id)
    tid_cache_dir_existed = tid_cache_dir.exists()
    try:
        task = _get_task_description(task_id)
        dataset = get_dataset(task.dataset_id, **get_dataset_kwargs)
        # List of class labels available in dataset description
        # Including class labels as part of task meta data handles
        #   the case where data download was initially disabled
        if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):
            task.class_labels = dataset.retrieve_class_labels(task.target_name)
        # Clustering tasks do not have class labels
        # and do not offer download_split
        if download_splits and isinstance(task, OpenMLSupervisedTask):
            task.download_split()
    except Exception as e:
        if not tid_cache_dir_existed:
            openml.utils._remove_cache_dir_for_id(TASKS_CACHE_DIR_NAME, tid_cache_dir)
        raise e

    return task

get_tasks #

get_tasks(task_ids: list[int], download_data: bool | None = None, download_qualities: bool | None = None) -> list[OpenMLTask]

Download tasks.

This function calls openml.tasks.get_task for each given task id.

PARAMETER DESCRIPTION
task_ids

A list of task ids to download.

TYPE: List[int]

download_data

Option to trigger download of data along with the meta data.

TYPE: bool DEFAULT: None (currently treated as True; will default to False starting in 0.16)

download_qualities

Option to download 'qualities' meta-data in addition to the minimal dataset description.

TYPE: bool DEFAULT: None (currently treated as True; will default to False starting in 0.16)

RETURNS DESCRIPTION
list[OpenMLTask]
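A short sketch; the task ids are illustrative, and the download flags are passed explicitly to avoid the deprecation warnings emitted in the source below.

import openml

tasks = openml.tasks.get_tasks([31, 59], download_data=False, download_qualities=False)
for task in tasks:
    print(task.task_id, task.dataset_id)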
Source code in openml/tasks/functions.py
def get_tasks(
    task_ids: list[int],
    download_data: bool | None = None,
    download_qualities: bool | None = None,
) -> list[OpenMLTask]:
    """Download tasks.

    This function calls :meth:`openml.tasks.get_task` for each given task id.

    Parameters
    ----------
    task_ids : List[int]
        A list of task ids to download.
    download_data : bool (default = True)
        Option to trigger download of data along with the meta data.
    download_qualities : bool (default=True)
        Option to download 'qualities' meta-data in addition to the minimal dataset description.

    Returns
    -------
    list
    """
    if download_data is None:
        warnings.warn(
            "`download_data` will default to False starting in 0.16. "
            "Please set `download_data` explicitly to suppress this warning.",
            stacklevel=1,
        )
        download_data = True

    if download_qualities is None:
        warnings.warn(
            "`download_qualities` will default to False starting in 0.16. "
            "Please set `download_qualities` explicitly to suppress this warning.",
            stacklevel=1,
        )
        download_qualities = True

    tasks = []
    for task_id in task_ids:
        tasks.append(
            get_task(task_id, download_data=download_data, download_qualities=download_qualities)
        )
    return tasks

list_tasks #

list_tasks(task_type: TaskType | None = None, offset: int | None = None, size: int | None = None, tag: str | None = None, data_tag: str | None = None, status: str | None = None, data_name: str | None = None, data_id: int | None = None, number_instances: int | None = None, number_features: int | None = None, number_classes: int | None = None, number_missing_values: int | None = None) -> DataFrame

Return a number of tasks having the given tag and task_type.

Note: the task_type filter is separated from the other filters because it is named task_type in the task description, but type when used as a filter in the list tasks call.

PARAMETER DESCRIPTION
offset

the number of tasks to skip, starting from the first

TYPE: int DEFAULT: None

task_type

Refers to the type of task.

TYPE: TaskType DEFAULT: None

size

the maximum number of tasks to show

TYPE: int DEFAULT: None

tag

the tag to include

TYPE: str DEFAULT: None

data_tag

the tag of the dataset

TYPE: str DEFAULT: None

data_id

TYPE: int DEFAULT: None

status

TYPE: str DEFAULT: None

data_name

TYPE: str DEFAULT: None

number_instances

TYPE: int DEFAULT: None

number_features

TYPE: int DEFAULT: None

number_classes

TYPE: int DEFAULT: None

number_missing_values

TYPE: int DEFAULT: None

RETURNS DESCRIPTION
dataframe

All tasks having the given task_type and the given tag. Every task is represented by a row in the data frame containing the following information as columns: task id, dataset id, task_type and status. If qualities are calculated for the associated dataset, some of these are also returned.
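A minimal usage sketch; the size and status filters are optional, and the column names follow the listing helper above.

import openml
from openml.tasks import TaskType

# List up to 100 active supervised classification tasks as a DataFrame.
df = openml.tasks.list_tasks(
    task_type=TaskType.SUPERVISED_CLASSIFICATION,
    status="active",
    size=100,
)
print(df[["tid", "did", "name", "task_type", "status"]].head())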

Source code in openml/tasks/functions.py
def list_tasks(  # noqa: PLR0913
    task_type: TaskType | None = None,
    offset: int | None = None,
    size: int | None = None,
    tag: str | None = None,
    data_tag: str | None = None,
    status: str | None = None,
    data_name: str | None = None,
    data_id: int | None = None,
    number_instances: int | None = None,
    number_features: int | None = None,
    number_classes: int | None = None,
    number_missing_values: int | None = None,
) -> pd.DataFrame:
    """
    Return a number of tasks having the given tag and task_type.

    The ``task_type`` filter is separated from the other filters because
    it is used as ``task_type`` in the task description, but it is named
    ``type`` when used as a filter in the list tasks call.

    Parameters
    ----------
    offset : int, optional
        the number of tasks to skip, starting from the first
    task_type : TaskType, optional
        Refers to the type of task.
    size : int, optional
        the maximum number of tasks to show
    tag : str, optional
        the tag to include
    data_tag : str, optional
        the tag of the dataset
    data_id : int, optional
    status : str, optional
    data_name : str, optional
    number_instances : int, optional
    number_features : int, optional
    number_classes : int, optional
    number_missing_values : int, optional

    Returns
    -------
    dataframe
        All tasks having the given task_type and the given tag. Every task is
        represented by a row in the data frame containing the following information
        as columns: task id, dataset id, task_type and status. If qualities are
        calculated for the associated dataset, some of these are also returned.
    """
    listing_call = partial(
        _list_tasks,
        task_type=task_type,
        tag=tag,
        data_tag=data_tag,
        status=status,
        data_id=data_id,
        data_name=data_name,
        number_instances=number_instances,
        number_features=number_features,
        number_classes=number_classes,
        number_missing_values=number_missing_values,
    )
    batches = openml.utils._list_all(listing_call, offset=offset, limit=size)
    if len(batches) == 0:
        return pd.DataFrame()

    return pd.concat(batches)