Skip to content

tasks

openml.tasks #

OpenMLClassificationTask #

OpenMLClassificationTask(task_type_id: TaskType, task_type: str, data_set_id: int, target_name: str, estimation_procedure_id: int = 1, estimation_procedure_type: str | None = None, estimation_parameters: dict[str, str] | None = None, evaluation_measure: str | None = None, data_splits_url: str | None = None, task_id: int | None = None, class_labels: list[str] | None = None, cost_matrix: ndarray | None = None)

Bases: OpenMLSupervisedTask

OpenML Classification object.

PARAMETER DESCRIPTION
task_type_id

ID of the Classification task type.

TYPE: TaskType

task_type

Name of the Classification task type.

TYPE: str

data_set_id

ID of the OpenML dataset associated with the Classification task.

TYPE: int

target_name

Name of the target variable.

TYPE: str

estimation_procedure_id

ID of the estimation procedure for the Classification task.

TYPE: int DEFAULT: 1

estimation_procedure_type

Type of the estimation procedure.

TYPE: str DEFAULT: None

estimation_parameters

Estimation parameters for the Classification task.

TYPE: dict DEFAULT: None

evaluation_measure

Name of the evaluation measure.

TYPE: str DEFAULT: None

data_splits_url

URL of the data splits for the Classification task.

TYPE: str DEFAULT: None

task_id

ID of the Classification task (if it already exists on OpenML).

TYPE: Union[int, None] DEFAULT: None

class_labels

A list of class labels (for classification tasks).

TYPE: List of str DEFAULT: None

cost_matrix

A cost matrix (for classification tasks).

TYPE: array DEFAULT: None

Source code in openml/tasks/task.py
def __init__(  # noqa: PLR0913
    self,
    task_type_id: TaskType,
    task_type: str,
    data_set_id: int,
    target_name: str,
    estimation_procedure_id: int = 1,
    estimation_procedure_type: str | None = None,
    estimation_parameters: dict[str, str] | None = None,
    evaluation_measure: str | None = None,
    data_splits_url: str | None = None,
    task_id: int | None = None,
    class_labels: list[str] | None = None,
    cost_matrix: np.ndarray | None = None,
):
    """Set up a classification task.

    All arguments except ``class_labels`` and ``cost_matrix`` are passed on
    unchanged to the parent initializer; those two are stored on the
    instance.

    Raises
    ------
    NotImplementedError
        If ``cost_matrix`` is anything other than ``None``.
    """
    # Keyword hand-off to the parent, listed in signature order.
    super().__init__(
        task_type_id=task_type_id,
        task_type=task_type,
        data_set_id=data_set_id,
        target_name=target_name,
        estimation_procedure_id=estimation_procedure_id,
        estimation_procedure_type=estimation_procedure_type,
        estimation_parameters=estimation_parameters,
        evaluation_measure=evaluation_measure,
        data_splits_url=data_splits_url,
        task_id=task_id,
    )
    self.class_labels, self.cost_matrix = class_labels, cost_matrix

    # A cost matrix is accepted by the signature but rejected at runtime.
    if cost_matrix is not None:
        raise NotImplementedError("Costmatrix functionality is not yet implemented.")

estimation_parameters property writable #

estimation_parameters: dict[str, str] | None

Return the estimation parameters for the task.

id property #

id: int | None

Return the OpenML ID of this task.

openml_url property #

openml_url: str | None

The URL of the object on the server, if it was uploaded, else None.

download_split #

download_split() -> OpenMLSplit

Download the OpenML split for a given task.

Source code in openml/tasks/task.py
def download_split(self) -> OpenMLSplit:
    """Return the split of this task, using the cached file when readable.

    The split is parsed from ``datasplits.arff`` inside the task's cache
    directory. If that read raises ``OSError``, the file is fetched with
    ``_download_split`` and parsed again.
    """
    # TODO(eddiebergman): Can this ever be `None`?
    assert self.task_id is not None
    split_path = _create_cache_directory_for_id("tasks", self.task_id) / "datasplits.arff"

    try:
        return OpenMLSplit._from_arff_file(split_path)
    except OSError:
        # Reading the cached file failed: download it, then parse the fresh copy.
        self._download_split(split_path)
        return OpenMLSplit._from_arff_file(split_path)

get_X_and_y #

get_X_and_y() -> tuple[DataFrame, Series | DataFrame | None]

Get data associated with the current task.

RETURNS DESCRIPTION
tuple - X and y
Source code in openml/tasks/task.py
def get_X_and_y(self) -> tuple[pd.DataFrame, pd.Series | pd.DataFrame | None]:
    """Get the feature data and target associated with the current task.

    Returns
    -------
    tuple
        ``(X, y)``, the first two items returned by
        ``dataset.get_data(target=self.target_name)``.

    Raises
    ------
    NotImplementedError
        If ``task_type_id`` is not supervised classification, supervised
        regression or learning curve.
    """
    supported_types = (
        TaskType.SUPERVISED_CLASSIFICATION,
        TaskType.SUPERVISED_REGRESSION,
        TaskType.LEARNING_CURVE,
    )
    # Validate before fetching the dataset so an unsupported task fails
    # without triggering a dataset download.
    if self.task_type_id not in supported_types:
        # Each fragment ends with a separator so the joined message reads cleanly.
        raise NotImplementedError(
            f"Task type '{self.task_type}' is not implemented for get_X_and_y(). "
            "Supported types: SUPERVISED_CLASSIFICATION, SUPERVISED_REGRESSION, "
            "LEARNING_CURVE. "
            f"Task ID: {getattr(self, 'task_id', 'unknown')}."
        )

    X, y, _, _ = self.get_dataset().get_data(target=self.target_name)
    return X, y

get_dataset #

get_dataset(**kwargs: Any) -> OpenMLDataset

Download dataset associated with task.

Accepts the same keyword arguments as the openml.datasets.get_dataset.

Source code in openml/tasks/task.py
def get_dataset(self, **kwargs: Any) -> datasets.OpenMLDataset:
    """Download dataset associated with task.

    Accepts the same keyword arguments as the `openml.datasets.get_dataset`.

    Parameters
    ----------
    **kwargs : Any
        Forwarded unchanged to ``datasets.get_dataset``.

    Returns
    -------
    datasets.OpenMLDataset
        The result of ``datasets.get_dataset`` for ``self.dataset_id``.
    """
    # The dataset is looked up through the task's ``dataset_id`` attribute.
    return datasets.get_dataset(self.dataset_id, **kwargs)

get_split_dimensions #

get_split_dimensions() -> tuple[int, int, int]

Get the (repeats, folds, samples) of the split for a given task.

Source code in openml/tasks/task.py
def get_split_dimensions(self) -> tuple[int, int, int]:
    """Return the ``(repeats, folds, samples)`` counts of this task's split.

    When ``self.split`` is still ``None`` it is first filled in with the
    result of ``download_split()``.
    """
    if self.split is None:
        self.split = self.download_split()

    current = self.split
    return current.repeats, current.folds, current.samples

get_train_test_split_indices #

get_train_test_split_indices(fold: int = 0, repeat: int = 0, sample: int = 0) -> tuple[ndarray, ndarray]

Get the indices of the train and test splits for a given task.

Source code in openml/tasks/task.py
def get_train_test_split_indices(
    self,
    fold: int = 0,
    repeat: int = 0,
    sample: int = 0,
) -> tuple[np.ndarray, np.ndarray]:
    """Look up the train and test indices for one repeat/fold/sample.

    When ``self.split`` is still ``None`` it is first filled in with the
    result of ``download_split()``; the lookup itself is done by
    ``self.split.get``.
    """
    # Replace with retrieve from cache
    if self.split is None:
        self.split = self.download_split()

    selector = {"repeat": repeat, "fold": fold, "sample": sample}
    return self.split.get(**selector)

open_in_browser #

open_in_browser() -> None

Opens the OpenML web page corresponding to this object in your default browser.

Source code in openml/base.py
def open_in_browser(self) -> None:
    """Open this object's OpenML web page via ``webbrowser.open``.

    Raises
    ------
    ValueError
        If ``openml_url`` is ``None``.
    """
    url = self.openml_url
    if url is not None:
        webbrowser.open(url)
        return

    raise ValueError(
        "Cannot open element on OpenML.org when attribute `openml_url` is `None`",
    )

publish #

publish() -> OpenMLBase

Publish the object on the OpenML server.

Source code in openml/base.py
def publish(self) -> OpenMLBase:
    """Upload this object to the OpenML server and return ``self``.

    The server's XML reply is parsed and handed to
    ``_parse_publish_response``.
    """
    payload = self._get_file_elements()

    # Only build the XML description when none was supplied with the files.
    if "description" not in payload:
        payload["description"] = self._to_xml()

    endpoint = f"{_get_rest_api_type_alias(self)}/"
    reply = openml._api_calls._perform_api_call(
        endpoint,
        "post",
        file_elements=payload,
    )

    self._parse_publish_response(xmltodict.parse(reply))
    return self

push_tag #

push_tag(tag: str) -> None

Annotates this entity with a tag on the server.

PARAMETER DESCRIPTION
tag

Tag to attach to the flow.

TYPE: str

Source code in openml/base.py
def push_tag(self, tag: str) -> None:
    """Annotates this entity with a tag on the server.

    Parameters
    ----------
    tag : str
        Tag to attach to this entity.
    """
    # Delegates to the shared tagging helper.
    _tag_openml_base(self, tag)

remove_tag #

remove_tag(tag: str) -> None

Removes a tag from this entity on the server.

PARAMETER DESCRIPTION
tag

Tag to remove from this entity.

TYPE: str

Source code in openml/base.py
def remove_tag(self, tag: str) -> None:
    """Removes a tag from this entity on the server.

    Parameters
    ----------
    tag : str
        Tag to remove from this entity.
    """
    # Same helper as ``push_tag``, called with ``untag=True``.
    _tag_openml_base(self, tag, untag=True)

url_for_id classmethod #

url_for_id(id_: int) -> str

Return the OpenML URL for the object of the class entity with the given id.

Source code in openml/base.py
@classmethod
def url_for_id(cls, id_: int) -> str:
    """Return the OpenML URL for the object of the class entity with the given id.

    Parameters
    ----------
    id_ : int
        ID used as the last path segment of the URL.

    Returns
    -------
    str
        ``<server base url>/<entity letter>/<id_>``.
    """
    # Sample url for a flow: openml.org/f/123
    return f"{openml.config.get_server_base_url()}/{cls._entity_letter()}/{id_}"

OpenMLClusteringTask #

OpenMLClusteringTask(task_type_id: TaskType, task_type: str, data_set_id: int, estimation_procedure_id: int = 17, task_id: int | None = None, estimation_procedure_type: str | None = None, estimation_parameters: dict[str, str] | None = None, data_splits_url: str | None = None, evaluation_measure: str | None = None, target_name: str | None = None)

Bases: OpenMLTask

OpenML Clustering object.

PARAMETER DESCRIPTION
task_type_id

Task type ID of the OpenML clustering task.

TYPE: TaskType

task_type

Task type of the OpenML clustering task.

TYPE: str

data_set_id

ID of the OpenML dataset used in clustering the task.

TYPE: int

estimation_procedure_id

ID of the OpenML estimation procedure.

TYPE: int DEFAULT: 17

task_id

ID of the OpenML clustering task.

TYPE: Union[int, None] DEFAULT: None

estimation_procedure_type

Type of the OpenML estimation procedure used in the clustering task.

TYPE: str DEFAULT: None

estimation_parameters

Parameters used by the OpenML estimation procedure.

TYPE: dict DEFAULT: None

data_splits_url

URL of the OpenML data splits for the clustering task.

TYPE: str DEFAULT: None

evaluation_measure

Evaluation measure used in the clustering task.

TYPE: str DEFAULT: None

target_name

Name of the target feature (class) that is not part of the feature set for the clustering task.

TYPE: str DEFAULT: None

Source code in openml/tasks/task.py
def __init__(  # noqa: PLR0913
    self,
    task_type_id: TaskType,
    task_type: str,
    data_set_id: int,
    estimation_procedure_id: int = 17,
    task_id: int | None = None,
    estimation_procedure_type: str | None = None,
    estimation_parameters: dict[str, str] | None = None,
    data_splits_url: str | None = None,
    evaluation_measure: str | None = None,
    target_name: str | None = None,
):
    """Set up a clustering task.

    All arguments except ``target_name`` are passed on unchanged to the
    parent initializer; ``target_name`` is stored on the instance.
    """
    # Keyword hand-off to the parent, listed in signature order.
    super().__init__(
        task_type_id=task_type_id,
        task_type=task_type,
        data_set_id=data_set_id,
        estimation_procedure_id=estimation_procedure_id,
        task_id=task_id,
        estimation_procedure_type=estimation_procedure_type,
        estimation_parameters=estimation_parameters,
        data_splits_url=data_splits_url,
        evaluation_measure=evaluation_measure,
    )

    # Not forwarded to the parent; kept directly on this object.
    self.target_name = target_name

id property #

id: int | None

Return the OpenML ID of this task.

openml_url property #

openml_url: str | None

The URL of the object on the server, if it was uploaded, else None.

download_split #

download_split() -> OpenMLSplit

Download the OpenML split for a given task.

Source code in openml/tasks/task.py
def download_split(self) -> OpenMLSplit:
    """Return the split of this task, using the cached file when readable.

    The split is parsed from ``datasplits.arff`` inside the task's cache
    directory. If that read raises ``OSError``, the file is fetched with
    ``_download_split`` and parsed again.
    """
    # TODO(eddiebergman): Can this ever be `None`?
    assert self.task_id is not None
    split_path = _create_cache_directory_for_id("tasks", self.task_id) / "datasplits.arff"

    try:
        return OpenMLSplit._from_arff_file(split_path)
    except OSError:
        # Reading the cached file failed: download it, then parse the fresh copy.
        self._download_split(split_path)
        return OpenMLSplit._from_arff_file(split_path)

get_X #

get_X() -> DataFrame

Get data associated with the current task.

RETURNS DESCRIPTION
The X data as a dataframe
Source code in openml/tasks/task.py
def get_X(self) -> pd.DataFrame:
    """Fetch the data of the dataset tied to this task.

    Returns
    -------
    The first item produced by ``dataset.get_data(target=None)``.
    """
    # No target is split off; everything after the first item is dropped.
    features, *_unused = self.get_dataset().get_data(target=None)
    return features

get_dataset #

get_dataset(**kwargs: Any) -> OpenMLDataset

Download dataset associated with task.

Accepts the same keyword arguments as the openml.datasets.get_dataset.

Source code in openml/tasks/task.py
def get_dataset(self, **kwargs: Any) -> datasets.OpenMLDataset:
    """Download dataset associated with task.

    Accepts the same keyword arguments as the `openml.datasets.get_dataset`.

    Parameters
    ----------
    **kwargs : Any
        Forwarded unchanged to ``datasets.get_dataset``.

    Returns
    -------
    datasets.OpenMLDataset
        The result of ``datasets.get_dataset`` for ``self.dataset_id``.
    """
    # The dataset is looked up through the task's ``dataset_id`` attribute.
    return datasets.get_dataset(self.dataset_id, **kwargs)

get_split_dimensions #

get_split_dimensions() -> tuple[int, int, int]

Get the (repeats, folds, samples) of the split for a given task.

Source code in openml/tasks/task.py
def get_split_dimensions(self) -> tuple[int, int, int]:
    """Return the ``(repeats, folds, samples)`` counts of this task's split.

    When ``self.split`` is still ``None`` it is first filled in with the
    result of ``download_split()``.
    """
    if self.split is None:
        self.split = self.download_split()

    current = self.split
    return current.repeats, current.folds, current.samples

get_train_test_split_indices #

get_train_test_split_indices(fold: int = 0, repeat: int = 0, sample: int = 0) -> tuple[ndarray, ndarray]

Get the indices of the train and test splits for a given task.

Source code in openml/tasks/task.py
def get_train_test_split_indices(
    self,
    fold: int = 0,
    repeat: int = 0,
    sample: int = 0,
) -> tuple[np.ndarray, np.ndarray]:
    """Look up the train and test indices for one repeat/fold/sample.

    When ``self.split`` is still ``None`` it is first filled in with the
    result of ``download_split()``; the lookup itself is done by
    ``self.split.get``.
    """
    # Replace with retrieve from cache
    if self.split is None:
        self.split = self.download_split()

    selector = {"repeat": repeat, "fold": fold, "sample": sample}
    return self.split.get(**selector)

open_in_browser #

open_in_browser() -> None

Opens the OpenML web page corresponding to this object in your default browser.

Source code in openml/base.py
def open_in_browser(self) -> None:
    """Open this object's OpenML web page via ``webbrowser.open``.

    Raises
    ------
    ValueError
        If ``openml_url`` is ``None``.
    """
    url = self.openml_url
    if url is not None:
        webbrowser.open(url)
        return

    raise ValueError(
        "Cannot open element on OpenML.org when attribute `openml_url` is `None`",
    )

publish #

publish() -> OpenMLBase

Publish the object on the OpenML server.

Source code in openml/base.py
def publish(self) -> OpenMLBase:
    """Upload this object to the OpenML server and return ``self``.

    The server's XML reply is parsed and handed to
    ``_parse_publish_response``.
    """
    payload = self._get_file_elements()

    # Only build the XML description when none was supplied with the files.
    if "description" not in payload:
        payload["description"] = self._to_xml()

    endpoint = f"{_get_rest_api_type_alias(self)}/"
    reply = openml._api_calls._perform_api_call(
        endpoint,
        "post",
        file_elements=payload,
    )

    self._parse_publish_response(xmltodict.parse(reply))
    return self

push_tag #

push_tag(tag: str) -> None

Annotates this entity with a tag on the server.

PARAMETER DESCRIPTION
tag

Tag to attach to the flow.

TYPE: str

Source code in openml/base.py
def push_tag(self, tag: str) -> None:
    """Annotates this entity with a tag on the server.

    Parameters
    ----------
    tag : str
        Tag to attach to this entity.
    """
    # Delegates to the shared tagging helper.
    _tag_openml_base(self, tag)

remove_tag #

remove_tag(tag: str) -> None

Removes a tag from this entity on the server.

PARAMETER DESCRIPTION
tag

Tag to remove from this entity.

TYPE: str

Source code in openml/base.py
def remove_tag(self, tag: str) -> None:
    """Removes a tag from this entity on the server.

    Parameters
    ----------
    tag : str
        Tag to remove from this entity.
    """
    # Same helper as ``push_tag``, called with ``untag=True``.
    _tag_openml_base(self, tag, untag=True)

url_for_id classmethod #

url_for_id(id_: int) -> str

Return the OpenML URL for the object of the class entity with the given id.

Source code in openml/base.py
@classmethod
def url_for_id(cls, id_: int) -> str:
    """Return the OpenML URL for the object of the class entity with the given id.

    Parameters
    ----------
    id_ : int
        ID used as the last path segment of the URL.

    Returns
    -------
    str
        ``<server base url>/<entity letter>/<id_>``.
    """
    # Sample url for a flow: openml.org/f/123
    return f"{openml.config.get_server_base_url()}/{cls._entity_letter()}/{id_}"

OpenMLLearningCurveTask #

OpenMLLearningCurveTask(task_type_id: TaskType, task_type: str, data_set_id: int, target_name: str, estimation_procedure_id: int = 13, estimation_procedure_type: str | None = None, estimation_parameters: dict[str, str] | None = None, data_splits_url: str | None = None, task_id: int | None = None, evaluation_measure: str | None = None, class_labels: list[str] | None = None, cost_matrix: ndarray | None = None)

Bases: OpenMLClassificationTask

OpenML Learning Curve object.

PARAMETER DESCRIPTION
task_type_id

ID of the Learning Curve task.

TYPE: TaskType

task_type

Name of the Learning Curve task.

TYPE: str

data_set_id

ID of the dataset that this task is associated with.

TYPE: int

target_name

Name of the target feature in the dataset.

TYPE: str

estimation_procedure_id

ID of the estimation procedure to use for evaluating models.

TYPE: int DEFAULT: 13

estimation_procedure_type

Type of the estimation procedure.

TYPE: str DEFAULT: None

estimation_parameters

Additional parameters for the estimation procedure.

TYPE: dict DEFAULT: None

data_splits_url

URL of the file containing the data splits for Learning Curve task.

TYPE: str DEFAULT: None

task_id

ID of the Learning Curve task.

TYPE: Union[int, None] DEFAULT: None

evaluation_measure

Name of the evaluation measure to use for evaluating models.

TYPE: str DEFAULT: None

class_labels

Class labels for Learning Curve tasks.

TYPE: list of str DEFAULT: None

cost_matrix

Cost matrix for Learning Curve tasks.

TYPE: numpy array DEFAULT: None

Source code in openml/tasks/task.py
def __init__(  # noqa: PLR0913
    self,
    task_type_id: TaskType,
    task_type: str,
    data_set_id: int,
    target_name: str,
    estimation_procedure_id: int = 13,
    estimation_procedure_type: str | None = None,
    estimation_parameters: dict[str, str] | None = None,
    data_splits_url: str | None = None,
    task_id: int | None = None,
    evaluation_measure: str | None = None,
    class_labels: list[str] | None = None,
    cost_matrix: np.ndarray | None = None,
):
    """Set up a learning curve task.

    Every argument is passed on unchanged, by keyword, to the parent
    initializer; nothing else is done here.
    """
    # Keyword hand-off to the parent, listed in signature order.
    super().__init__(
        task_type_id=task_type_id,
        task_type=task_type,
        data_set_id=data_set_id,
        target_name=target_name,
        estimation_procedure_id=estimation_procedure_id,
        estimation_procedure_type=estimation_procedure_type,
        estimation_parameters=estimation_parameters,
        data_splits_url=data_splits_url,
        task_id=task_id,
        evaluation_measure=evaluation_measure,
        class_labels=class_labels,
        cost_matrix=cost_matrix,
    )

estimation_parameters property writable #

estimation_parameters: dict[str, str] | None

Return the estimation parameters for the task.

id property #

id: int | None

Return the OpenML ID of this task.

openml_url property #

openml_url: str | None

The URL of the object on the server, if it was uploaded, else None.

download_split #

download_split() -> OpenMLSplit

Download the OpenML split for a given task.

Source code in openml/tasks/task.py
def download_split(self) -> OpenMLSplit:
    """Return the split of this task, using the cached file when readable.

    The split is parsed from ``datasplits.arff`` inside the task's cache
    directory. If that read raises ``OSError``, the file is fetched with
    ``_download_split`` and parsed again.
    """
    # TODO(eddiebergman): Can this ever be `None`?
    assert self.task_id is not None
    split_path = _create_cache_directory_for_id("tasks", self.task_id) / "datasplits.arff"

    try:
        return OpenMLSplit._from_arff_file(split_path)
    except OSError:
        # Reading the cached file failed: download it, then parse the fresh copy.
        self._download_split(split_path)
        return OpenMLSplit._from_arff_file(split_path)

get_X_and_y #

get_X_and_y() -> tuple[DataFrame, Series | DataFrame | None]

Get data associated with the current task.

RETURNS DESCRIPTION
tuple - X and y
Source code in openml/tasks/task.py
def get_X_and_y(self) -> tuple[pd.DataFrame, pd.Series | pd.DataFrame | None]:
    """Get the feature data and target associated with the current task.

    Returns
    -------
    tuple
        ``(X, y)``, the first two items returned by
        ``dataset.get_data(target=self.target_name)``.

    Raises
    ------
    NotImplementedError
        If ``task_type_id`` is not supervised classification, supervised
        regression or learning curve.
    """
    supported_types = (
        TaskType.SUPERVISED_CLASSIFICATION,
        TaskType.SUPERVISED_REGRESSION,
        TaskType.LEARNING_CURVE,
    )
    # Validate before fetching the dataset so an unsupported task fails
    # without triggering a dataset download.
    if self.task_type_id not in supported_types:
        # Each fragment ends with a separator so the joined message reads cleanly.
        raise NotImplementedError(
            f"Task type '{self.task_type}' is not implemented for get_X_and_y(). "
            "Supported types: SUPERVISED_CLASSIFICATION, SUPERVISED_REGRESSION, "
            "LEARNING_CURVE. "
            f"Task ID: {getattr(self, 'task_id', 'unknown')}."
        )

    X, y, _, _ = self.get_dataset().get_data(target=self.target_name)
    return X, y

get_dataset #

get_dataset(**kwargs: Any) -> OpenMLDataset

Download dataset associated with task.

Accepts the same keyword arguments as the openml.datasets.get_dataset.

Source code in openml/tasks/task.py
def get_dataset(self, **kwargs: Any) -> datasets.OpenMLDataset:
    """Download dataset associated with task.

    Accepts the same keyword arguments as the `openml.datasets.get_dataset`.

    Parameters
    ----------
    **kwargs : Any
        Forwarded unchanged to ``datasets.get_dataset``.

    Returns
    -------
    datasets.OpenMLDataset
        The result of ``datasets.get_dataset`` for ``self.dataset_id``.
    """
    # The dataset is looked up through the task's ``dataset_id`` attribute.
    return datasets.get_dataset(self.dataset_id, **kwargs)

get_split_dimensions #

get_split_dimensions() -> tuple[int, int, int]

Get the (repeats, folds, samples) of the split for a given task.

Source code in openml/tasks/task.py
def get_split_dimensions(self) -> tuple[int, int, int]:
    """Return the ``(repeats, folds, samples)`` counts of this task's split.

    When ``self.split`` is still ``None`` it is first filled in with the
    result of ``download_split()``.
    """
    if self.split is None:
        self.split = self.download_split()

    current = self.split
    return current.repeats, current.folds, current.samples

get_train_test_split_indices #

get_train_test_split_indices(fold: int = 0, repeat: int = 0, sample: int = 0) -> tuple[ndarray, ndarray]

Get the indices of the train and test splits for a given task.

Source code in openml/tasks/task.py
def get_train_test_split_indices(
    self,
    fold: int = 0,
    repeat: int = 0,
    sample: int = 0,
) -> tuple[np.ndarray, np.ndarray]:
    """Look up the train and test indices for one repeat/fold/sample.

    When ``self.split`` is still ``None`` it is first filled in with the
    result of ``download_split()``; the lookup itself is done by
    ``self.split.get``.
    """
    # Replace with retrieve from cache
    if self.split is None:
        self.split = self.download_split()

    selector = {"repeat": repeat, "fold": fold, "sample": sample}
    return self.split.get(**selector)

open_in_browser #

open_in_browser() -> None

Opens the OpenML web page corresponding to this object in your default browser.

Source code in openml/base.py
def open_in_browser(self) -> None:
    """Open this object's OpenML web page via ``webbrowser.open``.

    Raises
    ------
    ValueError
        If ``openml_url`` is ``None``.
    """
    url = self.openml_url
    if url is not None:
        webbrowser.open(url)
        return

    raise ValueError(
        "Cannot open element on OpenML.org when attribute `openml_url` is `None`",
    )

publish #

publish() -> OpenMLBase

Publish the object on the OpenML server.

Source code in openml/base.py
def publish(self) -> OpenMLBase:
    """Upload this object to the OpenML server and return ``self``.

    The server's XML reply is parsed and handed to
    ``_parse_publish_response``.
    """
    payload = self._get_file_elements()

    # Only build the XML description when none was supplied with the files.
    if "description" not in payload:
        payload["description"] = self._to_xml()

    endpoint = f"{_get_rest_api_type_alias(self)}/"
    reply = openml._api_calls._perform_api_call(
        endpoint,
        "post",
        file_elements=payload,
    )

    self._parse_publish_response(xmltodict.parse(reply))
    return self

push_tag #

push_tag(tag: str) -> None

Annotates this entity with a tag on the server.

PARAMETER DESCRIPTION
tag

Tag to attach to the flow.

TYPE: str

Source code in openml/base.py
def push_tag(self, tag: str) -> None:
    """Annotates this entity with a tag on the server.

    Parameters
    ----------
    tag : str
        Tag to attach to this entity.
    """
    # Delegates to the shared tagging helper.
    _tag_openml_base(self, tag)

remove_tag #

remove_tag(tag: str) -> None

Removes a tag from this entity on the server.

PARAMETER DESCRIPTION
tag

Tag to remove from this entity.

TYPE: str

Source code in openml/base.py
def remove_tag(self, tag: str) -> None:
    """Removes a tag from this entity on the server.

    Parameters
    ----------
    tag : str
        Tag to remove from this entity.
    """
    # Same helper as ``push_tag``, called with ``untag=True``.
    _tag_openml_base(self, tag, untag=True)

url_for_id classmethod #

url_for_id(id_: int) -> str

Return the OpenML URL for the object of the class entity with the given id.

Source code in openml/base.py
@classmethod
def url_for_id(cls, id_: int) -> str:
    """Return the OpenML URL for the object of the class entity with the given id.

    Parameters
    ----------
    id_ : int
        ID used as the last path segment of the URL.

    Returns
    -------
    str
        ``<server base url>/<entity letter>/<id_>``.
    """
    # Sample url for a flow: openml.org/f/123
    return f"{openml.config.get_server_base_url()}/{cls._entity_letter()}/{id_}"

OpenMLRegressionTask #

OpenMLRegressionTask(task_type_id: TaskType, task_type: str, data_set_id: int, target_name: str, estimation_procedure_id: int = 7, estimation_procedure_type: str | None = None, estimation_parameters: dict[str, str] | None = None, data_splits_url: str | None = None, task_id: int | None = None, evaluation_measure: str | None = None)

Bases: OpenMLSupervisedTask

OpenML Regression object.

PARAMETER DESCRIPTION
task_type_id

Task type ID of the OpenML Regression task.

TYPE: TaskType

task_type

Task type of the OpenML Regression task.

TYPE: str

data_set_id

ID of the OpenML dataset.

TYPE: int

target_name

Name of the target feature used in the Regression task.

TYPE: str

estimation_procedure_id

ID of the OpenML estimation procedure.

TYPE: int DEFAULT: 7

estimation_procedure_type

Type of the OpenML estimation procedure.

TYPE: str DEFAULT: None

estimation_parameters

Parameters used by the OpenML estimation procedure.

TYPE: dict DEFAULT: None

data_splits_url

URL of the OpenML data splits for the Regression task.

TYPE: str DEFAULT: None

task_id

ID of the OpenML Regression task.

TYPE: Union[int, None] DEFAULT: None

evaluation_measure

Evaluation measure used in the Regression task.

TYPE: str DEFAULT: None

Source code in openml/tasks/task.py
def __init__(  # noqa: PLR0913
    self,
    task_type_id: TaskType,
    task_type: str,
    data_set_id: int,
    target_name: str,
    estimation_procedure_id: int = 7,
    estimation_procedure_type: str | None = None,
    estimation_parameters: dict[str, str] | None = None,
    data_splits_url: str | None = None,
    task_id: int | None = None,
    evaluation_measure: str | None = None,
):
    """Set up a regression task.

    Every argument is passed on unchanged, by keyword, to the parent
    initializer; nothing else is done here.
    """
    # Keyword hand-off to the parent, listed in signature order.
    super().__init__(
        task_type_id=task_type_id,
        task_type=task_type,
        data_set_id=data_set_id,
        target_name=target_name,
        estimation_procedure_id=estimation_procedure_id,
        estimation_procedure_type=estimation_procedure_type,
        estimation_parameters=estimation_parameters,
        data_splits_url=data_splits_url,
        task_id=task_id,
        evaluation_measure=evaluation_measure,
    )

estimation_parameters property writable #

estimation_parameters: dict[str, str] | None

Return the estimation parameters for the task.

id property #

id: int | None

Return the OpenML ID of this task.

openml_url property #

openml_url: str | None

The URL of the object on the server, if it was uploaded, else None.

download_split #

download_split() -> OpenMLSplit

Download the OpenML split for a given task.

Source code in openml/tasks/task.py
def download_split(self) -> OpenMLSplit:
    """Return the split of this task, using the cached file when readable.

    The split is parsed from ``datasplits.arff`` inside the task's cache
    directory. If that read raises ``OSError``, the file is fetched with
    ``_download_split`` and parsed again.
    """
    # TODO(eddiebergman): Can this ever be `None`?
    assert self.task_id is not None
    split_path = _create_cache_directory_for_id("tasks", self.task_id) / "datasplits.arff"

    try:
        return OpenMLSplit._from_arff_file(split_path)
    except OSError:
        # Reading the cached file failed: download it, then parse the fresh copy.
        self._download_split(split_path)
        return OpenMLSplit._from_arff_file(split_path)

get_X_and_y #

get_X_and_y() -> tuple[DataFrame, Series | DataFrame | None]

Get data associated with the current task.

RETURNS DESCRIPTION
tuple - X and y
Source code in openml/tasks/task.py
def get_X_and_y(self) -> tuple[pd.DataFrame, pd.Series | pd.DataFrame | None]:
    """Get the feature data and target associated with the current task.

    Returns
    -------
    tuple
        ``(X, y)``, the first two items returned by
        ``dataset.get_data(target=self.target_name)``.

    Raises
    ------
    NotImplementedError
        If ``task_type_id`` is not supervised classification, supervised
        regression or learning curve.
    """
    supported_types = (
        TaskType.SUPERVISED_CLASSIFICATION,
        TaskType.SUPERVISED_REGRESSION,
        TaskType.LEARNING_CURVE,
    )
    # Validate before fetching the dataset so an unsupported task fails
    # without triggering a dataset download.
    if self.task_type_id not in supported_types:
        # Each fragment ends with a separator so the joined message reads cleanly.
        raise NotImplementedError(
            f"Task type '{self.task_type}' is not implemented for get_X_and_y(). "
            "Supported types: SUPERVISED_CLASSIFICATION, SUPERVISED_REGRESSION, "
            "LEARNING_CURVE. "
            f"Task ID: {getattr(self, 'task_id', 'unknown')}."
        )

    X, y, _, _ = self.get_dataset().get_data(target=self.target_name)
    return X, y

get_dataset #

get_dataset(**kwargs: Any) -> OpenMLDataset

Download dataset associated with task.

Accepts the same keyword arguments as the openml.datasets.get_dataset.

Source code in openml/tasks/task.py
def get_dataset(self, **kwargs: Any) -> datasets.OpenMLDataset:
    """Download dataset associated with task.

    Accepts the same keyword arguments as the `openml.datasets.get_dataset`.

    Parameters
    ----------
    **kwargs : Any
        Forwarded unchanged to ``datasets.get_dataset``.

    Returns
    -------
    datasets.OpenMLDataset
        The result of ``datasets.get_dataset`` for ``self.dataset_id``.
    """
    # The dataset is looked up through the task's ``dataset_id`` attribute.
    return datasets.get_dataset(self.dataset_id, **kwargs)

get_split_dimensions #

get_split_dimensions() -> tuple[int, int, int]

Get the (repeats, folds, samples) of the split for a given task.

Source code in openml/tasks/task.py
def get_split_dimensions(self) -> tuple[int, int, int]:
    """Return the ``(repeats, folds, samples)`` counts of this task's split.

    When ``self.split`` is still ``None`` it is first filled in with the
    result of ``download_split()``.
    """
    if self.split is None:
        self.split = self.download_split()

    current = self.split
    return current.repeats, current.folds, current.samples

get_train_test_split_indices #

get_train_test_split_indices(fold: int = 0, repeat: int = 0, sample: int = 0) -> tuple[ndarray, ndarray]

Get the indices of the train and test splits for a given task.

Source code in openml/tasks/task.py
def get_train_test_split_indices(
    self,
    fold: int = 0,
    repeat: int = 0,
    sample: int = 0,
) -> tuple[np.ndarray, np.ndarray]:
    """Look up the train and test indices for one repeat/fold/sample.

    When ``self.split`` is still ``None`` it is first filled in with the
    result of ``download_split()``; the lookup itself is done by
    ``self.split.get``.
    """
    # Replace with retrieve from cache
    if self.split is None:
        self.split = self.download_split()

    selector = {"repeat": repeat, "fold": fold, "sample": sample}
    return self.split.get(**selector)

open_in_browser #

open_in_browser() -> None

Opens the OpenML web page corresponding to this object in your default browser.

Source code in openml/base.py
def open_in_browser(self) -> None:
    """Open this object's OpenML web page via ``webbrowser.open``.

    Raises
    ------
    ValueError
        If ``openml_url`` is ``None``.
    """
    url = self.openml_url
    if url is not None:
        webbrowser.open(url)
        return

    raise ValueError(
        "Cannot open element on OpenML.org when attribute `openml_url` is `None`",
    )

publish #

publish() -> OpenMLBase

Publish the object on the OpenML server.

Source code in openml/base.py
def publish(self) -> OpenMLBase:
    """Upload this object to the OpenML server and return ``self``.

    The server's XML reply is parsed and handed to
    ``_parse_publish_response``.
    """
    payload = self._get_file_elements()

    # Only build the XML description when none was supplied with the files.
    if "description" not in payload:
        payload["description"] = self._to_xml()

    endpoint = f"{_get_rest_api_type_alias(self)}/"
    reply = openml._api_calls._perform_api_call(
        endpoint,
        "post",
        file_elements=payload,
    )

    self._parse_publish_response(xmltodict.parse(reply))
    return self

push_tag #

push_tag(tag: str) -> None

Annotates this entity with a tag on the server.

PARAMETER DESCRIPTION
tag

Tag to attach to the flow.

TYPE: str

Source code in openml/base.py
def push_tag(self, tag: str) -> None:
    """Annotates this entity with a tag on the server.

    Parameters
    ----------
    tag : str
        Tag to attach to this entity.
    """
    # Delegates to the shared tagging helper.
    _tag_openml_base(self, tag)

remove_tag #

remove_tag(tag: str) -> None

Removes a tag from this entity on the server.

PARAMETER DESCRIPTION
tag

Tag to remove from this entity.

TYPE: str

Source code in openml/base.py
def remove_tag(self, tag: str) -> None:
    """Removes a tag from this entity on the server.

    Delegates to `_tag_openml_base` with ``untag=True``.

    Parameters
    ----------
    tag : str
        Tag to remove from this entity.
    """
    _tag_openml_base(self, tag, untag=True)

url_for_id classmethod #

url_for_id(id_: int) -> str

Return the OpenML URL for the object of the class entity with the given id.

Source code in openml/base.py
@classmethod
def url_for_id(cls, id_: int) -> str:
    """Return the OpenML URL for the object of the class entity with the given id.

    The URL is built as ``<server base url>/<entity letter>/<id>``.

    Parameters
    ----------
    id_ : int
        Id of the entity the URL should point to.

    Returns
    -------
    str
        The assembled URL.
    """
    # Sample url for a flow: openml.org/f/123
    return f"{openml.config.get_server_base_url()}/{cls._entity_letter()}/{id_}"

OpenMLSplit #

OpenMLSplit(name: int | str, description: str, split: dict[int, dict[int, dict[int, tuple[ndarray, ndarray]]]])

OpenML Split object.

This class manages train-test splits for a dataset across multiple repetitions, folds, and samples.

PARAMETER DESCRIPTION
name

The name or ID of the split.

TYPE: int or str

description

A description of the split.

TYPE: str

split

A dictionary containing the splits organized by repetition, fold, and sample.

TYPE: dict

Source code in openml/tasks/split.py
def __init__(
    self,
    name: int | str,
    description: str,
    split: dict[int, dict[int, dict[int, tuple[np.ndarray, np.ndarray]]]],
):
    self.description = description
    self.name = name
    self.split: dict[int, dict[int, dict[int, tuple[np.ndarray, np.ndarray]]]] = {}

    # Add splits according to repetition
    for repetition in split:
        _rep = int(repetition)
        self.split[_rep] = OrderedDict()
        for fold in split[_rep]:
            self.split[_rep][fold] = OrderedDict()
            for sample in split[_rep][fold]:
                self.split[_rep][fold][sample] = split[_rep][fold][sample]

    self.repeats = len(self.split)

    # TODO(eddiebergman): Better error message
    if any(len(self.split[0]) != len(self.split[i]) for i in range(self.repeats)):
        raise ValueError("")

    self.folds = len(self.split[0])
    self.samples = len(self.split[0][0])

get #

get(repeat: int = 0, fold: int = 0, sample: int = 0) -> tuple[ndarray, ndarray]

Returns the specified data split from the CrossValidationSplit object.

PARAMETER DESCRIPTION
repeat

Index of the repeat to retrieve.

TYPE: int DEFAULT: 0

fold

Index of the fold to retrieve.

TYPE: int DEFAULT: 0

sample

Index of the sample to retrieve.

TYPE: int DEFAULT: 0

RETURNS DESCRIPTION
ndarray

The data split for the specified repeat, fold, and sample.

RAISES DESCRIPTION
ValueError

If the specified repeat, fold, or sample is not known.

Source code in openml/tasks/split.py
def get(self, repeat: int = 0, fold: int = 0, sample: int = 0) -> tuple[np.ndarray, np.ndarray]:
    """Look up the split stored for one (repeat, fold, sample) combination.

    Parameters
    ----------
    repeat : int
        Index of the repeat to retrieve.
    fold : int
        Index of the fold to retrieve.
    sample : int
        Index of the sample to retrieve.

    Returns
    -------
    tuple of numpy.ndarray
        The entry stored under ``self.split[repeat][fold][sample]``.

    Raises
    ------
    ValueError
        If the repeat, fold, or sample is not present in the split.
    """
    # Walk down one nesting level at a time so the error names the level
    # that is missing.
    by_repeat = self.split
    if repeat not in by_repeat:
        raise ValueError(f"Repeat {repeat!s} not known")

    by_fold = by_repeat[repeat]
    if fold not in by_fold:
        raise ValueError(f"Fold {fold!s} not known")

    by_sample = by_fold[fold]
    if sample not in by_sample:
        raise ValueError(f"Sample {sample!s} not known")

    return by_sample[sample]

OpenMLSupervisedTask #

OpenMLSupervisedTask(task_type_id: TaskType, task_type: str, data_set_id: int, target_name: str, estimation_procedure_id: int = 1, estimation_procedure_type: str | None = None, estimation_parameters: dict[str, str] | None = None, evaluation_measure: str | None = None, data_splits_url: str | None = None, task_id: int | None = None)

Bases: OpenMLTask, ABC

OpenML Supervised Classification object.

PARAMETER DESCRIPTION
task_type_id

ID of the task type.

TYPE: TaskType

task_type

Name of the task type.

TYPE: str

data_set_id

ID of the OpenML dataset associated with the task.

TYPE: int

target_name

Name of the target feature (the class variable).

TYPE: str

estimation_procedure_id

ID of the estimation procedure for the task.

TYPE: int DEFAULT: 1

estimation_procedure_type

Type of the estimation procedure for the task.

TYPE: str DEFAULT: None

estimation_parameters

Estimation parameters for the task.

TYPE: dict DEFAULT: None

evaluation_measure

Name of the evaluation measure for the task.

TYPE: str DEFAULT: None

data_splits_url

URL of the data splits for the task.

TYPE: str DEFAULT: None

task_id

Refers to the unique identifier of task.

TYPE: int | None DEFAULT: None

Source code in openml/tasks/task.py
def __init__(  # noqa: PLR0913
    self,
    task_type_id: TaskType,
    task_type: str,
    data_set_id: int,
    target_name: str,
    estimation_procedure_id: int = 1,
    estimation_procedure_type: str | None = None,
    estimation_parameters: dict[str, str] | None = None,
    evaluation_measure: str | None = None,
    data_splits_url: str | None = None,
    task_id: int | None = None,
):
    """Initialise the base task and record the target feature name.

    Every argument except ``target_name`` is forwarded by keyword to the
    parent constructor; ``target_name`` is stored on this instance.
    """
    super().__init__(
        task_id=task_id,
        task_type_id=task_type_id,
        task_type=task_type,
        data_set_id=data_set_id,
        estimation_procedure_id=estimation_procedure_id,
        estimation_procedure_type=estimation_procedure_type,
        estimation_parameters=estimation_parameters,
        evaluation_measure=evaluation_measure,
        data_splits_url=data_splits_url,
    )

    # The only attribute added on top of what the parent constructor sets.
    self.target_name = target_name

estimation_parameters property writable #

estimation_parameters: dict[str, str] | None

Return the estimation parameters for the task.

id property #

id: int | None

Return the OpenML ID of this task.

openml_url property #

openml_url: str | None

The URL of the object on the server, if it was uploaded, else None.

download_split #

download_split() -> OpenMLSplit

Download the OpenML split for a given task.

Source code in openml/tasks/task.py
def download_split(self) -> OpenMLSplit:
    """Return the task's split, parsed from the cached ARFF file.

    The split is read from ``datasplits.arff`` inside the cache directory
    obtained for this task id. If reading raises `OSError`,
    `_download_split` is called with that path and the file is parsed again.
    """
    # TODO(eddiebergman): Can this ever be `None`?
    assert self.task_id is not None
    split_path = _create_cache_directory_for_id("tasks", self.task_id) / "datasplits.arff"

    try:
        return OpenMLSplit._from_arff_file(split_path)
    except OSError:
        # Reading the cached file failed: download it, then parse the new copy.
        self._download_split(split_path)
        return OpenMLSplit._from_arff_file(split_path)

get_X_and_y #

get_X_and_y() -> tuple[DataFrame, Series | DataFrame | None]

Get data associated with the current task.

RETURNS DESCRIPTION
tuple - X and y
Source code in openml/tasks/task.py
def get_X_and_y(self) -> tuple[pd.DataFrame, pd.Series | pd.DataFrame | None]:
    """Get data associated with the current task.

    Returns
    -------
    tuple - X and y
        The first two values returned by
        ``dataset.get_data(target=self.target_name)``.

    Raises
    ------
    NotImplementedError
        If the task type is not supervised classification, supervised
        regression, or learning curve.
    """
    supported_types = (
        TaskType.SUPERVISED_CLASSIFICATION,
        TaskType.SUPERVISED_REGRESSION,
        TaskType.LEARNING_CURVE,
    )
    # Validate before fetching the dataset so an unsupported task fails
    # without first retrieving data it cannot use.
    if self.task_type_id not in supported_types:
        raise NotImplementedError(
            f"Task type '{self.task_type}' is not implemented for get_X_and_y(). "
            "Supported types: SUPERVISED_CLASSIFICATION, SUPERVISED_REGRESSION, "
            "LEARNING_CURVE. "
            f"Task ID: {getattr(self, 'task_id', 'unknown')}."
        )

    dataset = self.get_dataset()
    X, y, _, _ = dataset.get_data(target=self.target_name)
    return X, y

get_dataset #

get_dataset(**kwargs: Any) -> OpenMLDataset

Download dataset associated with task.

Accepts the same keyword arguments as the openml.datasets.get_dataset.

Source code in openml/tasks/task.py
def get_dataset(self, **kwargs: Any) -> datasets.OpenMLDataset:
    """Download dataset associated with task.

    Accepts the same keyword arguments as the `openml.datasets.get_dataset`.

    Parameters
    ----------
    **kwargs : Any
        Forwarded unchanged to `datasets.get_dataset`, together with
        `self.dataset_id`.

    Returns
    -------
    datasets.OpenMLDataset
        The result of `datasets.get_dataset` for `self.dataset_id`.
    """
    return datasets.get_dataset(self.dataset_id, **kwargs)

get_split_dimensions #

get_split_dimensions() -> tuple[int, int, int]

Get the (repeats, folds, samples) of the split for a given task.

Source code in openml/tasks/task.py
def get_split_dimensions(self) -> tuple[int, int, int]:
    """Return the split's ``(repeats, folds, samples)`` counts.

    The split is obtained through `download_split()` and stored on
    ``self.split`` the first time it is needed.
    """
    split = self.split
    if split is None:
        split = self.split = self.download_split()

    return split.repeats, split.folds, split.samples

get_train_test_split_indices #

get_train_test_split_indices(fold: int = 0, repeat: int = 0, sample: int = 0) -> tuple[ndarray, ndarray]

Get the indices of the train and test splits for a given task.

Source code in openml/tasks/task.py
def get_train_test_split_indices(
    self,
    fold: int = 0,
    repeat: int = 0,
    sample: int = 0,
) -> tuple[np.ndarray, np.ndarray]:
    """Return the train and test indices for one repeat/fold/sample.

    The split is obtained through `download_split()` and stored on
    ``self.split`` the first time it is needed; the lookup itself is
    delegated to the split's ``get`` method.
    """
    # Replace with retrieve from cache
    split = self.split
    if split is None:
        split = self.split = self.download_split()

    return split.get(repeat=repeat, fold=fold, sample=sample)

open_in_browser #

open_in_browser() -> None

Opens the OpenML web page corresponding to this object in your default browser.

Source code in openml/base.py
def open_in_browser(self) -> None:
    """Open this object's OpenML web page in the default browser.

    Raises
    ------
    ValueError
        If `openml_url` is `None`, so there is no page to open.
    """
    if self.openml_url is not None:
        webbrowser.open(self.openml_url)
        return

    raise ValueError(
        "Cannot open element on OpenML.org when attribute `openml_url` is `None`",
    )

publish #

publish() -> OpenMLBase

Publish the object on the OpenML server.

Source code in openml/base.py
def publish(self) -> OpenMLBase:
    """Upload this object to the OpenML server and return it.

    The payload comes from `_get_file_elements()`; when it has no
    "description" entry, the XML produced by `_to_xml()` is used for it.
    The payload is POSTed to the REST endpoint named by
    `_get_rest_api_type_alias(self)`, and the parsed XML reply is handed to
    `_parse_publish_response`.

    Returns
    -------
    OpenMLBase
        This same object (`self`).
    """
    payload = self._get_file_elements()
    if "description" not in payload:
        payload["description"] = self._to_xml()

    endpoint = f"{_get_rest_api_type_alias(self)}/"
    raw_reply = openml._api_calls._perform_api_call(endpoint, "post", file_elements=payload)

    self._parse_publish_response(xmltodict.parse(raw_reply))
    return self

push_tag #

push_tag(tag: str) -> None

Annotates this entity with a tag on the server.

PARAMETER DESCRIPTION
tag

Tag to attach to the flow.

TYPE: str

Source code in openml/base.py
def push_tag(self, tag: str) -> None:
    """Annotates this entity with a tag on the server.

    Delegates to `_tag_openml_base` with this object and ``tag``.

    Parameters
    ----------
    tag : str
        Tag to attach to this entity.
    """
    _tag_openml_base(self, tag)

remove_tag #

remove_tag(tag: str) -> None

Removes a tag from this entity on the server.

PARAMETER DESCRIPTION
tag

Tag to remove from the entity.

TYPE: str

Source code in openml/base.py
def remove_tag(self, tag: str) -> None:
    """Removes a tag from this entity on the server.

    Delegates to `_tag_openml_base` with ``untag=True``.

    Parameters
    ----------
    tag : str
        Tag to remove from this entity.
    """
    _tag_openml_base(self, tag, untag=True)

url_for_id classmethod #

url_for_id(id_: int) -> str

Return the OpenML URL for the object of the class entity with the given id.

Source code in openml/base.py
@classmethod
def url_for_id(cls, id_: int) -> str:
    """Return the OpenML URL for the object of the class entity with the given id.

    The URL is built as ``<server base url>/<entity letter>/<id>``.

    Parameters
    ----------
    id_ : int
        Id of the entity the URL should point to.

    Returns
    -------
    str
        The assembled URL.
    """
    # Sample url for a flow: openml.org/f/123
    return f"{openml.config.get_server_base_url()}/{cls._entity_letter()}/{id_}"

OpenMLTask #

OpenMLTask(task_id: int | None, task_type_id: TaskType, task_type: str, data_set_id: int, estimation_procedure_id: int = 1, estimation_procedure_type: str | None = None, estimation_parameters: dict[str, str] | None = None, evaluation_measure: str | None = None, data_splits_url: str | None = None)

Bases: OpenMLBase

OpenML Task object.

PARAMETER DESCRIPTION
task_id

Refers to the unique identifier of OpenML task.

TYPE: int | None

task_type_id

Refers to the type of OpenML task.

TYPE: TaskType

task_type

Refers to the OpenML task.

TYPE: str

data_set_id

Refers to the data.

TYPE: int

estimation_procedure_id

Refers to the type of estimates used.

TYPE: int DEFAULT: 1

estimation_procedure_type

Refers to the type of estimation procedure used for the OpenML task.

TYPE: str | None DEFAULT: None

estimation_parameters

Estimation parameters used for the OpenML task.

TYPE: dict[str, str] | None DEFAULT: None

evaluation_measure

Refers to the evaluation measure.

TYPE: str | None DEFAULT: None

data_splits_url

Refers to the URL of the data splits used for the OpenML task.

TYPE: str | None DEFAULT: None

Source code in openml/tasks/task.py
def __init__(  # noqa: PLR0913
    self,
    task_id: int | None,
    task_type_id: TaskType,
    task_type: str,
    data_set_id: int,
    estimation_procedure_id: int = 1,
    estimation_procedure_type: str | None = None,
    estimation_parameters: dict[str, str] | None = None,
    evaluation_measure: str | None = None,
    data_splits_url: str | None = None,
):
    """Record the task's identifiers and its estimation-procedure settings.

    ``task_id`` (when given) and ``data_set_id`` are converted with `int`.
    The estimation procedure type, parameters and splits URL are grouped in
    the ``estimation_procedure`` dictionary. ``split`` starts as `None`.
    """
    if task_id is None:
        self.task_id = None
    else:
        self.task_id = int(task_id)
    self.task_type_id = task_type_id
    self.task_type = task_type
    self.dataset_id = int(data_set_id)
    self.evaluation_measure = evaluation_measure

    procedure: _EstimationProcedure = {
        "type": estimation_procedure_type,
        "parameters": estimation_parameters,
        "data_splits_url": data_splits_url,
    }
    self.estimation_procedure = procedure
    self.estimation_procedure_id = estimation_procedure_id

    # No split is attached at construction time.
    self.split: OpenMLSplit | None = None

id property #

id: int | None

Return the OpenML ID of this task.

openml_url property #

openml_url: str | None

The URL of the object on the server, if it was uploaded, else None.

download_split #

download_split() -> OpenMLSplit

Download the OpenML split for a given task.

Source code in openml/tasks/task.py
def download_split(self) -> OpenMLSplit:
    """Return the task's split, parsed from the cached ARFF file.

    The split is read from ``datasplits.arff`` inside the cache directory
    obtained for this task id. If reading raises `OSError`,
    `_download_split` is called with that path and the file is parsed again.
    """
    # TODO(eddiebergman): Can this ever be `None`?
    assert self.task_id is not None
    split_path = _create_cache_directory_for_id("tasks", self.task_id) / "datasplits.arff"

    try:
        return OpenMLSplit._from_arff_file(split_path)
    except OSError:
        # Reading the cached file failed: download it, then parse the new copy.
        self._download_split(split_path)
        return OpenMLSplit._from_arff_file(split_path)

get_dataset #

get_dataset(**kwargs: Any) -> OpenMLDataset

Download dataset associated with task.

Accepts the same keyword arguments as the openml.datasets.get_dataset.

Source code in openml/tasks/task.py
def get_dataset(self, **kwargs: Any) -> datasets.OpenMLDataset:
    """Download dataset associated with task.

    Accepts the same keyword arguments as the `openml.datasets.get_dataset`.

    Parameters
    ----------
    **kwargs : Any
        Forwarded unchanged to `datasets.get_dataset`, together with
        `self.dataset_id`.

    Returns
    -------
    datasets.OpenMLDataset
        The result of `datasets.get_dataset` for `self.dataset_id`.
    """
    return datasets.get_dataset(self.dataset_id, **kwargs)

get_split_dimensions #

get_split_dimensions() -> tuple[int, int, int]

Get the (repeats, folds, samples) of the split for a given task.

Source code in openml/tasks/task.py
def get_split_dimensions(self) -> tuple[int, int, int]:
    """Return the split's ``(repeats, folds, samples)`` counts.

    The split is obtained through `download_split()` and stored on
    ``self.split`` the first time it is needed.
    """
    split = self.split
    if split is None:
        split = self.split = self.download_split()

    return split.repeats, split.folds, split.samples

get_train_test_split_indices #

get_train_test_split_indices(fold: int = 0, repeat: int = 0, sample: int = 0) -> tuple[ndarray, ndarray]

Get the indices of the train and test splits for a given task.

Source code in openml/tasks/task.py
def get_train_test_split_indices(
    self,
    fold: int = 0,
    repeat: int = 0,
    sample: int = 0,
) -> tuple[np.ndarray, np.ndarray]:
    """Return the train and test indices for one repeat/fold/sample.

    The split is obtained through `download_split()` and stored on
    ``self.split`` the first time it is needed; the lookup itself is
    delegated to the split's ``get`` method.
    """
    # Replace with retrieve from cache
    split = self.split
    if split is None:
        split = self.split = self.download_split()

    return split.get(repeat=repeat, fold=fold, sample=sample)

open_in_browser #

open_in_browser() -> None

Opens the OpenML web page corresponding to this object in your default browser.

Source code in openml/base.py
def open_in_browser(self) -> None:
    """Open this object's OpenML web page in the default browser.

    Raises
    ------
    ValueError
        If `openml_url` is `None`, so there is no page to open.
    """
    if self.openml_url is not None:
        webbrowser.open(self.openml_url)
        return

    raise ValueError(
        "Cannot open element on OpenML.org when attribute `openml_url` is `None`",
    )

publish #

publish() -> OpenMLBase

Publish the object on the OpenML server.

Source code in openml/base.py
def publish(self) -> OpenMLBase:
    """Upload this object to the OpenML server and return it.

    The payload comes from `_get_file_elements()`; when it has no
    "description" entry, the XML produced by `_to_xml()` is used for it.
    The payload is POSTed to the REST endpoint named by
    `_get_rest_api_type_alias(self)`, and the parsed XML reply is handed to
    `_parse_publish_response`.

    Returns
    -------
    OpenMLBase
        This same object (`self`).
    """
    payload = self._get_file_elements()
    if "description" not in payload:
        payload["description"] = self._to_xml()

    endpoint = f"{_get_rest_api_type_alias(self)}/"
    raw_reply = openml._api_calls._perform_api_call(endpoint, "post", file_elements=payload)

    self._parse_publish_response(xmltodict.parse(raw_reply))
    return self

push_tag #

push_tag(tag: str) -> None

Annotates this entity with a tag on the server.

PARAMETER DESCRIPTION
tag

Tag to attach to the flow.

TYPE: str

Source code in openml/base.py
def push_tag(self, tag: str) -> None:
    """Annotates this entity with a tag on the server.

    Delegates to `_tag_openml_base` with this object and ``tag``.

    Parameters
    ----------
    tag : str
        Tag to attach to this entity.
    """
    _tag_openml_base(self, tag)

remove_tag #

remove_tag(tag: str) -> None

Removes a tag from this entity on the server.

PARAMETER DESCRIPTION
tag

Tag to remove from the entity.

TYPE: str

Source code in openml/base.py
def remove_tag(self, tag: str) -> None:
    """Removes a tag from this entity on the server.

    Delegates to `_tag_openml_base` with ``untag=True``.

    Parameters
    ----------
    tag : str
        Tag to remove from this entity.
    """
    _tag_openml_base(self, tag, untag=True)

url_for_id classmethod #

url_for_id(id_: int) -> str

Return the OpenML URL for the object of the class entity with the given id.

Source code in openml/base.py
@classmethod
def url_for_id(cls, id_: int) -> str:
    """Return the OpenML URL for the object of the class entity with the given id.

    The URL is built as ``<server base url>/<entity letter>/<id>``.

    Parameters
    ----------
    id_ : int
        Id of the entity the URL should point to.

    Returns
    -------
    str
        The assembled URL.
    """
    # Sample url for a flow: openml.org/f/123
    return f"{openml.config.get_server_base_url()}/{cls._entity_letter()}/{id_}"

TaskType #

Bases: Enum

Possible task types as defined in OpenML.

create_task #

create_task(task_type: TaskType, dataset_id: int, estimation_procedure_id: int, target_name: str | None = None, evaluation_measure: str | None = None, **kwargs: Any) -> OpenMLClassificationTask | OpenMLRegressionTask | OpenMLLearningCurveTask | OpenMLClusteringTask

Create a task based on different given attributes.

Builds a task object with the function arguments as attributes. The type of the task object built is determined from the task type id. More information on how the arguments (task attributes), relate to the different possible tasks can be found in the individual task objects at the openml.tasks.task module.

PARAMETER DESCRIPTION
task_type

Id of the task type.

TYPE: TaskType

dataset_id

The id of the dataset for the task.

TYPE: int

target_name

The name of the feature used as a target. At the moment, only optional for the clustering tasks.

TYPE: str DEFAULT: None

estimation_procedure_id

The id of the estimation procedure.

TYPE: int

evaluation_measure

The name of the evaluation measure.

TYPE: str DEFAULT: None

kwargs

Other task attributes that are not mandatory for task upload.

TYPE: dict DEFAULT: {}

RETURNS DESCRIPTION
(OpenMLClassificationTask, OpenMLRegressionTask)
(OpenMLLearningCurveTask, OpenMLClusteringTask)
Source code in openml/tasks/functions.py
def create_task(
    task_type: TaskType,
    dataset_id: int,
    estimation_procedure_id: int,
    target_name: str | None = None,
    evaluation_measure: str | None = None,
    **kwargs: Any,
) -> (
    OpenMLClassificationTask | OpenMLRegressionTask | OpenMLLearningCurveTask | OpenMLClusteringTask
):
    """Create a task based on different given attributes.

    Builds a task object with the function arguments as
    attributes. The type of the task object built is
    determined from the task type id.
    More information on how the arguments (task attributes),
    relate to the different possible tasks can be found in
    the individual task objects at the openml.tasks.task
    module.

    Parameters
    ----------
    task_type : TaskType
        Id of the task type.
    dataset_id : int
        The id of the dataset for the task.
    estimation_procedure_id : int
        The id of the estimation procedure.
    target_name : str, optional
        The name of the feature used as a target.
        At the moment, only optional for the clustering tasks.
    evaluation_measure : str, optional
        The name of the evaluation measure.
    kwargs : dict, optional
        Other task attributes that are not mandatory
        for task upload.

    Returns
    -------
    OpenMLClassificationTask, OpenMLRegressionTask,
    OpenMLLearningCurveTask, OpenMLClusteringTask
        A new task object of the class matching ``task_type``.

    Raises
    ------
    NotImplementedError
        If ``task_type`` is not one of the four supported task types.
    """
    if task_type == TaskType.CLUSTERING:
        task_cls = OpenMLClusteringTask
    elif task_type == TaskType.LEARNING_CURVE:
        task_cls = OpenMLLearningCurveTask  # type: ignore
    elif task_type == TaskType.SUPERVISED_CLASSIFICATION:
        task_cls = OpenMLClassificationTask  # type: ignore
    elif task_type == TaskType.SUPERVISED_REGRESSION:
        task_cls = OpenMLRegressionTask  # type: ignore
    else:
        # A plain Enum member cannot be formatted with ":d" (that raises
        # ValueError and would hide the intended error), so report its value,
        # or the raw argument when it is not an enum member.
        type_id = getattr(task_type, "value", task_type)
        raise NotImplementedError(
            f"Task type ID {type_id} is not supported. "
            f"Supported task type IDs: {TaskType.SUPERVISED_CLASSIFICATION.value}, "
            f"{TaskType.SUPERVISED_REGRESSION.value}, "
            f"{TaskType.CLUSTERING.value}, {TaskType.LEARNING_CURVE.value}. "
            f"Please refer to the TaskType enum for valid task type identifiers."
        )

    return task_cls(
        task_type_id=task_type,
        task_type="None",  # TODO: refactor to get task type string from ID.
        data_set_id=dataset_id,
        target_name=target_name,  # type: ignore
        estimation_procedure_id=estimation_procedure_id,
        evaluation_measure=evaluation_measure,
        **kwargs,
    )

delete_task #

delete_task(task_id: int) -> bool

Delete task with id task_id from the OpenML server.

You can only delete tasks which you created and have no runs associated with them.

PARAMETER DESCRIPTION
task_id

OpenML id of the task

TYPE: int

RETURNS DESCRIPTION
bool

True if the deletion was successful. False otherwise.

Source code in openml/tasks/functions.py
def delete_task(task_id: int) -> bool:
    """Delete task with id `task_id` from the OpenML server.

    You can only delete tasks which you created and have
    no runs associated with them.

    Delegates to `openml.utils._delete_entity` with the entity type "task".

    Parameters
    ----------
    task_id : int
        OpenML id of the task

    Returns
    -------
    bool
        True if the deletion was successful. False otherwise.
    """
    return openml.utils._delete_entity("task", task_id)

get_task #

get_task(task_id: int, download_splits: bool = False, **get_dataset_kwargs: Any) -> OpenMLTask

Download OpenML task for a given task ID.

Downloads the task representation.

Use the download_splits parameter to control whether the splits are downloaded. Moreover, you may pass additional keyword arguments, which are forwarded to :meth:openml.datasets.get_dataset.

PARAMETER DESCRIPTION
task_id

The OpenML task id of the task to download.

TYPE: int

download_splits

Whether to download the splits as well.

TYPE: bool DEFAULT: False

get_dataset_kwargs

Optional keyword arguments that are passed on to :meth:openml.datasets.get_dataset.

TYPE: Any DEFAULT: {}

RETURNS DESCRIPTION
task

TYPE: OpenMLTask

Source code in openml/tasks/functions.py
@openml.utils.thread_safe_if_oslo_installed
def get_task(
    task_id: int,
    download_splits: bool = False,  # noqa: FBT002
    **get_dataset_kwargs: Any,
) -> OpenMLTask:
    """Fetch the OpenML task with the given id.

    The task description is retrieved first, then its dataset. For
    classification and learning-curve tasks the class labels of the target
    are read from the dataset and stored on the task. Splits are only
    downloaded when `download_splits` is true and the task is a supervised
    task.

    Parameters
    ----------
    task_id : int
        The OpenML task id of the task to download.
    download_splits: bool (default=False)
        Whether to download the splits as well.
    get_dataset_kwargs :
        Keyword arguments forwarded to :meth:`openml.datasets.get_dataset`.

    Returns
    -------
    task: OpenMLTask

    Raises
    ------
    TypeError
        If `task_id` is not an `int`.
    """
    if not isinstance(task_id, int):
        raise TypeError(f"Task id should be integer, is {type(task_id)}")

    task_cache_root = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, task_id)
    task_cache_dir = task_cache_root / str(task_id)
    # Remember whether the directory was already there, so a failed download
    # only cleans up a directory that did not exist before this call.
    cache_dir_preexisting = task_cache_dir.exists()

    try:
        task = _get_task_description(task_id)
        dataset = get_dataset(task.dataset_id, **get_dataset_kwargs)

        # Class labels come from the dataset description; storing them on
        # the task covers the case where data download was initially disabled.
        labelled_task_types = (OpenMLClassificationTask, OpenMLLearningCurveTask)
        if isinstance(task, labelled_task_types):
            task.class_labels = dataset.retrieve_class_labels(task.target_name)

        # Clustering tasks have no class labels and offer no download_split.
        if download_splits and isinstance(task, OpenMLSupervisedTask):
            task.download_split()
    except Exception as err:
        if not cache_dir_preexisting:
            openml.utils._remove_cache_dir_for_id(TASKS_CACHE_DIR_NAME, task_cache_dir)
        raise err
    else:
        return task

get_tasks #

get_tasks(task_ids: list[int], download_data: bool | None = None, download_qualities: bool | None = None) -> list[OpenMLTask]

Download tasks.

This function iterates :meth:openml.tasks.get_task.

PARAMETER DESCRIPTION
task_ids

A list of task ids to download.

TYPE: List[int]

download_data

Option to trigger download of data along with the meta data.

TYPE: bool(default=True) DEFAULT: None

download_qualities

Option to download 'qualities' meta-data in addition to the minimal dataset description.

TYPE: bool(default=True) DEFAULT: None

RETURNS DESCRIPTION
list
Source code in openml/tasks/functions.py
def get_tasks(
    task_ids: list[int],
    download_data: bool | None = None,
    download_qualities: bool | None = None,
) -> list[OpenMLTask]:
    """Download tasks.

    This function iterates :meth:`openml.tasks.get_task`.

    Parameters
    ----------
    task_ids : List[int]
        A list of task ids to download.
    download_data : bool, optional (default=None)
        Option to trigger download of data along with the meta data.
        When left as ``None`` a warning is issued and ``True`` is used.
    download_qualities : bool, optional (default=None)
        Option to download 'qualities' meta-data in addition to the minimal dataset description.
        When left as ``None`` a warning is issued and ``True`` is used.

    Returns
    -------
    list
        One task per entry of `task_ids`, in the same order.
    """
    if download_data is None:
        # stacklevel=2 attributes the warning to the caller's line, which is
        # where the argument has to be set.
        warnings.warn(
            "`download_data` will default to False starting in 0.16. "
            "Please set `download_data` explicitly to suppress this warning.",
            stacklevel=2,
        )
        download_data = True

    if download_qualities is None:
        warnings.warn(
            "`download_qualities` will default to False starting in 0.16. "
            "Please set `download_qualities` explicitly to suppress this warning.",
            stacklevel=2,
        )
        download_qualities = True

    return [
        get_task(task_id, download_data=download_data, download_qualities=download_qualities)
        for task_id in task_ids
    ]

list_tasks #

list_tasks(task_type: TaskType | None = None, offset: int | None = None, size: int | None = None, tag: str | None = None, data_tag: str | None = None, status: str | None = None, data_name: str | None = None, data_id: int | None = None, number_instances: int | None = None, number_features: int | None = None, number_classes: int | None = None, number_missing_values: int | None = None) -> DataFrame

Return a number of tasks having the given tag and task_type

PARAMETER DESCRIPTION
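Note: the task_type filter is listed separately from the other filters because it is called task_type in the task description, but it is named type when used as a filter in the list tasks call.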

offset

the number of tasks to skip, starting from the first

TYPE: int DEFAULT: None

task_type

Refers to the type of task.

TYPE: TaskType DEFAULT: None

size

the maximum number of tasks to show

TYPE: int DEFAULT: None

tag

the tag to include

TYPE: str DEFAULT: None

data_tag

the tag of the dataset

TYPE: str DEFAULT: None

data_id

TYPE: int DEFAULT: None

status

TYPE: str DEFAULT: None

data_name

TYPE: str DEFAULT: None

number_instances

TYPE: int DEFAULT: None

number_features

TYPE: int DEFAULT: None

number_classes

TYPE: int DEFAULT: None

number_missing_values

TYPE: int DEFAULT: None

RETURNS DESCRIPTION
dataframe

All tasks having the given task_type and the given tag. Every task is represented by a row in the data frame containing the following information as columns: task id, dataset id, task_type and status. If qualities are calculated for the associated dataset, some of these are also returned.

Source code in openml/tasks/functions.py
def list_tasks(  # noqa: PLR0913
    task_type: TaskType | None = None,
    offset: int | None = None,
    size: int | None = None,
    tag: str | None = None,
    data_tag: str | None = None,
    status: str | None = None,
    data_name: str | None = None,
    data_id: int | None = None,
    number_instances: int | None = None,
    number_features: int | None = None,
    number_classes: int | None = None,
    number_missing_values: int | None = None,
) -> pd.DataFrame:
    """Return the tasks matching the given filters as a data frame.

    The ``task_type`` filter is kept separate from the other filters because
    it is used as task_type in the task description, but it is named
    type when used as a filter in the list tasks call.

    Parameters
    ----------
    task_type : TaskType, optional
        Refers to the type of task.
    offset : int, optional
        the number of tasks to skip, starting from the first
    size : int, optional
        the maximum number of tasks to show
    tag : str, optional
        the tag to include
    data_tag : str, optional
        the tag of the dataset
    status : str, optional
    data_name : str, optional
    data_id : int, optional
    number_instances : int, optional
    number_features : int, optional
    number_classes : int, optional
    number_missing_values : int, optional

    Returns
    -------
    dataframe
        All tasks having the given task_type and the given tag. Every task is
        represented by a row in the data frame containing the following information
        as columns: task id, dataset id, task_type and status. If qualities are
        calculated for the associated dataset, some of these are also returned.
        An empty data frame is returned when no batches come back.
    """
    # Every filter except the paging arguments is bound to the listing call;
    # `offset` and `size` are handled by `_list_all`.
    filters = {
        "task_type": task_type,
        "tag": tag,
        "data_tag": data_tag,
        "status": status,
        "data_id": data_id,
        "data_name": data_name,
        "number_instances": number_instances,
        "number_features": number_features,
        "number_classes": number_classes,
        "number_missing_values": number_missing_values,
    }
    fetch_batch = partial(_list_tasks, **filters)
    batches = openml.utils._list_all(fetch_batch, offset=offset, limit=size)

    return pd.concat(batches) if len(batches) != 0 else pd.DataFrame()