
openml.tasks #

OpenMLClassificationTask #

OpenMLClassificationTask(task_type_id: TaskType, task_type: str, data_set_id: int, target_name: str, estimation_procedure_id: int = 1, estimation_procedure_type: str | None = None, estimation_parameters: dict[str, str] | None = None, evaluation_measure: str | None = None, data_splits_url: str | None = None, task_id: int | None = None, class_labels: list[str] | None = None, cost_matrix: ndarray | None = None)

Bases: OpenMLSupervisedTask

OpenML Classification object.

Parameters#

task_type_id : TaskType
    ID of the Classification task type.
task_type : str
    Name of the Classification task type.
data_set_id : int
    ID of the OpenML dataset associated with the Classification task.
target_name : str
    Name of the target variable.
estimation_procedure_id : int, default=1
    ID of the estimation procedure for the Classification task.
estimation_procedure_type : str, default=None
    Type of the estimation procedure.
estimation_parameters : dict, default=None
    Estimation parameters for the Classification task.
evaluation_measure : str, default=None
    Name of the evaluation measure.
data_splits_url : str, default=None
    URL of the data splits for the Classification task.
task_id : Union[int, None]
    ID of the Classification task (if it already exists on OpenML).
class_labels : list of str, default=None
    A list of class labels (for classification tasks).
cost_matrix : array, default=None
    A cost matrix (for classification tasks). Not yet supported; passing one raises NotImplementedError.

Source code in openml/tasks/task.py
def __init__(  # noqa: PLR0913
    self,
    task_type_id: TaskType,
    task_type: str,
    data_set_id: int,
    target_name: str,
    estimation_procedure_id: int = 1,
    estimation_procedure_type: str | None = None,
    estimation_parameters: dict[str, str] | None = None,
    evaluation_measure: str | None = None,
    data_splits_url: str | None = None,
    task_id: int | None = None,
    class_labels: list[str] | None = None,
    cost_matrix: np.ndarray | None = None,
):
    super().__init__(
        task_id=task_id,
        task_type_id=task_type_id,
        task_type=task_type,
        data_set_id=data_set_id,
        estimation_procedure_id=estimation_procedure_id,
        estimation_procedure_type=estimation_procedure_type,
        estimation_parameters=estimation_parameters,
        evaluation_measure=evaluation_measure,
        target_name=target_name,
        data_splits_url=data_splits_url,
    )
    self.class_labels = class_labels
    self.cost_matrix = cost_matrix

    if cost_matrix is not None:
        raise NotImplementedError("Costmatrix")

estimation_parameters property writable #

estimation_parameters: dict[str, str] | None

Return the estimation parameters for the task.

id property #

id: int | None

Return the OpenML ID of this task.

openml_url property #

openml_url: str | None

The URL of the object on the server, if it was uploaded, else None.

download_split #

download_split() -> OpenMLSplit

Download the OpenML split for a given task.

Source code in openml/tasks/task.py
def download_split(self) -> OpenMLSplit:
    """Download the OpenML split for a given task."""
    # TODO(eddiebergman): Can this ever be `None`?
    assert self.task_id is not None
    cache_dir = _create_cache_directory_for_id("tasks", self.task_id)
    cached_split_file = cache_dir / "datasplits.arff"

    try:
        split = OpenMLSplit._from_arff_file(cached_split_file)
    except OSError:
        # Next, download and cache the associated split file
        self._download_split(cached_split_file)
        split = OpenMLSplit._from_arff_file(cached_split_file)

    return split

get_X_and_y #

get_X_and_y() -> tuple[DataFrame, Series | DataFrame | None]

Get data associated with the current task.

Returns#

tuple
    X as a pandas DataFrame and y as a pandas Series, DataFrame, or None.

Source code in openml/tasks/task.py
def get_X_and_y(self) -> tuple[pd.DataFrame, pd.Series | pd.DataFrame | None]:
    """Get data associated with the current task.

    Returns
    -------
    tuple - X and y

    """
    dataset = self.get_dataset()
    if self.task_type_id not in (
        TaskType.SUPERVISED_CLASSIFICATION,
        TaskType.SUPERVISED_REGRESSION,
        TaskType.LEARNING_CURVE,
    ):
        raise NotImplementedError(self.task_type)

    X, y, _, _ = dataset.get_data(target=self.target_name)
    return X, y
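
A short sketch of typical usage; the task ID is illustrative:

import openml

task = openml.tasks.get_task(31)  # illustrative task ID
X, y = task.get_X_and_y()         # X: pandas DataFrame, y: Series/DataFrame or None
print(X.shape, y.shape if y is not None else None)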

get_dataset #

get_dataset(**kwargs: Any) -> OpenMLDataset

Download dataset associated with task.

Accepts the same keyword arguments as openml.datasets.get_dataset.

Source code in openml/tasks/task.py
def get_dataset(self, **kwargs: Any) -> datasets.OpenMLDataset:
    """Download dataset associated with task.

    Accepts the same keyword arguments as the `openml.datasets.get_dataset`.
    """
    return datasets.get_dataset(self.dataset_id, **kwargs)
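
For example, a sketch; download_data is one of the keyword arguments accepted by openml.datasets.get_dataset:

import openml

task = openml.tasks.get_task(31)  # illustrative task ID
dataset = task.get_dataset(download_data=False)  # fetch metadata only
print(dataset.name)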

get_split_dimensions #

get_split_dimensions() -> tuple[int, int, int]

Get the (repeats, folds, samples) of the split for a given task.

Source code in openml/tasks/task.py
def get_split_dimensions(self) -> tuple[int, int, int]:
    """Get the (repeats, folds, samples) of the split for a given task."""
    if self.split is None:
        self.split = self.download_split()

    return self.split.repeats, self.split.folds, self.split.samples
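
For example:

import openml

task = openml.tasks.get_task(31)  # illustrative task ID
n_repeats, n_folds, n_samples = task.get_split_dimensions()
print(n_repeats, n_folds, n_samples)  # e.g. 1, 10, 1 for plain 10-fold cross-validation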

get_train_test_split_indices #

get_train_test_split_indices(fold: int = 0, repeat: int = 0, sample: int = 0) -> tuple[ndarray, ndarray]

Get the indices of the train and test splits for a given task.

Source code in openml/tasks/task.py
def get_train_test_split_indices(
    self,
    fold: int = 0,
    repeat: int = 0,
    sample: int = 0,
) -> tuple[np.ndarray, np.ndarray]:
    """Get the indices of the train and test splits for a given task."""
    # Replace with retrieve from cache
    if self.split is None:
        self.split = self.download_split()

    return self.split.get(repeat=repeat, fold=fold, sample=sample)
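
A sketch combining the split indices with the data returned by get_X_and_y; the indices are positional, hence .iloc:

import openml

task = openml.tasks.get_task(31)  # illustrative task ID
X, y = task.get_X_and_y()
train_idx, test_idx = task.get_train_test_split_indices(fold=0, repeat=0)
X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]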

open_in_browser #

open_in_browser() -> None

Opens the OpenML web page corresponding to this object in your default browser.

Source code in openml/base.py
def open_in_browser(self) -> None:
    """Opens the OpenML web page corresponding to this object in your default browser."""
    if self.openml_url is None:
        raise ValueError(
            "Cannot open element on OpenML.org when attribute `openml_url` is `None`",
        )

    webbrowser.open(self.openml_url)

publish #

publish() -> OpenMLBase

Publish the object on the OpenML server.

Source code in openml/base.py
def publish(self) -> OpenMLBase:
    """Publish the object on the OpenML server."""
    file_elements = self._get_file_elements()

    if "description" not in file_elements:
        file_elements["description"] = self._to_xml()

    call = f"{_get_rest_api_type_alias(self)}/"
    response_text = openml._api_calls._perform_api_call(
        call,
        "post",
        file_elements=file_elements,
    )
    xml_response = xmltodict.parse(response_text)

    self._parse_publish_response(xml_response)
    return self

push_tag #

push_tag(tag: str) -> None

Annotates this entity with a tag on the server.

Parameters#

tag : str
    Tag to attach to the entity.

Source code in openml/base.py
def push_tag(self, tag: str) -> None:
    """Annotates this entity with a tag on the server.

    Parameters
    ----------
    tag : str
        Tag to attach to the entity.
    """
    _tag_openml_base(self, tag)

remove_tag #

remove_tag(tag: str) -> None

Removes a tag from this entity on the server.

Parameters#

tag : str
    Tag to remove from the entity.

Source code in openml/base.py
def remove_tag(self, tag: str) -> None:
    """Removes a tag from this entity on the server.

    Parameters
    ----------
    tag : str
        Tag to remove from the entity.
    """
    _tag_openml_base(self, tag, untag=True)

url_for_id classmethod #

url_for_id(id_: int) -> str

Return the OpenML URL for the object of the class entity with the given id.

Source code in openml/base.py
@classmethod
def url_for_id(cls, id_: int) -> str:
    """Return the OpenML URL for the object of the class entity with the given id."""
    # Sample url for a flow: openml.org/f/123
    return f"{openml.config.get_server_base_url()}/{cls._entity_letter()}/{id_}"

OpenMLClusteringTask #

OpenMLClusteringTask(task_type_id: TaskType, task_type: str, data_set_id: int, estimation_procedure_id: int = 17, task_id: int | None = None, estimation_procedure_type: str | None = None, estimation_parameters: dict[str, str] | None = None, data_splits_url: str | None = None, evaluation_measure: str | None = None, target_name: str | None = None)

Bases: OpenMLTask

OpenML Clustering object.

Parameters#

task_type_id : TaskType
    Task type ID of the OpenML clustering task.
task_type : str
    Task type of the OpenML clustering task.
data_set_id : int
    ID of the OpenML dataset used in the clustering task.
estimation_procedure_id : int, default=17
    ID of the OpenML estimation procedure.
task_id : Union[int, None]
    ID of the OpenML clustering task.
estimation_procedure_type : str, default=None
    Type of the OpenML estimation procedure used in the clustering task.
estimation_parameters : dict, default=None
    Parameters used by the OpenML estimation procedure.
data_splits_url : str, default=None
    URL of the OpenML data splits for the clustering task.
evaluation_measure : str, default=None
    Evaluation measure used in the clustering task.
target_name : str, default=None
    Name of the target feature (class) that is not part of the feature set for the clustering task.

Source code in openml/tasks/task.py
def __init__(  # noqa: PLR0913
    self,
    task_type_id: TaskType,
    task_type: str,
    data_set_id: int,
    estimation_procedure_id: int = 17,
    task_id: int | None = None,
    estimation_procedure_type: str | None = None,
    estimation_parameters: dict[str, str] | None = None,
    data_splits_url: str | None = None,
    evaluation_measure: str | None = None,
    target_name: str | None = None,
):
    super().__init__(
        task_id=task_id,
        task_type_id=task_type_id,
        task_type=task_type,
        data_set_id=data_set_id,
        evaluation_measure=evaluation_measure,
        estimation_procedure_id=estimation_procedure_id,
        estimation_procedure_type=estimation_procedure_type,
        estimation_parameters=estimation_parameters,
        data_splits_url=data_splits_url,
    )

    self.target_name = target_name

id property #

id: int | None

Return the OpenML ID of this task.

openml_url property #

openml_url: str | None

The URL of the object on the server, if it was uploaded, else None.

download_split #

download_split() -> OpenMLSplit

Download the OpenML split for a given task.

Source code in openml/tasks/task.py
def download_split(self) -> OpenMLSplit:
    """Download the OpenML split for a given task."""
    # TODO(eddiebergman): Can this ever be `None`?
    assert self.task_id is not None
    cache_dir = _create_cache_directory_for_id("tasks", self.task_id)
    cached_split_file = cache_dir / "datasplits.arff"

    try:
        split = OpenMLSplit._from_arff_file(cached_split_file)
    except OSError:
        # Next, download and cache the associated split file
        self._download_split(cached_split_file)
        split = OpenMLSplit._from_arff_file(cached_split_file)

    return split

get_X #

get_X() -> DataFrame

Get data associated with the current task.

Returns#

The X data as a dataframe

Source code in openml/tasks/task.py
def get_X(self) -> pd.DataFrame:
    """Get data associated with the current task.

    Returns
    -------
    The X data as a dataframe
    """
    dataset = self.get_dataset()
    data, *_ = dataset.get_data(target=None)
    return data
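
For example, a sketch with a hypothetical clustering task ID:

import openml

clustering_task = openml.tasks.get_task(126033)  # hypothetical clustering task ID
X = clustering_task.get_X()
print(X.head())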

get_dataset #

get_dataset(**kwargs: Any) -> OpenMLDataset

Download dataset associated with task.

Accepts the same keyword arguments as openml.datasets.get_dataset.

Source code in openml/tasks/task.py
def get_dataset(self, **kwargs: Any) -> datasets.OpenMLDataset:
    """Download dataset associated with task.

    Accepts the same keyword arguments as the `openml.datasets.get_dataset`.
    """
    return datasets.get_dataset(self.dataset_id, **kwargs)

get_split_dimensions #

get_split_dimensions() -> tuple[int, int, int]

Get the (repeats, folds, samples) of the split for a given task.

Source code in openml/tasks/task.py
def get_split_dimensions(self) -> tuple[int, int, int]:
    """Get the (repeats, folds, samples) of the split for a given task."""
    if self.split is None:
        self.split = self.download_split()

    return self.split.repeats, self.split.folds, self.split.samples

get_train_test_split_indices #

get_train_test_split_indices(fold: int = 0, repeat: int = 0, sample: int = 0) -> tuple[ndarray, ndarray]

Get the indices of the train and test splits for a given task.

Source code in openml/tasks/task.py
def get_train_test_split_indices(
    self,
    fold: int = 0,
    repeat: int = 0,
    sample: int = 0,
) -> tuple[np.ndarray, np.ndarray]:
    """Get the indices of the train and test splits for a given task."""
    # Replace with retrieve from cache
    if self.split is None:
        self.split = self.download_split()

    return self.split.get(repeat=repeat, fold=fold, sample=sample)

open_in_browser #

open_in_browser() -> None

Opens the OpenML web page corresponding to this object in your default browser.

Source code in openml/base.py
def open_in_browser(self) -> None:
    """Opens the OpenML web page corresponding to this object in your default browser."""
    if self.openml_url is None:
        raise ValueError(
            "Cannot open element on OpenML.org when attribute `openml_url` is `None`",
        )

    webbrowser.open(self.openml_url)

publish #

publish() -> OpenMLBase

Publish the object on the OpenML server.

Source code in openml/base.py
def publish(self) -> OpenMLBase:
    """Publish the object on the OpenML server."""
    file_elements = self._get_file_elements()

    if "description" not in file_elements:
        file_elements["description"] = self._to_xml()

    call = f"{_get_rest_api_type_alias(self)}/"
    response_text = openml._api_calls._perform_api_call(
        call,
        "post",
        file_elements=file_elements,
    )
    xml_response = xmltodict.parse(response_text)

    self._parse_publish_response(xml_response)
    return self

push_tag #

push_tag(tag: str) -> None

Annotates this entity with a tag on the server.

Parameters#

tag : str
    Tag to attach to the entity.

Source code in openml/base.py
def push_tag(self, tag: str) -> None:
    """Annotates this entity with a tag on the server.

    Parameters
    ----------
    tag : str
        Tag to attach to the entity.
    """
    _tag_openml_base(self, tag)

remove_tag #

remove_tag(tag: str) -> None

Removes a tag from this entity on the server.

Parameters#

tag : str
    Tag to remove from the entity.

Source code in openml/base.py
def remove_tag(self, tag: str) -> None:
    """Removes a tag from this entity on the server.

    Parameters
    ----------
    tag : str
        Tag to remove from the entity.
    """
    _tag_openml_base(self, tag, untag=True)

url_for_id classmethod #

url_for_id(id_: int) -> str

Return the OpenML URL for the object of the class entity with the given id.

Source code in openml/base.py
@classmethod
def url_for_id(cls, id_: int) -> str:
    """Return the OpenML URL for the object of the class entity with the given id."""
    # Sample url for a flow: openml.org/f/123
    return f"{openml.config.get_server_base_url()}/{cls._entity_letter()}/{id_}"

OpenMLLearningCurveTask #

OpenMLLearningCurveTask(task_type_id: TaskType, task_type: str, data_set_id: int, target_name: str, estimation_procedure_id: int = 13, estimation_procedure_type: str | None = None, estimation_parameters: dict[str, str] | None = None, data_splits_url: str | None = None, task_id: int | None = None, evaluation_measure: str | None = None, class_labels: list[str] | None = None, cost_matrix: ndarray | None = None)

Bases: OpenMLClassificationTask

OpenML Learning Curve object.

Parameters#

task_type_id : TaskType
    ID of the Learning Curve task.
task_type : str
    Name of the Learning Curve task.
data_set_id : int
    ID of the dataset that this task is associated with.
target_name : str
    Name of the target feature in the dataset.
estimation_procedure_id : int, default=13
    ID of the estimation procedure to use for evaluating models.
estimation_procedure_type : str, default=None
    Type of the estimation procedure.
estimation_parameters : dict, default=None
    Additional parameters for the estimation procedure.
data_splits_url : str, default=None
    URL of the file containing the data splits for the Learning Curve task.
task_id : Union[int, None]
    ID of the Learning Curve task.
evaluation_measure : str, default=None
    Name of the evaluation measure to use for evaluating models.
class_labels : list of str, default=None
    Class labels for Learning Curve tasks.
cost_matrix : numpy array, default=None
    Cost matrix for Learning Curve tasks.

Source code in openml/tasks/task.py
def __init__(  # noqa: PLR0913
    self,
    task_type_id: TaskType,
    task_type: str,
    data_set_id: int,
    target_name: str,
    estimation_procedure_id: int = 13,
    estimation_procedure_type: str | None = None,
    estimation_parameters: dict[str, str] | None = None,
    data_splits_url: str | None = None,
    task_id: int | None = None,
    evaluation_measure: str | None = None,
    class_labels: list[str] | None = None,
    cost_matrix: np.ndarray | None = None,
):
    super().__init__(
        task_id=task_id,
        task_type_id=task_type_id,
        task_type=task_type,
        data_set_id=data_set_id,
        estimation_procedure_id=estimation_procedure_id,
        estimation_procedure_type=estimation_procedure_type,
        estimation_parameters=estimation_parameters,
        evaluation_measure=evaluation_measure,
        target_name=target_name,
        data_splits_url=data_splits_url,
        class_labels=class_labels,
        cost_matrix=cost_matrix,
    )

estimation_parameters property writable #

estimation_parameters: dict[str, str] | None

Return the estimation parameters for the task.

id property #

id: int | None

Return the OpenML ID of this task.

openml_url property #

openml_url: str | None

The URL of the object on the server, if it was uploaded, else None.

download_split #

download_split() -> OpenMLSplit

Download the OpenML split for a given task.

Source code in openml/tasks/task.py
def download_split(self) -> OpenMLSplit:
    """Download the OpenML split for a given task."""
    # TODO(eddiebergman): Can this ever be `None`?
    assert self.task_id is not None
    cache_dir = _create_cache_directory_for_id("tasks", self.task_id)
    cached_split_file = cache_dir / "datasplits.arff"

    try:
        split = OpenMLSplit._from_arff_file(cached_split_file)
    except OSError:
        # Next, download and cache the associated split file
        self._download_split(cached_split_file)
        split = OpenMLSplit._from_arff_file(cached_split_file)

    return split

get_X_and_y #

get_X_and_y() -> tuple[DataFrame, Series | DataFrame | None]

Get data associated with the current task.

Returns#

tuple
    X as a pandas DataFrame and y as a pandas Series, DataFrame, or None.

Source code in openml/tasks/task.py
def get_X_and_y(self) -> tuple[pd.DataFrame, pd.Series | pd.DataFrame | None]:
    """Get data associated with the current task.

    Returns
    -------
    tuple - X and y

    """
    dataset = self.get_dataset()
    if self.task_type_id not in (
        TaskType.SUPERVISED_CLASSIFICATION,
        TaskType.SUPERVISED_REGRESSION,
        TaskType.LEARNING_CURVE,
    ):
        raise NotImplementedError(self.task_type)

    X, y, _, _ = dataset.get_data(target=self.target_name)
    return X, y

get_dataset #

get_dataset(**kwargs: Any) -> OpenMLDataset

Download dataset associated with task.

Accepts the same keyword arguments as openml.datasets.get_dataset.

Source code in openml/tasks/task.py
def get_dataset(self, **kwargs: Any) -> datasets.OpenMLDataset:
    """Download dataset associated with task.

    Accepts the same keyword arguments as the `openml.datasets.get_dataset`.
    """
    return datasets.get_dataset(self.dataset_id, **kwargs)

get_split_dimensions #

get_split_dimensions() -> tuple[int, int, int]

Get the (repeats, folds, samples) of the split for a given task.

Source code in openml/tasks/task.py
def get_split_dimensions(self) -> tuple[int, int, int]:
    """Get the (repeats, folds, samples) of the split for a given task."""
    if self.split is None:
        self.split = self.download_split()

    return self.split.repeats, self.split.folds, self.split.samples

get_train_test_split_indices #

get_train_test_split_indices(fold: int = 0, repeat: int = 0, sample: int = 0) -> tuple[ndarray, ndarray]

Get the indices of the train and test splits for a given task.

Source code in openml/tasks/task.py
def get_train_test_split_indices(
    self,
    fold: int = 0,
    repeat: int = 0,
    sample: int = 0,
) -> tuple[np.ndarray, np.ndarray]:
    """Get the indices of the train and test splits for a given task."""
    # Replace with retrieve from cache
    if self.split is None:
        self.split = self.download_split()

    return self.split.get(repeat=repeat, fold=fold, sample=sample)

open_in_browser #

open_in_browser() -> None

Opens the OpenML web page corresponding to this object in your default browser.

Source code in openml/base.py
def open_in_browser(self) -> None:
    """Opens the OpenML web page corresponding to this object in your default browser."""
    if self.openml_url is None:
        raise ValueError(
            "Cannot open element on OpenML.org when attribute `openml_url` is `None`",
        )

    webbrowser.open(self.openml_url)

publish #

publish() -> OpenMLBase

Publish the object on the OpenML server.

Source code in openml/base.py
def publish(self) -> OpenMLBase:
    """Publish the object on the OpenML server."""
    file_elements = self._get_file_elements()

    if "description" not in file_elements:
        file_elements["description"] = self._to_xml()

    call = f"{_get_rest_api_type_alias(self)}/"
    response_text = openml._api_calls._perform_api_call(
        call,
        "post",
        file_elements=file_elements,
    )
    xml_response = xmltodict.parse(response_text)

    self._parse_publish_response(xml_response)
    return self

push_tag #

push_tag(tag: str) -> None

Annotates this entity with a tag on the server.

Parameters#

tag : str
    Tag to attach to the entity.

Source code in openml/base.py
def push_tag(self, tag: str) -> None:
    """Annotates this entity with a tag on the server.

    Parameters
    ----------
    tag : str
        Tag to attach to the entity.
    """
    _tag_openml_base(self, tag)

remove_tag #

remove_tag(tag: str) -> None

Removes a tag from this entity on the server.

Parameters#

tag : str
    Tag to remove from the entity.

Source code in openml/base.py
def remove_tag(self, tag: str) -> None:
    """Removes a tag from this entity on the server.

    Parameters
    ----------
    tag : str
        Tag to remove from the entity.
    """
    _tag_openml_base(self, tag, untag=True)

url_for_id classmethod #

url_for_id(id_: int) -> str

Return the OpenML URL for the object of the class entity with the given id.

Source code in openml/base.py
@classmethod
def url_for_id(cls, id_: int) -> str:
    """Return the OpenML URL for the object of the class entity with the given id."""
    # Sample url for a flow: openml.org/f/123
    return f"{openml.config.get_server_base_url()}/{cls._entity_letter()}/{id_}"

OpenMLRegressionTask #

OpenMLRegressionTask(task_type_id: TaskType, task_type: str, data_set_id: int, target_name: str, estimation_procedure_id: int = 7, estimation_procedure_type: str | None = None, estimation_parameters: dict[str, str] | None = None, data_splits_url: str | None = None, task_id: int | None = None, evaluation_measure: str | None = None)

Bases: OpenMLSupervisedTask

OpenML Regression object.

Parameters#

task_type_id : TaskType
    Task type ID of the OpenML Regression task.
task_type : str
    Task type of the OpenML Regression task.
data_set_id : int
    ID of the OpenML dataset.
target_name : str
    Name of the target feature used in the Regression task.
estimation_procedure_id : int, default=7
    ID of the OpenML estimation procedure.
estimation_procedure_type : str, default=None
    Type of the OpenML estimation procedure.
estimation_parameters : dict, default=None
    Parameters used by the OpenML estimation procedure.
data_splits_url : str, default=None
    URL of the OpenML data splits for the Regression task.
task_id : Union[int, None]
    ID of the OpenML Regression task.
evaluation_measure : str, default=None
    Evaluation measure used in the Regression task.

Source code in openml/tasks/task.py
def __init__(  # noqa: PLR0913
    self,
    task_type_id: TaskType,
    task_type: str,
    data_set_id: int,
    target_name: str,
    estimation_procedure_id: int = 7,
    estimation_procedure_type: str | None = None,
    estimation_parameters: dict[str, str] | None = None,
    data_splits_url: str | None = None,
    task_id: int | None = None,
    evaluation_measure: str | None = None,
):
    super().__init__(
        task_id=task_id,
        task_type_id=task_type_id,
        task_type=task_type,
        data_set_id=data_set_id,
        estimation_procedure_id=estimation_procedure_id,
        estimation_procedure_type=estimation_procedure_type,
        estimation_parameters=estimation_parameters,
        evaluation_measure=evaluation_measure,
        target_name=target_name,
        data_splits_url=data_splits_url,
    )

estimation_parameters property writable #

estimation_parameters: dict[str, str] | None

Return the estimation parameters for the task.

id property #

id: int | None

Return the OpenML ID of this task.

openml_url property #

openml_url: str | None

The URL of the object on the server, if it was uploaded, else None.

download_split #

download_split() -> OpenMLSplit

Download the OpenML split for a given task.

Source code in openml/tasks/task.py
def download_split(self) -> OpenMLSplit:
    """Download the OpenML split for a given task."""
    # TODO(eddiebergman): Can this ever be `None`?
    assert self.task_id is not None
    cache_dir = _create_cache_directory_for_id("tasks", self.task_id)
    cached_split_file = cache_dir / "datasplits.arff"

    try:
        split = OpenMLSplit._from_arff_file(cached_split_file)
    except OSError:
        # Next, download and cache the associated split file
        self._download_split(cached_split_file)
        split = OpenMLSplit._from_arff_file(cached_split_file)

    return split

get_X_and_y #

get_X_and_y() -> tuple[DataFrame, Series | DataFrame | None]

Get data associated with the current task.

Returns#

tuple
    X as a pandas DataFrame and y as a pandas Series, DataFrame, or None.

Source code in openml/tasks/task.py
def get_X_and_y(self) -> tuple[pd.DataFrame, pd.Series | pd.DataFrame | None]:
    """Get data associated with the current task.

    Returns
    -------
    tuple - X and y

    """
    dataset = self.get_dataset()
    if self.task_type_id not in (
        TaskType.SUPERVISED_CLASSIFICATION,
        TaskType.SUPERVISED_REGRESSION,
        TaskType.LEARNING_CURVE,
    ):
        raise NotImplementedError(self.task_type)

    X, y, _, _ = dataset.get_data(target=self.target_name)
    return X, y

get_dataset #

get_dataset(**kwargs: Any) -> OpenMLDataset

Download dataset associated with task.

Accepts the same keyword arguments as openml.datasets.get_dataset.

Source code in openml/tasks/task.py
def get_dataset(self, **kwargs: Any) -> datasets.OpenMLDataset:
    """Download dataset associated with task.

    Accepts the same keyword arguments as the `openml.datasets.get_dataset`.
    """
    return datasets.get_dataset(self.dataset_id, **kwargs)

get_split_dimensions #

get_split_dimensions() -> tuple[int, int, int]

Get the (repeats, folds, samples) of the split for a given task.

Source code in openml/tasks/task.py
def get_split_dimensions(self) -> tuple[int, int, int]:
    """Get the (repeats, folds, samples) of the split for a given task."""
    if self.split is None:
        self.split = self.download_split()

    return self.split.repeats, self.split.folds, self.split.samples

get_train_test_split_indices #

get_train_test_split_indices(fold: int = 0, repeat: int = 0, sample: int = 0) -> tuple[ndarray, ndarray]

Get the indices of the train and test splits for a given task.

Source code in openml/tasks/task.py
def get_train_test_split_indices(
    self,
    fold: int = 0,
    repeat: int = 0,
    sample: int = 0,
) -> tuple[np.ndarray, np.ndarray]:
    """Get the indices of the train and test splits for a given task."""
    # Replace with retrieve from cache
    if self.split is None:
        self.split = self.download_split()

    return self.split.get(repeat=repeat, fold=fold, sample=sample)

open_in_browser #

open_in_browser() -> None

Opens the OpenML web page corresponding to this object in your default browser.

Source code in openml/base.py
def open_in_browser(self) -> None:
    """Opens the OpenML web page corresponding to this object in your default browser."""
    if self.openml_url is None:
        raise ValueError(
            "Cannot open element on OpenML.org when attribute `openml_url` is `None`",
        )

    webbrowser.open(self.openml_url)

publish #

publish() -> OpenMLBase

Publish the object on the OpenML server.

Source code in openml/base.py
def publish(self) -> OpenMLBase:
    """Publish the object on the OpenML server."""
    file_elements = self._get_file_elements()

    if "description" not in file_elements:
        file_elements["description"] = self._to_xml()

    call = f"{_get_rest_api_type_alias(self)}/"
    response_text = openml._api_calls._perform_api_call(
        call,
        "post",
        file_elements=file_elements,
    )
    xml_response = xmltodict.parse(response_text)

    self._parse_publish_response(xml_response)
    return self

push_tag #

push_tag(tag: str) -> None

Annotates this entity with a tag on the server.

Parameters#

tag : str
    Tag to attach to the entity.

Source code in openml/base.py
def push_tag(self, tag: str) -> None:
    """Annotates this entity with a tag on the server.

    Parameters
    ----------
    tag : str
        Tag to attach to the entity.
    """
    _tag_openml_base(self, tag)

remove_tag #

remove_tag(tag: str) -> None

Removes a tag from this entity on the server.

Parameters#

tag : str
    Tag to remove from the entity.

Source code in openml/base.py
def remove_tag(self, tag: str) -> None:
    """Removes a tag from this entity on the server.

    Parameters
    ----------
    tag : str
        Tag to remove from the entity.
    """
    _tag_openml_base(self, tag, untag=True)

url_for_id classmethod #

url_for_id(id_: int) -> str

Return the OpenML URL for the object of the class entity with the given id.

Source code in openml/base.py
@classmethod
def url_for_id(cls, id_: int) -> str:
    """Return the OpenML URL for the object of the class entity with the given id."""
    # Sample url for a flow: openml.org/f/123
    return f"{openml.config.get_server_base_url()}/{cls._entity_letter()}/{id_}"

OpenMLSplit #

OpenMLSplit(name: int | str, description: str, split: dict[int, dict[int, dict[int, tuple[ndarray, ndarray]]]])

OpenML Split object.

This class manages train-test splits for a dataset across multiple repetitions, folds, and samples.

Parameters#

name : int or str
    The name or ID of the split.
description : str
    A description of the split.
split : dict
    A dictionary containing the splits organized by repetition, fold, and sample.

Source code in openml/tasks/split.py
def __init__(
    self,
    name: int | str,
    description: str,
    split: dict[int, dict[int, dict[int, tuple[np.ndarray, np.ndarray]]]],
):
    self.description = description
    self.name = name
    self.split: dict[int, dict[int, dict[int, tuple[np.ndarray, np.ndarray]]]] = {}

    # Add splits according to repetition
    for repetition in split:
        _rep = int(repetition)
        self.split[_rep] = OrderedDict()
        for fold in split[_rep]:
            self.split[_rep][fold] = OrderedDict()
            for sample in split[_rep][fold]:
                self.split[_rep][fold][sample] = split[_rep][fold][sample]

    self.repeats = len(self.split)

    if any(len(self.split[0]) != len(self.split[i]) for i in range(self.repeats)):
        raise ValueError("Number of folds is not the same across repetitions.")

    self.folds = len(self.split[0])
    self.samples = len(self.split[0][0])

get #

get(repeat: int = 0, fold: int = 0, sample: int = 0) -> tuple[ndarray, ndarray]

Return the specified data split from this OpenMLSplit object.

Parameters#

repeat : int
    Index of the repeat to retrieve.
fold : int
    Index of the fold to retrieve.
sample : int
    Index of the sample to retrieve.

Returns#

tuple of numpy.ndarray
    The train and test index arrays for the specified repeat, fold, and sample.

Raises#

ValueError
    If the specified repeat, fold, or sample is not known.

Source code in openml/tasks/split.py
def get(self, repeat: int = 0, fold: int = 0, sample: int = 0) -> tuple[np.ndarray, np.ndarray]:
    """Returns the specified data split from the CrossValidationSplit object.

    Parameters
    ----------
    repeat : int
        Index of the repeat to retrieve.
    fold : int
        Index of the fold to retrieve.
    sample : int
        Index of the sample to retrieve.

    Returns
    -------
    tuple of numpy.ndarray
        The train and test index arrays for the specified repeat, fold, and sample.

    Raises
    ------
    ValueError
        If the specified repeat, fold, or sample is not known.
    """
    if repeat not in self.split:
        raise ValueError(f"Repeat {repeat!s} not known")
    if fold not in self.split[repeat]:
        raise ValueError(f"Fold {fold!s} not known")
    if sample not in self.split[repeat][fold]:
        raise ValueError(f"Sample {sample!s} not known")
    return self.split[repeat][fold][sample]
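
For example, a sketch using a split obtained from a task; the task ID is illustrative and the split is assumed to have at least three folds:

import openml

task = openml.tasks.get_task(31)  # illustrative task ID
split = task.download_split()
train_idx, test_idx = split.get(repeat=0, fold=2, sample=0)
print(len(train_idx), len(test_idx))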

OpenMLSupervisedTask #

OpenMLSupervisedTask(task_type_id: TaskType, task_type: str, data_set_id: int, target_name: str, estimation_procedure_id: int = 1, estimation_procedure_type: str | None = None, estimation_parameters: dict[str, str] | None = None, evaluation_measure: str | None = None, data_splits_url: str | None = None, task_id: int | None = None)

Bases: OpenMLTask, ABC

OpenML Supervised Task object (abstract base class for classification, regression, and learning curve tasks).

Parameters#

task_type_id : TaskType
    ID of the task type.
task_type : str
    Name of the task type.
data_set_id : int
    ID of the OpenML dataset associated with the task.
target_name : str
    Name of the target feature (the class variable).
estimation_procedure_id : int, default=1
    ID of the estimation procedure for the task.
estimation_procedure_type : str, default=None
    Type of the estimation procedure for the task.
estimation_parameters : dict, default=None
    Estimation parameters for the task.
evaluation_measure : str, default=None
    Name of the evaluation measure for the task.
data_splits_url : str, default=None
    URL of the data splits for the task.
task_id : Union[int, None]
    Refers to the unique identifier of the task.

Source code in openml/tasks/task.py
def __init__(  # noqa: PLR0913
    self,
    task_type_id: TaskType,
    task_type: str,
    data_set_id: int,
    target_name: str,
    estimation_procedure_id: int = 1,
    estimation_procedure_type: str | None = None,
    estimation_parameters: dict[str, str] | None = None,
    evaluation_measure: str | None = None,
    data_splits_url: str | None = None,
    task_id: int | None = None,
):
    super().__init__(
        task_id=task_id,
        task_type_id=task_type_id,
        task_type=task_type,
        data_set_id=data_set_id,
        estimation_procedure_id=estimation_procedure_id,
        estimation_procedure_type=estimation_procedure_type,
        estimation_parameters=estimation_parameters,
        evaluation_measure=evaluation_measure,
        data_splits_url=data_splits_url,
    )

    self.target_name = target_name

estimation_parameters property writable #

estimation_parameters: dict[str, str] | None

Return the estimation parameters for the task.

id property #

id: int | None

Return the OpenML ID of this task.

openml_url property #

openml_url: str | None

The URL of the object on the server, if it was uploaded, else None.

download_split #

download_split() -> OpenMLSplit

Download the OpenML split for a given task.

Source code in openml/tasks/task.py
def download_split(self) -> OpenMLSplit:
    """Download the OpenML split for a given task."""
    # TODO(eddiebergman): Can this ever be `None`?
    assert self.task_id is not None
    cache_dir = _create_cache_directory_for_id("tasks", self.task_id)
    cached_split_file = cache_dir / "datasplits.arff"

    try:
        split = OpenMLSplit._from_arff_file(cached_split_file)
    except OSError:
        # Next, download and cache the associated split file
        self._download_split(cached_split_file)
        split = OpenMLSplit._from_arff_file(cached_split_file)

    return split

get_X_and_y #

get_X_and_y() -> tuple[DataFrame, Series | DataFrame | None]

Get data associated with the current task.

Returns#

tuple
    X as a pandas DataFrame and y as a pandas Series, DataFrame, or None.

Source code in openml/tasks/task.py
def get_X_and_y(self) -> tuple[pd.DataFrame, pd.Series | pd.DataFrame | None]:
    """Get data associated with the current task.

    Returns
    -------
    tuple - X and y

    """
    dataset = self.get_dataset()
    if self.task_type_id not in (
        TaskType.SUPERVISED_CLASSIFICATION,
        TaskType.SUPERVISED_REGRESSION,
        TaskType.LEARNING_CURVE,
    ):
        raise NotImplementedError(self.task_type)

    X, y, _, _ = dataset.get_data(target=self.target_name)
    return X, y

get_dataset #

get_dataset(**kwargs: Any) -> OpenMLDataset

Download dataset associated with task.

Accepts the same keyword arguments as openml.datasets.get_dataset.

Source code in openml/tasks/task.py
def get_dataset(self, **kwargs: Any) -> datasets.OpenMLDataset:
    """Download dataset associated with task.

    Accepts the same keyword arguments as the `openml.datasets.get_dataset`.
    """
    return datasets.get_dataset(self.dataset_id, **kwargs)

get_split_dimensions #

get_split_dimensions() -> tuple[int, int, int]

Get the (repeats, folds, samples) of the split for a given task.

Source code in openml/tasks/task.py
def get_split_dimensions(self) -> tuple[int, int, int]:
    """Get the (repeats, folds, samples) of the split for a given task."""
    if self.split is None:
        self.split = self.download_split()

    return self.split.repeats, self.split.folds, self.split.samples

get_train_test_split_indices #

get_train_test_split_indices(fold: int = 0, repeat: int = 0, sample: int = 0) -> tuple[ndarray, ndarray]

Get the indices of the train and test splits for a given task.

Source code in openml/tasks/task.py
def get_train_test_split_indices(
    self,
    fold: int = 0,
    repeat: int = 0,
    sample: int = 0,
) -> tuple[np.ndarray, np.ndarray]:
    """Get the indices of the train and test splits for a given task."""
    # Replace with retrieve from cache
    if self.split is None:
        self.split = self.download_split()

    return self.split.get(repeat=repeat, fold=fold, sample=sample)

open_in_browser #

open_in_browser() -> None

Opens the OpenML web page corresponding to this object in your default browser.

Source code in openml/base.py
def open_in_browser(self) -> None:
    """Opens the OpenML web page corresponding to this object in your default browser."""
    if self.openml_url is None:
        raise ValueError(
            "Cannot open element on OpenML.org when attribute `openml_url` is `None`",
        )

    webbrowser.open(self.openml_url)

publish #

publish() -> OpenMLBase

Publish the object on the OpenML server.

Source code in openml/base.py
def publish(self) -> OpenMLBase:
    """Publish the object on the OpenML server."""
    file_elements = self._get_file_elements()

    if "description" not in file_elements:
        file_elements["description"] = self._to_xml()

    call = f"{_get_rest_api_type_alias(self)}/"
    response_text = openml._api_calls._perform_api_call(
        call,
        "post",
        file_elements=file_elements,
    )
    xml_response = xmltodict.parse(response_text)

    self._parse_publish_response(xml_response)
    return self

push_tag #

push_tag(tag: str) -> None

Annotates this entity with a tag on the server.

Parameters#

tag : str
    Tag to attach to the entity.

Source code in openml/base.py
def push_tag(self, tag: str) -> None:
    """Annotates this entity with a tag on the server.

    Parameters
    ----------
    tag : str
        Tag to attach to the entity.
    """
    _tag_openml_base(self, tag)

remove_tag #

remove_tag(tag: str) -> None

Removes a tag from this entity on the server.

Parameters#

tag : str
    Tag to remove from the entity.

Source code in openml/base.py
def remove_tag(self, tag: str) -> None:
    """Removes a tag from this entity on the server.

    Parameters
    ----------
    tag : str
        Tag to remove from the entity.
    """
    _tag_openml_base(self, tag, untag=True)

url_for_id classmethod #

url_for_id(id_: int) -> str

Return the OpenML URL for the object of the class entity with the given id.

Source code in openml/base.py
@classmethod
def url_for_id(cls, id_: int) -> str:
    """Return the OpenML URL for the object of the class entity with the given id."""
    # Sample url for a flow: openml.org/f/123
    return f"{openml.config.get_server_base_url()}/{cls._entity_letter()}/{id_}"

OpenMLTask #

OpenMLTask(task_id: int | None, task_type_id: TaskType, task_type: str, data_set_id: int, estimation_procedure_id: int = 1, estimation_procedure_type: str | None = None, estimation_parameters: dict[str, str] | None = None, evaluation_measure: str | None = None, data_splits_url: str | None = None)

Bases: OpenMLBase

OpenML Task object.

Parameters#

task_id : Union[int, None]
    Refers to the unique identifier of the OpenML task.
task_type_id : TaskType
    Refers to the type of the OpenML task.
task_type : str
    Name of the OpenML task type.
data_set_id : int
    ID of the dataset the task is based on.
estimation_procedure_id : int, default=1
    Refers to the type of estimates used.
estimation_procedure_type : str, default=None
    Refers to the type of estimation procedure used for the OpenML task.
estimation_parameters : Dict[str, str], default=None
    Estimation parameters used for the OpenML task.
evaluation_measure : str, default=None
    Refers to the evaluation measure.
data_splits_url : str, default=None
    Refers to the URL of the data splits used for the OpenML task.

Source code in openml/tasks/task.py
def __init__(  # noqa: PLR0913
    self,
    task_id: int | None,
    task_type_id: TaskType,
    task_type: str,
    data_set_id: int,
    estimation_procedure_id: int = 1,
    estimation_procedure_type: str | None = None,
    estimation_parameters: dict[str, str] | None = None,
    evaluation_measure: str | None = None,
    data_splits_url: str | None = None,
):
    self.task_id = int(task_id) if task_id is not None else None
    self.task_type_id = task_type_id
    self.task_type = task_type
    self.dataset_id = int(data_set_id)
    self.evaluation_measure = evaluation_measure
    self.estimation_procedure: _EstimationProcedure = {
        "type": estimation_procedure_type,
        "parameters": estimation_parameters,
        "data_splits_url": data_splits_url,
    }
    self.estimation_procedure_id = estimation_procedure_id
    self.split: OpenMLSplit | None = None

id property #

id: int | None

Return the OpenML ID of this task.

openml_url property #

openml_url: str | None

The URL of the object on the server, if it was uploaded, else None.

download_split #

download_split() -> OpenMLSplit

Download the OpenML split for a given task.

Source code in openml/tasks/task.py
def download_split(self) -> OpenMLSplit:
    """Download the OpenML split for a given task."""
    # TODO(eddiebergman): Can this ever be `None`?
    assert self.task_id is not None
    cache_dir = _create_cache_directory_for_id("tasks", self.task_id)
    cached_split_file = cache_dir / "datasplits.arff"

    try:
        split = OpenMLSplit._from_arff_file(cached_split_file)
    except OSError:
        # Next, download and cache the associated split file
        self._download_split(cached_split_file)
        split = OpenMLSplit._from_arff_file(cached_split_file)

    return split

get_dataset #

get_dataset(**kwargs: Any) -> OpenMLDataset

Download dataset associated with task.

Accepts the same keyword arguments as openml.datasets.get_dataset.

Source code in openml/tasks/task.py
def get_dataset(self, **kwargs: Any) -> datasets.OpenMLDataset:
    """Download dataset associated with task.

    Accepts the same keyword arguments as the `openml.datasets.get_dataset`.
    """
    return datasets.get_dataset(self.dataset_id, **kwargs)

get_split_dimensions #

get_split_dimensions() -> tuple[int, int, int]

Get the (repeats, folds, samples) of the split for a given task.

Source code in openml/tasks/task.py
def get_split_dimensions(self) -> tuple[int, int, int]:
    """Get the (repeats, folds, samples) of the split for a given task."""
    if self.split is None:
        self.split = self.download_split()

    return self.split.repeats, self.split.folds, self.split.samples

get_train_test_split_indices #

get_train_test_split_indices(fold: int = 0, repeat: int = 0, sample: int = 0) -> tuple[ndarray, ndarray]

Get the indices of the train and test splits for a given task.

Source code in openml/tasks/task.py
def get_train_test_split_indices(
    self,
    fold: int = 0,
    repeat: int = 0,
    sample: int = 0,
) -> tuple[np.ndarray, np.ndarray]:
    """Get the indices of the train and test splits for a given task."""
    # Replace with retrieve from cache
    if self.split is None:
        self.split = self.download_split()

    return self.split.get(repeat=repeat, fold=fold, sample=sample)

open_in_browser #

open_in_browser() -> None

Opens the OpenML web page corresponding to this object in your default browser.

Source code in openml/base.py
def open_in_browser(self) -> None:
    """Opens the OpenML web page corresponding to this object in your default browser."""
    if self.openml_url is None:
        raise ValueError(
            "Cannot open element on OpenML.org when attribute `openml_url` is `None`",
        )

    webbrowser.open(self.openml_url)

publish #

publish() -> OpenMLBase

Publish the object on the OpenML server.

Source code in openml/base.py
def publish(self) -> OpenMLBase:
    """Publish the object on the OpenML server."""
    file_elements = self._get_file_elements()

    if "description" not in file_elements:
        file_elements["description"] = self._to_xml()

    call = f"{_get_rest_api_type_alias(self)}/"
    response_text = openml._api_calls._perform_api_call(
        call,
        "post",
        file_elements=file_elements,
    )
    xml_response = xmltodict.parse(response_text)

    self._parse_publish_response(xml_response)
    return self

push_tag #

push_tag(tag: str) -> None

Annotates this entity with a tag on the server.

Parameters#

tag : str
    Tag to attach to the entity.

Source code in openml/base.py
def push_tag(self, tag: str) -> None:
    """Annotates this entity with a tag on the server.

    Parameters
    ----------
    tag : str
        Tag to attach to the entity.
    """
    _tag_openml_base(self, tag)

remove_tag #

remove_tag(tag: str) -> None

Removes a tag from this entity on the server.

Parameters#

tag : str
    Tag to remove from the entity.

Source code in openml/base.py
def remove_tag(self, tag: str) -> None:
    """Removes a tag from this entity on the server.

    Parameters
    ----------
    tag : str
        Tag to remove from the entity.
    """
    _tag_openml_base(self, tag, untag=True)

url_for_id classmethod #

url_for_id(id_: int) -> str

Return the OpenML URL for the object of the class entity with the given id.

Source code in openml/base.py
@classmethod
def url_for_id(cls, id_: int) -> str:
    """Return the OpenML URL for the object of the class entity with the given id."""
    # Sample url for a flow: openml.org/f/123
    return f"{openml.config.get_server_base_url()}/{cls._entity_letter()}/{id_}"

TaskType #

Bases: Enum

Possible task types as defined in OpenML.
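
The members referenced elsewhere on this page include, for example:

from openml.tasks import TaskType

print(TaskType.SUPERVISED_CLASSIFICATION)
print(TaskType.SUPERVISED_REGRESSION)
print(TaskType.LEARNING_CURVE)
print(TaskType.CLUSTERING)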

create_task #

create_task(task_type: TaskType, dataset_id: int, estimation_procedure_id: int, target_name: str | None = None, evaluation_measure: str | None = None, **kwargs: Any) -> OpenMLClassificationTask | OpenMLRegressionTask | OpenMLLearningCurveTask | OpenMLClusteringTask

Create a task based on different given attributes.

Builds a task object with the function arguments as attributes. The type of the task object built is determined from the task type id. More information on how the arguments (task attributes) relate to the different possible tasks can be found in the individual task objects in the openml.tasks.task module.

Parameters#

task_type : TaskType
    Id of the task type.
dataset_id : int
    The id of the dataset for the task.
target_name : str, optional
    The name of the feature used as a target. At the moment, only optional for the clustering tasks.
estimation_procedure_id : int
    The id of the estimation procedure.
evaluation_measure : str, optional
    The name of the evaluation measure.
kwargs : dict, optional
    Other task attributes that are not mandatory for task upload.

Returns#

OpenMLClassificationTask, OpenMLRegressionTask, OpenMLLearningCurveTask, OpenMLClusteringTask

Source code in openml/tasks/functions.py
def create_task(
    task_type: TaskType,
    dataset_id: int,
    estimation_procedure_id: int,
    target_name: str | None = None,
    evaluation_measure: str | None = None,
    **kwargs: Any,
) -> (
    OpenMLClassificationTask | OpenMLRegressionTask | OpenMLLearningCurveTask | OpenMLClusteringTask
):
    """Create a task based on different given attributes.

    Builds a task object with the function arguments as
    attributes. The type of the task object built is
    determined from the task type id.
    More information on how the arguments (task attributes),
    relate to the different possible tasks can be found in
    the individual task objects at the openml.tasks.task
    module.

    Parameters
    ----------
    task_type : TaskType
        Id of the task type.
    dataset_id : int
        The id of the dataset for the task.
    target_name : str, optional
        The name of the feature used as a target.
        At the moment, only optional for the clustering tasks.
    estimation_procedure_id : int
        The id of the estimation procedure.
    evaluation_measure : str, optional
        The name of the evaluation measure.
    kwargs : dict, optional
        Other task attributes that are not mandatory
        for task upload.

    Returns
    -------
    OpenMLClassificationTask, OpenMLRegressionTask,
    OpenMLLearningCurveTask, OpenMLClusteringTask
    """
    if task_type == TaskType.CLUSTERING:
        task_cls = OpenMLClusteringTask
    elif task_type == TaskType.LEARNING_CURVE:
        task_cls = OpenMLLearningCurveTask  # type: ignore
    elif task_type == TaskType.SUPERVISED_CLASSIFICATION:
        task_cls = OpenMLClassificationTask  # type: ignore
    elif task_type == TaskType.SUPERVISED_REGRESSION:
        task_cls = OpenMLRegressionTask  # type: ignore
    else:
        raise NotImplementedError(f"Task type {task_type:d} not supported.")

    return task_cls(
        task_type_id=task_type,
        task_type="None",  # TODO: refactor to get task type string from ID.
        data_set_id=dataset_id,
        target_name=target_name,  # type: ignore
        estimation_procedure_id=estimation_procedure_id,
        evaluation_measure=evaluation_measure,
        **kwargs,
    )
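
For example, a sketch that builds a regression task locally and uploads it; all IDs and names are illustrative, and 7 is the estimation procedure default used by OpenMLRegressionTask above:

from openml.tasks import TaskType, create_task

task = create_task(
    task_type=TaskType.SUPERVISED_REGRESSION,
    dataset_id=123,             # hypothetical dataset ID
    estimation_procedure_id=7,  # e.g. the OpenMLRegressionTask default
    target_name="target",       # hypothetical target feature
)
task.publish()  # requires a configured API key; the server assigns task.id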

delete_task #

delete_task(task_id: int) -> bool

Delete task with id task_id from the OpenML server.

You can only delete tasks which you created and have no runs associated with them.

Parameters#

task_id : int
    OpenML id of the task

Returns#

bool
    True if the deletion was successful, False otherwise.

Source code in openml/tasks/functions.py
def delete_task(task_id: int) -> bool:
    """Delete task with id `task_id` from the OpenML server.

    You can only delete tasks which you created and have
    no runs associated with them.

    Parameters
    ----------
    task_id : int
        OpenML id of the task

    Returns
    -------
    bool
        True if the deletion was successful. False otherwise.
    """
    return openml.utils._delete_entity("task", task_id)

get_task #

get_task(task_id: int, download_splits: bool = False, **get_dataset_kwargs: Any) -> OpenMLTask

Download OpenML task for a given task ID.

Downloads the task representation.

Use the download_splits parameter to control whether the splits are downloaded. Moreover, you may pass additional parameters (args or kwargs) that are forwarded to :meth:openml.datasets.get_dataset.

Parameters#

task_id : int
    The OpenML task id of the task to download.
download_splits : bool (default=False)
    Whether to download the splits as well.
get_dataset_kwargs :
    Args and kwargs can be used to pass optional parameters to :meth:openml.datasets.get_dataset.

Returns#

task: OpenMLTask

Source code in openml/tasks/functions.py
@openml.utils.thread_safe_if_oslo_installed
def get_task(
    task_id: int,
    download_splits: bool = False,  # noqa: FBT001, FBT002
    **get_dataset_kwargs: Any,
) -> OpenMLTask:
    """Download OpenML task for a given task ID.

    Downloads the task representation.

    Use the `download_splits` parameter to control whether the splits are downloaded.
    Moreover, you may pass additional parameters (args or kwargs) that are passed to
    :meth:`openml.datasets.get_dataset`.

    Parameters
    ----------
    task_id : int
        The OpenML task id of the task to download.
    download_splits: bool (default=False)
        Whether to download the splits as well.
    get_dataset_kwargs :
        Args and kwargs can be used to pass optional parameters to :meth:`openml.datasets.get_dataset`.

    Returns
    -------
    task: OpenMLTask
    """
    if not isinstance(task_id, int):
        raise TypeError(f"Task id should be integer, is {type(task_id)}")

    tid_cache_dir = openml.utils._create_cache_directory_for_id(TASKS_CACHE_DIR_NAME, task_id)

    try:
        task = _get_task_description(task_id)
        dataset = get_dataset(task.dataset_id, **get_dataset_kwargs)
        # List of class labels available in dataset description
        # Including class labels as part of task meta data handles
        #   the case where data download was initially disabled
        if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):
            task.class_labels = dataset.retrieve_class_labels(task.target_name)
        # Clustering tasks do not have class labels
        # and do not offer download_split
        if download_splits and isinstance(task, OpenMLSupervisedTask):
            task.download_split()
    except Exception as e:
        openml.utils._remove_cache_dir_for_id(TASKS_CACHE_DIR_NAME, tid_cache_dir)
        raise e

    return task
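
For example (illustrative task ID; the extra keyword argument is forwarded to openml.datasets.get_dataset):

import openml

task = openml.tasks.get_task(31, download_splits=True, download_data=False)
print(task.task_id, task.dataset_id)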

get_tasks #

get_tasks(task_ids: list[int], download_data: bool | None = None, download_qualities: bool | None = None) -> list[OpenMLTask]

Download tasks.

This function calls :meth:openml.tasks.get_task for each of the given task ids.

Parameters#

task_ids : List[int]
    A list of task ids to download.
download_data : bool, optional (default=True)
    Option to trigger download of data along with the meta data. If not set explicitly, a deprecation warning is raised and the value defaults to True (it will default to False starting in 0.16).
download_qualities : bool, optional (default=True)
    Option to download 'qualities' meta-data in addition to the minimal dataset description. The same deprecation behaviour applies.

Returns#

list

Source code in openml/tasks/functions.py
def get_tasks(
    task_ids: list[int],
    download_data: bool | None = None,
    download_qualities: bool | None = None,
) -> list[OpenMLTask]:
    """Download tasks.

    This function iterates :meth:`openml.tasks.get_task`.

    Parameters
    ----------
    task_ids : List[int]
        A list of task ids to download.
    download_data : bool (default = True)
        Option to trigger download of data along with the meta data.
    download_qualities : bool (default=True)
        Option to download 'qualities' meta-data in addition to the minimal dataset description.

    Returns
    -------
    list
    """
    if download_data is None:
        warnings.warn(
            "`download_data` will default to False starting in 0.16. "
            "Please set `download_data` explicitly to suppress this warning.",
            stacklevel=1,
        )
        download_data = True

    if download_qualities is None:
        warnings.warn(
            "`download_qualities` will default to False starting in 0.16. "
            "Please set `download_qualities` explicitly to suppress this warning.",
            stacklevel=1,
        )
        download_qualities = True

    tasks = []
    for task_id in task_ids:
        tasks.append(
            get_task(task_id, download_data=download_data, download_qualities=download_qualities)
        )
    return tasks
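
For example, with illustrative task IDs; passing both flags explicitly suppresses the deprecation warnings:

import openml

tasks = openml.tasks.get_tasks([1, 2], download_data=False, download_qualities=False)
for task in tasks:
    print(task.task_id, task.task_type)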

list_tasks #

list_tasks(task_type: TaskType | None = None, offset: int | None = None, size: int | None = None, tag: str | None = None, data_tag: str | None = None, status: str | None = None, data_name: str | None = None, data_id: int | None = None, number_instances: int | None = None, number_features: int | None = None, number_classes: int | None = None, number_missing_values: int | None = None) -> DataFrame

Return a number of tasks having the given tag and task_type.

Parameters#

The task_type filter is separated from the other filters because it is used as task_type in the task description, but is named type when used as a filter in the list tasks call.

task_type : TaskType, optional
    Refers to the type of task.
offset : int, optional
    The number of tasks to skip, starting from the first.
size : int, optional
    The maximum number of tasks to show.
tag : str, optional
    The tag to include.
data_tag : str, optional
    The tag of the dataset.
status : str, optional
data_name : str, optional
data_id : int, optional
number_instances : int, optional
number_features : int, optional
number_classes : int, optional
number_missing_values : int, optional

Returns#

dataframe
    All tasks having the given task_type and the given tag. Every task is represented by a row in the data frame containing the following information as columns: task id, dataset id, task_type and status. If qualities are calculated for the associated dataset, some of these are also returned.

Source code in openml/tasks/functions.py
def list_tasks(  # noqa: PLR0913
    task_type: TaskType | None = None,
    offset: int | None = None,
    size: int | None = None,
    tag: str | None = None,
    data_tag: str | None = None,
    status: str | None = None,
    data_name: str | None = None,
    data_id: int | None = None,
    number_instances: int | None = None,
    number_features: int | None = None,
    number_classes: int | None = None,
    number_missing_values: int | None = None,
) -> pd.DataFrame:
    """
    Return a number of tasks having the given tag and task_type

    Parameters
    ----------
    Filter task_type is separated from the other filters because
    it is used as task_type in the task description, but it is named
    type when used as a filter in list tasks call.
    offset : int, optional
        the number of tasks to skip, starting from the first
    task_type : TaskType, optional
        Refers to the type of task.
    size : int, optional
        the maximum number of tasks to show
    tag : str, optional
        the tag to include
    data_tag : str, optional
        the tag of the dataset
    data_id : int, optional
    status : str, optional
    data_name : str, optional
    number_instances : int, optional
    number_features : int, optional
    number_classes : int, optional
    number_missing_values : int, optional

    Returns
    -------
    dataframe
        All tasks having the given task_type and the given tag. Every task is
        represented by a row in the data frame containing the following information
        as columns: task id, dataset id, task_type and status. If qualities are
        calculated for the associated dataset, some of these are also returned.
    """
    listing_call = partial(
        _list_tasks,
        task_type=task_type,
        tag=tag,
        data_tag=data_tag,
        status=status,
        data_id=data_id,
        data_name=data_name,
        number_instances=number_instances,
        number_features=number_features,
        number_classes=number_classes,
        number_missing_values=number_missing_values,
    )
    batches = openml.utils._list_all(listing_call, offset=offset, limit=size)
    if len(batches) == 0:
        return pd.DataFrame()

    return pd.concat(batches)
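
For example, a sketch listing at most 100 supervised classification tasks as a dataframe:

import openml
from openml.tasks import TaskType

df = openml.tasks.list_tasks(task_type=TaskType.SUPERVISED_CLASSIFICATION, size=100)
print(df.head())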