tasks

openml.tasks #

OpenMLClassificationTask #

OpenMLClassificationTask(task_type_id: TaskType, task_type: str, data_set_id: int, target_name: str, estimation_procedure_id: int | None = None, estimation_procedure_type: str | None = None, estimation_parameters: dict[str, str] | None = None, evaluation_measure: str | None = None, data_splits_url: str | None = None, task_id: int | None = None, class_labels: list[str] | None = None, cost_matrix: ndarray | None = None)

Bases: OpenMLSupervisedTask

OpenML Classification object.

PARAMETER	DESCRIPTION
`task_id`	ID of the Classification task (if it already exists on OpenML). TYPE: `Union[int, None]` DEFAULT: `None`
`task_type_id`	ID of the Classification task type. TYPE: `TaskType`
`task_type`	Name of the Classification task type. TYPE: `str`
`data_set_id`	ID of the OpenML dataset associated with the Classification task. TYPE: `int`
`target_name`	Name of the target variable. TYPE: `str`
`estimation_procedure_id`	ID of the estimation procedure for the Classification task. TYPE: `int` DEFAULT: `1`
`estimation_procedure_type`	Type of the estimation procedure. TYPE: `str` DEFAULT: `None`
`estimation_parameters`	Estimation parameters for the Classification task. TYPE: `dict` DEFAULT: `None`
`evaluation_measure`	Name of the evaluation measure. TYPE: `str` DEFAULT: `None`
`data_splits_url`	URL of the data splits for the Classification task. TYPE: `str` DEFAULT: `None`
`class_labels`	A list of class labels (for classification tasks). TYPE: `List of str` DEFAULT: `None`
`cost_matrix`	A cost matrix (for classification tasks). TYPE: `array` DEFAULT: `None`

Source code in openml/tasks/task.py

def __init__(  # noqa: PLR0913
    self,
    task_type_id: TaskType,
    task_type: str,
    data_set_id: int,
    target_name: str,
    estimation_procedure_id: int | None = None,
    estimation_procedure_type: str | None = None,
    estimation_parameters: dict[str, str] | None = None,
    evaluation_measure: str | None = None,
    data_splits_url: str | None = None,
    task_id: int | None = None,
    class_labels: list[str] | None = None,
    cost_matrix: np.ndarray | None = None,
):
    """Initialise the task and record its class labels and cost matrix.

    All arguments except ``class_labels`` and ``cost_matrix`` are handed to
    the parent constructor by keyword.

    Raises
    ------
    NotImplementedError
        If ``cost_matrix`` is not ``None``. The attributes are assigned
        before the error is raised.
    """
    super().__init__(
        task_id=task_id,
        task_type_id=task_type_id,
        task_type=task_type,
        data_set_id=data_set_id,
        target_name=target_name,
        estimation_procedure_id=estimation_procedure_id,
        estimation_procedure_type=estimation_procedure_type,
        estimation_parameters=estimation_parameters,
        evaluation_measure=evaluation_measure,
        data_splits_url=data_splits_url,
    )
    self.class_labels = class_labels
    self.cost_matrix = cost_matrix

    # Cost matrices are accepted in the signature but rejected at runtime.
    cost_matrix_given = cost_matrix is not None
    if cost_matrix_given:
        raise NotImplementedError("Costmatrix functionality is not yet implemented.")

estimation_parameters `property` `writable` #

estimation_parameters: dict[str, str] | None

Return the estimation parameters for the task.

id `property` #

id: int | None

Return the OpenML ID of this task.

openml_url `property` #

openml_url: str | None

The URL of the object on the server, if it was uploaded, else None.

download_split #

download_split() -> OpenMLSplit

Download the OpenML split for a given task.

Source code in openml/tasks/task.py

def download_split(self) -> OpenMLSplit:
    """Return the split of this task, downloading it when the cache load fails.

    The split is read from ``datasplits.arff`` inside the directory returned
    by ``_create_cache_directory_for_id("tasks", task_id)``. If that read
    raises ``OSError``, the file is fetched with ``self._download_split``
    and read a second time.
    """
    # TODO(eddiebergman): Can this ever be `None`?
    assert self.task_id is not None
    split_path = _create_cache_directory_for_id("tasks", self.task_id) / "datasplits.arff"

    try:
        return OpenMLSplit._from_arff_file(split_path)
    except OSError:
        # Loading from the cache failed: download the split file to the same
        # path, then parse it again.
        self._download_split(split_path)
        return OpenMLSplit._from_arff_file(split_path)

get_X_and_y #

get_X_and_y() -> tuple[DataFrame, Series | DataFrame | None]

Get data associated with the current task.

RETURNS	DESCRIPTION
`tuple`	`X` and `y`, as returned by the dataset's `get_data(target=target_name)`.

Source code in openml/tasks/task.py

def get_X_and_y(self) -> tuple[pd.DataFrame, pd.Series | pd.DataFrame | None]:
    """Get data associated with the current task.

    Returns
    -------
    tuple - X and y

    """
    dataset = self.get_dataset()
    if self.task_type_id not in (
        TaskType.SUPERVISED_CLASSIFICATION,
        TaskType.SUPERVISED_REGRESSION,
        TaskType.LEARNING_CURVE,
    ):
        raise NotImplementedError(
            f"Task type '{self.task_type}' is not implemented for get_X_and_y(). "
            f"Supported types: SUPERVISED_CLASSIFICATION, SUPERVISED_REGRESSION,"
            f"LEARNING_CURVE."
            f"Task ID: {getattr(self, 'task_id', 'unknown')}. "
        )

    X, y, _, _ = dataset.get_data(target=self.target_name)
    return X, y

get_dataset #

get_dataset(**kwargs: Any) -> OpenMLDataset

Download dataset associated with task.

Accepts the same keyword arguments as the openml.datasets.get_dataset.

Source code in openml/tasks/task.py

def get_dataset(self, **kwargs: Any) -> datasets.OpenMLDataset:
    """Download dataset associated with task.

    Accepts the same keyword arguments as the `openml.datasets.get_dataset`.

    Parameters
    ----------
    **kwargs : Any
        Forwarded unchanged to ``datasets.get_dataset`` after the task's
        ``dataset_id``.

    Returns
    -------
    OpenMLDataset
        The result of ``datasets.get_dataset`` for ``self.dataset_id``.
    """
    return datasets.get_dataset(self.dataset_id, **kwargs)

get_split_dimensions #

get_split_dimensions() -> tuple[int, int, int]

Get the (repeats, folds, samples) of the split for a given task.

Source code in openml/tasks/task.py

def get_split_dimensions(self) -> tuple[int, int, int]:
    """Return the ``(repeats, folds, samples)`` of this task's split.

    When ``self.split`` is still ``None`` the split is obtained through
    ``download_split`` and stored on the task before it is read.
    """
    split = self.split
    if split is None:
        # First access: fetch the split and keep it on the instance.
        split = self.split = self.download_split()

    return split.repeats, split.folds, split.samples

get_train_test_split_indices #

get_train_test_split_indices(fold: int = 0, repeat: int = 0, sample: int = 0) -> tuple[ndarray, ndarray]

Get the indices of the train and test splits for a given task.

Source code in openml/tasks/task.py

def get_train_test_split_indices(
    self,
    fold: int = 0,
    repeat: int = 0,
    sample: int = 0,
) -> tuple[np.ndarray, np.ndarray]:
    """Return the train and test indices for one (repeat, fold, sample) cell.

    The lookup is delegated to ``self.split.get``. When ``self.split`` is
    still ``None`` it is first populated through ``download_split``.
    """
    # Replace with retrieve from cache
    split = self.split
    if split is None:
        split = self.split = self.download_split()

    return split.get(repeat=repeat, fold=fold, sample=sample)

open_in_browser #

open_in_browser() -> None

Opens the OpenML web page corresponding to this object in your default browser.

Source code in openml/base.py

def open_in_browser(self) -> None:
    """Open this object's OpenML web page in the default browser.

    Raises
    ------
    ValueError
        If ``self.openml_url`` is ``None``.
    """
    url = self.openml_url
    if url is None:
        raise ValueError(
            "Cannot open element on OpenML.org when attribute `openml_url` is `None`",
        )

    webbrowser.open(url)

publish #

publish() -> OpenMLBase

Publish the object on the OpenML server.

Source code in openml/base.py

def publish(self) -> OpenMLBase:
    """Publish the object on the OpenML server and return ``self``.

    The file elements from ``_get_file_elements`` are posted to the REST
    endpoint named by ``_get_rest_api_type_alias(self)``. The XML response is
    parsed and passed to ``_parse_publish_response``.
    """
    payload = self._get_file_elements()

    # Only build the XML description when none was supplied.
    if "description" not in payload:
        payload["description"] = self._to_xml()

    endpoint = f"{_get_rest_api_type_alias(self)}/"
    raw_response = openml._api_calls._perform_api_call(
        endpoint,
        "post",
        file_elements=payload,
    )

    self._parse_publish_response(xmltodict.parse(raw_response))
    return self

push_tag #

push_tag(tag: str) -> None

Annotates this entity with a tag on the server.

PARAMETER	DESCRIPTION
`tag`	Tag to attach to this entity. TYPE: `str`

Source code in openml/base.py

def push_tag(self, tag: str) -> None:
    """Annotates this entity with a tag on the server.

    Delegates to ``_tag_openml_base`` with this object and ``tag``.

    Parameters
    ----------
    tag : str
        Tag to attach to this entity.
    """
    _tag_openml_base(self, tag)

remove_tag #

remove_tag(tag: str) -> None

Removes a tag from this entity on the server.

PARAMETER	DESCRIPTION
`tag`	Tag to remove from this entity. TYPE: `str`

Source code in openml/base.py

def remove_tag(self, tag: str) -> None:
    """Removes a tag from this entity on the server.

    Delegates to ``_tag_openml_base`` with ``untag=True``.

    Parameters
    ----------
    tag : str
        Tag to remove from this entity.
    """
    _tag_openml_base(self, tag, untag=True)

url_for_id `classmethod` #

url_for_id(id_: int) -> str

Return the OpenML URL for the object of the class entity with the given id.

Source code in openml/base.py

@classmethod
def url_for_id(cls, id_: int) -> str:
    """Build the OpenML URL for the entity of this class with the given id.

    The URL has the form ``<server base url>/<entity letter>/<id_>``.
    """
    base_url = openml.config.get_server_base_url()
    entity_letter = cls._entity_letter()
    # Sample url for a flow: openml.org/f/123
    return f"{base_url}/{entity_letter}/{id_}"

OpenMLClusteringTask #

OpenMLClusteringTask(task_id: int | None, task_type_id: TaskType, task_type: str, data_set_id: int, estimation_procedure_id: int | None = None, estimation_procedure_type: str | None = None, estimation_parameters: dict[str, str] | None = None, evaluation_measure: str | None = None, data_splits_url: str | None = None, target_name: str | None = None)

Bases: OpenMLTask

OpenML Clustering object.

PARAMETER	DESCRIPTION
`task_id`	ID of the OpenML clustering task. TYPE: `Union[int, None]`
`task_type_id`	Task type ID of the OpenML clustering task. TYPE: `TaskType`
`task_type`	Task type of the OpenML clustering task. TYPE: `str`
`data_set_id`	ID of the OpenML dataset used in clustering the task. TYPE: `int`
`estimation_procedure_id`	ID of the OpenML estimation procedure. TYPE: `int` DEFAULT: `17`
`estimation_procedure_type`	Type of the OpenML estimation procedure used in the clustering task. TYPE: `str` DEFAULT: `None`
`estimation_parameters`	Parameters used by the OpenML estimation procedure. TYPE: `dict` DEFAULT: `None`
`data_splits_url`	URL of the OpenML data splits for the clustering task. TYPE: `str` DEFAULT: `None`
`evaluation_measure`	Evaluation measure used in the clustering task. TYPE: `str` DEFAULT: `None`
`target_name`	Name of the target feature (class) that is not part of the feature set for the clustering task. TYPE: `str` DEFAULT: `None`

Source code in openml/tasks/task.py

def __init__(  # noqa: PLR0913
    self,
    task_id: int | None,
    task_type_id: TaskType,
    task_type: str,
    data_set_id: int,
    estimation_procedure_id: int | None = None,
    estimation_procedure_type: str | None = None,
    estimation_parameters: dict[str, str] | None = None,
    evaluation_measure: str | None = None,
    data_splits_url: str | None = None,
    target_name: str | None = None,
):
    """Store the task's identifiers and estimation-procedure settings.

    ``task_id`` (when not ``None``) and ``data_set_id`` are converted with
    ``int()``; note that ``data_set_id`` is stored as ``self.dataset_id``.
    The estimation procedure type, parameters and data-splits URL are
    grouped in the ``self.estimation_procedure`` dictionary, and
    ``self.split`` starts out as ``None``.
    """
    # Normalise the id to ``int`` while letting ``None`` pass through.
    self.task_id = int(task_id) if task_id is not None else None
    self.task_type_id = task_type_id
    self.task_type = task_type
    self.dataset_id = int(data_set_id)
    self.target_name = target_name
    # NOTE(review): the helper is defined outside this view; presumably it
    # substitutes a default when ``estimation_procedure_id`` is ``None`` -
    # confirm against its implementation.
    resolved_estimation_procedure_id = self._resolve_estimation_procedure_id(
        estimation_procedure_id,
    )
    self.evaluation_measure = evaluation_measure
    self.estimation_procedure: _EstimationProcedure = {
        "type": estimation_procedure_type,
        "parameters": estimation_parameters,
        "data_splits_url": data_splits_url,
    }
    self.estimation_procedure_id = resolved_estimation_procedure_id
    # Filled in the first time the split is needed.
    self.split: OpenMLSplit | None = None

id `property` #

id: int | None

Return the OpenML ID of this task.

openml_url `property` #

openml_url: str | None

The URL of the object on the server, if it was uploaded, else None.

download_split #

download_split() -> OpenMLSplit

Download the OpenML split for a given task.

Source code in openml/tasks/task.py

def download_split(self) -> OpenMLSplit:
    """Return the split of this task, downloading it when the cache load fails.

    The split is read from ``datasplits.arff`` inside the directory returned
    by ``_create_cache_directory_for_id("tasks", task_id)``. If that read
    raises ``OSError``, the file is fetched with ``self._download_split``
    and read a second time.
    """
    # TODO(eddiebergman): Can this ever be `None`?
    assert self.task_id is not None
    split_path = _create_cache_directory_for_id("tasks", self.task_id) / "datasplits.arff"

    try:
        return OpenMLSplit._from_arff_file(split_path)
    except OSError:
        # Loading from the cache failed: download the split file to the same
        # path, then parse it again.
        self._download_split(split_path)
        return OpenMLSplit._from_arff_file(split_path)

get_X #

get_X() -> DataFrame

Get data associated with the current task.

RETURNS	DESCRIPTION
`The X data as a dataframe`

Source code in openml/tasks/task.py

def get_X(self) -> pd.DataFrame:
    """Get data associated with the current task.

    Returns
    -------
    The first item returned by the dataset's ``get_data(target=None)``.
    """
    # Everything after the first item of the result is discarded.
    features, *_ = self.get_dataset().get_data(target=None)
    return features

get_dataset #

get_dataset(**kwargs: Any) -> OpenMLDataset

Download dataset associated with task.

Accepts the same keyword arguments as the openml.datasets.get_dataset.

Source code in openml/tasks/task.py

def get_dataset(self, **kwargs: Any) -> datasets.OpenMLDataset:
    """Download dataset associated with task.

    Accepts the same keyword arguments as the `openml.datasets.get_dataset`.

    Parameters
    ----------
    **kwargs : Any
        Forwarded unchanged to ``datasets.get_dataset`` after the task's
        ``dataset_id``.

    Returns
    -------
    OpenMLDataset
        The result of ``datasets.get_dataset`` for ``self.dataset_id``.
    """
    return datasets.get_dataset(self.dataset_id, **kwargs)

get_split_dimensions #

get_split_dimensions() -> tuple[int, int, int]

Get the (repeats, folds, samples) of the split for a given task.

Source code in openml/tasks/task.py

def get_split_dimensions(self) -> tuple[int, int, int]:
    """Return the ``(repeats, folds, samples)`` of this task's split.

    When ``self.split`` is still ``None`` the split is obtained through
    ``download_split`` and stored on the task before it is read.
    """
    split = self.split
    if split is None:
        # First access: fetch the split and keep it on the instance.
        split = self.split = self.download_split()

    return split.repeats, split.folds, split.samples

get_train_test_split_indices #

get_train_test_split_indices(fold: int = 0, repeat: int = 0, sample: int = 0) -> tuple[ndarray, ndarray]

Get the indices of the train and test splits for a given task.

Source code in openml/tasks/task.py

def get_train_test_split_indices(
    self,
    fold: int = 0,
    repeat: int = 0,
    sample: int = 0,
) -> tuple[np.ndarray, np.ndarray]:
    """Return the train and test indices for one (repeat, fold, sample) cell.

    The lookup is delegated to ``self.split.get``. When ``self.split`` is
    still ``None`` it is first populated through ``download_split``.
    """
    # Replace with retrieve from cache
    split = self.split
    if split is None:
        split = self.split = self.download_split()

    return split.get(repeat=repeat, fold=fold, sample=sample)

open_in_browser #

open_in_browser() -> None

Opens the OpenML web page corresponding to this object in your default browser.

Source code in openml/base.py

def open_in_browser(self) -> None:
    """Open this object's OpenML web page in the default browser.

    Raises
    ------
    ValueError
        If ``self.openml_url`` is ``None``.
    """
    url = self.openml_url
    if url is None:
        raise ValueError(
            "Cannot open element on OpenML.org when attribute `openml_url` is `None`",
        )

    webbrowser.open(url)

publish #

publish() -> OpenMLBase

Publish the object on the OpenML server.

Source code in openml/base.py

def publish(self) -> OpenMLBase:
    """Publish the object on the OpenML server and return ``self``.

    The file elements from ``_get_file_elements`` are posted to the REST
    endpoint named by ``_get_rest_api_type_alias(self)``. The XML response is
    parsed and passed to ``_parse_publish_response``.
    """
    payload = self._get_file_elements()

    # Only build the XML description when none was supplied.
    if "description" not in payload:
        payload["description"] = self._to_xml()

    endpoint = f"{_get_rest_api_type_alias(self)}/"
    raw_response = openml._api_calls._perform_api_call(
        endpoint,
        "post",
        file_elements=payload,
    )

    self._parse_publish_response(xmltodict.parse(raw_response))
    return self

push_tag #

push_tag(tag: str) -> None

Annotates this entity with a tag on the server.

PARAMETER	DESCRIPTION
`tag`	Tag to attach to this entity. TYPE: `str`

Source code in openml/base.py

def push_tag(self, tag: str) -> None:
    """Annotates this entity with a tag on the server.

    Delegates to ``_tag_openml_base`` with this object and ``tag``.

    Parameters
    ----------
    tag : str
        Tag to attach to this entity.
    """
    _tag_openml_base(self, tag)

remove_tag #

remove_tag(tag: str) -> None

Removes a tag from this entity on the server.

PARAMETER	DESCRIPTION
`tag`	Tag to remove from this entity. TYPE: `str`

Source code in openml/base.py

def remove_tag(self, tag: str) -> None:
    """Removes a tag from this entity on the server.

    Delegates to ``_tag_openml_base`` with ``untag=True``.

    Parameters
    ----------
    tag : str
        Tag to remove from this entity.
    """
    _tag_openml_base(self, tag, untag=True)

url_for_id `classmethod` #

url_for_id(id_: int) -> str

Return the OpenML URL for the object of the class entity with the given id.

Source code in openml/base.py

@classmethod
def url_for_id(cls, id_: int) -> str:
    """Build the OpenML URL for the entity of this class with the given id.

    The URL has the form ``<server base url>/<entity letter>/<id_>``.
    """
    base_url = openml.config.get_server_base_url()
    entity_letter = cls._entity_letter()
    # Sample url for a flow: openml.org/f/123
    return f"{base_url}/{entity_letter}/{id_}"

OpenMLLearningCurveTask #

OpenMLLearningCurveTask(task_type_id: TaskType, task_type: str, data_set_id: int, target_name: str, estimation_procedure_id: int | None = None, estimation_procedure_type: str | None = None, estimation_parameters: dict[str, str] | None = None, evaluation_measure: str | None = None, data_splits_url: str | None = None, task_id: int | None = None, class_labels: list[str] | None = None, cost_matrix: ndarray | None = None)

Bases: OpenMLClassificationTask

OpenML Learning Curve object.

PARAMETER	DESCRIPTION
`task_id`	ID of the Learning Curve task. TYPE: `Union[int, None]` DEFAULT: `None`
`task_type_id`	ID of the Learning Curve task. TYPE: `TaskType`
`task_type`	Name of the Learning Curve task. TYPE: `str`
`data_set_id`	ID of the dataset that this task is associated with. TYPE: `int`
`target_name`	Name of the target feature in the dataset. TYPE: `str`
`estimation_procedure_id`	ID of the estimation procedure to use for evaluating models. TYPE: `int` DEFAULT: `13`
`estimation_procedure_type`	Type of the estimation procedure. TYPE: `str` DEFAULT: `None`
`estimation_parameters`	Additional parameters for the estimation procedure. TYPE: `dict` DEFAULT: `None`
`data_splits_url`	URL of the file containing the data splits for Learning Curve task. TYPE: `str` DEFAULT: `None`
`evaluation_measure`	Name of the evaluation measure to use for evaluating models. TYPE: `str` DEFAULT: `None`
`class_labels`	Class labels for Learning Curve tasks. TYPE: `list of str` DEFAULT: `None`
`cost_matrix`	Cost matrix for Learning Curve tasks. TYPE: `numpy array` DEFAULT: `None`

Source code in openml/tasks/task.py

def __init__(  # noqa: PLR0913
    self,
    task_type_id: TaskType,
    task_type: str,
    data_set_id: int,
    target_name: str,
    estimation_procedure_id: int | None = None,
    estimation_procedure_type: str | None = None,
    estimation_parameters: dict[str, str] | None = None,
    evaluation_measure: str | None = None,
    data_splits_url: str | None = None,
    task_id: int | None = None,
    class_labels: list[str] | None = None,
    cost_matrix: np.ndarray | None = None,
):
    """Initialise the task and record its class labels and cost matrix.

    All arguments except ``class_labels`` and ``cost_matrix`` are handed to
    the parent constructor by keyword.

    Raises
    ------
    NotImplementedError
        If ``cost_matrix`` is not ``None``. The attributes are assigned
        before the error is raised.
    """
    super().__init__(
        task_id=task_id,
        task_type_id=task_type_id,
        task_type=task_type,
        data_set_id=data_set_id,
        target_name=target_name,
        estimation_procedure_id=estimation_procedure_id,
        estimation_procedure_type=estimation_procedure_type,
        estimation_parameters=estimation_parameters,
        evaluation_measure=evaluation_measure,
        data_splits_url=data_splits_url,
    )
    self.class_labels = class_labels
    self.cost_matrix = cost_matrix

    # Cost matrices are accepted in the signature but rejected at runtime.
    cost_matrix_given = cost_matrix is not None
    if cost_matrix_given:
        raise NotImplementedError("Costmatrix functionality is not yet implemented.")

estimation_parameters `property` `writable` #

estimation_parameters: dict[str, str] | None

Return the estimation parameters for the task.

id `property` #

id: int | None

Return the OpenML ID of this task.

openml_url `property` #

openml_url: str | None

The URL of the object on the server, if it was uploaded, else None.

download_split #

download_split() -> OpenMLSplit

Download the OpenML split for a given task.

Source code in openml/tasks/task.py

def download_split(self) -> OpenMLSplit:
    """Return the split of this task, downloading it when the cache load fails.

    The split is read from ``datasplits.arff`` inside the directory returned
    by ``_create_cache_directory_for_id("tasks", task_id)``. If that read
    raises ``OSError``, the file is fetched with ``self._download_split``
    and read a second time.
    """
    # TODO(eddiebergman): Can this ever be `None`?
    assert self.task_id is not None
    split_path = _create_cache_directory_for_id("tasks", self.task_id) / "datasplits.arff"

    try:
        return OpenMLSplit._from_arff_file(split_path)
    except OSError:
        # Loading from the cache failed: download the split file to the same
        # path, then parse it again.
        self._download_split(split_path)
        return OpenMLSplit._from_arff_file(split_path)

get_X_and_y #

get_X_and_y() -> tuple[DataFrame, Series | DataFrame | None]

Get data associated with the current task.

RETURNS	DESCRIPTION
`tuple`	`X` and `y`, as returned by the dataset's `get_data(target=target_name)`.

Source code in openml/tasks/task.py

def get_X_and_y(self) -> tuple[pd.DataFrame, pd.Series | pd.DataFrame | None]:
    """Get data associated with the current task.

    Returns
    -------
    tuple - X and y

    """
    dataset = self.get_dataset()
    if self.task_type_id not in (
        TaskType.SUPERVISED_CLASSIFICATION,
        TaskType.SUPERVISED_REGRESSION,
        TaskType.LEARNING_CURVE,
    ):
        raise NotImplementedError(
            f"Task type '{self.task_type}' is not implemented for get_X_and_y(). "
            f"Supported types: SUPERVISED_CLASSIFICATION, SUPERVISED_REGRESSION,"
            f"LEARNING_CURVE."
            f"Task ID: {getattr(self, 'task_id', 'unknown')}. "
        )

    X, y, _, _ = dataset.get_data(target=self.target_name)
    return X, y

get_dataset #

get_dataset(**kwargs: Any) -> OpenMLDataset

Download dataset associated with task.

Accepts the same keyword arguments as the openml.datasets.get_dataset.

Source code in openml/tasks/task.py

def get_dataset(self, **kwargs: Any) -> datasets.OpenMLDataset:
    """Download dataset associated with task.

    Accepts the same keyword arguments as the `openml.datasets.get_dataset`.

    Parameters
    ----------
    **kwargs : Any
        Forwarded unchanged to ``datasets.get_dataset`` after the task's
        ``dataset_id``.

    Returns
    -------
    OpenMLDataset
        The result of ``datasets.get_dataset`` for ``self.dataset_id``.
    """
    return datasets.get_dataset(self.dataset_id, **kwargs)

get_split_dimensions #

get_split_dimensions() -> tuple[int, int, int]

Get the (repeats, folds, samples) of the split for a given task.

Source code in openml/tasks/task.py

def get_split_dimensions(self) -> tuple[int, int, int]:
    """Return the ``(repeats, folds, samples)`` of this task's split.

    When ``self.split`` is still ``None`` the split is obtained through
    ``download_split`` and stored on the task before it is read.
    """
    split = self.split
    if split is None:
        # First access: fetch the split and keep it on the instance.
        split = self.split = self.download_split()

    return split.repeats, split.folds, split.samples

get_train_test_split_indices #

get_train_test_split_indices(fold: int = 0, repeat: int = 0, sample: int = 0) -> tuple[ndarray, ndarray]

Get the indices of the train and test splits for a given task.

Source code in openml/tasks/task.py

def get_train_test_split_indices(
    self,
    fold: int = 0,
    repeat: int = 0,
    sample: int = 0,
) -> tuple[np.ndarray, np.ndarray]:
    """Return the train and test indices for one (repeat, fold, sample) cell.

    The lookup is delegated to ``self.split.get``. When ``self.split`` is
    still ``None`` it is first populated through ``download_split``.
    """
    # Replace with retrieve from cache
    split = self.split
    if split is None:
        split = self.split = self.download_split()

    return split.get(repeat=repeat, fold=fold, sample=sample)

open_in_browser #

open_in_browser() -> None

Opens the OpenML web page corresponding to this object in your default browser.

Source code in openml/base.py

def open_in_browser(self) -> None:
    """Open this object's OpenML web page in the default browser.

    Raises
    ------
    ValueError
        If ``self.openml_url`` is ``None``.
    """
    url = self.openml_url
    if url is None:
        raise ValueError(
            "Cannot open element on OpenML.org when attribute `openml_url` is `None`",
        )

    webbrowser.open(url)

publish #

publish() -> OpenMLBase

Publish the object on the OpenML server.

Source code in openml/base.py

def publish(self) -> OpenMLBase:
    """Publish the object on the OpenML server and return ``self``.

    The file elements from ``_get_file_elements`` are posted to the REST
    endpoint named by ``_get_rest_api_type_alias(self)``. The XML response is
    parsed and passed to ``_parse_publish_response``.
    """
    payload = self._get_file_elements()

    # Only build the XML description when none was supplied.
    if "description" not in payload:
        payload["description"] = self._to_xml()

    endpoint = f"{_get_rest_api_type_alias(self)}/"
    raw_response = openml._api_calls._perform_api_call(
        endpoint,
        "post",
        file_elements=payload,
    )

    self._parse_publish_response(xmltodict.parse(raw_response))
    return self

push_tag #

push_tag(tag: str) -> None

Annotates this entity with a tag on the server.

PARAMETER	DESCRIPTION
`tag`	Tag to attach to this entity. TYPE: `str`

Source code in openml/base.py

def push_tag(self, tag: str) -> None:
    """Annotates this entity with a tag on the server.

    Delegates to ``_tag_openml_base`` with this object and ``tag``.

    Parameters
    ----------
    tag : str
        Tag to attach to this entity.
    """
    _tag_openml_base(self, tag)

remove_tag #

remove_tag(tag: str) -> None

Removes a tag from this entity on the server.

PARAMETER	DESCRIPTION
`tag`	Tag to remove from this entity. TYPE: `str`

Source code in openml/base.py

def remove_tag(self, tag: str) -> None:
    """Removes a tag from this entity on the server.

    Delegates to ``_tag_openml_base`` with ``untag=True``.

    Parameters
    ----------
    tag : str
        Tag to remove from this entity.
    """
    _tag_openml_base(self, tag, untag=True)

url_for_id `classmethod` #

url_for_id(id_: int) -> str

Return the OpenML URL for the object of the class entity with the given id.

Source code in openml/base.py

@classmethod
def url_for_id(cls, id_: int) -> str:
    """Build the OpenML URL for the entity of this class with the given id.

    The URL has the form ``<server base url>/<entity letter>/<id_>``.
    """
    base_url = openml.config.get_server_base_url()
    entity_letter = cls._entity_letter()
    # Sample url for a flow: openml.org/f/123
    return f"{base_url}/{entity_letter}/{id_}"

OpenMLRegressionTask #

OpenMLRegressionTask(task_type_id: TaskType, task_type: str, data_set_id: int, target_name: str, estimation_procedure_id: int | None = None, estimation_procedure_type: str | None = None, estimation_parameters: dict[str, str] | None = None, evaluation_measure: str | None = None, data_splits_url: str | None = None, task_id: int | None = None)

Bases: OpenMLSupervisedTask

OpenML Regression object.

PARAMETER	DESCRIPTION
`task_id`	ID of the OpenML Regression task. TYPE: `Union[int, None]` DEFAULT: `None`
`task_type_id`	Task type ID of the OpenML Regression task. TYPE: `TaskType`
`task_type`	Task type of the OpenML Regression task. TYPE: `str`
`data_set_id`	ID of the OpenML dataset. TYPE: `int`
`target_name`	Name of the target feature used in the Regression task. TYPE: `str`
`estimation_procedure_id`	ID of the OpenML estimation procedure. TYPE: `int` DEFAULT: `7`
`estimation_procedure_type`	Type of the OpenML estimation procedure. TYPE: `str` DEFAULT: `None`
`estimation_parameters`	Parameters used by the OpenML estimation procedure. TYPE: `dict` DEFAULT: `None`
`data_splits_url`	URL of the OpenML data splits for the Regression task. TYPE: `str` DEFAULT: `None`
`evaluation_measure`	Evaluation measure used in the Regression task. TYPE: `str` DEFAULT: `None`

Source code in openml/tasks/task.py

def __init__(  # noqa: PLR0913
    self,
    task_type_id: TaskType,
    task_type: str,
    data_set_id: int,
    target_name: str,
    estimation_procedure_id: int | None = None,
    estimation_procedure_type: str | None = None,
    estimation_parameters: dict[str, str] | None = None,
    evaluation_measure: str | None = None,
    data_splits_url: str | None = None,
    task_id: int | None = None,
):
    """Initialise the task by forwarding every argument to the parent class.

    No attribute is set here directly; each parameter is passed by keyword,
    unchanged, to ``super().__init__``.
    """
    super().__init__(
        task_id=task_id,
        task_type_id=task_type_id,
        task_type=task_type,
        data_set_id=data_set_id,
        estimation_procedure_id=estimation_procedure_id,
        estimation_procedure_type=estimation_procedure_type,
        estimation_parameters=estimation_parameters,
        evaluation_measure=evaluation_measure,
        data_splits_url=data_splits_url,
        target_name=target_name,
    )

estimation_parameters `property` `writable` #

estimation_parameters: dict[str, str] | None

Return the estimation parameters for the task.

id `property` #

id: int | None

Return the OpenML ID of this task.

openml_url `property` #

openml_url: str | None

The URL of the object on the server, if it was uploaded, else None.

download_split #

download_split() -> OpenMLSplit

Download the OpenML split for a given task.

Source code in openml/tasks/task.py

def download_split(self) -> OpenMLSplit:
    """Return the split of this task, downloading it when the cache load fails.

    The split is read from ``datasplits.arff`` inside the directory returned
    by ``_create_cache_directory_for_id("tasks", task_id)``. If that read
    raises ``OSError``, the file is fetched with ``self._download_split``
    and read a second time.
    """
    # TODO(eddiebergman): Can this ever be `None`?
    assert self.task_id is not None
    split_path = _create_cache_directory_for_id("tasks", self.task_id) / "datasplits.arff"

    try:
        return OpenMLSplit._from_arff_file(split_path)
    except OSError:
        # Loading from the cache failed: download the split file to the same
        # path, then parse it again.
        self._download_split(split_path)
        return OpenMLSplit._from_arff_file(split_path)

get_X_and_y #

get_X_and_y() -> tuple[DataFrame, Series | DataFrame | None]

Get data associated with the current task.

RETURNS	DESCRIPTION
`tuple`	`X` and `y`, as returned by the dataset's `get_data(target=target_name)`.

Source code in openml/tasks/task.py

def get_X_and_y(self) -> tuple[pd.DataFrame, pd.Series | pd.DataFrame | None]:
    """Get data associated with the current task.

    Returns
    -------
    tuple - X and y

    """
    dataset = self.get_dataset()
    if self.task_type_id not in (
        TaskType.SUPERVISED_CLASSIFICATION,
        TaskType.SUPERVISED_REGRESSION,
        TaskType.LEARNING_CURVE,
    ):
        raise NotImplementedError(
            f"Task type '{self.task_type}' is not implemented for get_X_and_y(). "
            f"Supported types: SUPERVISED_CLASSIFICATION, SUPERVISED_REGRESSION,"
            f"LEARNING_CURVE."
            f"Task ID: {getattr(self, 'task_id', 'unknown')}. "
        )

    X, y, _, _ = dataset.get_data(target=self.target_name)
    return X, y

get_dataset #

get_dataset(**kwargs: Any) -> OpenMLDataset

Download dataset associated with task.

Accepts the same keyword arguments as the openml.datasets.get_dataset.

Source code in openml/tasks/task.py

def get_dataset(self, **kwargs: Any) -> datasets.OpenMLDataset:
    """Download dataset associated with task.

    Accepts the same keyword arguments as the `openml.datasets.get_dataset`.

    Parameters
    ----------
    **kwargs : Any
        Forwarded unchanged to ``datasets.get_dataset`` after the task's
        ``dataset_id``.

    Returns
    -------
    OpenMLDataset
        The result of ``datasets.get_dataset`` for ``self.dataset_id``.
    """
    return datasets.get_dataset(self.dataset_id, **kwargs)

get_split_dimensions #

get_split_dimensions() -> tuple[int, int, int]

Get the (repeats, folds, samples) of the split for a given task.

Source code in openml/tasks/task.py

def get_split_dimensions(self) -> tuple[int, int, int]:
    """Return the ``(repeats, folds, samples)`` of this task's split.

    When ``self.split`` is still ``None`` the split is obtained through
    ``download_split`` and stored on the task before it is read.
    """
    split = self.split
    if split is None:
        # First access: fetch the split and keep it on the instance.
        split = self.split = self.download_split()

    return split.repeats, split.folds, split.samples

get_train_test_split_indices #

get_train_test_split_indices(fold: int = 0, repeat: int = 0, sample: int = 0) -> tuple[ndarray, ndarray]

Get the indices of the train and test splits for a given task.

Source code in openml/tasks/task.py

def get_train_test_split_indices(
    self,
    fold: int = 0,
    repeat: int = 0,
    sample: int = 0,
) -> tuple[np.ndarray, np.ndarray]:
    """Return the train and test indices for one (repeat, fold, sample) cell.

    The lookup is delegated to ``self.split.get``. When ``self.split`` is
    still ``None`` it is first populated through ``download_split``.
    """
    # Replace with retrieve from cache
    split = self.split
    if split is None:
        split = self.split = self.download_split()

    return split.get(repeat=repeat, fold=fold, sample=sample)

open_in_browser #

open_in_browser() -> None

Opens the OpenML web page corresponding to this object in your default browser.

Source code in openml/base.py

def open_in_browser(self) -> None:
    """Open this object's OpenML web page in the default browser.

    Raises
    ------
    ValueError
        If ``self.openml_url`` is ``None``.
    """
    url = self.openml_url
    if url is None:
        raise ValueError(
            "Cannot open element on OpenML.org when attribute `openml_url` is `None`",
        )

    webbrowser.open(url)

publish #

publish() -> OpenMLBase

Publish the object on the OpenML server.

Source code in openml/base.py

def publish(self) -> OpenMLBase:
    """Publish the object on the OpenML server and return ``self``.

    The file elements from ``_get_file_elements`` are posted to the REST
    endpoint named by ``_get_rest_api_type_alias(self)``. The XML response is
    parsed and passed to ``_parse_publish_response``.
    """
    payload = self._get_file_elements()

    # Only build the XML description when none was supplied.
    if "description" not in payload:
        payload["description"] = self._to_xml()

    endpoint = f"{_get_rest_api_type_alias(self)}/"
    raw_response = openml._api_calls._perform_api_call(
        endpoint,
        "post",
        file_elements=payload,
    )

    self._parse_publish_response(xmltodict.parse(raw_response))
    return self

push_tag #

push_tag(tag: str) -> None

Annotates this entity with a tag on the server.

PARAMETER	DESCRIPTION
`tag`	Tag to attach to this entity. TYPE: `str`

Source code in openml/base.py

def push_tag(self, tag: str) -> None:
    """Annotates this entity with a tag on the server.

    Parameters
    ----------
    tag : str
        Tag to attach to this entity.
    """
    # Tagging is delegated to the shared `_tag_openml_base` helper.
    _tag_openml_base(self, tag)

remove_tag #

remove_tag(tag: str) -> None

Removes a tag from this entity on the server.

PARAMETER	DESCRIPTION
`tag`	Tag to remove from this entity. TYPE: `str`

Source code in openml/base.py

def remove_tag(self, tag: str) -> None:
    """Removes a tag from this entity on the server.

    Parameters
    ----------
    tag : str
        Tag to remove from this entity.
    """
    # Same helper as `push_tag`, called with `untag=True`.
    _tag_openml_base(self, tag, untag=True)

url_for_id `classmethod` #

url_for_id(id_: int) -> str

Return the OpenML URL for the object of the class entity with the given id.

Source code in openml/base.py

@classmethod
def url_for_id(cls, id_: int) -> str:
    """Build the OpenML URL of the entity of this class that has id ``id_``."""
    # Shape: <server base url>/<entity letter>/<id>, e.g. openml.org/f/123 for a flow.
    base_url = openml.config.get_server_base_url()
    letter = cls._entity_letter()
    return f"{base_url}/{letter}/{id_}"

OpenMLSplit #

OpenMLSplit(name: int | str, description: str, split: dict[int, dict[int, dict[int, tuple[ndarray, ndarray]]]])

OpenML Split object.

This class manages train-test splits for a dataset across multiple repetitions, folds, and samples.

PARAMETER	DESCRIPTION
`name`	The name or ID of the split. TYPE: `int or str`
`description`	A description of the split. TYPE: `str`
`split`	A dictionary containing the splits organized by repetition, fold, and sample. TYPE: `dict`

Source code in openml/tasks/split.py

def __init__(
    self,
    name: int | str,
    description: str,
    split: dict[int, dict[int, dict[int, tuple[np.ndarray, np.ndarray]]]],
):
    """Store the splits and derive the repeat, fold and sample counts.

    Parameters
    ----------
    name : int or str
        The name or ID of the split.
    description : str
        A description of the split.
    split : dict
        Nested mapping ``repetition -> fold -> sample -> (array, array)``.
        Repetition keys are stored as ``int``.

    Raises
    ------
    ValueError
        If the repetitions do not all contain the same number of folds.
    """
    self.description = description
    self.name = name
    self.split: dict[int, dict[int, dict[int, tuple[np.ndarray, np.ndarray]]]] = {}

    # Copy the splits repetition by repetition. The folds are read under the
    # caller's own key; only the key we store is coerced to int, so keys such
    # as "0" no longer fail on lookup.
    for repetition, folds in split.items():
        self.split[int(repetition)] = OrderedDict(
            (fold, OrderedDict(samples.items())) for fold, samples in folds.items()
        )

    self.repeats = len(self.split)

    # Repetitions are expected under the keys 0 .. repeats - 1.
    fold_counts = [len(self.split[i]) for i in range(self.repeats)]
    if any(count != fold_counts[0] for count in fold_counts):
        raise ValueError(
            "All repetitions must contain the same number of folds, "
            f"but the per-repetition fold counts are {fold_counts}.",
        )

    self.folds = len(self.split[0])
    self.samples = len(self.split[0][0])

get #

get(repeat: int = 0, fold: int = 0, sample: int = 0) -> tuple[ndarray, ndarray]

Returns the specified data split from this OpenMLSplit object.

PARAMETER	DESCRIPTION
`repeat`	Index of the repeat to retrieve. TYPE: `int` DEFAULT: `0`
`fold`	Index of the fold to retrieve. TYPE: `int` DEFAULT: `0`
`sample`	Index of the sample to retrieve. TYPE: `int` DEFAULT: `0`

RETURNS	DESCRIPTION
`tuple[ndarray, ndarray]`	The data split for the specified repeat, fold, and sample.

RAISES	DESCRIPTION
`ValueError`	If the specified repeat, fold, or sample is not known.

Source code in openml/tasks/split.py

def get(self, repeat: int = 0, fold: int = 0, sample: int = 0) -> tuple[np.ndarray, np.ndarray]:
    """Look up the split stored for one repeat, fold and sample.

    Parameters
    ----------
    repeat : int
        Index of the repeat to retrieve.
    fold : int
        Index of the fold to retrieve.
    sample : int
        Index of the sample to retrieve.

    Returns
    -------
    tuple of numpy.ndarray
        The pair of arrays stored for the specified repeat, fold, and sample.

    Raises
    ------
    ValueError
        If the specified repeat, fold, or sample is not known.
    """
    repeats = self.split
    if repeat not in repeats:
        raise ValueError(f"Repeat {repeat!s} not known")

    folds = repeats[repeat]
    if fold not in folds:
        raise ValueError(f"Fold {fold!s} not known")

    samples = folds[fold]
    if sample not in samples:
        raise ValueError(f"Sample {sample!s} not known")

    return samples[sample]

OpenMLSupervisedTask #

OpenMLSupervisedTask(task_type_id: TaskType, task_type: str, data_set_id: int, target_name: str, estimation_procedure_id: int | None = None, estimation_procedure_type: str | None = None, estimation_parameters: dict[str, str] | None = None, evaluation_measure: str | None = None, data_splits_url: str | None = None, task_id: int | None = None)

Bases: OpenMLTask, ABC

OpenML Supervised Task object.

PARAMETER	DESCRIPTION
`task_type_id`	ID of the task type. TYPE: `TaskType`
`task_type`	Name of the task type. TYPE: `str`
`data_set_id`	ID of the OpenML dataset associated with the task. TYPE: `int`
`target_name`	Name of the target feature (the class variable). TYPE: `str`
`estimation_procedure_id`	ID of the estimation procedure for the task. TYPE: `int` DEFAULT: `None`
`estimation_procedure_type`	Type of the estimation procedure for the task. TYPE: `str` DEFAULT: `None`
`estimation_parameters`	Estimation parameters for the task. TYPE: `dict` DEFAULT: `None`
`evaluation_measure`	Name of the evaluation measure for the task. TYPE: `str` DEFAULT: `None`
`data_splits_url`	URL of the data splits for the task. TYPE: `str` DEFAULT: `None`
`task_id`	Refers to the unique identifier of task. TYPE: `int \| None` DEFAULT: `None`

Source code in openml/tasks/task.py

def __init__(  # noqa: PLR0913
    self,
    task_type_id: TaskType,
    task_type: str,
    data_set_id: int,
    target_name: str,
    estimation_procedure_id: int | None = None,
    estimation_procedure_type: str | None = None,
    estimation_parameters: dict[str, str] | None = None,
    evaluation_measure: str | None = None,
    data_splits_url: str | None = None,
    task_id: int | None = None,
):
    """Forward every argument, by keyword, to the parent initialiser."""
    # Passing by keyword matters: the parameter order here differs from the
    # order used in the `super().__init__` call in the original listing.
    super().__init__(
        task_id=task_id,
        task_type_id=task_type_id,
        task_type=task_type,
        data_set_id=data_set_id,
        target_name=target_name,
        evaluation_measure=evaluation_measure,
        estimation_procedure_id=estimation_procedure_id,
        estimation_procedure_type=estimation_procedure_type,
        estimation_parameters=estimation_parameters,
        data_splits_url=data_splits_url,
    )

estimation_parameters `property` `writable` #

estimation_parameters: dict[str, str] | None

Return the estimation parameters for the task.

id `property` #

id: int | None

Return the OpenML ID of this task.

openml_url `property` #

openml_url: str | None

The URL of the object on the server, if it was uploaded, else None.

download_split #

download_split() -> OpenMLSplit

Download the OpenML split for a given task.

Source code in openml/tasks/task.py

def download_split(self) -> OpenMLSplit:
    """Load this task's split, downloading the split file if reading it fails.

    The split file is ``datasplits.arff`` inside the task's cache directory.
    It is parsed directly first; on ``OSError`` it is downloaded to that path
    and parsed again.
    """
    # TODO(eddiebergman): Can this every be `None`?
    assert self.task_id is not None
    split_path = _create_cache_directory_for_id("tasks", self.task_id) / "datasplits.arff"

    try:
        return OpenMLSplit._from_arff_file(split_path)
    except OSError:
        # Reading failed: download the split file to the same path, then retry.
        self._download_split(split_path)
        return OpenMLSplit._from_arff_file(split_path)

get_X_and_y #

get_X_and_y() -> tuple[DataFrame, Series | DataFrame | None]

Get data associated with the current task.

RETURNS	DESCRIPTION
`tuple - X and y`

Source code in openml/tasks/task.py

def get_X_and_y(self) -> tuple[pd.DataFrame, pd.Series | pd.DataFrame | None]:
    """Get data associated with the current task.

    Returns
    -------
    tuple
        ``(X, y)``, the first two values returned by
        ``dataset.get_data(target=self.target_name)``.

    Raises
    ------
    NotImplementedError
        If the task type is not supervised classification, supervised
        regression or learning curve.
    """
    supported_types = (
        TaskType.SUPERVISED_CLASSIFICATION,
        TaskType.SUPERVISED_REGRESSION,
        TaskType.LEARNING_CURVE,
    )
    # Validate before fetching the dataset so an unsupported task fails
    # without first calling `get_dataset`.
    if self.task_type_id not in supported_types:
        raise NotImplementedError(
            f"Task type '{self.task_type}' is not implemented for get_X_and_y(). "
            "Supported types: SUPERVISED_CLASSIFICATION, SUPERVISED_REGRESSION, "
            "LEARNING_CURVE. "
            f"Task ID: {getattr(self, 'task_id', 'unknown')}."
        )

    X, y, _, _ = self.get_dataset().get_data(target=self.target_name)
    return X, y

get_dataset #

get_dataset(**kwargs: Any) -> OpenMLDataset

Download dataset associated with task.

Accepts the same keyword arguments as the openml.datasets.get_dataset.

Source code in openml/tasks/task.py

def get_dataset(self, **kwargs: Any) -> datasets.OpenMLDataset:
    """Download dataset associated with task.

    Every keyword argument is forwarded unchanged to
    `openml.datasets.get_dataset`.
    """
    fetch = datasets.get_dataset
    return fetch(self.dataset_id, **kwargs)

get_split_dimensions #

get_split_dimensions() -> tuple[int, int, int]

Get the (repeats, folds, samples) of the split for a given task.

Source code in openml/tasks/task.py

def get_split_dimensions(self) -> tuple[int, int, int]:
    """Return the ``(repeats, folds, samples)`` counts of this task's split.

    The split is obtained through ``download_split`` the first time it is
    needed and kept on ``self.split``.
    """
    split = self.split
    if split is None:
        split = self.split = self.download_split()

    dimensions = (split.repeats, split.folds, split.samples)
    return dimensions

get_train_test_split_indices #

get_train_test_split_indices(fold: int = 0, repeat: int = 0, sample: int = 0) -> tuple[ndarray, ndarray]

Get the indices of the train and test splits for a given task.

Source code in openml/tasks/task.py

def get_train_test_split_indices(
    self,
    fold: int = 0,
    repeat: int = 0,
    sample: int = 0,
) -> tuple[np.ndarray, np.ndarray]:
    """Return the train and test split indices for one repeat, fold and sample.

    The split is obtained through ``download_split`` the first time it is
    needed and kept on ``self.split``; the lookup itself is delegated to
    ``self.split.get``.
    """
    split = self.split
    if split is None:
        # Nothing loaded yet: fetch once and remember it on the instance.
        split = self.split = self.download_split()

    return split.get(repeat=repeat, fold=fold, sample=sample)

open_in_browser #

open_in_browser() -> None

Opens the OpenML web page corresponding to this object in your default browser.

Source code in openml/base.py

def open_in_browser(self) -> None:
    """Open this object's OpenML web page in the default browser.

    Raises
    ------
    ValueError
        If ``openml_url`` is ``None``.
    """
    url = self.openml_url
    if url is not None:
        webbrowser.open(url)
        return

    raise ValueError(
        "Cannot open element on OpenML.org when attribute `openml_url` is `None`",
    )

publish #

publish() -> OpenMLBase

Publish the object on the OpenML server.

Source code in openml/base.py

def publish(self) -> OpenMLBase:
    """Upload this object to the OpenML server and return it.

    The payload comes from ``_get_file_elements``; when it has no
    ``"description"`` entry, the XML produced by ``_to_xml`` is used. The
    server's XML reply is parsed and handed to ``_parse_publish_response``.
    """
    payload = self._get_file_elements()
    if "description" not in payload:
        payload["description"] = self._to_xml()

    endpoint = f"{_get_rest_api_type_alias(self)}/"
    raw_reply = openml._api_calls._perform_api_call(
        endpoint,
        "post",
        file_elements=payload,
    )
    self._parse_publish_response(xmltodict.parse(raw_reply))
    return self

push_tag #

push_tag(tag: str) -> None

Annotates this entity with a tag on the server.

PARAMETER	DESCRIPTION
`tag`	Tag to attach to the flow. TYPE: `str`

Source code in openml/base.py

def push_tag(self, tag: str) -> None:
    """Annotates this entity with a tag on the server.

    Parameters
    ----------
    tag : str
        Tag to attach to this entity.
    """
    # Tagging is delegated to the shared `_tag_openml_base` helper.
    _tag_openml_base(self, tag)

remove_tag #

remove_tag(tag: str) -> None

Removes a tag from this entity on the server.

PARAMETER	DESCRIPTION
`tag`	Tag to remove from this entity. TYPE: `str`

Source code in openml/base.py

def remove_tag(self, tag: str) -> None:
    """Removes a tag from this entity on the server.

    Parameters
    ----------
    tag : str
        Tag to remove from this entity.
    """
    # Same helper as `push_tag`, called with `untag=True`.
    _tag_openml_base(self, tag, untag=True)

url_for_id `classmethod` #

url_for_id(id_: int) -> str

Return the OpenML URL for the object of the class entity with the given id.

Source code in openml/base.py

@classmethod
def url_for_id(cls, id_: int) -> str:
    """Build the OpenML URL of the entity of this class that has id ``id_``."""
    # Shape: <server base url>/<entity letter>/<id>, e.g. openml.org/f/123 for a flow.
    base_url = openml.config.get_server_base_url()
    letter = cls._entity_letter()
    return f"{base_url}/{letter}/{id_}"

OpenMLTask #

OpenMLTask(task_id: int | None, task_type_id: TaskType, task_type: str, data_set_id: int, estimation_procedure_id: int | None = None, estimation_procedure_type: str | None = None, estimation_parameters: dict[str, str] | None = None, evaluation_measure: str | None = None, data_splits_url: str | None = None, target_name: str | None = None)

Bases: OpenMLBase

OpenML Task object.

PARAMETER	DESCRIPTION
`task_id`	Refers to the unique identifier of OpenML task. TYPE: `int \| None`
`task_type_id`	Refers to the type of OpenML task. TYPE: `TaskType`
`task_type`	Refers to the OpenML task. TYPE: `str`
`data_set_id`	Refers to the data. TYPE: `int`
`estimation_procedure_id`	Refers to the type of estimates used. TYPE: `int \| None` DEFAULT: `None`
`estimation_procedure_type`	Refers to the type of estimation procedure used for the OpenML task. TYPE: `str \| None` DEFAULT: `None`
`estimation_parameters`	Estimation parameters used for the OpenML task. TYPE: `dict[str, str] \| None` DEFAULT: `None`
`evaluation_measure`	Refers to the evaluation measure. TYPE: `str \| None` DEFAULT: `None`
`data_splits_url`	Refers to the URL of the data splits used for the OpenML task. TYPE: `str \| None` DEFAULT: `None`

Source code in openml/tasks/task.py

def __init__(  # noqa: PLR0913
    self,
    task_id: int | None,
    task_type_id: TaskType,
    task_type: str,
    data_set_id: int,
    estimation_procedure_id: int | None = None,
    estimation_procedure_type: str | None = None,
    estimation_parameters: dict[str, str] | None = None,
    evaluation_measure: str | None = None,
    data_splits_url: str | None = None,
    target_name: str | None = None,
):
    """Store the task description on the instance.

    ``task_id`` (when given) and ``data_set_id`` are coerced to ``int``. The
    estimation procedure id is passed through
    ``_resolve_estimation_procedure_id`` before being stored, and ``split``
    starts out as ``None``.
    """
    self.task_id = None if task_id is None else int(task_id)
    self.task_type_id = task_type_id
    self.task_type = task_type
    self.dataset_id = int(data_set_id)
    self.target_name = target_name

    # Resolved at this point, after the identifying attributes are set and
    # before the estimation attributes, exactly as in the original ordering.
    procedure_id = self._resolve_estimation_procedure_id(estimation_procedure_id)

    self.evaluation_measure = evaluation_measure
    procedure: _EstimationProcedure = {
        "type": estimation_procedure_type,
        "parameters": estimation_parameters,
        "data_splits_url": data_splits_url,
    }
    self.estimation_procedure: _EstimationProcedure = procedure
    self.estimation_procedure_id = procedure_id
    self.split: OpenMLSplit | None = None

id `property` #

id: int | None

Return the OpenML ID of this task.

openml_url `property` #

openml_url: str | None

The URL of the object on the server, if it was uploaded, else None.

download_split #

download_split() -> OpenMLSplit

Download the OpenML split for a given task.

Source code in openml/tasks/task.py

def download_split(self) -> OpenMLSplit:
    """Load this task's split, downloading the split file if reading it fails.

    The split file is ``datasplits.arff`` inside the task's cache directory.
    It is parsed directly first; on ``OSError`` it is downloaded to that path
    and parsed again.
    """
    # TODO(eddiebergman): Can this every be `None`?
    assert self.task_id is not None
    split_path = _create_cache_directory_for_id("tasks", self.task_id) / "datasplits.arff"

    try:
        return OpenMLSplit._from_arff_file(split_path)
    except OSError:
        # Reading failed: download the split file to the same path, then retry.
        self._download_split(split_path)
        return OpenMLSplit._from_arff_file(split_path)

get_dataset #

get_dataset(**kwargs: Any) -> OpenMLDataset

Download dataset associated with task.

Accepts the same keyword arguments as the openml.datasets.get_dataset.

Source code in openml/tasks/task.py

def get_dataset(self, **kwargs: Any) -> datasets.OpenMLDataset:
    """Download dataset associated with task.

    Every keyword argument is forwarded unchanged to
    `openml.datasets.get_dataset`.
    """
    fetch = datasets.get_dataset
    return fetch(self.dataset_id, **kwargs)

get_split_dimensions #

get_split_dimensions() -> tuple[int, int, int]

Get the (repeats, folds, samples) of the split for a given task.

Source code in openml/tasks/task.py

def get_split_dimensions(self) -> tuple[int, int, int]:
    """Return the ``(repeats, folds, samples)`` counts of this task's split.

    The split is obtained through ``download_split`` the first time it is
    needed and kept on ``self.split``.
    """
    split = self.split
    if split is None:
        split = self.split = self.download_split()

    dimensions = (split.repeats, split.folds, split.samples)
    return dimensions

get_train_test_split_indices #

get_train_test_split_indices(fold: int = 0, repeat: int = 0, sample: int = 0) -> tuple[ndarray, ndarray]

Get the indices of the train and test splits for a given task.

Source code in openml/tasks/task.py

def get_train_test_split_indices(
    self,
    fold: int = 0,
    repeat: int = 0,
    sample: int = 0,
) -> tuple[np.ndarray, np.ndarray]:
    """Return the train and test split indices for one repeat, fold and sample.

    The split is obtained through ``download_split`` the first time it is
    needed and kept on ``self.split``; the lookup itself is delegated to
    ``self.split.get``.
    """
    split = self.split
    if split is None:
        # Nothing loaded yet: fetch once and remember it on the instance.
        split = self.split = self.download_split()

    return split.get(repeat=repeat, fold=fold, sample=sample)

open_in_browser #

open_in_browser() -> None

Opens the OpenML web page corresponding to this object in your default browser.

Source code in openml/base.py

def open_in_browser(self) -> None:
    """Open this object's OpenML web page in the default browser.

    Raises
    ------
    ValueError
        If ``openml_url`` is ``None``.
    """
    url = self.openml_url
    if url is not None:
        webbrowser.open(url)
        return

    raise ValueError(
        "Cannot open element on OpenML.org when attribute `openml_url` is `None`",
    )

publish #

publish() -> OpenMLBase

Publish the object on the OpenML server.

Source code in openml/base.py

def publish(self) -> OpenMLBase:
    """Upload this object to the OpenML server and return it.

    The payload comes from ``_get_file_elements``; when it has no
    ``"description"`` entry, the XML produced by ``_to_xml`` is used. The
    server's XML reply is parsed and handed to ``_parse_publish_response``.
    """
    payload = self._get_file_elements()
    if "description" not in payload:
        payload["description"] = self._to_xml()

    endpoint = f"{_get_rest_api_type_alias(self)}/"
    raw_reply = openml._api_calls._perform_api_call(
        endpoint,
        "post",
        file_elements=payload,
    )
    self._parse_publish_response(xmltodict.parse(raw_reply))
    return self

push_tag #

push_tag(tag: str) -> None

Annotates this entity with a tag on the server.

PARAMETER	DESCRIPTION
`tag`	Tag to attach to the flow. TYPE: `str`

Source code in openml/base.py

def push_tag(self, tag: str) -> None:
    """Annotates this entity with a tag on the server.

    Parameters
    ----------
    tag : str
        Tag to attach to this entity.
    """
    # Tagging is delegated to the shared `_tag_openml_base` helper.
    _tag_openml_base(self, tag)

remove_tag #

remove_tag(tag: str) -> None

Removes a tag from this entity on the server.

PARAMETER	DESCRIPTION
`tag`	Tag to remove from this entity. TYPE: `str`

Source code in openml/base.py

def remove_tag(self, tag: str) -> None:
    """Removes a tag from this entity on the server.

    Parameters
    ----------
    tag : str
        Tag to remove from this entity.
    """
    # Same helper as `push_tag`, called with `untag=True`.
    _tag_openml_base(self, tag, untag=True)

url_for_id `classmethod` #

url_for_id(id_: int) -> str

Return the OpenML URL for the object of the class entity with the given id.

Source code in openml/base.py

@classmethod
def url_for_id(cls, id_: int) -> str:
    """Build the OpenML URL of the entity of this class that has id ``id_``."""
    # Shape: <server base url>/<entity letter>/<id>, e.g. openml.org/f/123 for a flow.
    base_url = openml.config.get_server_base_url()
    letter = cls._entity_letter()
    return f"{base_url}/{letter}/{id_}"

TaskType #

Bases: Enum

Possible task types as defined in OpenML.

create_task #

create_task(task_type: TaskType, dataset_id: int, estimation_procedure_id: int, target_name: str | None = None, evaluation_measure: str | None = None, **kwargs: Any) -> OpenMLClassificationTask | OpenMLRegressionTask | OpenMLLearningCurveTask | OpenMLClusteringTask

Create a task based on different given attributes.

Builds a task object with the function arguments as attributes. The type of the task object built is determined from the task type id. More information on how the arguments (task attributes), relate to the different possible tasks can be found in the individual task objects at the openml.tasks.task module.

PARAMETER	DESCRIPTION
`task_type`	Id of the task type. TYPE: `TaskType`
`dataset_id`	The id of the dataset for the task. TYPE: `int`
`target_name`	The name of the feature used as a target. At the moment, only optional for the clustering tasks. TYPE: `str` DEFAULT: `None`
`estimation_procedure_id`	The id of the estimation procedure. TYPE: `int`
`evaluation_measure`	The name of the evaluation measure. TYPE: `str` DEFAULT: `None`
`kwargs`	Other task attributes that are not mandatory for task upload. TYPE: `dict` DEFAULT: `{}`

RETURNS	DESCRIPTION
`(OpenMLClassificationTask, OpenMLRegressionTask)`
`(OpenMLLearningCurveTask, OpenMLClusteringTask)`

Source code in openml/tasks/functions.py

def create_task(
    task_type: TaskType,
    dataset_id: int,
    estimation_procedure_id: int,
    target_name: str | None = None,
    evaluation_measure: str | None = None,
    **kwargs: Any,
) -> (
    OpenMLClassificationTask | OpenMLRegressionTask | OpenMLLearningCurveTask | OpenMLClusteringTask
):
    """Create a task based on different given attributes.

    Builds a task object with the function arguments as
    attributes. The type of the task object built is
    determined from the task type id.
    More information on how the arguments (task attributes),
    relate to the different possible tasks can be found in
    the individual task objects at the openml.tasks.task
    module.

    Parameters
    ----------
    task_type : TaskType
        Id of the task type.
    dataset_id : int
        The id of the dataset for the task.
    target_name : str, optional
        The name of the feature used as a target.
        At the moment, only optional for the clustering tasks.
    estimation_procedure_id : int
        The id of the estimation procedure.
    evaluation_measure : str, optional
        The name of the evaluation measure.
    kwargs : dict, optional
        Other task attributes that are not mandatory
        for task upload.

    Returns
    -------
    OpenMLClassificationTask, OpenMLRegressionTask,
    OpenMLLearningCurveTask, OpenMLClusteringTask

    Raises
    ------
    NotImplementedError
        If ``task_type`` is not one of the four supported task types.
    """
    if task_type == TaskType.CLUSTERING:
        task_cls = OpenMLClusteringTask
    elif task_type == TaskType.LEARNING_CURVE:
        task_cls = OpenMLLearningCurveTask  # type: ignore
    elif task_type == TaskType.SUPERVISED_CLASSIFICATION:
        task_cls = OpenMLClassificationTask  # type: ignore
    elif task_type == TaskType.SUPERVISED_REGRESSION:
        task_cls = OpenMLRegressionTask  # type: ignore
    else:
        # A plain `Enum` member rejects the `d` format code (it formats as a
        # string), which would raise ValueError here instead of the intended
        # NotImplementedError. Format the underlying value instead.
        task_type_value = getattr(task_type, "value", task_type)
        raise NotImplementedError(
            f"Task type ID {task_type_value} is not supported. "
            f"Supported task type IDs: {TaskType.SUPERVISED_CLASSIFICATION.value}, "
            f"{TaskType.SUPERVISED_REGRESSION.value}, "
            f"{TaskType.CLUSTERING.value}, {TaskType.LEARNING_CURVE.value}. "
            f"Please refer to the TaskType enum for valid task type identifiers."
        )

    return task_cls(
        task_id=None,
        task_type_id=task_type,
        task_type="None",  # TODO: refactor to get task type string from ID.
        data_set_id=dataset_id,
        target_name=target_name,  # type: ignore
        estimation_procedure_id=estimation_procedure_id,
        evaluation_measure=evaluation_measure,
        **kwargs,
    )

delete_task #

delete_task(task_id: int) -> bool

Delete task with id task_id from the OpenML server.

You can only delete tasks which you created and have no runs associated with them.

PARAMETER	DESCRIPTION
`task_id`	OpenML id of the task TYPE: `int`

RETURNS	DESCRIPTION
`bool`	True if the deletion was successful. False otherwise.

Source code in openml/tasks/functions.py

def delete_task(task_id: int) -> bool:
    """Delete the task with id `task_id` from the OpenML server.

    Only tasks that you created yourself and that have no runs
    associated with them can be deleted.

    Parameters
    ----------
    task_id : int
        OpenML id of the task

    Returns
    -------
    bool
        True if the deletion was successful. False otherwise.
    """
    was_deleted = openml.utils._delete_entity("task", task_id)
    return was_deleted

get_task #

get_task(task_id: int, download_splits: bool = False, **get_dataset_kwargs: Any) -> OpenMLTask

Download OpenML task for a given task ID.

Downloads the task representation.

Use the download_splits parameter to control whether the splits are downloaded. Moreover, you may pass additional keyword arguments, which are forwarded to :meth:openml.datasets.get_dataset.

PARAMETER	DESCRIPTION
`task_id`	The OpenML task id of the task to download. TYPE: `int`
`download_splits`	Whether to download the splits as well. TYPE: `bool` DEFAULT: `False`
`get_dataset_kwargs`	Keyword arguments that are forwarded as optional parameters to :meth:`openml.datasets.get_dataset`. TYPE: `Any` DEFAULT: `{}`

RETURNS	DESCRIPTION
`task`	TYPE: `OpenMLTask`

Source code in openml/tasks/functions.py

@openml.utils.thread_safe_if_oslo_installed
def get_task(
    task_id: int,
    download_splits: bool = False,  # noqa: FBT002
    **get_dataset_kwargs: Any,
) -> OpenMLTask:
    """Download OpenML task for a given task ID.

    Downloads the task representation.

    Use the `download_splits` parameter to control whether the splits are downloaded.
    Moreover, you may pass additional keyword arguments, which are forwarded to
    :meth:`openml.datasets.get_dataset`.

    Parameters
    ----------
    task_id : int
        The OpenML task id of the task to download.
    download_splits: bool (default=False)
        Whether to download the splits as well.
    get_dataset_kwargs :
        Keyword arguments forwarded to :meth:`openml.datasets.get_dataset`.

    Returns
    -------
    task: OpenMLTask

    Raises
    ------
    TypeError
        If `task_id` is not an integer.
    """
    if not isinstance(task_id, int):
        raise TypeError(f"Task id should be integer, is {type(task_id)}")

    task_cache_directory = openml.utils._create_cache_directory_for_id(
        TASKS_CACHE_DIR_NAME, task_id
    )
    # Remember whether the directory pre-existed so that a failed download only
    # removes a cache directory this call created.
    task_cache_directory_existed = task_cache_directory.exists()
    try:
        task = _get_task_description(task_id)
        dataset = get_dataset(task.dataset_id, **get_dataset_kwargs)
        # List of class labels available in dataset description
        # Including class labels as part of task meta data handles
        #   the case where data download was initially disabled
        if isinstance(task, (OpenMLClassificationTask, OpenMLLearningCurveTask)):
            assert task.target_name is not None, (
                "Supervised tasks must define a target feature before retrieving class labels."
            )
            task.class_labels = dataset.retrieve_class_labels(task.target_name)
        # Clustering tasks do not have class labels
        # and do not offer download_split
        if download_splits and isinstance(task, OpenMLSupervisedTask):
            task.download_split()
    except Exception:
        if not task_cache_directory_existed:
            openml.utils._remove_cache_dir_for_id(TASKS_CACHE_DIR_NAME, task_cache_directory)
        # Bare `raise` re-raises the active exception with its traceback intact.
        raise

    return task

get_tasks #

get_tasks(task_ids: list[int], download_data: bool | None = None, download_qualities: bool | None = None) -> list[OpenMLTask]

Download tasks.

This function iterates :meth:openml.tasks.get_task.

PARAMETER	DESCRIPTION
`task_ids`	A list of task ids to download. TYPE: `List[int]`
`download_data`	Option to trigger download of data along with the meta data. If left as `None`, a warning is emitted and `True` is used. TYPE: `bool \| None` DEFAULT: `None`
`download_qualities`	Option to download 'qualities' meta-data in addition to the minimal dataset description. If left as `None`, a warning is emitted and `True` is used. TYPE: `bool \| None` DEFAULT: `None`

RETURNS	DESCRIPTION
`list`

Source code in openml/tasks/functions.py

def get_tasks(
    task_ids: list[int],
    download_data: bool | None = None,
    download_qualities: bool | None = None,
) -> list[OpenMLTask]:
    """Download tasks.

    This function iterates :meth:`openml.tasks.get_task`.

    Parameters
    ----------
    task_ids : List[int]
        A list of task ids to download.
    download_data : bool, optional
        Option to trigger download of data along with the meta data.
        If left as ``None``, a warning is emitted and ``True`` is used.
    download_qualities : bool, optional
        Option to download 'qualities' meta-data in addition to the minimal dataset description.
        If left as ``None``, a warning is emitted and ``True`` is used.

    Returns
    -------
    list
    """
    if download_data is None:
        # stacklevel=2 attributes the warning to the caller of `get_tasks`,
        # which is the code that has to change.
        warnings.warn(
            "`download_data` will default to False starting in 0.16. "
            "Please set `download_data` explicitly to suppress this warning.",
            stacklevel=2,
        )
        download_data = True

    if download_qualities is None:
        warnings.warn(
            "`download_qualities` will default to False starting in 0.16. "
            "Please set `download_qualities` explicitly to suppress this warning.",
            stacklevel=2,
        )
        download_qualities = True

    return [
        get_task(task_id, download_data=download_data, download_qualities=download_qualities)
        for task_id in task_ids
    ]

list_tasks #

list_tasks(task_type: TaskType | None = None, offset: int | None = None, size: int | None = None, tag: str | None = None, data_tag: str | None = None, status: str | None = None, data_name: str | None = None, data_id: int | None = None, number_instances: int | None = None, number_features: int | None = None, number_classes: int | None = None, number_missing_values: int | None = None) -> DataFrame

Return a number of tasks having the given tag and task_type

PARAMETER	DESCRIPTION
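Note: the `task_type` filter is listed separately from the other filters because it is called `task_type` in the task description but `type` when used as a filter in the list-tasks call.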
`offset`	the number of tasks to skip, starting from the first TYPE: `int` DEFAULT: `None`
`task_type`	Refers to the type of task. TYPE: `TaskType` DEFAULT: `None`
`size`	the maximum number of tasks to show TYPE: `int` DEFAULT: `None`
`tag`	the tag to include TYPE: `str` DEFAULT: `None`
`data_tag`	the tag of the dataset TYPE: `str` DEFAULT: `None`
`data_id`	TYPE: `int` DEFAULT: `None`
`status`	TYPE: `str` DEFAULT: `None`
`data_name`	TYPE: `str` DEFAULT: `None`
`number_instances`	TYPE: `int` DEFAULT: `None`
`number_features`	TYPE: `int` DEFAULT: `None`
`number_classes`	TYPE: `int` DEFAULT: `None`
`number_missing_values`	TYPE: `int` DEFAULT: `None`

RETURNS	DESCRIPTION
`dataframe`	All tasks having the given task_type and the given tag. Every task is represented by a row in the data frame containing the following information as columns: task id, dataset id, task_type and status. If qualities are calculated for the associated dataset, some of these are also returned.

Source code in openml/tasks/functions.py

def list_tasks(  # noqa: PLR0913
    task_type: TaskType | None = None,
    offset: int | None = None,
    size: int | None = None,
    tag: str | None = None,
    data_tag: str | None = None,
    status: str | None = None,
    data_name: str | None = None,
    data_id: int | None = None,
    number_instances: int | None = None,
    number_features: int | None = None,
    number_classes: int | None = None,
    number_missing_values: int | None = None,
) -> pd.DataFrame:
    """
    Return a number of tasks having the given tag and task_type.

    Every argument except ``offset`` and ``size`` is bound onto the internal
    ``_list_tasks`` call as a filter; ``offset`` and ``size`` are handed to
    ``openml.utils._list_all`` as ``offset`` and ``limit`` respectively.

    Parameters
    ----------
    task_type : TaskType, optional
        Refers to the type of task.
    offset : int, optional
        the number of tasks to skip, starting from the first
    size : int, optional
        the maximum number of tasks to show
    tag : str, optional
        the tag to include
    data_tag : str, optional
        the tag of the dataset
    status : str, optional
        Forwarded unchanged as a filter to the listing call
        (presumably the task/dataset status -- confirm against the API).
    data_name : str, optional
        Forwarded unchanged as a filter to the listing call
        (presumably the name of the associated dataset).
    data_id : int, optional
        Forwarded unchanged as a filter to the listing call
        (presumably the ID of the associated dataset).
    number_instances : int, optional
        Forwarded unchanged as a filter to the listing call.
    number_features : int, optional
        Forwarded unchanged as a filter to the listing call.
    number_classes : int, optional
        Forwarded unchanged as a filter to the listing call.
    number_missing_values : int, optional
        Forwarded unchanged as a filter to the listing call.

    Returns
    -------
    dataframe
        All tasks having the given task_type and the given tag. Every task is
        represented by a row in the data frame containing the following information
        as columns: task id, dataset id, task_type and status. If qualities are
        calculated for the associated dataset, some of these are also returned.
        An empty data frame is returned when the listing yields no batches.

    Notes
    -----
    The ``task_type`` filter is separated from the other filters because
    it is used as ``task_type`` in the task description, but it is named
    ``type`` when used as a filter in the list tasks call.
    """
    # Bind every filter up front so the paging helper only has to deal with
    # offset/limit handling.
    listing_call = partial(
        _list_tasks,
        task_type=task_type,
        tag=tag,
        data_tag=data_tag,
        status=status,
        data_id=data_id,
        data_name=data_name,
        number_instances=number_instances,
        number_features=number_features,
        number_classes=number_classes,
        number_missing_values=number_missing_values,
    )
    batches = openml.utils._list_all(listing_call, offset=offset, limit=size)

    # Return an empty frame explicitly when there are no batches, rather than
    # handing an empty sequence to pd.concat.
    if not batches:
        return pd.DataFrame()

    return pd.concat(batches)

tasks

openml.tasks #

OpenMLClassificationTask #

estimation_parameters property writable #

id property #

openml_url property #

download_split #

get_X_and_y #

get_dataset #

get_split_dimensions #

get_train_test_split_indices #

open_in_browser #

publish #

push_tag #

remove_tag #

url_for_id classmethod #

OpenMLClusteringTask #

id property #

openml_url property #

download_split #

get_X #

get_dataset #

get_split_dimensions #

get_train_test_split_indices #

open_in_browser #

publish #

push_tag #

remove_tag #

url_for_id classmethod #

OpenMLLearningCurveTask #

estimation_parameters property writable #

id property #

openml_url property #

download_split #

get_X_and_y #

get_dataset #

get_split_dimensions #

get_train_test_split_indices #

open_in_browser #

publish #

push_tag #

remove_tag #

url_for_id classmethod #

OpenMLRegressionTask #

estimation_parameters property writable #

id property #

openml_url property #

download_split #

get_X_and_y #

get_dataset #

get_split_dimensions #

get_train_test_split_indices #

open_in_browser #

publish #

push_tag #

remove_tag #

url_for_id classmethod #

OpenMLSplit #

get #

OpenMLSupervisedTask #

estimation_parameters property writable #

id property #

openml_url property #

download_split #

get_X_and_y #

get_dataset #

get_split_dimensions #

get_train_test_split_indices #

open_in_browser #

publish #

push_tag #

remove_tag #

url_for_id classmethod #

OpenMLTask #

id property #

openml_url property #

download_split #

get_dataset #

get_split_dimensions #

get_train_test_split_indices #

estimation_parameters `property` `writable` #

id `property` #

openml_url `property` #

url_for_id `classmethod` #

id `property` #

openml_url `property` #

url_for_id `classmethod` #

estimation_parameters `property` `writable` #

id `property` #

openml_url `property` #

url_for_id `classmethod` #

estimation_parameters `property` `writable` #

id `property` #

openml_url `property` #

url_for_id `classmethod` #

estimation_parameters `property` `writable` #

id `property` #

openml_url `property` #

url_for_id `classmethod` #

id `property` #

openml_url `property` #

url_for_id `classmethod` #