OpenMLDataset(name: str, description: str | None, data_format: Literal['arff', 'sparse_arff'] = 'arff', cache_format: Literal['feather', 'pickle'] = 'pickle', dataset_id: int | None = None, version: int | None = None, creator: str | None = None, contributor: str | None = None, collection_date: str | None = None, upload_date: str | None = None, language: str | None = None, licence: str | None = None, url: str | None = None, default_target_attribute: str | None = None, row_id_attribute: str | None = None, ignore_attribute: str | list[str] | None = None, version_label: str | None = None, citation: str | None = None, tag: str | None = None, visibility: str | None = None, original_data_url: str | None = None, paper_url: str | None = None, update_comment: str | None = None, md5_checksum: str | None = None, data_file: str | None = None, features_file: str | None = None, qualities_file: str | None = None, dataset: str | None = None, parquet_url: str | None = None, parquet_file: str | None = None)
Bases: OpenMLBase
Dataset object.
Allows fetching and uploading datasets to OpenML.
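In practice, an OpenMLDataset is rarely constructed by hand: it is usually fetched from the server with openml.datasets.get_dataset, or built from local data with openml.datasets.create_dataset. A minimal sketch of fetching (dataset id 61, the 'iris' dataset, is used purely for illustration):

import openml

# Fetch an existing dataset and its metadata from the OpenML server.
dataset = openml.datasets.get_dataset(61)  # 61 is the id of 'iris'
print(dataset.name, dataset.version, dataset.default_target_attribute)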
Parameters
name : str
Name of the dataset.
description : str
Description of the dataset.
data_format : str
Format of the dataset, either 'arff' or 'sparse_arff'.
cache_format : str
Format for caching the dataset, either 'feather' or 'pickle'.
dataset_id : int, optional
Id autogenerated by the server.
version : int, optional
Version of this dataset. '1' for original version.
Auto-incremented by server.
creator : str, optional
The person who created the dataset.
contributor : str, optional
People who contributed to the current version of the dataset.
collection_date : str, optional
The date the data was originally collected, given by the uploader.
upload_date : str, optional
The date-time when the dataset was uploaded, generated by server.
language : str, optional
Language in which the data is represented.
Starts with 1 upper case letter, rest lower case, e.g. 'English'.
licence : str, optional
License of the data.
url : str, optional
Valid URL, points to actual data file.
The file can be on the OpenML server or another dataset repository.
default_target_attribute : str, optional
The default target attribute, if it exists.
Can have multiple values, comma separated.
row_id_attribute : str, optional
The attribute that represents the row-id column,
if present in the dataset.
ignore_attribute : str | list, optional
Attributes that should be excluded from modelling,
such as identifiers and indexes.
version_label : str, optional
Version label provided by user.
Can be a date, hash, or some other type of id.
citation : str, optional
Reference(s) that should be cited when building on this data.
tag : str, optional
Tags describing the dataset.
visibility : str, optional
Who can see the dataset.
Typical values: 'Everyone', 'All my friends', 'Only me'.
Can also be any of the user's circles.
original_data_url : str, optional
For derived data, the url to the original dataset.
paper_url : str, optional
Link to a paper describing the dataset.
update_comment : str, optional
A comment explaining what changed when a new version of the dataset is uploaded.
md5_checksum : str, optional
MD5 checksum to check if the dataset is downloaded without corruption.
data_file : str, optional
Path to where the dataset is located.
features_file : str, optional
Path to a file from which the dataset features can be read,
mapping each feature index to an OpenMLDataFeature.
qualities_file : str, optional
Path to a file from which the dataset qualities can be read,
mapping each quality name to its value.
dataset : str, optional
Serialized arff dataset string.
parquet_url : str, optional
URL of the storage location where the dataset files are hosted.
This can be a MinIO bucket URL. If specified, the data will be accessed
from this URL when reading the files.
parquet_file : str, optional
Path to the local parquet file, if it exists.
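For illustration, a metadata-only instance might be constructed directly as below; this is mostly done internally by the library, and the name must match the server pattern ^[a-zA-Z0-9_\-\.\(\),]+$ enforced in __init__. All field values here are hypothetical.

from openml.datasets import OpenMLDataset

# Hypothetical, metadata-only dataset object (no data file attached).
dataset = OpenMLDataset(
    name="example_dataset",
    description="A purely illustrative dataset object.",
    creator="Jane Doe",
    licence="CC0",
    default_target_attribute="class",
    ignore_attribute=["row_id"],
)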
Source code in openml/datasets/dataset.py
def __init__(  # noqa: C901, PLR0912, PLR0913, PLR0915
    self,
    name: str,
    description: str | None,
    data_format: Literal["arff", "sparse_arff"] = "arff",
    cache_format: Literal["feather", "pickle"] = "pickle",
    dataset_id: int | None = None,
    version: int | None = None,
    creator: str | None = None,
    contributor: str | None = None,
    collection_date: str | None = None,
    upload_date: str | None = None,
    language: str | None = None,
    licence: str | None = None,
    url: str | None = None,
    default_target_attribute: str | None = None,
    row_id_attribute: str | None = None,
    ignore_attribute: str | list[str] | None = None,
    version_label: str | None = None,
    citation: str | None = None,
    tag: str | None = None,
    visibility: str | None = None,
    original_data_url: str | None = None,
    paper_url: str | None = None,
    update_comment: str | None = None,
    md5_checksum: str | None = None,
    data_file: str | None = None,
    features_file: str | None = None,
    qualities_file: str | None = None,
    dataset: str | None = None,
    parquet_url: str | None = None,
    parquet_file: str | None = None,
):
    if cache_format not in ["feather", "pickle"]:
        raise ValueError(
            "cache_format must be one of 'feather' or 'pickle'. "
            f"Invalid format specified: {cache_format}",
        )

    def find_invalid_characters(string: str, pattern: str) -> str:
        invalid_chars = set()
        regex = re.compile(pattern)
        for char in string:
            if not regex.match(char):
                invalid_chars.add(char)
        return ",".join(
            [f"'{char}'" if char != "'" else f'"{char}"' for char in invalid_chars],
        )

    if dataset_id is None:
        pattern = "^[\x00-\x7f]*$"
        if description and not re.match(pattern, description):
            # not basic latin (XSD complains)
            invalid_characters = find_invalid_characters(description, pattern)
            raise ValueError(
                f"Invalid symbols {invalid_characters} in description: {description}",
            )
        pattern = "^[\x00-\x7f]*$"
        if citation and not re.match(pattern, citation):
            # not basic latin (XSD complains)
            invalid_characters = find_invalid_characters(citation, pattern)
            raise ValueError(
                f"Invalid symbols {invalid_characters} in citation: {citation}",
            )
        pattern = "^[a-zA-Z0-9_\\-\\.\\(\\),]+$"
        if not re.match(pattern, name):
            # regex given by server in error message
            invalid_characters = find_invalid_characters(name, pattern)
            raise ValueError(f"Invalid symbols {invalid_characters} in name: {name}")

    self.ignore_attribute: list[str] | None = None
    if isinstance(ignore_attribute, str):
        self.ignore_attribute = [ignore_attribute]
    elif isinstance(ignore_attribute, list) or ignore_attribute is None:
        self.ignore_attribute = ignore_attribute
    else:
        raise ValueError("Wrong data type for ignore_attribute. Should be list.")

    # TODO add function to check if the name is casual_string128
    # Attributes received by querying the RESTful API
    self.dataset_id = int(dataset_id) if dataset_id is not None else None
    self.name = name
    self.version = int(version) if version is not None else None
    self.description = description
    self.cache_format = cache_format
    # Has to be called format, otherwise there will be an XML upload error
    self.format = data_format
    self.creator = creator
    self.contributor = contributor
    self.collection_date = collection_date
    self.upload_date = upload_date
    self.language = language
    self.licence = licence
    self.url = url
    self.default_target_attribute = default_target_attribute
    self.row_id_attribute = row_id_attribute
    self.version_label = version_label
    self.citation = citation
    self.tag = tag
    self.visibility = visibility
    self.original_data_url = original_data_url
    self.paper_url = paper_url
    self.update_comment = update_comment
    self.md5_checksum = md5_checksum
    self.data_file = data_file
    self.parquet_file = parquet_file
    self._dataset = dataset
    self._parquet_url = parquet_url

    self._features: dict[int, OpenMLDataFeature] | None = None
    self._qualities: dict[str, float] | None = None
    self._no_qualities_found = False

    if features_file is not None:
        self._features = _read_features(Path(features_file))

    # "" was the old default value by `get_dataset` and maybe still used by some
    if qualities_file == "":
        # TODO(0.15): switch to "qualities_file is not None" below and remove this warning
        warnings.warn(
            "Starting from Version 0.15 `qualities_file` must be None and not an empty string "
            "to avoid reading the qualities from file. Set `qualities_file` to None to avoid "
            "this warning.",
            FutureWarning,
            stacklevel=2,
        )
        qualities_file = None

    if qualities_file is not None:
        self._qualities = _read_qualities(Path(qualities_file))

    if data_file is not None:
        data_pickle, data_feather, feather_attribute = self._compressed_cache_file_paths(
            Path(data_file)
        )
        self.data_pickle_file = data_pickle if Path(data_pickle).exists() else None
        self.data_feather_file = data_feather if Path(data_feather).exists() else None
        self.feather_attribute_file = (
            feather_attribute if Path(feather_attribute).exists() else None
        )
    else:
        self.data_pickle_file = None
        self.data_feather_file = None
        self.feather_attribute_file = None
features
property
Get the features of this dataset.
id
property
Get the dataset's numeric id.
openml_url
property
The URL of the object on the server, if it was uploaded, else None.
qualities
property
qualities: dict[str, float] | None
Get the qualities of this dataset.
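The properties above are loaded lazily from the server on first access. A short sketch of typical use ('NumberOfInstances' is one of the standard server-computed qualities):

import openml

dataset = openml.datasets.get_dataset(61)
print(dataset.id)             # numeric dataset id
print(dataset.openml_url)     # e.g. https://www.openml.org/d/61
print(len(dataset.features))  # number of features
if dataset.qualities is not None:
    print(dataset.qualities["NumberOfInstances"])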
get_data
get_data(target: list[str] | str | None = None, include_row_id: bool = False, include_ignore_attribute: bool = False) -> tuple[DataFrame, Series | None, list[bool], list[str]]
Returns dataset content as dataframes.
Parameters
target : str, list[str] or None (default=None)
Name of target column to separate from the data.
Splitting multiple columns is currently not supported.
include_row_id : boolean (default=False)
Whether to include row ids in the returned dataset.
include_ignore_attribute : boolean (default=False)
Whether to include columns that are marked as "ignore"
on the server in the dataset.
Returns
X : dataframe, shape (n_samples, n_columns)
Dataset, may have sparse dtypes in the columns if required.
y : pd.Series, shape (n_samples,) or None
Target column.
categorical_indicator : list[bool]
Mask indicating which features are categorical.
attribute_names : list[str]
List of attribute names.
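A typical call separates the default target column from the remaining data; a sketch using the 'iris' dataset for illustration:

import openml

dataset = openml.datasets.get_dataset(61)
X, y, categorical_indicator, attribute_names = dataset.get_data(
    target=dataset.default_target_attribute,
)
print(X.shape, y.name, sum(categorical_indicator))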
Source code in openml/datasets/dataset.py
def get_data(  # noqa: C901
    self,
    target: list[str] | str | None = None,
    include_row_id: bool = False,  # noqa: FBT001, FBT002
    include_ignore_attribute: bool = False,  # noqa: FBT001, FBT002
) -> tuple[pd.DataFrame, pd.Series | None, list[bool], list[str]]:
    """Returns dataset content as dataframes.

    Parameters
    ----------
    target : str, list[str] or None (default=None)
        Name of target column to separate from the data.
        Splitting multiple columns is currently not supported.
    include_row_id : boolean (default=False)
        Whether to include row ids in the returned dataset.
    include_ignore_attribute : boolean (default=False)
        Whether to include columns that are marked as "ignore"
        on the server in the dataset.

    Returns
    -------
    X : dataframe, shape (n_samples, n_columns)
        Dataset, may have sparse dtypes in the columns if required.
    y : pd.Series, shape (n_samples,) or None
        Target column.
    categorical_indicator : list[bool]
        Mask indicating which features are categorical.
    attribute_names : list[str]
        List of attribute names.
    """
    data, categorical_mask, attribute_names = self._load_data()

    to_exclude = []
    if not include_row_id and self.row_id_attribute is not None:
        if isinstance(self.row_id_attribute, str):
            to_exclude.append(self.row_id_attribute)
        elif isinstance(self.row_id_attribute, Iterable):
            to_exclude.extend(self.row_id_attribute)

    if not include_ignore_attribute and self.ignore_attribute is not None:
        if isinstance(self.ignore_attribute, str):
            to_exclude.append(self.ignore_attribute)
        elif isinstance(self.ignore_attribute, Iterable):
            to_exclude.extend(self.ignore_attribute)

    if len(to_exclude) > 0:
        logger.info(f"Going to remove the following attributes: {to_exclude}")
        keep = np.array([column not in to_exclude for column in attribute_names])
        data = data.drop(columns=to_exclude)
        categorical_mask = [cat for cat, k in zip(categorical_mask, keep) if k]
        attribute_names = [att for att, k in zip(attribute_names, keep) if k]

    if target is None:
        return data, None, categorical_mask, attribute_names

    if isinstance(target, str):
        target_names = target.split(",") if "," in target else [target]
    else:
        target_names = target

    # All the assumptions below for the target are dependent on the number of targets being 1
    n_targets = len(target_names)
    if n_targets > 1:
        raise NotImplementedError(f"Number of targets {n_targets} not implemented.")

    target_name = target_names[0]
    x = data.drop(columns=[target_name])
    y = data[target_name].squeeze()

    # Finally, remove the target from the list of attributes and categorical mask
    target_index = attribute_names.index(target_name)
    categorical_mask.pop(target_index)
    attribute_names.remove(target_name)

    assert isinstance(y, pd.Series)
    return x, y, categorical_mask, attribute_names
get_features_by_type
get_features_by_type(data_type: str, exclude: list[str] | None = None, exclude_ignore_attribute: bool = True, exclude_row_id_attribute: bool = True) -> list[int]
Return indices of features of a given type, e.g. all nominal features.
Optional parameters allow excluding features by name, as well as the ignore and row id attributes.
Parameters
data_type : str
The data type to return (e.g., nominal, numeric, date, string)
exclude : list[str]
List of column names to exclude from the return value
exclude_ignore_attribute : bool
Whether to exclude the defined ignore attributes (and adapt the
return values as if these indices are not present)
exclude_row_id_attribute : bool
Whether to exclude the defined row id attributes (and adapt the
return values as if these indices are not present)
Returns
result : list
a list of indices that have the specified data type
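For example, to retrieve nominal and numeric feature indices; a sketch using dataset 31 ('credit-g'), which mixes both types:

import openml

dataset = openml.datasets.get_dataset(31)  # 'credit-g'
nominal_indices = dataset.get_features_by_type("nominal")
numeric_indices = dataset.get_features_by_type("numeric")
print(nominal_indices, numeric_indices)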
Source code in openml/datasets/dataset.py
def get_features_by_type(  # noqa: C901
    self,
    data_type: str,
    exclude: list[str] | None = None,
    exclude_ignore_attribute: bool = True,  # noqa: FBT002, FBT001
    exclude_row_id_attribute: bool = True,  # noqa: FBT002, FBT001
) -> list[int]:
    """
    Return indices of features of a given type, e.g. all nominal features.
    Optional parameters allow excluding features by name, as well as the
    ignore and row id attributes.

    Parameters
    ----------
    data_type : str
        The data type to return (e.g., nominal, numeric, date, string)
    exclude : list[str]
        List of column names to exclude from the return value
    exclude_ignore_attribute : bool
        Whether to exclude the defined ignore attributes (and adapt the
        return values as if these indices are not present)
    exclude_row_id_attribute : bool
        Whether to exclude the defined row id attributes (and adapt the
        return values as if these indices are not present)

    Returns
    -------
    result : list
        a list of indices that have the specified data type
    """
    if data_type not in OpenMLDataFeature.LEGAL_DATA_TYPES:
        raise TypeError("Illegal feature type requested")
    if self.ignore_attribute is not None and not isinstance(self.ignore_attribute, list):
        raise TypeError("ignore_attribute should be a list")
    if self.row_id_attribute is not None and not isinstance(self.row_id_attribute, str):
        raise TypeError("row id attribute should be a str")
    if exclude is not None and not isinstance(exclude, list):
        raise TypeError("Exclude should be a list")
        # assert all(isinstance(elem, str) for elem in exclude),
        # "Exclude should be a list of strings"

    to_exclude = []
    if exclude is not None:
        to_exclude.extend(exclude)
    if exclude_ignore_attribute and self.ignore_attribute is not None:
        to_exclude.extend(self.ignore_attribute)
    if exclude_row_id_attribute and self.row_id_attribute is not None:
        to_exclude.append(self.row_id_attribute)

    result = []
    offset = 0
    # this function assumes that everything in to_exclude will
    # be 'excluded' from the dataset (hence the offset)
    for idx in self.features:
        name = self.features[idx].name
        if name in to_exclude:
            offset += 1
        elif self.features[idx].data_type == data_type:
            result.append(idx - offset)
    return result
open_in_browser
open_in_browser() -> None
Opens the OpenML web page corresponding to this object in your default browser.
Source code in openml/base.py
def open_in_browser(self) -> None:
    """Opens the OpenML web page corresponding to this object in your default browser."""
    if self.openml_url is None:
        raise ValueError(
            "Cannot open element on OpenML.org when attribute `openml_url` is `None`",
        )
    webbrowser.open(self.openml_url)
publish
publish() -> OpenMLBase
Publish the object on the OpenML server.
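Publishing requires an API key (openml.config.apikey). A hedged sketch of the usual workflow, where the object is first built from a pandas DataFrame with openml.datasets.create_dataset; all data and metadata values here are illustrative:

import pandas as pd
import openml

df = pd.DataFrame(
    {"sepal_length": [5.1, 4.9], "sepal_width": [3.5, 3.0], "species": ["setosa", "setosa"]}
)
dataset = openml.datasets.create_dataset(
    name="toy_iris_subset",
    description="Two rows of iris, for illustration only.",
    creator="Jane Doe",
    contributor=None,
    collection_date="2024",
    language="English",
    licence="CC0",
    attributes="auto",  # infer the ARFF attributes from the DataFrame dtypes
    data=df,
    default_target_attribute="species",
    ignore_attribute=None,
    citation="",
    original_data_url=None,
    paper_url=None,
)
dataset.publish()  # uploads to the server configured in openml.config
print(dataset.openml_url)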
Source code in openml/base.py
def publish(self) -> OpenMLBase:
    """Publish the object on the OpenML server."""
    file_elements = self._get_file_elements()
    if "description" not in file_elements:
        file_elements["description"] = self._to_xml()
    call = f"{_get_rest_api_type_alias(self)}/"
    response_text = openml._api_calls._perform_api_call(
        call,
        "post",
        file_elements=file_elements,
    )
    xml_response = xmltodict.parse(response_text)
    self._parse_publish_response(xml_response)
    return self
push_tag
push_tag(tag: str) -> None
Annotates this entity with a tag on the server.
Parameters
tag : str
Tag to attach to the entity.
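Tagging requires an API key. A short sketch (the tag value is arbitrary; remove_tag, documented below, undoes the operation):

import openml

dataset = openml.datasets.get_dataset(61)
dataset.push_tag("my_project")    # attach the tag on the server
dataset.remove_tag("my_project")  # remove it again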
Source code in openml/base.py
def push_tag(self, tag: str) -> None:
    """Annotates this entity with a tag on the server.

    Parameters
    ----------
    tag : str
        Tag to attach to the entity.
    """
    _tag_openml_base(self, tag)
remove_tag
remove_tag(tag: str) -> None
Removes a tag from this entity on the server.
Parameters
tag : str
Tag to remove from the entity.
Source code in openml/base.py
def remove_tag(self, tag: str) -> None:
    """Removes a tag from this entity on the server.

    Parameters
    ----------
    tag : str
        Tag to remove from the entity.
    """
    _tag_openml_base(self, tag, untag=True)
retrieve_class_labels
retrieve_class_labels(target_name: str = 'class') -> None | list[str]
Reads the dataset's arff to determine the class labels.
If the task has no class labels (for example a regression problem)
it returns None. Necessary because the data returned by get_data
only contains the indices of the classes, while OpenML needs the real
class names when uploading the results of a run.
Parameters
target_name : str
Name of the target attribute
Returns
list
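A sketch using the 'iris' dataset, whose target attribute 'class' is nominal:

import openml

dataset = openml.datasets.get_dataset(61)
labels = dataset.retrieve_class_labels(target_name="class")
print(labels)  # ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']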
Source code in openml/datasets/dataset.py
def retrieve_class_labels(self, target_name: str = "class") -> None | list[str]:
    """Reads the dataset's arff to determine the class labels.

    If the task has no class labels (for example a regression problem)
    it returns None. Necessary because the data returned by get_data
    only contains the indices of the classes, while OpenML needs the real
    class names when uploading the results of a run.

    Parameters
    ----------
    target_name : str
        Name of the target attribute

    Returns
    -------
    list
    """
    for feature in self.features.values():
        if feature.name == target_name:
            if feature.data_type == "nominal":
                return feature.nominal_values
            if feature.data_type == "string":
                # Rel.: #1311
                # The target is invalid for a classification task if the feature type is
                # string and not nominal. For such misconfigured tasks, we silently fix it
                # here as we can safely interpret string as nominal.
                df, *_ = self.get_data()
                return list(df[feature.name].unique())
    return None
url_for_id
classmethod
url_for_id(id_: int) -> str
Return the OpenML URL for the entity of this class with the given id.
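Since this is a classmethod, no instance is needed; the exact URL depends on the configured server. A sketch:

from openml.datasets import OpenMLDataset

print(OpenMLDataset.url_for_id(61))  # e.g. https://www.openml.org/d/61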
Source code in openml/base.py
@classmethod
def url_for_id(cls, id_: int) -> str:
    """Return the OpenML URL for the entity of this class with the given id."""
    # Sample url for a flow: openml.org/f/123
    return f"{openml.config.get_server_base_url()}/{cls._entity_letter()}/{id_}"