evaluations

OpenMLEvaluation

Contains all meta-information about a run / evaluation combination, according to the evaluation/list function

Parameters:

run_id : int, required
    Refers to the run.
task_id : int, required
    Refers to the task.
setup_id : int, required
    Refers to the setup.
flow_id : int, required
    Refers to the flow.
flow_name : str, required
    Name of the referred flow.
data_id : int, required
    Refers to the dataset.
data_name : str, required
    The name of the dataset.
function : str, required
    The evaluation metric of this item (e.g., accuracy).
upload_time : str, required
    The time of evaluation.
uploader : int, required
    Uploader ID (user ID).
uploader_name : str, required
    Name of the uploader of this evaluation.
value : float, required
    The value (score) of this evaluation.
values : List[float], required
    The values (scores) per repeat and fold (if requested).
array_data : str, default None
    List of information per class (e.g., in case of precision, auroc, recall).
Source code in openml/evaluations/evaluation.py
class OpenMLEvaluation:
    """
    Contains all meta-information about a run / evaluation combination,
    according to the evaluation/list function

    Parameters
    ----------
    run_id : int
        Refers to the run.
    task_id : int
        Refers to the task.
    setup_id : int
        Refers to the setup.
    flow_id : int
        Refers to the flow.
    flow_name : str
        Name of the referred flow.
    data_id : int
        Refers to the dataset.
    data_name : str
        The name of the dataset.
    function : str
        The evaluation metric of this item (e.g., accuracy).
    upload_time : str
        The time of evaluation.
    uploader : int
        Uploader ID (user ID)
    uploader_name : str
        Name of the uploader of this evaluation
    value : float
        The value (score) of this evaluation.
    values : List[float]
        The values (scores) per repeat and fold (if requested)
    array_data : str
        list of information per class.
        (e.g., in case of precision, auroc, recall)
    """

    def __init__(  # noqa: PLR0913
        self,
        run_id: int,
        task_id: int,
        setup_id: int,
        flow_id: int,
        flow_name: str,
        data_id: int,
        data_name: str,
        function: str,
        upload_time: str,
        uploader: int,
        uploader_name: str,
        value: float | None,
        values: list[float] | None,
        array_data: str | None = None,
    ):
        self.run_id = run_id
        self.task_id = task_id
        self.setup_id = setup_id
        self.flow_id = flow_id
        self.flow_name = flow_name
        self.data_id = data_id
        self.data_name = data_name
        self.function = function
        self.upload_time = upload_time
        self.uploader = uploader
        self.uploader_name = uploader_name
        self.value = value
        self.values = values
        self.array_data = array_data

    def __repr__(self) -> str:
        header = "OpenML Evaluation"
        header = "{}\n{}\n".format(header, "=" * len(header))

        fields = {
            "Upload Date": self.upload_time,
            "Run ID": self.run_id,
            "OpenML Run URL": openml.runs.OpenMLRun.url_for_id(self.run_id),
            "Task ID": self.task_id,
            "OpenML Task URL": openml.tasks.OpenMLTask.url_for_id(self.task_id),
            "Flow ID": self.flow_id,
            "OpenML Flow URL": openml.flows.OpenMLFlow.url_for_id(self.flow_id),
            "Setup ID": self.setup_id,
            "Data ID": self.data_id,
            "Data Name": self.data_name,
            "OpenML Data URL": openml.datasets.OpenMLDataset.url_for_id(self.data_id),
            "Metric Used": self.function,
            "Result": self.value,
        }

        order = [
            "Upload Date",
            "Run ID",
            "OpenML Run URL",
            "Task ID",
            "OpenML Task URL",
            "Flow ID",
            "OpenML Flow URL",
            "Setup ID",
            "Data ID",
            "Data Name",
            "OpenML Data URL",
            "Metric Used",
            "Result",
        ]
        _fields = [(key, fields[key]) for key in order if key in fields]

        longest_field_name_length = max(len(name) for name, _ in _fields)
        field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}"
        body = "\n".join(field_line_format.format(name, value) for name, value in _fields)
        return header + body
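
As a quick usage sketch (not part of the library source), the following retrieves a handful of evaluations through list_evaluations, documented further down this page, and prints one OpenMLEvaluation object, which exercises the __repr__ defined above. The metric name and the size limit are arbitrary example values.

import openml

# Fetch a few evaluations as OpenMLEvaluation objects (output_format="object").
# "predictive_accuracy" and size=5 are arbitrary example choices.
evals = openml.evaluations.list_evaluations(
    function="predictive_accuracy",
    size=5,
    output_format="object",
)

# The result is a dict of OpenMLEvaluation objects (keyed by run id);
# printing an entry uses the __repr__ above.
first_eval = next(iter(evals.values()))
print(first_eval)
print(first_eval.function, first_eval.data_name, first_eval.value)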

list_evaluation_measures()

Return list of evaluation measures available.

The function performs an API call to retrieve the entire list of evaluation measures that are available.

Returns:

list
    The list of available evaluation measure names.
Source code in openml/evaluations/functions.py
def list_evaluation_measures() -> list[str]:
    """Return list of evaluation measures available.

    The function performs an API call to retrieve the entire list of
    evaluation measures that are available.

    Returns
    -------
    list

    """
    api_call = "evaluationmeasure/list"
    xml_string = openml._api_calls._perform_api_call(api_call, "get")
    qualities = xmltodict.parse(xml_string, force_list=("oml:measures"))
    # Minimalistic check if the XML is useful
    if "oml:evaluation_measures" not in qualities:
        raise ValueError("Error in return XML, does not contain " '"oml:evaluation_measures"')
    if not isinstance(qualities["oml:evaluation_measures"]["oml:measures"][0]["oml:measure"], list):
        raise TypeError("Error in return XML, does not contain " '"oml:measure" as a list')
    return qualities["oml:evaluation_measures"]["oml:measures"][0]["oml:measure"]
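
A minimal call sketch, assuming the client is configured to reach the OpenML server:

import openml

# Retrieve the full list of evaluation measure names known to the server.
measures = openml.evaluations.list_evaluation_measures()
print(len(measures), "evaluation measures available")
print("predictive_accuracy" in measures)  # the metric used in the examples on this page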

list_evaluations(function, offset=None, size=10000, tasks=None, setups=None, flows=None, runs=None, uploaders=None, tag=None, study=None, per_fold=None, sort_order=None, output_format='object')

list_evaluations(function: str, offset: int | None = ..., size: int | None = ..., tasks: list[str | int] | None = ..., setups: list[str | int] | None = ..., flows: list[str | int] | None = ..., runs: list[str | int] | None = ..., uploaders: list[str | int] | None = ..., tag: str | None = ..., study: int | None = ..., per_fold: bool | None = ..., sort_order: str | None = ..., output_format: Literal['dict', 'object'] = 'dict') -> dict
list_evaluations(function: str, offset: int | None = ..., size: int | None = ..., tasks: list[str | int] | None = ..., setups: list[str | int] | None = ..., flows: list[str | int] | None = ..., runs: list[str | int] | None = ..., uploaders: list[str | int] | None = ..., tag: str | None = ..., study: int | None = ..., per_fold: bool | None = ..., sort_order: str | None = ..., output_format: Literal['dataframe'] = ...) -> pd.DataFrame

List all run-evaluation pairs matching all of the given filters. (Supports a large number of results.)

Parameters:

function : str, required
    The evaluation function, e.g., predictive_accuracy.
offset : int, default None
    The number of runs to skip, starting from the first.
size : int, default 10000
    The maximum number of runs to show. If set to None, all results are returned.
tasks : list[str | int] | None, default None
    The list of task IDs.
setups : list[str | int] | None, default None
    The list of setup IDs.
flows : list[str | int] | None, default None
    The list of flow IDs.
runs : list[str | int] | None, default None
    The list of run IDs.
uploaders : list[str | int] | None, default None
    The list of uploader IDs.
tag : str, default None
    Filter evaluations based on the given tag.
study : int, default None
    Filter evaluations based on the given study ID.
per_fold : bool, default None
    Whether to return the evaluation values per repeat and fold.
sort_order : str, default None
    Order of sorting evaluations, ascending ("asc") or descending ("desc").
output_format : Literal['object', 'dict', 'dataframe'], default 'object'
    Decides the format of the output:
    - If 'object', the output is a dict of OpenMLEvaluation objects.
    - If 'dict', the output is a dict of dict.
    - If 'dataframe', the output is a pandas DataFrame.

Returns:

dict or pandas DataFrame, depending on output_format.
Source code in openml/evaluations/functions.py
def list_evaluations(
    function: str,
    offset: int | None = None,
    size: int | None = 10000,
    tasks: list[str | int] | None = None,
    setups: list[str | int] | None = None,
    flows: list[str | int] | None = None,
    runs: list[str | int] | None = None,
    uploaders: list[str | int] | None = None,
    tag: str | None = None,
    study: int | None = None,
    per_fold: bool | None = None,
    sort_order: str | None = None,
    output_format: Literal["object", "dict", "dataframe"] = "object",
) -> dict | pd.DataFrame:
    """
    List all run-evaluation pairs matching all of the given filters.
    (Supports a large number of results.)

    Parameters
    ----------
    function : str
        the evaluation function. e.g., predictive_accuracy
    offset : int, optional
        the number of runs to skip, starting from the first
    size : int, default 10000
        The maximum number of runs to show.
        If set to ``None``, it returns all the results.

    tasks : list[int,str], optional
        the list of task IDs
    setups: list[int,str], optional
        the list of setup IDs
    flows : list[int,str], optional
        the list of flow IDs
    runs :list[int,str], optional
        the list of run IDs
    uploaders : list[int,str], optional
        the list of uploader IDs
    tag : str, optional
        filter evaluation based on given tag

    study : int, optional
        Filter evaluations based on the given study ID.
    per_fold : bool, optional
        Whether to return the evaluation values per repeat and fold.
    sort_order : str, optional
       order of sorting evaluations, ascending ("asc") or descending ("desc")

    output_format: str, optional (default='object')
        The parameter decides the format of the output.
        - If 'object' the output is a dict of OpenMLEvaluation objects
        - If 'dict' the output is a dict of dict
        - If 'dataframe' the output is a pandas DataFrame

    Returns
    -------
    dict or dataframe
    """
    if output_format not in ["dataframe", "dict", "object"]:
        raise ValueError(
            "Invalid output format selected. Only 'object', 'dataframe', or 'dict' applicable.",
        )

    # TODO: [0.15]
    if output_format == "dict":
        msg = (
            "Support for `output_format` of 'dict' will be removed in 0.15. "
            "To ensure your code will continue to work, "
            "use `output_format`='dataframe' or `output_format`='object'."
        )
        warnings.warn(msg, category=FutureWarning, stacklevel=2)

    per_fold_str = None
    if per_fold is not None:
        per_fold_str = str(per_fold).lower()

    return openml.utils._list_all(  # type: ignore
        list_output_format=output_format,  # type: ignore
        listing_call=_list_evaluations,
        function=function,
        offset=offset,
        size=size,
        tasks=tasks,
        setups=setups,
        flows=flows,
        runs=runs,
        uploaders=uploaders,
        tag=tag,
        study=study,
        sort_order=sort_order,
        per_fold=per_fold_str,
    )
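
For example, the sketch below lists predictive accuracy results for a single task as a pandas DataFrame; the task id and size are placeholder values for illustration, and the DataFrame columns follow the OpenMLEvaluation attributes listed above.

import openml

# Placeholder filters: task id 31 and size=100 are example values only.
df = openml.evaluations.list_evaluations(
    function="predictive_accuracy",
    tasks=[31],
    size=100,
    sort_order="desc",
    output_format="dataframe",
)
print(df[["run_id", "flow_name", "value"]].head())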

list_evaluations_setups(function, offset=None, size=None, tasks=None, setups=None, flows=None, runs=None, uploaders=None, tag=None, per_fold=None, sort_order=None, output_format='dataframe', parameters_in_separate_columns=False)

List all run-evaluation pairs matching all of the given filters and their hyperparameter settings.

Parameters:

function : str, required
    The evaluation function, e.g., predictive_accuracy.
offset : int, default None
    The number of runs to skip, starting from the first.
size : int, default None
    The maximum number of runs to show.
tasks : list[int], default None
    The list of task IDs.
setups : list[int], default None
    The list of setup IDs.
flows : list[int], default None
    The list of flow IDs.
runs : list[int], default None
    The list of run IDs.
uploaders : list[int], default None
    The list of uploader IDs.
tag : str, default None
    Filter evaluations based on the given tag.
per_fold : bool, default None
    Whether to return the evaluation values per repeat and fold.
sort_order : str, default None
    Order of sorting evaluations, ascending ("asc") or descending ("desc").
output_format : str, default 'dataframe'
    Decides the format of the output:
    - If 'dict', the output is a dict of dict.
    - If 'dataframe', the output is a pandas DataFrame.
parameters_in_separate_columns : bool, default False
    Returns hyperparameters in separate columns if set to True. Valid only for a single flow.

Returns:

dict or dataframe with hyperparameter settings as a dict of {hyperparameter: value}.
Source code in openml/evaluations/functions.py
def list_evaluations_setups(
    function: str,
    offset: int | None = None,
    size: int | None = None,
    tasks: list | None = None,
    setups: list | None = None,
    flows: list | None = None,
    runs: list | None = None,
    uploaders: list | None = None,
    tag: str | None = None,
    per_fold: bool | None = None,
    sort_order: str | None = None,
    output_format: str = "dataframe",
    parameters_in_separate_columns: bool = False,  # noqa: FBT001, FBT002
) -> dict | pd.DataFrame:
    """
    List all run-evaluation pairs matching all of the given filters
    and their hyperparameter settings.

    Parameters
    ----------
    function : str
        the evaluation function. e.g., predictive_accuracy
    offset : int, optional
        the number of runs to skip, starting from the first
    size : int, optional
        the maximum number of runs to show
    tasks : list[int], optional
        the list of task IDs
    setups: list[int], optional
        the list of setup IDs
    flows : list[int], optional
        the list of flow IDs
    runs : list[int], optional
        the list of run IDs
    uploaders : list[int], optional
        the list of uploader IDs
    tag : str, optional
        filter evaluation based on given tag
    per_fold : bool, optional
        Whether to return the evaluation values per repeat and fold.
    sort_order : str, optional
       order of sorting evaluations, ascending ("asc") or descending ("desc")
    output_format: str, optional (default='dataframe')
        The parameter decides the format of the output.
        - If 'dict' the output is a dict of dict
        - If 'dataframe' the output is a pandas DataFrame
    parameters_in_separate_columns: bool, optional (default= False)
        Returns hyperparameters in separate columns if set to True.
        Valid only for a single flow


    Returns
    -------
    dict or dataframe with hyperparameter settings as a dict of {hyperparameter: value}.
    """
    if parameters_in_separate_columns and (flows is None or len(flows) != 1):
        raise ValueError(
            "Can set parameters_in_separate_columns to true " "only for single flow_id",
        )

    # List evaluations
    evals = list_evaluations(
        function=function,
        offset=offset,
        size=size,
        runs=runs,
        tasks=tasks,
        setups=setups,
        flows=flows,
        uploaders=uploaders,
        tag=tag,
        per_fold=per_fold,
        sort_order=sort_order,
        output_format="dataframe",
    )
    # List setups
    # list_setups by setup id does not support large sizes (exceeds URL length limit)
    # Hence we split the list of unique setup ids returned by list_evaluations into chunks of size N
    _df = pd.DataFrame()
    if len(evals) != 0:
        N = 100  # maximum number of setup ids per chunk
        length = len(evals["setup_id"].unique())  # number of unique setup ids to split
        # np.array_split produces ceil(length / N) chunks of at most N setup ids each
        # (chunk sizes may differ by one when the split is not exact).
        uniq = np.asarray(evals["setup_id"].unique())
        setup_chunks = np.array_split(uniq, ((length - 1) // N) + 1)
        setup_data = pd.DataFrame()
        for _setups in setup_chunks:
            result = openml.setups.list_setups(setup=_setups, output_format="dataframe")
            assert isinstance(result, pd.DataFrame)
            result = result.drop("flow_id", axis=1)
            # concatenate the resulting setup chunks into a single dataframe
            setup_data = pd.concat([setup_data, result], ignore_index=True)

        parameters = []
        # Convert the parameters of each setup into a dict of {hyperparameter: value}
        for parameter_dict in setup_data["parameters"]:
            if parameter_dict is not None:
                parameters.append(
                    {param["full_name"]: param["value"] for param in parameter_dict.values()},
                )
            else:
                parameters.append({})
        setup_data["parameters"] = parameters
        # Merge setups with evaluations
        _df = evals.merge(setup_data, on="setup_id", how="left")

    if parameters_in_separate_columns:
        _df = pd.concat(
            [_df.drop("parameters", axis=1), _df["parameters"].apply(pd.Series)],
            axis=1,
        )

    if output_format == "dataframe":
        return _df

    return _df.to_dict(orient="index")
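
A sketch of how parameters_in_separate_columns might be used; the flow id is a placeholder and must refer to exactly one flow of interest, as enforced by the check above.

import openml

flow_id = 8365  # placeholder; substitute a flow id you actually want to inspect

df = openml.evaluations.list_evaluations_setups(
    function="predictive_accuracy",
    flows=[flow_id],
    size=50,
    output_format="dataframe",
    parameters_in_separate_columns=True,  # one column per hyperparameter
)
print(df.columns.tolist())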