evaluations

OpenMLEvaluation

Contains all meta-information about a run / evaluation combination, according to the evaluation/list function

Parameters:

run_id : int, required
    Refers to the run.
task_id : int, required
    Refers to the task.
setup_id : int, required
    Refers to the setup.
flow_id : int, required
    Refers to the flow.
flow_name : str, required
    Name of the referred flow.
data_id : int, required
    Refers to the dataset.
data_name : str, required
    The name of the dataset.
function : str, required
    The evaluation metric of this item (e.g., accuracy).
upload_time : str, required
    The time of evaluation.
uploader : int, required
    Uploader ID (user ID).
uploader_name : str, required
    Name of the uploader of this evaluation.
value : float, required
    The value (score) of this evaluation.
values : List[float], required
    The values (scores) per repeat and fold (if requested).
array_data : str, default None
    List of information per class (e.g., in case of precision, auroc, recall).
Source code in openml/evaluations/evaluation.py
class OpenMLEvaluation:
    """
    Contains all meta-information about a run / evaluation combination,
    according to the evaluation/list function

    Parameters
    ----------
    run_id : int
        Refers to the run.
    task_id : int
        Refers to the task.
    setup_id : int
        Refers to the setup.
    flow_id : int
        Refers to the flow.
    flow_name : str
        Name of the referred flow.
    data_id : int
        Refers to the dataset.
    data_name : str
        The name of the dataset.
    function : str
        The evaluation metric of this item (e.g., accuracy).
    upload_time : str
        The time of evaluation.
    uploader : int
        Uploader ID (user ID)
    uploader_name : str
        Name of the uploader of this evaluation
    value : float
        The value (score) of this evaluation.
    values : List[float]
        The values (scores) per repeat and fold (if requested)
    array_data : str
        list of information per class.
        (e.g., in case of precision, auroc, recall)
    """

    def __init__(  # noqa: PLR0913
        self,
        run_id: int,
        task_id: int,
        setup_id: int,
        flow_id: int,
        flow_name: str,
        data_id: int,
        data_name: str,
        function: str,
        upload_time: str,
        uploader: int,
        uploader_name: str,
        value: float | None,
        values: list[float] | None,
        array_data: str | None = None,
    ):
        self.run_id = run_id
        self.task_id = task_id
        self.setup_id = setup_id
        self.flow_id = flow_id
        self.flow_name = flow_name
        self.data_id = data_id
        self.data_name = data_name
        self.function = function
        self.upload_time = upload_time
        self.uploader = uploader
        self.uploader_name = uploader_name
        self.value = value
        self.values = values
        self.array_data = array_data

    def __repr__(self) -> str:
        header = "OpenML Evaluation"
        header = "{}\n{}\n".format(header, "=" * len(header))

        fields = {
            "Upload Date": self.upload_time,
            "Run ID": self.run_id,
            "OpenML Run URL": openml.runs.OpenMLRun.url_for_id(self.run_id),
            "Task ID": self.task_id,
            "OpenML Task URL": openml.tasks.OpenMLTask.url_for_id(self.task_id),
            "Flow ID": self.flow_id,
            "OpenML Flow URL": openml.flows.OpenMLFlow.url_for_id(self.flow_id),
            "Setup ID": self.setup_id,
            "Data ID": self.data_id,
            "Data Name": self.data_name,
            "OpenML Data URL": openml.datasets.OpenMLDataset.url_for_id(self.data_id),
            "Metric Used": self.function,
            "Result": self.value,
        }

        order = [
            "Upload Date",
            "Run ID",
            "OpenML Run URL",
            "Task ID",
            "OpenML Task URL",
            "Flow ID",
            "OpenML Flow URL",
            "Setup ID",
            "Data ID",
            "Data Name",
            "OpenML Data URL",
            "Metric Used",
            "Result",
        ]
        _fields = [(key, fields[key]) for key in order if key in fields]

        longest_field_name_length = max(len(name) for name, _ in _fields)
        field_line_format = f"{{:.<{longest_field_name_length}}}: {{}}"
        body = "\n".join(field_line_format.format(name, value) for name, value in _fields)
        return header + body
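
As a quick usage sketch (not part of the library source), the following retrieves a handful of evaluations through list_evaluations, documented further down this page, and prints one OpenMLEvaluation object, which exercises the __repr__ defined above. The metric name and the size limit are arbitrary example values.

import openml

# Fetch a few evaluations as OpenMLEvaluation objects (output_format="object").
# "predictive_accuracy" and size=5 are arbitrary example choices.
evals = openml.evaluations.list_evaluations(
    function="predictive_accuracy",
    size=5,
    output_format="object",
)

# The result is a dict of OpenMLEvaluation objects (keyed by run id);
# printing an entry uses the __repr__ above.
first_eval = next(iter(evals.values()))
print(first_eval)
print(first_eval.function, first_eval.data_name, first_eval.value)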

list_evaluation_measures()

Return list of evaluation measures available.

The function performs an API call to retrieve the entire list of evaluation measures that are available.

Returns:

list
    The list of available evaluation measure names.
Source code in openml/evaluations/functions.py
def list_evaluation_measures() -> list[str]:
    """Return list of evaluation measures available.

    The function performs an API call to retrieve the entire list of
    evaluation measures that are available.

    Returns
    -------
    list

    """
    api_call = "evaluationmeasure/list"
    xml_string = openml._api_calls._perform_api_call(api_call, "get")
    qualities = xmltodict.parse(xml_string, force_list=("oml:measures"))
    # Minimalistic check if the XML is useful
    if "oml:evaluation_measures" not in qualities:
        raise ValueError("Error in return XML, does not contain " '"oml:evaluation_measures"')
    if not isinstance(qualities["oml:evaluation_measures"]["oml:measures"][0]["oml:measure"], list):
        raise TypeError("Error in return XML, does not contain " '"oml:measure" as a list')
    return qualities["oml:evaluation_measures"]["oml:measures"][0]["oml:measure"]
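
A minimal call sketch, assuming the client is configured to reach the OpenML server:

import openml

# Retrieve the full list of evaluation measure names known to the server.
measures = openml.evaluations.list_evaluation_measures()
print(len(measures), "evaluation measures available")
print("predictive_accuracy" in measures)  # the metric used in the examples on this page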

list_evaluations(function, offset=None, size=10000, tasks=None, setups=None, flows=None, runs=None, uploaders=None, tag=None, study=None, per_fold=None, sort_order=None, output_format='object')

list_evaluations(function: str, offset: int | None = ..., size: int | None = ..., tasks: list[str | int] | None = ..., setups: list[str | int] | None = ..., flows: list[str | int] | None = ..., runs: list[str | int] | None = ..., uploaders: list[str | int] | None = ..., tag: str | None = ..., study: int | None = ..., per_fold: bool | None = ..., sort_order: str | None = ..., output_format: Literal['dict', 'object'] = 'dict') -> dict
list_evaluations(function: str, offset: int | None = ..., size: int | None = ..., tasks: list[str | int] | None = ..., setups: list[str | int] | None = ..., flows: list[str | int] | None = ..., runs: list[str | int] | None = ..., uploaders: list[str | int] | None = ..., tag: str | None = ..., study: int | None = ..., per_fold: bool | None = ..., sort_order: str | None = ..., output_format: Literal['dataframe'] = ...) -> pd.DataFrame

List all run-evaluation pairs matching all of the given filters. (Supports a large number of results.)

Parameters:

function : str, required
    The evaluation function, e.g., predictive_accuracy.
offset : int, default None
    The number of runs to skip, starting from the first.
size : int, default 10000
    The maximum number of runs to show. If set to None, all results are returned.
tasks : list[str | int] | None, default None
    The list of task IDs.
setups : list[str | int] | None, default None
    The list of setup IDs.
flows : list[str | int] | None, default None
    The list of flow IDs.
runs : list[str | int] | None, default None
    The list of run IDs.
uploaders : list[str | int] | None, default None
    The list of uploader IDs.
tag : str, default None
    Filter evaluations based on the given tag.
study : int, default None
    Filter evaluations based on the given study ID.
per_fold : bool, default None
    Whether to return the evaluation values per repeat and fold.
sort_order : str, default None
    Order of sorting evaluations, ascending ("asc") or descending ("desc").
output_format : Literal['object', 'dict', 'dataframe'], default 'object'
    Decides the format of the output:
    - If 'object', the output is a dict of OpenMLEvaluation objects.
    - If 'dict', the output is a dict of dict.
    - If 'dataframe', the output is a pandas DataFrame.

Returns:

dict or pandas DataFrame, depending on output_format.
Source code in openml/evaluations/functions.py
def list_evaluations(
    function: str,
    offset: int | None = None,
    size: int | None = 10000,
    tasks: list[str | int] | None = None,
    setups: list[str | int] | None = None,
    flows: list[str | int] | None = None,
    runs: list[str | int] | None = None,
    uploaders: list[str | int] | None = None,
    tag: str | None = None,
    study: int | None = None,
    per_fold: bool | None = None,
    sort_order: str | None = None,
    output_format: Literal["object", "dict", "dataframe"] = "object",
) -> dict | pd.DataFrame:
    """
    List all run-evaluation pairs matching all of the given filters.
    (Supports a large number of results.)

    Parameters
    ----------
    function : str
        the evaluation function. e.g., predictive_accuracy
    offset : int, optional
        the number of runs to skip, starting from the first
    size : int, default 10000
        The maximum number of runs to show.
        If set to ``None``, it returns all the results.

    tasks : list[int,str], optional
        the list of task IDs
    setups: list[int,str], optional
        the list of setup IDs
    flows : list[int,str], optional
        the list of flow IDs
    runs :list[int,str], optional
        the list of run IDs
    uploaders : list[int,str], optional
        the list of uploader IDs
    tag : str, optional
        filter evaluation based on given tag

    study : int, optional
        Filter evaluations based on the given study ID.
    per_fold : bool, optional
        Whether to return the evaluation values per repeat and fold.
    sort_order : str, optional
       order of sorting evaluations, ascending ("asc") or descending ("desc")

    output_format: str, optional (default='object')
        The parameter decides the format of the output.
        - If 'object' the output is a dict of OpenMLEvaluation objects
        - If 'dict' the output is a dict of dict
        - If 'dataframe' the output is a pandas DataFrame

    Returns
    -------
    dict or dataframe
    """
    if output_format not in ["dataframe", "dict", "object"]:
        raise ValueError(
            "Invalid output format selected. Only 'object', 'dataframe', or 'dict' applicable.",
        )

    # TODO: [0.15]
    if output_format == "dict":
        msg = (
            "Support for `output_format` of 'dict' will be removed in 0.15. "
            "To ensure your code will continue to work, "
            "use `output_format`='dataframe' or `output_format`='object'."
        )
        warnings.warn(msg, category=FutureWarning, stacklevel=2)

    per_fold_str = None
    if per_fold is not None:
        per_fold_str = str(per_fold).lower()

    return openml.utils._list_all(  # type: ignore
        list_output_format=output_format,  # type: ignore
        listing_call=_list_evaluations,
        function=function,
        offset=offset,
        size=size,
        tasks=tasks,
        setups=setups,
        flows=flows,
        runs=runs,
        uploaders=uploaders,
        tag=tag,
        study=study,
        sort_order=sort_order,
        per_fold=per_fold_str,
    )
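
For example, the sketch below lists predictive accuracy results for a single task as a pandas DataFrame; the task id and size are placeholder values for illustration, and the DataFrame columns follow the OpenMLEvaluation attributes listed above.

import openml

# Placeholder filters: task id 31 and size=100 are example values only.
df = openml.evaluations.list_evaluations(
    function="predictive_accuracy",
    tasks=[31],
    size=100,
    sort_order="desc",
    output_format="dataframe",
)
print(df[["run_id", "flow_name", "value"]].head())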

list_evaluations_setups(function, offset=None, size=None, tasks=None, setups=None, flows=None, runs=None, uploaders=None, tag=None, per_fold=None, sort_order=None, output_format='dataframe', parameters_in_separate_columns=False)

List all run-evaluation pairs matching all of the given filters and their hyperparameter settings.

Parameters:

function : str, required
    The evaluation function, e.g., predictive_accuracy.
offset : int, default None
    The number of runs to skip, starting from the first.
size : int, default None
    The maximum number of runs to show.
tasks : list[int], default None
    The list of task IDs.
setups : list[int], default None
    The list of setup IDs.
flows : list[int], default None
    The list of flow IDs.
runs : list[int], default None
    The list of run IDs.
uploaders : list[int], default None
    The list of uploader IDs.
tag : str, default None
    Filter evaluations based on the given tag.
per_fold : bool, default None
    Whether to return the evaluation values per repeat and fold.
sort_order : str, default None
    Order of sorting evaluations, ascending ("asc") or descending ("desc").
output_format : str, default 'dataframe'
    Decides the format of the output:
    - If 'dict', the output is a dict of dict.
    - If 'dataframe', the output is a pandas DataFrame.
parameters_in_separate_columns : bool, default False
    Returns hyperparameters in separate columns if set to True. Valid only for a single flow.

Returns:

dict or dataframe with hyperparameter settings as a dict of {hyperparameter: value}.
Source code in openml/evaluations/functions.py
def list_evaluations_setups(
    function: str,
    offset: int | None = None,
    size: int | None = None,
    tasks: list | None = None,
    setups: list | None = None,
    flows: list | None = None,
    runs: list | None = None,
    uploaders: list | None = None,
    tag: str | None = None,
    per_fold: bool | None = None,
    sort_order: str | None = None,
    output_format: str = "dataframe",
    parameters_in_separate_columns: bool = False,  # noqa: FBT001, FBT002
) -> dict | pd.DataFrame:
    """
    List all run-evaluation pairs matching all of the given filters
    and their hyperparameter settings.

    Parameters
    ----------
    function : str
        the evaluation function. e.g., predictive_accuracy
    offset : int, optional
        the number of runs to skip, starting from the first
    size : int, optional
        the maximum number of runs to show
    tasks : list[int], optional
        the list of task IDs
    setups: list[int], optional
        the list of setup IDs
    flows : list[int], optional
        the list of flow IDs
    runs : list[int], optional
        the list of run IDs
    uploaders : list[int], optional
        the list of uploader IDs
    tag : str, optional
        filter evaluation based on given tag
    per_fold : bool, optional
        Whether to return the evaluation values per repeat and fold.
    sort_order : str, optional
       order of sorting evaluations, ascending ("asc") or descending ("desc")
    output_format: str, optional (default='dataframe')
        The parameter decides the format of the output.
        - If 'dict' the output is a dict of dict
        - If 'dataframe' the output is a pandas DataFrame
    parameters_in_separate_columns: bool, optional (default= False)
        Returns hyperparameters in separate columns if set to True.
        Valid only for a single flow


    Returns
    -------
    dict or dataframe with hyperparameter settings as a dict of {hyperparameter: value}.
    """
    if parameters_in_separate_columns and (flows is None or len(flows) != 1):
        raise ValueError(
            "Can set parameters_in_separate_columns to true " "only for single flow_id",
        )

    # List evaluations
    evals = list_evaluations(
        function=function,
        offset=offset,
        size=size,
        runs=runs,
        tasks=tasks,
        setups=setups,
        flows=flows,
        uploaders=uploaders,
        tag=tag,
        per_fold=per_fold,
        sort_order=sort_order,
        output_format="dataframe",
    )
    # List setups
    # list_setups by setup id does not support large sizes (exceeds URL length limit)
    # Hence we split the list of unique setup ids returned by list_evaluations into chunks of size N
    _df = pd.DataFrame()
    if len(evals) != 0:
        N = 100  # maximum number of setup ids per chunk
        length = len(evals["setup_id"].unique())  # number of unique setup ids to split
        # np.array_split produces ceil(length / N) chunks of at most N setup ids each
        # (chunk sizes may differ by one when the split is not exact).
        uniq = np.asarray(evals["setup_id"].unique())
        setup_chunks = np.array_split(uniq, ((length - 1) // N) + 1)
        setup_data = pd.DataFrame()
        for _setups in setup_chunks:
            result = openml.setups.list_setups(setup=_setups, output_format="dataframe")
            assert isinstance(result, pd.DataFrame)
            result = result.drop("flow_id", axis=1)
            # concatenate the resulting setup chunks into a single dataframe
            setup_data = pd.concat([setup_data, result], ignore_index=True)

        parameters = []
        # Convert the parameters of each setup into a dict of {hyperparameter: value}
        for parameter_dict in setup_data["parameters"]:
            if parameter_dict is not None:
                parameters.append(
                    {param["full_name"]: param["value"] for param in parameter_dict.values()},
                )
            else:
                parameters.append({})
        setup_data["parameters"] = parameters
        # Merge setups with evaluations
        _df = evals.merge(setup_data, on="setup_id", how="left")

    if parameters_in_separate_columns:
        _df = pd.concat(
            [_df.drop("parameters", axis=1), _df["parameters"].apply(pd.Series)],
            axis=1,
        )

    if output_format == "dataframe":
        return _df

    return _df.to_dict(orient="index")
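
A sketch of how parameters_in_separate_columns might be used; the flow id is a placeholder and must refer to exactly one flow of interest, as enforced by the check above.

import openml

flow_id = 8365  # placeholder; substitute a flow id you actually want to inspect

df = openml.evaluations.list_evaluations_setups(
    function="predictive_accuracy",
    flows=[flow_id],
    size=50,
    output_format="dataframe",
    parameters_in_separate_columns=True,  # one column per hyperparameter
)
print(df.columns.tolist())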