openml.evaluations #

OpenMLEvaluation dataclass #

OpenMLEvaluation(run_id: int, task_id: int, setup_id: int, flow_id: int, flow_name: str, data_id: int, data_name: str, function: str, upload_time: str, uploader: int, uploader_name: str, value: float | None, values: list[float] | None, array_data: str | None = None)

Contains all meta-information about a run/evaluation combination, as returned by the evaluation/list API call.

PARAMETER DESCRIPTION
run_id

Refers to the run.

TYPE: int

task_id

Refers to the task.

TYPE: int

setup_id

Refers to the setup.

TYPE: int

flow_id

Refers to the flow.

TYPE: int

flow_name

Name of the referred flow.

TYPE: str

data_id

Refers to the dataset.

TYPE: int

data_name

The name of the dataset.

TYPE: str

function

The evaluation metric of this item (e.g., accuracy).

TYPE: str

upload_time

The time of evaluation.

TYPE: str

uploader

Uploader ID (user ID)

TYPE: int

uploader_name

Name of the uploader of this evaluation

TYPE: str

value

The value (score) of this evaluation.

TYPE: float | None

values

The values (scores) per repeat and fold (if requested)

TYPE: list[float] | None

array_data

List of per-class information (e.g., for precision, auroc, recall).

TYPE: str | None DEFAULT: None
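
A minimal usage sketch, assuming the openml package is installed and the OpenML server is reachable; the metric name and size limit are illustrative values, not defaults. It shows how the fields documented above can be read off the OpenMLEvaluation objects returned by list_evaluations (documented below).

import openml

# Illustrative: list a handful of evaluations and inspect the dataclass fields.
evals = openml.evaluations.list_evaluations(
    function="predictive_accuracy",  # example metric; see list_evaluation_measures()
    size=10,                         # example limit
    output_format="object",
)
for run_id, evaluation in evals.items():
    # Each value is an OpenMLEvaluation with the attributes documented above.
    print(evaluation.run_id, evaluation.data_name, evaluation.function, evaluation.value)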

list_evaluation_measures #

list_evaluation_measures() -> list[str]

Return list of evaluation measures available.

The function performs an API call to retrieve the entire list of evaluation measures that are available.

RETURNS DESCRIPTION
list
Source code in openml/evaluations/functions.py
def list_evaluation_measures() -> list[str]:
    """Return list of evaluation measures available.

    The function performs an API call to retrieve the entire list of
    evaluation measures that are available.

    Returns
    -------
    list

    """
    api_call = "evaluationmeasure/list"
    xml_string = openml._api_calls._perform_api_call(api_call, "get")
    qualities = xmltodict.parse(xml_string, force_list=("oml:measures"))
    # Minimalistic check if the XML is useful
    if "oml:evaluation_measures" not in qualities:
        raise ValueError('Error in return XML, does not contain "oml:evaluation_measures"')

    if not isinstance(qualities["oml:evaluation_measures"]["oml:measures"][0]["oml:measure"], list):
        raise TypeError('Error in return XML, does not contain "oml:measure" as a list')

    return qualities["oml:evaluation_measures"]["oml:measures"][0]["oml:measure"]
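
A short usage sketch, assuming a reachable OpenML server; it fetches the list of measure names and checks whether a familiar one is present.

import openml

# Retrieve all evaluation measure names known to the server.
measures = openml.evaluations.list_evaluation_measures()
print(f"{len(measures)} measures available")
print("predictive_accuracy" in measures)  # membership check on the returned list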

list_evaluations #

list_evaluations(function: str, offset: int | None = None, size: int | None = None, tasks: list[str | int] | None = None, setups: list[str | int] | None = None, flows: list[str | int] | None = None, runs: list[str | int] | None = None, uploaders: list[str | int] | None = None, tag: str | None = None, study: int | None = None, per_fold: bool | None = None, sort_order: str | None = None, output_format: Literal['dataframe'] = ...) -> DataFrame
list_evaluations(function: str, offset: int | None = None, size: int | None = None, tasks: list[str | int] | None = None, setups: list[str | int] | None = None, flows: list[str | int] | None = None, runs: list[str | int] | None = None, uploaders: list[str | int] | None = None, tag: str | None = None, study: int | None = None, per_fold: bool | None = None, sort_order: str | None = None, output_format: Literal['object'] = 'object') -> dict[int, OpenMLEvaluation]
list_evaluations(function: str, offset: int | None = None, size: int | None = None, tasks: list[str | int] | None = None, setups: list[str | int] | None = None, flows: list[str | int] | None = None, runs: list[str | int] | None = None, uploaders: list[str | int] | None = None, tag: str | None = None, study: int | None = None, per_fold: bool | None = None, sort_order: str | None = None, output_format: Literal['object', 'dataframe'] = 'object') -> dict[int, OpenMLEvaluation] | DataFrame

List all run-evaluation pairs matching all of the given filters.

(Supports a large number of results.)

PARAMETER DESCRIPTION
function

the evaluation function. e.g., predictive_accuracy

TYPE: str

offset

the number of runs to skip, starting from the first

TYPE: int DEFAULT: None

size

The maximum number of runs to show. If set to None, it returns all the results.

TYPE: int | None DEFAULT: None

tasks

the list of task IDs

TYPE: list[str | int] | None DEFAULT: None

setups

the list of setup IDs

TYPE: list[str | int] | None DEFAULT: None

flows

the list of flow IDs

TYPE: list[str | int] | None DEFAULT: None

runs

the list of run IDs

TYPE: list[str | int] | None DEFAULT: None

uploaders

the list of uploader IDs

TYPE: list[str | int] | None DEFAULT: None

tag

filter evaluation based on given tag

TYPE: str DEFAULT: None

study

filter evaluations that belong to the given study ID

TYPE: int DEFAULT: None

per_fold

whether to return the scores per repeat and fold (see the values field of OpenMLEvaluation) instead of a single aggregated value

TYPE: bool DEFAULT: None

sort_order

order of sorting evaluations, ascending ("asc") or descending ("desc")

TYPE: str DEFAULT: None

output_format

The parameter decides the format of the output.
- If 'object' the output is a dict of OpenMLEvaluation objects
- If 'dataframe' the output is a pandas DataFrame

TYPE: Literal['object', 'dataframe'] DEFAULT: 'object'

RETURNS DESCRIPTION
dict or dataframe
Source code in openml/evaluations/functions.py
def list_evaluations(
    function: str,
    offset: int | None = None,
    size: int | None = None,
    tasks: list[str | int] | None = None,
    setups: list[str | int] | None = None,
    flows: list[str | int] | None = None,
    runs: list[str | int] | None = None,
    uploaders: list[str | int] | None = None,
    tag: str | None = None,
    study: int | None = None,
    per_fold: bool | None = None,
    sort_order: str | None = None,
    output_format: Literal["object", "dataframe"] = "object",
) -> dict[int, OpenMLEvaluation] | pd.DataFrame:
    """List all run-evaluation pairs matching all of the given filters.

    (Supports large amount of results)

    Parameters
    ----------
    function : str
        the evaluation function. e.g., predictive_accuracy
    offset : int, optional
        the number of runs to skip, starting from the first
    size : int, default 10000
        The maximum number of runs to show.
        If set to ``None``, it returns all the results.

    tasks : list[int,str], optional
        the list of task IDs
    setups: list[int,str], optional
        the list of setup IDs
    flows : list[int,str], optional
        the list of flow IDs
    runs :list[int,str], optional
        the list of run IDs
    uploaders : list[int,str], optional
        the list of uploader IDs
    tag : str, optional
        filter evaluation based on given tag

    study : int, optional

    per_fold : bool, optional

    sort_order : str, optional
       order of sorting evaluations, ascending ("asc") or descending ("desc")

    output_format: str, optional (default='object')
        The parameter decides the format of the output.
        - If 'object' the output is a dict of OpenMLEvaluation objects
        - If 'dataframe' the output is a pandas DataFrame

    Returns
    -------
    dict or dataframe
    """
    if output_format not in ("dataframe", "object"):
        raise ValueError("Invalid output format. Only 'object', 'dataframe'.")

    per_fold_str = None
    if per_fold is not None:
        per_fold_str = str(per_fold).lower()

    listing_call = partial(
        _list_evaluations,
        function=function,
        tasks=tasks,
        setups=setups,
        flows=flows,
        runs=runs,
        uploaders=uploaders,
        tag=tag,
        study=study,
        sort_order=sort_order,
        per_fold=per_fold_str,
    )
    eval_collection = openml.utils._list_all(listing_call, offset=offset, limit=size)

    flattened = list(chain.from_iterable(eval_collection))
    if output_format == "dataframe":
        records = [item._to_dict() for item in flattened]
        return pd.DataFrame.from_records(records)  # No index...

    return {e.run_id: e for e in flattened}
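
A usage sketch, assuming a reachable OpenML server; the task IDs below are placeholders chosen only to illustrate the filters, and the selected columns correspond to OpenMLEvaluation fields documented above.

import openml

# Fetch accuracy evaluations for a few tasks as a pandas DataFrame,
# sorted from highest to lowest score.
df = openml.evaluations.list_evaluations(
    function="predictive_accuracy",
    tasks=[3, 6],        # placeholder task IDs
    size=100,            # cap the number of returned evaluations
    sort_order="desc",
    output_format="dataframe",
)
print(df[["run_id", "flow_name", "value"]].head())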

list_evaluations_setups #

list_evaluations_setups(function: str, offset: int | None = None, size: int | None = None, tasks: list | None = None, setups: list | None = None, flows: list | None = None, runs: list | None = None, uploaders: list | None = None, tag: str | None = None, per_fold: bool | None = None, sort_order: str | None = None, parameters_in_separate_columns: bool = False) -> DataFrame

List all run-evaluation pairs matching all of the given filters and their hyperparameter settings.

PARAMETER DESCRIPTION
function

the evaluation function. e.g., predictive_accuracy

TYPE: str

offset

the number of runs to skip, starting from the first

TYPE: int DEFAULT: None

size

the maximum number of runs to show

TYPE: int DEFAULT: None

tasks

the list of task IDs

TYPE: list[int] DEFAULT: None

setups

the list of setup IDs

TYPE: list | None DEFAULT: None

flows

the list of flow IDs

TYPE: list[int] DEFAULT: None

runs

the list of run IDs

TYPE: list[int] DEFAULT: None

uploaders

the list of uploader IDs

TYPE: list[int] DEFAULT: None

tag

filter evaluation based on given tag

TYPE: str DEFAULT: None

per_fold

TYPE: bool DEFAULT: None

sort_order

order of sorting evaluations, ascending ("asc") or descending ("desc")

TYPE: str DEFAULT: None

parameters_in_separate_columns

Returns hyperparameters in separate columns if set to True. Valid only for a single flow

TYPE: bool DEFAULT: False

RETURNS DESCRIPTION
dataframe with hyperparameter settings as a dict of (hyperparameter, value) pairs, or in separate columns if parameters_in_separate_columns is True.
Source code in openml/evaluations/functions.py
def list_evaluations_setups(
    function: str,
    offset: int | None = None,
    size: int | None = None,
    tasks: list | None = None,
    setups: list | None = None,
    flows: list | None = None,
    runs: list | None = None,
    uploaders: list | None = None,
    tag: str | None = None,
    per_fold: bool | None = None,
    sort_order: str | None = None,
    parameters_in_separate_columns: bool = False,  # noqa: FBT002
) -> pd.DataFrame:
    """List all run-evaluation pairs matching all of the given filters
    and their hyperparameter settings.

    Parameters
    ----------
    function : str
        the evaluation function. e.g., predictive_accuracy
    offset : int, optional
        the number of runs to skip, starting from the first
    size : int, optional
        the maximum number of runs to show
    tasks : list[int], optional
        the list of task IDs
    setups: list[int], optional
        the list of setup IDs
    flows : list[int], optional
        the list of flow IDs
    runs : list[int], optional
        the list of run IDs
    uploaders : list[int], optional
        the list of uploader IDs
    tag : str, optional
        filter evaluation based on given tag
    per_fold : bool, optional
    sort_order : str, optional
       order of sorting evaluations, ascending ("asc") or descending ("desc")
    parameters_in_separate_columns: bool, optional (default= False)
        Returns hyperparameters in separate columns if set to True.
        Valid only for a single flow

    Returns
    -------
    dataframe with hyperparameter settings as a dict of (hyperparameter, value) pairs.
    """
    if parameters_in_separate_columns and (flows is None or len(flows) != 1):
        raise ValueError("Can set parameters_in_separate_columns to true only for single flow_id")

    # List evaluations
    evals = list_evaluations(
        function=function,
        offset=offset,
        size=size,
        runs=runs,
        tasks=tasks,
        setups=setups,
        flows=flows,
        uploaders=uploaders,
        tag=tag,
        per_fold=per_fold,
        sort_order=sort_order,
        output_format="dataframe",
    )
    # List setups
    # list_setups by setup id does not support large sizes (exceeds URL length limit)
    # Hence we split the list of unique setup ids returned by list_evaluations into chunks of size N
    _df = pd.DataFrame()
    if len(evals) != 0:
        N = 100  # size of section
        uniq = np.asarray(evals["setup_id"].unique())
        length = len(uniq)

        # np.array_split allows indices_or_sections to not equally divide the array:
        # it returns length % N sub-arrays of size length//N + 1 and the rest of size length//N.
        split_size = ((length - 1) // N) + 1
        setup_chunks = np.array_split(uniq, split_size)

        setup_data = pd.DataFrame()
        for _setups in setup_chunks:
            result = openml.setups.list_setups(setup=_setups, output_format="dataframe")
            assert isinstance(result, pd.DataFrame)
            result = result.drop("flow_id", axis=1)
            # concat resulting setup chunks into a single dataframe
            setup_data = pd.concat([setup_data, result])

        parameters = []
        # Convert parameters of setup into dict of (hyperparameter, value)
        for parameter_dict in setup_data["parameters"]:
            if parameter_dict is not None:
                parameters.append(
                    {param["full_name"]: param["value"] for param in parameter_dict.values()},
                )
            else:
                parameters.append({})
        setup_data["parameters"] = parameters
        # Merge setups with evaluations
        _df = evals.merge(setup_data, on="setup_id", how="left")

    if parameters_in_separate_columns:
        _df = pd.concat(
            [_df.drop("parameters", axis=1), _df["parameters"].apply(pd.Series)],
            axis=1,
        )

    return _df
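
A usage sketch, assuming a reachable OpenML server; the flow ID is a placeholder. Because parameters_in_separate_columns is valid only for a single flow, exactly one flow ID is passed, and each hyperparameter of that flow becomes its own column in the result.

import openml

# Combine evaluations of a single flow with its hyperparameter settings.
df = openml.evaluations.list_evaluations_setups(
    function="predictive_accuracy",
    flows=[6969],        # placeholder flow ID; exactly one flow is required here
    size=50,             # example cap
    parameters_in_separate_columns=True,
)
print(df.head())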