
openml.evaluations #

OpenMLEvaluation #

OpenMLEvaluation(run_id: int, task_id: int, setup_id: int, flow_id: int, flow_name: str, data_id: int, data_name: str, function: str, upload_time: str, uploader: int, uploader_name: str, value: float | None, values: list[float] | None, array_data: str | None = None)

Contains all meta-information about a run/evaluation combination, as returned by the evaluation/list API call.

Parameters#

run_id : int
    Refers to the run.
task_id : int
    Refers to the task.
setup_id : int
    Refers to the setup.
flow_id : int
    Refers to the flow.
flow_name : str
    Name of the referenced flow.
data_id : int
    Refers to the dataset.
data_name : str
    The name of the dataset.
function : str
    The evaluation metric of this item (e.g., accuracy).
upload_time : str
    The time of evaluation.
uploader : int
    Uploader ID (user ID).
uploader_name : str
    Name of the uploader of this evaluation.
value : float, optional
    The value (score) of this evaluation.
values : list[float], optional
    The values (scores) per repeat and fold (if requested).
array_data : str, optional
    List of information per class (e.g., for precision, AUROC, recall).

Source code in openml/evaluations/evaluation.py
def __init__(  # noqa: PLR0913
    self,
    run_id: int,
    task_id: int,
    setup_id: int,
    flow_id: int,
    flow_name: str,
    data_id: int,
    data_name: str,
    function: str,
    upload_time: str,
    uploader: int,
    uploader_name: str,
    value: float | None,
    values: list[float] | None,
    array_data: str | None = None,
):
    self.run_id = run_id
    self.task_id = task_id
    self.setup_id = setup_id
    self.flow_id = flow_id
    self.flow_name = flow_name
    self.data_id = data_id
    self.data_name = data_name
    self.function = function
    self.upload_time = upload_time
    self.uploader = uploader
    self.uploader_name = uploader_name
    self.value = value
    self.values = values
    self.array_data = array_data
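
A minimal sketch of how these attributes are typically consumed: the evaluations returned by openml.evaluations.list_evaluations (documented further down this page) are OpenMLEvaluation objects. The metric name and the size limit below are illustrative choices, not required values.

import openml

# Fetch a handful of accuracy evaluations; the filters are illustrative.
evals = openml.evaluations.list_evaluations(
    function="predictive_accuracy",
    size=5,
    output_format="object",
)

for run_id, evaluation in evals.items():
    # Each value is an OpenMLEvaluation exposing the attributes documented above.
    print(
        f"run {evaluation.run_id}: {evaluation.function}={evaluation.value} "
        f"on '{evaluation.data_name}' (task {evaluation.task_id}, flow '{evaluation.flow_name}')"
    )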

list_evaluation_measures #

list_evaluation_measures() -> list[str]

Return list of evaluation measures available.

The function performs an API call to retrieve the entire list of evaluation measures that are available.

Returns#

list[str]
    The names of the available evaluation measures.

Source code in openml/evaluations/functions.py
def list_evaluation_measures() -> list[str]:
    """Return list of evaluation measures available.

    The function performs an API call to retrieve the entire list of
    evaluation measures that are available.

    Returns
    -------
    list

    """
    api_call = "evaluationmeasure/list"
    xml_string = openml._api_calls._perform_api_call(api_call, "get")
    qualities = xmltodict.parse(xml_string, force_list=("oml:measures"))
    # Minimalistic check if the XML is useful
    if "oml:evaluation_measures" not in qualities:
        raise ValueError('Error in return XML, does not contain "oml:evaluation_measures"')

    if not isinstance(qualities["oml:evaluation_measures"]["oml:measures"][0]["oml:measure"], list):
        raise TypeError('Error in return XML, does not contain "oml:measure" as a list')

    return qualities["oml:evaluation_measures"]["oml:measures"][0]["oml:measure"]
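
A minimal usage sketch: the function takes no arguments and returns the measure names as plain strings. The membership check uses predictive_accuracy only because it is the example metric used elsewhere on this page.

import openml

measures = openml.evaluations.list_evaluation_measures()
print(f"{len(measures)} evaluation measures available")
# Check whether the example metric from this page is offered by the server.
print("predictive_accuracy" in measures)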

list_evaluations #

list_evaluations(function: str, offset: int | None = None, size: int | None = None, tasks: list[str | int] | None = None, setups: list[str | int] | None = None, flows: list[str | int] | None = None, runs: list[str | int] | None = None, uploaders: list[str | int] | None = None, tag: str | None = None, study: int | None = None, per_fold: bool | None = None, sort_order: str | None = None, output_format: Literal['object', 'dataframe'] = 'object') -> dict[int, OpenMLEvaluation] | DataFrame

List all run-evaluation pairs matching all of the given filters.

(Supports a large number of results.)

Parameters#

function : str
    The evaluation function, e.g., predictive_accuracy.
offset : int, optional
    The number of runs to skip, starting from the first.
size : int, default 10000
    The maximum number of runs to show. If set to None, all results are returned.
tasks : list[int | str], optional
    The list of task IDs.
setups : list[int | str], optional
    The list of setup IDs.
flows : list[int | str], optional
    The list of flow IDs.
runs : list[int | str], optional
    The list of run IDs.
uploaders : list[int | str], optional
    The list of uploader IDs.
tag : str, optional
    Filter evaluations by the given tag.
study : int, optional
    Filter evaluations by the given study ID.
per_fold : bool, optional
    If True, return the evaluation values per repeat and fold instead of a single aggregated value.
sort_order : str, optional
    Order of sorting evaluations, ascending ("asc") or descending ("desc").
output_format : str, optional (default='object')
    Decides the format of the output:
    - If 'object', the output is a dict of OpenMLEvaluation objects.
    - If 'dataframe', the output is a pandas DataFrame.

Returns#

dict of OpenMLEvaluation objects keyed by run ID, or a pandas DataFrame, depending on output_format.

Source code in openml/evaluations/functions.py
def list_evaluations(
    function: str,
    offset: int | None = None,
    size: int | None = None,
    tasks: list[str | int] | None = None,
    setups: list[str | int] | None = None,
    flows: list[str | int] | None = None,
    runs: list[str | int] | None = None,
    uploaders: list[str | int] | None = None,
    tag: str | None = None,
    study: int | None = None,
    per_fold: bool | None = None,
    sort_order: str | None = None,
    output_format: Literal["object", "dataframe"] = "object",
) -> dict[int, OpenMLEvaluation] | pd.DataFrame:
    """List all run-evaluation pairs matching all of the given filters.

    (Supports large amount of results)

    Parameters
    ----------
    function : str
        the evaluation function. e.g., predictive_accuracy
    offset : int, optional
        the number of runs to skip, starting from the first
    size : int, default 10000
        The maximum number of runs to show.
        If set to ``None``, it returns all the results.

    tasks : list[int,str], optional
        the list of task IDs
    setups: list[int,str], optional
        the list of setup IDs
    flows : list[int,str], optional
        the list of flow IDs
    runs :list[int,str], optional
        the list of run IDs
    uploaders : list[int,str], optional
        the list of uploader IDs
    tag : str, optional
        filter evaluation based on given tag

    study : int, optional

    per_fold : bool, optional

    sort_order : str, optional
       order of sorting evaluations, ascending ("asc") or descending ("desc")

    output_format: str, optional (default='object')
        The parameter decides the format of the output.
        - If 'object' the output is a dict of OpenMLEvaluation objects
        - If 'dataframe' the output is a pandas DataFrame

    Returns
    -------
    dict or dataframe
    """
    if output_format not in ("dataframe", "object"):
        raise ValueError("Invalid output format. Only 'object', 'dataframe'.")

    per_fold_str = None
    if per_fold is not None:
        per_fold_str = str(per_fold).lower()

    listing_call = partial(
        _list_evaluations,
        function=function,
        tasks=tasks,
        setups=setups,
        flows=flows,
        runs=runs,
        uploaders=uploaders,
        tag=tag,
        study=study,
        sort_order=sort_order,
        per_fold=per_fold_str,
    )
    eval_collection = openml.utils._list_all(listing_call, offset=offset, limit=size)

    flattened = list(chain.from_iterable(eval_collection))
    if output_format == "dataframe":
        records = [item._to_dict() for item in flattened]
        return pd.DataFrame.from_records(records)  # No index...

    return {e.run_id: e for e in flattened}
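
A sketch of a filtered query returning a DataFrame; the task ID is a placeholder, and the column names shown mirror the OpenMLEvaluation attributes documented above.

import openml

# Top accuracy evaluations for one example task, sorted by score.
df = openml.evaluations.list_evaluations(
    function="predictive_accuracy",
    tasks=[31],          # example task ID, purely illustrative
    size=100,
    sort_order="desc",
    output_format="dataframe",
)
print(df[["run_id", "flow_name", "value"]].head())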

list_evaluations_setups #

list_evaluations_setups(function: str, offset: int | None = None, size: int | None = None, tasks: list | None = None, setups: list | None = None, flows: list | None = None, runs: list | None = None, uploaders: list | None = None, tag: str | None = None, per_fold: bool | None = None, sort_order: str | None = None, parameters_in_separate_columns: bool = False) -> DataFrame

List all run-evaluation pairs matching all of the given filters and their hyperparameter settings.

Parameters#

function : str
    The evaluation function, e.g., predictive_accuracy.
offset : int, optional
    The number of runs to skip, starting from the first.
size : int, optional
    The maximum number of runs to show.
tasks : list[int], optional
    The list of task IDs.
setups : list[int], optional
    The list of setup IDs.
flows : list[int], optional
    The list of flow IDs.
runs : list[int], optional
    The list of run IDs.
uploaders : list[int], optional
    The list of uploader IDs.
tag : str, optional
    Filter evaluations by the given tag.
per_fold : bool, optional
    If True, return the evaluation values per repeat and fold.
sort_order : str, optional
    Order of sorting evaluations, ascending ("asc") or descending ("desc").
parameters_in_separate_columns : bool, optional (default=False)
    Returns hyperparameters in separate columns if set to True. Valid only for a single flow.

Returns#

pandas DataFrame of evaluations joined with the hyperparameter settings of each run's setup, either as a single 'parameters' column of name/value mappings or, if parameters_in_separate_columns is True, as one column per hyperparameter.

Source code in openml/evaluations/functions.py
def list_evaluations_setups(
    function: str,
    offset: int | None = None,
    size: int | None = None,
    tasks: list | None = None,
    setups: list | None = None,
    flows: list | None = None,
    runs: list | None = None,
    uploaders: list | None = None,
    tag: str | None = None,
    per_fold: bool | None = None,
    sort_order: str | None = None,
    parameters_in_separate_columns: bool = False,  # noqa: FBT001, FBT002
) -> pd.DataFrame:
    """List all run-evaluation pairs matching all of the given filters
    and their hyperparameter settings.

    Parameters
    ----------
    function : str
        the evaluation function. e.g., predictive_accuracy
    offset : int, optional
        the number of runs to skip, starting from the first
    size : int, optional
        the maximum number of runs to show
    tasks : list[int], optional
        the list of task IDs
    setups: list[int], optional
        the list of setup IDs
    flows : list[int], optional
        the list of flow IDs
    runs : list[int], optional
        the list of run IDs
    uploaders : list[int], optional
        the list of uploader IDs
    tag : str, optional
        filter evaluation based on given tag
    per_fold : bool, optional
    sort_order : str, optional
       order of sorting evaluations, ascending ("asc") or descending ("desc")
    parameters_in_separate_columns: bool, optional (default= False)
        Returns hyperparameters in separate columns if set to True.
        Valid only for a single flow

    Returns
    -------
    dataframe with hyperparameter settings as a list of tuples.
    """
    if parameters_in_separate_columns and (flows is None or len(flows) != 1):
        raise ValueError("Can set parameters_in_separate_columns to true only for single flow_id")

    # List evaluations
    evals = list_evaluations(
        function=function,
        offset=offset,
        size=size,
        runs=runs,
        tasks=tasks,
        setups=setups,
        flows=flows,
        uploaders=uploaders,
        tag=tag,
        per_fold=per_fold,
        sort_order=sort_order,
        output_format="dataframe",
    )
    # List setups
    # list_setups by setup id does not support large sizes (exceeds URL length limit)
    # Hence we split the list of unique setup ids returned by list_evaluations into chunks of size N
    _df = pd.DataFrame()
    if len(evals) != 0:
        N = 100  # size of section
        uniq = np.asarray(evals["setup_id"].unique())
        length = len(uniq)

        # array_split - allows indices_or_sections to not equally divide the array
        # array_split returns length % N sub-arrays of size length//N + 1 and the rest of size length//N.
        split_size = ((length - 1) // N) + 1
        setup_chunks = np.array_split(uniq, split_size)

        setup_data = pd.DataFrame()
        for _setups in setup_chunks:
            result = openml.setups.list_setups(setup=_setups, output_format="dataframe")
            assert isinstance(result, pd.DataFrame)
            result = result.drop("flow_id", axis=1)
            # concat resulting setup chunks into a single dataframe
            setup_data = pd.concat([setup_data, result])

        parameters = []
        # Convert parameters of setup into dict of (hyperparameter, value)
        for parameter_dict in setup_data["parameters"]:
            if parameter_dict is not None:
                parameters.append(
                    {param["full_name"]: param["value"] for param in parameter_dict.values()},
                )
            else:
                parameters.append({})
        setup_data["parameters"] = parameters
        # Merge setups with evaluations
        _df = evals.merge(setup_data, on="setup_id", how="left")

    if parameters_in_separate_columns:
        _df = pd.concat(
            [_df.drop("parameters", axis=1), _df["parameters"].apply(pd.Series)],
            axis=1,
        )

    return _df
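
A sketch for comparing the hyperparameter settings of runs on a single flow. The flow ID is a placeholder, and parameters_in_separate_columns=True is only valid here because exactly one flow is passed.

import openml

df = openml.evaluations.list_evaluations_setups(
    function="predictive_accuracy",
    flows=[6969],        # example flow ID, purely illustrative
    size=50,
    sort_order="desc",
    parameters_in_separate_columns=True,
)
# Evaluation columns plus one column per hyperparameter of the flow.
print(df.columns.tolist())
print(df.head())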