Flows and Runs
A simple tutorial on how to upload results from a machine learning experiment to OpenML.
In [ ]:
Copied!
import sklearn
from sklearn.neighbors import KNeighborsClassifier
import openml
import sklearn
from sklearn.neighbors import KNeighborsClassifier
import openml
Warning
This example uploads data. For that reason, this example connects to the
test server at test.openml.org.
This prevents the main server from becoming overloaded with example datasets, tasks,
runs, and other submissions.
Using this test server may affect the behavior and performance of the
OpenML-Python API.
In [ ]:
Copied!
# Switch the OpenML client to the test server for the rest of this example.
# A single call is sufficient; repeating it adds nothing.
openml.config.start_using_configuration_for_example()
Train a machine learning model and evaluate it
NOTE: We are using task 119 from the test server: https://test.openml.org/t/119 (the dataset link given for it is https://test.openml.org/d/20).
In [ ]:
Copied!
# Fetch the task once; the dataset and the train/test split are both derived from it.
task = openml.tasks.get_task(119)

# Get the data
dataset = task.get_dataset()
X, y, categorical_indicator, attribute_names = dataset.get_data(
    target=dataset.default_target_attribute
)

# Get the holdout split from the task
train_indices, test_indices = task.get_train_test_split_indices(fold=0, repeat=0)
X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]

# Keep the hyperparameters in a dict: the same values are reused below when
# describing the flow and the run for OpenML.
knn_parameters = {
    "n_neighbors": 3,
}
clf = KNeighborsClassifier(**knn_parameters)
clf.fit(X_train, y_train)

# Get experiment results: hard labels and per-class probabilities for the test rows
y_pred = clf.predict(X_test)
y_pred_proba = clf.predict_proba(X_test)
Upload the machine learning experiments to OpenML
First, create a flow and fill it with metadata about the machine learning model.
In [ ]:
Copied!
# Describe the model as an OpenML flow. The flow is built and published exactly
# once so that only a single publish request is sent to the server.
knn_flow = openml.flows.OpenMLFlow(
    # Metadata
    model=clf,  # or None, if you do not want to upload the model object.
    name="CustomKNeighborsClassifier",
    description="A custom KNeighborsClassifier flow for OpenML.",
    external_version=f"{sklearn.__version__}",
    language="English",
    tags=["openml_tutorial_knn"],
    dependencies=f"{sklearn.__version__}",
    # Hyperparameters (values are passed as strings)
    parameters={k: str(v) for k, v in knn_parameters.items()},
    parameters_meta_info={
        "n_neighbors": {"description": "number of neighbors to use", "data_type": "int"}
    },
    # If you have a pipeline with subcomponents, such as preprocessing, add them here.
    components={},
)
knn_flow.publish()
# flow_id is read after publish(); it is used below to link the run to this flow.
print(f"knn_flow was published with the ID {knn_flow.flow_id}")
Second, we create a run to store the results associated with the flow.
In [ ]:
Copied!
# Format the predictions for OpenML: one formatted row per test sample,
# pairing each sample's index with its true label, predicted label and
# per-class probabilities (keyed by the task's class labels).
predictions = [
    openml.runs.functions.format_prediction(
        task=task,
        repeat=0,
        fold=0,
        index=test_index,
        prediction=y_pred_i,
        truth=y_true_i,
        proba=dict(zip(task.class_labels, y_pred_proba_i)),
    )
    for test_index, y_true_i, y_pred_i, y_pred_proba_i in zip(
        test_indices, y_test, y_pred, y_pred_proba
    )
]

# Format the parameters for OpenML; every setting is attached to the published flow.
oml_knn_parameters = [
    {"oml:name": k, "oml:value": v, "oml:component": knn_flow.flow_id}
    for k, v in knn_parameters.items()
]

# Build and publish the run exactly once so only a single run is uploaded.
knn_run = openml.runs.OpenMLRun(
    task_id=task.task_id,
    flow_id=knn_flow.flow_id,
    dataset_id=dataset.dataset_id,
    parameter_settings=oml_knn_parameters,
    data_content=predictions,
    tags=["openml_tutorial_knn"],
    description_text="Run generated by the tutorial.",
)
knn_run = knn_run.publish()
print(f"Run was uploaded to {knn_run.openml_url}")
# The run was constructed from flow_id only, so report the URL via the flow
# object published above instead of relying on the run carrying a flow reference.
print(f"The flow can be found at {knn_flow.openml_url}")
In [ ]:
Copied!
# Restore the configuration that was active before the example started.
# Call this once per start_using_configuration_for_example(): a repeated call
# has no saved configuration left to restore.
openml.config.stop_using_configuration_for_example()