Getting Data from OpenML ... and then using your own training pipeline¶
- Just want the data and don't want to deal with anything else?
- Have some complicated idea you want to try? Don't want to be limited by this API? No problem!
- You can use your own training pipeline and still use data from OpenML ... but you cannot upload your results back to OpenML this way as of now. :(
In [ ]:
# import libraries
import openml
import torch
import numpy as np
from sklearn.model_selection import train_test_split
from typing import Any
from tqdm import tqdm
import openml_pytorch as op
Get data and create dataloaders¶
- This is the ONLY required step. Everything else is completely up to you.
- You might be wondering what GenericDataset is. It is just a simple dataset class:
import torch

class GenericDataset(torch.utils.data.Dataset):
    """Generic dataset that takes X, y as input and returns them as tensors."""

    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)  # Convert to tensors
        self.y = torch.tensor(y, dtype=torch.long)     # Ensure labels are LongTensor

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]
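As a quick sanity check, you can wrap a couple of toy NumPy arrays in GenericDataset and confirm the conversion to tensors. The values below are made up purely for illustration:

import numpy as np
import openml_pytorch as op

# Toy data: 4 samples, 3 features, binary labels (made up for illustration)
X_toy = np.random.rand(4, 3).astype(np.float32)
y_toy = np.array([0, 1, 0, 1], dtype=np.int64)

ds_toy = op.GenericDataset(X_toy, y_toy)
x0, y0 = ds_toy[0]
print(x0.dtype, y0.dtype)  # torch.float32 torch.int64
print(len(ds_toy))         # 4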
In [ ]:
# Get dataset by ID
dataset = openml.datasets.get_dataset(20)
# Get the X, y data
X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute)
X = X.to_numpy(dtype=np.float32) # Ensure X is a NumPy array of float32
y = y.to_numpy(dtype=np.int64) # Ensure y is a NumPy array of int64 (for classification)
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
# Dataloaders
ds_train = op.GenericDataset(X_train, y_train)
ds_test = op.GenericDataset(X_test, y_test)
dataloader_train = torch.utils.data.DataLoader(ds_train, batch_size=64, shuffle=True)
dataloader_test = torch.utils.data.DataLoader(ds_test, batch_size=64, shuffle=False)
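Before training, it can help to pull one batch from the training dataloader and verify shapes and dtypes. This check is optional and not part of the required pipeline:

# Peek at a single batch to confirm shapes and dtypes
xb, yb = next(iter(dataloader_train))
print(xb.shape, xb.dtype)  # (64, n_features), torch.float32
print(yb.shape, yb.dtype)  # (64,), torch.int64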
Choose your model¶
In [ ]:
# Model Definition
class TabularClassificationModel(torch.nn.Module):
    def __init__(self, input_size, output_size):
        super(TabularClassificationModel, self).__init__()
        self.fc1 = torch.nn.Linear(input_size, 128)
        self.fc2 = torch.nn.Linear(128, 64)
        self.fc3 = torch.nn.Linear(64, output_size)
        self.relu = torch.nn.ReLU()
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        # Note: CrossEntropyLoss applies log-softmax internally, so this final
        # Softmax is redundant when training with that loss; it is kept here so
        # the model outputs probabilities directly.
        x = self.softmax(x)
        return x
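Optionally, run one batch through an untrained instance of the model to confirm the output shape matches the number of classes; since the forward pass ends in Softmax, each row should sum to roughly 1. This is just a suggested sanity check, not part of the required pipeline:

# Forward-pass sanity check on a single batch (CPU, untrained weights)
n_classes = len(np.unique(y_train))
model = TabularClassificationModel(X_train.shape[1], n_classes)
xb, _ = next(iter(dataloader_train))
with torch.no_grad():
    out = model(xb)
print(out.shape)            # (batch_size, n_classes)
print(out.sum(dim=1)[:3])   # each row sums to ~1 because of the Softmax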
Define your own training pipeline¶
In [ ]:
# Train the model
trainer = op.BasicTrainer(
    model=TabularClassificationModel(X_train.shape[1], len(np.unique(y_train))),
    loss_fn=torch.nn.CrossEntropyLoss(),
    opt=torch.optim.Adam,
    dataloader_train=dataloader_train,
    dataloader_test=dataloader_test,
    device=torch.device("mps"),  # "mps" is Apple Silicon; use "cuda" or "cpu" as appropriate
)
trainer.fit(10)
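If you would rather not use op.BasicTrainer at all, a plain PyTorch loop works just as well, which is the whole point of this notebook. Below is a minimal sketch, assuming the model, loss, and dataloaders defined above; the device selection falls back to CPU if neither CUDA nor MPS is available:

# A hand-rolled training loop as an alternative to op.BasicTrainer
device = (
    torch.device("cuda") if torch.cuda.is_available()
    else torch.device("mps") if torch.backends.mps.is_available()
    else torch.device("cpu")
)

model = TabularClassificationModel(X_train.shape[1], len(np.unique(y_train))).to(device)
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

for epoch in range(10):
    model.train()
    for xb, yb in tqdm(dataloader_train, desc=f"epoch {epoch}"):
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        loss = loss_fn(model(xb), yb)
        loss.backward()
        optimizer.step()

    # Simple accuracy on the held-out split
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for xb, yb in dataloader_test:
            xb, yb = xb.to(device), yb.to(device)
            preds = model(xb).argmax(dim=1)
            correct += (preds == yb).sum().item()
            total += yb.size(0)
    print(f"epoch {epoch}: test accuracy = {correct / total:.3f}")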