Skip to content

Custom Datasets

This module contains custom PyTorch dataset classes for working with OpenML data.

GenericDataset

Bases: Dataset

Generic dataset that takes X and y as input and returns them as tensors.

Source code in openml_pytorch/custom_datasets/generic_dataset.py
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
class GenericDataset(torch.utils.data.Dataset):
    """
    Generic dataset that takes X, y as input and returns them as tensors.

    ``X`` is converted to a ``float32`` tensor and ``y`` to a ``long`` tensor
    once, at construction time. The inputs are always copied, so later changes
    to the caller's data do not affect the dataset.

    Args:
        X: Feature data. May be a nested sequence, a NumPy array, a
            ``torch.Tensor``, or an all-numeric pandas object exposing
            ``to_numpy()``.
        y: Labels, in any of the forms accepted for ``X``.
    """

    def __init__(self, X, y):
        self.X = self._to_tensor(X, torch.float32)
        self.y = self._to_tensor(y, torch.long)  # Ensure labels are LongTensor

    @staticmethod
    def _to_tensor(data, dtype):
        """Return a detached copy of ``data`` as a tensor of ``dtype``."""
        if isinstance(data, torch.Tensor):
            # torch.tensor(existing_tensor) emits a "copy construct" warning;
            # detach().clone() is the recommended way to copy a tensor.
            return data.detach().clone().to(dtype)
        if hasattr(data, "to_numpy"):
            # pandas DataFrame/Series: go through NumPy, which torch.tensor
            # understands directly.
            data = data.to_numpy()
        return torch.tensor(data, dtype=dtype)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

OpenMLImageDataset

Bases: Dataset

Class representing an image dataset from OpenML for use in PyTorch.

Methods:

__init__(self, X, y, image_size, image_dir, transform_x=None, transform_y=None)
    Initializes the dataset with given data, image size, directory, and optional transformations.

__getitem__(self, idx)
    Retrieves an image and its corresponding label (if available) from the dataset at the specified index. Applies transformations if provided.

__len__(self)
    Returns the total number of images in the dataset.
Source code in openml_pytorch/custom_datasets/image_dataset.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
class OpenMLImageDataset(Dataset):
    """
    Class representing an image dataset from OpenML for use in PyTorch.

    Args:
        X: Table whose first column holds image file names relative to
            ``image_dir``; rows are accessed positionally via ``X.iloc``.
        y: Labels accessed positionally via ``y.iloc``, or ``None`` for an
            unlabeled dataset.
        image_size: Edge length; every image is resized to
            ``(image_size, image_size)``.
        image_dir: Directory that contains the image files.
        transform_x: Optional callable applied to the image tensor.
        transform_y: Optional callable applied to the label.

    Methods:

        __getitem__(self, idx)
            Retrieves an image and its corresponding label (if available) from the dataset at the specified index. Applies transformations if provided.

        __len__(self)
            Returns the total number of images in the dataset.
    """

    def __init__(self, X, y, image_size, image_dir, transform_x=None, transform_y=None):
        self.X = X
        self.y = y
        self.image_size = image_size
        self.image_dir = image_dir
        self.transform_x = transform_x
        self.transform_y = transform_y

    def _load_image(self, idx):
        """Return row ``idx``'s image as an RGB PIL image, or ``None`` if unusable."""
        img_name = str(os.path.join(self.image_dir, self.X.iloc[idx, 0]))
        # hotfix for .DS_Store files
        if ".DS_Store" in img_name:
            return None
        try:
            # The context manager closes the file handle; convert() returns
            # an independent, fully loaded image that stays valid afterwards.
            with Image.open(img_name) as handle:
                return handle.convert("RGB")  # Ensure it's in RGB mode
        except Exception as e:
            # Deliberately broad: a bad file is skipped, not fatal.
            print(f"Error opening image {img_name}: {e}")
            return None

    def __getitem__(self, idx):
        """
        Return the sample at ``idx``, skipping forward over unusable files.

        Returns:
            ``(image, label)`` when a label is available, otherwise ``image``.
            The label always belongs to the row the image was loaded from.

        Raises:
            IndexError: If ``idx`` is out of range.
            RuntimeError: If no row in the dataset yields a loadable image.
        """
        total = len(self)
        current = idx
        # The first attempt uses ``idx`` untouched so an out-of-range index
        # still raises IndexError. Later attempts wrap around, and the loop
        # stops after every row has been tried once.
        for _ in range(max(total, 1)):
            image = self._load_image(current)
            if image is not None:
                break
            current = (current + 1) % total
        else:
            raise RuntimeError(
                f"No loadable image found in dataset (image_dir={self.image_dir!r})"
            )

        # Resize using PIL-based transform
        image = T.Resize((self.image_size, self.image_size))(image)
        # Convert to tensor after all PIL transformations
        image = T.ToTensor()(image)
        # Apply additional transformations if provided
        if self.transform_x is not None:
            image = self.transform_x(image)

        if self.y is None:
            return image

        label = self.y.iloc[current]
        if label is None:
            # No label for this row: return the image alone rather than
            # falling off the end and returning None.
            return image
        if self.transform_y is not None:
            label = self.transform_y(label)
        return image, label

    def __len__(self):
        return len(self.X)

OpenMLTabularDataset

Bases: Dataset

OpenMLTabularDataset

A custom dataset class to handle tabular data from OpenML (or any similar tabular dataset). It encodes categorical features as integer codes using pandas categorical codes; the target is stored as provided.

Methods:

Name Description
__init__

Initializes the dataset with the data and the target (if provided). Encodes the categorical features as integer codes.

__getitem__

Retrieves the input data and target value at the specified index. Converts the data to tensors and returns them.

__len__

Returns the length of the dataset.

Source code in openml_pytorch/custom_datasets/tabular_dataset.py
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
class OpenMLTabularDataset(Dataset):
    """
    OpenMLTabularDataset

    A custom dataset class to handle tabular data from OpenML (or any similar tabular dataset).
    Columns of dtype ``object`` or ``category`` are replaced by their integer
    pandas category codes. The target is stored as provided.

    Args:
        X: pandas DataFrame of features. It is copied, so the caller's frame
            is left unmodified.
        y: Targets aligned positionally with the rows of ``X`` (pandas
            Series, NumPy array, list, ...), or ``None`` for an unlabeled
            dataset.

    Methods:
        __init__(X, y) : Initializes the dataset with the data and the target.
                         Encodes the categorical features.

        __getitem__(idx): Retrieves the input data and target value at the specified index.
                          Converts the data to tensors and returns them.

        __len__(): Returns the length of the dataset.
    """

    def __init__(self, X, y):
        # Work on a copy: encoding in place would rewrite the caller's frame.
        self.data = X.copy()
        for col in self.data.select_dtypes(include=["object", "category"]):
            # Replace each category with its integer code.
            self.data[col] = self.data[col].astype("category").cat.codes
        self.label_mapping = None

        self.y = y

    def __getitem__(self, idx):
        """
        Return the sample at position ``idx``.

        A row that cannot be converted to ``float32`` is reported and skipped
        in favour of the next row, wrapping around at the end.

        Returns:
            ``(x, y)`` tensors when targets were provided, otherwise ``x``.

        Raises:
            IndexError: If ``idx`` is out of range.
            RuntimeError: If no row in the dataset can be converted.
        """
        total = len(self)
        current = idx
        # The first attempt uses ``idx`` untouched so an out-of-range index
        # still raises IndexError. The loop stops after every row has been
        # tried once.
        for _ in range(max(total, 1)):
            # x is the input data, y is the target value
            row = self.data.iloc[current, :]
            try:
                x = torch.tensor(row.values.astype("float32"))
            except Exception as e:
                # Deliberately broad: a bad row is skipped, not fatal.
                print(f"Error converting data to tensor: {e}")
                current = (current + 1) % total
                continue

            if self.y is None:
                return x
            # Features are fetched by position, so the target must be too.
            # Plain ``y[idx]`` on a pandas Series is a label lookup and breaks
            # for a non-default index (e.g. after a train/test split).
            if hasattr(self.y, "iloc"):
                target = self.y.iloc[current]
            else:
                target = self.y[current]
            return x, torch.tensor(target)

        raise RuntimeError("No row in the dataset could be converted to a float32 tensor")

    def __len__(self):
        return len(self.data)