Via the website, you can filter datasets by properties such as size, type, and format, among many others.
It also allows you to explore every dataset via interactive dashboards.
import openml

# List all datasets and their properties.
# It's possible to filter on status, tags, and meta-data attributes.
openml.datasets.list_datasets(
    output_format="dataframe",
    status="active",
    tag="vision",
)
library(mlr3oml)
library(mlr3)

# Search for specific datasets
odatasets = list_oml_data(
  number_features = c(10, 20),
  number_instances = c(45000, 50000),
  number_classes = 2
)
importorg.openml.apiconnector.io.ApiConnector;// Create a client. Your API key can be found in your account.OpenmlConnectoropenml=newOpenmlConnector("api_key");// List all datasets and their propertiesDataSet[]datasets=openml.dataList();
import openml

# Get dataset by ID
dataset = openml.datasets.get_dataset(61)

# Get dataset by name
dataset = openml.datasets.get_dataset('Fashion-MNIST')

# Get the data itself. Returns a pandas dataframe by default.
X, _, _, _ = dataset.get_data()

# Other data formats can be requested (e.g. numpy).
# Target features, feature names and types are also returned.
X, y, is_categorical, feat_names = dataset.get_data(
    dataset_format="array",
    target=dataset.default_target_attribute,
)
using OpenML
using DataFrames

# Get dataset by ID
OpenML.describe_dataset(40996)

# Get the data itself as a dataframe (or otherwise)
table = OpenML.load(40996)
df = DataFrame(table)
importorg.openml.apiconnector.io.ApiConnector;// Create a client. Your API key can be found in your account.OpenmlConnectoropenml=newOpenmlConnector("api_key");// Get dataset by IDDataSetDescriptiondata=openml.dataGet(40996);Stringfile_url=data.getUrl();
import openml
from sklearn import ensemble

# Get dataset by ID
dataset = openml.datasets.get_dataset(20)

# Get the X, y data
X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute)

# Create a model and train it
clf = ensemble.RandomForestClassifier()
clf.fit(X, y)
import torch.nn
import torchvision
from torchvision.transforms import Compose, Resize, ToPILImage, ToTensor, Lambda

import openml
import openml_pytorch
# NOTE(review): these helpers are used below but were never imported; they are
# expected to live in `openml_pytorch.trainer` -- confirm against the installed version.
from openml_pytorch.trainer import (
    OpenMLDataModule,
    OpenMLTrainerModule,
    convert_to_rgb,
)

# Image to tensor conversion
transform = Compose(
    [
        ToPILImage(),  # Convert tensor to PIL Image to ensure PIL Image operations can be applied.
        Lambda(convert_to_rgb),  # Convert PIL Image to RGB if it's not already.
        Resize((64, 64)),  # Resize the image.
        ToTensor(),  # Convert the PIL Image back to a tensor.
    ]
)

# Create a data loader
data_module = OpenMLDataModule(
    type_of_data="image",
    file_dir="datasets",
    filename_col="image_path",
    target_mode="categorical",
    target_column="label",
    batch_size=64,
    transform=transform,
)

# Create a trainer module
trainer = OpenMLTrainerModule(
    data_module=data_module,
    verbose=True,
    epoch_count=1,
    callbacks=[],
)
openml_pytorch.config.trainer = trainer

# Download an OpenML task and a Pytorch model
task = openml.tasks.get_task(362128)
model = torchvision.models.efficientnet_b0(num_classes=200)

# Run the model on the OpenML task
run = openml.runs.run_model_on_task(model, task, avoid_duplicate_runs=False)
import openml
import openml_tensorflow
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import tensorflow as tf
from tensorflow.keras import layers, models

# Configure OpenML based on datasets meta-data
datagen = ImageDataGenerator()
openml_tensorflow.config.datagen = datagen
openml_tensorflow.config.dir = openml.config.get_cache_directory() + '/datasets/44312/PNU_Micro/images/'
openml_tensorflow.config.x_col = "FILE_NAME"
openml_tensorflow.config.y_col = 'encoded_labels'
openml_tensorflow.config.batch_size = 32
openml_tensorflow.config.epoch = 1
openml_tensorflow.config.class_mode = "categorical"

# Set up cross-validation
openml_tensorflow.config.perform_validation = True
openml_tensorflow.config.validation_split = 0.1
openml_tensorflow.config.datagen_valid = ImageDataGenerator()

IMG_SIZE = (128, 128)
IMG_SHAPE = IMG_SIZE + (3,)  # append the channel dimension to the image size

# Example tensorflow image classification model.
model = models.Sequential()
model.add(layers.Conv2D(128, (3, 3), activation='relu', input_shape=IMG_SHAPE))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.Flatten())
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(84, activation='relu'))
model.add(layers.Dense(19, activation='softmax'))  # Adjust output size
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['AUC'])

# Download the OpenML task for the Meta_Album_PNU_Micro dataset.
task = openml.tasks.get_task(362071)

# Run the Keras model on the task (requires an API key).
run = openml.runs.run_model_on_task(model, task, avoid_duplicate_runs=False)
library(mlr3oml)
library(mlr3)

# Create an mlr3 Learner and Resampling and run a resample experiment.
# NOTE(review): `tsk_adult` is not defined in this snippet; it must be an
# mlr3 Task created beforehand.
resample(
  task = tsk_adult,
  learner = lrn("classif.rpart"),
  resampling = rsmp("cv", folds = 10)
)
OpenML will automatically create a Croissant description when you create (or edit) an OpenML dataset.
Croissant also has data loaders that allow you to load the data and import it into AI tools.
import openml
import requests

# Get dataset by name
dataset = openml.datasets.get_dataset('Fashion-MNIST')

# Get the croissant URL
# Currently this works via a predictive naming scheme
croissant_url = dataset._parquet_url.replace(".pq", "_croissant.json")

# Download the croissant file
# A timeout keeps the request from hanging indefinitely, and the status check
# surfaces HTTP errors before the body is parsed as JSON.
response = requests.get(croissant_url, timeout=30)
response.raise_for_status()
croissant = response.json()