# Source code for pytranskit.classification.utils

import numpy as np
from scipy.io import loadmat
import os
import h5py
from PIL import Image
from sklearn.model_selection import train_test_split

def new_index_matrix(max_index, n_samples_perclass, num_classes, repeat, y_train):
    """Draw a reproducible matrix of per-class sample positions.

    Row ``c`` of the result holds ``n_samples_perclass`` integers drawn
    uniformly from ``[0, count_c)``, where ``count_c`` is the number of
    entries of ``y_train`` equal to ``c``. The positions are relative to the
    samples of that class, not to the full training set. Draws are made with
    replacement, so a position may occur more than once in a row.

    Args:
        max_index: Unused; kept so existing callers keep working.
        n_samples_perclass: Number of positions to draw for every class.
        num_classes: Number of classes; labels are taken to be
            ``0 .. num_classes - 1``.
        repeat: Repetition counter; together with ``n_samples_perclass`` and
            ``num_classes`` it determines the random seed.
        y_train: Array of integer class labels (compared element-wise against
            each class index, so it must support NumPy-style comparison).

    Returns:
        ``np.int64`` array of shape ``(num_classes, n_samples_perclass)``.

    Raises:
        ValueError: If a class has no samples in ``y_train`` (empty range).

    Note:
        This reseeds NumPy's global random generator as a side effect.
    """
    # The seed is the decimal concatenation of the three settings,
    # e.g. (16, 10, 3) -> 16103.
    seed = int('{}{}{}'.format(n_samples_perclass, num_classes, repeat))
    # np.random.seed only accepts values in [0, 2**32 - 1]; the concatenation
    # can exceed that for large settings, so wrap it into range. Seeds that
    # were already valid are unchanged, so earlier runs stay reproducible.
    np.random.seed(seed % (2 ** 32))

    index = np.zeros([num_classes, n_samples_perclass], dtype=np.int64)
    for classidx in range(num_classes):
        # Number of training samples that belong to this class.
        max_samples = (y_train == classidx).sum()
        index[classidx] = np.random.randint(0, max_samples, n_samples_perclass)
    return index
def take_samples(data, labels, index, num_classes):
    """Gather samples class by class using per-class position rows.

    For every class ``c`` in ``0 .. num_classes - 1`` the samples whose label
    equals ``c`` are selected, and ``index[c]`` is then used to pick entries
    from that class-only subset.

    Args:
        data: Array whose first axis enumerates samples.
        labels: Array of class labels, one per sample in ``data``.
        index: Array with one row of within-class positions per class.
        num_classes: Number of classes to gather.

    Returns:
        Tuple ``(samples, sample_labels)`` with the per-class selections
        concatenated in class order.
    """
    assert data.shape[0] == labels.shape[0]
    assert index.shape[0] == num_classes

    picked_data, picked_labels = [], []
    for cls in range(num_classes):
        members = labels == cls      # boolean mask of this class's samples
        positions = index[cls]       # positions within the class subset
        picked_data.append(data[members][positions])
        picked_labels.append(labels[members][positions])

    return np.concatenate(picked_data), np.concatenate(picked_labels)
def load_data(dataset, num_classes, datadir='data'):
    """Load a 2D image dataset, using an HDF5 cache when one exists.

    If ``<datadir>/<dataset>/dataset.hdf5`` exists, its four datasets are
    returned as stored. Otherwise the per-class files
    ``<datadir>/<dataset>/{training,testing}/dataORG_<class>.mat`` are read,
    every image is rescaled so that its own maximum maps to 255, the result
    is converted to ``uint8`` and written to the cache file.

    Args:
        dataset: Name of the dataset sub-directory inside ``datadir``.
        num_classes: Number of classes; files for classes
            ``0 .. num_classes - 1`` are read. Not consulted when the cache
            file already exists.
        datadir: Root directory that contains the dataset directories.

    Returns:
        ``((x_train, y_train), (x_test, y_test))``. When built from the
        ``.mat`` files, images have shape ``(N, H, W)`` and dtype ``uint8``
        and labels are ``int64`` class indices.
    """
    cache_file = os.path.join(datadir, dataset, 'dataset.hdf5')
    if os.path.exists(cache_file):
        with h5py.File(cache_file, 'r') as f:
            x_train, y_train = f['x_train'][()], f['y_train'][()]
            x_test, y_test = f['x_test'][()], f['y_test'][()]
        print('loaded from cache file data: x_train {} x_test {}'.format(x_train.shape, x_test.shape))
        return (x_train, y_train), (x_test, y_test)

    def rescale_to_uint8(images):
        # Per-image maximum, kept broadcastable against (N, H, W).
        peak = images.max(axis=(1, 2), keepdims=True)
        # An all-zero image has a maximum of 0; dividing by it would yield
        # NaN and an undefined uint8 cast, so such images are left at zero.
        peak[peak == 0] = 1
        return (images / peak * 255.).astype(np.uint8)

    print('loading data from mat files')
    x_train, y_train, x_test, y_test = [], [], [], []
    for split in ['training', 'testing']:
        for classidx in range(num_classes):
            datafile = os.path.join(datadir, dataset, '{}/dataORG_{}.mat'.format(split, classidx))
            # loadmat(datafile)['xxO'] is of shape (H, W, N)
            data = loadmat(datafile)['xxO'].transpose([2, 0, 1])  # transpose to (N, H, W)
            label = np.zeros(data.shape[0], dtype=np.int64) + classidx
            if split == 'training':
                x_train.append(data)
                y_train.append(label)
            else:
                x_test.append(data)
                y_test.append(label)

    x_train, y_train = np.concatenate(x_train), np.concatenate(y_train)
    x_test, y_test = np.concatenate(x_test), np.concatenate(y_test)
    print('x_train.shape {} x_test.shape {}'.format(x_train.shape, x_test.shape))

    x_train = rescale_to_uint8(x_train)
    x_test = rescale_to_uint8(x_test)

    # Cache the processed arrays so later calls can skip the .mat parsing.
    with h5py.File(cache_file, 'w') as f:
        f.create_dataset('x_train', data=x_train)
        f.create_dataset('y_train', data=y_train)
        f.create_dataset('x_test', data=x_test)
        f.create_dataset('y_test', data=y_test)
    print('saved to {}'.format(cache_file))
    return (x_train, y_train), (x_test, y_test)
def load_data_1D(dataset, num_classes, datadir='data'):
    """Load a dataset of 1D signals, using an HDF5 cache when one exists.

    If ``<datadir>/<dataset>/dataset.hdf5`` exists, its four datasets are
    returned as stored. Otherwise the per-class files
    ``<datadir>/<dataset>/{training,testing}/dataORG_<class>.mat`` are read,
    concatenated in class order without any rescaling, and written to the
    cache file.

    Args:
        dataset: Name of the dataset sub-directory inside ``datadir``.
        num_classes: Number of classes; files for classes
            ``0 .. num_classes - 1`` are read. Not consulted when the cache
            file already exists.
        datadir: Root directory that contains the dataset directories.

    Returns:
        ``((x_train, y_train), (x_test, y_test))`` with ``int64`` class
        labels when built from the ``.mat`` files.
    """
    cache_file = os.path.join(datadir, dataset, 'dataset.hdf5')

    if os.path.exists(cache_file):
        with h5py.File(cache_file, 'r') as cache:
            x_train = cache['x_train'][()]
            y_train = cache['y_train'][()]
            x_test = cache['x_test'][()]
            y_test = cache['y_test'][()]
        print('loaded from cache file data: x_train {} x_test {}'.format(x_train.shape, x_test.shape))
        return (x_train, y_train), (x_test, y_test)

    print('loading data from mat files')
    # One (samples, labels) pair of accumulators per split.
    collected = {'training': ([], []), 'testing': ([], [])}
    for split in ['training', 'testing']:
        samples, targets = collected[split]
        for classidx in range(num_classes):
            datafile = os.path.join(datadir, dataset, '{}/dataORG_{}.mat'.format(split, classidx))
            # The stored array is transposed so that samples come first
            # (presumably stored as (length, N) -- TODO confirm).
            signals = loadmat(datafile)['xxO'].T
            samples.append(signals)
            targets.append(np.full(signals.shape[0], classidx, dtype=np.int64))

    x_train = np.concatenate(collected['training'][0])
    y_train = np.concatenate(collected['training'][1])
    x_test = np.concatenate(collected['testing'][0])
    y_test = np.concatenate(collected['testing'][1])
    print('x_train.shape {} x_test.shape {}'.format(x_train.shape, x_test.shape))

    # Cache the assembled arrays so later calls can skip the .mat parsing.
    with h5py.File(cache_file, 'w') as cache:
        for key, array in (('x_train', x_train), ('y_train', y_train),
                           ('x_test', x_test), ('y_test', y_test)):
            cache.create_dataset(key, data=array)
    print('saved to {}'.format(cache_file))
    return (x_train, y_train), (x_test, y_test)
def load_data_3D(dataset, num_classes, datadir='data'):
    """Load a dataset of 3D volumes, using an HDF5 cache when one exists.

    If ``<datadir>/<dataset>/dataset.hdf5`` exists, its four datasets are
    returned as stored. Otherwise the per-class files
    ``<datadir>/<dataset>/{training,testing}/dataORG_<class>.mat`` are read,
    every volume is rescaled so that its own maximum maps to 255, the result
    is converted to ``float32`` and written to the cache file.

    Args:
        dataset: Name of the dataset sub-directory inside ``datadir``.
        num_classes: Number of classes; files for classes
            ``0 .. num_classes - 1`` are read. Not consulted when the cache
            file already exists.
        datadir: Root directory that contains the dataset directories.

    Returns:
        ``((x_train, y_train), (x_test, y_test))``. When built from the
        ``.mat`` files, volumes have shape ``(N, H, W, D)`` and dtype
        ``float32`` and labels are ``int64`` class indices.
    """
    cache_file = os.path.join(datadir, dataset, 'dataset.hdf5')
    if os.path.exists(cache_file):
        with h5py.File(cache_file, 'r') as f:
            x_train, y_train = f['x_train'][()], f['y_train'][()]
            x_test, y_test = f['x_test'][()], f['y_test'][()]
        print('loaded from cache file data: x_train {} x_test {}'.format(x_train.shape, x_test.shape))
        return (x_train, y_train), (x_test, y_test)

    def rescale_to_float32(volumes):
        # Per-volume maximum, kept broadcastable against (N, H, W, D).
        peak = volumes.max(axis=(1, 2, 3), keepdims=True)
        # An all-zero volume has a maximum of 0; dividing by it would fill
        # the volume with NaN, so such volumes are left at zero instead.
        peak[peak == 0] = 1
        return (volumes / peak * 255.).astype(np.float32)

    print('loading data from mat files')
    x_train, y_train, x_test, y_test = [], [], [], []
    for split in ['training', 'testing']:
        for classidx in range(num_classes):
            datafile = os.path.join(datadir, dataset, '{}/dataORG_{}.mat'.format(split, classidx))
            # loadmat(datafile)['xxO'] is of shape (H, W, D, N)
            data = loadmat(datafile)['xxO'].transpose([3, 0, 1, 2])  # transpose to (N, H, W, D)
            label = np.zeros(data.shape[0], dtype=np.int64) + classidx
            print('split {} class {} data.shape {}'.format(split, classidx, data.shape))
            if split == 'training':
                x_train.append(data)
                y_train.append(label)
            else:
                x_test.append(data)
                y_test.append(label)

    x_train, y_train = np.concatenate(x_train), np.concatenate(y_train)
    x_test, y_test = np.concatenate(x_test), np.concatenate(y_test)
    print('x_train.shape {} x_test.shape {}'.format(x_train.shape, x_test.shape))

    x_train = rescale_to_float32(x_train)
    x_test = rescale_to_float32(x_test)

    # Cache the processed arrays so later calls can skip the .mat parsing.
    with h5py.File(cache_file, 'w') as f:
        f.create_dataset('x_train', data=x_train)
        f.create_dataset('y_train', data=y_train)
        f.create_dataset('x_test', data=x_test)
        f.create_dataset('y_test', data=y_test)
    print('saved to {}'.format(cache_file))
    return (x_train, y_train), (x_test, y_test)
def take_train_samples(x_train, y_train, n_samples_perclass, num_classes, repeat):
    """Draw a reproducible, class-balanced subset of the training data.

    Args:
        x_train: Training samples; the first axis enumerates samples.
        y_train: Class label of every training sample.
        n_samples_perclass: Number of samples to draw for each class.
        num_classes: Number of classes.
        repeat: Repetition counter forwarded to ``new_index_matrix``.

    Returns:
        Tuple ``(x_train_sub, y_train_sub)`` as produced by ``take_samples``.
    """
    per_class_capacity = x_train.shape[0] // num_classes
    sample_positions = new_index_matrix(
        per_class_capacity, n_samples_perclass, num_classes, repeat, y_train)
    subset, subset_labels = take_samples(
        x_train, y_train, sample_positions, num_classes)
    return subset, subset_labels
def take_train_val_samples(x_train, y_train, n_samples_perclass, num_classes, repeat):
    """Draw a class-balanced training subset and split off a validation part.

    ``n_samples_perclass`` positions are drawn per class via
    ``new_index_matrix``. The first ``n_samples_perclass // 10`` columns are
    used for validation and the remaining columns for training. When fewer
    than 10 samples per class are requested there is no validation split.

    Args:
        x_train: Training samples; the first axis enumerates samples.
        y_train: Class label of every training sample.
        n_samples_perclass: Total number of samples drawn for each class.
        num_classes: Number of classes.
        repeat: Repetition counter forwarded to ``new_index_matrix``.

    Returns:
        ``((x_train_sub, y_train_sub), (x_val, y_val))``; ``x_val`` and
        ``y_val`` are both ``None`` when no validation split is made.
    """
    per_class_capacity = x_train.shape[0] // num_classes
    drawn = new_index_matrix(
        per_class_capacity, n_samples_perclass, num_classes, repeat, y_train)

    # A tenth of the per-class budget is reserved for validation.
    n_val = n_samples_perclass // 10

    x_val = y_val = None
    if n_val < 1:
        print('validation data {}'.format(x_val), end=' ')
    else:
        x_val, y_val = take_samples(x_train, y_train, drawn[:, :n_val], num_classes)
        assert x_val.shape[0] == y_val.shape[0]
        print('validation data shape {}'.format(x_val.shape), end=' ')

    x_train_sub, y_train_sub = take_samples(
        x_train, y_train, drawn[:, n_val:], num_classes)
    print('train data shape {}'.format(x_train_sub.shape))

    # Both parts together must account for every drawn position.
    held_out = 0 if x_val is None else x_val.shape[0]
    assert held_out + x_train_sub.shape[0] == n_samples_perclass * num_classes

    return (x_train_sub, y_train_sub), (x_val, y_val)
# Per-dataset settings as (num_classes, img_size, po_train_max, rm_edge).
# po_train_max is an exponent: maximum train samples = 2^po_train_max.
_DATASET_CONFIGS = {
    'MNIST': (10, 28, 12, True),
    'AffMNIST': (10, 84, 12, True),
    'OAM': (32, 151, 9, False),
    'OAM_t10': (32, 151, 9, False),
    'OAM_t5': (32, 151, 9, False),
    'SignMNIST': (3, 128, 9, False),
    'Synthetic': (1000, 64, 4, True),
    'LiverN': (2, 130, 8, False),
    'CIFAR10': (10, 32, 12, False),
    'MNIST_outDist': (10, 84, 12, True),
    'HEP2': (2, 64, 10, False),
}


def dataset_config(dataset):
    """Return the fixed settings associated with a known dataset name.

    Args:
        dataset: One of the supported dataset names (see
            ``_DATASET_CONFIGS``).

    Returns:
        Tuple ``(num_classes, img_size, po_train_max, rm_edge)`` where
        ``po_train_max`` is the exponent of the maximum number of training
        samples (``2 ** po_train_max``) and ``rm_edge`` is a boolean flag
        (presumably "remove image edge" -- confirm against its consumers).

    Raises:
        AssertionError: If ``dataset`` is not a supported name. It is raised
            explicitly rather than through an ``assert`` statement so the
            check still runs under ``python -O``.
    """
    try:
        return _DATASET_CONFIGS[dataset]
    except (KeyError, TypeError):
        # TypeError covers unhashable arguments such as lists.
        raise AssertionError(
            'unknown dataset {!r}; expected one of {}'.format(
                dataset, sorted(_DATASET_CONFIGS))) from None