Dataset size benchmarking

Below is a comparison of six machine learning models (LDA, SVC, kNN, ANN, CNN and LSTM) on the five datasets with a subject-independent approach (testing on unseen subjects), across a range of dataset sizes (50 % to 100 % of each dataset) to study the influence of this parameter on classification accuracy [1].

import datetime
import matplotlib.pyplot as plt
import os
import pandas as pd
import seaborn as sns
import torch

from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from benchnirs.load import load_dataset
from benchnirs.process import process_epochs, extract_features
from benchnirs.learn import machine_learn, deep_learn


ALL_DATA_PATH = '../../data/dataset_'  # path to the datasets
DATASETS = {'herff_2014_nb': ['1-back', '2-back', '3-back'],
            'shin_2018_nb': ['0-back', '2-back', '3-back'],
            'shin_2018_wg': ['baseline', 'word generation'],
            'shin_2016_ma': ['baseline', 'mental arithmetic'],
            'bak_2019_me': ['right', 'left', 'foot']}
DATASET_SIZES = [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
CONFIDENCE = 0.05  # significance level (alpha) for 95 % confidence


start_time = datetime.datetime.now()
date = start_time.strftime('%Y_%m_%d_%H%M')
out_folder = f'./results/dataset_size_{date}'
os.makedirs(out_folder)
print(f'Main output folder: {out_folder}/')

print(f'Number of GPUs: {torch.cuda.device_count()}')

with open(f'{out_folder}/results.csv', 'w') as w:
    w.write('dataset;dataset_size;model;fold;accuracy;hyperparameters\n')


for dataset in DATASETS.keys():
    print(f'=====\n{dataset}\n=====')
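    # Strip the 3-character paradigm suffix (e.g. '_nb') to build the path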
    data_path = f'{ALL_DATA_PATH}{dataset[:-3]}/'
    out_path = f'{out_folder}/{dataset}_'

    # Load and preprocess data
    epochs = load_dataset(dataset, data_path, bandpass=[0.01, 0.5],
                          baseline=(-2, 0), roi_sides=True, tddr=True)
    classes = DATASETS[dataset]
    epochs_lab = epochs[classes]

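    # Crop the epochs (tmax) and retrieve the data arrays, the labels and the
    # groups identifying the subjects (used for subject-independent splits)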
    all_nirs, all_labels, all_groups = process_epochs(epochs_lab, tmax=9.9)
    dict_train_size = {'Dataset size': [], 'Chance': [], 'LDA': [], 'SVC': [],
                       'kNN': [], 'ANN': [], 'CNN': [], 'LSTM': []}
    for ts in DATASET_SIZES:
        print(f'-----\nDataset size {round(ts*100)} %\n-----')
        if ts < 1:
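            # Stratified subsample of the full dataset: train_test_split
            # returns (train, test) pairs for each array, so indices 0, 2
            # and 4 pick the kept subsets of nirs, labels and groups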
            split = train_test_split(
                all_nirs, all_labels, all_groups, shuffle=True, train_size=ts,
                stratify=all_labels, random_state=42)
            nirs, labels, groups = split[0], split[2], split[4]
        else:
            nirs, labels, groups = shuffle(
                all_nirs, all_labels, all_groups, random_state=42)
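        # Extract window features (mean, standard deviation, linear slope)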
        nirs_features = extract_features(nirs, ['mean', 'std', 'slope'])

        # Run models
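        # Each call returns per-fold test accuracies, the selected
        # hyperparameters and additional outputs (unused here); CNN and LSTM
        # take the full time series instead of the extracted features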
        lda, hps_lda, _ = machine_learn(
            'lda', nirs_features, labels, groups,
            output_folder=f'{out_path}{ts}_lda')
        svc, hps_svc, _ = machine_learn(
            'svc', nirs_features, labels, groups,
            output_folder=f'{out_path}{ts}_svc')
        knn, hps_knn, _ = machine_learn(
            'knn', nirs_features, labels, groups,
            output_folder=f'{out_path}{ts}_knn')
        ann, hps_ann, _ = deep_learn(
            'ann', nirs_features, labels, groups,
            output_folder=f'{out_path}{ts}_ann')
        cnn, hps_cnn, _ = deep_learn(
            'cnn', nirs, labels, groups, output_folder=f'{out_path}{ts}_cnn')
        lstm, hps_lstm, _ = deep_learn(
            'lstm', nirs, labels, groups, output_folder=f'{out_path}{ts}_lstm')
        dict_train_size['Chance'] += [1/len(classes) for _ in lda]
        dict_train_size['LDA'] += list(lda)
        dict_train_size['SVC'] += list(svc)
        dict_train_size['kNN'] += list(knn)
        dict_train_size['ANN'] += list(ann)
        dict_train_size['CNN'] += list(cnn)
        dict_train_size['LSTM'] += list(lstm)
        dict_train_size['Dataset size'] += [ts for _ in lda]

        # Write results
        results = {'LDA': [lda, hps_lda], 'SVC': [svc, hps_svc],
                   'kNN': [knn, hps_knn], 'ANN': [ann, hps_ann],
                   'CNN': [cnn, hps_cnn], 'LSTM': [lstm, hps_lstm]}
        with open(f'{out_folder}/results.csv', 'a') as w:
            for model in results.keys():
                for fold, accuracy in enumerate(results[model][0]):
                    w.write(f'{dataset};{ts};{model};{fold+1};{accuracy};'
                            f'"{results[model][1][fold]}"\n')

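    # Summary figure: seaborn aggregates the folds into a mean line with a
    # 95 % confidence band for each model, with y-limits set around chance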
    df_train_size = pd.DataFrame(dict_train_size)
    df_train_size = df_train_size.melt(
        id_vars=['Dataset size'], value_vars=list(results.keys())+['Chance'],
        var_name='Model', value_name='Accuracy')
    lp = sns.lineplot(data=df_train_size, y='Accuracy', x='Dataset size',
                      hue='Model', palette='colorblind')
    if len(classes) == 2:
        lp.set(ylim=(0.45, 0.65))
    elif len(classes) == 3:
        lp.set(ylim=(0.25, 0.6))
    plt.legend(bbox_to_anchor=(1.01, 0.5), loc='center left')
    plt.savefig(f'{out_path}summary.png', bbox_inches='tight')
    plt.close()


# Stats
print('Stats...')
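# For each dataset and model, test the correlation between accuracy and
# dataset size: Pearson if Shapiro-Wilk does not reject normality of the
# accuracies, Spearman otherwise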
with open(f'{out_folder}/stats.md', 'w') as w:
    df = pd.read_csv(f'{out_folder}/results.csv', delimiter=';')
    w.write('## Correlation of model accuracies to dataset size\n\n')
    w.write('|Dataset|Model|Shapiro p-value|Test|p-value|rho|\n')
    w.write('|:---:|:---:|:---:|:---:|:---:|:---:|\n')
    for dataset in DATASETS.keys():
        for model in results.keys():
            w.write(f'|{dataset}|{model}|')
            sub_df = df[(df['dataset'] == dataset) & (df['model'] == model)]
            accuracies = sub_df['accuracy'].to_numpy()
            train_sizes = sub_df['dataset_size'].to_numpy()
            # Check normality of the distribution
            _, p_shap = stats.shapiro(accuracies)
            w.write(f'{p_shap}|')
            if p_shap > CONFIDENCE:
                # Pearson's correlation test
                rho, p_pears = stats.pearsonr(accuracies, train_sizes)
                w.write(f'Pearson|{p_pears}|{rho}|\n')
            else:
                # Spearman's correlation test
                rho, p_spear = stats.spearmanr(accuracies, train_sizes)
                w.write(f'Spearman|{p_spear}|{rho}|\n')


end_time = datetime.datetime.now()
elapsed_time = end_time - start_time
print(f'===\nElapsed time: {elapsed_time}')
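
The per-fold accuracies written to results.csv can be aggregated after the run for further analysis. Below is a minimal sketch (the results folder name is hypothetical, following the dataset_size_<date> pattern used above):

import pandas as pd

# Load the per-fold results written by the benchmark above
df = pd.read_csv('./results/dataset_size_<date>/results.csv', delimiter=';')

# Mean and standard deviation of the accuracy per dataset, model and size
summary = (df.groupby(['dataset', 'model', 'dataset_size'])['accuracy']
             .agg(['mean', 'std'])
             .reset_index())
print(summary)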

References