[90]:
# Install tutorial-specific dependencies (e.g., matplotlib for plotting)
%pip install matplotlib seaborn kagglehub --quiet
Note: you may need to restart the kernel to use updated packages.
[1]:
import kagglehub
import numpy as np
import pandas as pd
import os
from pyhctsa.calculator import FeatureCalculator
from scipy.signal import filtfilt, butter
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix
import seaborn as sns

✨ Tutorial: Bonn EEG (end-to-end pyhctsa workflow)

This tutorial provides a hands-on walkthrough of the standard pyhctsa pipeline using data from the Bonn EEG dataset. We will move through the entire research cycle—starting with raw time-series data and ending with interpreted classification results.

What You Will Learn

The workflow is divided into four key phases:

  1. Data Preparation: Structuring your time-series data and associated metadata into the required pyhctsa format.

  2. Feature Extraction: Leveraging the massive feature library to compute thousands of properties for each time series.

  3. Data Processing & Cleaning: Handling missing values, normalizing features, and preparing the resulting feature matrix for analysis.

  4. Analysis & Visualization: Implementing classification algorithms to distinguish between data classes and using built-in visualization tools to interpret which features drive those differences.

⬇️ (1) Download the Bonn EEG Dataset

Download the Bonn EEG dataset directly from Kaggle.

  • EEG recordings from healthy individuals and patients with epilepsy

  • Five distinct classes: A, B, C, D and E

  • Each class has 100 single-channel EEG segments, with each segment comprising 23.6 seconds of EEG data, sampled at 173.61 Hz, yielding a 4097-sample time series

A = Z, B = O, C = N, D = F, E = S

  • A/Z = eyes open healthy (eyesOpen)

  • B/O = eyes closed healthy (eyesClosed)

  • C/N = seizure free interval for epileptic patient (hippocampus)

  • D/F = seizure free interval epileptic patient, non-seizure epileptogenic zone (epileptogenic)

  • E/S = During seizure epileptic patient (seizure)

[2]:
# Fetch the Bonn EEG dataset from Kaggle (kagglehub caches it locally).
path = kagglehub.dataset_download("quands/eeg-dataset")
# List the class subfolders (N, Z, S, F, O) to confirm the download succeeded.
subfolders = os.listdir(os.path.join(path, 'Dataset'))
print(subfolders)
['N', 'Z', 'S', 'F', 'O']

Define the class mappings

[3]:
mappings = {'Z': 'eyesOpen', 'N': 'hippocampus', 'F': 'epileptogenic', 'S': 'seizure', 'O': 'eyesClosed'}

🗳 (2) Pre-process the EEG recordings

Pre-filter with a 40Hz low pass

[4]:
def unpack_subfolder(base_path: str, which_folder: str) -> pd.DataFrame:
    """Load, low-pass filter, and label every EEG segment in one Bonn class folder.

    Parameters
    ----------
    base_path : str
        Root directory returned by kagglehub (contains the 'Dataset' folder).
    which_folder : str
        One of the Bonn class codes 'N', 'Z', 'S', 'F', 'O' (case-insensitive).

    Returns
    -------
    pd.DataFrame
        One row per segment (filtered samples as columns) plus a 'label' column.

    Raises
    ------
    ValueError
        If `which_folder` is not a recognised class code.
    """
    # Validate the class code up front, before doing any work.
    if which_folder.lower() not in ['n', 'z', 's', 'f', 'o']:
        raise ValueError(f"Unknown folder: {which_folder}")
    # Design a 4th-order Butterworth low-pass filter with a 40 Hz cutoff,
    # normalised by the Nyquist frequency (fs / 2).
    cutoff_freq = 40.0  # cutoff frequency in Hz
    fs = 173.61  # sample rate in Hz
    order = 4  # 4th order Butterworth filter
    nyq = 0.5 * fs
    normal_cutoff = cutoff_freq / nyq
    b, a = butter(order, normal_cutoff, btype='lowpass')
    label = mappings[which_folder.upper()]
    path = os.path.join(base_path, 'Dataset', which_folder)
    assert os.path.exists(path), f"Expected data folder not found: {path}"
    # Each Bonn class should contain exactly 100 single-channel segments.
    # (Original message said "More files than expected", but `!=` also
    # triggers when there are FEWER files.)
    subfiles = os.listdir(path)
    num_subfiles = len(subfiles)
    if num_subfiles != 100:
        print(f'Unexpected number of files. Expected 100, got {num_subfiles}')
    all_series = []
    for subfile in subfiles:
        data = np.loadtxt(os.path.join(path, subfile))
        # filtfilt runs the filter forward and backward for zero phase distortion
        filtered_data = filtfilt(b, a, data)
        all_series.append(filtered_data)
    df = pd.DataFrame(all_series)
    df['label'] = label
    return df

Concatenate all time-series instances into a single dataframe

[5]:
df_series = pd.concat([unpack_subfolder(path, c) for c in ['N', 'Z', 'S', 'F', 'O']])
[6]:
df_series.head(3)
[6]:
0 1 2 3 4 5 6 7 8 9 ... 4088 4089 4090 4091 4092 4093 4094 4095 4096 label
0 -76.002880 -8.160361 36.130147 37.625116 0.235783 -41.989949 -54.718865 -38.386317 -20.373990 -17.291749 ... 180.432287 164.583116 150.705251 157.809372 194.080194 235.464085 228.103199 127.476462 -51.043101 hippocampus
1 34.000406 31.416358 27.038342 21.346209 15.747189 10.304919 4.426019 -1.259340 -5.333748 -7.425735 ... -22.703815 -16.919187 -13.739674 -12.329390 -8.005106 1.240274 7.273179 -2.612481 -28.999855 hippocampus
2 -56.990550 -71.547664 -75.119898 -67.728569 -58.999405 -56.182465 -56.529083 -53.060212 -44.914053 -38.376826 ... -70.041801 -60.266899 -43.302333 -26.943180 -24.632308 -40.924329 -56.324522 -40.254580 10.991437 hippocampus

3 rows × 4098 columns

Plot first example from each class:

[7]:
fs = 173.61  # sample rate in Hz
fix = False  # set True to always plot the first segment of each class
n_samples = df_series.drop(columns=['label']).shape[1]
t = np.arange(n_samples) / fs  # time axis in seconds

# One (label, colour) pair per panel, replacing five copy-pasted plot calls.
classes = [
    ('eyesOpen', 'magenta'),
    ('eyesClosed', 'lightblue'),
    ('epileptogenic', 'lightgreen'),
    ('hippocampus', 'orange'),
    ('seizure', 'red'),
]
fig, ax = plt.subplots(5, 1, sharex=True, figsize=(12, 10))
for axis, (label, colour) in zip(ax, classes):
    class_data = df_series[df_series['label'] == label].drop(columns=['label'])
    # randint's upper bound is exclusive: use 100 so all 100 segments are
    # reachable (the original bound of 99 could never select the last one).
    idx = 0 if fix else np.random.randint(0, 100)
    # Plot explicitly against the time axis; Series.plot(x=t) does not use
    # `x` for a Series, so the original x-axis was sample index, not seconds.
    axis.plot(t, class_data.iloc[idx].to_numpy(), c=colour)
    axis.set_title(label)
    axis.set_ylabel(r"amplitude ($\mu v$)")
plt.xlabel("time (s)")
plt.tight_layout()
../_images/usage_tutorial_bon_eeg_20_0.png

⚙️ (3) Instantiate a calculator in pyhctsa

While you can run the calculator with the full hctsa feature set, for the sake of brevity, we will use a custom yaml file with a selected subset of 110 “fast” feature functions:

[8]:
calc = FeatureCalculator(config_path="./custom.yaml")
Loaded 110 master operations.
[9]:
bonn_eeg = df_series
[10]:
data_df = bonn_eeg.drop(columns=['label']).to_numpy() # drop the labels column for computing features on the time-series data

To ensure everything is working as expected, we will first run the calculator on the first instance/time-series of the data frame

[11]:
res = calc.extract(data_df[0, :])
Evaluating 110 partialed functions. Strap in!...
Feature extraction completed in 0.120 seconds.
[12]:
res
[12]:
hrv_classic.pnn5 hrv_classic.pnn10 hrv_classic.pnn20 hrv_classic.pnn30 hrv_classic.pnn40 hrv_classic.lfhf hrv_classic.vlf hrv_classic.lf hrv_classic.hf hrv_classic.tri ... ac_nl_0_2_6 ac_nl_0_3_6 ac_nl_0_4_6 ac_nl_0_5_6 ac_nl_1_1_2 ac_nl_1_1_3 ac_nl_1_2_2 ac_nl_1_3_3 ac_nl_2_2_3 ac_nl_2_3_3
instance
ts_1 0.988037 0.975098 0.943359 0.915039 0.887207 1.438589 16.373641 38.574764 26.814297 3.080451 ... 1.634811 1.597856 1.577479 1.561968 3.240279 2.838081 3.121141 2.618175 2.834442 2.684552

1 rows × 347 columns

Now that everything is working, let’s scale up to the full dataset:

[33]:
res = calc.extract(data_df)
Evaluating 110 partialed functions. Strap in!...
/Users/jmoo2880/Documents/pyhctsa-latest/pyhctsa/pyhctsa/operations/spectral.py:92: RuntimeWarning: divide by zero encountered in log
  log_s = np.log(s)
/opt/anaconda3/envs/pyhctsa-latest/lib/python3.12/site-packages/numpy/_core/_methods.py:190: RuntimeWarning: invalid value encountered in subtract
  x = asanyarray(arr - arrmean)
/Users/jmoo2880/Documents/pyhctsa-latest/pyhctsa/pyhctsa/operations/spectral.py:221: RuntimeWarning: divide by zero encountered in log
  h_shann = -s * np.log(s)
/Users/jmoo2880/Documents/pyhctsa-latest/pyhctsa/pyhctsa/operations/spectral.py:221: RuntimeWarning: invalid value encountered in multiply
  h_shann = -s * np.log(s)
/Users/jmoo2880/Documents/pyhctsa-latest/pyhctsa/pyhctsa/operations/spectral.py:228: RuntimeWarning: divide by zero encountered in log
  out['sfm'] = 10 * np.log10(np.exp(np.mean(np.log(s))) / np.mean(s))
/Users/jmoo2880/Documents/pyhctsa-latest/pyhctsa/pyhctsa/operations/spectral.py:228: RuntimeWarning: divide by zero encountered in log10
  out['sfm'] = 10 * np.log10(np.exp(np.mean(np.log(s))) / np.mean(s))
/Users/jmoo2880/Documents/pyhctsa-latest/pyhctsa/pyhctsa/operations/spectral.py:242: RuntimeWarning: divide by zero encountered in log
  out['logarea_2_1'] = np.sum(np.log(split[:, 0])) * dw
/Users/jmoo2880/Documents/pyhctsa-latest/pyhctsa/pyhctsa/operations/spectral.py:253: RuntimeWarning: divide by zero encountered in log
  out['logarea_3_1'] = np.sum(np.log(split[:, 0])) * dw
/Users/jmoo2880/Documents/pyhctsa-latest/pyhctsa/pyhctsa/operations/spectral.py:266: RuntimeWarning: divide by zero encountered in log
  out['logarea_4_1'] = np.sum(np.log(split[:, 0])) * dw
/Users/jmoo2880/Documents/pyhctsa-latest/pyhctsa/pyhctsa/operations/spectral.py:281: RuntimeWarning: divide by zero encountered in log
  out['logarea_5_1'] = np.sum(np.log(split[:, 0])) * dw
Feature extraction completed in 31.016 seconds.

[OPTIONAL] (3.5)

Time-series feature extraction is computationally intensive. To speed up processing, pyhctsa allows you to distribute the workload across multiple CPU cores on your local machine using the LocalDistributor:

[36]:
from pyhctsa.distribute import LocalDistributor
[ ]:
# create a LocalDistributor
dist = LocalDistributor(n_workers=6) # set to the number of physical cores available on your local machine...
[38]:
res = calc.extract(data_df, distributor=dist)
Evaluating 110 partialed functions. Strap in!...
/Users/jmoo2880/Documents/pyhctsa-latest/pyhctsa/pyhctsa/operations/spectral.py:92: RuntimeWarning: divide by zero encountered in log
  log_s = np.log(s)
/opt/anaconda3/envs/pyhctsa-latest/lib/python3.12/site-packages/numpy/_core/_methods.py:190: RuntimeWarning: invalid value encountered in subtract
  x = asanyarray(arr - arrmean)
/Users/jmoo2880/Documents/pyhctsa-latest/pyhctsa/pyhctsa/operations/spectral.py:221: RuntimeWarning: divide by zero encountered in log
  h_shann = -s * np.log(s)
/Users/jmoo2880/Documents/pyhctsa-latest/pyhctsa/pyhctsa/operations/spectral.py:221: RuntimeWarning: invalid value encountered in multiply
  h_shann = -s * np.log(s)
/Users/jmoo2880/Documents/pyhctsa-latest/pyhctsa/pyhctsa/operations/spectral.py:228: RuntimeWarning: divide by zero encountered in log
  out['sfm'] = 10 * np.log10(np.exp(np.mean(np.log(s))) / np.mean(s))
/Users/jmoo2880/Documents/pyhctsa-latest/pyhctsa/pyhctsa/operations/spectral.py:228: RuntimeWarning: divide by zero encountered in log10
  out['sfm'] = 10 * np.log10(np.exp(np.mean(np.log(s))) / np.mean(s))
/Users/jmoo2880/Documents/pyhctsa-latest/pyhctsa/pyhctsa/operations/spectral.py:242: RuntimeWarning: divide by zero encountered in log
  out['logarea_2_1'] = np.sum(np.log(split[:, 0])) * dw
/Users/jmoo2880/Documents/pyhctsa-latest/pyhctsa/pyhctsa/operations/spectral.py:253: RuntimeWarning: divide by zero encountered in log
  out['logarea_3_1'] = np.sum(np.log(split[:, 0])) * dw
/Users/jmoo2880/Documents/pyhctsa-latest/pyhctsa/pyhctsa/operations/spectral.py:266: RuntimeWarning: divide by zero encountered in log
  out['logarea_4_1'] = np.sum(np.log(split[:, 0])) * dw
/Users/jmoo2880/Documents/pyhctsa-latest/pyhctsa/pyhctsa/operations/spectral.py:281: RuntimeWarning: divide by zero encountered in log
  out['logarea_5_1'] = np.sum(np.log(split[:, 0])) * dw
Feature extraction completed in 6.369 seconds.

Here we can see that by utilising parallel computing, we can reduce our feature extraction time by a factor of ~5 (from ~31 s to ~6.4 s).

[OPTIONAL] Load results from the full feature set

If you wish to access the results from the full hctsa feature set, but do not want to run the full feature set locally due to time constraints, you can download and import the pre-computed results from https://doi.org/10.6084/m9.figshare.31290259

[8]:
# Toggle: load the published pre-computed full-feature results instead of
# re-running extraction locally.
LOAD_PRECALCULATED_RESULTS = True # set to true to load the results
if LOAD_PRECALCULATED_RESULTS:
    # NOTE(review): unpickling executes arbitrary code — only load this file
    # if it was downloaded from the trusted figshare DOI referenced above.
    res = pd.read_pickle("./bonn_eeg_results.pkl") # point to the path where the downloaded results exist
[9]:
res.head(5)
[9]:
hrv_classic.pnn5 hrv_classic.pnn10 hrv_classic.pnn20 hrv_classic.pnn30 hrv_classic.pnn40 hrv_classic.lfhf hrv_classic.vlf hrv_classic.lf hrv_classic.hf hrv_classic.tri ... surrogate_test_RP_99_o3.o3_zscore surrogate_test_RP_99_o3.o3_f surrogate_test_RP_99_o3.o3_mediqr surrogate_test_RP_99_o3.o3_prank surrogate_test_RP_99_tc3.tc3_p surrogate_test_RP_99_tc3.tc3_zscore surrogate_test_RP_99_tc3.tc3_f surrogate_test_RP_99_tc3.tc3_mediqr surrogate_test_RP_99_tc3.tc3_prank multi_scale_entropy_1_10_2_0p15_rescale_tau.out
instance
ts_1 0.988037 0.975098 0.943359 0.915039 0.887207 1.438589 16.373641 38.574764 26.814297 3.080451 ... -2.217531 0.000000 1.393878 0.02 0.004159 2.865863 0.011524 2.146499 0.06 NaN
ts_2 0.979736 0.959473 0.916504 0.875244 0.831299 1.910163 16.994118 50.579037 26.478908 2.559026 ... 0.237313 0.345205 0.250219 0.78 0.010484 -2.559445 0.000000 1.908045 0.02 NaN
ts_3 0.979492 0.960938 0.929443 0.891602 0.852295 2.321853 26.072435 47.009691 20.246629 3.590710 ... 0.532657 0.331571 0.448411 0.64 0.523415 -0.638090 0.325449 0.515422 0.52 NaN
ts_4 0.976807 0.953125 0.906738 0.856689 0.813721 3.314088 47.505651 36.356375 10.970250 3.584427 ... 9.644766 0.000000 7.529028 0.04 0.240547 -1.173620 0.245583 0.819281 0.22 NaN
ts_5 0.979492 0.957764 0.914795 0.870605 0.833008 2.491926 17.374024 55.482169 22.264773 2.873072 ... 6.303317 0.000000 4.622768 0.04 0.400100 -0.841443 0.257640 0.789540 0.40 NaN

5 rows × 4573 columns

(4) Clean the results table

Not all features returned by pyhctsa will be well-behaved on every dataset. To perform downstream analyses such as classification, we must first filter out special-valued features and NaNs:

[10]:
# Map +/-inf to NaN, then discard every column containing a NaN so that
# only fully finite feature columns survive.
res_clean = res.replace([np.inf, -np.inf], np.nan).dropna(axis=1)
[11]:
res_filtered = res_clean.dropna(axis=1)

Filter out non-numeric

[12]:
# Coerce every column to numeric; non-numeric entries become NaN.
res_num = res_filtered.apply(pd.to_numeric, errors="coerce")
# Keep only columns where coercion introduced no new NaNs (i.e. already fully
# numeric columns). Compare against res_filtered — the frame that was actually
# coerced — not the wider, uncleaned `res`, whose extra columns would
# misalign the boolean mask.
good_cols = res_num.notna().eq(res_filtered.notna()).all(axis=0)
res_num_strict = res_num.loc[:, good_cols]

Filter out zero variance (constant) columns

[13]:
# Identify constant (zero-variance) features — they carry no discriminative
# information and can break downstream scaling.
zero_var_cols = res_num_strict.columns[res_num_strict.var(axis=0, skipna=True) == 0]
zero_var_cols
[13]:
Index(['add_noise_1_quantiles_10.pcrossmean', 'add_noise_1_even_10.pcrossmean',
       'add_noise_1_std1_10.pcrossmean', 'add_noise_1_gaussian.pdec',
       'add_noise_1_gaussian.pcrossmean', 'add_noise_1_kraskov1_4.pcrossmean',
       'translate_shape_circle_1p5_pts.max',
       'translate_shape_circle_1p5_pts.mode',
       'translate_shape_circle_2p5_pts.max',
       'translate_shape_circle_2p5_pts.mode',
       ...
       'ar_fit_1_8_sbc.popt_sbc', 'ar_fit_1_8_sbc.popt_fpe',
       'ar_fit_1_8_sbc.res_siglev', 'walker_prop_0p1.res_runstest',
       'walker_prop_0p5.res_runstest', 'walker_prop_0p9.res_runstest',
       'walker_prop_1p1.res_runstest', 'walker_biasprop_0p5_0p1.res_runstest',
       'walker_biasprop_0p1_0p5.res_runstest',
       'walker_momentum_5.res_runstest'],
      dtype='str', length=103)
[14]:
good_features_df = res_num_strict.drop(columns=zero_var_cols)
[15]:
good_features_df
[15]:
hrv_classic.pnn5 hrv_classic.pnn10 hrv_classic.pnn20 hrv_classic.pnn30 hrv_classic.pnn40 hrv_classic.lfhf hrv_classic.vlf hrv_classic.lf hrv_classic.hf hrv_classic.tri ... surrogate_test_RP_99_o3.o3_p surrogate_test_RP_99_o3.o3_zscore surrogate_test_RP_99_o3.o3_f surrogate_test_RP_99_o3.o3_mediqr surrogate_test_RP_99_o3.o3_prank surrogate_test_RP_99_tc3.tc3_p surrogate_test_RP_99_tc3.tc3_zscore surrogate_test_RP_99_tc3.tc3_f surrogate_test_RP_99_tc3.tc3_mediqr surrogate_test_RP_99_tc3.tc3_prank
instance
ts_1 0.988037 0.975098 0.943359 0.915039 0.887207 1.438589 16.373641 38.574764 26.814297 3.080451 ... 2.658684e-02 -2.217531 0.000000 1.393878 0.02 0.004159 2.865863 0.011524 2.146499 0.06
ts_2 0.979736 0.959473 0.916504 0.875244 0.831299 1.910163 16.994118 50.579037 26.478908 2.559026 ... 8.124138e-01 0.237313 0.345205 0.250219 0.78 0.010484 -2.559445 0.000000 1.908045 0.02
ts_3 0.979492 0.960938 0.929443 0.891602 0.852295 2.321853 26.072435 47.009691 20.246629 3.590710 ... 5.942707e-01 0.532657 0.331571 0.448411 0.64 0.523415 -0.638090 0.325449 0.515422 0.52
ts_4 0.976807 0.953125 0.906738 0.856689 0.813721 3.314088 47.505651 36.356375 10.970250 3.584427 ... 5.172918e-22 9.644766 0.000000 7.529028 0.04 0.240547 -1.173620 0.245583 0.819281 0.22
ts_5 0.979492 0.957764 0.914795 0.870605 0.833008 2.491926 17.374024 55.482169 22.264773 2.873072 ... 2.913416e-10 6.303317 0.000000 4.622768 0.04 0.400100 -0.841443 0.257640 0.789540 0.40
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
ts_496 0.984375 0.970215 0.944824 0.917725 0.889648 0.336630 13.870277 18.356541 54.530277 3.028086 ... 8.572270e-02 -1.718405 0.108641 1.331643 0.10 0.525572 -0.634780 0.314154 0.460967 0.58
ts_497 0.989990 0.981934 0.962158 0.946045 0.925537 0.197599 2.498134 6.366314 32.218319 4.028515 ... 8.857211e-24 -10.053599 0.000000 9.243120 0.02 0.384554 0.869537 0.297431 0.481377 0.48
ts_498 0.987549 0.976074 0.952881 0.928955 0.905029 0.297679 8.875599 16.243974 54.568739 3.149116 ... 5.731781e-11 -6.550570 0.000000 4.600939 0.02 0.039565 2.058260 0.065244 1.510814 0.12
ts_499 0.990723 0.982178 0.962891 0.941406 0.926270 0.487491 6.891783 12.990598 26.647876 3.158828 ... 5.604103e-05 -4.028889 0.000000 3.096691 0.02 0.241729 1.170676 0.277370 0.641031 0.24
ts_500 0.990723 0.978516 0.961914 0.946289 0.928955 0.773377 5.006532 16.483592 21.313790 3.330894 ... 9.240631e-02 -1.682839 0.093285 1.497840 0.16 0.753890 -0.313514 0.369732 0.328641 0.64

500 rows × 4374 columns

[32]:
from pyhctsa.utils import SigmoidScaler
[33]:
ss = SigmoidScaler(do_scale=True)
[34]:
ss_fit = ss.fit(good_features_df)
[35]:
good_features_df_N = ss.transform(good_features_df)

Normalise features to standardise their respective scales

[ ]:
# good_features_df_N = MinMaxScaler().fit_transform(good_features_df)

🖥 (5) Visualisation

To get a sense of how the pyhctsa features capture dimensions along which different types of dynamics can be distinguished, we can use linear dimensionality reduction methods such as PCA, or non-linear methods such as t-SNE to visualise how time-series instances cluster or separate:

[36]:
# Project the normalised feature matrix onto its first two principal components.
pc_fit = PCA(n_components=2).fit(good_features_df_N)
pc = pc_fit.transform(good_features_df_N)
pc_df = pd.DataFrame(pc, columns=['PC1', 'PC2'])
# Attach the class labels for colouring the scatter plot.
pc_df['label'] = df_series['label'].to_numpy()
[37]:
# Non-linear 2-D embedding; random_state is fixed so the layout is reproducible.
tsne_fit = TSNE(n_components=2, perplexity=30, random_state=42).fit_transform(good_features_df_N)
tsne_df = pd.DataFrame(tsne_fit, columns=['tSNE1', 'tSNE2'])
tsne_df['label'] = df_series['label'].to_numpy()
[38]:
fig, ax = plt.subplots(1, 2, figsize=(15, 5))
# Fixed colour per class so hues are comparable across figures.
palette = {
    "epileptogenic": "#d62728",
    "eyesClosed":    "#1f77b4",
    "eyesOpen":      "#2ca02c",
    "hippocampus":   "#9467bd",
    "seizure":       "#ff7f0e",
}
sns.scatterplot(data=pc_df, x='PC1', y='PC2', hue='label', palette=palette, ax=ax[0], s=10, linewidth=0)
# explained_variance_ratio_ gives the FRACTION of total variance per component;
# explained_variance_ (used originally) is in raw variance units, not a
# percentage. Also label the y-axis as PC-2, not PC-1.
ax[0].set_xlabel(f'PC-1 ({np.round(pc_fit.explained_variance_ratio_[0] * 100, 2)} % var)')
ax[0].set_ylabel(f'PC-2 ({np.round(pc_fit.explained_variance_ratio_[1] * 100, 2)} % var)')
ax[0].set_title("PCA")

sns.scatterplot(data=tsne_df, x='tSNE1', y='tSNE2', hue='label', palette=palette, ax=ax[1], s=10, linewidth=0)
ax[1].set_title('tSNE')
ax[1].set_xlabel('tSNE-1')
ax[1].set_ylabel('tSNE-2')
ax[1].legend().remove()  # one legend (on the PCA panel) is enough
plt.show()
../_images/usage_tutorial_bon_eeg_61_0.png

Seizure vs Eyes Open

[39]:
subset_df = pc_df[(pc_df['label'] == 'eyesOpen') | (pc_df['label'] == 'seizure')]
[40]:
palette2 = {
    "eyesOpen":      "#d62728",
    "seizure":       "#1f77b4",
}
sns.scatterplot(data=subset_df, x='PC1', y='PC2', hue='label', palette=palette2, s=20, linewidth=0)
plt.show()
../_images/usage_tutorial_bon_eeg_64_0.png

📂 Classification Problem

Let’s now try to classify between the five groups using a simple linear SVM classifier. Here we will use 10-fold cross validation with a linear SVM and default parameters

[28]:
# create source dataset
df_source = good_features_df
df_source['label'] = df_series['label'].to_numpy()
X = df_source.drop(columns=['label']).to_numpy()
y = df_source['label'].to_numpy() # classes
[29]:
skf = StratifiedKFold(n_splits=10, random_state=0, shuffle=True)
[31]:
# 10-fold cross-validated linear SVM. The scaler is fit on the training split
# only, so no test-set information leaks into the normalisation.
per_fold_accuracy = {}
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    # Number folds from 1 so the printout matches the dict keys below
    # (the original printed "Fold 0" but stored "fold 1").
    print(f"Fold {i + 1}:")
    # construct training set and test set
    X_train, y_train = X[train_index, :], y[train_index]
    X_test, y_test = X[test_index, :], y[test_index]
    # fit the scaler to the training set
    # NOTE(review): do_scale=False here vs do_scale=True in the earlier
    # visualisation cell — confirm this is intentional.
    ss = SigmoidScaler(do_scale=False).fit(X_train)
    X_train_N = ss.transform(X_train)
    X_test_N = ss.transform(X_test)
    # fit the linear SVM
    sv = LinearSVC(max_iter=10_000).fit(X_train_N, y_train)
    # because classes are balanced, accuracy equals balanced accuracy
    score = sv.score(X_test_N, y_test)
    print(f'Classification accuracy: {score}')
    per_fold_accuracy[f'fold {i + 1}'] = score
# report the mean accuracy across the 10 folds
mean_fold_acc = np.mean(list(per_fold_accuracy.values()))
print(f'Mean (across folds) accuracy (5-class) using 10-fold svm_linear classification: {mean_fold_acc*100}%')
Fold 0:
Classifcation accuracy: 0.88
Fold 1:
Classifcation accuracy: 0.92
Fold 2:
Classifcation accuracy: 0.96
Fold 3:
Classifcation accuracy: 0.94
Fold 4:
Classifcation accuracy: 0.98
Fold 5:
Classifcation accuracy: 0.88
Fold 6:
Classifcation accuracy: 0.96
Fold 7:
Classifcation accuracy: 0.96
Fold 8:
Classifcation accuracy: 0.88
Fold 9:
Classifcation accuracy: 0.94
Mean (across folds) accuracy (5-class) using 10-fold svm_linear classification: 93.0%
[ ]: