# Source code for app.mimic3bp_converter

"""
File: app/mimic3bp_converter.py
Project: 22HLT01 QUMPHY
Contact: nando.hegemann@ptb.de
Gitlab: https://gitlab.com/qumphy
Description: Convert MIMIC-III Blood Pressure Database into PPG training chunks.
SPDX-License-Identifier: EUPL-1.2
"""

import os
import h5py
import numpy as np
from sklearn.model_selection import StratifiedKFold
from tqdm.auto import tqdm


def map_labels_to_classes(blood_pressure: np.ndarray) -> np.ndarray:
    """Map systolic/diastolic blood pressure to hypertension classes.

    The hypertension classes are given by the following table

    +----------------------+-------+-----------------+--------+------------------+
    | Category             | Value | Systolic (mmHg) | and/or | Diastolic (mmHg) |
    +======================+=======+=================+========+==================+
    | normal               | 0     | < 120           | and    | < 80             |
    +----------------------+-------+-----------------+--------+------------------+
    | elevated             | 1     | 120 - 129       | and    | < 80             |
    +----------------------+-------+-----------------+--------+------------------+
    | Hypertension Stage 1 | 2     | 130 - 139       | or     | 80 - 89          |
    +----------------------+-------+-----------------+--------+------------------+
    | Hypertension Stage 2 | 3     | 140 - 180       | or     | 90 - 120         |
    +----------------------+-------+-----------------+--------+------------------+
    | Hypertensive Crisis  | 4     | > 180           | and/or | > 120            |
    +----------------------+-------+-----------------+--------+------------------+

    Every sample receives the most severe class for which at least one of its
    two values meets the criterion. Bands are delimited by their lower bound
    only, so non-integer readings (e.g. a systolic value of 129.5) and the
    boundary value 180 fall into the band below the next threshold instead of
    silently ending up in class 0.

    .. note::
        Classes separation is done according to `Hypertension classes <https://www.heart.org/en/health-topics/high-blood-pressure/understanding-blood-pressure-readings>`_.

    Parameters
    ----------
    blood_pressure : np.ndarray
        Two-dimensional array with one sample per row; column 0 holds the
        systolic and column 1 the diastolic blood pressure value.

    Returns
    -------
    :
        One-dimensional integer array containing the respective hypertension
        class (0 to 4) of every row.
    """
    assert blood_pressure.ndim == 2
    systolic = blood_pressure[:, 0]
    diastolic = blood_pressure[:, 1]

    # Start with everything "normal" and overwrite in order of increasing
    # severity, so that the most severe matching class is the one that sticks.
    classes = np.zeros(blood_pressure.shape[0], dtype=np.int_)
    classes[systolic >= 120] = 1
    classes[(systolic >= 130) | (diastolic >= 80)] = 2
    classes[(systolic >= 140) | (diastolic >= 90)] = 3
    classes[(systolic > 180) | (diastolic > 120)] = 4
    return classes


def _save_split(
    save_path: str,
    split: str,
    suffix: str,
    signal: np.ndarray,
    subject_idx: np.ndarray,
    label: np.ndarray,
) -> None:
    """Write the signal, subject-id and label arrays of one split into ``save_path``."""
    for kind, values in (
        ("signal", signal),
        ("subject_id", subject_idx),
        ("label", label),
    ):
        np.save(os.path.join(save_path, f"{split}_{kind}{suffix}.npy"), values)


def convert(file_name: str, save_path: str, folds: int = 1) -> str:
    """Chunk data into multiple npy files.

    The HDF5 datasets ``/signal``, ``/label`` and ``/subject_idx`` are read
    into memory and split subject-wise into ``folds`` parts, so that all
    samples of one subject end up in the same part. The split is stratified
    by the majority hypertension class of each subject and uses a fixed
    random seed. The last fold is stored as ``test_*.npy``, the penultimate
    fold as ``validate_*.npy`` and every remaining fold as
    ``train_*_XX.npy`` where ``XX`` is the one-based, zero-padded fold number.

    Parameters
    ----------
    file_name : str
        Path to file.
    save_path : str
        Save directory. It is created if it does not exist.
    folds : int, optional
        Number of different files the file is split into. With ``folds=1``
        all samples are written to the test files.

    Returns
    -------
    :
        Save location.

    Raises
    ------
    KeyError
        If one of the expected datasets is missing from the HDF5 file.
    """
    assert folds > 0
    os.makedirs(save_path, exist_ok=True)

    # Open read-only and copy everything into memory inside the context
    # manager so the file handle is released before the expensive splitting.
    with h5py.File(file_name, "r") as data:
        signal = np.array(data["/signal"])
        label = np.array(data["/label"])
        subject_idx = np.squeeze(np.array(data["/subject_idx"]))

    # NOTE: normalization of the signal values to [0, 1] is not applied here:
    # signal = (signal - np.min(signal)) / (np.max(signal) - np.min(signal))

    label_classes = map_labels_to_classes(label)
    unique_subject_idx = np.unique(subject_idx)
    # One representative class per subject, used to stratify the subject split.
    majority_label = [
        np.argmax(np.bincount(label_classes[subject_idx == sid]))
        for sid in unique_subject_idx
    ]

    if folds == 1:
        # StratifiedKFold requires at least two splits; a single fold simply
        # contains every subject.
        subject_folds = [unique_subject_idx]
    else:
        skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
        subject_folds = [
            unique_subject_idx[tst]
            for _, tst in skf.split(majority_label, majority_label)
        ]

    # Sample indices per fold, grouped by subject in the order of the fold.
    folds_samples = [
        np.concatenate([np.where(subject_idx == idx)[0] for idx in fold])
        for fold in subject_folds
    ]

    last = len(folds_samples) - 1
    for j, smpl in tqdm(enumerate(folds_samples), total=len(folds_samples)):
        if j == last:
            # last fold is test fold
            split, suffix = "test", ""
        elif j == last - 1:
            # penultimate fold is validation fold
            split, suffix = "validate", ""
        else:
            # rest is training
            split, suffix = "train", f"_{j + 1:02d}"
        _save_split(
            save_path, split, suffix, signal[smpl], subject_idx[smpl], label[smpl]
        )
    return save_path


if __name__ == "__main__":
    # Script entry point: split the MIMIC-III PPG dataset into 30 subject-wise
    # folds and report the directory the resulting npy files were written to.
    source_file = "../data/mimic3bp/MIMIC-III_ppg_dataset.h5"
    target_dir = "../data/mimic3bp/formatted/"

    print("split data (this may take some time - and RAM)")
    save_path = convert(source_file, target_dir, folds=30)
    print(f"save files to: {save_path}")