# Source code for app.mimic3bp_converter

"""
File: app/mimic3bp_converter.py
Project: 22HLT01 QUMPHY
Contact: nando.hegemann@ptb.de
Gitlab: https://gitlab.com/qumphy
Description: Convert MIMIC-III Blood Pressure Database into PPG training chunks.
SPDX-License-Identifier: EUPL-1.2
"""

import os
import h5py
import numpy as np
from sklearn.model_selection import StratifiedKFold
from tqdm.auto import tqdm



# [docs]
def map_labels_to_classes(blood_pressure: np.ndarray) -> np.ndarray:
    """Map systolic/diastolic blood pressure to hypertension classes.

    The hypertension classes are given by the following table. Whenever a
    sample matches more than one row, the most severe (highest) class wins.

    +----------------------+-------+-----------------+--------+------------------+
    | Category             | Value | Systolic (mmHg) | and/or | Diastolic (mmHg) |
    +======================+=======+=================+========+==================+
    | normal               | 0     | < 120           | and    | < 80             |
    +----------------------+-------+-----------------+--------+------------------+
    | elevated             | 1     | 120 to < 130    | and    | < 80             |
    +----------------------+-------+-----------------+--------+------------------+
    | Hypertension Stage 1 | 2     | 130 to < 140    | or     | 80 to < 90       |
    +----------------------+-------+-----------------+--------+------------------+
    | Hypertension Stage 2 | 3     | 140 to 180      | or     | 90 to 120        |
    +----------------------+-------+-----------------+--------+------------------+
    | Hypertensive Crisis  | 4     | > 180           | and/or | > 120            |
    +----------------------+-------+-----------------+--------+------------------+

    .. note::
        Classes separation is done according to `Hypertension classes <https://www.heart.org/en/health-topics/high-blood-pressure/understanding-blood-pressure-readings>`_.

    The ranges are contiguous, so non-integer readings (e.g. a systolic value
    of 129.5) and the boundary values themselves (e.g. a systolic value of
    exactly 180) always fall into exactly one class. Rows containing NaN fail
    every comparison and therefore end up in class 0.

    Parameters
    ----------
    blood_pressure : np.ndarray
        Two-dimensional array with one sample per row; column 0 holds the
        systolic and column 1 the diastolic blood pressure value.

    Returns
    -------
    :
        One-dimensional integer array containing the respective hypertension
        class (0 to 4) of every row.

    Raises
    ------
    ValueError
        If ``blood_pressure`` is not two-dimensional.
    """
    if blood_pressure.ndim != 2:
        raise ValueError(
            f"blood_pressure must be a 2-dimensional array, got ndim={blood_pressure.ndim}"
        )

    systolic = blood_pressure[:, 0]
    diastolic = blood_pressure[:, 1]

    # Start with everything "normal" and overwrite in order of increasing
    # severity; later assignments take precedence, so each sample ends up in
    # the most severe class whose threshold it reaches.
    classes = np.zeros(systolic.shape[0], dtype=int)
    # Elevated needs diastolic < 80; rows with diastolic >= 80 are promoted to
    # stage 1 (or higher) by the assignments below.
    classes[systolic >= 120] = 1
    classes[(systolic >= 130) | (diastolic >= 80)] = 2
    classes[(systolic >= 140) | (diastolic >= 90)] = 3
    classes[(systolic > 180) | (diastolic > 120)] = 4
    return classes




# [docs]
def convert(file_name: str, save_path: str, folds: int = 1) -> str:
    """Chunk data into multiple npy files.

    The datasets ``/signal``, ``/label`` and ``/subject_idx`` are read from
    the HDF5 file and split subject-wise into ``folds`` parts, i.e. all
    samples of one subject end up in the same part. The split is stratified
    by the most frequent hypertension class of each subject (see
    :func:`map_labels_to_classes`) and uses a fixed random seed.

    The last part is written as ``test_*.npy``, the penultimate part as
    ``validate_*.npy`` and every remaining part as ``train_*_XX.npy`` where
    ``XX`` is the 1-based, zero-padded part number. For each part the three
    arrays ``signal``, ``subject_id`` and ``label`` are stored.

    Parameters
    ----------
    file_name : str
        Path to file.
    save_path : str
        Save directory. It is created if it does not exist.
    folds : int, optional
        Number of different files the file is split into. With ``folds=1``
        all samples are written to the single ``test_*.npy`` part.

    Returns
    -------
    :
        Save location.

    Raises
    ------
    ValueError
        If ``folds`` is smaller than one.
    KeyError
        If one of the required datasets is missing in the file.
    """
    if folds < 1:
        raise ValueError(f"folds must be a positive integer, got {folds}")
    os.makedirs(save_path, exist_ok=True)

    # Load everything into memory and release the file handle immediately.
    with h5py.File(file_name, "r") as data:
        signal = np.array(data["/signal"])
        label = np.array(data["/label"])
        subject_idx = np.squeeze(np.array(data["/subject_idx"]))

    # NOTE: signals are stored as read from the file; a min-max scaling to
    # [0, 1] is not applied here.

    unique_subject_idx = np.unique(subject_idx)
    if folds == 1:
        # StratifiedKFold needs at least two splits; one fold is simply the
        # complete set of subjects.
        subject_folds = [unique_subject_idx]
    else:
        label_classes = map_labels_to_classes(label)
        # One stratification label per subject: its most frequent class.
        majority_label = [
            np.argmax(np.bincount(label_classes[subject_idx == sid]))
            for sid in unique_subject_idx
        ]
        skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
        subject_folds = [
            unique_subject_idx[tst]
            for _, tst in skf.split(majority_label, majority_label)
        ]

    n_folds = len(subject_folds)
    for j, fold in tqdm(enumerate(subject_folds), total=n_folds):
        # Sample indices of the fold, grouped subject by subject.
        smpl = np.concatenate([np.where(subject_idx == idx)[0] for idx in fold])

        if j == n_folds - 1:  # last fold is test fold
            prefix, suffix = "test", ""
        elif j == n_folds - 2:  # penultimate fold is validation fold
            prefix, suffix = "validate", ""
        else:  # rest is training
            prefix, suffix = "train", f"_{j + 1:02d}"

        # os.path.join keeps the output inside save_path even when the
        # directory is given without a trailing path separator.
        np.save(os.path.join(save_path, f"{prefix}_signal{suffix}.npy"), signal[smpl])
        np.save(
            os.path.join(save_path, f"{prefix}_subject_id{suffix}.npy"),
            subject_idx[smpl],
        )
        np.save(os.path.join(save_path, f"{prefix}_label{suffix}.npy"), label[smpl])
    return save_path



if __name__ == "__main__":
    # Script entry point: split the MIMIC-III PPG dataset into 30 subject-wise
    # parts below the "formatted" directory (paths are relative to the
    # current working directory).
    source_file = "../data/mimic3bp/MIMIC-III_ppg_dataset.h5"
    target_dir = "../data/mimic3bp/formatted/"

    print("split data (this may take some time - and RAM)")
    save_path = convert(source_file, target_dir, folds=30)
    print(f"save files to: {save_path}")