"""
File: app/mimic3bp_converter.py
Project: 22HLT01 QUMPHY
Contact: nando.hegemann@ptb.de
Gitlab: https://gitlab.com/qumphy
Description: Convert MIMIC-III Blood Pressure Database into PPG training chunks.
SPDX-License-Identifier: EUPL-1.2
"""
import os
import h5py
import numpy as np
from sklearn.model_selection import StratifiedKFold
from tqdm.auto import tqdm
[docs]
def map_labels_to_classes(blood_pressure: np.ndarray) -> np.ndarray:
    """Map systolic/diastolic blood pressure to hypertension classes.

    The hypertension classes are given by the following table

    +----------------------+-------+-----------------+--------+------------------+
    | Category             | Value | Systolic (mmHg) | and/or | Diastolic (mmHg) |
    +======================+=======+=================+========+==================+
    | normal               | 0     | < 120           | and    | < 80             |
    +----------------------+-------+-----------------+--------+------------------+
    | elevated             | 1     | 120 - 129       | and    | < 80             |
    +----------------------+-------+-----------------+--------+------------------+
    | Hypertension Stage 1 | 2     | 130 - 139       | or     | 80 - 89          |
    +----------------------+-------+-----------------+--------+------------------+
    | Hypertension Stage 2 | 3     | >= 140          | or     | >= 90            |
    +----------------------+-------+-----------------+--------+------------------+
    | Hypertensive Crisis  | 4     | > 180           | and/or | > 120            |
    +----------------------+-------+-----------------+--------+------------------+

    Every sample is assigned the most severe category it qualifies for.
    The ranges are contiguous, i.e. an upper bound such as ``129`` means
    "strictly below 130", so non-integer readings (e.g. ``129.5``) and a
    systolic value of exactly ``180`` are classified as well instead of
    falling between two categories.

    .. note::
        Classes separation is done according to `Hypertension classes <https://www.heart.org/en/health-topics/high-blood-pressure/understanding-blood-pressure-readings>`_.

    .. note::
        Comparisons with NaN are always false, hence samples containing NaN
        values that match no threshold are mapped to class ``0``.

    Parameters
    ----------
    blood_pressure : np.ndarray
        Array of shape ``(n_samples, 2)`` with the systolic values in the
        first and the diastolic values in the second column.

    Returns
    -------
    :
        Integer array of shape ``(n_samples,)`` containing the respective
        hypertension classes.

    Raises
    ------
    ValueError
        If ``blood_pressure`` is not two-dimensional or has fewer than two
        columns.
    """
    blood_pressure = np.asarray(blood_pressure)
    if blood_pressure.ndim != 2 or blood_pressure.shape[1] < 2:
        raise ValueError(
            "blood_pressure must be a 2D array with systolic/diastolic columns, "
            f"got shape {blood_pressure.shape}"
        )
    systolic = blood_pressure[:, 0]
    diastolic = blood_pressure[:, 1]

    # Start with everything "normal" and overwrite in order of increasing
    # severity, so the most severe matching category always wins and no value
    # can slip through a gap between two categories.
    classes = np.zeros(blood_pressure.shape[0], dtype=int)
    # "elevated" additionally requires diastolic < 80; samples violating that
    # are promoted to at least stage 1 by the next assignment.
    classes[systolic >= 120] = 1
    classes[(systolic >= 130) | (diastolic >= 80)] = 2
    classes[(systolic >= 140) | (diastolic >= 90)] = 3
    classes[(systolic > 180) | (diastolic > 120)] = 4
    return classes
[docs]
def convert(file_name: str, save_path: str, folds: int = 1) -> str:
    """Chunk data into multiple npy files.

    The datasets ``/signal``, ``/label`` and ``/subject_idx`` are read from
    the HDF5 file and split subject-wise, i.e. all samples of one subject end
    up in the same fold. Folds are stratified by the majority hypertension
    class of each subject. The last fold is written as test set, the
    penultimate fold as validation set and every remaining fold as a numbered
    training chunk. Signal values are written as read from the file (no
    normalization is applied).

    Parameters
    ----------
    file_name : str
        Path to file.
    save_path : str
        Save directory. It is created if it does not exist.
    folds : int, optional
        Number of different files the file is split into. With ``folds=1``
        all samples are written to the test files, with ``folds=2`` only
        validation and test files are written.

    Returns
    -------
    :
        Save location.

    Raises
    ------
    ValueError
        If ``folds`` is smaller than one.
    KeyError
        If one of the required datasets is missing in the file.
    """
    # Validate before touching the file system.
    if folds < 1:
        raise ValueError(f"folds must be a positive integer, got {folds}")
    os.makedirs(save_path, exist_ok=True)

    # Load everything into memory and release the file handle immediately.
    with h5py.File(file_name, "r") as data:
        signal = data["/signal"][()]
        label = data["/label"][()]
        subject_idx = np.squeeze(data["/subject_idx"][()])

    # One representative (majority) class per subject, used for stratification.
    label_classes = map_labels_to_classes(label)
    unique_subject_idx = np.unique(subject_idx)
    majority_label = [
        np.argmax(np.bincount(label_classes[subject_idx == sid]))
        for sid in unique_subject_idx
    ]

    if folds == 1:
        # StratifiedKFold requires at least two splits; a single fold simply
        # contains every subject.
        subject_folds = [unique_subject_idx]
    else:
        skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
        subject_folds = [
            unique_subject_idx[tst]
            for _, tst in skf.split(majority_label, majority_label)
        ]

    # Translate the subject ids of each fold into sample (row) indices.
    folds_samples = [
        np.concatenate([np.where(subject_idx == idx)[0] for idx in fold])
        for fold in subject_folds
    ]

    n_folds = len(folds_samples)
    arrays = (("signal", signal), ("subject_id", subject_idx), ("label", label))
    for j, smpl in tqdm(enumerate(folds_samples), total=n_folds):
        if j == n_folds - 1:  # last fold is test fold
            prefix, suffix = "test", ""
        elif j == n_folds - 2:  # penultimate fold is validation fold
            prefix, suffix = "validate", ""
        else:  # rest is training
            prefix, suffix = "train", f"_{j + 1:02d}"
        for kind, values in arrays:
            # os.path.join keeps the files inside save_path even if the
            # directory is given without a trailing separator.
            target = os.path.join(save_path, f"{prefix}_{kind}{suffix}.npy")
            np.save(target, values[smpl])
    return save_path
if __name__ == "__main__":
    # Script entry point: split the MIMIC-III PPG dataset into 30 subject-wise
    # folds and report where the resulting npy files were written.
    print("split data (this may take some time - and RAM)")
    source_file = "../data/mimic3bp/MIMIC-III_ppg_dataset.h5"
    target_dir = "../data/mimic3bp/formatted/"
    output_location = convert(source_file, target_dir, folds=30)
    print(f"save files to: {output_location}")