Source code for rrgp.database

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
import logging
import requests, zipfile, io, os

# Configure the root logger so that INFO records (and above) are emitted
logging.basicConfig(level=logging.INFO)

# Remote archive of the raw dataset and root folder of its local copy
URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00240/UCI%20HAR%20Dataset.zip"
DATASET_PATH = "UCI HAR Dataset"

# Data and label files of the training split
TRAIN_DATA = f"{DATASET_PATH}/train/X_train.txt"
TRAIN_LABELS = f"{DATASET_PATH}/train/y_train.txt"

# Data and label files of the testing split
TEST_DATA = f"{DATASET_PATH}/test/X_test.txt"
TEST_LABELS = f"{DATASET_PATH}/test/y_test.txt"


def download_dataset():
    """
    Download raw dataset from url and unzip it

    Nothing is downloaded when the ``DATASET_PATH`` folder already exists and
    is not empty. Otherwise the archive located at ``URL`` is fetched and
    extracted into the current working directory. When the server answers
    with an error status, the status code is logged and the function returns
    without extracting anything.

    Raises
    ------
    requests.exceptions.RequestException
        if the request cannot be completed (e.g. connection error, timeout)
    """
    # An existing but empty folder is treated like a missing one
    if os.path.isdir(DATASET_PATH) and os.listdir(DATASET_PATH):
        logging.info("Dataset already available, skipping download.")
        return

    logging.info("Dataset not locally available, downloading...")
    # (connect, read) timeouts: without them a stalled server blocks forever
    r = requests.get(URL, timeout=(10, 60))
    if not r.ok:
        logging.error(f"Error while downloading: {r.status_code}")
        return

    logging.info("Extracting raw data...")
    # Context manager guarantees the archive is closed once extracted
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        z.extractall()
def transform_to_text_labels(labels):
    """
    Transform numerical labels to corresponding text

    The mapping is read from the ``activity_labels.txt`` file located in
    ``DATASET_PATH``: every non-blank row holds a numerical label followed by
    its text label, separated by whitespace. Labels that do not appear in
    that file are returned in their string form.

    Parameters
    ----------
    labels : array
        an array of numerical labels

    Returns
    -------
    labels : array
        array of strings with the corresponding text labels

    Raises
    ------
    ValueError
        if a non-blank row of the mapping file holds a single field
    """
    numeric = np.asarray(labels).astype(str)
    text = numeric
    with open(DATASET_PATH + "/activity_labels.txt", "r") as f:
        for row in f:
            row = row.strip()
            if not row:
                # Tolerate blank rows (e.g. a trailing newline at end of file)
                continue
            num_label, text_label = row.split(None, 1)
            # Always match against the untouched numerical strings so that an
            # already substituted text label can never be replaced again
            text = np.where(numeric == num_label, text_label.strip(), text)
    return text
def get_dataset_split(data_path, labels_path):
    """
    Get data and ground-truth of selected split

    Blank rows are ignored in both files.

    Parameters
    ----------
    data_path : str
        data file location; one sample per row, values separated by any
        amount of whitespace
    labels_path : str
        labels file location; one integer label per row

    Returns
    -------
    data : array
        all the data of the split; values are kept as the strings read from
        the file, no numerical conversion is done here
    labels: array
        all the corresponding labels (ground-truth), as integers

    Raises
    ------
    ValueError
        if a non-blank row of the labels file is not an integer
    """
    # Load data: split() without argument collapses runs of whitespace, so
    # columns separated by more than one space do not produce empty fields
    with open(data_path, "r") as f:
        data = np.array([row.split() for row in f if row.strip()])
    # Load labels
    with open(labels_path, "r") as f:
        labels = np.array([int(row) for row in f if row.strip()], dtype=int)
    return data, labels
def load(
    standardized=False,
    printSize=False,
    train_data_path=None,
    train_labels_path=None,
    test_data_path=None,
    test_labels_path=None,
):
    """
    Get the dataset and the corresponding labels split into a training and a testing set

    A split is read from custom files only when both its data path and its
    labels path are given; otherwise the default dataset files are used. The
    default dataset is downloaded only when at least one split needs it.

    Parameters
    ----------
    standardized : bool
        standardize the data before returning them or not
    printSize : bool
        log the number of samples of each split or not
    train_data_path : str, optional
        location of a custom training data file
    train_labels_path : str, optional
        location of a custom training labels file
    test_data_path : str, optional
        location of a custom testing data file
    test_labels_path : str, optional
        location of a custom testing labels file

    Returns
    -------
    train_data : array
    train_labels: array
    test_data : array
    test_labels: array
    """
    logging.info("Starting dataset loading...")

    custom_train = bool(train_data_path and train_labels_path)
    custom_test = bool(test_data_path and test_labels_path)

    # A half-specified pair cannot be used; say so instead of silently
    # falling back to the default files
    if not custom_train and (train_data_path or train_labels_path):
        logging.warning(
            "Both train_data_path and train_labels_path are required, "
            "using default Train Dataset."
        )
    if not custom_test and (test_data_path or test_labels_path):
        logging.warning(
            "Both test_data_path and test_labels_path are required, "
            "using default Test Dataset."
        )

    # Skip the download when every split comes from custom files
    if not (custom_train and custom_test):
        download_dataset()

    # Get training data
    if custom_train:
        train_data, train_labels = get_dataset_split(train_data_path, train_labels_path)
        logging.info("Custom Train Dataset loaded.")
    else:
        train_data, train_labels = get_dataset_split(TRAIN_DATA, TRAIN_LABELS)

    # Get testing data
    if custom_test:
        test_data, test_labels = get_dataset_split(test_data_path, test_labels_path)
        logging.info("Custom Test Dataset loaded.")
    else:
        test_data, test_labels = get_dataset_split(TEST_DATA, TEST_LABELS)

    logging.info("Dataset ready.")
    if printSize:
        logging.info(f"---Train samples: {train_data.shape[0]}")
        logging.info(f"---Test samples: {test_data.shape[0]}")

    # Standardization if required
    if standardized:
        # Imported here, as in the original code, so the preprocessor module
        # is only loaded when standardization is requested
        from .preprocessor import standardize

        train_data, test_data = standardize(train_data, test_data)

    return train_data, train_labels, test_data, test_labels