#!/usr/bin/env python
# -*- coding: utf-8 -*-
import numpy as np
import logging
import requests, zipfile, io, os
logging.basicConfig(level=logging.INFO)
# URL and Root path of the original data folder
URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00240/UCI%20HAR%20Dataset.zip"
DATASET_PATH = "UCI HAR Dataset"
TRAIN_DATA = DATASET_PATH + "/train/X_train.txt"
TRAIN_LABELS = DATASET_PATH + "/train/y_train.txt"
TEST_DATA = DATASET_PATH + "/test/X_test.txt"
TEST_LABELS = DATASET_PATH + "/test/y_test.txt"
def download_dataset():
    """
    Download the raw dataset from URL and unzip it into the working directory.

    Skips the download when the dataset folder already exists and is
    non-empty. On an HTTP failure the error is logged and the function
    returns without raising (best-effort behavior).
    """
    # Download only when the dataset folder is missing or empty.
    if not os.path.isdir(DATASET_PATH) or len(os.listdir(DATASET_PATH)) == 0:
        logging.info("Dataset not locally available, downloading...")
        r = requests.get(URL)
        if not r.ok:
            # Log at ERROR level: a failed download means the dataset is absent.
            logging.error("Error while downloading: %s", r.status_code)
            return
        logging.info("Extracting raw data...")
        # The archive fits in memory; use a context manager so the
        # ZipFile handle is always released.
        with zipfile.ZipFile(io.BytesIO(r.content)) as z:
            z.extractall()
    else:
        logging.info("Dataset already available, skipping download.")
def transform_to_text_labels(labels, activity_labels_path=None):
    """
    Transform numerical labels to corresponding text labels.

    Parameters
    ----------
    labels : array
        an array of numerical labels
    activity_labels_path : str, optional
        location of the mapping file, one "<numeric id> <text label>"
        pair per line; defaults to the activity_labels.txt file of the
        downloaded UCI HAR dataset

    Returns
    -------
    labels : array
        same-shaped array with corresponding text labels
    """
    if activity_labels_path is None:
        activity_labels_path = DATASET_PATH + "/activity_labels.txt"
    # Compare as strings so integer input labels match the file's tokens.
    labels = labels.astype(str)
    with open(activity_labels_path, "r") as f:
        for row in f:
            # Split only on the first space so text labels that contain
            # spaces stay intact.
            num_label, text_label = row.strip().split(" ", 1)
            labels = np.where(labels == num_label, text_label, labels)
    return labels
def get_dataset_split(data_path, labels_path):
    """
    Get data and ground-truth of the selected split.

    Parameters
    ----------
    data_path : str
        data file location, one whitespace-separated sample per line
    labels_path : str
        labels file location, one integer label per line

    Returns
    -------
    data : array
        all the data of the split as strings, one row per sample
    labels : array
        all the corresponding integer labels (ground-truth)
    """
    # Load data. Argument-less str.split() splits on any run of
    # whitespace and strips the line, so the variable-width space
    # padding used by the UCI files cannot produce empty fields
    # (the original replace(" ", " ") was a no-op).
    with open(data_path, "r") as f:
        data = np.array([row.split() for row in f])
    # Load labels as integers.
    with open(labels_path, "r") as f:
        labels = np.array([row.strip() for row in f], dtype=int)
    return data, labels
def load(
    standardized=False,
    printSize=False,
    train_data_path=None,
    train_labels_path=None,
    test_data_path=None,
    test_labels_path=None,
):
    """
    Get the dataset and the corresponding labels split
    into a training and a testing set.

    Parameters
    ----------
    standardized : bool
        standardize the data before returning them or not
    printSize : bool
        log the number of train/test samples after loading
    train_data_path : str, optional
        custom train data file; used only together with train_labels_path
    train_labels_path : str, optional
        custom train labels file
    test_data_path : str, optional
        custom test data file; used only together with test_labels_path
    test_labels_path : str, optional
        custom test labels file

    Returns
    -------
    train_data : array
    train_labels : array
    test_data : array
    test_labels : array
    """
    logging.info("Starting dataset loading...")
    # Make sure the default dataset is on disk (no-op when already present).
    download_dataset()
    # Training split: custom files take precedence over the UCI defaults.
    if train_data_path and train_labels_path:
        train_data, train_labels = get_dataset_split(train_data_path, train_labels_path)
        logging.info("Custom Train Dataset loaded.")
    else:
        train_data, train_labels = get_dataset_split(TRAIN_DATA, TRAIN_LABELS)
    # Testing split: same precedence rule.
    if test_data_path and test_labels_path:
        test_data, test_labels = get_dataset_split(test_data_path, test_labels_path)
        logging.info("Custom Test Dataset loaded.")
    else:
        test_data, test_labels = get_dataset_split(TEST_DATA, TEST_LABELS)
    logging.info("Dataset ready.")
    if printSize:
        logging.info("---Train samples: %s", train_data.shape[0])
        logging.info("---Test samples: %s", test_data.shape[0])
    # Standardization if required; the import stays local so the
    # preprocessor module is only needed when standardization is requested.
    if standardized:
        from .preprocessor import standardize
        train_data, test_data = standardize(train_data, test_data)
    return train_data, train_labels, test_data, test_labels