Permalink
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
cse5819-FinalProject/phm08.py
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
113 lines (96 sloc)
3.63 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
''' | |
PHM08.py | |
Utilities for loading PHM08 for use with classifiers. | |
''' | |
import numpy as np | |
import math | |
from random import shuffle | |
# load_raw_data() | |
# | |
# Load the raw dataset from file | |
# Returns (training_data_raw, test_data_raw) | |
def load_raw_data(train_path="PHM08/train.txt", test_path="PHM08/test.txt"):
    """Load the raw PHM08 training and test tables.

    Args:
        train_path: Source of the training table; anything ``np.loadtxt``
            accepts as its first argument. Defaults to ``"PHM08/train.txt"``.
        test_path: Source of the test table; anything ``np.loadtxt``
            accepts as its first argument. Defaults to ``"PHM08/test.txt"``.

    Returns:
        Tuple ``(train_data_raw, test_data_raw)`` holding the arrays that
        ``np.loadtxt`` produced for each source, unmodified.
    """
    train_data_raw = np.loadtxt(train_path)
    test_data_raw = np.loadtxt(test_path)
    return (train_data_raw, test_data_raw)
# process_raw_data(x) | |
# | |
# In order to use PHM08 for classification, | |
# we do two things: | |
# | |
# 1) Strip unit number feature. | |
# This was for training as time series, | |
# However, we want each data point to be "independent." | |
# | |
# 2) Generate Labels | |
# It is assumed that the last data point of a given unit's series
# is that unit's last run, i.e. the unit has reached end-of-life.
def process_raw_data(data):
    """Split a raw table into features and end-of-series labels.

    Column 0 is read as the unit number. A row is labelled 1 when the next
    row has a different unit number, or when it is the final row of the
    table; every other row is labelled 0.

    Args:
        data: 2-D array whose first column is the unit number. A non-empty
            1-D array is treated as a single row, because ``np.loadtxt``
            returns a 1-D array for a file that contains only one row.

    Returns:
        Tuple ``(x, y)`` where ``x`` is ``data`` without its first column
        and ``y`` is a list of ``int`` labels, one per row.
    """
    data = np.asarray(data)
    if data.ndim == 1 and data.size > 0:
        data = data[np.newaxis, :]
    (m, _) = data.shape

    units = data[:, 0]
    # Start from "every row ends a series" so the final row keeps label 1,
    # then overwrite all earlier rows with the neighbour comparison.
    is_last = np.ones(m, dtype=bool)
    is_last[:-1] = units[1:] != units[:-1]

    y = is_last.astype(int).tolist()
    return (data[:, 1:], y)
# dataset() | |
# | |
# Return processed PHM08 data. | |
def dataset():
    """Return processed PHM08 data.

    Loads both raw tables with ``load_raw_data`` and runs each through
    ``process_raw_data``.

    Returns:
        Tuple ``(train_x, train_y, test_x, test_y)``.
    """
    raw_tables = load_raw_data()
    train_pair = process_raw_data(raw_tables[0])
    test_pair = process_raw_data(raw_tables[1])
    return (train_pair[0], train_pair[1], test_pair[0], test_pair[1])
class DataSplits:
    """Per-unit series from a PHM08 table, shuffled and split for cross validation.

    The table is cut into one series per run of consecutive rows sharing the
    same value in column 0 (the unit number). Each entry of ``self.series``
    is a tuple ``(x, y)``: ``x`` holds that run's rows without the unit
    column, and ``y`` is a list of labels that is 0 everywhere except for a
    1 on the run's last row.

    After shuffling, the first ``n_train`` series are reserved for training
    and cross validation, and every series after them is held out for
    testing. The original comments state that the training file holds 218
    series, so the defaults give 200 training series (ten folds of 20) and
    18 test series.
    """

    # Class-level defaults so the split methods also work on an instance
    # whose ``series`` was filled in without going through ``__init__``.
    n_train = 200
    fold_size = 20

    def __init__(self, path="PHM08/train.txt", n_train=200, fold_size=20, seed=None):
        """Load the table, cut it into per-unit series and shuffle them.

        Args:
            path: Source of the table; anything ``np.loadtxt`` accepts as
                its first argument.
            n_train: Number of shuffled series reserved for training.
            fold_size: Number of series in each validation fold.
            seed: Optional seed for a private random generator, making the
                shuffle reproducible. With ``None`` the module-level
                ``shuffle`` (global random state) is used.
        """
        # ndmin=2 keeps a file with a single row two-dimensional.
        data = np.loadtxt(path, ndmin=2)
        self.n_train = n_train
        self.fold_size = fold_size

        units = data[:, 0]
        # Row indices at which the unit number differs from the previous row.
        starts = np.flatnonzero(units[1:] != units[:-1]) + 1
        self.series = [
            (x, [0] * (len(x) - 1) + [1])
            for x in np.split(data[:, 1:], starts)
            if len(x)  # an empty table yields one empty chunk; drop it
        ]

        if seed is None:
            shuffle(self.series)
        else:
            from random import Random
            Random(seed).shuffle(self.series)

    @staticmethod
    def _stack(chosen, what):
        """Concatenate the ``(x, y)`` pairs in ``chosen`` along the row axis."""
        if not chosen:
            # np.concatenate would raise ValueError here too, but with a
            # message that does not say which split was empty.
            raise ValueError("no series selected for %s" % what)
        x = np.concatenate([pair[0] for pair in chosen], axis=0)
        y = np.concatenate([pair[1] for pair in chosen], axis=0)
        return (x, y)

    def get_train_validate_split(self, j):
        """Return the training and validation data for fold ``j``.

        Series ``j * fold_size`` up to, but not including,
        ``(j + 1) * fold_size`` form the validation fold; the remaining
        series among the first ``n_train`` form the training data. With the
        defaults, ``j`` ranges over 0..9.

        Args:
            j: Zero-based fold index.

        Returns:
            Tuple ``(x_train, y_train, x_validate, y_validate)``.

        Raises:
            ValueError: If ``j`` selects no validation series, or leaves no
                training series.
            IndexError: If fewer than ``n_train`` series were loaded.
        """
        lo = j * self.fold_size
        hi = (j + 1) * self.fold_size
        train = []
        validate = []
        for i in range(0, self.n_train):
            pair = self.series[i]
            if lo <= i and i < hi:
                validate.append(pair)
            else:
                train.append(pair)
        (x_train, y_train) = self._stack(train, "training with fold %r" % (j,))
        (x_validate, y_validate) = self._stack(
            validate,
            "validation fold %r (n_train=%r, fold_size=%r)"
            % (j, self.n_train, self.fold_size),
        )
        return (x_train, y_train, x_validate, y_validate)

    def get_test_data(self):
        """Return ``(x_test, y_test)`` built from every series after the first ``n_train``.

        Raises:
            ValueError: If no series remain after the training series.
        """
        return self._stack(self.series[self.n_train:], "testing")

    def get_train_data(self):
        """Return ``(x_train, y_train)`` built from all ``n_train`` training series.

        Raises:
            IndexError: If fewer than ``n_train`` series were loaded.
        """
        # Index explicitly, not by slice, so a short series list raises
        # instead of silently returning fewer training series.
        chosen = [self.series[i] for i in range(0, self.n_train)]
        return self._stack(chosen, "training")