# phm08.py

'''
PHM08.py

Utilities for loading PHM08 for use with classifiers.
'''
import numpy as np
import math
from random import shuffle

# load_raw_data()
#
# Load the raw dataset from file
# Returns (training_data_raw, test_data_raw)
def load_raw_data(train_path="PHM08/train.txt", test_path="PHM08/test.txt"):
    """Load the raw PHM08 train and test tables from whitespace-separated text.

    Args:
        train_path: Location of the training file. Defaults to the path the
            module has always used.
        test_path: Location of the test file. Defaults to the path the
            module has always used.

    Returns:
        A ``(train_data_raw, test_data_raw)`` tuple of 2-D float arrays.
    """
    # ndmin=2 keeps a one-line file as a (1, n) table instead of letting
    # np.loadtxt collapse it to 1-D, which process_raw_data cannot unpack.
    train_data_raw = np.loadtxt(train_path, ndmin=2)
    test_data_raw = np.loadtxt(test_path, ndmin=2)
    return (train_data_raw, test_data_raw)

# process_raw_data(x)
#
# In order to use PHM08 for classification,
# we do two things:
#
# 1) Strip unit number feature.
#    This was for training as time series,
#    However, we want each data point to be "independent."
#
# 2) Generate Labels
#    It is assumed that the last datapoint for a given unit series
#    is that unit's last run, i.e. that it has reached end-of-life.
def process_raw_data(data):
    """Split a raw table into features and end-of-series labels.

    Column 0 is read as the unit number. A row is labelled 1 when it is the
    last row of its unit's run (the next row has a different unit number, or
    there is no next row) and 0 otherwise.

    Args:
        data: 2-D array-like whose first column is the unit number. A 1-D
            input is treated as a single row, since ``np.loadtxt`` returns
            that shape for a one-line file.

    Returns:
        ``(x, y)`` where ``x`` is ``data`` without its first column and ``y``
        is a list of 0/1 ints, one per row.
    """
    data = np.asarray(data)
    if data.ndim == 1:
        # Single row (or nothing at all) rather than a table.
        data = data.reshape(1, -1) if data.size else data.reshape(0, 0)

    (m, _) = data.shape
    if m == 0:
        return (data[:, 1:], [])

    units = data[:, 0]
    # The final row always closes a series; every other row closes one
    # exactly when the following row belongs to a different unit.
    is_last = np.ones(m, dtype=bool)
    is_last[:-1] = units[:-1] != units[1:]
    y = is_last.astype(int).tolist()

    return (data[:, 1:], y)

# dataset()
#
# Return processed PHM08 data.
def dataset():
    """Load both raw PHM08 tables and return them as features and labels.

    Returns:
        ``(train_x, train_y, test_x, test_y)``, where each x/y pair is what
        ``process_raw_data`` produces for the corresponding raw table.
    """
    raw_tables = load_raw_data()
    # Train is processed first, then test, matching the order they are loaded.
    processed = [process_raw_data(raw) for raw in raw_tables]
    (train_x, train_y), (test_x, test_y) = processed
    return (train_x, train_y, test_x, test_y)


class DataSplits:
    """Per-unit series from the PHM08 training file, shuffled and split.

    The file is cut into one series per run of consecutive rows that share
    the same unit number (column 0). Each series is stored in ``self.series``
    as ``(x, y)``: ``x`` holds the rows without the unit column and ``y`` is
    a list of 0/1 labels in which only the final row of the series is 1.

    After shuffling, series 0..199 are used for training and ten-fold cross
    validation (folds of 20 series each) and series 200..217 are held out
    for testing.
    """

    # Index layout of the shuffled series list.
    N_TRAIN_SERIES = 200
    N_TOTAL_SERIES = 218
    FOLD_SIZE = 20

    def __init__(self, path="PHM08/train.txt", seed=None):
        """Load ``path``, cut it into per-unit series and shuffle them.

        Args:
            path: Whitespace-separated text file whose first column is the
                unit number. Defaults to the path the class has always used.
            seed: Optional seed for a private random generator, making the
                shuffle reproducible. With the default ``None`` the
                module-level ``shuffle`` is used, as before.
        """
        # ndmin=2 keeps a one-line file two-dimensional.
        data = np.loadtxt(path, ndmin=2)
        units = data[:, 0]

        # Row indices at which the unit number changes, i.e. where a new
        # series begins (row 0 is added explicitly below).
        starts = np.flatnonzero(units[1:] != units[:-1]) + 1
        bounds = [0] + starts.tolist() + [len(units)]

        self.series = []
        for begin, end in zip(bounds[:-1], bounds[1:]):
            if end > begin:
                # Only the last row of a series is labelled end-of-life.
                labels = [0] * (end - begin - 1) + [1]
                self.series.append((data[begin:end, 1:], labels))

        if seed is None:
            shuffle(self.series)
        else:
            from random import Random
            Random(seed).shuffle(self.series)

    def _stack(self, indices):
        """Concatenate rows and labels of the series at ``indices``."""
        x_parts = [self.series[i][0] for i in indices]
        y_parts = [self.series[i][1] for i in indices]
        return (np.concatenate(x_parts, axis=0),
                np.concatenate(y_parts, axis=0))

    def get_train_validate_split(self, j):
        """Return fold ``j`` of the ten-fold cross-validation split.

        Series ``j * 20`` up to (not including) ``(j + 1) * 20`` form the
        validation set; the remaining series below index 200 form the
        training set.

        Args:
            j: Fold index in ``[0, 10)``.

        Returns:
            ``(x_train, y_train, x_validate, y_validate)``, each the
            row-wise concatenation of the selected series.

        Raises:
            ValueError: If ``j`` selects no validation series.
        """
        lo = j * self.FOLD_SIZE
        hi = (j + 1) * self.FOLD_SIZE
        candidates = range(self.N_TRAIN_SERIES)
        validate_idx = [i for i in candidates if lo <= i and i < hi]
        if not validate_idx:
            # np.concatenate would raise ValueError on the empty list anyway;
            # fail with a message that names the actual problem.
            raise ValueError(
                "fold index j must be in [0, %d), got %r"
                % (self.N_TRAIN_SERIES // self.FOLD_SIZE, j)
            )
        train_idx = [i for i in candidates if lo > i or i >= hi]

        x_train, y_train = self._stack(train_idx)
        x_validate, y_validate = self._stack(validate_idx)
        return (x_train, y_train, x_validate, y_validate)

    def get_test_data(self):
        """Return ``(x_test, y_test)`` built from held-out series 200..217."""
        return self._stack(range(self.N_TRAIN_SERIES, self.N_TOTAL_SERIES))

    def get_train_data(self):
        """Return ``(x_train, y_train)`` built from all 200 training series."""
        return self._stack(range(self.N_TRAIN_SERIES))
	'''
	PHM08.py

	Utilities for loading PHM08 for use with classifiers.
	'''
	import numpy as np
	import math
	from random import shuffle

	# load_raw_data()
	#
	# Load the raw dataset from file
	# Returns (training_data_raw, test_data_raw)
	def load_raw_data(train_path="PHM08/train.txt", test_path="PHM08/test.txt"):
	    """Load the raw PHM08 train and test tables from whitespace-separated text.

	    Args:
	        train_path: Location of the training file. Defaults to the path the
	            module has always used.
	        test_path: Location of the test file. Defaults to the path the
	            module has always used.

	    Returns:
	        A ``(train_data_raw, test_data_raw)`` tuple of 2-D float arrays.
	    """
	    # ndmin=2 keeps a one-line file as a (1, n) table instead of letting
	    # np.loadtxt collapse it to 1-D, which process_raw_data cannot unpack.
	    train_data_raw = np.loadtxt(train_path, ndmin=2)
	    test_data_raw = np.loadtxt(test_path, ndmin=2)
	    return (train_data_raw, test_data_raw)

	# process_raw_data(x)
	#
	# In order to use PHM08 for classification,
	# we do two things:
	#
	# 1) Strip unit number feature.
	# This was for training as time series,
	# However, we want each data point to be "independent."
	#
	# 2) Generate Labels
	# It is assumed that the last datapoint for a given unit series
	# is that unit's last run, i.e. that it has reached end-of-life.
	def process_raw_data(data):
	    """Split a raw table into features and end-of-series labels.

	    Column 0 is read as the unit number. A row is labelled 1 when it is the
	    last row of its unit's run (the next row has a different unit number, or
	    there is no next row) and 0 otherwise.

	    Args:
	        data: 2-D array-like whose first column is the unit number. A 1-D
	            input is treated as a single row, since ``np.loadtxt`` returns
	            that shape for a one-line file.

	    Returns:
	        ``(x, y)`` where ``x`` is ``data`` without its first column and ``y``
	        is a list of 0/1 ints, one per row.
	    """
	    data = np.asarray(data)
	    if data.ndim == 1:
	        # Single row (or nothing at all) rather than a table.
	        data = data.reshape(1, -1) if data.size else data.reshape(0, 0)

	    (m, _) = data.shape
	    if m == 0:
	        return (data[:, 1:], [])

	    units = data[:, 0]
	    # The final row always closes a series; every other row closes one
	    # exactly when the following row belongs to a different unit.
	    is_last = np.ones(m, dtype=bool)
	    is_last[:-1] = units[:-1] != units[1:]
	    y = is_last.astype(int).tolist()

	    return (data[:, 1:], y)

	# dataset()
	#
	# Return processed PHM08 data.
	def dataset():
	    """Load both raw PHM08 tables and return them as features and labels.

	    Returns:
	        ``(train_x, train_y, test_x, test_y)``, where each x/y pair is what
	        ``process_raw_data`` produces for the corresponding raw table.
	    """
	    (train_raw, test_raw) = load_raw_data()
	    (train_x, train_y) = process_raw_data(train_raw)
	    (test_x, test_y) = process_raw_data(test_raw)
	    return (train_x, train_y, test_x, test_y)


	class DataSplits:
	    """Per-unit series from the PHM08 training file, shuffled and split.

	    The file is cut into one series per run of consecutive rows that share
	    the same unit number (column 0). Each series is stored in ``self.series``
	    as ``(x, y)``: ``x`` holds the rows without the unit column and ``y`` is
	    a list of 0/1 labels in which only the final row of the series is 1.

	    After shuffling, series 0..199 are used for training and ten-fold cross
	    validation (folds of 20 series each) and series 200..217 are held out
	    for testing.
	    """

	    # Index layout of the shuffled series list.
	    N_TRAIN_SERIES = 200
	    N_TOTAL_SERIES = 218
	    FOLD_SIZE = 20

	    def __init__(self, path="PHM08/train.txt", seed=None):
	        """Load ``path``, cut it into per-unit series and shuffle them.

	        Args:
	            path: Whitespace-separated text file whose first column is the
	                unit number. Defaults to the path the class has always used.
	            seed: Optional seed for a private random generator, making the
	                shuffle reproducible. With the default ``None`` the
	                module-level ``shuffle`` is used, as before.
	        """
	        # ndmin=2 keeps a one-line file two-dimensional.
	        data = np.loadtxt(path, ndmin=2)
	        units = data[:, 0]

	        # Row indices at which the unit number changes, i.e. where a new
	        # series begins (row 0 is added explicitly below).
	        starts = np.flatnonzero(units[1:] != units[:-1]) + 1
	        bounds = [0] + starts.tolist() + [len(units)]

	        self.series = []
	        for begin, end in zip(bounds[:-1], bounds[1:]):
	            if end > begin:
	                # Only the last row of a series is labelled end-of-life.
	                labels = [0] * (end - begin - 1) + [1]
	                self.series.append((data[begin:end, 1:], labels))

	        if seed is None:
	            shuffle(self.series)
	        else:
	            from random import Random
	            Random(seed).shuffle(self.series)

	    def _stack(self, indices):
	        """Concatenate rows and labels of the series at ``indices``."""
	        x_parts = [self.series[i][0] for i in indices]
	        y_parts = [self.series[i][1] for i in indices]
	        return (np.concatenate(x_parts, axis=0),
	                np.concatenate(y_parts, axis=0))

	    def get_train_validate_split(self, j):
	        """Return fold ``j`` of the ten-fold cross-validation split.

	        Series ``j * 20`` up to (not including) ``(j + 1) * 20`` form the
	        validation set; the remaining series below index 200 form the
	        training set.

	        Args:
	            j: Fold index in ``[0, 10)``.

	        Returns:
	            ``(x_train, y_train, x_validate, y_validate)``, each the
	            row-wise concatenation of the selected series.

	        Raises:
	            ValueError: If ``j`` selects no validation series.
	        """
	        lo = j * self.FOLD_SIZE
	        hi = (j + 1) * self.FOLD_SIZE
	        candidates = range(self.N_TRAIN_SERIES)
	        validate_idx = [i for i in candidates if lo <= i and i < hi]
	        if not validate_idx:
	            # np.concatenate would raise ValueError on the empty list anyway;
	            # fail with a message that names the actual problem.
	            raise ValueError(
	                "fold index j must be in [0, %d), got %r"
	                % (self.N_TRAIN_SERIES // self.FOLD_SIZE, j)
	            )
	        train_idx = [i for i in candidates if lo > i or i >= hi]

	        x_train, y_train = self._stack(train_idx)
	        x_validate, y_validate = self._stack(validate_idx)
	        return (x_train, y_train, x_validate, y_validate)

	    def get_test_data(self):
	        """Return ``(x_test, y_test)`` built from held-out series 200..217."""
	        return self._stack(range(self.N_TRAIN_SERIES, self.N_TOTAL_SERIES))

	    def get_train_data(self):
	        """Return ``(x_train, y_train)`` built from all 200 training series."""
	        return self._stack(range(self.N_TRAIN_SERIES))