Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
data_fusion_p1/feature_extract.m
Go to fileThis commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
345 lines (318 sloc)
19.5 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%% This file generates features for GPS - This file should work for both Android and iOS | |
% Used for feature extraction for the WH paper | |
%% Basic stuff | |
function [re rergw]=feature_extract(eps_p,minpts_p,time_di,os,nday) | |
epochtime = 1443672000000; % 2015/10/01 12:00 am, used to calculate hour of the day quickly | |
allneed = []; | |
if strcmp(os,'ios') | |
load('gps_feature_ios_up.mat'); % Supports both Android and iOS; for iOS load: gps_feature_ios.mat | |
interval = 1; | |
tthreshold=600; % set threshold to calculate time spent | |
wifi_tthreshold=600; | |
else | |
load('gps_feature_and_up.mat'); | |
interval = 1; | |
tthreshold=600; | |
wifi_tthreshold=600; | |
end | |
% Initializations | |
c = 0; | |
time_d = time_di | |
for eps = eps_p%0.0001%:0.0001:0.001 | |
for minpts = minpts_p %25 %2:1:30 % for iOS set to 60:60:420 | |
c = c+1 | |
%% | |
% Read data - for all users one by one | |
re=[]; | |
for z = 1:size(gpsData,2) % Last column has users id | |
z | |
for iii=[0 ] | |
userData = gpsData{z}; | |
userData(find(userData(:,11)==3),:)=[]; | |
if (length(userData) == 0) % if no data found, ignore and move one | |
continue; | |
end | |
% in case the time is in microsecond | |
if userData(1,4)>1000000000000 | |
userData(:,4) = userData(:,4)/1000; | |
end | |
if userData(1,1)>1000000000000 | |
userData(:,1) = userData(:,1)/1000; | |
end | |
% senate check | |
periodts = userData(1,1); | |
useris = userData(1,6); | |
userData = userData(find(userData(:,4)>userData(1,1)-nday*3600*24*1000),:); | |
if (length(userData) == 0) % if no data found, ignore and move one | |
continue; | |
end | |
ind = find(userData(:,4)<=userData(1,1)); | |
if length(ind)==0 | |
continue | |
end | |
userData = userData(ind,:); | |
if (length(userData) == 0) % if no data found, ignore and move one | |
continue; | |
end | |
if length(userData(find(userData(:,11)==-1),1)) >= 100 % Atlease 100 samples for a window | |
dc = 1; | |
for di = 2:length(userData) % Day count in a specific PHQ9 period | |
if userData(di,9) ~= userData(di-1,9) | |
dc=dc+1; | |
end | |
end | |
gpsFeatures(z,16) = length(userData)/(dc*24*60/interval); % Time coverage in terms of percentage | |
% to get depress label | |
tdepress = userData(1,10); | |
depress = 1; | |
if (tdepress == 1 | tdepress == 2)% | |
depress = -1; | |
end | |
gpsFeatures(z,15) = depress; % clinical ground truth | |
gpsFeatures(z,14) = dc; % day count | |
gpsFeatures(z,13) = userData(1,6); % uid | |
ind = find(userData(:,7)~=-1); | |
gpsFeatures(z,12) = userData(ind(1),7); % phq9 | |
gpsFeatures(z,1) = userData(1,1); % start time | |
userData = [userData(:,4) userData(:,2) userData(:,3) userData(:,5) userData(:,9) userData(:,11)]; % [timestamp, lat, long, moving/stat, weekday] | |
%% Time spent calculation (preprocessing step) | |
% Calculate how much time user spent on a perticular location | |
% Traverse through all location samples | |
timeSpent = []; % clear timeSpent | |
timeSpent(1,1) = tthreshold; | |
for i = 2:size(userData,1) | |
latDiff =abs(userData(i-1,2) - userData(i,2)); | |
longDiff = abs(userData(i-1,3) - userData(i,3)); | |
timeSpent(i,1) = userData(i,1) - userData(i-1,1) ; | |
if timeSpent(i,1) < 0 | |
i | |
disp('something is fishy, see code line 59'); | |
pause(5); % pause, in case timespent seems incorrect | |
end | |
if userData(i-1,6)==-2 | |
if timeSpent(i,1) > wifi_tthreshold % missing data case | |
timeSpent(i,1) = wifi_tthreshold; % android: 10 minutes ios: 1 minute | |
end | |
else | |
if timeSpent(i,1) > tthreshold % missing data case | |
timeSpent(i,1) = tthreshold; % android: 10 minutes ios: 1 minute | |
end | |
end | |
end % end for - this will give us time spent b/w consecutive long/lat | |
userData(:, end+1) = timeSpent; % add as a column % [timestamp, lat, long, moving/stat, weekday mins_between_traces] | |
r1=sum(userData(:,end))/(dc*24*60*60); | |
r2=sum(userData(find(userData(:,6)==-1),end))/(dc*86400); | |
% re=[re;[r1 r2]]; | |
gpsFeatures(z,21)=r1; | |
gpsFeatures(z,22)=r2; | |
if iii | |
userData = userData(find(userData(:,6) == -1),:); % keep only rows where timeSpent > 0 | |
end | |
%% Feature 1 = Variance | |
% First feature - Location variance - that measures the variability in the | |
% subject's GPS location - For stationary user only | |
% Get the user data where state is stationary i.e 0 | |
ind = find(userData(:,4) == 0 ); % 0 = Stationary points; 1 = Moving points | |
userDataStat = userData(ind,:); % [timestamp, lat, long, moving/stat, weekday mins_between_traces] | |
% Calculate statistical variance of longitude and latitude | |
varLat = var(userDataStat(:,2)); | |
varLong = var(userDataStat(:,3)); | |
locVar = log(varLat + varLong); % Nautral Log | |
gpsFeatures(z,2) = locVar; | |
%% Feature 2 = Entropy | |
% Entropy - to measure the variability of the time the subject spent at a | |
% location cluster | |
% Clustering used = dbScan | |
if time_d==0 | |
[class,type]= dbscan([userDataStat(:,2:3)], minpts, eps); % Apply dbscan without time dimension | |
else | |
timeInfo = mod(userDataStat(:,1)-userDataStat(1,1),86400000) * eps / 3600000; % convert to hours then time eps | |
%[class,type]= dbscan([userDataStat(:,2:3)], minpts, eps); % Apply dbscan | |
[class,type]= dbscan([userDataStat(:,2:3) timeInfo], minpts, eps); % Apply dbscan with time dimension | |
end | |
userDataStat(:,end+1) = class'; % Cluster label % [timestamp, lat, long, moving/stat, weekday mins_between_traces cluster#] | |
eind1 = size(userDataStat,2); % This index have cluster number (-1 = outlier) | |
userDataStat(:,end+1) = type'; % Boundry or outlier etc % [timestamp, lat, long, moving/stat, weekday mins_between_traces cluster# type] | |
eind2 = size(userDataStat,2); % This index have cluster type info | |
% Remove noise/outlier points i.e., type = -1 | |
indClus = find(userDataStat(:,eind1) ~= -1); % -1 Out - ASMA UNCOMMENTED | |
userDataStat = userDataStat(indClus,:); | |
ix = size(userDataStat,2); | |
% Calculate Entropy | |
ent = 0; | |
% gpsTime = datestr(userDataStat(:,1)/86400000 + datenum(1970,1,1)); % Get the time info | |
%t = datetime(gpsTime, 'Format', 'dd/M/yy HH:mm:ss'); | |
%t = datevec(t); % 4th column have hour | |
t = [zeros(length(userDataStat(:,1)),3) mod((userDataStat(:,1)*1000-epochtime)/3600000,24)]; | |
col = size(userDataStat,2); | |
userDataStat(:,end+1) = t(:,4); | |
eind3 = size(userDataStat,2); % This index have hour info | |
%% Feature 4 = Number of Unique Clusters | |
% Number of unique clusters | |
uniClus = unique(userDataStat(:,eind1)); % Cluster number % ASMA CHANGED | |
I = size(uniClus,1); % Number of unique clusters | |
gpsFeatures(z,10) = I; % outlier already removed from cluster | |
if iii==0 | |
baseone = ones(length(userDataStat(:,1)),1); | |
allneed=[allneed;baseone*useris userDataStat(:,1) baseone*periodts userDataStat(:,eind1)]; | |
end | |
if I > 1 % if atleast 2 clusters found | |
maxa = 0; | |
f = -2; % Variable that finally will have home cluster information | |
nightcluster{z}=[]; | |
for i = 1:I % total time spend in the cluster | |
% if uniClus(i) > -1 | |
tmesptind = find( userDataStat(:,eind1) == uniClus(i)); % find the cluster index | |
totTimeClus = sum(userDataStat(:, 6)); % total time spend in a PHQ9 period @ "all" meaningful clusters | |
tsumTime(1,i) = (sum(userDataStat(tmesptind, 6))); % total time spent in a perticular cluster i.e., ith cluster | |
sumTime(1,i) = (sum(userDataStat(tmesptind, 6)))/totTimeClus; % Percentage of time spent @ cluster i | |
ent = ent + ( sumTime(1,i) * (log(sumTime(1,i)))); % calculate entropy | |
ctimes = 1; | |
for j = 2:length(tmesptind) | |
if (userDataStat(tmesptind(j),1) - userDataStat(tmesptind(j-1),1))/1000/3600 >= 1 % times gap | |
ctimes = ctimes+1; % Number of time visited - for top cluster | |
end | |
end | |
nightcover=length(find(userDataStat(:,end) >= 0 & userDataStat(:,end) < 6))/(gpsFeatures(z,14)*36); | |
if uniClus(i) ~= -1 | |
idTime = find(userDataStat(tmesptind,end) >= 0 & userDataStat(tmesptind,end) < 6); % Time between 0 and 6 | |
idset = userDataStat(tmesptind(idTime), 5); % day of the week | |
if sum(userDataStat(tmesptind,6))/sum(userDataStat(:,6)) > 0.1 % 0.1 % 6th column have the time spent, Chaoqun, pls check | |
hometime = 1; | |
for l=2:length(idset) | |
if idset(l)~=idset(l-1) | |
hometime = hometime+1; | |
end | |
end | |
if gpsFeatures(z,16)>=0.5 & nightcover>0.6 | |
nightcluster{z}=[nightcluster{z} length(idTime)]; | |
end | |
if hometime > maxa | |
maxa = hometime; | |
f = uniClus(i); % By the end of iterations, f will have the cluster marked as home | |
end | |
end | |
else | |
gpsFeatures(z,16+iii*17) = tmesptind/length(userDataStat(:,1)); | |
end | |
end | |
%% Feature 2 = Entropy | |
ent = -ent; | |
ent = 0; | |
es = sort(sumTime(1,:),1); | |
for nes =1:length(es(1,:)) | |
ent = ent + ( es(1,nes) * (log(es(1,nes)))); | |
end | |
ent = -ent; | |
gpsFeatures(z,5) = ent; | |
%% Feature 3 = Normalized Entropy | |
gpsFeatures(z,6) = gpsFeatures(z,5)/(log(I)); % I is the total number of clusters | |
if f ==-2 | |
continue | |
end | |
%% Feature 5 = Home stay - cluster which have the time between 12am to 6am | |
if uniClus(1) == -1 % Never happens, just in case | |
gpsFeatures(z,7) = (sumTime(1,f+1));%/sum(sumTime); % WE DONT NEED TO SUM IT | |
else | |
gpsFeatures(z,7) = (sumTime(1,f));%/sum(sumTime); | |
end | |
else | |
gpsFeatures(z,5) = 0; | |
gpsFeatures(z,6) = 0; | |
gpsFeatures(z,7) = 1; | |
end % End if - we are done with clustering | |
if time_d==1 % thome should always be from without time dimension | |
userDataStat = userData(ind,:); | |
[class,type]= dbscan([userDataStat(:,2:3)], minpts, eps); % Apply dbscan for thome | |
userDataStat(:,eind1) = class'; % Cluster label % [timestamp, lat, long, moving/stat, weekday mins_between_traces cluster#] | |
userDataStat(:,eind2) = type'; % Boundry or outlier etc % [timestamp, lat, long, moving/stat, weekday mins_between_traces cluster# type] | |
% Remove noise/outlier points i.e., type = -1 | |
indClus = find(userDataStat(:,eind1) ~= -1); % -1 Out - ASMA UNCOMMENTED | |
userDataStat = userDataStat(indClus,:); | |
gpsTime = datestr(userDataStat(:,1)/86400000 + datenum(1970,1,1)); % Get the time info | |
% Number of unique clusters | |
uniClus = unique(userDataStat(:,eind1)); % Cluster number % ASMA CHANGED | |
I = size(uniClus,1); % Number of unique clusters | |
if I > 1 % if atleast 2 clusters found | |
maxa = 0; | |
f = -2; % Variable that finally will have home cluster information | |
sumTime=[]; | |
tsumTime=[]; | |
for i = 1:I % total time spend in the cluster | |
tmesptind = find( userDataStat(:,eind1) == uniClus(i)); % find the cluster index | |
totTimeClus = sum(userDataStat(:, 6)); % total time spend in a PHQ9 period @ "all" meaningful clusters | |
tsumTime(1,i) = (sum(userDataStat(tmesptind, 6))); % total time spent in a perticular cluster i.e., ith cluster | |
sumTime(1,i) = (sum(userDataStat(tmesptind, 6)))/totTimeClus; % Percentage of time spent @ cluster i | |
if uniClus(i) ~= -1 | |
idTime = find(userDataStat(tmesptind,end) >= 0 & userDataStat(tmesptind,end) < 6); % Time between 0 and 6 | |
idset = userDataStat(tmesptind(idTime), 5); % day of the week | |
if sum(userDataStat(tmesptind,6))/sum(userDataStat(:,6)) > 0.1 % 0.1 % 6th column have the time spent, Chaoqun, pls check | |
hometime = 1; | |
for l=2:length(idset) | |
if idset(l)~=idset(l-1) | |
hometime = hometime+1; | |
end | |
end | |
if hometime > maxa | |
maxa = hometime; | |
f = uniClus(i); % By the end of iterations, f will have the cluster marked as home | |
end | |
end | |
end | |
end | |
if uniClus(1) == -1 % Never happens, just in case | |
gpsFeatures(z,7) = (sumTime(1,f+1));%/sum(sumTime); % WE DONT NEED TO SUM IT | |
else | |
gpsFeatures(z,7) = (sumTime(1,f));%/sum(sumTime); | |
end | |
else | |
gpsFeatures(z,7) = 1; | |
end | |
end % End if - we are done with clustering | |
%% Feature 6 = Transition time | |
% Transition time = number of samples in transition/ total number of | |
% samples | |
movingSamples = sum(userData(find(userData(:,4) == 1),6)); | |
AllSamples = sum(userData(:,6)); | |
gpsFeatures(z,8) = movingSamples/ AllSamples; | |
%% Feature 7 = Total Distance | |
% Total distance traveled in km | |
sumdistkm = 0; | |
for i = 1:(size(userData,1)-1) | |
latLong1 = [userData(i,2) userData(i,3)]; | |
latLong2 = [userData(i+1,2) userData(i+1,3)]; | |
[d1km d2km]=lldistkm(latLong1,latLong2); | |
sumdistkm = sumdistkm + d1km; | |
end | |
gpsFeatures(z,9) = sumdistkm; %unit: km | |
%% Feaure 8 = Average Moving Speed | |
gpsFeatures(z,3) = sumdistkm/((userData(end,1)-userData(1,1))/1000); %average moving speed unit:km/s | |
clearvars -except eps_p allneed epochtime nday gpsonly minpts_p wifi_tthreshold nightcluster os interval time_d starttime eps minpts c tthreshold z gpsData gpsFeatures re %tsumTime centers sumTime userData timeSpent latDiff longDiff ind varLat varLong locVar LongLatStat idx C clususerDataStat clus1Ind clus2Ind clus3Ind timeClus1 timeClus2 timeClus3 i gpsTime homeTime t sumdistkm movingSamples col ent latLong1 latLong2 d1km d2km AllSamples | |
end | |
end | |
end | |
id = cellfun('length',nightcluster); | |
nightcluster(id==0)=[]; | |
nre=[]; | |
for z=1:length(nightcluster) | |
nre=[nre max(nightcluster{z})/sum(nightcluster{z})]; | |
end | |
save([os 'Processesed' num2str(time_d) 'ori' num2str(eps_p) '_' num2str(minpts_p) '.mat'],'gpsFeatures'); | |
gpsFeatures = gpsFeatures(~any(isnan(gpsFeatures),2),:); | |
gpsFeatures( ~any(gpsFeatures,2), : ) = []; | |
gpsFeatures( find(gpsFeatures(:,5)==0), : ) = []; | |
% remove intervals with total distance larger than 3000km | |
gpsFeatures( find(gpsFeatures(:,9)>3000), : ) = []; | |
gpsFeatures( find(gpsFeatures(:,9)<1), : ) = []; | |
% remove intervals with low sample coverage | |
%% Summing up and writing features for further analysis | |
dSet = gpsFeatures(:,[2,3,5,6,7,8,9,10,12,15,13,1]); | |
dSet = dSet(~any(isnan(dSet),2),:); | |
dSet( ~any(dSet,2), : ) = []; % Remove any row that have a Null value | |
save([os 'Processesed' num2str(time_d) '.mat'],'gpsFeatures'); | |
save(['dSet_' os num2str(time_d) '.mat'],'dSet'); | |
save(['svm/dSet_' os num2str(time_d) '.mat'],'dSet'); | |
save(['linear/dSet_' os num2str(time_d) '.mat'],'dSet'); | |
save(['nonlinear/dSet_' os num2str(time_d) '.mat'],'dSet'); | |
end | |
end | |