Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
%% This file generates features for GPS - This file should work for both Android and iOS
% Used for feature extraction for the WH paper
%% Basic stuff
function [re rergw]=feature_extract(eps_p,minpts_p,time_di,os,nday)
epochtime = 1443672000000; % 2015/10/01 12:00 am, used to calculate hour of the day quickly
allneed = [];
if strcmp(os,'ios')
load('gps_feature_ios_up.mat'); % Supports both Android and iOS; for iOS load: gps_feature_ios.mat
interval = 1;
tthreshold=600; % set threshold to calculate time spent
wifi_tthreshold=600;
else
load('gps_feature_and_up.mat');
interval = 1;
tthreshold=600;
wifi_tthreshold=600;
end
% Initializations
c = 0;
time_d = time_di
for eps = eps_p%0.0001%:0.0001:0.001
for minpts = minpts_p %25 %2:1:30 % for iOS set to 60:60:420
c = c+1
%%
% Read data - for all users one by one
re=[];
for z = 1:size(gpsData,2) % Last column has users id
z
for iii=[0 ]
userData = gpsData{z};
userData(find(userData(:,11)==3),:)=[];
if (length(userData) == 0) % if no data found, ignore and move one
continue;
end
% in case the time is in microsecond
if userData(1,4)>1000000000000
userData(:,4) = userData(:,4)/1000;
end
if userData(1,1)>1000000000000
userData(:,1) = userData(:,1)/1000;
end
% senate check
periodts = userData(1,1);
useris = userData(1,6);
userData = userData(find(userData(:,4)>userData(1,1)-nday*3600*24*1000),:);
if (length(userData) == 0) % if no data found, ignore and move one
continue;
end
ind = find(userData(:,4)<=userData(1,1));
if length(ind)==0
continue
end
userData = userData(ind,:);
if (length(userData) == 0) % if no data found, ignore and move one
continue;
end
if length(userData(find(userData(:,11)==-1),1)) >= 100 % Atlease 100 samples for a window
dc = 1;
for di = 2:length(userData) % Day count in a specific PHQ9 period
if userData(di,9) ~= userData(di-1,9)
dc=dc+1;
end
end
gpsFeatures(z,16) = length(userData)/(dc*24*60/interval); % Time coverage in terms of percentage
% to get depress label
tdepress = userData(1,10);
depress = 1;
if (tdepress == 1 | tdepress == 2)%
depress = -1;
end
gpsFeatures(z,15) = depress; % clinical ground truth
gpsFeatures(z,14) = dc; % day count
gpsFeatures(z,13) = userData(1,6); % uid
ind = find(userData(:,7)~=-1);
gpsFeatures(z,12) = userData(ind(1),7); % phq9
gpsFeatures(z,1) = userData(1,1); % start time
userData = [userData(:,4) userData(:,2) userData(:,3) userData(:,5) userData(:,9) userData(:,11)]; % [timestamp, lat, long, moving/stat, weekday]
%% Time spent calculation (preprocessing step)
% Calculate how much time user spent on a perticular location
% Traverse through all location samples
timeSpent = []; % clear timeSpent
timeSpent(1,1) = tthreshold;
for i = 2:size(userData,1)
latDiff =abs(userData(i-1,2) - userData(i,2));
longDiff = abs(userData(i-1,3) - userData(i,3));
timeSpent(i,1) = userData(i,1) - userData(i-1,1) ;
if timeSpent(i,1) < 0
i
disp('something is fishy, see code line 59');
pause(5); % pause, in case timespent seems incorrect
end
if userData(i-1,6)==-2
if timeSpent(i,1) > wifi_tthreshold % missing data case
timeSpent(i,1) = wifi_tthreshold; % android: 10 minutes ios: 1 minute
end
else
if timeSpent(i,1) > tthreshold % missing data case
timeSpent(i,1) = tthreshold; % android: 10 minutes ios: 1 minute
end
end
end % end for - this will give us time spent b/w consecutive long/lat
userData(:, end+1) = timeSpent; % add as a column % [timestamp, lat, long, moving/stat, weekday mins_between_traces]
r1=sum(userData(:,end))/(dc*24*60*60);
r2=sum(userData(find(userData(:,6)==-1),end))/(dc*86400);
% re=[re;[r1 r2]];
gpsFeatures(z,21)=r1;
gpsFeatures(z,22)=r2;
if iii
userData = userData(find(userData(:,6) == -1),:); % keep only rows where timeSpent > 0
end
%% Feature 1 = Variance
% First feature - Location variance - that measures the variability in the
% subject's GPS location - For stationary user only
% Get the user data where state is stationary i.e 0
ind = find(userData(:,4) == 0 ); % 0 = Stationary points; 1 = Moving points
userDataStat = userData(ind,:); % [timestamp, lat, long, moving/stat, weekday mins_between_traces]
% Calculate statistical variance of longitude and latitude
varLat = var(userDataStat(:,2));
varLong = var(userDataStat(:,3));
locVar = log(varLat + varLong); % Nautral Log
gpsFeatures(z,2) = locVar;
%% Feature 2 = Entropy
% Entropy - to measure the variability of the time the subject spent at a
% location cluster
% Clustering used = dbScan
if time_d==0
[class,type]= dbscan([userDataStat(:,2:3)], minpts, eps); % Apply dbscan without time dimension
else
timeInfo = mod(userDataStat(:,1)-userDataStat(1,1),86400000) * eps / 3600000; % convert to hours then time eps
%[class,type]= dbscan([userDataStat(:,2:3)], minpts, eps); % Apply dbscan
[class,type]= dbscan([userDataStat(:,2:3) timeInfo], minpts, eps); % Apply dbscan with time dimension
end
userDataStat(:,end+1) = class'; % Cluster label % [timestamp, lat, long, moving/stat, weekday mins_between_traces cluster#]
eind1 = size(userDataStat,2); % This index have cluster number (-1 = outlier)
userDataStat(:,end+1) = type'; % Boundry or outlier etc % [timestamp, lat, long, moving/stat, weekday mins_between_traces cluster# type]
eind2 = size(userDataStat,2); % This index have cluster type info
% Remove noise/outlier points i.e., type = -1
indClus = find(userDataStat(:,eind1) ~= -1); % -1 Out - ASMA UNCOMMENTED
userDataStat = userDataStat(indClus,:);
ix = size(userDataStat,2);
% Calculate Entropy
ent = 0;
% gpsTime = datestr(userDataStat(:,1)/86400000 + datenum(1970,1,1)); % Get the time info
%t = datetime(gpsTime, 'Format', 'dd/M/yy HH:mm:ss');
%t = datevec(t); % 4th column have hour
t = [zeros(length(userDataStat(:,1)),3) mod((userDataStat(:,1)*1000-epochtime)/3600000,24)];
col = size(userDataStat,2);
userDataStat(:,end+1) = t(:,4);
eind3 = size(userDataStat,2); % This index have hour info
%% Feature 4 = Number of Unique Clusters
% Number of unique clusters
uniClus = unique(userDataStat(:,eind1)); % Cluster number % ASMA CHANGED
I = size(uniClus,1); % Number of unique clusters
gpsFeatures(z,10) = I; % outlier already removed from cluster
if iii==0
baseone = ones(length(userDataStat(:,1)),1);
allneed=[allneed;baseone*useris userDataStat(:,1) baseone*periodts userDataStat(:,eind1)];
end
if I > 1 % if atleast 2 clusters found
maxa = 0;
f = -2; % Variable that finally will have home cluster information
nightcluster{z}=[];
for i = 1:I % total time spend in the cluster
% if uniClus(i) > -1
tmesptind = find( userDataStat(:,eind1) == uniClus(i)); % find the cluster index
totTimeClus = sum(userDataStat(:, 6)); % total time spend in a PHQ9 period @ "all" meaningful clusters
tsumTime(1,i) = (sum(userDataStat(tmesptind, 6))); % total time spent in a perticular cluster i.e., ith cluster
sumTime(1,i) = (sum(userDataStat(tmesptind, 6)))/totTimeClus; % Percentage of time spent @ cluster i
ent = ent + ( sumTime(1,i) * (log(sumTime(1,i)))); % calculate entropy
ctimes = 1;
for j = 2:length(tmesptind)
if (userDataStat(tmesptind(j),1) - userDataStat(tmesptind(j-1),1))/1000/3600 >= 1 % times gap
ctimes = ctimes+1; % Number of time visited - for top cluster
end
end
nightcover=length(find(userDataStat(:,end) >= 0 & userDataStat(:,end) < 6))/(gpsFeatures(z,14)*36);
if uniClus(i) ~= -1
idTime = find(userDataStat(tmesptind,end) >= 0 & userDataStat(tmesptind,end) < 6); % Time between 0 and 6
idset = userDataStat(tmesptind(idTime), 5); % day of the week
if sum(userDataStat(tmesptind,6))/sum(userDataStat(:,6)) > 0.1 % 0.1 % 6th column have the time spent, Chaoqun, pls check
hometime = 1;
for l=2:length(idset)
if idset(l)~=idset(l-1)
hometime = hometime+1;
end
end
if gpsFeatures(z,16)>=0.5 & nightcover>0.6
nightcluster{z}=[nightcluster{z} length(idTime)];
end
if hometime > maxa
maxa = hometime;
f = uniClus(i); % By the end of iterations, f will have the cluster marked as home
end
end
else
gpsFeatures(z,16+iii*17) = tmesptind/length(userDataStat(:,1));
end
end
%% Feature 2 = Entropy
ent = -ent;
ent = 0;
es = sort(sumTime(1,:),1);
for nes =1:length(es(1,:))
ent = ent + ( es(1,nes) * (log(es(1,nes))));
end
ent = -ent;
gpsFeatures(z,5) = ent;
%% Feature 3 = Normalized Entropy
gpsFeatures(z,6) = gpsFeatures(z,5)/(log(I)); % I is the total number of clusters
if f ==-2
continue
end
%% Feature 5 = Home stay - cluster which have the time between 12am to 6am
if uniClus(1) == -1 % Never happens, just in case
gpsFeatures(z,7) = (sumTime(1,f+1));%/sum(sumTime); % WE DONT NEED TO SUM IT
else
gpsFeatures(z,7) = (sumTime(1,f));%/sum(sumTime);
end
else
gpsFeatures(z,5) = 0;
gpsFeatures(z,6) = 0;
gpsFeatures(z,7) = 1;
end % End if - we are done with clustering
if time_d==1 % thome should always be from without time dimension
userDataStat = userData(ind,:);
[class,type]= dbscan([userDataStat(:,2:3)], minpts, eps); % Apply dbscan for thome
userDataStat(:,eind1) = class'; % Cluster label % [timestamp, lat, long, moving/stat, weekday mins_between_traces cluster#]
userDataStat(:,eind2) = type'; % Boundry or outlier etc % [timestamp, lat, long, moving/stat, weekday mins_between_traces cluster# type]
% Remove noise/outlier points i.e., type = -1
indClus = find(userDataStat(:,eind1) ~= -1); % -1 Out - ASMA UNCOMMENTED
userDataStat = userDataStat(indClus,:);
gpsTime = datestr(userDataStat(:,1)/86400000 + datenum(1970,1,1)); % Get the time info
% Number of unique clusters
uniClus = unique(userDataStat(:,eind1)); % Cluster number % ASMA CHANGED
I = size(uniClus,1); % Number of unique clusters
if I > 1 % if atleast 2 clusters found
maxa = 0;
f = -2; % Variable that finally will have home cluster information
sumTime=[];
tsumTime=[];
for i = 1:I % total time spend in the cluster
tmesptind = find( userDataStat(:,eind1) == uniClus(i)); % find the cluster index
totTimeClus = sum(userDataStat(:, 6)); % total time spend in a PHQ9 period @ "all" meaningful clusters
tsumTime(1,i) = (sum(userDataStat(tmesptind, 6))); % total time spent in a perticular cluster i.e., ith cluster
sumTime(1,i) = (sum(userDataStat(tmesptind, 6)))/totTimeClus; % Percentage of time spent @ cluster i
if uniClus(i) ~= -1
idTime = find(userDataStat(tmesptind,end) >= 0 & userDataStat(tmesptind,end) < 6); % Time between 0 and 6
idset = userDataStat(tmesptind(idTime), 5); % day of the week
if sum(userDataStat(tmesptind,6))/sum(userDataStat(:,6)) > 0.1 % 0.1 % 6th column have the time spent, Chaoqun, pls check
hometime = 1;
for l=2:length(idset)
if idset(l)~=idset(l-1)
hometime = hometime+1;
end
end
if hometime > maxa
maxa = hometime;
f = uniClus(i); % By the end of iterations, f will have the cluster marked as home
end
end
end
end
if uniClus(1) == -1 % Never happens, just in case
gpsFeatures(z,7) = (sumTime(1,f+1));%/sum(sumTime); % WE DONT NEED TO SUM IT
else
gpsFeatures(z,7) = (sumTime(1,f));%/sum(sumTime);
end
else
gpsFeatures(z,7) = 1;
end
end % End if - we are done with clustering
%% Feature 6 = Transition time
% Transition time = number of samples in transition/ total number of
% samples
movingSamples = sum(userData(find(userData(:,4) == 1),6));
AllSamples = sum(userData(:,6));
gpsFeatures(z,8) = movingSamples/ AllSamples;
%% Feature 7 = Total Distance
% Total distance traveled in km
sumdistkm = 0;
for i = 1:(size(userData,1)-1)
latLong1 = [userData(i,2) userData(i,3)];
latLong2 = [userData(i+1,2) userData(i+1,3)];
[d1km d2km]=lldistkm(latLong1,latLong2);
sumdistkm = sumdistkm + d1km;
end
gpsFeatures(z,9) = sumdistkm; %unit: km
%% Feaure 8 = Average Moving Speed
gpsFeatures(z,3) = sumdistkm/((userData(end,1)-userData(1,1))/1000); %average moving speed unit:km/s
clearvars -except eps_p allneed epochtime nday gpsonly minpts_p wifi_tthreshold nightcluster os interval time_d starttime eps minpts c tthreshold z gpsData gpsFeatures re %tsumTime centers sumTime userData timeSpent latDiff longDiff ind varLat varLong locVar LongLatStat idx C clususerDataStat clus1Ind clus2Ind clus3Ind timeClus1 timeClus2 timeClus3 i gpsTime homeTime t sumdistkm movingSamples col ent latLong1 latLong2 d1km d2km AllSamples
end
end
end
id = cellfun('length',nightcluster);
nightcluster(id==0)=[];
nre=[];
for z=1:length(nightcluster)
nre=[nre max(nightcluster{z})/sum(nightcluster{z})];
end
save([os 'Processesed' num2str(time_d) 'ori' num2str(eps_p) '_' num2str(minpts_p) '.mat'],'gpsFeatures');
gpsFeatures = gpsFeatures(~any(isnan(gpsFeatures),2),:);
gpsFeatures( ~any(gpsFeatures,2), : ) = [];
gpsFeatures( find(gpsFeatures(:,5)==0), : ) = [];
% remove intervals with total distance larger than 3000km
gpsFeatures( find(gpsFeatures(:,9)>3000), : ) = [];
gpsFeatures( find(gpsFeatures(:,9)<1), : ) = [];
% remove intervals with low sample coverage
%% Summing up and writing features for further analysis
dSet = gpsFeatures(:,[2,3,5,6,7,8,9,10,12,15,13,1]);
dSet = dSet(~any(isnan(dSet),2),:);
dSet( ~any(dSet,2), : ) = []; % Remove any row that have a Null value
save([os 'Processesed' num2str(time_d) '.mat'],'gpsFeatures');
save(['dSet_' os num2str(time_d) '.mat'],'dSet');
save(['svm/dSet_' os num2str(time_d) '.mat'],'dSet');
save(['linear/dSet_' os num2str(time_d) '.mat'],'dSet');
save(['nonlinear/dSet_' os num2str(time_d) '.mat'],'dSet');
end
end