Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
# Copy of notebook 008, in a simple script.
import time
from math import floor, ceil

import numpy as np
import pandas as pd
from matplotlib import path
# Just some code from our previous files...

# List of filenames of the original data
original_od_fns = [f"./DataForUConn/201407OD/201407.gz/part-m-{x:05d}" for x in range(12)]
original_gps_fns = [f"./DataForUConn/201407GPS/part-r-{x:05d}" for x in range(2)]


# Time string to unix time converter
def formattime(timestr):
    """Convert a ``YYYY-MM-DDTHH:MM:SS.000Z`` timestamp string to an integer Unix time.

    Raises ``ValueError`` (from ``time.strptime``) when ``timestr`` does not
    match that exact format.

    NOTE(review): ``time.mktime`` interprets the parsed fields as *local* time,
    even though the input carries a ``Z`` suffix. The result therefore depends
    on the machine's timezone; left as-is so values stay consistent with data
    that was already processed -- confirm whether UTC was intended.
    """
    return int(time.mktime(time.strptime(timestr, "%Y-%m-%dT%H:%M:%S.000Z")))


# Headers
od_headers = ["id", "ptime", "dtime", "plon", "plat", "dlon", "dlat"]
gps_headers = ["id", "color", "lon", "lat", "time", "speed", "noMeaning"]

# Filenames of our pickles
gps_fns = [f"./data/gps_part-r-{x:03d}.pkl" for x in range(2)]
od_fns = [f"./data/od_part-m-{x:03d}.pkl" for x in range(12)]

# Region IDs corresponding to each row of a given dataframe.
# These were processed in scr_005 and scr_006
# e.g. od_pick-004.csv row 12345 is an integer RegionID, mapping the lat/lon
# of row 12345 of dataframe od_part-m-004.pkl
gps_rid_fns = [f"./data/gps_rid-{x:03d}.csv" for x in range(2)]
od_pick_rid_fns = [f"./data/od_pick-{x:03d}.csv" for x in range(12)]
od_drop_rid_fns = [f"./data/od_drop-{x:03d}.csv" for x in range(12)]

# Start, end times
start_time = 1404360000
end_time = 1405828798
def time_to_index(tt, st=1404360000, divisor=60):
    """Return the zero-based bucket index that timestamp ``tt`` falls into.

    Buckets are ``divisor`` units wide and the first one begins at ``st``.
    Timestamps earlier than ``st`` produce negative indices.
    """
    elapsed = tt - st
    return floor(elapsed / divisor)
# Number of 60-unit timesteps needed to cover start_time..end_time, rounded up.
NN = ceil((end_time - start_time) / 60)
# Read the processed CSVs
def _safe_to_int(cell):
    """Parse ``cell`` as an int; return ``None`` when it is not a valid integer."""
    try:
        value = int(cell)
    except ValueError:
        # Some cells are empty, so they come back as None instead of an int.
        value = None
    return value


def read_rid_csv(fn):
    """Read a region-ID CSV into a list of rows.

    The file's text is stripped of leading/trailing whitespace and split on
    newlines; each line is split on commas and every cell is converted with
    ``_safe_to_int``. A blank line therefore yields ``[None]``.
    """
    with open(fn) as handle:
        content = handle.read()

    rows = []
    for line in content.strip().split("\n"):
        rows.append([_safe_to_int(cell) for cell in line.split(",")])
    return rows
def get_demand_from_files(od_fn, pick_rid_fn, number_of_timesteps, number_of_regions, verbose=True):
    """Count pickups per (timestep, region) for one OD file.

    Parameters:
        od_fn: path to a pickled DataFrame that has a ``ptime`` column.
        pick_rid_fn: path to the region-ID CSV with one line per DataFrame row.
        number_of_timesteps: number of rows of the returned matrix.
        number_of_regions: number of columns of the returned matrix.
        verbose: when true, print progress every 50000 rows and a summary of
            rows skipped for falling outside the time window.

    Returns:
        A float ``numpy`` array of shape
        ``(number_of_timesteps, number_of_regions)`` where entry ``[ts, rid]``
        is the number of rows whose pickup time maps to timestep ``ts`` and
        whose first listed region ID is ``rid``.

    Raises:
        AssertionError: if the DataFrame and the CSV have different row counts.

    Rows with no region ID are skipped. Rows whose timestep falls outside
    ``[0, number_of_timesteps)`` are skipped as well: a negative timestep would
    otherwise wrap around and be counted at the end of the array, and one past
    the end would raise ``IndexError``.
    """
    oddf = pd.read_pickle(od_fn)
    pick_rids = read_rid_csv(pick_rid_fn)
    total_rows = len(oddf)

    # They should have the same amount of rows! Raised explicitly rather than
    # with `assert` so the check is not stripped under `python -O`.
    if total_rows != len(pick_rids):
        raise AssertionError("OD DF and PICK RIDs don't match! What gives?")

    demand = np.zeros((number_of_timesteps, number_of_regions))

    # Extract the column once, by position: avoids a label-based Series lookup
    # on every iteration and does not depend on the DataFrame's index labels.
    ptimes = oddf["ptime"].to_numpy()

    skipped_out_of_window = 0
    for row_idx, (ptime, rids) in enumerate(zip(ptimes, pick_rids)):
        if verbose and row_idx % 50000 == 0:
            print(f" Processing row {row_idx} of {total_rows}")
        ts = time_to_index(ptime)
        rid = rids[0]  # Choose the first region ID in the list.
        if rid is None:
            # No region was found for this row.
            continue
        if not 0 <= ts < number_of_timesteps:
            skipped_out_of_window += 1
            continue
        demand[ts, rid] += 1

    if verbose and skipped_out_of_window:
        print(f" Skipped {skipped_out_of_window} rows outside the time window")
    return demand
if __name__ == "__main__":
    # Number of regions, i.e. columns of the demand matrix.
    number_of_regions = 492

    # Accumulate the per-file demand matrices into one total.
    demand = np.zeros((NN, number_of_regions))
    for od_fn, pick_rid_fn in zip(od_fns, od_pick_rid_fns):
        print(f"Processing {od_fn}")
        # get_demand_from_files loads both files and checks that their row
        # counts match, so they are not loaded a second time here.
        demand += get_demand_from_files(od_fn, pick_rid_fn, NN, number_of_regions)

    # Persist the total both as a NumPy binary and as an integer CSV.
    np.save("./data/demand", demand)
    pd.DataFrame(demand).astype(int).to_csv("./data/demand.csv")