# scr_009_write_demand.py

# Copy of notebook 008, in a simple script.


import time
from math import floor, ceil

import numpy as np
import pandas as pd
from matplotlib import path

# Just some code from our previous files...

# Paths of the original data parts: 12 OD parts and 2 GPS parts,
# each suffixed with a zero-padded, five-digit part number.
original_od_fns = [f"./DataForUConn/201407OD/201407.gz/part-m-{part:05d}" for part in range(12)]
original_gps_fns = [f"./DataForUConn/201407GPS/part-r-{part:05d}" for part in range(2)]

# Time string to unix time converter
def formattime(timestr):
    """Convert a timestamp string to integer Unix time.

    Args:
        timestr: A string formatted exactly as ``%Y-%m-%dT%H:%M:%S.000Z``,
            e.g. ``"2014-07-03T04:00:00.000Z"``.

    Returns:
        The timestamp as an ``int`` number of seconds since the epoch.

    Raises:
        ValueError: If ``timestr`` does not match the expected format.
    """
    # The trailing ".000Z" is matched as literal text, and time.mktime reads
    # the parsed fields as *local* time, so the result depends on the timezone
    # of the machine running this script.
    # NOTE(review): if the strings are genuinely UTC, calendar.timegm would be
    # the timezone-independent conversion -- confirm how start_time/end_time
    # were derived before changing this.
    parsed = time.strptime(timestr, "%Y-%m-%dT%H:%M:%S.000Z")
    return int(time.mktime(parsed))

# Column headers for the OD and GPS data.
# Presumably the p*/d* OD fields are pickup/drop-off values (cf. the
# od_pick / od_drop region-ID files) -- confirm against the raw data.
od_headers = [
    "id",
    "ptime",
    "dtime",
    "plon",
    "plat",
    "dlon",
    "dlat",
]
gps_headers = [
    "id",
    "color",
    "lon",
    "lat",
    "time",
    "speed",
    "noMeaning",
]

# Filenames of our pickles: 2 GPS parts and 12 OD parts, numbered with
# zero-padded, three-digit part numbers.
gps_fns = [f"./data/gps_part-r-{part:03d}.pkl" for part in range(2)]
od_fns = [f"./data/od_part-m-{part:03d}.pkl" for part in range(12)]

# Region IDs corresponding to each row of a given dataframe.
# These were processed in scr_005 and scr_006.
# e.g. row 12345 of od_pick-004.csv is an integer RegionID, mapping the
# lat/lon of row 12345 of dataframe od_part-m-004.pkl.
gps_rid_fns = [f"./data/gps_rid-{part:03d}.csv" for part in range(2)]
od_pick_rid_fns = [f"./data/od_pick-{part:03d}.csv" for part in range(12)]
od_drop_rid_fns = [f"./data/od_drop-{part:03d}.csv" for part in range(12)]

# Start and end of the time window covered by the demand matrix.
# Read as Unix timestamps these are 2014-07-03 04:00:00 UTC and
# 2014-07-20 03:59:58 UTC respectively.
start_time = 1404360000
end_time = 1405828798


def time_to_index(tt, st=start_time, divisor=60):
    """Return the index of the ``divisor``-second bin that contains ``tt``.

    Bins are counted from ``st``: times in ``[st, st + divisor)`` map to 0,
    the next ``divisor`` seconds map to 1, and so on. Times before ``st``
    map to negative indices.
    """
    elapsed = tt - st
    return floor(elapsed / divisor)


# Number of 60-second timesteps needed to cover [start_time, end_time].
NN = ceil((end_time - start_time) / 60)

# Read the processed CSVs
def _safe_to_int(cell):
    """Parse ``cell`` as an int, or return None when it is not a valid integer.

    Empty cells occur in the region-ID CSVs, so they map to None rather
    than raising.
    """
    try:
        value = int(cell)
    except ValueError:
        value = None
    return value


def read_rid_csv(fn):
    """Load a region-ID CSV as a list of rows, each a list of ints or Nones.

    The whole file is read, surrounding whitespace is stripped, and the text
    is split into lines on "\\n" and into cells on ",".

    NOTE(review): because of the strip, blank lines at the very start or end
    of the file are dropped, and an empty file yields a single row
    ``[None]`` -- confirm this matches how the CSVs were written.
    """
    with open(fn) as handle:
        contents = handle.read()
    lines = contents.strip().split("\n")
    return [list(map(_safe_to_int, line.split(","))) for line in lines]


def get_demand_from_files(od_fn, pick_rid_fn, number_of_timesteps, number_of_regions, verbose=True):
    """Count pickups per (timestep, region) for one OD pickle and its region-ID CSV.

    Args:
        od_fn: Path to a pickled DataFrame that has a ``ptime`` column
            (presumably Unix seconds, as consumed by ``time_to_index``).
        pick_rid_fn: Path to the region-ID CSV whose row ``i`` belongs to
            row ``i`` of the DataFrame.
        number_of_timesteps: Number of rows of the returned matrix.
        number_of_regions: Number of columns of the returned matrix.
        verbose: When true, print a progress line every 50000 rows and a
            summary of any rows skipped for being outside the time window.

    Returns:
        A float array of shape ``(number_of_timesteps, number_of_regions)``
        where entry ``[t, r]`` is the number of rows whose pickup time falls
        in timestep ``t`` and whose first region ID is ``r``. Rows with no
        region ID, or with a pickup time outside the matrix's time window,
        are not counted.

    Raises:
        AssertionError: If the DataFrame and the CSV have different row counts.
        IndexError: If a region ID is not smaller than ``number_of_regions``.
    """
    oddf = pd.read_pickle(od_fn)
    pick_rids = read_rid_csv(pick_rid_fn)

    # They should have the same amount of rows! Raised explicitly instead of
    # via `assert` so the check still runs under `python -O`.
    if len(oddf) != len(pick_rids):
        raise AssertionError("OD DF and PICK RIDs don't match! What gives?")

    demand = np.zeros((number_of_timesteps, number_of_regions))

    # Pull the column out as a plain array so rows are addressed by position.
    # Indexing the Series with `[row_idx]` is label-based: it picks the wrong
    # row (or raises KeyError) when the DataFrame index is not 0..n-1, and it
    # is also far slower per row.
    pickup_times = oddf["ptime"].to_numpy()
    total_rows = len(pickup_times)
    out_of_window = 0

    for row_idx, (ptime, rids) in enumerate(zip(pickup_times, pick_rids)):
        if verbose and row_idx % 50000 == 0:
            print(f"  Processing row {row_idx} of {total_rows}")

        rid = rids[0]  # Choose the first region ID in the list.
        if rid is None:
            # No region was matched for this row; nothing to count.
            continue

        ts = time_to_index(ptime)
        if not 0 <= ts < number_of_timesteps:
            # A negative index would silently wrap around and be counted in
            # the last rows of the matrix, so out-of-window pickups are
            # dropped explicitly (on both sides of the window).
            out_of_window += 1
            continue

        # NOTE(review): a negative region ID would likewise index from the
        # end of the row -- confirm the CSVs never contain one.
        demand[ts, rid] += 1

    if verbose and out_of_window:
        print(f"  Skipped {out_of_window} rows with pickup times outside the demand window")

    return demand


def main(number_of_regions=492):
    """Accumulate pickup demand over all OD parts and write it to disk.

    For every (OD pickle, pickup region-ID CSV) pair, the per-file demand
    matrix from ``get_demand_from_files`` is added into one
    ``(NN, number_of_regions)`` matrix. The total is saved with ``np.save``
    under ``./data/demand`` (NumPy appends ``.npy``) and, cast to integers,
    as ``./data/demand.csv``.

    Args:
        number_of_regions: Number of region columns in the demand matrix.
    """
    demand = np.zeros((NN, number_of_regions))

    # get_demand_from_files loads both files and checks that their row counts
    # match, so they are not loaded (and checked) a second time here.
    for od_fn, pick_rid_fn in zip(od_fns, od_pick_rid_fns):
        print(f"Processing {od_fn}")
        demand += get_demand_from_files(od_fn, pick_rid_fn, NN, number_of_regions)

    np.save("./data/demand", demand)

    # Counts are accumulated as floats; write them out as integers.
    pd.DataFrame(demand).astype(int).to_csv("./data/demand.csv")


if __name__ == "__main__":
    main()