Permalink
Cannot retrieve contributors at this time
Name already in use
A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
traffic_dataproc/scr_009_write_demand.py
Go to file. This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
92 lines (64 sloc)
3.25 KB
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copy of notebook 008, in a simple script. | |
import time
from math import ceil, floor

import numpy as np
import pandas as pd
from matplotlib import path
# Just some code from our previous files...
# List of filenames of the original data
original_od_fns = ["./DataForUConn/201407OD/201407.gz/part-m-{x}".format(x=str(x).rjust(5, "0")) for x in range(12)]
original_gps_fns = ["./DataForUConn/201407GPS/part-r-{x}".format(x=str(x).rjust(5, "0")) for x in range(2)]


def formattime(timestr):
    """Convert a "%Y-%m-%dT%H:%M:%S.000Z" string to a unix timestamp (int).

    Was a lambda assigned to a name (PEP 8 E731); it also referenced the
    `time` module without importing it, so any call raised NameError —
    fixed by the `import time` at the top of the file.
    NOTE(review): `time.mktime` interprets the struct in the LOCAL timezone
    even though the strings end in "Z" — presumably intentional upstream,
    but worth confirming.
    """
    return int(time.mktime(time.strptime(timestr, "%Y-%m-%dT%H:%M:%S.000Z")))


# Headers
od_headers = ["id", "ptime", "dtime", "plon", "plat", "dlon", "dlat"]
gps_headers = ["id", "color", "lon", "lat", "time", "speed", "noMeaning"]
# Filenames of our pickles
gps_fns = ["./data/gps_part-r-{x}.pkl".format(x=str(x).rjust(3, "0")) for x in range(2)]
od_fns = ["./data/od_part-m-{x}.pkl".format(x=str(x).rjust(3, "0")) for x in range(12)]
# Region IDs corresponding to each row of a given dataframe.
# These were processed in scr_005 and scr_006
# e.g. od_pick-004.csv row 12345 is an integer RegionID, mapping the lat/lon of row 12345 of dataframe od_part-m-004.pkl
gps_rid_fns = ["./data/gps_rid-{x}.csv".format(x=str(x).rjust(3, "0")) for x in range(2)]
od_pick_rid_fns = ["./data/od_pick-{x}.csv".format(x=str(x).rjust(3, "0")) for x in range(12)]
od_drop_rid_fns = ["./data/od_drop-{x}.csv".format(x=str(x).rjust(3, "0")) for x in range(12)]
# Start, end times (unix seconds)
start_time = 1404360000
end_time = 1405828798
def time_to_index(tt, st=1404360000, divisor=60):
    """Map unix timestamp *tt* to a 0-based bucket index relative to *st*.

    Timestamps before *st* yield NEGATIVE indices — callers must range-check
    before using the result to index an array.

    Fix: the original computed ``floor((tt - st) / divisor)``, routing large
    integers through float division, which can mis-round once the difference
    exceeds 2**53.  Floor division is exact for ints; ``int()`` keeps the
    return type an int even if a float sneaks in.
    """
    return int((tt - st) // divisor)
# Number of one-minute timesteps covering [start_time, end_time].
# Integer ceiling: (a + 59) // 60 == ceil(a / 60) for non-negative a.
NN = (end_time - start_time + 59) // 60
# Read the processed CSVs | |
def _safe_to_int(cell): | |
# some cells are empty, so, return None instead of int | |
try: | |
return int(cell) | |
except ValueError: | |
return None | |
def read_rid_csv(fn):
    """Load a region-ID CSV into a list of rows of ints (None for blanks)."""
    with open(fn) as handle:
        raw = handle.read().strip()
    rows = raw.split("\n")
    return [[_safe_to_int(field) for field in row.split(",")] for row in rows]
def get_demand_from_files(od_fn, pick_rid_fn, number_of_timesteps, number_of_regions, verbose=True):
    """Build a (timestep, region) pickup-demand count matrix from one OD shard.

    Parameters
    ----------
    od_fn : str
        Path to a pickled OD DataFrame (must have a "ptime" column and a
        default RangeIndex).
    pick_rid_fn : str
        Path to the matching pickup region-ID CSV (one row per OD row).
    number_of_timesteps, number_of_regions : int
        Shape of the demand matrix.
    verbose : bool
        Print progress every 50k rows.

    Returns
    -------
    np.ndarray of shape (number_of_timesteps, number_of_regions).
    """
    oddf = pd.read_pickle(od_fn)
    pick_rids = read_rid_csv(pick_rid_fn)
    # They should have the same amount of rows!
    assert len(oddf) == len(pick_rids), "OD DF and PICK RIDs don't match! What gives?"
    demand = np.zeros((number_of_timesteps, number_of_regions))
    n_rows = len(oddf)
    for row_idx in range(n_rows):
        if verbose and row_idx % 50000 == 0:
            print(f" Processing row {row_idx} of {len(oddf)}")
        ts = time_to_index(oddf["ptime"][row_idx])
        rid = pick_rids[row_idx][0]  # Choose the first region ID in the list.
        # Skip unmapped points (rid is None) AND out-of-window timestamps.
        # Bug fix: previously a ptime before start_time produced a NEGATIVE
        # ts, which numpy silently wraps to the END of the array, corrupting
        # counts; a ptime past end_time raised IndexError.
        if rid is None or not (0 <= ts < number_of_timesteps):
            continue
        demand[ts, rid] += 1
    return demand
if __name__ == "__main__":
    # Aggregate pickup demand across all 12 OD shards into one
    # (minute-bucket x region) matrix.  492 regions were defined upstream
    # (scr_005 / scr_006).
    demand = np.zeros((NN, 492))
    for od_fn, pick_rid_fn in zip(od_fns, od_pick_rid_fns):
        print(f"Processing {od_fn}")
        # Fix: the original loaded the pickle and the CSV here AND inside
        # get_demand_from_files (which also re-checks the length assert),
        # doubling I/O and peak memory for no benefit.  Let the helper do
        # the loading and validation.
        demand += get_demand_from_files(od_fn, pick_rid_fn, NN, 492)
    np.save("./data/demand", demand)
    pd.DataFrame(demand).astype(int).to_csv("./data/demand.csv")