Skip to content
Permalink
master
Switch branches/tags

Name already in use

A tag already exists with the provided branch name. Many Git commands accept both tag and branch names, so creating this branch may cause unexpected behavior. Are you sure you want to create this branch?
Go to file
 
 
Cannot retrieve contributors at this time
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 007: Progress so far, and quantizing code!\n",
"\n",
"## Summary of what we have\n",
"\n",
"We sum up the progress we have so far and the data we've processed.\n",
"\n",
"---"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from math import floor, ceil\n",
"from matplotlib import path\n",
"\n",
"# Just some code from our previous files...\n",
"\n",
"# List of filenames of the original data\n",
"od_fns = [\"./DataForUConn/201407OD/201407.gz/part-m-{x}\".format(x=str(x).rjust(5,\"0\")) for x in range(12)]\n",
"gps_fns = [\"./DataForUConn/201407GPS/part-r-{x}\".format(x=str(x).rjust(5,\"0\")) for x in range(2)]\n",
"\n",
"# Time string to unix time converter\n",
"formattime = lambda timestr: int(time.mktime(time.strptime(timestr, \"%Y-%m-%dT%H:%M:%S.000Z\")))\n",
"\n",
"# Headers\n",
"od_headers = [\"id\", \"ptime\", \"dtime\", \"plon\", \"plat\", \"dlon\", \"dlat\"]\n",
"gps_headers = [\"id\", \"color\", \"lon\", \"lat\", \"time\", \"speed\", \"noMeaning\"]\n",
"\n",
"# Processing `area.txt` to polygons...\n",
"def process_line(line):\n",
" # Not sure what these values are used for tbh!\n",
" val1 = line[0]\n",
" val2 = line[1]\n",
" polyline = [float(v) for v in line[2:]]\n",
" polyline = np.array(list(zip(polyline[0::2], polyline[1::2])))\n",
" return val1, val2, polyline \n",
"\n",
"def point_to_rids(point, polypaths, radius=0.0):\n",
" RIDs = []\n",
" for rid, polypath in enumerate(polypaths):\n",
" if polypath.contains_point(point, radius=radius):\n",
" RIDs.append(rid)\n",
"\n",
"with open(\"./DataForUConn/DecideRegion/area.txt\") as f:\n",
" area = [process_line(line.split(\",\")) for line in f.read().strip().split(\"\\n\")]\n",
"\n",
"polypaths = [path.Path(line[2][1:]) for line in area]"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Dataframes of the original data, with the \"time\" string converted to an integer\n",
"# e.g. usage; df = pd.read_pickle(gps_fns[0])\n",
"# these were processed in scr_002\n",
"gps_fns = [\"./data/gps_part-r-{x}.pkl\".format(x=str(x).rjust(3, \"0\")) for x in range(2)]\n",
"od_fns = [\"./data/od_part-m-{x}.pkl\".format(x=str(x).rjust(3, \"0\")) for x in range(12)]\n",
"\n",
"# Region IDs corresponding to each row of a given dataframe.\n",
"# These were processed in scr_005 and scr_006\n",
"# e.g. od_pick-004.csv row 12345 is an integer RegionID, mapping the lat/lon of of row 12345 of dataframe od_part-m-004.pkl\n",
"gps_rid_fns = [\"./data/gps_rid-{x}.csv\".format(x=str(x).rjust(3,\"0\")) for x in range(2)]\n",
"od_pick_rid_fns = [\"./data/od_pick-{x}.csv\".format(x=str(x).rjust(3,\"0\")) for x in range(12)]\n",
"od_drop_rid_fns = [\"./data/od_drop-{x}.csv\".format(x=str(x).rjust(3,\"0\")) for x in range(12)]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"./data/gps_part-r-001.pkl\n",
"./data/od_part-m-011.pkl\n",
"./data/gps_rid-001.csv\n",
"./data/od_pick-011.csv\n",
"./data/od_drop-011.csv\n"
]
}
],
"source": [
"# Take a look at their filenames!\n",
"print(gps_fns[-1])\n",
"print(od_fns[-1])\n",
"print(gps_rid_fns[-1])\n",
"print(od_pick_rid_fns[-1])\n",
"print(od_drop_rid_fns[-1])"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" id color lon lat time speed noMeaning\n",
"0 粤B0AA56 蓝的 113.961098 22.553101 1404360002 17 0\n",
"1 粤B0AA56 蓝的 113.962303 22.547001 1404360108 21 0\n",
"2 粤B0AA56 蓝的 113.962997 22.547001 1404360138 0 0\n",
"3 粤B0AA56 蓝的 113.963799 22.546400 1404360168 11 0\n",
"4 粤B0AA56 蓝的 113.963997 22.544399 1404360232 8 0\n",
"... ... ... ... ... ... ... ...\n",
"8835072 粤B7BW87 蓝的 114.120300 22.556200 1405140203 26 0\n",
"8835073 粤B7BW87 蓝的 114.124496 22.558800 1405140233 29 0\n",
"8835074 粤B7BW87 蓝的 114.127800 22.560600 1405140277 27 0\n",
"8835075 粤B7BW87 蓝的 114.133003 22.560699 1405140337 25 0\n",
"8835076 粤B7BW87 蓝的 114.143097 22.555500 1405140418 33 0\n",
"\n",
"[8835077 rows x 7 columns]\n",
" id ptime dtime plon plat dlon \\\n",
"0 粤B000H6 1404792268 1404793106 114.067535 22.567966 114.033546 \n",
"1 粤B000H6 1404795043 1404795607 114.116486 22.579849 114.117798 \n",
"2 粤B000H6 1404817520 1404817831 114.107765 22.568199 114.113548 \n",
"3 粤B000H6 1404818967 1404819999 114.107414 22.567949 114.095818 \n",
"4 粤B000H6 1404820170 1404820883 114.095482 22.540051 114.053436 \n",
"... ... ... ... ... ... ... \n",
"753812 粤BZ9Z49 1404440531 1404441428 114.026703 22.611700 114.032700 \n",
"753813 粤BZ9Z49 1404441608 1404442386 114.034203 22.625299 114.052299 \n",
"753814 粤BZ9Z49 1404443381 1404443614 114.132401 22.610600 114.121597 \n",
"753815 粤BZ9Z49 1404444120 1404444426 114.116898 22.601000 114.115601 \n",
"753816 粤BZ9Z49 1404444811 1404445215 114.121201 22.610001 114.111198 \n",
"\n",
" dlat \n",
"0 22.627783 \n",
"1 22.599850 \n",
"2 22.571234 \n",
"3 22.539984 \n",
"4 22.537033 \n",
"... ... \n",
"753812 22.629200 \n",
"753813 22.632299 \n",
"753814 22.607300 \n",
"753815 22.618299 \n",
"753816 22.614100 \n",
"\n",
"[753817 rows x 7 columns]\n"
]
}
],
"source": [
"#Let's look at the first row of each of these.\n",
"\n",
"print(pd.read_pickle(gps_fns[0]))\n",
"print(pd.read_pickle(od_fns[0]))"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[158], [311], [311], [311], [311], [311], [311], [311], [306], [306], [306], [306], [306], [306], [310], [240], [223], [223], [223], [223]]\n",
"[[158], [311], [311], [311], [311], [311], [311], [311], [306], [306], [306], [306], [306], [306], [310], [240], [223], [223], [223], [223]]\n",
"[[427], [198], [202], [283], [235], [283], [283], [286], [281], [208], [172], [191], [189], [279], [281], [186], [184], [191], [278], [175]]\n"
]
}
],
"source": [
"def _safe_to_int(cell):\n",
" # some cells are empty, so, return None instead of int\n",
" try:\n",
" return int(cell)\n",
" except ValueError:\n",
" return None\n",
"\n",
"def read_rid_csv(fn):\n",
" with open(fn) as f:\n",
" return [[_safe_to_int(cell) for cell in line.split(\",\")] for line in f.read().strip().split(\"\\n\")]\n",
"\n",
"gps_rids_0 = read_rid_csv(gps_rid_fns[0])\n",
"pick_rids_0 = read_rid_csv(od_pick_rid_fns[0])\n",
"drop_rids_0 = read_rid_csv(od_drop_rid_fns[0])\n",
"\n",
"print(gps_rids_0[:20])\n",
"print(pick_rids_0[:20])\n",
"print(drop_rids_0[:20])"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Some 'bad' GPS RIDs\n",
"674 [331, 339]\n",
"927 [251, 265]\n",
"931 [223, 247]\n",
"932 [223, 247]\n",
"1003 [None]\n",
"1007 [None]\n",
"1397 [None]\n",
"1398 [None]\n",
"1399 [None]\n",
"1968 [139, 240]\n",
"2216 [None]\n",
"2217 [None]\n",
"2218 [None]\n",
"2242 [None]\n",
"2245 [158, 159]\n",
"3482 [None]\n",
"3509 [None]\n",
"3518 [None]\n",
"3804 [235, 237]\n",
"3882 [331, 339]\n",
"\n",
"Some 'bad' Pick RIDs\n",
"11 [None]\n",
"341 [None]\n",
"805 [31, 241]\n",
"1275 [None]\n",
"1380 [None]\n",
"1438 [267, 268]\n",
"1981 [None]\n",
"2710 [None]\n",
"2733 [None]\n",
"2734 [None]\n",
"3957 [31, 241]\n",
"\n",
"Some 'bad' Drop RIDs\n",
"239 [None]\n",
"341 [None]\n",
"789 [None]\n",
"790 [181, 191]\n",
"1021 [None]\n",
"1186 [None]\n",
"1270 [None]\n",
"1274 [None]\n",
"1442 [263, 267]\n",
"1863 [255, 341]\n",
"1951 [331, 339]\n",
"1980 [None]\n",
"2520 [None]\n",
"2535 [409, 410]\n",
"2649 [187, 292]\n",
"2732 [None]\n",
"2733 [None]\n",
"3356 [31, 133]\n",
"3523 [139, 240]\n",
"3756 [None]\n"
]
}
],
"source": [
"# Let's view some bad cells:\n",
"print(\"Some 'bad' GPS RIDs\")\n",
"for idx, line in enumerate(gps_rids_0[0:4000]):\n",
" if line[0] == None or not len(line) == 1:\n",
" print(idx, line)\n",
"\n",
"print(\"\\nSome 'bad' Pick RIDs\")\n",
"for idx, line in enumerate(pick_rids_0[0:4000]):\n",
" if line[0] == None or not len(line) == 1:\n",
" print(idx, line)\n",
"\n",
"print(\"\\nSome 'bad' Drop RIDs\")\n",
"for idx, line in enumerate(drop_rids_0[0:4000]):\n",
" if line[0] == None or not len(line) == 1:\n",
" print(idx, line)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}