nb_007_progress-so-far.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 007: Progress so far, and quantizing code!\n",
    "\n",
    "## Summary of what we have\n",
    "\n",
    "This notebook sums up the progress we have made so far and the data we've processed.\n",
    "\n",
    "---"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "from math import floor, ceil\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from matplotlib import path\n",
    "\n",
    "# Setup carried over from the earlier notebooks/scripts in this series.\n",
    "\n",
    "# Filenames of the original (raw) OD and GPS data parts, numbered with 5-digit zero padding.\n",
    "od_fns = [\"./DataForUConn/201407OD/201407.gz/part-m-{x:05d}\".format(x=x) for x in range(12)]\n",
    "gps_fns = [\"./DataForUConn/201407GPS/part-r-{x:05d}\".format(x=x) for x in range(2)]\n",
    "\n",
    "def formattime(timestr):\n",
    "    \"\"\"Convert a timestamp string like 2014-07-03T04:00:02.000Z to integer Unix time.\n",
    "\n",
    "    The string must match the format %Y-%m-%dT%H:%M:%S.000Z exactly (the\n",
    "    milliseconds part is the literal text .000); time.strptime raises\n",
    "    ValueError otherwise.\n",
    "\n",
    "    NOTE(review): time.mktime interprets the parsed fields as *local* time even\n",
    "    though the trailing Z suggests UTC. Behavior is left unchanged so results\n",
    "    stay comparable with data that was presumably converted the same way -- confirm.\n",
    "    \"\"\"\n",
    "    parsed = time.strptime(timestr, \"%Y-%m-%dT%H:%M:%S.000Z\")\n",
    "    return int(time.mktime(parsed))\n",
    "\n",
    "# Column headers for the raw OD and GPS files.\n",
    "od_headers = [\"id\", \"ptime\", \"dtime\", \"plon\", \"plat\", \"dlon\", \"dlat\"]\n",
    "gps_headers = [\"id\", \"color\", \"lon\", \"lat\", \"time\", \"speed\", \"noMeaning\"]\n",
    "\n",
    "# Turning the records of `area.txt` into polygons...\n",
    "def process_line(line):\n",
    "    \"\"\"Split one `area.txt` record (already a list of string fields) into its parts.\n",
    "\n",
    "    The first two fields are returned untouched (their meaning is unknown\n",
    "    here). All remaining fields are converted to float and paired up\n",
    "    consecutively, giving a numpy array with one (first, second) pair per row.\n",
    "    \"\"\"\n",
    "    head_a, head_b = line[0], line[1]\n",
    "    coords = [float(field) for field in line[2:]]\n",
    "    # Even positions and odd positions are zipped into coordinate pairs.\n",
    "    vertices = np.array(list(zip(coords[::2], coords[1::2])))\n",
    "    return head_a, head_b, vertices\n",
    "\n",
    "def point_to_rids(point, polypaths, radius=0.0):\n",
    "    \"\"\"Return the region IDs (indices into `polypaths`) of all polygons containing `point`.\n",
    "\n",
    "    point     -- the point to test, passed straight to each path's contains_point.\n",
    "    polypaths -- sequence of objects exposing contains_point(point, radius=...).\n",
    "    radius    -- forwarded unchanged to contains_point.\n",
    "\n",
    "    The result is a list: empty when no polygon contains the point, and with\n",
    "    several entries when polygons overlap at the point.\n",
    "    \"\"\"\n",
    "    # The list was previously built but never returned, so callers always got None.\n",
    "    return [\n",
    "        rid\n",
    "        for rid, polypath in enumerate(polypaths)\n",
    "        if polypath.contains_point(point, radius=radius)\n",
    "    ]\n",
    "\n",
    "# Read the region definitions: one comma-separated record per line of `area.txt`.\n",
    "with open(\"./DataForUConn/DecideRegion/area.txt\") as f:\n",
    "    area = [\n",
    "        process_line(record.split(\",\"))\n",
    "        for record in f.read().strip().split(\"\\n\")\n",
    "    ]\n",
    "\n",
    "# One matplotlib Path per record; the first vertex of each polyline is skipped ([1:]).\n",
    "polypaths = [path.Path(vertices[1:]) for _, _, vertices in area]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pickled dataframes of the original data, with the \"time\" string converted to an integer.\n",
    "# e.g. usage: df = pd.read_pickle(gps_fns[0])\n",
    "# These were produced in scr_002.\n",
    "gps_fns = [\"./data/gps_part-r-{x:03d}.pkl\".format(x=x) for x in range(2)]\n",
    "od_fns = [\"./data/od_part-m-{x:03d}.pkl\".format(x=x) for x in range(12)]\n",
    "\n",
    "# Region IDs corresponding to each row of a given dataframe.\n",
    "# These were produced in scr_005 and scr_006.\n",
    "# e.g. row 12345 of od_pick-004.csv is an integer RegionID mapping the lat/lon of row 12345 of dataframe od_part-m-004.pkl\n",
    "gps_rid_fns = [\"./data/gps_rid-{x:03d}.csv\".format(x=x) for x in range(2)]\n",
    "od_pick_rid_fns = [\"./data/od_pick-{x:03d}.csv\".format(x=x) for x in range(12)]\n",
    "od_drop_rid_fns = [\"./data/od_drop-{x:03d}.csv\".format(x=x) for x in range(12)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "./data/gps_part-r-001.pkl\n",
      "./data/od_part-m-011.pkl\n",
      "./data/gps_rid-001.csv\n",
      "./data/od_pick-011.csv\n",
      "./data/od_drop-011.csv\n"
     ]
    }
   ],
   "source": [
    "# Peek at the last filename of each list, one per line.\n",
    "print(\n",
    "    gps_fns[-1],\n",
    "    od_fns[-1],\n",
    "    gps_rid_fns[-1],\n",
    "    od_pick_rid_fns[-1],\n",
    "    od_drop_rid_fns[-1],\n",
    "    sep=\"\\n\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "              id color         lon        lat        time  speed noMeaning\n",
      "0        粤B0AA56    蓝的  113.961098  22.553101  1404360002     17         0\n",
      "1        粤B0AA56    蓝的  113.962303  22.547001  1404360108     21         0\n",
      "2        粤B0AA56    蓝的  113.962997  22.547001  1404360138      0         0\n",
      "3        粤B0AA56    蓝的  113.963799  22.546400  1404360168     11         0\n",
      "4        粤B0AA56    蓝的  113.963997  22.544399  1404360232      8         0\n",
      "...          ...   ...         ...        ...         ...    ...       ...\n",
      "8835072  粤B7BW87    蓝的  114.120300  22.556200  1405140203     26         0\n",
      "8835073  粤B7BW87    蓝的  114.124496  22.558800  1405140233     29         0\n",
      "8835074  粤B7BW87    蓝的  114.127800  22.560600  1405140277     27         0\n",
      "8835075  粤B7BW87    蓝的  114.133003  22.560699  1405140337     25         0\n",
      "8835076  粤B7BW87    蓝的  114.143097  22.555500  1405140418     33         0\n",
      "\n",
      "[8835077 rows x 7 columns]\n",
      "             id       ptime       dtime        plon       plat        dlon  \\\n",
      "0       粤B000H6  1404792268  1404793106  114.067535  22.567966  114.033546   \n",
      "1       粤B000H6  1404795043  1404795607  114.116486  22.579849  114.117798   \n",
      "2       粤B000H6  1404817520  1404817831  114.107765  22.568199  114.113548   \n",
      "3       粤B000H6  1404818967  1404819999  114.107414  22.567949  114.095818   \n",
      "4       粤B000H6  1404820170  1404820883  114.095482  22.540051  114.053436   \n",
      "...         ...         ...         ...         ...        ...         ...   \n",
      "753812  粤BZ9Z49  1404440531  1404441428  114.026703  22.611700  114.032700   \n",
      "753813  粤BZ9Z49  1404441608  1404442386  114.034203  22.625299  114.052299   \n",
      "753814  粤BZ9Z49  1404443381  1404443614  114.132401  22.610600  114.121597   \n",
      "753815  粤BZ9Z49  1404444120  1404444426  114.116898  22.601000  114.115601   \n",
      "753816  粤BZ9Z49  1404444811  1404445215  114.121201  22.610001  114.111198   \n",
      "\n",
      "             dlat  \n",
      "0       22.627783  \n",
      "1       22.599850  \n",
      "2       22.571234  \n",
      "3       22.539984  \n",
      "4       22.537033  \n",
      "...           ...  \n",
      "753812  22.629200  \n",
      "753813  22.632299  \n",
      "753814  22.607300  \n",
      "753815  22.618299  \n",
      "753816  22.614100  \n",
      "\n",
      "[753817 rows x 7 columns]\n"
     ]
    }
   ],
   "source": [
    "# Print the first GPS dataframe and the first OD dataframe.\n",
    "# The whole frame is passed to print; pandas abbreviates it to the leading and trailing rows.\n",
    "for pkl_fn in (gps_fns[0], od_fns[0]):\n",
    "    print(pd.read_pickle(pkl_fn))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[158], [311], [311], [311], [311], [311], [311], [311], [306], [306], [306], [306], [306], [306], [310], [240], [223], [223], [223], [223]]\n",
      "[[158], [311], [311], [311], [311], [311], [311], [311], [306], [306], [306], [306], [306], [306], [310], [240], [223], [223], [223], [223]]\n",
      "[[427], [198], [202], [283], [235], [283], [283], [286], [281], [208], [172], [191], [189], [279], [281], [186], [184], [191], [278], [175]]\n"
     ]
    }
   ],
   "source": [
    "def _safe_to_int(cell):\n",
    "    \"\"\"Convert `cell` to int, or give back None when int() raises ValueError (e.g. an empty cell).\"\"\"\n",
    "    try:\n",
    "        value = int(cell)\n",
    "    except ValueError:\n",
    "        value = None\n",
    "    return value\n",
    "\n",
    "def read_rid_csv(fn):\n",
    "    \"\"\"Load a region-ID CSV file as a list of rows.\n",
    "\n",
    "    Each row is the list of its comma-separated cells run through\n",
    "    _safe_to_int, so cells that are not valid integers (e.g. empty ones)\n",
    "    become None. Leading/trailing whitespace of the whole file is stripped\n",
    "    before it is split into lines.\n",
    "    \"\"\"\n",
    "    with open(fn) as f:\n",
    "        text = f.read().strip()\n",
    "    rows = []\n",
    "    for row_text in text.split(\"\\n\"):\n",
    "        rows.append([_safe_to_int(cell) for cell in row_text.split(\",\")])\n",
    "    return rows\n",
    "\n",
    "# Load the region IDs for the first GPS, pickup and dropoff files.\n",
    "gps_rids_0, pick_rids_0, drop_rids_0 = (\n",
    "    read_rid_csv(rid_fns[0])\n",
    "    for rid_fns in (gps_rid_fns, od_pick_rid_fns, od_drop_rid_fns)\n",
    ")\n",
    "\n",
    "# Show the first 20 rows of each, one list per line.\n",
    "print(gps_rids_0[:20], pick_rids_0[:20], drop_rids_0[:20], sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Some 'bad' GPS RIDs\n",
      "674 [331, 339]\n",
      "927 [251, 265]\n",
      "931 [223, 247]\n",
      "932 [223, 247]\n",
      "1003 [None]\n",
      "1007 [None]\n",
      "1397 [None]\n",
      "1398 [None]\n",
      "1399 [None]\n",
      "1968 [139, 240]\n",
      "2216 [None]\n",
      "2217 [None]\n",
      "2218 [None]\n",
      "2242 [None]\n",
      "2245 [158, 159]\n",
      "3482 [None]\n",
      "3509 [None]\n",
      "3518 [None]\n",
      "3804 [235, 237]\n",
      "3882 [331, 339]\n",
      "\n",
      "Some 'bad' Pick RIDs\n",
      "11 [None]\n",
      "341 [None]\n",
      "805 [31, 241]\n",
      "1275 [None]\n",
      "1380 [None]\n",
      "1438 [267, 268]\n",
      "1981 [None]\n",
      "2710 [None]\n",
      "2733 [None]\n",
      "2734 [None]\n",
      "3957 [31, 241]\n",
      "\n",
      "Some 'bad' Drop RIDs\n",
      "239 [None]\n",
      "341 [None]\n",
      "789 [None]\n",
      "790 [181, 191]\n",
      "1021 [None]\n",
      "1186 [None]\n",
      "1270 [None]\n",
      "1274 [None]\n",
      "1442 [263, 267]\n",
      "1863 [255, 341]\n",
      "1951 [331, 339]\n",
      "1980 [None]\n",
      "2520 [None]\n",
      "2535 [409, 410]\n",
      "2649 [187, 292]\n",
      "2732 [None]\n",
      "2733 [None]\n",
      "3356 [31, 133]\n",
      "3523 [139, 240]\n",
      "3756 [None]\n"
     ]
    }
   ],
   "source": [
    "# Let's view some bad cells: rows whose first RID is None or that hold more than one entry.\n",
    "def print_bad_rids(label, rid_rows, limit=4000):\n",
    "    \"\"\"Print the index and contents of every 'bad' row among the first `limit` rows.\n",
    "\n",
    "    label    -- name inserted into the heading line (e.g. GPS, Pick, Drop).\n",
    "    rid_rows -- list of rows as returned by read_rid_csv.\n",
    "    limit    -- how many leading rows to inspect.\n",
    "\n",
    "    A row is 'bad' when its first entry is None (the CSV cell was not an\n",
    "    integer) or when it does not hold exactly one entry.\n",
    "    \"\"\"\n",
    "    print(\"Some 'bad' {label} RIDs\".format(label=label))\n",
    "    for idx, rids in enumerate(rid_rows[:limit]):\n",
    "        if rids[0] is None or len(rids) != 1:\n",
    "            print(idx, rids)\n",
    "\n",
    "print_bad_rids(\"GPS\", gps_rids_0)\n",
    "print()  # blank separator line between the sections\n",
    "print_bad_rids(\"Pick\", pick_rids_0)\n",
    "print()\n",
    "print_bad_rids(\"Drop\", drop_rids_0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}