{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# 1. List of all filenames\n",
"od_fns = [\"./DataForUConn/201407OD/201407.gz/part-m-{x}\".format(x=str(x).rjust(5,\"0\")) for x in range(12)]\n",
"gps_fns = [\"./DataForUConn/201407GPS/part-r-{x}\".format(x=str(x).rjust(5,\"0\")) for x in range(2)]\n",
"\n",
"od_headers = [\"id\", \"ptime\", \"dtime\", \"plon\", \"plat\", \"dlon\", \"dlat\"]\n",
"gps_headers = [\"id\", \"color\", \"lon\", \"lat\", \"time\", \"speed\", \"noMeaning\"]"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# 2. Let's explore each filename and write a processor\n",
"with open(od_fns[0]) as f:\n",
" odlines = [line.split(\",\") for line in f.read().strip().split(\"\\n\")[:100]]\n",
"\n",
"with open(gps_fns[0]) as f:\n",
" gpslines = [line.split(\",\") for line in f.read().strip().split(\"\\n\")[:100]]\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['粤B000H6', '2014-07-08T00:04:28.000Z', '2014-07-08T00:18:26.000Z', '114.067535', '22.567966', '114.033546', '22.627783']\n",
"['粤B0AA56', '蓝的', '113.961098', '22.553101', '2014-07-03T00:00:02.000Z', '17', '0']\n"
]
}
],
"source": [
"print(odlines[0])\n",
"print(gpslines[0])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"oddf = pd.DataFrame(odlines, columns=od_headers)\n",
"gpsdf = pd.DataFrame(gpslines, columns=gps_headers)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>ptime</th>\n",
" <th>dtime</th>\n",
" <th>plon</th>\n",
" <th>plat</th>\n",
" <th>dlon</th>\n",
" <th>dlat</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>粤B000H6</td>\n",
" <td>2014-07-08T00:04:28.000Z</td>\n",
" <td>2014-07-08T00:18:26.000Z</td>\n",
" <td>114.067535</td>\n",
" <td>22.567966</td>\n",
" <td>114.033546</td>\n",
" <td>22.627783</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>粤B000H6</td>\n",
" <td>2014-07-08T00:50:43.000Z</td>\n",
" <td>2014-07-08T01:00:07.000Z</td>\n",
" <td>114.116486</td>\n",
" <td>22.579849</td>\n",
" <td>114.117798</td>\n",
" <td>22.59985</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>粤B000H6</td>\n",
" <td>2014-07-08T07:05:20.000Z</td>\n",
" <td>2014-07-08T07:10:31.000Z</td>\n",
" <td>114.107765</td>\n",
" <td>22.568199</td>\n",
" <td>114.113548</td>\n",
" <td>22.571234</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>粤B000H6</td>\n",
" <td>2014-07-08T07:29:27.000Z</td>\n",
" <td>2014-07-08T07:46:39.000Z</td>\n",
" <td>114.107414</td>\n",
" <td>22.567949</td>\n",
" <td>114.095818</td>\n",
" <td>22.539984</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>粤B000H6</td>\n",
" <td>2014-07-08T07:49:30.000Z</td>\n",
" <td>2014-07-08T08:01:23.000Z</td>\n",
" <td>114.095482</td>\n",
" <td>22.540051</td>\n",
" <td>114.053436</td>\n",
" <td>22.537033</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95</th>\n",
" <td>粤B001ZD</td>\n",
" <td>2014-07-08T12:27:32.000Z</td>\n",
" <td>2014-07-08T12:38:18.000Z</td>\n",
" <td>114.257004</td>\n",
" <td>22.732</td>\n",
" <td>114.233498</td>\n",
" <td>22.728001</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96</th>\n",
" <td>粤B001ZD</td>\n",
" <td>2014-07-08T12:41:03.000Z</td>\n",
" <td>2014-07-08T12:56:08.000Z</td>\n",
" <td>114.233101</td>\n",
" <td>22.728399</td>\n",
" <td>114.267799</td>\n",
" <td>22.7202</td>\n",
" </tr>\n",
" <tr>\n",
" <th>97</th>\n",
" <td>粤B001ZD</td>\n",
" <td>2014-07-08T13:36:32.000Z</td>\n",
" <td>2014-07-08T13:45:00.000Z</td>\n",
" <td>114.264297</td>\n",
" <td>22.7269</td>\n",
" <td>114.277298</td>\n",
" <td>22.7383</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98</th>\n",
" <td>粤B001ZD</td>\n",
" <td>2014-07-08T14:40:40.000Z</td>\n",
" <td>2014-07-08T14:54:02.000Z</td>\n",
" <td>114.344902</td>\n",
" <td>22.687401</td>\n",
" <td>114.338799</td>\n",
" <td>22.707701</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99</th>\n",
" <td>粤B001ZD</td>\n",
" <td>2014-07-08T15:12:27.000Z</td>\n",
" <td>2014-07-08T15:15:54.000Z</td>\n",
" <td>114.327003</td>\n",
" <td>22.7012</td>\n",
" <td>114.302696</td>\n",
" <td>22.7176</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>100 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" id ptime dtime plon \\\n",
"0 粤B000H6 2014-07-08T00:04:28.000Z 2014-07-08T00:18:26.000Z 114.067535 \n",
"1 粤B000H6 2014-07-08T00:50:43.000Z 2014-07-08T01:00:07.000Z 114.116486 \n",
"2 粤B000H6 2014-07-08T07:05:20.000Z 2014-07-08T07:10:31.000Z 114.107765 \n",
"3 粤B000H6 2014-07-08T07:29:27.000Z 2014-07-08T07:46:39.000Z 114.107414 \n",
"4 粤B000H6 2014-07-08T07:49:30.000Z 2014-07-08T08:01:23.000Z 114.095482 \n",
".. ... ... ... ... \n",
"95 粤B001ZD 2014-07-08T12:27:32.000Z 2014-07-08T12:38:18.000Z 114.257004 \n",
"96 粤B001ZD 2014-07-08T12:41:03.000Z 2014-07-08T12:56:08.000Z 114.233101 \n",
"97 粤B001ZD 2014-07-08T13:36:32.000Z 2014-07-08T13:45:00.000Z 114.264297 \n",
"98 粤B001ZD 2014-07-08T14:40:40.000Z 2014-07-08T14:54:02.000Z 114.344902 \n",
"99 粤B001ZD 2014-07-08T15:12:27.000Z 2014-07-08T15:15:54.000Z 114.327003 \n",
"\n",
" plat dlon dlat \n",
"0 22.567966 114.033546 22.627783 \n",
"1 22.579849 114.117798 22.59985 \n",
"2 22.568199 114.113548 22.571234 \n",
"3 22.567949 114.095818 22.539984 \n",
"4 22.540051 114.053436 22.537033 \n",
".. ... ... ... \n",
"95 22.732 114.233498 22.728001 \n",
"96 22.728399 114.267799 22.7202 \n",
"97 22.7269 114.277298 22.7383 \n",
"98 22.687401 114.338799 22.707701 \n",
"99 22.7012 114.302696 22.7176 \n",
"\n",
"[100 rows x 7 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"oddf"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>id</th>\n",
" <th>color</th>\n",
" <th>lon</th>\n",
" <th>lat</th>\n",
" <th>time</th>\n",
" <th>speed</th>\n",
" <th>noMeaning</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>粤B0AA56</td>\n",
" <td>蓝的</td>\n",
" <td>113.961098</td>\n",
" <td>22.553101</td>\n",
" <td>2014-07-03T00:00:02.000Z</td>\n",
" <td>17</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>粤B0AA56</td>\n",
" <td>蓝的</td>\n",
" <td>113.962303</td>\n",
" <td>22.547001</td>\n",
" <td>2014-07-03T00:01:48.000Z</td>\n",
" <td>21</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>粤B0AA56</td>\n",
" <td>蓝的</td>\n",
" <td>113.962997</td>\n",
" <td>22.547001</td>\n",
" <td>2014-07-03T00:02:18.000Z</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>粤B0AA56</td>\n",
" <td>蓝的</td>\n",
" <td>113.963799</td>\n",
" <td>22.5464</td>\n",
" <td>2014-07-03T00:02:48.000Z</td>\n",
" <td>11</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>粤B0AA56</td>\n",
" <td>蓝的</td>\n",
" <td>113.963997</td>\n",
" <td>22.544399</td>\n",
" <td>2014-07-03T00:03:52.000Z</td>\n",
" <td>8</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>95</th>\n",
" <td>粤B0AA56</td>\n",
" <td>蓝的</td>\n",
" <td>114.040703</td>\n",
" <td>22.518101</td>\n",
" <td>2014-07-03T01:16:24.000Z</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>96</th>\n",
" <td>粤B0AA56</td>\n",
" <td>蓝的</td>\n",
" <td>114.040703</td>\n",
" <td>22.518101</td>\n",
" <td>2014-07-03T01:16:54.000Z</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>97</th>\n",
" <td>粤B0AA56</td>\n",
" <td>蓝的</td>\n",
" <td>114.040703</td>\n",
" <td>22.518101</td>\n",
" <td>2014-07-03T01:17:24.000Z</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>98</th>\n",
" <td>粤B0AA56</td>\n",
" <td>蓝的</td>\n",
" <td>114.040703</td>\n",
" <td>22.518101</td>\n",
" <td>2014-07-03T01:17:54.000Z</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>99</th>\n",
" <td>粤B0AA56</td>\n",
" <td>蓝的</td>\n",
" <td>114.040703</td>\n",
" <td>22.518101</td>\n",
" <td>2014-07-03T01:19:54.000Z</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>100 rows × 7 columns</p>\n",
"</div>"
],
"text/plain": [
" id color lon lat time speed \\\n",
"0 粤B0AA56 蓝的 113.961098 22.553101 2014-07-03T00:00:02.000Z 17 \n",
"1 粤B0AA56 蓝的 113.962303 22.547001 2014-07-03T00:01:48.000Z 21 \n",
"2 粤B0AA56 蓝的 113.962997 22.547001 2014-07-03T00:02:18.000Z 0 \n",
"3 粤B0AA56 蓝的 113.963799 22.5464 2014-07-03T00:02:48.000Z 11 \n",
"4 粤B0AA56 蓝的 113.963997 22.544399 2014-07-03T00:03:52.000Z 8 \n",
".. ... ... ... ... ... ... \n",
"95 粤B0AA56 蓝的 114.040703 22.518101 2014-07-03T01:16:24.000Z 0 \n",
"96 粤B0AA56 蓝的 114.040703 22.518101 2014-07-03T01:16:54.000Z 0 \n",
"97 粤B0AA56 蓝的 114.040703 22.518101 2014-07-03T01:17:24.000Z 0 \n",
"98 粤B0AA56 蓝的 114.040703 22.518101 2014-07-03T01:17:54.000Z 0 \n",
"99 粤B0AA56 蓝的 114.040703 22.518101 2014-07-03T01:19:54.000Z 0 \n",
"\n",
" noMeaning \n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
".. ... \n",
"95 0 \n",
"96 0 \n",
"97 0 \n",
"98 0 \n",
"99 0 \n",
"\n",
"[100 rows x 7 columns]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gpsdf"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# 3. We need to write a formatter for the time!\n",
"import time\n",
"egstr = \"2014-07-08T00:18:26.000Z\"\n",
"tstruct = time.strptime(egstr, \"%Y-%m-%dT%H:%M:%S.000Z\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"formattime = lambda timestr: int(time.mktime(time.strptime(timestr, \"%Y-%m-%dT%H:%M:%S.000Z\")))\n",
"gpsdf[\"time\"] = gpsdf[\"time\"].apply(formattime)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"oddf[\"ptime\"] = oddf[\"ptime\"].apply(formattime)\n",
"oddf[\"dtime\"] = oddf[\"dtime\"].apply(formattime)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"oddf.to_pickle()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 4. Now to bring it all together! See scr01_process-csvs-into-pickles.py"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}