-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprocess_data.py
More file actions
executable file
·95 lines (76 loc) · 3.71 KB
/
process_data.py
File metadata and controls
executable file
·95 lines (76 loc) · 3.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/usr/bin/env python3
import pandas as pd
import re
import sys
import argparse
import os
import functools
def time_to_parts(time):
    """Convert a colon-separated ``HH:MM:SS`` string into a number of seconds.

    The hour field is not limited to 0-23, so a value such as ``"25:10:05"``
    is converted as-is without wrapping around midnight.

    Args:
        time: String whose first three ``:``-separated fields are the integer
            hours, minutes and seconds. Any further fields are parsed as
            integers but do not contribute to the result.

    Returns:
        int: ``hours * 3600 + minutes * 60 + seconds``.

    Raises:
        ValueError: If a field is not an integer literal.
        IndexError: If fewer than three fields are present.
    """
    fields = list(map(int, time.split(":")))
    # Horner form of h * 3600 + m * 60 + s.
    return (fields[0] * 60 + fields[1]) * 60 + fields[2]
def compute_diff(t1, t2):
    """Return how many seconds ``t2`` lies after ``t1``.

    Both arguments are converted with ``time_to_parts``. The result is
    negative when ``t2`` is the earlier of the two.

    Args:
        t1: Start time as an ``HH:MM:SS`` string.
        t2: End time as an ``HH:MM:SS`` string.

    Returns:
        int: ``t2 - t1`` expressed in seconds.
    """
    start_seconds, end_seconds = (time_to_parts(stamp) for stamp in (t1, t2))
    return end_seconds - start_seconds
def process_name(st_name):
    """Strip a trailing `` - A``, `` - B`` or `` - C`` marker from a stop name.

    The pattern is anchored to the end of the string. Without the anchor the
    same three characters were also removed from the middle of a name, e.g.
    ``"Praha - Bubny"`` became ``"Prahaubny"``.

    Args:
        st_name: Stop name as a string.

    Returns:
        str: The name without the trailing marker; names that do not end with
        such a marker are returned unchanged.
    """
    # NOTE(review): the marker presumably identifies a metro line - confirm
    # against the feed's stops.txt.
    return re.sub(r" - [ABC]$", "", st_name)
def compute_all_diferences(arrival_times, departure_times):
    """Compute the element-wise time differences of two sequences.

    Items are paired positionally and pairing stops as soon as the shorter
    input is exhausted.

    Args:
        arrival_times: Iterable of ``HH:MM:SS`` strings used as start times.
        departure_times: Iterable of ``HH:MM:SS`` strings used as end times.

    Returns:
        list: ``compute_diff(first, second)`` for every pair, in order.
    """
    return list(map(compute_diff, arrival_times, departure_times))
def _trip_section_records(trip):
    """Return one ``{"section", "time"}`` record per consecutive stop pair of a trip."""
    ordered = trip.sort_values(by="stop_sequence")
    names = ordered.stop_name.apply(process_name)
    arrivals = ordered.arrival_time
    # iloc[1:] lines every stop up with its successor; zip then drops the
    # last stop, which has no successor.
    labels = ["->".join(pair) for pair in zip(names, names.iloc[1:])]
    durations = compute_all_diferences(arrivals, arrivals.iloc[1:])
    return [{"section": label, "time": seconds}
            for label, seconds in zip(labels, durations)]


def create_csv(joined, output_path, suffix, stops):
    """Write the mean section times and the mean stop locations as CSV files.

    Two files are written into ``output_path``:

    * ``times<suffix>.csv`` - one row per ``"from->to"`` section with the mean
      difference, in seconds, between the arrival times at the two stops,
      plus ``departure_station`` and ``arrival_station`` columns.
    * ``locations<suffix>.csv`` - mean ``stop_lat`` / ``stop_lon`` per
      processed stop name, rounded to six decimals.

    Args:
        joined: DataFrame with at least the columns ``trip_id``,
            ``stop_sequence``, ``stop_name`` and ``arrival_time``.
        output_path: Existing directory that receives the CSV files.
        suffix: Text inserted before ``.csv`` in both file names.
        stops: DataFrame with at least the columns ``stop_name``,
            ``stop_lat`` and ``stop_lon``.
    """
    records = []
    for _, trip in joined.groupby("trip_id", sort=True):
        records.extend(_trip_section_records(trip))

    # Explicit columns and dtype keep an empty selection from failing with
    # KeyError('section'); the mean is a float either way.
    section_times = pd.DataFrame.from_records(records,
                                              columns=["section", "time"])
    section_times = section_times.astype({"time": "float64"})
    mean_times = section_times.groupby("section").mean()
    endpoints = [section.split("->") for section in mean_times.index]
    mean_times["departure_station"] = [pair[0] for pair in endpoints]
    mean_times["arrival_station"] = [pair[1] for pair in endpoints]
    mean_times.to_csv(os.path.join(output_path,
                                   "times{0}.csv".format(suffix)))

    # Only the coordinates are averaged: including the textual stop_id column
    # makes GroupBy.mean raise TypeError on pandas >= 2.0.
    locations = stops[["stop_name", "stop_lat", "stop_lon"]].copy()
    # Group by the full processed name; indexing the result with [0] kept
    # only its first character.
    locations["stop_name"] = locations.stop_name.apply(process_name)
    mean_locations = locations.groupby("stop_name")[["stop_lat",
                                                     "stop_lon"]].mean()
    mean_locations.round(6).to_csv(
        os.path.join(output_path, "locations{0}.csv".format(suffix)))
def parse_arguments(argv):
    """Parse the command-line arguments of the script.

    Args:
        argv: List of argument strings, without the program name.

    Returns:
        argparse.Namespace: Holds the two positional string arguments
        ``data_path`` and ``output_path``.
    """
    positionals = (
        ("data_path", "Path to the folder containing GTFS files"),
        ("output_path", "Path to output folder."),
    )
    cli = argparse.ArgumentParser(description='Process Prague data')
    for dest, help_text in positionals:
        cli.add_argument(dest, type=str, help=help_text)
    return cli.parse_args(argv)
def main(argv):
    """Load a GTFS feed and write night and day section/location CSV files.

    Reads ``stops.txt``, ``stop_times.txt``, ``trips.txt``, ``calendar.txt``
    and ``routes.txt`` from the data folder, joins them into one table, splits
    the rows into a night set (departure between 00:00:00 and 05:00:00) and a
    day set (everything else), and calls ``create_csv`` for each set with the
    suffixes ``_night`` and ``_day``.

    Args:
        argv: Command-line arguments without the program name; see
            ``parse_arguments``.
    """
    args = parse_arguments(argv)

    def load(file_name):
        return pd.read_csv(os.path.join(args.data_path, file_name))

    stops = load("stops.txt")[["stop_id", "stop_lat", "stop_lon", "stop_name"]]
    stop_times = load("stop_times.txt")
    trips = load("trips.txt")
    calendar = load("calendar.txt")
    routes = load("routes.txt")

    # Start from the trips and attach every other table on its key, in order.
    joined = trips
    for table, key in ((stop_times, "trip_id"), (calendar, "service_id"),
                       (stops, "stop_id"), (routes, "route_id")):
        joined = joined.merge(table, on=key)
    print("Data loaded")

    seconds_per_day = time_to_parts("24:00:00")
    night_start = time_to_parts("00:00:00")
    night_end = time_to_parts("05:00:00")
    joined["arrival_time_seconds"] = joined.arrival_time.apply(time_to_parts)
    joined["departure_time_seconds"] = joined.departure_time.apply(time_to_parts)

    # GTFS writes times after midnight of the service day as 24:00:00 or more,
    # so the window is tested on the clock time. Comparing the raw seconds put
    # a departure such as "24:30:00" into the day set.
    clock_seconds = joined.departure_time_seconds % seconds_per_day
    is_night = (clock_seconds >= night_start) & (clock_seconds < night_end)
    night_time = joined[is_night]
    day_time = joined[~is_night]

    create_csv(night_time, args.output_path, "_night", stops)
    print("Created night data.")
    create_csv(day_time, args.output_path, "_day", stops)
    print("Created day data.")
# joined = trips.merge(stop_times, on="trip_id").
# merge(calendar, on="service_id").merge(stops, on="stop_id").
# merge(routes, on="route_id")
if __name__ == "__main__":
    # Pass main() only the user-supplied arguments; sys.argv[0] is the
    # script path.
    cli_arguments = sys.argv[1:]
    main(cli_arguments)