-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathpreprocess_data.py
More file actions
executable file
·58 lines (53 loc) · 2.31 KB
/
preprocess_data.py
File metadata and controls
executable file
·58 lines (53 loc) · 2.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
from __future__ import print_function
from global_land_mask import globe
import os
import sys
sys.dont_write_bytecode=True
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import glob
import argparse
# Locations and columns used by the script.
_RAW_PATTERN = 'raw_data/*.csv'
_OUTPUT_DIR = 'processed_data/'
_INPUT_COLUMNS = ['MMSI', 'BaseDateTime', 'LAT', 'LON', 'SOG', 'Heading']

# Heading value that gets overwritten with the nearest other reading.
# NOTE(review): presumably the AIS "heading not available" code -- confirm
# against the data dictionary of the raw files.
_HEADING_PLACEHOLDER = 511


def _output_name(filename):
    """Return the output file name for one raw CSV path.

    The name is whatever follows the first "Zone" in the path, e.g.
    "raw_data/AIS_2017_01_Zone10.csv" -> "10.csv".  A path without "Zone"
    falls back to its base name instead of raising IndexError.
    """
    pieces = filename.split("Zone")
    if len(pieces) > 1:
        return pieces[1]
    return os.path.basename(filename)


def _resample_track(rows):
    """Put the rows of a single vessel on a regular one-minute grid.

    Timestamps are rounded up to the minute, only the first row per minute is
    kept, and gaps are filled by linear interpolation of every column for at
    most 5 consecutive minutes.  Rows that still have no LAT/LON afterwards
    are dropped.  The result is indexed by 'BaseDateTime'.
    """
    # Work on a copy so the column assignments never touch the caller's frame.
    track = rows.copy()
    track['BaseDateTime'] = pd.to_datetime(track['BaseDateTime'], format="%Y-%m-%dT%H:%M:%S")
    track['BaseDateTime'] = track['BaseDateTime'].dt.ceil('min')
    track = track.loc[~track['BaseDateTime'].duplicated(keep='first')]
    # NOTE(review): Heading is interpolated here too, before placeholder
    # values are filled, so a resampled gap next to a 511 gets a blended
    # value.  Behaviour kept as-is.
    track = track.set_index('BaseDateTime').resample('1min').interpolate(limit=5)
    return track.dropna(subset=['LAT', 'LON'])


def _fill_placeholder_heading(heading):
    """Replace placeholder headings with the previous (else next) real value.

    `heading` is an int32 Series.  It is returned unchanged when it holds no
    placeholder, or nothing but placeholders (there is nothing to fill from).
    """
    missing = heading == _HEADING_PLACEHOLDER
    if not missing.any() or missing.all():
        return heading
    # mask() turns placeholders into NaN (upcasting to float); after filling
    # forwards and then backwards no NaN is left, so the cast back is safe.
    return heading.mask(missing).ffill().bfill().astype('int32')


def _process_file(filename):
    """Clean every vessel track in one raw CSV and return the merged frame.

    Vessels whose Heading cannot be cast to int32 are reported and skipped.
    The result is indexed by 'BaseDateTime' and sorted by it.
    """
    df = pd.read_csv(filename, header=0, parse_dates=['BaseDateTime'], usecols=_INPUT_COLUMNS)
    # Vessel order is taken before sorting, i.e. first appearance in the file.
    vessels = df['MMSI'].unique()
    df.sort_values(['BaseDateTime'], inplace=True)

    tracks = []
    for v, vessel in enumerate(vessels):
        print("mmsi: ", vessel, " ", int(v + 1), "/", len(vessels))
        track = _resample_track(df.loc[df['MMSI'] == vessel])
        try:
            # Raises ValueError when NaN headings survived the interpolation.
            heading = track['Heading'].astype('int32')
        except ValueError:
            print("invalid heading values found")
            continue
        # NOTE(review): vessels whose heading never changes are dropped
        # entirely.  This gating was reconstructed from a copy of the script
        # whose indentation was damaged -- confirm it is intended.
        if heading.nunique() == 1:
            continue
        tracks.append(track.assign(Heading=_fill_placeholder_heading(heading)))

    # One concat instead of growing a frame per vessel (quadratic copying).
    out_frame = pd.concat(tracks) if tracks else pd.DataFrame()
    out_frame.index.name = 'BaseDateTime'
    if tracks:
        out_frame.sort_values(['BaseDateTime'], inplace=True)
    return out_frame


def main(argv=None):
    """Preprocess every CSV in raw_data/ into processed_data/.

    Args:
        argv: command-line arguments without the program name; None means
            sys.argv[1:].  The only option is --zone N, which restricts the
            run to files whose path contains "Zone" followed by the
            two-digit zone number.
    """
    os.makedirs(_OUTPUT_DIR, exist_ok=True)
    files = sorted(glob.glob(_RAW_PATTERN))
    print(files)

    parser = argparse.ArgumentParser()
    parser.add_argument("--zone", type=int, help="zone")
    args = parser.parse_args(argv)
    zone_tag = 'Zone%02d' % (args.zone) if args.zone else None

    for filename in files:
        if zone_tag and zone_tag not in filename:
            continue
        print("processing data from ", str(filename))
        out_file = _output_name(filename)
        out_frame = _process_file(filename)
        print("Saving processed data to processed_data/" + out_file)
        out_frame.to_csv(_OUTPUT_DIR + out_file, index=True)
        print("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")


if __name__ == "__main__":
    main()