# MachineBall/parse.py (master branch) - DavidWAbrahams/MachineBall on GitHub
from stats_tracker import StatsTracker
from game import Game

import argparse
from collections import defaultdict
from collections import OrderedDict
import glob
import numpy as np
import os
import pickle
import os

# Command-line interface. Arguments are parsed once at module level and the
# resulting `args` namespace is read directly by the functions below.
parser = argparse.ArgumentParser()

parser.add_argument('--parsed_data_prefix', action='store', default='.\\out', dest='parsed_data_prefix',
                    help='Output path for training sample pickles')
parser.add_argument('--data_path', action='store', default='.\\data\\', dest='data_path',
                    help='Input data dir to parse')
parser.add_argument('--roster_style', action='store', default='participants', dest='roster_style', choices=['starters', 'participants', 'full', 'last'],
                    help='How to populate the roster of each team.')
parser.add_argument('--f', action='store_true', default=False, dest='force',
                    help='Force overwrite of existing data.')
parser.add_argument('--max_pickle_len', action='store', default=50000, dest='max_pickle_len',
                    help='Max entries per pickle. May result in multiple pickles.', type=int)
# NOTE: this help text used to be a copy of the --max_pickle_len help. The
# flag is only forwarded to the Game constructor; see Game for its exact effect.
parser.add_argument('--float_precision', action='store_true', default=False, dest='float_precision',
                    help='Flag forwarded to Game(float_precision=...) when parsing games.')

args = parser.parse_args()

def season_ongoing(season_event_file_lines):
  """Return True while at least one team still has an unparsed game.

  season_event_file_lines holds one entry per event file; an entry counts as
  "still has a game" when Game.peakNextDate returns a truthy value for it.
  """
  return any(Game.peakNextDate(team_lines) for team_lines in season_event_file_lines)

def data_from_roster_files(data_path=None):
  """Read the annual lineup for every team, found in .ROS data files.

  Every immediate sub-directory of the data directory is searched for files
  matching '*.ROS*'. The file's base name (extension removed) is split into a
  team code and a trailing four-character year, e.g. 'ANA2000' gives team
  'ANA' and year '2000'.

  Args:
    data_path: Directory holding one sub-directory per season. When omitted,
      the --data_path command-line value (args.data_path) is used.

  Returns:
    OrderedDict of year -> {team -> OrderedDict of player_id ->
    {'batting_hand': ..., 'throwing_hand': ...}}. The player id is the first
    comma-separated field of a roster line and the hands are the fourth and
    fifth fields.

  Raises:
    AssertionError: a roster file name does not end in a year starting with
      '19' or '20'.
    IndexError: a non-blank roster line has fewer than five fields.
  """
  if data_path is None:
    data_path = args.data_path

  year_dirs = sorted(entry.path for entry in os.scandir(data_path) if entry.is_dir())
  rosters = OrderedDict() # year: team: {player_id: {hands}}

  for year_dir in year_dirs:
    print('Processesing season {} rosters'.format(year_dir))

    for filename in glob.glob(os.path.join(year_dir, '*.ROS*')):
      roster_name = os.path.splitext(os.path.basename(filename))[0]
      year = roster_name[-4:]
      assert len(year) == 4
      assert year[:2] in ['19', '20']
      team = roster_name[0:-4]
      team_roster = rosters.setdefault(year, {}).setdefault(team, OrderedDict())

      with open(filename, 'r') as f:
        for line in f:
          line = line.rstrip()
          if not line:
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # player and would otherwise fail the field lookups below.
            continue
          player_parts = line.split(',')
          team_roster[player_parts[0]] = {
              'batting_hand': player_parts[3],
              'throwing_hand': player_parts[4],
          }

  return rosters

def data_from_game_files():
  """Read all games from the data files and turn them into training samples.

  For every season sub-directory of args.data_path (processed in sorted
  order), all '*.EV*' event files are loaded into memory. Games are then
  parsed one at a time, always taking the event file whose
  Game.peakNextDate value is smallest, so that the shared StatsTracker and
  the 'last game' rosters are updated in date order.

  Returns:
    A (samples, labels, game_ids) tuple of equally long lists. Only games for
    which Game.is_good_sample() is true are included; each label is
    [visitor_label, home_label] as returned by Game.to_sample().
  """
  year_dirs = [f.path for f in os.scandir(args.data_path) if f.is_dir()]
  year_dirs.sort()
  print('Years: {}'.format(year_dirs))

  games = []
  stats = StatsTracker()

  full_rosters = data_from_roster_files()
  last_game_rosters = defaultdict(dict)

  for year_dir in year_dirs:
    print('Processesing season {}'.format(year_dir))

    initial_num_games = len(games)
    season_event_file_lines = []

    # read all games from this season to RAM
    for filename in glob.glob(os.path.join(year_dir, '*.EV*')):
      with open(filename, 'r') as f:
        season_event_file_lines.append([line.rstrip() for line in f])

    # Parse the season's games in chronological order
    # TODO: this is pretty inefficient
    while season_ongoing(season_event_file_lines):
      # dtype=float replaces the np.float alias, which was removed from NumPy.
      # Files without a remaining game are expected to yield a value that
      # converts to NaN (e.g. None) -- TODO confirm against Game.peakNextDate.
      next_game_dates = np.array(
          [Game.peakNextDate(team_lines) for team_lines in season_event_file_lines],
          dtype=float)

      next_game_team_idx = np.nanargmin(next_game_dates)
      # NaN never compares equal to anything, so this must be np.isnan rather
      # than `== np.NaN`. Nothing would change between iterations if no date
      # were available, so stop instead of looping forever.
      if np.isnan(next_game_dates[next_game_team_idx]):
        break

      next_game_lines = season_event_file_lines[next_game_team_idx]
      # pass lines to game gobbler
      # NOTE(review): gobble presumably consumes the parsed game's lines from
      # next_game_lines; the loop relies on that to make progress -- confirm.
      new_game = Game(float_precision=args.float_precision)
      new_game.gobble(next_game_lines, stats, roster_style=args.roster_style, full_rosters=full_rosters, last_game_rosters=last_game_rosters)
      games.append(new_game)
      #print('Finished parsing game {} with score {}'.format(new_game.id, new_game.score))
      # track players for each team for the 'last' roster strategy
      _, visitor_team, visitor_ids, home_team, home_ids = new_game.participant_ids()
      last_game_rosters[visitor_team] = visitor_ids
      last_game_rosters[home_team] = home_ids

    num_games = len(games)
    print('Parsed {} more games ({} total)'.format(num_games-initial_num_games, num_games))

  print('')
  print('***Done parsing game events***')
  print('Total games parsed: {}'.format(len(games)))

  samples = []
  labels = []
  game_ids = []
  starters_only = args.roster_style == 'starters'
  for game in games:
    if game.is_good_sample():
      sample, visitor_label, home_label = game.to_sample(starters_only=starters_only)
      samples.append(sample)
      labels.append([visitor_label, home_label])
      game_ids.append(game.id)

  print('Purged {} out of {} games due to sparse player stats.'.format(len(games)-len(samples), len(games)))

  # Only show an example when one exists; with no usable games the caller
  # decides how to report the empty result instead of hitting an IndexError.
  if samples:
    print('Example player stats:')
    print(samples[-1][-1])

  return samples, labels, game_ids

def main():
  """Parse the raw game files and save the training data as pickles.

  Output is written in chunks of at most args.max_pickle_len entries to
  '<prefix>_labels_<i>.p', '<prefix>_samples_<i>.p' and
  '<prefix>_gameids_<i>.p', where <prefix> is args.parsed_data_prefix.
  If chunk 0 of the samples or labels already exists and --f was not given,
  nothing is parsed and an error message is printed instead.
  """
  prefix = args.parsed_data_prefix
  # The parser defines no sample_path/label_path options, so detect earlier
  # output through the chunk-0 files, which every successful run writes.
  already_parsed = (os.path.isfile(prefix + '_samples_0.p') or
                    os.path.isfile(prefix + '_labels_0.p'))

  if not args.force and already_parsed:
    print('ERROR: Parsed game data already exists. Please use --f if you are intentionally recreating it.')
    return

  print('No saved training data found. Generating from raw game files.')
  samples, labels, game_ids = data_from_game_files()
  assert samples
  assert labels
  assert len(samples) == len(labels), '{} vs {}'.format(len(samples), len(labels))
  print('Generated {} training samples'.format(len(samples)))

  # save for later model training.
  # Save data in chunks to avoid OOM errors during pickling. Stepping through
  # the data by max_pickle_len avoids an empty trailing chunk when the number
  # of entries is an exact multiple of the chunk size.
  outputs = (('labels', labels), ('samples', samples), ('gameids', game_ids))
  for i, start in enumerate(range(0, len(labels), args.max_pickle_len)):
    end = start + args.max_pickle_len
    for kind, data in outputs:
      path = '{}_{}_{}.p'.format(prefix, kind, i)
      # `with` closes each file even if pickling fails part-way.
      with open(path, 'wb') as f:
        pickle.dump(data[start:end], f)

# Script entry point: run the parser only when this file is executed directly.
if __name__ == '__main__':
  main()