From f47642702cdb3ee68f5579ccc08a8085c2819532 Mon Sep 17 00:00:00 2001 From: "Restyled.io" Date: Thu, 19 Mar 2020 20:18:10 +0000 Subject: [PATCH 1/4] Restyled by autopep8 --- consts.py | 76 +++---- network.py | 341 +++++++++++++++---------------- policy_training.py | 487 +++++++++++++++++++++++---------------------- util.py | 121 +++++------ 4 files changed, 518 insertions(+), 507 deletions(-) diff --git a/consts.py b/consts.py index b0eea0e..976de0a 100644 --- a/consts.py +++ b/consts.py @@ -22,37 +22,37 @@ FOURS = [] # Horizontal for row in range(HEIGHT): - for column in range(WIDTH - 3): - four = np.zeros([HEIGHT, WIDTH], bool) - for i in range(4): - four[row, column + i] = True - FOURS.append(four) + for column in range(WIDTH - 3): + four = np.zeros([HEIGHT, WIDTH], bool) + for i in range(4): + four[row, column + i] = True + FOURS.append(four) # Vertical for row in range(HEIGHT - 3): - for column in range(WIDTH): - four = np.zeros([HEIGHT, WIDTH], bool) - for i in range(4): - four[row + i, column] = True - FOURS.append(four) + for column in range(WIDTH): + four = np.zeros([HEIGHT, WIDTH], bool) + for i in range(4): + four[row + i, column] = True + FOURS.append(four) # Diagonal for row in range(HEIGHT - 3): - for column in range(WIDTH - 3): - four1 = np.zeros([HEIGHT, WIDTH], bool) - four2 = np.zeros([HEIGHT, WIDTH], bool) - for i in range(4): - four1[row + i, column + i] = True - four2[row + 3 - i, column + i] = True - FOURS.append(four1) - FOURS.append(four2) + for column in range(WIDTH - 3): + four1 = np.zeros([HEIGHT, WIDTH], bool) + four2 = np.zeros([HEIGHT, WIDTH], bool) + for i in range(4): + four1[row + i, column + i] = True + four2[row + 3 - i, column + i] = True + FOURS.append(four1) + FOURS.append(four2) FOURS = np.array(FOURS) DISK_FOURS = {} DISK_FOUR_COUNTS = np.zeros([HEIGHT, WIDTH], int) for row in range(HEIGHT): - for column in range(WIDTH): - disk_fours = [four for four in FOURS if four[row, column]] - DISK_FOURS[row, column] = disk_fours - DISK_FOUR_COUNTS[row, column] = len(disk_fours) + for column in range(WIDTH): + disk_fours = [four for four in FOURS if four[row, column]] + DISK_FOURS[row, column] = disk_fours + DISK_FOUR_COUNTS[row, column] = len(disk_fours) # Results RED_WIN = 1 @@ -68,21 +68,21 @@ NEW_POSITION_HASH = np.uint64(0) DISK_HASHES = np.zeros([COLOURS, HEIGHT, WIDTH], np.uint64) for colour in range(COLOURS): - for row in range(HEIGHT): - disks_in_column = row ^ (row + 1) - yellow_disks = 2**(row + 3) if colour == YELLOW else 0 - row_hash = disks_in_column | yellow_disks - for column in range(WIDTH): - row_column_hash = row_hash << (9 * column) - DISK_HASHES[colour, HEIGHT - row - 1, column] = row_column_hash + for row in range(HEIGHT): + disks_in_column = row ^ (row + 1) + yellow_disks = 2**(row + 3) if colour == YELLOW else 0 + row_hash = disks_in_column | yellow_disks + for column in range(WIDTH): + row_column_hash = row_hash << (9 * column) + DISK_HASHES[colour, HEIGHT - row - 1, column] = row_column_hash if __name__ == '__main__': - print(FOURS[0]) - print(DISK_FOURS[0, 0]) - print(DISK_FOUR_COUNTS) - print(TILED_COLUMNS.reshape([HEIGHT, WIDTH])) - print(TILED_ROWS.reshape([HEIGHT, WIDTH])) - print(ROW_EDGE_DISTANCE.reshape([HEIGHT, WIDTH])) - print(COLUMN_EDGE_DISTANCE.reshape([HEIGHT, WIDTH])) - print(ODDS.reshape([HEIGHT, WIDTH])) - print(np.array(map(bin, DISK_HASHES.flatten())).reshape(DISK_HASHES.shape)) + print(FOURS[0]) + print(DISK_FOURS[0, 0]) + print(DISK_FOUR_COUNTS) + print(TILED_COLUMNS.reshape([HEIGHT, WIDTH])) + print(TILED_ROWS.reshape([HEIGHT, WIDTH])) + print(ROW_EDGE_DISTANCE.reshape([HEIGHT, WIDTH])) + print(COLUMN_EDGE_DISTANCE.reshape([HEIGHT, WIDTH])) + print(ODDS.reshape([HEIGHT, WIDTH])) + print(np.array(map(bin, DISK_HASHES.flatten())).reshape(DISK_HASHES.shape)) diff --git a/network.py b/network.py index 4fa6514..bc653ae 100644 --- a/network.py +++ b/network.py @@ -5,178 +5,181 @@ class BaseNetwork(object): - def __init__(self, scope, use_symmetry): - self.scope = scope - - with tf.name_scope('inputs'): - self.turn = tf.placeholder(tf.float32, shape=[None], name='turn') - tiled_turn = tf.tile( - tf.reshape(util.turn_win(self.turn), [-1, 1, 1, 1]), - [1, 2, HEIGHT, WIDTH]) - - self.disks = tf.placeholder( - tf.float32, shape=[None, 2, HEIGHT, WIDTH], name='disks') - - self.empty = tf.placeholder( - tf.float32, shape=[None, HEIGHT, WIDTH], name='empty') - empty = tf.expand_dims(self.empty, axis=1) - - self.legal_moves = tf.placeholder( - tf.float32, shape=[None, HEIGHT, WIDTH], name='legal_moves') - legal_moves = tf.expand_dims(self.legal_moves, axis=1) - - self.threats = tf.placeholder( - tf.float32, shape=[None, 2, HEIGHT, WIDTH], name='threats') - - constant_features = np.array( - [TILED_ROWS, ODDS, ROW_EDGE_DISTANCE, COLUMN_EDGE_DISTANCE], - dtype=np.float32).reshape([1, 4, HEIGHT, WIDTH]) - batch_size = tf.shape(self.turn)[0] - tiled_constant_features = tf.tile(constant_features, - [batch_size, 1, 1, 1]) - - feature_planes = tf.concat( - [ - tiled_turn, self.disks, empty, legal_moves, self.threats, - tiled_constant_features - ], - axis=1) - - if use_symmetry: - # Interleave horizontally flipped position - feature_planes_shape = [-1] + feature_planes.shape.as_list()[1:] - flipped = tf.reverse(feature_planes, axis=[3]) - feature_planes = tf.reshape( - tf.stack([feature_planes, flipped], axis=1), feature_planes_shape) - - with tf.name_scope('conv_layers'): - if self.gpu_available(): - data_format = 'channels_first' - else: - feature_planes = tf.transpose(feature_planes, [0, 2, 3, 1]) - data_format = 'channels_last' - - conv1 = tf.layers.conv2d( - feature_planes, - filters=32, - kernel_size=[4, 5], - padding='same', - data_format=data_format, - use_bias=False, - name='conv1') - - conv2 = tf.layers.conv2d( - conv1, - filters=32, - kernel_size=[4, 5], - padding='same', - data_format=data_format, - activation=tf.nn.relu, - name='conv2') - - conv3 = tf.layers.conv2d( - conv2, - filters=32, - kernel_size=[4, 5], - padding='same', - data_format=data_format, - activation=tf.nn.relu, - name='conv3') - - final_conv = tf.layers.conv2d( - conv3, - filters=1, - kernel_size=[1, 1], - data_format=data_format, - name='final_conv') - disk_bias = tf.get_variable('disk_bias', shape=[TOTAL_DISKS]) - self.conv_output = tf.add( - tf.contrib.layers.flatten(final_conv), disk_bias, name='conv_output') - - self.conv_layers = [conv1, conv2, conv3, self.conv_output] - - def gpu_available(self): - devices = device_lib.list_local_devices() - return len([d for d in devices if d.device_type == 'GPU']) > 0 - - @property - def variables(self): - # Add '/' to stop network-1 containing network-10 variables - return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, - self.scope + '/') - - def assign(self, other): - return [ - tf.assign(other_var, self_var) - for self_var, other_var in zip(self.variables, other.variables) - ] + def __init__(self, scope, use_symmetry): + self.scope = scope + + with tf.name_scope('inputs'): + self.turn = tf.placeholder(tf.float32, shape=[None], name='turn') + tiled_turn = tf.tile( + tf.reshape(util.turn_win(self.turn), [-1, 1, 1, 1]), + [1, 2, HEIGHT, WIDTH]) + + self.disks = tf.placeholder( + tf.float32, shape=[None, 2, HEIGHT, WIDTH], name='disks') + + self.empty = tf.placeholder( + tf.float32, shape=[None, HEIGHT, WIDTH], name='empty') + empty = tf.expand_dims(self.empty, axis=1) + + self.legal_moves = tf.placeholder( + tf.float32, shape=[None, HEIGHT, WIDTH], name='legal_moves') + legal_moves = tf.expand_dims(self.legal_moves, axis=1) + + self.threats = tf.placeholder( + tf.float32, shape=[None, 2, HEIGHT, WIDTH], name='threats') + + constant_features = np.array( + [TILED_ROWS, ODDS, ROW_EDGE_DISTANCE, COLUMN_EDGE_DISTANCE], + dtype=np.float32).reshape([1, 4, HEIGHT, WIDTH]) + batch_size = tf.shape(self.turn)[0] + tiled_constant_features = tf.tile(constant_features, + [batch_size, 1, 1, 1]) + + feature_planes = tf.concat( + [ + tiled_turn, self.disks, empty, legal_moves, self.threats, + tiled_constant_features + ], + axis=1) + + if use_symmetry: + # Interleave horizontally flipped position + feature_planes_shape = [-1] + \ + feature_planes.shape.as_list()[1:] + flipped = tf.reverse(feature_planes, axis=[3]) + feature_planes = tf.reshape( + tf.stack([feature_planes, flipped], axis=1), feature_planes_shape) + + with tf.name_scope('conv_layers'): + if self.gpu_available(): + data_format = 'channels_first' + else: + feature_planes = tf.transpose(feature_planes, [0, 2, 3, 1]) + data_format = 'channels_last' + + conv1 = tf.layers.conv2d( + feature_planes, + filters=32, + kernel_size=[4, 5], + padding='same', + data_format=data_format, + use_bias=False, + name='conv1') + + conv2 = tf.layers.conv2d( + conv1, + filters=32, + kernel_size=[4, 5], + padding='same', + data_format=data_format, + activation=tf.nn.relu, + name='conv2') + + conv3 = tf.layers.conv2d( + conv2, + filters=32, + kernel_size=[4, 5], + padding='same', + data_format=data_format, + activation=tf.nn.relu, + name='conv3') + + final_conv = tf.layers.conv2d( + conv3, + filters=1, + kernel_size=[1, 1], + data_format=data_format, + name='final_conv') + disk_bias = tf.get_variable('disk_bias', shape=[TOTAL_DISKS]) + self.conv_output = tf.add( + tf.contrib.layers.flatten(final_conv), disk_bias, name='conv_output') + + self.conv_layers = [conv1, conv2, conv3, self.conv_output] + + def gpu_available(self): + devices = device_lib.list_local_devices() + return len([d for d in devices if d.device_type == 'GPU']) > 0 + + @property + def variables(self): + # Add '/' to stop network-1 containing network-10 variables + return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, + self.scope + '/') + + def assign(self, other): + return [ + tf.assign(other_var, self_var) + for self_var, other_var in zip(self.variables, other.variables) + ] class PolicyNetwork(BaseNetwork): - def __init__(self, scope, temperature=1.0, reuse=None, use_symmetry=False): - with tf.variable_scope(scope, reuse=reuse): - super(PolicyNetwork, self).__init__(scope, use_symmetry) - - with tf.name_scope('policy'): - self.temperature = tf.placeholder_with_default( - temperature, (), name='temperature') - - disk_logits = tf.divide( - self.conv_output, self.temperature, name='disk_logits') - - if use_symmetry: - # Calculate average of actual and horizontally flipped position - normal, flipped = tf.split( - tf.reshape(disk_logits, [-1, 2, HEIGHT, WIDTH]), - num_or_size_splits=2, - axis=1) - disk_logits = tf.reshape( - tf.reduce_mean( - tf.concat([normal, tf.reverse(flipped, axis=[3])], axis=1), - axis=1), [-1, TOTAL_DISKS]) - - # Make illegal moves impossible: - # - Legal moves have positive logits - # - Illegal moves have -ILLEGAL_PENALTY logits - legal_moves = tf.contrib.layers.flatten(self.legal_moves) - legal_disk_logits = (tf.nn.relu(disk_logits) * legal_moves + - (legal_moves - 1) * ILLEGAL_PENALTY) - - self.policy = tf.nn.softmax(legal_disk_logits, name='policy') - self.sample_move = tf.squeeze( - tf.multinomial(legal_disk_logits, 1) % WIDTH, - axis=1, - name='sample_move') - - self.entropy = tf.reduce_sum( - self.policy * -tf.log(self.policy + EPSILON), # Avoid Nans - axis=1, - name='entropy') - - self.policy_layers = self.conv_layers + [ - disk_logits, self.policy, self.entropy - ] + def __init__(self, scope, temperature=1.0, reuse=None, use_symmetry=False): + with tf.variable_scope(scope, reuse=reuse): + super(PolicyNetwork, self).__init__(scope, use_symmetry) + + with tf.name_scope('policy'): + self.temperature = tf.placeholder_with_default( + temperature, (), name='temperature') + + disk_logits = tf.divide( + self.conv_output, self.temperature, name='disk_logits') + + if use_symmetry: + # Calculate average of actual and horizontally flipped position + normal, flipped = tf.split( + tf.reshape(disk_logits, [-1, 2, HEIGHT, WIDTH]), + num_or_size_splits=2, + axis=1) + disk_logits = tf.reshape( + tf.reduce_mean( + tf.concat( + [normal, tf.reverse(flipped, axis=[3])], axis=1), + axis=1), [-1, TOTAL_DISKS]) + + # Make illegal moves impossible: + # - Legal moves have positive logits + # - Illegal moves have -ILLEGAL_PENALTY logits + legal_moves = tf.contrib.layers.flatten(self.legal_moves) + legal_disk_logits = (tf.nn.relu(disk_logits) * legal_moves + + (legal_moves - 1) * ILLEGAL_PENALTY) + + self.policy = tf.nn.softmax(legal_disk_logits, name='policy') + self.sample_move = tf.squeeze( + tf.multinomial(legal_disk_logits, 1) % WIDTH, + axis=1, + name='sample_move') + + self.entropy = tf.reduce_sum( + self.policy * -tf.log(self.policy + EPSILON), # Avoid Nans + axis=1, + name='entropy') + + self.policy_layers = self.conv_layers + [ + disk_logits, self.policy, self.entropy + ] class ValueNetwork(BaseNetwork): - def __init__(self, scope, use_symmetry=False): - with tf.variable_scope(scope): - super(ValueNetwork, self).__init__(scope, use_symmetry) - - with tf.name_scope('value'): - fully_connected = tf.layers.dense( - self.conv_output, - units=64, - activation=tf.nn.relu, - name='fully_connected') - - value = tf.layers.dense(fully_connected, 1, tf.tanh) - - if use_symmetry: - # Calculate average of actual and horizontally flipped position - self.value = tf.reduce_mean( - tf.reshape(value, [-1, 2]), axis=1, name='value') - else: - self.value = tf.squeeze(value, axis=1, name='value') - - self.value_layers = self.conv_layers + [fully_connected, self.value] + def __init__(self, scope, use_symmetry=False): + with tf.variable_scope(scope): + super(ValueNetwork, self).__init__(scope, use_symmetry) + + with tf.name_scope('value'): + fully_connected = tf.layers.dense( + self.conv_output, + units=64, + activation=tf.nn.relu, + name='fully_connected') + + value = tf.layers.dense(fully_connected, 1, tf.tanh) + + if use_symmetry: + # Calculate average of actual and horizontally flipped position + self.value = tf.reduce_mean( + tf.reshape(value, [-1, 2]), axis=1, name='value') + else: + self.value = tf.squeeze(value, axis=1, name='value') + + self.value_layers = self.conv_layers + \ + [fully_connected, self.value] diff --git a/policy_training.py b/policy_training.py index b8e74fc..f8f3a1c 100644 --- a/policy_training.py +++ b/policy_training.py @@ -17,258 +17,263 @@ class PolicyTraining(object): - def __init__(self, config): - self.config = config - self.run_dir = util.run_directory(config) - - self.session = tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions( - allow_growth=True))) - - self.policy_network = PolicyNetwork('policy') - self.policy_player = PolicyPlayer(self.policy_network, self.session) - util.restore_or_initialize_network(self.session, self.run_dir, - self.policy_network) - - # Train ops - self.create_train_op(self.policy_network) - self.writer = tf.summary.FileWriter(self.run_dir) - util.restore_or_initialize_scope(self.session, self.run_dir, - self.training_scope.name) - - self.opponents = Opponents( - [RandomPlayer(), - RandomThreatPlayer(), - MaxThreatPlayer()]) - self.opponents.restore_networks(self.session, self.run_dir) - - def create_train_op(self, policy_network): - with tf.variable_scope('policy_training') as self.training_scope: - self.move = tf.placeholder(tf.int32, shape=[None], name='move') - self.result = tf.placeholder(tf.float32, shape=[None], name='result') - - policy = tf.reshape(policy_network.policy, [-1, HEIGHT, WIDTH]) - move = tf.expand_dims(tf.one_hot(self.move, WIDTH), axis=1) - turn = util.turn_win(policy_network.turn) - move_probability = tf.reduce_sum(policy * move, axis=[1, 2]) - - result_loss = -tf.reduce_mean( - tf.log(move_probability) * turn * self.result) - entropy_regularisation = ( - -config.entropy * tf.reduce_mean(policy_network.entropy)) - loss = result_loss + entropy_regularisation - - optimizer = tf.train.AdamOptimizer(self.config.learning_rate) - self.global_step = tf.contrib.framework.get_or_create_global_step() - self.train_op = optimizer.minimize(loss, self.global_step) - - # Summary - tf.summary.scalar('loss', loss) - for var in policy_network.variables + policy_network.policy_layers: - tf.summary.histogram(var.name, var) - self.summary = tf.summary.merge_all() - - def train(self): - for _ in range(self.config.batches): - opponent = self.opponents.choose_opponent() - games = self.play_games(opponent) - step, summary = self.train_games(opponent, games) - self.process_results(opponent, games, step, summary) - - if self.opponents.all_beaten(): - name = self.opponents.next_network_name() - print('All opponents beaten. Creating %s' % name) - self.create_new_opponent(name) - - if step % 100 == 0: + def __init__(self, config): + self.config = config + self.run_dir = util.run_directory(config) + + self.session = tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions( + allow_growth=True))) + + self.policy_network = PolicyNetwork('policy') + self.policy_player = PolicyPlayer(self.policy_network, self.session) + util.restore_or_initialize_network(self.session, self.run_dir, + self.policy_network) + + # Train ops + self.create_train_op(self.policy_network) + self.writer = tf.summary.FileWriter(self.run_dir) + util.restore_or_initialize_scope(self.session, self.run_dir, + self.training_scope.name) + + self.opponents = Opponents( + [RandomPlayer(), + RandomThreatPlayer(), + MaxThreatPlayer()]) + self.opponents.restore_networks(self.session, self.run_dir) + + def create_train_op(self, policy_network): + with tf.variable_scope('policy_training') as self.training_scope: + self.move = tf.placeholder(tf.int32, shape=[None], name='move') + self.result = tf.placeholder( + tf.float32, shape=[None], name='result') + + policy = tf.reshape(policy_network.policy, [-1, HEIGHT, WIDTH]) + move = tf.expand_dims(tf.one_hot(self.move, WIDTH), axis=1) + turn = util.turn_win(policy_network.turn) + move_probability = tf.reduce_sum(policy * move, axis=[1, 2]) + + result_loss = -tf.reduce_mean( + tf.log(move_probability) * turn * self.result) + entropy_regularisation = ( + -config.entropy * tf.reduce_mean(policy_network.entropy)) + loss = result_loss + entropy_regularisation + + optimizer = tf.train.AdamOptimizer(self.config.learning_rate) + self.global_step = tf.contrib.framework.get_or_create_global_step() + self.train_op = optimizer.minimize(loss, self.global_step) + + # Summary + tf.summary.scalar('loss', loss) + for var in policy_network.variables + policy_network.policy_layers: + tf.summary.histogram(var.name, var) + self.summary = tf.summary.merge_all() + + def train(self): + for _ in range(self.config.batches): + opponent = self.opponents.choose_opponent() + games = self.play_games(opponent) + step, summary = self.train_games(opponent, games) + self.process_results(opponent, games, step, summary) + + if self.opponents.all_beaten(): + name = self.opponents.next_network_name() + print('All opponents beaten. Creating %s' % name) + self.create_new_opponent(name) + + if step % 100 == 0: + self.save() + self.save() - self.save() - - def save(self): - util.save_network(self.session, self.run_dir, self.policy_network) - util.save_scope(self.session, self.run_dir, self.training_scope.name) - self.opponents.save_opponent_stats(self.run_dir) - - def play_games(self, opponent): - # Create games - games = incomplete_games = [Game() for _ in range(self.config.batch_size)] - - # Let opponent play first in half of the games - self.play_move(games[0:len(games) // 2], opponent) - player = self.policy_player - - while incomplete_games: - self.play_move(incomplete_games, player) - player = self.policy_player if player != self.policy_player else opponent - incomplete_games = [ - game for game in incomplete_games if not game.position.gameover() - ] - - return games - - def play_move(self, games, player): - positions = [game.position for game in games] - moves = player.play(positions) - - for game, move in zip(games, moves): - game.move(move, player == self.policy_player) - - def train_games(self, opponent, games): - turn, disks, empty, legal_moves, threats, moves, results = ([], [], [], [], - [], [], []) - for game in games: - for position, move in game.policy_player_moves: - turn.append(position.turn) - disks.append(position.disks) - empty.append(position.empty) - legal_moves.append(position.legal_moves) - threats.append(position.threats) - moves.append(move) - results.append(game.result) - - _, step, summary = self.session.run( - [self.train_op, self.global_step, self.summary], { - self.policy_network.turn: turn, - self.policy_network.disks: disks, - self.policy_network.empty: empty, - self.policy_network.legal_moves: legal_moves, - self.policy_network.threats: threats, - self.move: moves, - self.result: results - }) - - return step, summary - - def process_results(self, opponent, games, step, summary): - win_rate = np.mean([game.policy_player_score for game in games]) - average_moves = sum(len(game.moves) for game in games) / self.config.batch_size - - opponent_summary = tf.Summary() - opponent_summary.value.add( - tag=self.training_scope.name + '/' + opponent.name + '/win_rate', - simple_value=win_rate) - opponent_summary.value.add( - tag=self.training_scope.name + '/' + opponent.name + '/moves', - simple_value=average_moves) - - self.writer.add_summary(summary, step) - self.writer.add_summary(opponent_summary, step) - - self.opponents.update_win_rate(opponent, win_rate) - - print('Step %d. Opponent %s, win rate %.2f <%.2f>, %.2f moves' % - (step, opponent.name, win_rate, self.opponents.win_rates[opponent], - average_moves)) - - def create_new_opponent(self, name): - # Create clone of policy_player - clone = PolicyNetwork(name) - self.session.run(self.policy_network.assign(clone)) - util.save_network(self.session, self.run_dir, clone) - new_opponent = PolicyPlayer(clone, self.session) - - self.opponents.decrease_win_rates() - self.opponents.add_opponent(new_opponent) + def save(self): + util.save_network(self.session, self.run_dir, self.policy_network) + util.save_scope(self.session, self.run_dir, self.training_scope.name) + self.opponents.save_opponent_stats(self.run_dir) + + def play_games(self, opponent): + # Create games + games = incomplete_games = [Game() + for _ in range(self.config.batch_size)] + + # Let opponent play first in half of the games + self.play_move(games[0:len(games) // 2], opponent) + player = self.policy_player + + while incomplete_games: + self.play_move(incomplete_games, player) + player = self.policy_player if player != self.policy_player else opponent + incomplete_games = [ + game for game in incomplete_games if not game.position.gameover() + ] + + return games + + def play_move(self, games, player): + positions = [game.position for game in games] + moves = player.play(positions) + + for game, move in zip(games, moves): + game.move(move, player == self.policy_player) + + def train_games(self, opponent, games): + turn, disks, empty, legal_moves, threats, moves, results = ([], [], [], [], + [], [], []) + for game in games: + for position, move in game.policy_player_moves: + turn.append(position.turn) + disks.append(position.disks) + empty.append(position.empty) + legal_moves.append(position.legal_moves) + threats.append(position.threats) + moves.append(move) + results.append(game.result) + + _, step, summary = self.session.run( + [self.train_op, self.global_step, self.summary], { + self.policy_network.turn: turn, + self.policy_network.disks: disks, + self.policy_network.empty: empty, + self.policy_network.legal_moves: legal_moves, + self.policy_network.threats: threats, + self.move: moves, + self.result: results + }) + + return step, summary + + def process_results(self, opponent, games, step, summary): + win_rate = np.mean([game.policy_player_score for game in games]) + average_moves = sum(len(game.moves) + for game in games) / self.config.batch_size + + opponent_summary = tf.Summary() + opponent_summary.value.add( + tag=self.training_scope.name + '/' + opponent.name + '/win_rate', + simple_value=win_rate) + opponent_summary.value.add( + tag=self.training_scope.name + '/' + opponent.name + '/moves', + simple_value=average_moves) + + self.writer.add_summary(summary, step) + self.writer.add_summary(opponent_summary, step) + + self.opponents.update_win_rate(opponent, win_rate) + + print('Step %d. Opponent %s, win rate %.2f <%.2f>, %.2f moves' % + (step, opponent.name, win_rate, self.opponents.win_rates[opponent], + average_moves)) + + def create_new_opponent(self, name): + # Create clone of policy_player + clone = PolicyNetwork(name) + self.session.run(self.policy_network.assign(clone)) + util.save_network(self.session, self.run_dir, clone) + new_opponent = PolicyPlayer(clone, self.session) + + self.opponents.decrease_win_rates() + self.opponents.add_opponent(new_opponent) class Opponents(object): - def __init__(self, opponents): - self.win_rates = {} - for opponent in opponents: - self.add_opponent(opponent) - - def add_opponent(self, opponent): - self.win_rates[opponent] = EPSILON - - def decrease_win_rates(self): - # Decrease win rate so tough players must be beaten again - self.win_rates = { - opponent: max(2 * win_rate - 1, EPSILON) - for opponent, win_rate in self.win_rates.items() - } - - def update_win_rate(self, opponent, win_rate): - # Win rate is a moving average - self.win_rates[opponent] = self.win_rates[opponent] * 0.9 + win_rate * 0.1 - - def all_beaten(self): - result = True - for win_rate in self.win_rates.values(): - result = result and win_rate > 0.7 - return result - - def choose_opponent(self): - # More difficult opponents are chosen more often - win_rates = np.maximum(list(self.win_rates.values()), 0.1) - probs = (1 / win_rates**2) - 1 - normalised_probs = probs / probs.sum() - return np.random.choice(list(self.win_rates.keys()), p=normalised_probs) - - def next_network_name(self): - network_opponents = len([ - opponent for opponent in self.win_rates.keys() - if type(opponent) == PolicyPlayer - ]) - return 'network-%d' % (network_opponents + 1) - - def save_opponent_stats(self, run_dir): - with open(os.path.join(run_dir, 'opponents'), 'w') as f: - f.write('\n'.join([ - opponent.name + ' ' + str(win_rate) - for opponent, win_rate in sorted( - self.win_rates.items(), key=lambda x: x[1]) - ])) - - def restore_networks(self, session, run_dir): - opponents_file = os.path.join(run_dir, 'opponents') - if os.path.exists(opponents_file): - with open(opponents_file) as f: - for line in f.readlines(): - opponent_name, win_rate_string = line.strip().split() - win_rate = float(win_rate_string) - if opponent_name[:8] == 'network-': - print('Restoring %s' % opponent_name) - network = PolicyNetwork(opponent_name) - util.restore_network_or_fail(session, run_dir, network) - opponent = PolicyPlayer(network, session) - self.win_rates[opponent] = win_rate - else: - for opponent in self.win_rates.keys(): - if opponent_name == opponent.name: - self.win_rates[opponent] = win_rate + def __init__(self, opponents): + self.win_rates = {} + for opponent in opponents: + self.add_opponent(opponent) + + def add_opponent(self, opponent): + self.win_rates[opponent] = EPSILON + + def decrease_win_rates(self): + # Decrease win rate so tough players must be beaten again + self.win_rates = { + opponent: max(2 * win_rate - 1, EPSILON) + for opponent, win_rate in self.win_rates.items() + } + + def update_win_rate(self, opponent, win_rate): + # Win rate is a moving average + self.win_rates[opponent] = self.win_rates[opponent] * \ + 0.9 + win_rate * 0.1 + + def all_beaten(self): + result = True + for win_rate in self.win_rates.values(): + result = result and win_rate > 0.7 + return result + + def choose_opponent(self): + # More difficult opponents are chosen more often + win_rates = np.maximum(list(self.win_rates.values()), 0.1) + probs = (1 / win_rates**2) - 1 + normalised_probs = probs / probs.sum() + return np.random.choice(list(self.win_rates.keys()), p=normalised_probs) + + def next_network_name(self): + network_opponents = len([ + opponent for opponent in self.win_rates.keys() + if type(opponent) == PolicyPlayer + ]) + return 'network-%d' % (network_opponents + 1) + + def save_opponent_stats(self, run_dir): + with open(os.path.join(run_dir, 'opponents'), 'w') as f: + f.write('\n'.join([ + opponent.name + ' ' + str(win_rate) + for opponent, win_rate in sorted( + self.win_rates.items(), key=lambda x: x[1]) + ])) + + def restore_networks(self, session, run_dir): + opponents_file = os.path.join(run_dir, 'opponents') + if os.path.exists(opponents_file): + with open(opponents_file) as f: + for line in f.readlines(): + opponent_name, win_rate_string = line.strip().split() + win_rate = float(win_rate_string) + if opponent_name[:8] == 'network-': + print('Restoring %s' % opponent_name) + network = PolicyNetwork(opponent_name) + util.restore_network_or_fail(session, run_dir, network) + opponent = PolicyPlayer(network, session) + self.win_rates[opponent] = win_rate + else: + for opponent in self.win_rates.keys(): + if opponent_name == opponent.name: + self.win_rates[opponent] = win_rate class Game(object): - def __init__(self): - self.position = Position() - self.positions = [self.position] - self.moves = [] - self.policy_player_moves = [] - self.result = None - - # Make it equally likely to train on red as yellow - if np.random.rand() < 0.5: - self.move(np.random.choice(self.position.legal_columns())) - - # Setup a random position - while np.random.rand() < 0.75: - self.move(np.random.choice(self.position.legal_columns())) - - def move(self, move, policy_player_turn=False): - if policy_player_turn: - self.policy_player_moves.append((self.position, move)) - self.moves.append(move) - self.position = self.position.move(move) - self.positions.append(self.position) - if self.position.gameover(): - self.result = self.position.result - self.policy_player_score = float(policy_player_turn) if self.result else 0.5 + def __init__(self): + self.position = Position() + self.positions = [self.position] + self.moves = [] + self.policy_player_moves = [] + self.result = None + + # Make it equally likely to train on red as yellow + if np.random.rand() < 0.5: + self.move(np.random.choice(self.position.legal_columns())) + + # Setup a random position + while np.random.rand() < 0.75: + self.move(np.random.choice(self.position.legal_columns())) + + def move(self, move, policy_player_turn=False): + if policy_player_turn: + self.policy_player_moves.append((self.position, move)) + self.moves.append(move) + self.position = self.position.move(move) + self.positions.append(self.position) + if self.position.gameover(): + self.result = self.position.result + self.policy_player_score = float( + policy_player_turn) if self.result else 0.5 def main(_): - training = PolicyTraining(config) - training.train() + training = PolicyTraining(config) + training.train() if __name__ == '__main__': - tf.app.run() + tf.app.run() diff --git a/util.py b/util.py index 84b17f5..19a4743 100644 --- a/util.py +++ b/util.py @@ -6,86 +6,89 @@ def run_directory(config): - def find_previous_run(dir): - if os.path.isdir(dir): - runs = [child[4:] for child in os.listdir(dir) if child[:4] == 'run_'] - if runs: - return max(int(run) for run in runs) + def find_previous_run(dir): + if os.path.isdir(dir): + runs = [child[4:] + for child in os.listdir(dir) if child[:4] == 'run_'] + if runs: + return max(int(run) for run in runs) - return 0 + return 0 - if config.run_dir == 'latest': - parent_dir = 'runs/' - previous_run = find_previous_run(parent_dir) - run_dir = parent_dir + ('run_%d' % previous_run) - elif config.run_dir: - run_dir = config.run_dir - else: - parent_dir = 'runs/' - previous_run = find_previous_run(parent_dir) - run_dir = parent_dir + ('run_%d' % (previous_run + 1)) + if config.run_dir == 'latest': + parent_dir = 'runs/' + previous_run = find_previous_run(parent_dir) + run_dir = parent_dir + ('run_%d' % previous_run) + elif config.run_dir: + run_dir = config.run_dir + else: + parent_dir = 'runs/' + previous_run = find_previous_run(parent_dir) + run_dir = parent_dir + ('run_%d' % (previous_run + 1)) - if run_dir[-1] != '/': - run_dir += '/' + if run_dir[-1] != '/': + run_dir += '/' - if not os.path.isdir(run_dir): - os.makedirs(run_dir) + if not os.path.isdir(run_dir): + os.makedirs(run_dir) - print('Checkpoint and summary directory is %s' % run_dir) + print('Checkpoint and summary directory is %s' % run_dir) - return run_dir + return run_dir def turn_win(turn): - return turn * -2 + 1 # RED = +1, YELLOW = -1 + return turn * -2 + 1 # RED = +1, YELLOW = -1 def restore_or_initialize_scope(session, run_dir, scope): - variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope) - latest_checkpoint = tf.train.latest_checkpoint(run_dir, - scope + '_checkpoint') - if latest_checkpoint: - tf.train.Saver(variables).restore(session, latest_checkpoint) - print('Restored %s scope from %s' % (scope, latest_checkpoint)) - else: - session.run(tf.variables_initializer(variables)) - print('Initialized %s scope' % scope) + variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope) + latest_checkpoint = tf.train.latest_checkpoint(run_dir, + scope + '_checkpoint') + if latest_checkpoint: + tf.train.Saver(variables).restore(session, latest_checkpoint) + print('Restored %s scope from %s' % (scope, latest_checkpoint)) + else: + session.run(tf.variables_initializer(variables)) + print('Initialized %s scope' % scope) def save_scope(session, run_dir, scope): - os.makedirs(run_dir, exist_ok=True) - variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope) - tf.train.Saver(variables).save( - session, - os.path.join(run_dir, scope + '.ckpt'), - latest_filename=scope + '_checkpoint') + os.makedirs(run_dir, exist_ok=True) + variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope) + tf.train.Saver(variables).save( + session, + os.path.join(run_dir, scope + '.ckpt'), + latest_filename=scope + '_checkpoint') def restore_or_initialize_network(session, run_dir, network): - latest_checkpoint = tf.train.latest_checkpoint(run_dir, - network.scope + '_checkpoint') - if latest_checkpoint: - tf.train.Saver(network.variables).restore(session, latest_checkpoint) - print('Restored %s network from %s' % (network.scope, latest_checkpoint)) - else: - session.run(tf.variables_initializer(network.variables)) - print('Initialized %s network' % network.scope) + latest_checkpoint = tf.train.latest_checkpoint(run_dir, + network.scope + '_checkpoint') + if latest_checkpoint: + tf.train.Saver(network.variables).restore(session, latest_checkpoint) + print('Restored %s network from %s' % + (network.scope, latest_checkpoint)) + else: + session.run(tf.variables_initializer(network.variables)) + print('Initialized %s network' % network.scope) def restore_network_or_fail(session, run_dir, network): - latest_checkpoint = tf.train.latest_checkpoint(run_dir, - network.scope + '_checkpoint') - if latest_checkpoint: - tf.train.Saver(network.variables).restore(session, latest_checkpoint) - print('Restored %s network from %s' % (network.scope, latest_checkpoint)) - else: - raise Exception('Network checkpoint %s not found in %s' % - (network.scope, run_dir)) + latest_checkpoint = tf.train.latest_checkpoint(run_dir, + network.scope + '_checkpoint') + if latest_checkpoint: + tf.train.Saver(network.variables).restore(session, latest_checkpoint) + print('Restored %s network from %s' % + (network.scope, latest_checkpoint)) + else: + raise Exception('Network checkpoint %s not found in %s' % + (network.scope, run_dir)) def save_network(session, run_dir, network): - os.makedirs(run_dir, exist_ok=True) - tf.train.Saver(network.variables).save( - session, - os.path.join(run_dir, network.scope + '.ckpt'), - latest_filename=network.scope + '_checkpoint') + os.makedirs(run_dir, exist_ok=True) + tf.train.Saver(network.variables).save( + session, + os.path.join(run_dir, network.scope + '.ckpt'), + latest_filename=network.scope + '_checkpoint') From e0b5c11f6c297830b1978b275873518789d2f7d1 Mon Sep 17 00:00:00 2001 From: "Restyled.io" Date: Thu, 19 Mar 2020 20:18:13 +0000 Subject: [PATCH 2/4] Restyled by black --- consts.py | 7 +-- network.py | 126 +++++++++++++++++++++++--------------- policy_training.py | 147 ++++++++++++++++++++++++++------------------- util.py | 59 +++++++++--------- 4 files changed, 194 insertions(+), 145 deletions(-) diff --git a/consts.py b/consts.py index 976de0a..4b6b9cd 100644 --- a/consts.py +++ b/consts.py @@ -15,8 +15,7 @@ TILED_COLUMNS = np.arange(TOTAL_DISKS) % WIDTH ROW_EDGE_DISTANCE = np.min([TILED_ROWS, np.flip(TILED_ROWS, axis=0)], axis=0) -COLUMN_EDGE_DISTANCE = np.min( - [TILED_COLUMNS, np.flip(TILED_COLUMNS, axis=0)], axis=0) +COLUMN_EDGE_DISTANCE = np.min([TILED_COLUMNS, np.flip(TILED_COLUMNS, axis=0)], axis=0) ODDS = TILED_ROWS % 2 FOURS = [] @@ -70,13 +69,13 @@ for colour in range(COLOURS): for row in range(HEIGHT): disks_in_column = row ^ (row + 1) - yellow_disks = 2**(row + 3) if colour == YELLOW else 0 + yellow_disks = 2 ** (row + 3) if colour == YELLOW else 0 row_hash = disks_in_column | yellow_disks for column in range(WIDTH): row_column_hash = row_hash << (9 * column) DISK_HASHES[colour, HEIGHT - row - 1, column] = row_column_hash -if __name__ == '__main__': +if __name__ == "__main__": print(FOURS[0]) print(DISK_FOURS[0, 0]) print(DISK_FOUR_COUNTS) diff --git a/network.py b/network.py index bc653ae..02bec19 100644 --- a/network.py +++ b/network.py @@ -8,103 +8,117 @@ class BaseNetwork(object): def __init__(self, scope, use_symmetry): self.scope = scope - with tf.name_scope('inputs'): - self.turn = tf.placeholder(tf.float32, shape=[None], name='turn') + with tf.name_scope("inputs"): + self.turn = tf.placeholder(tf.float32, shape=[None], name="turn") tiled_turn = tf.tile( tf.reshape(util.turn_win(self.turn), [-1, 1, 1, 1]), - [1, 2, HEIGHT, WIDTH]) + [1, 2, HEIGHT, WIDTH], + ) self.disks = tf.placeholder( - tf.float32, shape=[None, 2, HEIGHT, WIDTH], name='disks') + tf.float32, shape=[None, 2, HEIGHT, WIDTH], name="disks" + ) self.empty = tf.placeholder( - tf.float32, shape=[None, HEIGHT, WIDTH], name='empty') + tf.float32, shape=[None, HEIGHT, WIDTH], name="empty" + ) empty = tf.expand_dims(self.empty, axis=1) self.legal_moves = tf.placeholder( - tf.float32, shape=[None, HEIGHT, WIDTH], name='legal_moves') + tf.float32, shape=[None, HEIGHT, WIDTH], name="legal_moves" + ) legal_moves = tf.expand_dims(self.legal_moves, axis=1) self.threats = tf.placeholder( - tf.float32, shape=[None, 2, HEIGHT, WIDTH], name='threats') + tf.float32, shape=[None, 2, HEIGHT, WIDTH], name="threats" + ) constant_features = np.array( [TILED_ROWS, ODDS, ROW_EDGE_DISTANCE, COLUMN_EDGE_DISTANCE], - dtype=np.float32).reshape([1, 4, HEIGHT, WIDTH]) + dtype=np.float32, + ).reshape([1, 4, HEIGHT, WIDTH]) batch_size = tf.shape(self.turn)[0] - tiled_constant_features = tf.tile(constant_features, - [batch_size, 1, 1, 1]) + tiled_constant_features = tf.tile(constant_features, [batch_size, 1, 1, 1]) feature_planes = tf.concat( [ - tiled_turn, self.disks, empty, legal_moves, self.threats, - tiled_constant_features + tiled_turn, + self.disks, + empty, + legal_moves, + self.threats, + tiled_constant_features, ], - axis=1) + axis=1, + ) if use_symmetry: # Interleave horizontally flipped position - feature_planes_shape = [-1] + \ - feature_planes.shape.as_list()[1:] + feature_planes_shape = [-1] + feature_planes.shape.as_list()[1:] flipped = tf.reverse(feature_planes, axis=[3]) feature_planes = tf.reshape( - tf.stack([feature_planes, flipped], axis=1), feature_planes_shape) + tf.stack([feature_planes, flipped], axis=1), feature_planes_shape + ) - with tf.name_scope('conv_layers'): + with tf.name_scope("conv_layers"): if self.gpu_available(): - data_format = 'channels_first' + data_format = "channels_first" else: feature_planes = tf.transpose(feature_planes, [0, 2, 3, 1]) - data_format = 'channels_last' + data_format = "channels_last" conv1 = tf.layers.conv2d( feature_planes, filters=32, kernel_size=[4, 5], - padding='same', + padding="same", data_format=data_format, use_bias=False, - name='conv1') + name="conv1", + ) conv2 = tf.layers.conv2d( conv1, filters=32, kernel_size=[4, 5], - padding='same', + padding="same", data_format=data_format, activation=tf.nn.relu, - name='conv2') + name="conv2", + ) conv3 = tf.layers.conv2d( conv2, filters=32, kernel_size=[4, 5], - padding='same', + padding="same", data_format=data_format, activation=tf.nn.relu, - name='conv3') + name="conv3", + ) final_conv = tf.layers.conv2d( conv3, filters=1, kernel_size=[1, 1], data_format=data_format, - name='final_conv') - disk_bias = tf.get_variable('disk_bias', shape=[TOTAL_DISKS]) + name="final_conv", + ) + disk_bias = tf.get_variable("disk_bias", shape=[TOTAL_DISKS]) self.conv_output = tf.add( - tf.contrib.layers.flatten(final_conv), disk_bias, name='conv_output') + tf.contrib.layers.flatten(final_conv), disk_bias, name="conv_output" + ) self.conv_layers = [conv1, conv2, conv3, self.conv_output] def gpu_available(self): devices = device_lib.list_local_devices() - return len([d for d in devices if d.device_type == 'GPU']) > 0 + return len([d for d in devices if d.device_type == "GPU"]) > 0 @property def variables(self): # Add '/' to stop network-1 containing network-10 variables - return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, - self.scope + '/') + return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope + "/") def assign(self, other): return [ @@ -118,45 +132,56 @@ def __init__(self, scope, temperature=1.0, reuse=None, use_symmetry=False): with tf.variable_scope(scope, reuse=reuse): super(PolicyNetwork, self).__init__(scope, use_symmetry) - with tf.name_scope('policy'): + with tf.name_scope("policy"): self.temperature = tf.placeholder_with_default( - temperature, (), name='temperature') + temperature, (), name="temperature" + ) disk_logits = tf.divide( - self.conv_output, self.temperature, name='disk_logits') + self.conv_output, self.temperature, name="disk_logits" + ) if use_symmetry: # Calculate average of actual and horizontally flipped position normal, flipped = tf.split( tf.reshape(disk_logits, [-1, 2, HEIGHT, WIDTH]), num_or_size_splits=2, - axis=1) + axis=1, + ) disk_logits = tf.reshape( tf.reduce_mean( - tf.concat( - [normal, tf.reverse(flipped, axis=[3])], axis=1), - axis=1), [-1, TOTAL_DISKS]) + tf.concat([normal, tf.reverse(flipped, axis=[3])], axis=1), + axis=1, + ), + [-1, TOTAL_DISKS], + ) # Make illegal moves impossible: # - Legal moves have positive logits # - Illegal moves have -ILLEGAL_PENALTY logits legal_moves = tf.contrib.layers.flatten(self.legal_moves) - legal_disk_logits = (tf.nn.relu(disk_logits) * legal_moves + - (legal_moves - 1) * ILLEGAL_PENALTY) + legal_disk_logits = ( + tf.nn.relu(disk_logits) * legal_moves + + (legal_moves - 1) * ILLEGAL_PENALTY + ) - self.policy = tf.nn.softmax(legal_disk_logits, name='policy') + self.policy = tf.nn.softmax(legal_disk_logits, name="policy") self.sample_move = tf.squeeze( tf.multinomial(legal_disk_logits, 1) % WIDTH, axis=1, - name='sample_move') + name="sample_move", + ) self.entropy = tf.reduce_sum( self.policy * -tf.log(self.policy + EPSILON), # Avoid Nans axis=1, - name='entropy') + name="entropy", + ) self.policy_layers = self.conv_layers + [ - disk_logits, self.policy, self.entropy + disk_logits, + self.policy, + self.entropy, ] @@ -165,21 +190,22 @@ def __init__(self, scope, use_symmetry=False): with tf.variable_scope(scope): super(ValueNetwork, self).__init__(scope, use_symmetry) - with tf.name_scope('value'): + with tf.name_scope("value"): fully_connected = tf.layers.dense( self.conv_output, units=64, activation=tf.nn.relu, - name='fully_connected') + name="fully_connected", + ) value = tf.layers.dense(fully_connected, 1, tf.tanh) if use_symmetry: # Calculate average of actual and horizontally flipped position self.value = tf.reduce_mean( - tf.reshape(value, [-1, 2]), axis=1, name='value') + tf.reshape(value, [-1, 2]), axis=1, name="value" + ) else: - self.value = tf.squeeze(value, axis=1, name='value') + self.value = tf.squeeze(value, axis=1, name="value") - self.value_layers = self.conv_layers + \ - [fully_connected, self.value] + self.value_layers = self.conv_layers + [fully_connected, self.value] diff --git a/policy_training.py b/policy_training.py index f8f3a1c..d32bfa1 100644 --- a/policy_training.py +++ b/policy_training.py @@ -8,11 +8,11 @@ import util flags = tf.app.flags -flags.DEFINE_string('run_dir', None, 'Run directory') -flags.DEFINE_integer('batch_size', 256, 'Batch size') -flags.DEFINE_integer('batches', 10000, 'Number of batches') -flags.DEFINE_float('entropy', 0.03, 'Entropy regularisation rate') -flags.DEFINE_float('learning_rate', 0.001, 'Adam learning rate') +flags.DEFINE_string("run_dir", None, "Run directory") +flags.DEFINE_integer("batch_size", 256, "Batch size") +flags.DEFINE_integer("batches", 10000, "Number of batches") +flags.DEFINE_float("entropy", 0.03, "Entropy regularisation rate") +flags.DEFINE_float("learning_rate", 0.001, "Adam learning rate") config = flags.FLAGS @@ -21,41 +21,42 @@ def __init__(self, config): self.config = config self.run_dir = util.run_directory(config) - self.session = tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions( - allow_growth=True))) + self.session = tf.Session( + config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True)) + ) - self.policy_network = PolicyNetwork('policy') + self.policy_network = PolicyNetwork("policy") self.policy_player = PolicyPlayer(self.policy_network, self.session) - util.restore_or_initialize_network(self.session, self.run_dir, - self.policy_network) + util.restore_or_initialize_network( + self.session, self.run_dir, self.policy_network + ) # Train ops self.create_train_op(self.policy_network) self.writer = tf.summary.FileWriter(self.run_dir) - util.restore_or_initialize_scope(self.session, self.run_dir, - self.training_scope.name) + util.restore_or_initialize_scope( + self.session, self.run_dir, self.training_scope.name + ) self.opponents = Opponents( - [RandomPlayer(), - RandomThreatPlayer(), - MaxThreatPlayer()]) + [RandomPlayer(), RandomThreatPlayer(), MaxThreatPlayer()] + ) self.opponents.restore_networks(self.session, self.run_dir) def create_train_op(self, policy_network): - with tf.variable_scope('policy_training') as self.training_scope: - self.move = tf.placeholder(tf.int32, shape=[None], name='move') - self.result = tf.placeholder( - tf.float32, shape=[None], name='result') + with tf.variable_scope("policy_training") as self.training_scope: + self.move = tf.placeholder(tf.int32, shape=[None], name="move") + self.result = tf.placeholder(tf.float32, shape=[None], name="result") policy = tf.reshape(policy_network.policy, [-1, HEIGHT, WIDTH]) move = tf.expand_dims(tf.one_hot(self.move, WIDTH), axis=1) turn = util.turn_win(policy_network.turn) move_probability = tf.reduce_sum(policy * move, axis=[1, 2]) - result_loss = -tf.reduce_mean( - tf.log(move_probability) * turn * self.result) - entropy_regularisation = ( - -config.entropy * tf.reduce_mean(policy_network.entropy)) + result_loss = -tf.reduce_mean(tf.log(move_probability) * turn * self.result) + entropy_regularisation = -config.entropy * tf.reduce_mean( + policy_network.entropy + ) loss = result_loss + entropy_regularisation optimizer = tf.train.AdamOptimizer(self.config.learning_rate) @@ -63,7 +64,7 @@ def create_train_op(self, policy_network): self.train_op = optimizer.minimize(loss, self.global_step) # Summary - tf.summary.scalar('loss', loss) + tf.summary.scalar("loss", loss) for var in policy_network.variables + policy_network.policy_layers: tf.summary.histogram(var.name, var) self.summary = tf.summary.merge_all() @@ -77,7 +78,7 @@ def train(self): if self.opponents.all_beaten(): name = self.opponents.next_network_name() - print('All opponents beaten. Creating %s' % name) + print("All opponents beaten. Creating %s" % name) self.create_new_opponent(name) if step % 100 == 0: @@ -92,11 +93,10 @@ def save(self): def play_games(self, opponent): # Create games - games = incomplete_games = [Game() - for _ in range(self.config.batch_size)] + games = incomplete_games = [Game() for _ in range(self.config.batch_size)] # Let opponent play first in half of the games - self.play_move(games[0:len(games) // 2], opponent) + self.play_move(games[0 : len(games) // 2], opponent) player = self.policy_player while incomplete_games: @@ -116,8 +116,15 @@ def play_move(self, games, player): game.move(move, player == self.policy_player) def train_games(self, opponent, games): - turn, disks, empty, legal_moves, threats, moves, results = ([], [], [], [], - [], [], []) + turn, disks, empty, legal_moves, threats, moves, results = ( + [], + [], + [], + [], + [], + [], + [], + ) for game in games: for position, move in game.policy_player_moves: turn.append(position.turn) @@ -129,39 +136,49 @@ def train_games(self, opponent, games): results.append(game.result) _, step, summary = self.session.run( - [self.train_op, self.global_step, self.summary], { + [self.train_op, self.global_step, self.summary], + { self.policy_network.turn: turn, self.policy_network.disks: disks, self.policy_network.empty: empty, self.policy_network.legal_moves: legal_moves, self.policy_network.threats: threats, self.move: moves, - self.result: results - }) + self.result: results, + }, + ) return step, summary def process_results(self, opponent, games, step, summary): win_rate = np.mean([game.policy_player_score for game in games]) - average_moves = sum(len(game.moves) - for game in games) / self.config.batch_size + average_moves = sum(len(game.moves) for game in games) / self.config.batch_size opponent_summary = tf.Summary() opponent_summary.value.add( - tag=self.training_scope.name + '/' + opponent.name + '/win_rate', - simple_value=win_rate) + tag=self.training_scope.name + "/" + opponent.name + "/win_rate", + simple_value=win_rate, + ) opponent_summary.value.add( - tag=self.training_scope.name + '/' + opponent.name + '/moves', - simple_value=average_moves) + tag=self.training_scope.name + "/" + opponent.name + "/moves", + simple_value=average_moves, + ) self.writer.add_summary(summary, step) self.writer.add_summary(opponent_summary, step) self.opponents.update_win_rate(opponent, win_rate) - print('Step %d. Opponent %s, win rate %.2f <%.2f>, %.2f moves' % - (step, opponent.name, win_rate, self.opponents.win_rates[opponent], - average_moves)) + print( + "Step %d. Opponent %s, win rate %.2f <%.2f>, %.2f moves" + % ( + step, + opponent.name, + win_rate, + self.opponents.win_rates[opponent], + average_moves, + ) + ) def create_new_opponent(self, name): # Create clone of policy_player @@ -192,8 +209,7 @@ def decrease_win_rates(self): def update_win_rate(self, opponent, win_rate): # Win rate is a moving average - self.win_rates[opponent] = self.win_rates[opponent] * \ - 0.9 + win_rate * 0.1 + self.win_rates[opponent] = self.win_rates[opponent] * 0.9 + win_rate * 0.1 def all_beaten(self): result = True @@ -204,34 +220,42 @@ def all_beaten(self): def choose_opponent(self): # More difficult opponents are chosen more often win_rates = np.maximum(list(self.win_rates.values()), 0.1) - probs = (1 / win_rates**2) - 1 + probs = (1 / win_rates ** 2) - 1 normalised_probs = probs / probs.sum() return np.random.choice(list(self.win_rates.keys()), p=normalised_probs) def next_network_name(self): - network_opponents = len([ - opponent for opponent in self.win_rates.keys() - if type(opponent) == PolicyPlayer - ]) - return 'network-%d' % (network_opponents + 1) + network_opponents = len( + [ + opponent + for opponent in self.win_rates.keys() + if type(opponent) == PolicyPlayer + ] + ) + return "network-%d" % (network_opponents + 1) def save_opponent_stats(self, run_dir): - with open(os.path.join(run_dir, 'opponents'), 'w') as f: - f.write('\n'.join([ - opponent.name + ' ' + str(win_rate) - for opponent, win_rate in sorted( - self.win_rates.items(), key=lambda x: x[1]) - ])) + with open(os.path.join(run_dir, "opponents"), "w") as f: + f.write( + "\n".join( + [ + opponent.name + " " + str(win_rate) + for opponent, win_rate in sorted( + self.win_rates.items(), key=lambda x: x[1] + ) + ] + ) + ) def restore_networks(self, session, run_dir): - opponents_file = os.path.join(run_dir, 'opponents') + opponents_file = os.path.join(run_dir, "opponents") if os.path.exists(opponents_file): with open(opponents_file) as f: for line in f.readlines(): opponent_name, win_rate_string = line.strip().split() win_rate = float(win_rate_string) - if opponent_name[:8] == 'network-': - print('Restoring %s' % opponent_name) + if opponent_name[:8] == "network-": + print("Restoring %s" % opponent_name) network = PolicyNetwork(opponent_name) util.restore_network_or_fail(session, run_dir, network) opponent = PolicyPlayer(network, session) @@ -266,8 +290,7 @@ def move(self, move, policy_player_turn=False): self.positions.append(self.position) if self.position.gameover(): self.result = self.position.result - self.policy_player_score = float( - policy_player_turn) if self.result else 0.5 + self.policy_player_score = float(policy_player_turn) if self.result else 0.5 def main(_): @@ -275,5 +298,5 @@ def main(_): training.train() -if __name__ == '__main__': +if __name__ == "__main__": tf.app.run() diff --git a/util.py b/util.py index 19a4743..2055268 100644 --- a/util.py +++ b/util.py @@ -8,31 +8,30 @@ def run_directory(config): def find_previous_run(dir): if os.path.isdir(dir): - runs = [child[4:] - for child in os.listdir(dir) if child[:4] == 'run_'] + runs = [child[4:] for child in os.listdir(dir) if child[:4] == "run_"] if runs: return max(int(run) for run in runs) return 0 - if config.run_dir == 'latest': - parent_dir = 'runs/' + if config.run_dir == "latest": + parent_dir = "runs/" previous_run = find_previous_run(parent_dir) - run_dir = parent_dir + ('run_%d' % previous_run) + run_dir = parent_dir + ("run_%d" % previous_run) elif config.run_dir: run_dir = config.run_dir else: - parent_dir = 'runs/' + parent_dir = "runs/" previous_run = find_previous_run(parent_dir) - run_dir = parent_dir + ('run_%d' % (previous_run + 1)) + run_dir = parent_dir + ("run_%d" % (previous_run + 1)) - if run_dir[-1] != '/': - run_dir += '/' + if run_dir[-1] != "/": + run_dir += "/" if not os.path.isdir(run_dir): os.makedirs(run_dir) - print('Checkpoint and summary directory is %s' % run_dir) + print("Checkpoint and summary directory is %s" % run_dir) return run_dir @@ -43,14 +42,13 @@ def turn_win(turn): def restore_or_initialize_scope(session, run_dir, scope): variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope) - latest_checkpoint = tf.train.latest_checkpoint(run_dir, - scope + '_checkpoint') + latest_checkpoint = tf.train.latest_checkpoint(run_dir, scope + "_checkpoint") if latest_checkpoint: tf.train.Saver(variables).restore(session, latest_checkpoint) - print('Restored %s scope from %s' % (scope, latest_checkpoint)) + print("Restored %s scope from %s" % (scope, latest_checkpoint)) else: session.run(tf.variables_initializer(variables)) - print('Initialized %s scope' % scope) + print("Initialized %s scope" % scope) def save_scope(session, run_dir, scope): @@ -58,37 +56,40 @@ def save_scope(session, run_dir, scope): variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope) tf.train.Saver(variables).save( session, - os.path.join(run_dir, scope + '.ckpt'), - latest_filename=scope + '_checkpoint') + os.path.join(run_dir, scope + ".ckpt"), + latest_filename=scope + "_checkpoint", + ) def restore_or_initialize_network(session, run_dir, network): - latest_checkpoint = tf.train.latest_checkpoint(run_dir, - network.scope + '_checkpoint') + latest_checkpoint = tf.train.latest_checkpoint( + run_dir, network.scope + "_checkpoint" + ) if latest_checkpoint: tf.train.Saver(network.variables).restore(session, latest_checkpoint) - print('Restored %s network from %s' % - (network.scope, latest_checkpoint)) + print("Restored %s network from %s" % (network.scope, latest_checkpoint)) else: session.run(tf.variables_initializer(network.variables)) - print('Initialized %s network' % network.scope) + print("Initialized %s network" % network.scope) def restore_network_or_fail(session, run_dir, network): - latest_checkpoint = tf.train.latest_checkpoint(run_dir, - network.scope + '_checkpoint') + latest_checkpoint = tf.train.latest_checkpoint( + run_dir, network.scope + "_checkpoint" + ) if latest_checkpoint: tf.train.Saver(network.variables).restore(session, latest_checkpoint) - print('Restored %s network from %s' % - (network.scope, latest_checkpoint)) + print("Restored %s network from %s" % (network.scope, latest_checkpoint)) else: - raise Exception('Network checkpoint %s not found in %s' % - (network.scope, run_dir)) + raise Exception( + "Network checkpoint %s not found in %s" % (network.scope, run_dir) + ) def save_network(session, run_dir, network): os.makedirs(run_dir, exist_ok=True) tf.train.Saver(network.variables).save( session, - os.path.join(run_dir, network.scope + '.ckpt'), - latest_filename=network.scope + '_checkpoint') + os.path.join(run_dir, network.scope + ".ckpt"), + latest_filename=network.scope + "_checkpoint", + ) From e7ada8e216f3ff1f4ea1ecc42601c0f9fb112c1e Mon Sep 17 00:00:00 2001 From: "Restyled.io" Date: Thu, 19 Mar 2020 20:18:14 +0000 Subject: [PATCH 3/4] Restyled by reorder-python-imports --- network.py | 3 ++- policy_training.py | 10 ++++++---- util.py | 5 +++-- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/network.py b/network.py index 02bec19..c027ad6 100644 --- a/network.py +++ b/network.py @@ -1,7 +1,8 @@ -from consts import * import tensorflow as tf from tensorflow.python.client import device_lib + import util +from consts import * class BaseNetwork(object): diff --git a/policy_training.py b/policy_training.py index d32bfa1..0e7967b 100644 --- a/policy_training.py +++ b/policy_training.py @@ -1,11 +1,13 @@ +import os + +import numpy as np +import tensorflow as tf + +import util from consts import * from network import PolicyNetwork -import numpy as np -import os from players import * from position import Position -import tensorflow as tf -import util flags = tf.app.flags flags.DEFINE_string("run_dir", None, "Run directory") diff --git a/util.py b/util.py index 2055268..5799272 100644 --- a/util.py +++ b/util.py @@ -1,8 +1,9 @@ -from datetime import datetime import os -import tensorflow as tf import threading import time +from datetime import datetime + +import tensorflow as tf def run_directory(config): From 1c0d91f3ca50651982aa4be9186120523ac16692 Mon Sep 17 00:00:00 2001 From: "Restyled.io" Date: Thu, 19 Mar 2020 20:18:17 +0000 Subject: [PATCH 4/4] Restyled by yapf --- consts.py | 5 ++- network.py | 72 ++++++++++++++++++---------------- policy_training.py | 96 ++++++++++++++++++++++------------------------ util.py | 24 ++++++------ 4 files changed, 100 insertions(+), 97 deletions(-) diff --git a/consts.py b/consts.py index 4b6b9cd..54a5522 100644 --- a/consts.py +++ b/consts.py @@ -15,7 +15,8 @@ TILED_COLUMNS = np.arange(TOTAL_DISKS) % WIDTH ROW_EDGE_DISTANCE = np.min([TILED_ROWS, np.flip(TILED_ROWS, axis=0)], axis=0) -COLUMN_EDGE_DISTANCE = np.min([TILED_COLUMNS, np.flip(TILED_COLUMNS, axis=0)], axis=0) +COLUMN_EDGE_DISTANCE = np.min( + [TILED_COLUMNS, np.flip(TILED_COLUMNS, axis=0)], axis=0) ODDS = TILED_ROWS % 2 FOURS = [] @@ -69,7 +70,7 @@ for colour in range(COLOURS): for row in range(HEIGHT): disks_in_column = row ^ (row + 1) - yellow_disks = 2 ** (row + 3) if colour == YELLOW else 0 + yellow_disks = 2**(row + 3) if colour == YELLOW else 0 row_hash = disks_in_column | yellow_disks for column in range(WIDTH): row_column_hash = row_hash << (9 * column) diff --git a/network.py b/network.py index c027ad6..7f2e1e4 100644 --- a/network.py +++ b/network.py @@ -16,30 +16,31 @@ def __init__(self, scope, use_symmetry): [1, 2, HEIGHT, WIDTH], ) - self.disks = tf.placeholder( - tf.float32, shape=[None, 2, HEIGHT, WIDTH], name="disks" - ) + self.disks = tf.placeholder(tf.float32, + shape=[None, 2, HEIGHT, WIDTH], + name="disks") - self.empty = tf.placeholder( - tf.float32, shape=[None, HEIGHT, WIDTH], name="empty" - ) + self.empty = tf.placeholder(tf.float32, + shape=[None, HEIGHT, WIDTH], + name="empty") empty = tf.expand_dims(self.empty, axis=1) - self.legal_moves = tf.placeholder( - tf.float32, shape=[None, HEIGHT, WIDTH], name="legal_moves" - ) + self.legal_moves = tf.placeholder(tf.float32, + shape=[None, HEIGHT, WIDTH], + name="legal_moves") legal_moves = tf.expand_dims(self.legal_moves, axis=1) - self.threats = tf.placeholder( - tf.float32, shape=[None, 2, HEIGHT, WIDTH], name="threats" - ) + self.threats = tf.placeholder(tf.float32, + shape=[None, 2, HEIGHT, WIDTH], + name="threats") constant_features = np.array( [TILED_ROWS, ODDS, ROW_EDGE_DISTANCE, COLUMN_EDGE_DISTANCE], dtype=np.float32, ).reshape([1, 4, HEIGHT, WIDTH]) batch_size = tf.shape(self.turn)[0] - tiled_constant_features = tf.tile(constant_features, [batch_size, 1, 1, 1]) + tiled_constant_features = tf.tile(constant_features, + [batch_size, 1, 1, 1]) feature_planes = tf.concat( [ @@ -55,11 +56,12 @@ def __init__(self, scope, use_symmetry): if use_symmetry: # Interleave horizontally flipped position - feature_planes_shape = [-1] + feature_planes.shape.as_list()[1:] + feature_planes_shape = [-1 + ] + feature_planes.shape.as_list()[1:] flipped = tf.reverse(feature_planes, axis=[3]) feature_planes = tf.reshape( - tf.stack([feature_planes, flipped], axis=1), feature_planes_shape - ) + tf.stack([feature_planes, flipped], axis=1), + feature_planes_shape) with tf.name_scope("conv_layers"): if self.gpu_available(): @@ -106,9 +108,9 @@ def __init__(self, scope, use_symmetry): name="final_conv", ) disk_bias = tf.get_variable("disk_bias", shape=[TOTAL_DISKS]) - self.conv_output = tf.add( - tf.contrib.layers.flatten(final_conv), disk_bias, name="conv_output" - ) + self.conv_output = tf.add(tf.contrib.layers.flatten(final_conv), + disk_bias, + name="conv_output") self.conv_layers = [conv1, conv2, conv3, self.conv_output] @@ -119,7 +121,8 @@ def gpu_available(self): @property def variables(self): # Add '/' to stop network-1 containing network-10 variables - return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope + "/") + return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, + self.scope + "/") def assign(self, other): return [ @@ -135,12 +138,11 @@ def __init__(self, scope, temperature=1.0, reuse=None, use_symmetry=False): with tf.name_scope("policy"): self.temperature = tf.placeholder_with_default( - temperature, (), name="temperature" - ) + temperature, (), name="temperature") - disk_logits = tf.divide( - self.conv_output, self.temperature, name="disk_logits" - ) + disk_logits = tf.divide(self.conv_output, + self.temperature, + name="disk_logits") if use_symmetry: # Calculate average of actual and horizontally flipped position @@ -151,7 +153,9 @@ def __init__(self, scope, temperature=1.0, reuse=None, use_symmetry=False): ) disk_logits = tf.reshape( tf.reduce_mean( - tf.concat([normal, tf.reverse(flipped, axis=[3])], axis=1), + tf.concat( + [normal, tf.reverse(flipped, axis=[3])], + axis=1), axis=1, ), [-1, TOTAL_DISKS], @@ -161,10 +165,8 @@ def __init__(self, scope, temperature=1.0, reuse=None, use_symmetry=False): # - Legal moves have positive logits # - Illegal moves have -ILLEGAL_PENALTY logits legal_moves = tf.contrib.layers.flatten(self.legal_moves) - legal_disk_logits = ( - tf.nn.relu(disk_logits) * legal_moves - + (legal_moves - 1) * ILLEGAL_PENALTY - ) + legal_disk_logits = (tf.nn.relu(disk_logits) * legal_moves + + (legal_moves - 1) * ILLEGAL_PENALTY) self.policy = tf.nn.softmax(legal_disk_logits, name="policy") self.sample_move = tf.squeeze( @@ -203,10 +205,12 @@ def __init__(self, scope, use_symmetry=False): if use_symmetry: # Calculate average of actual and horizontally flipped position - self.value = tf.reduce_mean( - tf.reshape(value, [-1, 2]), axis=1, name="value" - ) + self.value = tf.reduce_mean(tf.reshape(value, [-1, 2]), + axis=1, + name="value") else: self.value = tf.squeeze(value, axis=1, name="value") - self.value_layers = self.conv_layers + [fully_connected, self.value] + self.value_layers = self.conv_layers + [ + fully_connected, self.value + ] diff --git a/policy_training.py b/policy_training.py index 0e7967b..6f5b795 100644 --- a/policy_training.py +++ b/policy_training.py @@ -23,42 +23,42 @@ def __init__(self, config): self.config = config self.run_dir = util.run_directory(config) - self.session = tf.Session( - config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True)) - ) + self.session = tf.Session(config=tf.ConfigProto( + gpu_options=tf.GPUOptions(allow_growth=True))) self.policy_network = PolicyNetwork("policy") self.policy_player = PolicyPlayer(self.policy_network, self.session) - util.restore_or_initialize_network( - self.session, self.run_dir, self.policy_network - ) + util.restore_or_initialize_network(self.session, self.run_dir, + self.policy_network) # Train ops self.create_train_op(self.policy_network) self.writer = tf.summary.FileWriter(self.run_dir) - util.restore_or_initialize_scope( - self.session, self.run_dir, self.training_scope.name - ) + util.restore_or_initialize_scope(self.session, self.run_dir, + self.training_scope.name) self.opponents = Opponents( - [RandomPlayer(), RandomThreatPlayer(), MaxThreatPlayer()] - ) + [RandomPlayer(), + RandomThreatPlayer(), + MaxThreatPlayer()]) self.opponents.restore_networks(self.session, self.run_dir) def create_train_op(self, policy_network): with tf.variable_scope("policy_training") as self.training_scope: self.move = tf.placeholder(tf.int32, shape=[None], name="move") - self.result = tf.placeholder(tf.float32, shape=[None], name="result") + self.result = tf.placeholder(tf.float32, + shape=[None], + name="result") policy = tf.reshape(policy_network.policy, [-1, HEIGHT, WIDTH]) move = tf.expand_dims(tf.one_hot(self.move, WIDTH), axis=1) turn = util.turn_win(policy_network.turn) move_probability = tf.reduce_sum(policy * move, axis=[1, 2]) - result_loss = -tf.reduce_mean(tf.log(move_probability) * turn * self.result) + result_loss = -tf.reduce_mean( + tf.log(move_probability) * turn * self.result) entropy_regularisation = -config.entropy * tf.reduce_mean( - policy_network.entropy - ) + policy_network.entropy) loss = result_loss + entropy_regularisation optimizer = tf.train.AdamOptimizer(self.config.learning_rate) @@ -95,17 +95,20 @@ def save(self): def play_games(self, opponent): # Create games - games = incomplete_games = [Game() for _ in range(self.config.batch_size)] + games = incomplete_games = [ + Game() for _ in range(self.config.batch_size) + ] # Let opponent play first in half of the games - self.play_move(games[0 : len(games) // 2], opponent) + self.play_move(games[0:len(games) // 2], opponent) player = self.policy_player while incomplete_games: self.play_move(incomplete_games, player) player = self.policy_player if player != self.policy_player else opponent incomplete_games = [ - game for game in incomplete_games if not game.position.gameover() + game for game in incomplete_games + if not game.position.gameover() ] return games @@ -154,7 +157,8 @@ def train_games(self, opponent, games): def process_results(self, opponent, games, step, summary): win_rate = np.mean([game.policy_player_score for game in games]) - average_moves = sum(len(game.moves) for game in games) / self.config.batch_size + average_moves = sum(len(game.moves) + for game in games) / self.config.batch_size opponent_summary = tf.Summary() opponent_summary.value.add( @@ -171,16 +175,13 @@ def process_results(self, opponent, games, step, summary): self.opponents.update_win_rate(opponent, win_rate) - print( - "Step %d. Opponent %s, win rate %.2f <%.2f>, %.2f moves" - % ( - step, - opponent.name, - win_rate, - self.opponents.win_rates[opponent], - average_moves, - ) - ) + print("Step %d. Opponent %s, win rate %.2f <%.2f>, %.2f moves" % ( + step, + opponent.name, + win_rate, + self.opponents.win_rates[opponent], + average_moves, + )) def create_new_opponent(self, name): # Create clone of policy_player @@ -211,7 +212,8 @@ def decrease_win_rates(self): def update_win_rate(self, opponent, win_rate): # Win rate is a moving average - self.win_rates[opponent] = self.win_rates[opponent] * 0.9 + win_rate * 0.1 + self.win_rates[ + opponent] = self.win_rates[opponent] * 0.9 + win_rate * 0.1 def all_beaten(self): result = True @@ -222,32 +224,25 @@ def all_beaten(self): def choose_opponent(self): # More difficult opponents are chosen more often win_rates = np.maximum(list(self.win_rates.values()), 0.1) - probs = (1 / win_rates ** 2) - 1 + probs = (1 / win_rates**2) - 1 normalised_probs = probs / probs.sum() - return np.random.choice(list(self.win_rates.keys()), p=normalised_probs) + return np.random.choice(list(self.win_rates.keys()), + p=normalised_probs) def next_network_name(self): - network_opponents = len( - [ - opponent - for opponent in self.win_rates.keys() - if type(opponent) == PolicyPlayer - ] - ) + network_opponents = len([ + opponent for opponent in self.win_rates.keys() + if type(opponent) == PolicyPlayer + ]) return "network-%d" % (network_opponents + 1) def save_opponent_stats(self, run_dir): with open(os.path.join(run_dir, "opponents"), "w") as f: - f.write( - "\n".join( - [ - opponent.name + " " + str(win_rate) - for opponent, win_rate in sorted( - self.win_rates.items(), key=lambda x: x[1] - ) - ] - ) - ) + f.write("\n".join([ + opponent.name + " " + str(win_rate) + for opponent, win_rate in sorted(self.win_rates.items(), + key=lambda x: x[1]) + ])) def restore_networks(self, session, run_dir): opponents_file = os.path.join(run_dir, "opponents") @@ -292,7 +287,8 @@ def move(self, move, policy_player_turn=False): self.positions.append(self.position) if self.position.gameover(): self.result = self.position.result - self.policy_player_score = float(policy_player_turn) if self.result else 0.5 + self.policy_player_score = float( + policy_player_turn) if self.result else 0.5 def main(_): diff --git a/util.py b/util.py index 5799272..da4487d 100644 --- a/util.py +++ b/util.py @@ -9,7 +9,9 @@ def run_directory(config): def find_previous_run(dir): if os.path.isdir(dir): - runs = [child[4:] for child in os.listdir(dir) if child[:4] == "run_"] + runs = [ + child[4:] for child in os.listdir(dir) if child[:4] == "run_" + ] if runs: return max(int(run) for run in runs) @@ -43,7 +45,8 @@ def turn_win(turn): def restore_or_initialize_scope(session, run_dir, scope): variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope) - latest_checkpoint = tf.train.latest_checkpoint(run_dir, scope + "_checkpoint") + latest_checkpoint = tf.train.latest_checkpoint(run_dir, + scope + "_checkpoint") if latest_checkpoint: tf.train.Saver(variables).restore(session, latest_checkpoint) print("Restored %s scope from %s" % (scope, latest_checkpoint)) @@ -64,11 +67,11 @@ def save_scope(session, run_dir, scope): def restore_or_initialize_network(session, run_dir, network): latest_checkpoint = tf.train.latest_checkpoint( - run_dir, network.scope + "_checkpoint" - ) + run_dir, network.scope + "_checkpoint") if latest_checkpoint: tf.train.Saver(network.variables).restore(session, latest_checkpoint) - print("Restored %s network from %s" % (network.scope, latest_checkpoint)) + print("Restored %s network from %s" % + (network.scope, latest_checkpoint)) else: session.run(tf.variables_initializer(network.variables)) print("Initialized %s network" % network.scope) @@ -76,15 +79,14 @@ def restore_or_initialize_network(session, run_dir, network): def restore_network_or_fail(session, run_dir, network): latest_checkpoint = tf.train.latest_checkpoint( - run_dir, network.scope + "_checkpoint" - ) + run_dir, network.scope + "_checkpoint") if latest_checkpoint: tf.train.Saver(network.variables).restore(session, latest_checkpoint) - print("Restored %s network from %s" % (network.scope, latest_checkpoint)) + print("Restored %s network from %s" % + (network.scope, latest_checkpoint)) else: - raise Exception( - "Network checkpoint %s not found in %s" % (network.scope, run_dir) - ) + raise Exception("Network checkpoint %s not found in %s" % + (network.scope, run_dir)) def save_network(session, run_dir, network):