From d8c38cf461df5d4383371d3e515d8d4e217c46a2 Mon Sep 17 00:00:00 2001 From: Corentin Date: Mon, 23 Apr 2018 09:44:32 +0200 Subject: [PATCH 1/4] separate p probabilities --- abnet3/sampler.py | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/abnet3/sampler.py b/abnet3/sampler.py index 672a368..7c4ce71 100644 --- a/abnet3/sampler.py +++ b/abnet3/sampler.py @@ -298,14 +298,11 @@ def type_samp_func(x): return np.log(1+x) W_types[tokens_type[tok]] += 1.0 except Exception as e: W_types[tokens_type[tok]] = 1.0 - p_types = {"Stype": {}, "Dtype": {}} + + p_types = dict() for type_idx in range(nb_types): - p_types["Stype"][type_idx] = type_samp_func(W_types[type_idx]) - for type_jdx in range(type_idx+1, nb_types): - p_types["Dtype"][(type_idx, type_jdx)] = \ - type_samp_func(W_types[type_idx]) * \ - type_samp_func(W_types[type_jdx]) + p_types[type_idx] = type_samp_func(W_types[type_idx]) return p_types def sample_spk_p(self, std_descr, spk_sampling_mode='log'): @@ -436,8 +433,7 @@ def type_speaker_sampling_p(self, std_descr=None, p_spk_types = self.sample_spk_p(std_descr, spk_sampling_mode=spk_sampling_mode) - for config in p_types.keys(): - p_types[config] = normalize_distribution(p_types[config]) + p_types = normalize_distribution(p_types) for config in p_spk_types.keys(): p_spk_types[config] = normalize_distribution(p_spk_types[config]) @@ -451,23 +447,25 @@ def type_speaker_sampling_p(self, std_descr=None, i += 1 if config == 'Stype_Sspk': for el in p_spk_types[config].keys(): - p_spk_types[config][el] = p_types['Stype'][el[1]] * \ + p_spk_types[config][el] = \ + p_types[el[1]] * \ p_spk_types[config][el] if config == 'Stype_Dspk': for el in p_spk_types[config].keys(): - p_spk_types[config][el] = p_types['Stype'][el[2]] * \ + p_spk_types[config][el] = \ + p_types[el[2]] * \ p_spk_types[config][el] if config == 'Dtype_Sspk': for el in p_spk_types[config].keys(): - p_spk_types[config][el] = p_types['Dtype'][ - (el[1], - el[2])] * \ + p_spk_types[config][el] = \ + p_types[el[1]] * \ + p_types[el[2]] * \ p_spk_types[config][el] if config == 'Dtype_Dspk': for el in p_spk_types[config].keys(): - p_spk_types[config][el] = p_types['Dtype'][ - (el[2], - el[3])] * \ + p_spk_types[config][el] = \ + p_types[el[2]] * \ + p_types[el[3]] * \ p_spk_types[config][el] for config in p_spk_types.keys(): From decc07f6649cb95037164bfa830ce662d2f3e3be Mon Sep 17 00:00:00 2001 From: Corentin Date: Mon, 23 Apr 2018 10:48:30 +0200 Subject: [PATCH 2/4] faster sampling --- abnet3/sampler.py | 134 +++++++++++++++++++++++----------------------- abnet3/utils.py | 21 ++++++++ 2 files changed, 88 insertions(+), 67 deletions(-) diff --git a/abnet3/sampler.py b/abnet3/sampler.py index 7c4ce71..572f909 100644 --- a/abnet3/sampler.py +++ b/abnet3/sampler.py @@ -10,7 +10,7 @@ """ from abnet3.utils import normalize_distribution, cumulative_distribution -from abnet3.utils import print_token, sample_searchidx +from abnet3.utils import print_token, sample_searchidx, samplepairs_searchidx from abnet3.utils import read_spkid_file, read_spk_list, progress import numpy as np @@ -316,15 +316,17 @@ def sample_spk_p(self, std_descr, spk_sampling_mode='log'): """ nb_tok = len(std_descr['tokens']) tokens_type = std_descr['tokens_type'] + types = std_descr['types'] p_spk_types = {'Stype_Sspk': {}, 'Stype_Dspk': {}, 'Dtype_Sspk': {}, 'Dtype_Dspk': {}} - speakers = std_descr['tokens_speaker'] + speakers_for_token = std_descr['tokens_speaker'] + speakers = std_descr['speakers'] W_spk_types = {} for tok in range(nb_tok): try: - W_spk_types[(speakers[tok], tokens_type[tok])] += 1.0 + W_spk_types[(speakers_for_token[tok], tokens_type[tok])] += 1.0 except Exception as e: - W_spk_types[(speakers[tok], tokens_type[tok])] = 1.0 + W_spk_types[(speakers_for_token[tok], tokens_type[tok])] = 1.0 if spk_sampling_mode == '1': def spk_samp_func(x): @@ -348,32 +350,29 @@ def spk_samp_func(x): return np.log(1+x) for (spk, type_idx) in W_spk_types.keys(): print_progress(i) i += 1 - for (spk2, type_jdx) in W_spk_types.keys(): - if spk == spk2: - if type_idx == type_jdx: - if (W_spk_types[(spk, type_idx)] - 1) == 0: - p_spk_types['Stype_Sspk'][(spk, type_idx)] = 0.0 - else: - p_spk_types['Stype_Sspk'][(spk, type_idx)] = \ - spk_samp_func(W_spk_types[(spk, type_idx)]) - else: - min_idx = min(type_idx, type_jdx) - max_idx = max(type_idx, type_jdx) + # Dtype, Dspk + p_spk_types['Dtype_Dspk'][(spk, type_idx)] = \ + spk_samp_func(W_spk_types[(spk, type_idx)]) + # Stype, Sspk + if (W_spk_types[(spk, type_idx)] - 1) == 0: + p_spk_types['Stype_Sspk'][(spk, type_idx)] = 0.0 + else: + p_spk_types['Stype_Sspk'][(spk, type_idx)] = \ + spk_samp_func(W_spk_types[(spk, type_idx)]) + # Dtype, Sspk + for type_jdx in range(len(types)): + min_idx = min(type_idx, type_jdx) + max_idx = max(type_idx, type_jdx) + if (spk, type_jdx) in W_spk_types: p_spk_types['Dtype_Sspk'][(spk, min_idx, max_idx)] = \ spk_samp_func(W_spk_types[(spk, type_idx)]) * \ spk_samp_func(W_spk_types[(spk, type_jdx)]) - else: - if type_idx == type_jdx: - p_spk_types['Stype_Dspk'][(spk, spk2, type_idx)] = \ - spk_samp_func(W_spk_types[(spk, type_idx)]) * \ - spk_samp_func(W_spk_types[(spk2, type_idx)]) - else: - min_idx = min(type_idx, type_jdx) - max_idx = max(type_idx, type_jdx) - p_spk_types['Dtype_Dspk'][(spk, spk2, - min_idx, max_idx)] = \ - spk_samp_func(W_spk_types[(spk, type_idx)]) * \ - spk_samp_func(W_spk_types[(spk2, type_jdx)]) + # Stype, Dspk + for spk2 in speakers: + if (spk2, type_idx) in W_spk_types: + p_spk_types['Stype_Dspk'][(spk, spk2, type_idx)] = \ + spk_samp_func(W_spk_types[(spk, type_idx)]) * \ + spk_samp_func(W_spk_types[(spk2, type_idx)]) return p_spk_types def generate_token_dict(self, std_descr): @@ -425,9 +424,7 @@ def type_speaker_sampling_p(self, std_descr=None, """ assert type_sampling_mode in ['1', 'f', 'f2', 'log', 'fcube'] assert spk_sampling_mode in ['1', 'f', 'f2', 'log', 'fcube'] - # W_types = std_descr['types'] - # speakers = [e for e in std_descr['speakers']] - # W_speakers = [std_descr['speakers'][e] for e in speakers] + p_types = self.type_sample_p(std_descr, type_sampling_mode=type_sampling_mode) p_spk_types = self.sample_spk_p(std_descr, @@ -464,8 +461,7 @@ def type_speaker_sampling_p(self, std_descr=None, if config == 'Dtype_Dspk': for el in p_spk_types[config].keys(): p_spk_types[config][el] = \ - p_types[el[2]] * \ - p_types[el[3]] * \ + p_types[el[1]] * \ p_spk_types[config][el] for config in p_spk_types.keys(): @@ -549,44 +545,48 @@ def sample_batch(self, 'Dtype_Dspk': num_Dtype_Dspk } for config in p_spk_types.keys(): - keys = np.array(list(p_spk_types[config].keys())) - sample_idx = sample_searchidx(cdf[config], sampled_ratio[config]) - sample = keys[sample_idx] - if config == 'Stype_Sspk': - for key in sample: - spk, type_idx = key - tokens = token_dict[int(type_idx), spk] - tok1, tok2 = np.random.choice(tokens, size=2, - replace=False) - sampled_tokens[config].append( - (tok1, tok2)) - if config == 'Stype_Dspk': - for key in sample: - spk1, spk2, type_idx = key - type_idx = int(type_idx) - tok1 = np.random.choice(token_dict[type_idx, spk1]) - tok2 = np.random.choice(token_dict[type_idx, spk2]) - sampled_tokens[config].append((tok1, tok2)) - if config == 'Dtype_Sspk': - for key in sample: - spk, type_idx, type_jdx = key - type_idx = int(type_idx) - type_jdx = int(type_jdx) - tok1 = np.random.choice(token_dict[type_idx, spk]) - tok2 = np.random.choice(token_dict[type_jdx, spk]) - sampled_tokens[config].append((tok1, tok2)) if config == 'Dtype_Dspk': + """ + Dtype_Dspk is particular + We sample two items and check they are different + """ + keys = np.array(list(p_spk_types[config].keys())) + sample_idx = samplepairs_searchidx(cdf[config], sampled_ratio[config]) + sample = keys[sample_idx] for key in sample: - spk1, spk2, type_idx, type_jdx = key - type_idx = int(type_idx) - type_jdx = int(type_jdx) - try: - tok1 = np.random.choice(token_dict[type_idx, spk1]) - tok2 = np.random.choice(token_dict[type_jdx, spk2]) - except Exception: - tok1 = np.random.choice(token_dict[type_idx, spk2]) - tok2 = np.random.choice(token_dict[type_jdx, spk1]) + (spk1, type1), (spk2, type2) = key + type1 = int(type1) + type2 = int(type2) + tok1 = np.random.choice(token_dict[type1, spk1]) + tok2 = np.random.choice(token_dict[type2, spk2]) sampled_tokens[config].append((tok1, tok2)) + else: + keys = np.array(list(p_spk_types[config].keys())) + sample_idx = sample_searchidx(cdf[config], sampled_ratio[config]) + sample = keys[sample_idx] + if config == 'Stype_Sspk': + for key in sample: + spk, type_idx = key + tokens = token_dict[int(type_idx), spk] + tok1, tok2 = np.random.choice(tokens, size=2, + replace=False) + sampled_tokens[config].append( + (tok1, tok2)) + if config == 'Stype_Dspk': + for key in sample: + spk1, spk2, type_idx = key + type_idx = int(type_idx) + tok1 = np.random.choice(token_dict[type_idx, spk1]) + tok2 = np.random.choice(token_dict[type_idx, spk2]) + sampled_tokens[config].append((tok1, tok2)) + if config == 'Dtype_Sspk': + for key in sample: + spk, type_idx, type_jdx = key + type_idx = int(type_idx) + type_jdx = int(type_jdx) + tok1 = np.random.choice(token_dict[type_idx, spk]) + tok2 = np.random.choice(token_dict[type_jdx, spk]) + sampled_tokens[config].append((tok1, tok2)) return sampled_tokens def write_tokens(self, descr=None, proba=None, cdf=None, diff --git a/abnet3/utils.py b/abnet3/utils.py index a4b8c39..9a6d505 100644 --- a/abnet3/utils.py +++ b/abnet3/utils.py @@ -97,6 +97,27 @@ def sample_searchidx(cdf, num_samples): return idx +def samplepairs_searchidx(cdf, num_samples): + """ + Sample indexes based on cdf distribution + This function samples pairs of *different* elements (ie without + replacement) + """ + iterations = 0 # limit 5 iterations to avoid infinite loops + uniform_samples = np.random.random_sample((int(num_samples), 2)) + idx = cdf.searchsorted(uniform_samples, side='right') + while True: + iterations += 1 + if iterations > 5: + print("Warning : more than 5 iterations to sample different pairs") + indices_same_sample = np.where(idx[:, 0] == idx[:, 1]) + num_samples_same = len(indices_same_sample[0]) + if num_samples_same == 0: + break + new_samples = np.random.random_sample((int(num_samples_same), 2)) + idx[indices_same_sample] = cdf.searchsorted(new_samples, side='right') + return idx + def print_token(tok): """Pretty print token for batches From 949049d8c6f8f3984ba5c9d439f8e4e1aaa2a0ab Mon Sep 17 00:00:00 2001 From: Corentin Date: Mon, 23 Apr 2018 12:05:43 +0200 Subject: [PATCH 3/4] correct sampling with dtype dspk --- abnet3/sampler.py | 2 +- abnet3/utils.py | 18 ++++++++++++------ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/abnet3/sampler.py b/abnet3/sampler.py index 572f909..b1f9ed6 100644 --- a/abnet3/sampler.py +++ b/abnet3/sampler.py @@ -551,7 +551,7 @@ def sample_batch(self, We sample two items and check they are different """ keys = np.array(list(p_spk_types[config].keys())) - sample_idx = samplepairs_searchidx(cdf[config], sampled_ratio[config]) + sample_idx = samplepairs_searchidx(cdf[config], sampled_ratio[config], keys=keys) sample = keys[sample_idx] for key in sample: (spk1, type1), (spk2, type2) = key diff --git a/abnet3/utils.py b/abnet3/utils.py index 9a6d505..83d1145 100644 --- a/abnet3/utils.py +++ b/abnet3/utils.py @@ -97,25 +97,31 @@ def sample_searchidx(cdf, num_samples): return idx -def samplepairs_searchidx(cdf, num_samples): +def samplepairs_searchidx(cdf, num_samples, keys): """ Sample indexes based on cdf distribution This function samples pairs of *different* elements (ie without replacement) + It samples randomly pairs of elements, and reruns the elements that have + one thing in common (this is used to sample pairs of (spk, type) + where both spk and type are different """ iterations = 0 # limit 5 iterations to avoid infinite loops uniform_samples = np.random.random_sample((int(num_samples), 2)) idx = cdf.searchsorted(uniform_samples, side='right') while True: iterations += 1 - if iterations > 5: - print("Warning : more than 5 iterations to sample different pairs") - indices_same_sample = np.where(idx[:, 0] == idx[:, 1]) - num_samples_same = len(indices_same_sample[0]) + if iterations > 30: + print("Warning : more than 30 iterations to sample different pairs") + pair_keys = keys[idx] + index_same_spk = np.where(pair_keys[:, 0, 0] == pair_keys[:, 1, 0])[0] + index_same_type = np.where(pair_keys[:, 0, 1] == pair_keys[:, 1, 1])[0] + indices_to_change = np.concatenate((index_same_spk, index_same_type)) + num_samples_same = len(indices_to_change) if num_samples_same == 0: break new_samples = np.random.random_sample((int(num_samples_same), 2)) - idx[indices_same_sample] = cdf.searchsorted(new_samples, side='right') + idx[indices_to_change] = cdf.searchsorted(new_samples, side='right') return idx def print_token(tok): From 10bee06ff980cf07b3e59e385433c3a3081f0c3b Mon Sep 17 00:00:00 2001 From: Corentin Date: Mon, 23 Apr 2018 18:47:04 +0200 Subject: [PATCH 4/4] correct wrong sampling --- abnet3/sampler.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/abnet3/sampler.py b/abnet3/sampler.py index b1f9ed6..1a804d8 100644 --- a/abnet3/sampler.py +++ b/abnet3/sampler.py @@ -360,7 +360,7 @@ def spk_samp_func(x): return np.log(1+x) p_spk_types['Stype_Sspk'][(spk, type_idx)] = \ spk_samp_func(W_spk_types[(spk, type_idx)]) # Dtype, Sspk - for type_jdx in range(len(types)): + for type_jdx in range(type_idx + 1, len(types)): min_idx = min(type_idx, type_jdx) max_idx = max(type_idx, type_jdx) if (spk, type_jdx) in W_spk_types: @@ -369,7 +369,7 @@ def spk_samp_func(x): return np.log(1+x) spk_samp_func(W_spk_types[(spk, type_jdx)]) # Stype, Dspk for spk2 in speakers: - if (spk2, type_idx) in W_spk_types: + if spk != spk2 and (spk2, type_idx) in W_spk_types: p_spk_types['Stype_Dspk'][(spk, spk2, type_idx)] = \ spk_samp_func(W_spk_types[(spk, type_idx)]) * \ spk_samp_func(W_spk_types[(spk2, type_idx)]) @@ -557,6 +557,8 @@ def sample_batch(self, (spk1, type1), (spk2, type2) = key type1 = int(type1) type2 = int(type2) + assert spk1 != spk2 + assert type1 != type2 tok1 = np.random.choice(token_dict[type1, spk1]) tok2 = np.random.choice(token_dict[type2, spk2]) sampled_tokens[config].append((tok1, tok2)) @@ -575,6 +577,7 @@ def sample_batch(self, if config == 'Stype_Dspk': for key in sample: spk1, spk2, type_idx = key + assert spk1 != spk2 type_idx = int(type_idx) tok1 = np.random.choice(token_dict[type_idx, spk1]) tok2 = np.random.choice(token_dict[type_idx, spk2]) @@ -582,6 +585,7 @@ def sample_batch(self, if config == 'Dtype_Sspk': for key in sample: spk, type_idx, type_jdx = key + assert type_idx != type_jdx type_idx = int(type_idx) type_jdx = int(type_jdx) tok1 = np.random.choice(token_dict[type_idx, spk])