Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions src/mixtures/mixturemodel.jl
Original file line number Diff line number Diff line change
Expand Up @@ -477,6 +477,34 @@ rand(rng::AbstractRNG, s::MixtureSampler{Univariate}) =
rand(rng::AbstractRNG, d::MixtureModel{Univariate}) =
rand(rng, component(d, rand(rng, d.prior)))

function rand(rng::AbstractRNG, d::MixtureModel{Univariate}, n::Int)
Comment thread
mmikhasenko marked this conversation as resolved.
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the wrong dispatch, isn't it? If one wants to draw multiple samples from a distribution d, automatically Distributions dispatches to drawing samples with sampler(d). In the case of mixture models this is MixtureSampler. So this should actually be defined for MixtureSampler{Univariate}, not MixtureModel{Univariate} AFAICT?

However, generally for univariate distributions one also shouldn't define rand(rng, dist, n) but only the in-place method _rand!(rng, sampler(dist), out) (#1905 will fix possibly incorrectly allocated output arrays) if multiple samples can be generated more efficiently:

return rand!(rng, sampler(s), out)
function rand!(rng::AbstractRNG, s::Sampleable{Univariate}, A::AbstractArray{<:Real})
return _rand!(rng, sampler(s), A)
end
function _rand!(rng::AbstractRNG, sampler::Sampleable{Univariate}, A::AbstractArray{<:Real})
for i in eachindex(A)
A[i] = rand(rng, sampler)
end
return A
end

So AFAICT we should only define

Suggested change
function rand(rng::AbstractRNG, d::MixtureModel{Univariate}, n::Int)
function _rand!(rng::AbstractRNG, d::MixtureSampler{Univariate}, x::AbstractArray{<:Real})

Alternatively, if we never want to use MixtureSampler for sampling MixtureModel{Univariate}, we should define sampler(d::MixtureModel{Univariate}) = d (or limit the definition below to sampler(d::MixtureModel{Multivariate}) = MixtureSampler(d)) and here define

Suggested change
function rand(rng::AbstractRNG, d::MixtureModel{Univariate}, n::Int)
function _rand!(rng::AbstractRNG, d::MixtureModel{Univariate}, x::AbstractArray{<:Real})

counts = rand(rng, Multinomial(n, probs(d.prior)))

# Find the component with the maximum count to minimize resizing
max_count_idx = argmax(counts)
max_count = counts[max_count_idx]

# Sample from the component with maximum count first and use it directly
x = rand(rng, component(d, max_count_idx), max_count)

# Resize to the full size and continue with other components
resize!(x, n)
offset = max_count

for i in eachindex(counts)
if i != max_count_idx
ni = counts[i]
if ni > 0
c = component(d, i)
last_offset = offset + ni - 1
rand!(rng, c, @view(x[(begin+offset):(begin+last_offset)]))
offset = last_offset + 1
end
end
end
Comment on lines +483 to +504
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For the in-place method, it seems this could be simplified to

Suggested change
# Find the component with the maximum count to minimize resizing
max_count_idx = argmax(counts)
max_count = counts[max_count_idx]
# Sample from the component with maximum count first and use it directly
x = rand(rng, component(d, max_count_idx), max_count)
# Resize to the full size and continue with other components
resize!(x, n)
offset = max_count
for i in eachindex(counts)
if i != max_count_idx
ni = counts[i]
if ni > 0
c = component(d, i)
last_offset = offset + ni - 1
rand!(rng, c, @view(x[(begin+offset):(begin+last_offset)]))
offset = last_offset + 1
end
end
end
offset = 0
for (c, ni) in zip(components(d), counts)
last_offset = offset + ni - 1
rand!(rng, c, @view(x[(begin+offset):(begin+last_offset)]))
offset = last_offset + 1
end

return shuffle!(rng, x)
end

# multivariate mixture sampler for a vector
_rand!(rng::AbstractRNG, s::MixtureSampler{Multivariate}, x::AbstractVector{<:Real}) =
rand!(rng, s.csamplers[rand(rng, s.psampler)], x)
Expand Down
56 changes: 56 additions & 0 deletions src/truncate.jl
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,62 @@ function rand(rng::AbstractRNG, d::Truncated)
end
end

function rand(rng::AbstractRNG, d::Truncated, n::Int)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
function rand(rng::AbstractRNG, d::Truncated, n::Int)
function _rand!(rng::AbstractRNG, d::Truncated, x::AbstractArray{<:Real})

And if there are any precomputations that could be factored out (which doesn't seem to be the case here), then we should consider defining a dedicated sampler.

d0 = d.untruncated
tp = d.tp
lower = d.lower
upper = d.upper

# Use the same three regimes as the scalar version
if tp > 0.25
# Regime 1: Rejection sampling with batch optimization
# Get the correct type and memory by sampling from the untruncated distribution
samples = rand(rng, d0, n)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
samples = rand(rng, d0, n)
rand!(rng, d0, x)

n_collected = 0
max_batch = 0
batch_buffer = Vector{eltype(samples)}()
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A separate batch buffer seems unnecessary, in particular the resizing might be inefficient? Instead of copying from a separate vector we could just use the output vector and move samples around there and keep track of the last accepted index.

while n_collected < n
n_remaining = n - n_collected
n_expected = n_remaining / tp
δn_expected = sqrt(n_remaining * tp * (1 - tp))
n_batch_f = n_expected + 3δn_expected
n_batch = ceil(Int, n_batch_f)
if n_batch > max_batch
resize!(batch_buffer, n_batch)
max_batch = n_batch
end
rand!(rng, d0, batch_buffer)
for i in 1:n_batch
s = batch_buffer[i]
if _in_closed_interval(s, lower, upper)
n_collected += 1
samples[n_collected] = s
n_collected == n && break
end
end
end
return samples
elseif tp > sqrt(eps(typeof(float(tp))))
# Regime 2: Quantile-based sampling
# Sample one value first to determine the correct type
sample_type = typeof(quantile(d0, d.lcdf + rand(rng) * d.tp))
samples = Vector{sample_type}(undef, n)
Comment on lines +273 to +275
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# Sample one value first to determine the correct type
sample_type = typeof(quantile(d0, d.lcdf + rand(rng) * d.tp))
samples = Vector{sample_type}(undef, n)

for i in 1:n
samples[i] = quantile(d0, d.lcdf + rand(rng) * d.tp)
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should probably at least use a Random.Sampler for rand(rng); maybe it's even faster to call rand(rng, n) (despite the allocation).

end
return samples
else
# Regime 3: Log-space computation
# Sample one value first to determine the correct type
sample_type = typeof(invlogcdf(d0, logaddexp(d.loglcdf, d.logtp - randexp(rng))))
samples = Vector{sample_type}(undef, n)
Comment on lines +282 to +284
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
# Sample one value first to determine the correct type
sample_type = typeof(invlogcdf(d0, logaddexp(d.loglcdf, d.logtp - randexp(rng))))
samples = Vector{sample_type}(undef, n)

for i in 1:n
samples[i] = invlogcdf(d0, logaddexp(d.loglcdf, d.logtp - randexp(rng)))
end
return samples
end
end

## show

function show(io::IO, d::Truncated)
Expand Down
Loading