-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDensity_based_method.R
More file actions
450 lines (391 loc) · 17 KB
/
Density_based_method.R
File metadata and controls
450 lines (391 loc) · 17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
# Packages for the density-based / mixture-model / functional clustering demos.
library(pdfCluster)
library(cluster)
library(fpc)
library(prabclus)
library(smacof)
library(tidyverse)
# NOTE(review): duplicate of the library(cluster) call above; harmless but redundant.
library(cluster)
library(dbscan)
library(factoextra) # for silhouette plots
library(mclust)
library(robustHD) # MAD std
# EMMIXskew is not on CRAN; install from a local source tarball, e.g.:
# install.packages("C:/Users/loren/Desktop/EMMIXskew_1.0.3.tar.gz",repos=NULL,type="source")
library(EMMIXskew)
library(poLCA)
library(nomclust)
library(fda) # Functional data analysis
library(funFEM)
library(robustbase)
library(mixtools)
library(tclust)
# Fix the RNG seed so the EM random initialisations below are reproducible.
set.seed(123456)
# NOTE(review): help() opens an interactive help page; side effect only —
# consider removing for non-interactive runs.
help("EmSkew")
# DENSITY BASED METHODS -> MIXTURE MODELS + FUNCTIONAL DATA (mixtures on
# functional data).
# NB: care with the BIC conventions of the different packages
# ("small is good" vs "large is good").
# Gaussian Mixture Models (GMM) ----
data(oliveoil)
olive <- oliveoil[,3:10]  # the 8 chemical composition variables
solive <- scale(olive)    # standardised data (used for the PCA below)
# Mclust automatically selects the best GMM by BIC among all covariance
# matrix models (e.g. "VVV", ...) that can be fitted without a
# degenerating likelihood.
help("Mclust")
molive <- Mclust(olive,G=1:15)
# G specifies the numbers of mixture components to try.
# With modelNames a subset of covariance models can be chosen, e.g.:
moliveVVV <- Mclust(olive,G=1:10,modelNames="VVV")
molivex <- Mclust(olive,G=1:10,modelNames=c("EEE","VVV"))
# Information available in the output object:
molive$classification  # clustering vector
molive$parameters      # estimated parameters
molive$z               # posterior probabilities p_ik that point i was
                       # generated by mixture component k
summary(molive)        # best model with its clustering table
summary(molive$BIC)    # best 3 models in terms of BIC
adjustedRandIndex(molive$classification,oliveoil$region) # Fairly good
levels(oliveoil$region) # NB 9 regions
# Diagnostic plots via plot(molive):
# 1: BIC (meaningful); 2: classification (p_ik); 3: uncertainty (1 - p_ik);
# 4: density.
# plot(molive)
molive3 <- Mclust(olive,G=3)
adjustedRandIndex(molive3$classification,oliveoil$macro.area)
levels(oliveoil$macro.area)
# plot(molive3)
# PCA visualisation for Mclust; PCA can be clearer for not too
# low-dimensional data.
# FIX: was princomp(solive, F, T) — spell out the argument names and use
# TRUE/FALSE (T/F are reassignable and bare positional logicals are
# error-prone).
# cor = FALSE: use the covariance matrix (the data are already scaled).
# scores = TRUE: compute the coordinates on each principal component.
sprolive <- princomp(solive, cor = FALSE, scores = TRUE)
plot(sprolive$scores,col=molive$classification,
pch=clusym[molive$classification])
# EmSkew: mixtures of skew and heavy-tailed distributions.
# Here: mixtures of multivariate skew-t ("mst") distributions on the olive
# oil data with g = 1,...,12 components.
stolive <- list()  # fitted EmSkew objects, one per number of components
bicvals <- ariarea <- ariregion <- numeric(0)
# EmSkew's EM algorithm can fail (bad random initialisation, degenerating
# likelihood). Each fit is therefore retried: up to 3 extra attempts with
# ncov=3, then up to 3 attempts each with the fallback covariance models
# ncov=4 and ncov=2 (see ?EmSkew for the ncov codes). This replaces the
# three copy-pasted while loops of the original.
tryattempts <- 3
for (i in 1:12){
  print(i)
  tst <- try(stolive[[i]] <- EmSkew(olive,g=i,distr="mst",ncov=3))
  for (ncovcode in c(3, 4, 2)) {
    trycounter <- 1
    # FIX: inherits() is the robust way to detect try() failures
    # (class(tst)=="try-error" misbehaves when the class vector has
    # length > 1); || / && are the correct scalar operators in conditions.
    while ((is.null(tst) || inherits(tst, "try-error")) &&
           trycounter <= tryattempts) {
      print("Error, try again")
      tst <- try(stolive[[i]] <- EmSkew(olive,g=i,distr="mst",ncov=ncovcode))
      trycounter <- trycounter + 1
    }
  }
  if (is.null(tst) || inherits(tst, "try-error")) {
    # FIX: if every attempt failed the original crashed on stolive[[i]];
    # record NA instead so the remaining component counts still get fitted.
    bicvals[i] <- ariarea[i] <- ariregion[i] <- NA
  } else {
    bicvals[i] <- stolive[[i]]$bic
    ariarea[i] <- adjustedRandIndex(stolive[[i]]$clust,oliveoil$macro.area)
    ariregion[i] <- adjustedRandIndex(stolive[[i]]$clust,oliveoil$region)
  }
}
plot(1:12,bicvals,typ="l",xlab="Number of clusters",ylab="BIC")
# NB EMMIX uses the "small is good" version of BIC.
# EmSkew.contours(olive, stolive[[7]]) # Diagnostic plot offered by EMMIXskew
# PCA plots of the 7- and 3-component solutions.
# ($clust spelled out; the original relied on $ partial matching of $clus.)
plot(sprolive$scores,col=stolive[[7]]$clust,pch=clusym[stolive[[7]]$clust])
plot(sprolive$scores,col=stolive[[3]]$clust,pch=clusym[stolive[[3]]$clust])
# FLEXMIX: MIXTURE MODELS FOR CATEGORICAL DATA (latent class analysis).
# flexmixedruns() wraps the flexible flexmix EM algorithm, for which users
# can write methods for specific models; it can also fit mixed
# continuous/categorical data.
set.seed(887766)
# NOTE(review): hard-coded absolute path; the data files below are read
# relative to this directory.
setwd("C:/Users/loren/Desktop/Henning_Exam/Data")
veronica <- read.table("veronica.dat")
# continuous=0, discrete=583: by default continuous variables come first,
# so this declares all 583 variables categorical. With xvarsorted=FALSE the
# variable kinds can be placed freely. Uses 20 random initialisations by
# default.
veronicabernm <- flexmixedruns(veronica,continuous=0,discrete=583,n.cluster=1:10)
# "Small is good" version of BIC.
plot(1:10,veronicabernm$bicvals,typ="l",xlab="Number of clusters",ylab="BIC")
# The 6-cluster solution is used in everything below; hoist it once.
flex6 <- veronicabernm$flexout[[6]]
cl6 <- flex6@cluster
# Jaccard distances between observations, visualised by a 2-d MDS
# configuration coloured by the 6-cluster solution.
jveronica <- dist(veronica,method="binary")
mdsveronica <- mds(jveronica, ndim = 2)
plot(mdsveronica$conf, col = cl6, pch = clusym[cl6])
# Parameter estimates live in the flexmix S4 object (18 slots, e.g.
# @cluster, @logLik, @prior, @components, @k); inspect with:
str(flex6, max.level = 2)
# Example: fitted category probabilities for component 5 — per variable,
# the estimated probabilities of each category, as obtained by maximising
# the likelihood with the EM algorithm. The "[[1]]" is only needed to
# reach the parameter list and carries no further meaning.
flex6@components[[5]][[1]]@parameters$pp
# Checking the local independence assumption: cluster the VARIABLES by
# Jaccard distance and draw a heatmap with rows (observations) ordered by
# cluster and columns by the variable clustering.
veronicam <- as.matrix(veronica)
vveronica <- dist(t(veronicam),method="binary")
varclust <- hclust(vveronica,method="average")
heatmap(veronicam[order(cl6), ],
        Rowv = NA, Colv = as.dendrogram(varclust),
        RowSideColors = palette()[cl6][order(cl6)],
        col = c(0, 1), scale = "none")
# FUNCTIONAL DATA ----
# Covid data: one row per country; columns 5:559 hold the new cases over
# one week per 1000 inhabitants on 555 consecutive days.
# Functional-data view: each observation is a function X_i(t),
# i = 1,...,n, observed at p discrete points t (here days) over a
# continuous range T = [tmin, tmax] — so n*p discrete data points in
# total. Plots show time on the x-axis and the curve values on the y-axis.
covid21 <- read.table("covid2021.dat")
covid21v <- as.matrix(covid21[,5:559])
days <- 1:555
# Raw curves for all 179 countries.
plot(days,covid21v[1,],type="l",ylim=c(0,25),ylab="New cases over one
week per 1000 inhabitants",xlab="Day (1 April 2020-7 October 2021)",
main="Covid weekly new cases for 179 countries")
for (country in 2:179) {
  points(days, covid21v[country, ], type = "l")
}
# Single country (print covid21[,1] to find the row numbers).
i <- 79 # Italy
i <- 69 # Haiti (overrides the previous line; pick one)
plot(days,covid21v[i,],cex=0.5,ylab="New cases over one week per
1000 inhabitants",xlab="Day (1 April 2020-7 October 2021)",main=covid21[i,1])
# Constructing a B-spline basis: functional data objects are linear
# combinations of basis functions; rangeval gives the evaluation interval,
# nbasis the number p of basis functions.
help("pca.fd")
bbasis10 <- create.bspline.basis(rangeval = c(1,555),nbasis=10)
tcovid <- t(covid21v)  # Data2fd expects time points in rows
# Data2fd fits the basis coefficients of each curve.
fdcovid10 <- Data2fd(days, y = tcovid, basisobj = bbasis10)
bbasis <- create.bspline.basis(c(1,555),nbasis=100) # same with p=100
fdcovid <- Data2fd(days, y = tcovid, basisobj = bbasis)
# Plot the two bases.
plot(bbasis10)
plot(bbasis)
# Smoothed curves plus their mean function.
plot(fdcovid)
mcovid <- mean.fd(fdcovid)
lines(mcovid,col=2,lwd=5)
# Smooth fits of individual countries (Italy = 79, Haiti = 69) for both
# basis sizes: choosing p is a trade-off between capturing each curve's
# behaviour and not fitting the noise.
plotfit.fd(tcovid, days, fdcovid10, index = 79, cex.pch = 0.5)
plotfit.fd(tcovid, days, fdcovid10, index = 69, cex.pch = 0.5)
plotfit.fd(tcovid, days, fdcovid, index = 79, cex.pch = 0.5)
plotfit.fd(tcovid, days, fdcovid, index = 69, cex.pch = 0.5)
# Functional Principal Component Analysis for visualisation
# (dimension reduction for functional data).
covidpca <- pca.fd(fdcovid, nharm = 5)
plot(covidpca$harmonics) # PCs phi_k
covidpca$varprop # Percentage of variance
# [1] 0.40871917 0.18555649 0.11717077 0.05581147 0.04606967
cumsum(covidpca$varprop) # Cumulative percentage of variance
# [1] 0.4087192 0.5942757 0.7114464 0.7672579 0.8133276
# Continent as a clustering-label vector for colouring the score plots:
# "Africa" "Asia" "Australia" "Central America" "Europe" "North America"
# "South America"
ncontinent <- as.numeric(as.factor(covid21$continent))
levels(as.factor(covid21$continent))
pairs(covidpca$scores,col=ncontinent,pch=clusym[ncontinent]) # PCA scores
# Create a functional data object of the PCA approximations: functional
# PCA approximates each original curve by a projection on J < p PCs,
# i.e. country i ~ mean curve + sum_j scores[i,j] * phi_j.
covidpcaapprox <- covidpca$harmonics
# FIX: the original built the coefficient matrix one country at a time,
# growing it with cbind() in a loop (O(n^2) copying). One matrix product
# reproduces it exactly: harmonics$coefs is (nbasis x nharm), t(scores) is
# (nharm x n), and the mean coefficients (an nbasis x 1 matrix) are
# recycled column-wise via as.vector().
covidpcaapprox$coefs <- covidpca$harmonics$coefs %*% t(covidpca$scores) +
  as.vector(mcovid$coefs)
dimnames(covidpcaapprox$coefs)[[2]] <- covid21[,1]
# Approximated curves vs the raw data for a few countries.
plotfit.fd(t(covid21v),1:555,covidpcaapprox,index=79,cex.pch=0.5)
plotfit.fd(t(covid21v),1:555,covidpcaapprox,index=69,cex.pch=0.5)
plotfit.fd(t(covid21v),1:555,covidpcaapprox,index=164,cex.pch=0.5)
# A mixture-based method for functional clustering: "FunFEM".
# funFEM finds a low D-dimensional subspace (D < K) on which the clusters
# are optimally separated, via an extra "Fisher step" (based on Fisher's
# LDA) inside the EM algorithm; the B-spline coefficient data are then
# modelled as a Gaussian mixture with the usual covariance-model
# constraints. The remaining E and M steps work in the reduced subspace.
# The following (default model "AkjBk") does not work very well:
covidcluster <- funFEM(fdcovid,K=2:10)
# Shows errors; not all models can be fitted (degenerating likelihood).
# We still get a final result, but only K=2 was fitted without error.
# Try out all models / numbers of clusters and pick the best by BIC, as in
# "classical" mixture models.
set.seed(1234567)
femmodels <- c("DkBk", "DkB", "DBk",
"DB", "AkjBk", "AkjB", "AkB", "AkBk", "AjBk", "AjB", "ABk",
"AB")
nmodels <- length(femmodels)
femresults <- list() # funFEM output for every model in femmodels
bestk <- bestbic <- numeric(0)
# bestk: best K per model; bestbic: best BIC value per model.
K <- 2:10 # numbers of clusters to try (FIX: was K=2:10; use <- for assignment)
fembic <- matrix(NA,nrow=nmodels,ncol=max(K))
# fembic holds the BIC for all models (rows) and K (columns);
# NA for combinations that cannot be fitted.
for (i in 1:nmodels){ # This takes a long time!!
print(femmodels[i])
femresults[[i]] <- funFEM(fdcovid,model=femmodels[i],K=K)
fembic[i,K] <- femresults[[i]]$allCriterions$bic
# FIX: which.max() ignores NAs and always returns a single index; the
# original which(fembic[i,]==max(...)) could return several indices on a
# tie, making the scalar assignment bestk[i] <- ... fail.
bestk[i] <- which.max(fembic[i,])
bestbic[i] <- max(fembic[i,K],na.rm=TRUE)
}
besti <- which.max(bestbic) # best model (single index even on ties)
femmodels[besti] # "ABk": Sigma_k spherical and equal, beta_k can vary with k.
bestk[besti] # K=8 optimal for model 11 "ABk"
# Reproduce the winning fit directly (up to random initialisation):
femresult11 <- funFEM(fdcovid,model="ABk",K=8)
# (Saving femresults[[11]] instead reportedly doesn't work.)
# Plot BIC values for all models and K:
i <- 1
plot(1:max(K),fembic[i,],col=i,pch=i,
ylim=c(min(fembic,na.rm=TRUE),max(fembic,na.rm=TRUE)),type="n")
for(i in 1:nmodels)
text(1:max(K),fembic[i,],femmodels[i],col=i)
# The clustering is in the cls component of the output.
table(femresult11$cls,covid21$continent)
adjustedRandIndex(femresult11$cls,covid21$continent) # 0.2038502
# Clusters against longitude/latitude (columns 4 and 3).
plot(covid21[,4:3],col=femresult11$cls,pch=clusym[femresult11$cls])
# Print the countries in each cluster.
for(i in 1:femresult11$K){
print(i)
print(covid21[femresult11$cls==i,1])
}
# Clusters on principal components
# (covidpca <- pca.fd(fdcovid, nharm = 5) from above;
#  fdcovid <- Data2fd(1:555, y=t(as.matrix(covid21v)), basisobj=bbasis))
pairs(covidpca$scores,col=femresult11$cls,pch=19)
# Visualisation of the discriminative subspace U: project each curve's
# basis coefficients onto U.
par(ask=FALSE)
fdproj <- t(fdcovid$coefs) %*% femresult11$U
# FIX: was col=femresults11$cls — no object "femresults11" exists (the
# fitted model is femresult11), so this line previously errored.
pairs(fdproj,col=femresult11$cls,pch=19)
plot(fdproj,col=femresult11$cls,pch=19,xlab="DC 1",ylab="DC 2")
# femresult11$P has all the probabilities p_ik for the objects to belong
# to the clusters. Here all of these are larger than 0.992 or smaller
# than 0.008, so all points are very confidently classified. Although
# probably correctly computed, this looks overconfident, due to the large
# degrees of freedom for finding the optimally discriminating subspace.
# Plot the curves coloured by cluster.
par(ask=FALSE)
plot(1:555,covid21v[1,],type="l",ylim=c(0,25),
ylab="New cases over one week per 1000 inhabitants",
xlab="Day (1 April 2020-7 October 2021)",lwd=1.5,col=femresult11$cls[1])
for(i in 2:179)
points(1:555,covid21v[i,],type="l",col=femresult11$cls[i],lwd=1.5)
# Plot the cluster mean curves (coefficients from the fitted means).
clmeans <- fdcovid
clmeans$coefs <- t(femresult11$prms$my)
plot(clmeans,lwd=3) # col doesn't seem to work here, neither lwd
legend(100,10,legend=1:8,col=c(1:6,1:2),lty=c(1:5,1:3))
# Plot each cluster separately with its (raw-data) mean curve in red;
# par(ask=TRUE) pauses between the K plots.
par(ask=TRUE)
for (k in 1:femresult11$K){
plot(1:555,covid21v[1,],type="l",ylim=c(0,25),
ylab="New cases over one week per 1000 inhabitants",
xlab="Day (1 April 2020-7 October 2021)",col=as.integer(femresult11$cls[1]==k))
for(i in 2:179)
points(1:555,covid21v[i,],type="l",col=as.integer(femresult11$cls[i]==k))
meank <- colMeans(covid21v[femresult11$cls==k,])
points(1:555,meank,type="l",lwd=5,col=2)
}
par(ask=FALSE)