forked from joedekeizer/Simulations-for-GC-RCT
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path1_sim_data.R
More file actions
123 lines (103 loc) · 5 KB
/
1_sim_data.R
File metadata and controls
123 lines (103 loc) · 5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
################################################################################
###### (1) Simulations to create the data sets
################################################################################
### This code will create a folder in the specified work directory named /complex_nXX_ateYY where XX is the sample size (N.effectif) and YY the effect size (Size.Effect)
### In the folder 10000 simulated data sets will be created and saved as .Rdata, according to the relations described in the sim.base function
################################################################################
library(dplyr)
library(doParallel)
### Initialization of the parameters
SizeEffect <- "log(3)" #The size effect (character)
N.effectif <- 200 #The sample size (numeric)
N.stop <- 10000 #The number of simulated data sets (numeric)
path <- "/home/Simulations/" #Work directory
setwd(path)
### Create the folder for the data if it does not already exist
Size.Effect <- gsub("[[:punct:]]", "", SizeEffect) #Removing any special character for folder name
SizeEffect <- eval(parse(text = SizeEffect)) #Setting to numeric
pathbases <- paste0(path, "/complex_n", N.effectif, "_ate", Size.Effect, "/")
ifelse(!dir.exists(file.path(pathbases)), dir.create(file.path(pathbases)), FALSE)
### Parallelisation of the code (PSOCK if Windows and FORK if Linux environment)
nb.cluster <- parallel::detectCores() - 1 #All cores minus one
if(.Platform[[1]] == "windows") {cl <- makeCluster(nb.cluster, type="PSOCK") } else{ cl <- makeCluster(nb.cluster, type="FORK") }
registerDoParallel(cl)
if(.Platform[[1]] == "windows") {clusterEvalQ(cl, {library(dplyr);library(MASS);library(splines);library(caret);library(SuperLearner);library(glmnet);library(cvAUC)})}
################################################################################
### Function to simulate the data
sim.base <- function(N, beta1.t, iter)
{
### Coefficient from Supplementary Table A1
beta0.x <- -0.4
beta1.x <- log(2)
beta0.o <- -2
beta1.o <- log(2)
### Simulated variables from the complex scenario Figure 1
.x1 <- rnorm(N, 0, 1) #1
.x2 <- rnorm(N, beta0.x + beta1.x * .x1, 1) #2
.x3 <- rnorm(N, beta0.x - beta1.x * .x1 - beta1.x * .x2, 1) #3
.x5 <- rnorm(N, 0, 1) #4
.x6 <- 1 * (rnorm(N, 0, 1) > 0.66) # prevalence ~25% #5
.x7 <- 1 * (rnorm(N, beta0.x - beta1.x * .x5, 1) > (-0.40)) # prevalence ~50% #6
.x8 <- rnorm(N, beta0.x - beta1.x * .x6, 1) #7
.x9 <- 1 * (rnorm(N, beta0.x + beta1.x * .x7, 1) > (-0.80)) # prevalence ~75% (77%) #8
.x10 <- rnorm(N, beta0.x + beta1.x * .x8, 1) #9
.x11 <- rnorm(N, 0, 1) #10
.x12 <- 1 * (rnorm(N, beta0.x + beta1.x * .x9, 1) > (0.84)) # prevalence ~25% #11
.x14 <- rnorm(N, beta0.x - beta1.x * .x12 - beta1.x * .x11, 1) #12
.x15 <- rnorm(N, beta0.x - beta1.x * .x12, 1) #13
.x18 <- rnorm(N, 0, 1) #14
.x19 <- 1 * (rnorm(N, 0, 1) > qnorm(0.75)) # prevalence ~25% #15
.x20 <- 1 * (rnorm(N, 0, 1) > (0.66)) # prevalence ~25% #16
.x21 <- rnorm(N, 0, 1) #17
data.obs <- data.frame(x1 = .x1,
x2 = .x2,
x3 = .x3,
x5 = .x5,
x6 = .x6,
x7 = .x7,
x8 = .x8,
x9 = .x9,
x10 = .x10,
x11 = .x11,
x12 = .x12,
x14 = .x14,
x15 = .x15,
x18 = .x18,
x19 = .x19,
x20 = .x20,
x21 = .x21)
data.obs$ttt = rbinom(N, size = 1, prob = 0.5)
### Causal relations between the variables and the outcome from the complex scenario Figure 1
bx <- beta0.o +
beta1.o * (data.obs$x2 > -0.44) -
beta1.o * data.obs$x3 + (beta1.o / 2) * (data.obs$x3^2) +
beta1.o * data.obs$x6 +
beta1.o * data.obs$x7 +
beta1.o * data.obs$x10 +
beta1.o * 0.5 * (data.obs$x11^2) -
beta1.o * data.obs$x14 -
beta1.o * (data.obs$x15 > -0.55) +
beta1.o * data.obs$x18 +
beta1.o * data.obs$x19 +
beta1.o * 0.5 * data.obs$ttt*data.obs$x18 +
beta1.t * data.obs$ttt
pr.o.obs <- plogis(bx)
data.obs$outcome <- rbinom(N, 1, prob = pr.o.obs)
### Save each data set as data.obs in a .Rdata file
save(data.obs, file = paste0(pathbases,"sim_n", N.effectif, "_ate", Size.Effect,"_",formatC(iter, width = 5, format = "d", flag = "0"), ".Rdata"))
return(data.obs)
}
################################################################################
### Running the function with parallel processing (not necessary but recommended)
if(.Platform[[1]] == "windows") {
### Windows
foreach(i = 1:N.stop, .inorder = FALSE, .verbose = T,
.export=ls(envir=globalenv()),
.packages = c("dplyr")
) %dopar% {sim.base(N = N.effectif, beta1.t = SizeEffect, iter = i)}
} else {
### Linux
foreach(i = 1:N.stop, .inorder = FALSE, .verbose = T
) %dopar% {sim.base(N = N.effectif, beta1.t = SizeEffect, iter = i)}
}
stopCluster(cl)