-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathCalcD_P_value function.R
More file actions
190 lines (176 loc) · 8.2 KB
/
CalcD_P_value function.R
File metadata and controls
190 lines (176 loc) · 8.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
################################################################################
### Modification of the origional CalcD function to save the p_value as well ###
############## Modified from CalcD in the evobiR package #######################
################################################################################
# load dependent packages
library(seqinr)
# write the modified function so that it saves the P-value as well as the d value
CalcD_P_value <- function(alignment = "alignment.fasta",
sig.test="N", # options are "N", "B", "J"
ambig="D", #options are D R I
block.size = 1000, # size of blocks to drop in jacknife
replicate=1000,
align.format='fasta'){
# this function is used regardless of approach
d.calc <- function(alignment){
abba <- 0 # set up my variables
baba <- 0 # set up my variables
for(i in 1:ncol(alignment)){ # run through all sites
if(length(unique(alignment[, i])) == 2){ # unique(c(p1,p2,p3,o))==2 aka biallelic
if(alignment[1, i] != alignment[2, i]){ # p1 != p2 aka different resolutions in p1 and p2
if(alignment[4, i] != alignment[3, i]){ # o != p3 durand says "less likely pattern due to seq. errors
if(alignment[3, i] == alignment[1, i]) {baba <- baba + 1} # add to the count of baba sites
if(alignment[2, i] == alignment[3, i]) {abba <- abba + 1} # add to the count of abba sites
}
}
}
}
d <- (abba - baba) / (abba + baba) #what its all about
results <- list()
results[[1]] <- d
results[[2]] <- abba
results[[3]] <- baba
return(results)
}
#### Test of empirical data
alignment <- read.alignment(alignment, format = align.format, forceToLower=T) # read in the alignment
alignment.matrix <- matrix(, length(alignment$nam), nchar(alignment$seq[[1]])) # make a matrix for the alignment
for(i in 1:length(alignment$nam)){
alignment.matrix[i, ] <- unlist(strsplit(alignment$seq[[i]], "")) # fill in the matrix
}
#### This section is being added to deal with reccurent
#### Requests to deal with ambiguity in sequence data
# R A or G
# Y C or T
# S G or C
# W A or T
# K G or T
# M A or C
## First we deal with the situation where the user
## wishes to simply drop ambig sites
if(ambig == "D"){
target <- c("a","c","g","t")
keep <- vector()
for(i in 1:ncol(alignment.matrix)){
keep[i] <- all(alignment.matrix[,i] %in% target)
}
alignment.matrix <- alignment.matrix[,keep]
}
## Next we deal with the situation where users want to
## randomly resolve ambigous sites
if(ambig == "R"){
# I still want to limit sites so we first drop
# those sites that look like 3 or 4 possibilities
target <- c("a", "c", "g", "t", "r",
"y", "s", "w", "k", "m")
keep <- vector()
for(i in 1:ncol(alignment.matrix)){
keep[i] <- all(alignment.matrix[,i] %in% target)
}
alignment.matrix <- alignment.matrix[,keep]
# this function will be applied to each site in our data
# it resolves ambiguities randomly
resolver <- function(x){
if(x=="r") z <- sample(c("a", "g"), 1)
if(x=="y") z <- sample(c("c", "t"), 1)
if(x=="s") z <- sample(c("g", "c"), 1)
if(x=="w") z <- sample(c("a", "t"), 1)
if(x=="k") z <- sample(c("g", "t"), 1)
if(x=="m") z <- sample(c("a", "c"), 1)
if(x %in% c("a", "c", "g", "t")) z <- x
return(z)
}
alignment.matrix <- apply(alignment.matrix, c(1,2), resolver)
}
results <- d.calc(alignment.matrix)
d <- results[[1]]
if(is.nan(d)) d <- 0
abba <- results[[2]]
baba <- results[[3]]
## THIS SECTION WILL CALCULATE THE P-VAL BASED ON BOOTSTRAPPING
## SITES ARE SAMPLED WITH REPLACEMENT TO MAKE A NEW DATASET OF
## OF EQUAL SIZE TO THE ORIGINAL DATASET THIS ALLOWS US TO CALCULATE
## THE STANDARD DEVIATION AND THUS A Z SCORE.
if(sig.test=="B"){
sim.d<-vector()
foo <- ncol(alignment.matrix)
sim.matrix<-matrix(,4,foo)
cat("\nperforming bootstrap")
for(k in 1:replicate){
if(k/(replicate/100) == round(k/(replicate/100))) cat(".")
sim.matrix[1:4,1:foo] <- alignment.matrix[1:4, sample(1:foo, replace=T)]
results <- d.calc(sim.matrix)
sim.d[k] <- results[[1]]
}
sim.d[is.nan(sim.d)] <- 0
z <- abs(d/sd(sim.d))
new.pval <- 2 * (1 - pnorm(z))
## NOW WE MAKE THE OUTPUTS
cat("\nSites in alignment =", ncol(alignment.matrix))
cat("\nNumber of sites with ABBA pattern =", abba)
cat("\nNumber of sites with BABA pattern =", baba)
cat("\n\nD raw statistic / Z-score = ", d, " / ", z)
cat("\n\nResults from ", replicate, "bootstraps")
cat("\nSD D statistic =", sd(sim.d))
cat("\nP-value (that D=0) = ",new.pval,"\n\n") #after Eaton and Ree 2013
}
## THIS SECTION WILL CALCULATE THE P-VAL BASED ON JACKKNIFING
## THE DATA IS REANALYZED WHILE DROPPING A PORTION OF THE DATA
## THE SIZE OF THE DROPPED PORTION IS DETERMINED BY THE BLOCK SIZE
## ARGUMENT THIS PROCEDURE ALLOWS US TO CALCULATE
## THE STANDARD DEVIATION AND THUS A Z SCORE. THIS APPROACH IS PARTICULARLY
## IMPORTANT WHEN WE WILL BE USING DATA WHERE THE SNPs MAY BE IN LINKAGE WITH
## ONE ANOTHER
if(sig.test=="J"){
#first lets test whether we are limited in the number of reps by block size
max.rep <- ncol(alignment.matrix) - block.size
if(block.size >= (ncol(alignment.matrix)/2)){
stop(call. = F, paste("\nWith a block size of", block.size,
"and an alignment of", ncol(alignment.matrix),
"sites \nsome sites would never be included in the \nanalysis",
"\n\nThe maximum block size is 1/2 the alignment length"))
}
if(max.rep < replicate){
stop(call. = F, paste("\nWith a block size of", block.size,
"and an alignment of", ncol(alignment.matrix),
"sites", replicate, "replicates\nare not possible"))
}
if(max.rep >= replicate){
drop.pos <- seq.int(from=1, to=(max.rep-1), length.out=replicate)
replicate2 <- replicate
}
sim.d<-vector()
foo <- ncol(alignment.matrix)
sim.matrix<-matrix(,4,foo-block.size)
cat("\nperforming jackknife")
for(k in 1:replicate2){
if(k/2 == round(k/2)) cat(".")
sim.matrix[1:4,1:(foo-block.size-1)] <-alignment.matrix[1:4, -drop.pos[k]:-(drop.pos[k]+block.size)]
results <- d.calc(sim.matrix)
sim.d[k] <- results[[1]]
}
sim.d[is.nan(sim.d)] <- 0
z <- abs(d/sd(sim.d))
new.pval <- 2 * (1 - pnorm(z))
## NOW WE MAKE THE OUTPUTS
cat("\nSites in alignment =", ncol(alignment.matrix))
cat("\nNumber of sites with ABBA pattern =", abba)
cat("\nNumber of sites with BABA pattern =", baba)
cat("\nD raw statistic", d)
cat("\nZ-score = ", z)
cat("\n\nResults from", replicate2, "jackknifes with block size of", block.size)
cat("\nSD D statistic =", sd(sim.d))
cat("\nP-value (that D=0) = ",new.pval,"\n\n") #after Eaton and Ree 2013
}
if(sig.test=="N"){
cat("\nSites in alignment =", ncol(alignment.matrix))
cat("\nNumber of sites with ABBA pattern =", abba)
cat("\nNumber of sites with BABA pattern =", baba)
cat("\n\nD raw statistic = ", d,"\n\n")
}
return(list(D=d, p.value=new.pval))
}
# test the new function
#TEST <- CalcD_P_value(alignment = "C:/Users/mbzlld/Google Drive/Post Doc/RAD data analysis/ABBA BABA analysis/populations_PGDSpider_OBSE_SCAD_OBSM_BEPA/batch_4_ordered.fasta",
# sig.test = "J", ambig = "R", block.size = 100, replicate = 100)
#TEST