-
Notifications
You must be signed in to change notification settings - Fork 7
Expand file tree
/
Copy pathplottingReference.R
More file actions
1455 lines (1143 loc) · 52.9 KB
/
plottingReference.R
File metadata and controls
1455 lines (1143 loc) · 52.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# tocID <- "plottingReference.R"
#
# Purpose: Reference to graphical output in R.
#
#
# Version: 2.1.1
#
# Date: 2017-09 - 2022-10
# Author: Boris Steipe (boris.steipe@utoronto.ca)
#
# V 2.1.1 Maintenance, typos
# V 2.1 First minor updates 2021
# V 2.0 Comprehensive reference with basic and advanced options based on
# an integrated yeast gene expression dataset. Full rewrite of most
# sections.
# V 1.1 Stylistic improvements, code polish, and additional examples
# V 1.0 Digest from "plotting reference" files
#
# ToDo: Demo 6.3.2 in parallel plots
#
#
#TOC> ==========================================================================
#TOC>
#TOC> Section Title Line
#TOC> --------------------------------------------------------------------
#TOC> 01 INITIALIZE 71
#TOC> 02 THIS REFERENCE ... 76
#TOC> 02.1 Dataset Documentation 83
#TOC> 03 PROPORTIONS AND DISTRIBUTIONS 195
#TOC> 03.1 barplot() 200
#TOC> 03.2 pie() 225
#TOC> 03.3 boxplot() 257
#TOC> 03.4 hist() 315
#TOC> 03.4.1 overlaying histograms 362
#TOC> 04 THE plot() FUNCTION 405
#TOC> 04.1 line plots 413
#TOC> 05 ENCODING INFORMATION: SYMBOL, SIZE, COLOuR 512
#TOC> 05.1 pch ("plotting character" symbols) 520
#TOC> 05.1.1 Using pch to emphasize different categories 611
#TOC> 05.1.2 Line types 700
#TOC> 05.2 cex ("character expansion" size) 724
#TOC> 06 COLOUR 784
#TOC> 06.1 Colours by number 792
#TOC> 06.2 Colours by name 810
#TOC> 06.3 Colours as hex-triplets 836
#TOC> 06.3.1 Inbuilt palettes 891
#TOC> 06.3.2 Custom palettes 971
#TOC> 06.3.3 Transparency: The Alpha Channel 1021
#TOC> 06.4 abline(), lines() and segments() 1066
#TOC> 07 AXES 1100
#TOC> 08 LEGENDS 1137
#TOC> 08.1 basic legends 1140
#TOC> 08.2 Color bars 1144
#TOC> 09 LAYOUT 1272
#TOC> 10 TEXT 1303
#TOC> 11 DRAWING ON PLOTS 1331
#TOC> 12 IMAGES 1397
#TOC> 13 CONTOUR LINES 1402
#TOC> 14 3D PLOTS 1407
#TOC> 15 GRAPHS AND NETWORKS 1412
#TOC> 16 OTHER GRPAHICS PACKAGES 1417
#TOC> 17 INTERACTIVE PLOTS 1441
#TOC> 17.1 locator() 1445
#TOC> 17.2 plotly:: 1448
#TOC>
#TOC> ==========================================================================
# = 01 INITIALIZE =========================================================
SC <- readRDS("data/SC.rds") # <<<- execute this line first
# = 02 THIS REFERENCE ... =================================================
# This script covers basic plotting and graphical output options of R. The
# functions are demonstrated with an integrated data set of yeast gene
# expression data and annotations.
# == 02.1 Dataset Documentation ============================================
# You do not need to study the dataset in detail but it will be useful to refer
# to this documentation from time to time to understand what data is being
# plotted and what one can therefore learn from the results.
# The integrated dataset SC is a list that contains an analysis of yeast gene
# expression profiles originally published by Pramila et al. (2002; PMID:
# 12464633) and accessible as GSE3653 on GEO
# (https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE3635). This is a
# high-resolution (5 min. interval) data set spanning two cycles of a
# synchronized yeast culture. It includes 6228 expression profiles with 25
# time-points assigned to yeast systematic names. It is complemented with gene
# annotations taken from SGD, and augmented with a set of GO slims. GO slims
# were rerieved by navigating to https://www.yeastgenome.org/ and using the menu
# to choose Function >> Gene Ontology >> GOslim Mapping File. This downloads
# go_slim_mapping.tab which was further processed to annotate each gene with its
# corresponding GO terms. To evaluate cyclical regulation, a curve-fitting
# protocol was followed to fit a cyclical epxpression model with parameters
# amplitude, phase, frequency, exponential damping, and baseline shift. After
# fitting the profiles, correlations with the fitted model were calculated and
# the first peak of the model was determined as a marker of when the expression
# was ON in the cell-cycle. Finally, the expression peak markers were assigned
# to seven phases of the cell cycle, and GO term enrichments were computed. A
# A subset was selected with the following parameters:
# selCCl <-
# nlsParams$A > 0.075 & # reasonably high Amplitude
# nlsParams$cor > 0.6 & # good correlation with the parametrized model
# nlsParams$f > 0.75 & # period between 0.75 ...
# nlsParams$f < 1.333 & # ... and 1.333 hours
# nlsParams$k < 0.03 & # limiting the exponential damping
# nlsParams$k > -0.001
# sum(selCCl) # 1,297 genes in the data set show some level
# # of periodic expression-variation in the cell
# # cycle.
#
# Thus the dataset comprises numeric and categorical data
# Here are the data details:
# SC$xpr : expression profiles. Numeric matrix with 1,297 rows of genes
# ====== and 25 colums of time points in 5 minute
# intervals
# Example:
# --------
# SC$xpr[169, 1:5]
# > t.0 t.5 t.10 t.15 t.20
# > -0.055703429 -0.116124955 -0.135965733 -0.068665035 -0.001819363
#
#
# SC$mdl : parametrized model. Data frame with 1,297 rows of genes
# ======
# SC$mdl$A : model parameter: amplitude (log ratio)
# SC$mdl$phi : model parameter: phase (degrees)
# SC$mdl$f : model parameter: 1/frequency (hours)
# SC$mdl$k : model parameter: damping
# SC$mdl$B : model parameter: baseline shift
# SC$mdl$cor : correlation of observation and model
# SC$mdl$peaks : timepoint of first expression peak of model (minutes)
#
# # Example:
# --------
# SC$mdl[169, ]
# > A phi f k B cor peaks
# > 0.1007908 -81.04413 1.012092 0.00656052 -0.01857916 0.8437175 37.52217
#
#
# SC$ann : annotations. Data frame with 1,297 rows of genes
# ======
# SC$ann$SGD : Saccharomyces Gene Database identifier
# SC$ann$sysName : Yeast gene systematic name
# SC$ann$stdName : The standard name under which the gene is known
# SC$ann$alias : common alias name(s)
# SC$ann$description : free-text description of the gene
# SC$ann$GO : GO IDs of the SGD GO slim subset annotated to the gene
#
# Example:
# --------
# SC$ann[169, ]
# > SGD sysName stdName alias
# > S000002214 YDL056W MBP1 transcription factor MBP1
# > description
# > YDL056W Transcription factor; involved in regulation [...]
# > GO
# > YDL056W GO:0005634 GO:0003677 GO:0001071 GO:0000278
#
# SC$phases
# ---------
# Data frame with a definition of yeast cell cycle phase names and
# time-points derived from inspection of the SC$mdl$peaks distribution.
# names start ends
# 1 Sense 0 6
# 2 Prep 6 15
# 3 Replicate 15 30
# 4 Assemble 30 37
# 5 Segregate 37 43
# 6 Duplicate 43 55
# 7 Stabilize 55 65
#
# SC$GO
# -----
# Data frame with GO term information and GO term enrichment data
# SC$GO$ontology : {C|F|P} identifying the "cellular component", "molecular
# function" or "biological process" ontology
# SC$GO$GOid : Gene Ontyology ID of the term
# SC$GO$label : Short label of the term
# SC$GO$tAll : Count of times the term is annotated in any gene
# SC$GO$tCC : ... times term is annotated to a gene in this dataset
# SC$GO$t... : ... annotated to one ofg the cell-cycle phases
# SC$GO$xs... : ... enrichment factor in each cell cycle phase
#
# = 03 PROPORTIONS AND DISTRIBUTIONS ======================================
# Distributions of numeric variable characterize the values. Proportions compare values. A typical use case is to characterize a set of measurements, or compare several sets of measurements.
# == 03.1 barplot() ========================================================
# Draws a bar with height proportional to a given number.
barplot(mean(SC$xpr[ , "t.20"])) # barplot of the mean expression at t.20
# Barplots show only a single number. They tell us nothing about the underlying
# distribution. They are commonly used to compare several distributions:
barplot(colMeans(SC$xpr[ , c("t.0", "t.20", "t.40", "t.60")]))
# barplots take the usual plotting parameters
myPal <- colorRampPalette(c("#00FF0055", "#0066FF55", # a "palette" function
"#00FFFF55", "#CCCCAA55",
"#FF00FF55", "#FF006655",
"#FFFFFF55"), alpha = TRUE) # transparent colors
barplot(colMeans(SC$xpr[ , 1:13]), # mean expression changes, first hour
main = "Mean expression changes",
xlab = "time points",
cex.names = 0.6, # scale column names
ylab = "Expression (log ratio)",
cex.axis = 0.8, # scale of y-axis labels
col = myPal(13)) # get 13 color values from myPal()
# == 03.2 pie() ============================================================
# Generally not preferred, but simple to do
table(SC$GO$ontology)
# Example: how many genes are anotated to the GO terms in the Cellular Component
# category?
CCids <- SC$GO$GOid[SC$GO$ontology == "C"] # get GO ids for "C" ontology
x <- unlist(strsplit(SC$ann$GO, " ")) # split GO term strings on blank-spaces
# and dump them all into one single vector
x <- x[x %in% CCids] # subset the ones in CCIDs (about 40%)
pie(table(x), cex = 0.5)
# This can be better visualized in a barplot
myT <- sort(table(x), decreasing = TRUE)
b <- barplot(myT, # assign the plot - we need the x-coordinates
main = "Genes annotated to GO terms in the Cellular Component Ontology",
cex.main = 0.8,
names.arg = "", # blank the names - otherwise they are
ylab = "counts", # taken from the names() of the object
cex.axis = 0.8,
col = colorRampPalette(c("#CCAAFF", "#FFDDFF", "#FFFFFF"))(length(myT)),
ylim = c(0, 1.5 * max(myT))) # make space at the top
# Add the GOids as text()
text(b, myT, # b holds the x coordinates, myT is y
labels = names(myT), # take the names from the table - names
srt = 90, # rotate to vertical
adj = c(-0.1, 0.5), # align center, top
cex = 0.5)
# == 03.3 boxplot() ========================================================
# A boxplot() is almost always preferred to a barplot, since it includes an
# estimate of the distribution:
boxplot(SC$xpr[ , "t.20"]) # boxplot of expression values at t.20
# What are these elements?
# Assigning the plot to a variable makes the numbers available so we can
# use them for annotation:
oPar <- par(mar = c(0.2,3,0.2,0.2)) # reduce the margins
( b <- boxplot(SC$xpr[ , "t.20"]))
xT <- 1.3
cT <- "#DD0000"
# outliers
myOut <- mean(b$out[b$out > b$stats[5]]) # outliers above the whiskers
myOut[2] <- mean(b$out[b$out < b$stats[1]]) # outliers below the whiskers
text(1.04, myOut, adj=c(0,0.5), cex=0.7, col=cT, "outliers")
# whiskers
text(1.12, b$stats[1,1], adj=c(0,0.5), cex=0.7, col=cT, "min(x_i > Q1-1.5*IQR)")
text(1.12, b$stats[5,1], adj=c(0,0.5), cex=0.7, col=cT, "max(x_i < Q3+1.5*IQR)")
# quartiles
# "min(x_i > Q1-1.5*IQR)" is "the smallest x in the data that still lies
# within 1.5 times the inter-quartile range below Q1
text(1.23, b$stats[2,1], adj=c(0,0.5), cex=0.7, col=cT, "Q1 (lower quartile)")
text(1.23, b$stats[4,1], adj=c(0,0.5), cex=0.7, col=cT, "Q3 (upper quartile)")
# median
text(1.23, b$stats[3,1], adj=c(0,0.5), cex=0.9, col=cT, "median")
# the mean is not shown
text(0.79, mean(SC$xpr[,"t.20"]), adj=c(1,0.5), cex=0.7, col="#999999", "mean")
par(oPar) # reset the margins
# If we have datasets with more columns, each column gets its own boxplot
boxplot(SC$xpr[ , c("t.0", "t.20", "t.40", "t.60")])
# boxplots take the usual plotting parameters
myPal <- colorRampPalette(c("#00FF0055", "#0066FF55", # a "palette" function
"#00FFFF55", "#CCCCAA55",
"#FF00FF55", "#FF006655",
"#FFFFFF55"), alpha = TRUE) # transparent colors
boxplot(SC$xpr[ , 1:13], # expression changes, first hour
main = "Expression levels",
xlab = "time points",
cex.axis = 0.6, # scale axis names
ylab = "Expression (log ratio)",
col = myPal(13)) # get 13 color values from myPal()
# This plot shows that there is a noticable fluctuation of expression levels
# over the course of the first 60 minutes of the experiment. This may be due to
# these genes being selected as having cyclically varying expression profiles.
# == 03.4 hist() ===========================================================
# Histograms are superbly informative and one of the workhorses of our analyses.
# A histogram answers the question how many observations do we have in a
# particular range of a (continuously varying) variable.
# Here we plot a histogram of the first expression peaks of genes, computed as
# the first peak of the fitted model for every gene in our set. We might be
# interested in whether the expression of different genes peaks continuously
# over the cycle, or whetehr there are time intervals where waves of expression
# of different genes can be observed in response to cycle-control signals.
hist(SC$mdl$peaks)
# We can explicitly set breakpoints for the histogram:
# Here we set breaks at 4 minute intervals
( myBreaks <- seq(0, 80, by = 4) ) # Note: 21 values bound 20 intervals
hist(SC$mdl$peaks, breaks = myBreaks)
# Assigning the output of hist() makes the values
# used in constructing the histogram accessible:
( H <- hist(SC$mdl$peaks) )
# for example, we can use this to plot the actual numbers:
text(H$mids, # midpoints of the bars
H$counts, # counts -> probabilities
labels = H$counts, # add labels to each bar: count numbers
adj = c(0.5, -0.5), # text is centered on x and raised in y
cex = 0.6,
col = "#8888FF")
# histograms take the usual plotting parameters
myPal <- colorRampPalette(c("#00FF0055", "#0066FF55", # a "palette" function
"#00FFFF55", "#CCCCAA55",
"#FF00FF55", "#FF006655",
"#FFFFFF55"), alpha = TRUE) # transparent colors
myBreaks <- seq(0, 80, length.out = 81) # one-minute intervals
hist(SC$mdl$peaks, # gene expression first peaks
breaks = myBreaks,
main = "Timing of first peaks of during the cell-cycle",
sub = "The bimodal distribution distinguishes replication from division",
cex.sub = 0.8,
xlab = "t (min.)",
ylab = "counts",
col = myPal(81)) # get 81 color values from myPal()
# === 03.4.1 overlaying histograms
# Histograms can be plotted one-over another to compare them
# Example: when do genes with different GO term annotations first peak?
# GO:0005730 "nucleolus"
# GO:0032200 "telomere organization"
# GO:0006260 "DNA replication"
sel1 <- grep("GO:0005730", SC$ann$GO) # 103 genes
sel2 <- grep("GO:0032200", SC$ann$GO) # 32 genes
sel3 <- grep("GO:0006260", SC$ann$GO) # 67 genes
# define breaks. This is important to make the histograms comparable
myBreaks <- seq(0, 80, by = 4)
# plot the first histogram
hist(SC$mdl$peaks[sel1], breaks = myBreaks,
main = "Peak timing for different GO terms",
xlab = "t (min)",
ylab = "Counts",
ylim = c(0, 50),
col = "#FF007955")
# first overlay
hist(SC$mdl$peaks[sel2], breaks = myBreaks,
col = "#00B2FF55",
add = TRUE)
# second overlay
hist(SC$mdl$peaks[sel3], breaks = myBreaks,
col = "#7FDFC955",
add = TRUE)
# Legend
legend("topright",
fill = c("#FF007955", "#00B2FF55", "#7FDFC955"),
legend = c("GO:0005730\n(nucleolus)\n", # "\n" is a line-break
"GO:0032200\n(telomere organization)\n",
"GO:0006260\n(DNA replication)\n"),
cex = 0.6,
bty = "n") # no box around the legend
# = 04 THE plot() FUNCTION ================================================
# plot() is the workhorse of data visualization in R. But plot() is also a
# "generic" - i.e. different "classes" can define their own plot-methods and
# plot will "dispatch" the right method for a class. This means: even though you
# just type plot() to plot, you may actually be running different functions on
# different types of data.
# == 04.1 line plots =======================================================
# Lines are useful to visually associate ralated data points, like points in a
# sequence of events.
# plot an expression profile: value over time
plot(SC$xpr[169, ])
# note the x is implied and plotted as "index", the expression values are used
# for the y-axis by default.
# plot types
plot(SC$xpr[169, ], type = "p") # points, the default
plot(SC$xpr[169, ], type = "l") # lines
plot(SC$xpr[169, ], type = "b") # both points and lines
plot(SC$xpr[169, ], type = "c") # empty points and lines
plot(SC$xpr[169, ], type = "o") # overplotted
plot(SC$xpr[169, ], type = "s") # steps
plot(SC$xpr[169, ], type = "h") # like histogram
plot(SC$xpr[169, ], type = "n") # none - useful to create an empty frame
# to add other elements later
text(SC$xpr[169, ], colnames(SC$xpr), cex = 0.5) # ... such as labels
# title and axis labels
iRow <- 169
t <- seq(0, 120, by = 5) # set actual x-values
plot(t, SC$xpr[iRow, ],
type = "b",
main = "Expression profile",
sub = "Data: GSE3653 (Pramila et al. 2002)",
xlab = "time(min)",
ylab = "Expression (log ratio)")
# adjusting font sizes
plot(t, SC$xpr[iRow, ],
type = "b",
main = "Expression profile",
sub = "Data: GSE3653 (Pramila et al. 2002)",
xlab = "time(min)",
ylab = "Expression (log ratio)",
cex.main = 1.1, # title
cex.sub = 0.6, # subtitle
cex.axis = 0.7, # axis values
cex.lab = 0.8) # axis labels
# x and y axis ranges can be defined with a two-element vector
plot(t, SC$xpr[iRow, ],
type = "b",
xlim = c(0, 120), # tight against the values
ylim = c(-max(abs(SC$xpr[iRow, ])),
max(abs(SC$xpr[iRow, ]))), # symmetric around 0
main = "", sub = "", xlab = "", ylab = "")
abline(h = 0, col = "#FF000044")
# pull in annotations with sprintf()
plot(t, SC$xpr[iRow, ],
type = "b",
main = sprintf("Expression profile: %s (%s)",
SC$ann$stdName[iRow],
SC$ann$sysName[iRow]),
sub = "", xlab = "", ylab = "")
# When exploring data, it is useful to have a standard view of data as a
# function:
xprPlot <- function(iRow) {
t <- seq(0, 120, by=5)
yMax <- max(abs(SC$xpr[iRow, ])) * 1.1
plot(t, SC$xpr[iRow, ],
ylim = c(-yMax, yMax),
type = "b",
main = sprintf("Expression profile: %s (%s)",
SC$ann$stdName[iRow],
SC$ann$sysName[iRow]),
xlab = "time(min)",
ylab = "Expression (log ratio)",
cex.main = 1.1,
cex.axis = 0.7,
cex.lab = 0.8)
abline(h = 0, col = "#FF000044")
}
xprPlot(169)
xprPlot(1124)
# better to visualize by plotting one OVER the other.
# This is done with the function points()
xprPlot(1124)
points(t, SC$xpr[169, ], type = "b", cex = 0.7, col = "#EE5500CC")
# In general, when we plot(), we produce a "scatterplot" of X and y values.
# scatterplot: one against the other
plot(SC$xpr[169, ], SC$xpr[1124, ]) # these two profiles are anticorrelated
# = 05 ENCODING INFORMATION: SYMBOL, SIZE, COLOuR =========================
# We have many options to encode information in scatter plots (and others), in
# order to be able to visualize and discover patterns and relationships. The
# most important ones are the type of symbol to use, its size, and its color.
# == 05.1 pch ("plotting character" symbols) ===============================
# There are many types of symbols available to plot points and they can all be
# varied by type, size, and color to present additional information. Which one
# to use is defined in the argument "pch". The default is pch = 1: the open
# circle. Characters 1:20 are regular symbols:
# reduce margins
oPar <- par(mar = c(1, 1, 1, 1))
# Empty plot frame ...
plot(c(0,11), c(0,11), type = "n", axes = FALSE, xlab = "", ylab = "")
# coordinates for first 25 symbols
x1 <- c(1:10, 1:10)
y1 <- c(rep(10, 10),rep(9, 10))
myPch <- 1:20
points(x1, y1, pch = myPch, cex = 1.2)
# pch 21:25 can have different border and fill colours. Borders are defined with
# "col", fills are defined with "fill":
x2 <- 1:5
y2 <- rep(8,5)
myCols <- c("#4B0055", "#00588B", "#009B95", "#53CC67", "#FDE333")
points(x2, y2, pch=21:25, col="#708090", bg = myCols, cex = 1.8)
# ten more symbols are defined as characters
x3 <- 1:10
y3 <- rep(6,10)
myPch <- c(".", "o", "O", "0","a","A", "*", "+","-","|")
points(x3, y3, pch = myPch) # note: "myPCh" is a character vector while
# the other pch were identified by integers
# The ASCII codes for characters 32 to 126 can also be used as plotting symbols
myPch <- 32:126
x4 <- (((myPch - 32) %% 19) + 2) / 2
y4 <- 5 - ((myPch - 32) %/% 19)
points(x4, y4, pch=32:126, col="#0055AA")
# I don't think I have ever actually used characters as plotting characters in
# this way; it is much more straightforward to just use the text() function in
# that case, rather than having to look up which ASCII code is which character,
# and remembering which ones are available as numbers, and which ones are
# identified by the character itself. But the first twenty five I use a lot, and
# I also end up looking up their codes a lot. So here is a function to plot them
# for reference. You can paste it into your .Rprofile to keep it at hand:
pchRef <- function() {
# a reference plot for the first twenty five plotting characters
oPar <- par(mar = c(1, 1, 1, 1), mfrow = c(1, 1))
plot(c(0,6), c(0,6), type = "n", axes = FALSE, xlab = "", ylab = "")
x <- ((0:24) %% 5) + 1
y <- rep(5:1, each = 5)
idx <- 1:20
points(x[idx], y[idx], pch = idx, cex = 2)
idx <- 21:25
points(x[idx], y[idx], pch = idx, cex = 2, col = "#708090",
bg=c("#CC0033AA", "#991566AA", "#662A99AA", "#323FCCAA", "#0055FFAA"))
text(x - 0.3, y, labels = sprintf("%d:", 1:25), col = "#7766AA", cex = 0.8)
par(oPar)
}
# reset margins
par(oPar)
# In addition to the default fonts, a large set of Hershey vector fonts is
# available which gives access to many more plotting and labeling options via
# text()
demo(Hershey)
# Plotting other symbols:
# In the most general way, Unicode characters can be plotted as text.
# The code is passed in hexadecimal, long integer, with a negative sign.
# Here is a quarter note (Unicode: 266a) using plot()
plot(0.5,0.5, pch=-0x266aL, cex=5, xlab="", ylab="")
# However, rendering may vary across platforms since it depends on unicode
# support. If your character can't be rendered, you'll only get an empty box.
# === 05.1.1 Using pch to emphasize different categories
# In general, when we plot(), we produce a "scatterplot" of X and y values.
# scatterplot: one against the other
tRep <- c("t.15", "t.20", "t.25", "t.30") # Replication-phase time points
tDup <- c("t.40", "t.45", "t.50", "t.55") # Duplication-phase time points
plot(rowMeans(SC$xpr[ , tRep]), rowMeans(SC$xpr[ , tDup]))
# select top ten replication enriched GO terms
sel <- order(SC$GO$xsReplicate, decreasing = TRUE)[1:10]
SC$GO$label[sel] # what are these?
GOrep <- SC$GO$GOid[sel] # GO terms that are enriched in the Replication phase
sel <- order(SC$GO$xsDuplicate, decreasing = TRUE)[1:10]
SC$GO$label[sel] # what are these?
GOdup <- SC$GO$GOid[sel] # GO terms that are enriched in the Duplication phase
# define sets of genes that are annotated to one or the other GO id set
repGenes <- logical(nrow(SC$ann)) # prepare an empty vector
for (id in GOrep) { # whenever we find one of the
repGenes <- repGenes | grepl(id, SC$ann$GO) # characteristic GO IDs annotated
} # to a a gene, we put TRUE in its
sum(repGenes) # spot of the repGenes vector.
xRep <- rowMeans(SC$xpr[ repGenes, tRep]) # repGenes in the replication phase
yRep <- rowMeans(SC$xpr[ repGenes, tDup]) # repGenes in the duplication phase
# same thing for the genes that are enriched in the duplication phase
dupGenes <- logical(nrow(SC$ann))
for (id in GOdup) {
dupGenes <- dupGenes | grepl(id, SC$ann$GO)
}
sum(dupGenes)
xDup <- rowMeans(SC$xpr[ dupGenes, tRep]) # dupGenes in the replication phase
yDup <- rowMeans(SC$xpr[ dupGenes, tDup]) # dupGenes in the duplication phase
# Ready to plot:
# base plot without the ref... and dup... genes
basePlot <- function(...) {
sel <- ! (repGenes | dupGenes)
m <- sprintf("Expression changes between phases (%d genes)",
nrow(SC$xpr))
plot(rowMeans(SC$xpr[ sel, tRep]),
rowMeans(SC$xpr[ sel, tDup]),
main = m,
xlab = "Expression in Replication phase",
ylab = "Expression in Duplication phase",
cex.main = 0.9,
cex.lab = 0.8,
...)
abline(h = 0, col = "#0088FF44") # horizontal zero
abline(v = 0, col = "#0088FF44") # vertical zero
}
basePlot()
# plotting the rep.. and dup... sets with different pch
points(xRep, yRep, pch=24, col="#000000", bg="#FFFFFF") # white up triangle
points(xDup, yDup, pch=25, col="#000000", bg="#FFFFFF") # white down triangle
# These are clearly different distributions, but the plot is overall too busy.
# Using color improves the plot:
basePlot(pch=21, col="#AAAAAA", bg="#DDDDDD")
points(xRep, yRep, pch=24, col="#000000", bg="#66DDFF")
points(xDup, yDup, pch=25, col="#000000", bg="#FF77FF")
# Even better may be to use transparent colors, to address the overlap:
basePlot(pch=19, col="#00000022")
points(xRep, yRep, pch=19, col="#33BBFF66")
points(xDup, yDup, pch=19, col="#FF44AA44")
# this needs a legend:
legend ("bottom",
legend = c(
sprintf("%d Genes with Replication-\nenriched GO terms\n ", sum(repGenes)),
sprintf("%d Genes with Duplication-\nenriched GO terms\n ", sum(dupGenes)),
"Other Genes\n "
),
horiz = TRUE,
pch = 19,
col = c("#33BBFF66", "#FF44AA44", "#00000022"),
cex = 0.6,
pt.cex = 1.0,
inset = 0.02,
box.col = "#DDDDDD",
bg = "#FFFFFF")
# === 05.1.2 Line types
# Basically all plots take arguments "lty" to define the line type, and "lwd"
# to define the line width.
# empty plot ...
plot(c(0,10), c(0,10), type = "n", axes = FALSE, xlab = "", ylab = "")
# Line type
for (i in 1:8) {
y <- 10.5-(i/2)
segments(1,y,5,y, lty=i)
text(6, y, paste("lty = ", i), col="grey60", adj=0, cex=0.75)
}
# Line width
for (i in 1:10) {
y <- 5.5-(i/2)
segments(1,y,5,y, lwd=(0.3*i)^2)
text(6, y, paste("lwd = ", (0.3*i)^2), col="grey60", adj=0, cex=0.75)
}
# == 05.2 cex ("character expansion" size) =================================
# The size of characters can be controlled with the cex parameter. cex takes a
# vector of numbers, mapped to the vecors of plotted elements. The usual R
# conventions of vector recycling apply, so if cex s only a single number, it is
# applied to all plotted elements. Here is an example plotting expression
# amplitudes in a comparison of pre-replication vs. replication phase
# expression.
tPre <- c("t.0", "t.5", "t.10") # Pre replication-phase time points
tMid <- c("t.25", "t.30", "t.35") # Midway between replication and duplication
x <- rowMeans(SC$xpr[ , tPre]) # row means average out fluctuations
y <- rowMeans(SC$xpr[ , tMid])
plot(x, y)
myAmps <- SC$mdl$A # fetch the correlations of the fitted
# low correlations are not well modeled
# with a periodic function
hist(myAmps, breaks = 40) # examine the distribution
hist(log(myAmps), breaks = 40) # sometimes log() is be better suited
# to map to symbol size
# useful cex sizes are approximately between 0.5 and 5
# Example:
N <- 10
cexMin <- 0.5
cexMax <- 5.0
myCex <- seq(cexMax, cexMin, length.out=N) # Vector of cex values of length N
# We plot these points from largest to smallest, so we don't overplot
x1 <- runif(N)
y1 <- runif(N)
plot(x1, y1, xlim = 0:1, ylim = 0:1, xlab = "", ylab = "", pch = 16,
cex = myCex, col= colorRampPalette(c("#F4F0F8", "#3355DD"))(N))
# Note that the smallest cex at 0.5 is _really_ small.
points(x1[N], y1[N], cex = 3.0, col = "#FF000077") # Here it is!
# To rescale a vector into the desired interval (lo, up) we write a little
# function. First we normalize the vector into the interval (0, 1), then we
# multiply it by up-lo, and add lo.
#
reScale <- function(x, lo = 0, up = 1) {
return((((x-min(x)) / (max(x)-min(x))) * (up-lo)) + lo )
}
# with this, our coding amplitudes as plot character size becomes quite easy:
plot(x, y, pch=16,
cex = reScale(myAmps, lo=0.5, up=5), # <<<---- That's all we need
col="#4499BB22")
abline(a = 0, b = 1, col = "#DD333344")
abline(h = 0, col = "#33333344")
abline(v = 0, col = "#33333344")
# ToDo: add titles, scale legend, and interpretation
# = 06 COLOUR =============================================================
# Colour is immensely important for data visualization, and colours in R can be
# specified by number, by name, as hex-triplets, as rgb or hsv values, and
# through several color-space specifications. The can be used individually, as
# vectors that are defined manually, and through colour palettes. And the can be
# solid and transparent.
# == 06.1 Colours by number ================================================
# The col=... parameter for plots is 1 by default and you can
# set it to the range 0:8.
# 0: white
# 1: black (the default)
# 2: red
# 3: green
# 4: blue
# 5: cyan
# 6: magenta
# 7: yellow
# 8: grey
barplot(rep(1,9), col=0:8, axes=FALSE, names.arg=c(0:8))
# As you can see, the default primary colours are garish and offend even the
# most rudimentary sense of aesthetics. Using these colors won't earn you
# respect. Fortunately we have many more sophisticated ways to define colours.
# == 06.2 Colours by name ==================================================
# You may have noticed that "red", "green", and "blue" work for the col=...
# parameter, but you probably would not have imagined that "peachpuff",
# "firebrick" and "goldenrod" are valid as well. In fact, there are 657 named
# colours in R. Access them all by typing:
colors()
myCols <- c(
"firebrick2",
"tomato",
"goldenrod1",
"peachpuff",
"papayawhip",
"seashell",
"whitesmoke"
)
pie(c(1, 1, 2, 3, 5, 8, 13),
col = myCols,
labels = myCols)
# Read more about named colours (and related topics) in Melanie Frazier's pdf:
# https://www.nceas.ucsb.edu/sites/default/files/2020-04/colorPaletteCheatsheet.pdf
# I almost never use named colours anymore but use hex-triplets instead...
# == 06.3 Colours as hex-triplets ==========================================
# Hex triplets in R work exactly as in HTML: a triplet of
# RGB values in two-digit hexadecimal representation. The
# first two digits specify the red value, the second two
# are for green, then blue. R accepts a fourth pair of
# digits to optionally specify the transparency - the "Alpha" channel. The
# semantics of the hex-triplet is thus "#RRGGBB" or "#RRGGBBAA".
# Read more e.g. at http://en.wikipedia.org/wiki/Web_colors
# The function col2rgb() converts colour names to rgb values ...
col2rgb("violetred")
# ... and rgb() converts rgb values to hex-code:
rgb(1, 0.5, 0.23)
# to get hexcodes from names takes a bit more: divide and transpose -
rgb(t(col2rgb("violetred")/255))
# Here is a utility function to convert all manners of color-vectors to
# hex-triplets. You can copy it into your .Rprofile if you want to keep it at hand.
col2hex <- function(colV, dput = FALSE, pal = FALSE, N = NULL) {
# colV a vector or scalar of color names, hex-codes or integers
# dput logical. If TRUE, character output is suitable for pasting into code.
# pal logical. If TRUE, the colors will be used in a palette. N must
# be defined in that case
# value: a vector of hex-codes of the same length, or a dput() string
h <- rgb(t(col2rgb(colV) / 255))
if (pal == TRUE) {
if (is.null(N)) { stop("N must be defined if pal is TRUE.") }
h <- colorRampPalette(h)(N)
}
if (dput == TRUE) {
dput(h)
return(invisible(h))
}
return(h)
}
# Examples:
myCols <- c( "firebrick2", "tomato", "goldenrod1", "peachpuff",
"papayawhip", "seashell", "whitesmoke")
col2hex(myCols)
col2hex(myCols, dput = TRUE)
# barplot with the original colors
barplot(rep(1, length(myCols)), col=myCols)
# expanding the palette with col2hex()
N <- 40
barplot(rep(1, N), col = col2hex(myCols,pal=TRUE,N=N) )
# === 06.3.1 Inbuilt palettes
# In R, a palette is a function (!) that takes a number as its argument and
# returns that number of colors, those colors can then be used to color points
# in a plot or other elements. In principle, such colors can serve three
# distinct purposes:
# Qualitative colors have a different color for different types of entities.
# Here the color serves simply to distinguish the types, it represents
# "categorical information". For example, we can assign different colors to each
# individual amino acid.
# Sequential colors distinguish order in a sequence: the colors distinguish the
# first from the last element in the order. For example we often color the
# N-terminus of a protein blue(ish) and the C-terminus red(dish).
# Diverging colors encode some property and are scaled to represent particular
# values of the property. For example we might represent temperature with a
# black-body radiation color palette. Note that "sequential" can be seen as
# "diverging on order".
# R has are a number of inbuilt palettes; Here is a convenience function to view
# palettes. Input can be either a palette, the name of a hcl.colors palette, or
# a vector of colors:
plotPal <- function(pal, N = 40) {
if (is.character(pal)) {
if (length(pal) == 1) { # name of a hcl palette
myCols <- hcl.colors(N, pal)
m <- pal
} else { # already a color vector
myCols <- colorRampPalette(pal)(N)
m <- sprintf("Custom palette (spread to %d values)", N)
}
} else { # palette function
myCols <- pal(N)
m <- as.character(substitute(pal))
}
barplot(rep(1,N), main = m, axes = FALSE, col = myCols)
}
plotPal(rainbow) # rainbow() is the quintessential qualitative palette
plotPal(cm.colors) # cm.colors() is sequential: cyan-white-magenta
plotPal(terrain.colors) # a diverging palette: map elevation colors
# A lot of thought has gone into the construction of palettes in the
# colorspace:: package, now available as hcl.colors in R (hcl: hue, chroma,
# luminance), which is a better way to specify perceptually equidistant colors.
# It's worthwhile to read through the help-file for more information -
?hcl.colors
# ... and to sample the 110 available palettes:
hcl.pals()
# Examples:
plotPal("Viridis")
plotPal("Pastel 1")
plotPal("Spectral")
plotPal("OrRd")
plotPal("Cold")
plotPal("Mint")
plotPal("Berlin")
# If you are curious, you can execute the code below, then select the line that
# draws the plot and press ctrl+enter repeatedly to step through the entire
# set of palettes.
# i <- 0
# plotPal(hcl.pals()[( i<-i+1 )])
# Worthy of special mention are the color-Brewer palettes, in particular for
# their accessibility considerations. Do consider that not all of us view
# colours in the same way!
if (! requireNamespace("RColorBrewer", quietly = TRUE)) {
install.packages("RColorBrewer")
}
?RColorBrewer
RColorBrewer::display.brewer.all(colorblindFriendly = TRUE)
plotPal(RColorBrewer::brewer.pal(11, "PuOr"), N = 11)
# === 06.3.2 Custom palettes
#
# Bespoke palettes are easily created with colorRampPalette(). The function
# returns a palette, i.e. a function (not a vector of colors). You assign the
# function to a variable (your palette), and you call it with an argument of how
# many color values you want it to return.
myPal <- colorRampPalette(c("#000000", "#FFFFFF", "#FF0000"))
myPal(10)
plotPal(myPal)
# The parameter "bias" controls the spacing of colors at the end of the palette:
plotPal(colorRampPalette(c("#000000", "#FFFFFF", "#FF0000"), bias = 0.5))
plotPal(colorRampPalette(c("#000000", "#FFFFFF", "#FF0000"), bias = 1.0))
plotPal(colorRampPalette(c("#000000", "#FFFFFF", "#FF0000"), bias = 2.0))
# But a similar effect can be obtained by duplicating one or more colours to
# spread the range ...
plotPal(colorRampPalette(c("#000000", "#FFFFFF", "#FF0000", "#FF0000")))
# ... and this can also be used to spread the middle
plotPal(colorRampPalette(c("#000000", "#FFFFFF", "#FFFFFF", "#FF0000")))
# nicer pastels
myPastels <- colorRampPalette(c("#FFFFFF",
"#FF9AA2",
"#FFB7B2",
"#FFDAC1",
"#E2F0CB",