Combining factor levels in R 3.2.1 - r

In previous versions of R I could combine factor levels that didn't have a "significant" threshold of volume using the following little function:
whittle = function(data, cutoff_val){
#convert to a data frame
tab = as.data.frame.table(table(data))
#returns vector of indices where value is below cutoff_val
idx = which(tab$Freq < cutoff_val)
levels(data)[idx] = "Other"
return(data)
}
This takes in a factor vector, looks for levels that don't appear "often enough" and combines all of those levels into one "Other" factor level. An example of this is as follows:
> sort(table(data$State))
05 27 35 40 54 84 9 AP AU BE BI DI G GP GU GZ HN HR JA JM KE KU L LD LI MH NA
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
OU P PL RM SR TB TP TW U VD VI VS WS X ZH 47 BL BS DL M MB NB RP TU 11 DU KA
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 3 3 3
BW ND NS WY AK SD 13 QC 01 BC MT AB HE ID J NO LN NM ON NE VT UT IA MS AO AR ME
4 4 4 4 5 5 6 6 7 7 7 8 8 8 9 10 11 17 23 26 26 30 31 31 38 40 44
OR KS HI NV WI OK KY IN WV AL CO WA MN NH MO SC LA TN AZ IL NC MI GA OH ** CT DE
45 47 48 57 57 64 106 108 112 113 120 125 131 131 135 138 198 200 233 492 511 579 645 646 840 873 1432
RI DC TX MA FL VA MD CA NJ PA NY
1782 2513 6992 7027 10527 11016 11836 12221 15485 16359 34045
Now when I use whittle it returns me the following message:
> delete = whittle(data$State, 1000)
Warning message:
In `levels<-`(`*tmp*`, value = c("Other", "Other", "Other", "Other", :
duplicated levels in factors are deprecated
How can I modify my function so that it has the same effect but doesn't use these "deprecated" factor levels? Converting to a character, tabling, and then converting to the character "Other"?

I've always found it easiest (less typing and less headache) to convert to character and back for these sorts of operations. Keeping with your as.data.frame.table and using replace to do the replacement of the low-frequency levels:
whittle <- function(data, cutoff_val) {
tab = as.data.frame.table(table(data))
factor(replace(as.character(data), data %in% tab$data[tab$Freq < cutoff_val], "Other"))
}
Testing on some sample data:
state <- factor(c("MD", "MD", "MD", "VA", "TX"))
whittle(state, 2)
# [1] MD MD MD Other Other
# Levels: MD Other

I think this verison should work. The levels<- function allows you to collapse by assigning a list (see ?levels).
whittle <- function(data, cutoff_val){
tab <- table(data)
shouldmerge <- tab < cutoff_val
tokeep <- names(tab)[!shouldmerge]
tomerge <- names(tab)[shouldmerge]
nv <- c(as.list(setNames(tokeep,tokeep)), list("Other"=tomerge))
levels(data)<-nv
return(data)
}
And we test it with
set.seed(15)
x<-factor(c(sample(letters[1:10], 100, replace=T), sample(letters[11:13], 10, replace=T)))
table(x)
# x
# a b c d e f g h i j k l m
# 5 11 8 8 7 5 13 14 14 15 2 3 5
y <- whittle(x, 9)
table(y)
# y
# b g h i j Other
# 11 13 14 14 15 43

It's worth adding to this answer that the new forcats package contains the fct_lump() function which is dedicated to this.
Using #MrFlick's data:
x <- factor(c(sample(letters[1:10], 100, replace=T),
sample(letters[11:13], 10, replace=T)))
library(forcats)
library(magrittr) ## for %>% ; could also load dplyr
fct_lump(x, n=5) %>% table
# b g h i j Other
#11 13 14 14 15 43
The n argument specifies the number of most common values to preserve.

Here's another way of doing it by replacing all the items below the threshold with the first and then renaming that level to Other.
whittle <- function(x, thresh) {
belowThresh <- names(which(table(x) < thresh))
x[x %in% belowThresh] <- belowThresh[1]
levels(x)[levels(x) == belowThresh[1]] <- "Other"
factor(x)
}

Related

Conditional filling NA rows with comparing non-NA labeled rows

I want to fill NA rows based on checking the differences between the closest non-NA labeled rows.
For instance
data <- data.frame(sd_value=c(34,33,34,37,36,45),
value=c(383,428,437,455,508,509),
label=c(c("bad",rep(NA,4),"unable")))
> data
sd_value value label
1 34 383 bad
2 33 428 <NA>
3 34 437 <NA>
4 37 455 <NA>
5 36 508 <NA>
6 45 509 unable
I want to evaluate how to change NA rows with checking the difference between sd_value and value those close to bad and unablerows.
if we want to get differences between the rows we can do;
library(dplyr)
data%>%
mutate(diff_val=c(0,diff(value)), diff_sd_val=c(0,diff(sd_value)))
sd_value value label diff_val diff_sd_val
1 34 383 bad 0 0
2 33 428 <NA> 45 -1
3 34 437 <NA> 9 1
4 37 455 <NA> 18 3
5 36 508 <NA> 53 -1
6 45 509 unable 1 9
The condition how I want to label the NA rows is
if the diff_val<50 and diff_sd_val<9 label them with the last non-NA label else use the first non-NA label after the last NA row.
So that the expected output would be
sd_value value label diff_val diff_sd_val
1 34 383 bad 0 0
2 33 428 bad 45 -1
3 34 437 bad 9 1
4 37 455 bad 18 3
5 36 508 unable 53 -1
6 45 509 unable 1 9
The possible solution I cooked up so far:
custom_labelling <- function(x,y,label){
diff_sd_val<-c(NA,diff(x))
diff_val<-c(NA,diff(y))
label <- NA
for (i in 1:length(label)){
if(is.na(label[i])&diff_sd_val<9&diff_val<50){
label[i] <- label
}
else {
label <- label[i]
}
}
return(label)
}
which gives
data%>%
mutate(diff_val=c(0,diff(value)), diff_sd_val=c(0,diff(sd_value)))%>%
mutate(custom_label=custom_labelling(sd_value,value,label))
Error in mutate_impl(.data, dots) :
Evaluation error: missing value where TRUE/FALSE needed.
In addition: Warning message:
In if (is.na(label[i]) & diff_sd_val < 9 & diff_val < 50) { :
the condition has length > 1 and only the first element will be used
One option is to find NA and non-NA index and based on the condition select the closest label to it.
library(dplyr)
#Create a new dataframe with diff_val and diff_sd_val
data1 <- data%>% mutate(diff_val=c(0,diff(value)), diff_sd_val=c(0,diff(sd_value)))
#Get the NA indices
NA_inds <- which(is.na(data1$label))
#Get the non-NA indices
non_NA_inds <- setdiff(1:nrow(data1), NA_inds)
#For every NA index
for (i in NA_inds) {
#Check the condition
if(data1$diff_sd_val[i] < 9 & data1$diff_val[i] < 50)
#Get the last non-NA label
data1$label[i] <- data1$label[non_NA_inds[which.max(i > non_NA_inds)]]
else
#Get the first non-NA label after last NA value
data1$label[i] <- data1$label[non_NA_inds[i < non_NA_inds]]
}
data1
# sd_value value label diff_val diff_sd_val
#1 34 383 bad 0 0
#2 33 428 bad 45 -1
#3 34 437 bad 9 1
#4 37 455 bad 18 3
#5 36 508 unable 53 -1
#6 45 509 unable 1 9
You can remove diff_val and diff_sd_val columns later if not needed.
We can also create a function
custom_label <- function(label, diff_val, diff_sd_val) {
NA_inds <- which(is.na(label))
non_NA_inds <- setdiff(1:length(label), NA_inds)
new_label = label
for (i in NA_inds) {
if(diff_sd_val[i] < 9 & diff_val[i] < 50)
new_label[i] <- label[non_NA_inds[which.max(i > non_NA_inds)]]
else
new_label[i] <- label[non_NA_inds[i < non_NA_inds]]
}
return(new_label)
}
and then apply it
data%>%
mutate(diff_val = c(0, diff(value)),
diff_sd_val = c(0, diff(sd_value)),
new_label = custom_label(label, diff_val, diff_sd_val))
# sd_value value label diff_val diff_sd_val new_label
#1 34 383 bad 0 0 bad
#2 33 428 <NA> 45 -1 bad
#3 34 437 <NA> 9 1 bad
#4 37 455 <NA> 18 3 bad
#5 36 508 <NA> 53 -1 unable
#6 45 509 unable 1 9 unable
If we want to apply it by group we can add a group_by statement and it should work.
data%>%
group_by(group) %>%
mutate(diff_val = c(0, diff(value)),
diff_sd_val = c(0, diff(sd_value)),
new_label = custom_label(label, diff_val, diff_sd_val))

Change column value depending on other columns

I have a dataframe:
chrom position ref var normal_reads1 normal_reads2 normal_var_freq normal_gt tumor_reads1 tumor_reads2 tumor_var_freq tumor_gt somatic_status variant_p_value somatic_p_value
1 2L 13048 A T 32 23 41.82 W 17 6 26.09 W Germline 7.507123e-11 0.9437542
2 2L 16467 G A 0 43 100.00 A 0 24 100.00 A <NA> 6.674261e-40 1.0000000
3 2L 20682 T A 32 14 30.43 W 14 6 30.00 W Germline 1.746726e-07 0.6223244
4 2L 25727 T G 52 22 29.73 K 16 4 20.00 K Germline 2.000049e-09 0.8758070
5 2L 25729 A T 49 23 31.94 W 16 4 20.00 W Germline 7.938282e-10 0.9092970
6 2L 25741 T C 45 28 38.36 Y 15 6 28.57 Y Germline 1.497796e-12 0.8604958
I'm trying to change to value of the somatic_status col to "ROH" if both normal_var_freq and tumor_var_freq are > 90
Here's what I've tried:
snps <- within(snps, somatic_status[normal_var_freq > 90 & tumor_var_freq > 90] <- 'ROH')
but I get the error:
Warning message:
In `[<-.factor`(`*tmp*`, normal_var_freq > 90 & tumor_var_freq > :
invalid factor level, NA generated
Can someone point me in the right direction?
We can the factor to character class before assigning the values to 'ROH' based on the logical vector ('i1')
i1 <- with(snps, normal_var_freq > 90 & tumor_var_freq > 90)
snps$somatic_status <- as.character(snps$somatic_status)
snps$somatic_status[i1] <- "ROH"
or add a new level to the column if we don't want to convert the factor column to character before changing some of the elements to a new value
levels(snps$somatic_status) <- c(levels(snps$somatic_status), "ROH")
snps$somatic_status[i1] <- "ROH"
Regarding the usage of within, it is a useful function for creating new variables or updates old variables, but the assigning a subset of values to new value is not recommended

Automate regression by rows

I have a data.frame
set.seed(100)
exp <- data.frame(exp = c(rep(LETTERS[1:2], each = 10)), re = c(rep(seq(1, 10, 1), 2)), age1 = seq(10, 29, 1), age2 = seq(30, 49, 1),
h = c(runif(20, 10, 40)), h2 = c(40 + runif(20, 4, 9)))
I'd like to make a lm for each row in a data set (h and h2 ~ age1 and age2)
I do it by loop
exp$modelh <- 0
for (i in 1:length(exp$exp)){
age = c(exp$age1[i], exp$age2[i])
h = c(exp$h[i], exp$h2[i])
model = lm(age ~ h)
exp$modelh[i] = coef(model)[1] + 100 * coef(model)[2]
}
and it works well but takes some time with very large files. Will be grateful for the faster solution f.ex. dplyr
Using dplyr, we can try with rowwise() and do. Inside the do, we concatenate (c) the 'age1', 'age2' to create 'age', likewise, we can create 'h', apply lm, extract the coef to create the column 'modelh'.
library(dplyr)
exp %>%
rowwise() %>%
do({
age <- c(.$age1, .$age2)
h <- c(.$h, .$h2)
model <- lm(age ~ h)
data.frame(., modelh = coef(model)[1] + 100*coef(model)[2])
} )
gives the output
# exp re age1 age2 h h2 modelh
#1 A 1 10 30 19.23298 46.67906 68.85506
#2 A 2 11 31 17.73018 47.55402 66.17050
#3 A 3 12 32 26.56967 46.69174 84.98486
#4 A 4 13 33 11.69149 47.74486 61.98766
#5 A 5 14 34 24.05648 46.10051 82.90167
#6 A 6 15 35 24.51312 44.85710 89.21053
#7 A 7 16 36 34.37208 47.85151 113.37492
#8 A 8 17 37 21.10962 48.40977 74.79483
#9 A 9 18 38 26.39676 46.74548 90.34187
#10 A 10 19 39 15.10786 45.38862 75.07002
#11 B 1 20 40 28.74989 46.44153 100.54666
#12 B 2 21 41 36.46497 48.64253 125.34773
#13 B 3 22 42 18.41062 45.74346 81.70062
#14 B 4 23 43 21.95464 48.77079 81.20773
#15 B 5 24 44 32.87653 47.47637 115.95097
#16 B 6 25 45 30.07065 48.44727 101.10688
#17 B 7 26 46 16.13836 44.90204 84.31080
#18 B 8 27 47 20.72575 47.14695 87.00805
#19 B 9 28 48 20.78425 48.94782 84.25406
#20 B 10 29 49 30.70872 44.65144 128.39415
We could do this with the devel version of data.table i.e. v1.9.5. Instructions to install the devel version are here.
We convert the 'data.frame' to 'data.table' (setDT), create a column 'rn' with the option keep.rownames=TRUE. We melt the dataset by specifying the patterns in the measure to convert from 'wide' to 'long' format. Grouped by 'rn', we do the lm and get the coef. This can be assigned as a new column in the original dataset ('exp') while removing the unwanted 'rn' column by assigning (:=) it to NULL.
library(data.table)#v1.9.5+
modelh <- melt(setDT(exp, keep.rownames=TRUE), measure=patterns('^age', '^h'),
value.name=c('age', 'h'))[, {model <- lm(age ~h)
coef(model)[1] + 100 * coef(model)[2]},rn]$V1
exp[, modelh:= modelh][, rn := NULL]
exp
# exp re age1 age2 h h2 modelh
# 1: A 1 10 30 19.23298 46.67906 68.85506
# 2: A 2 11 31 17.73018 47.55402 66.17050
# 3: A 3 12 32 26.56967 46.69174 84.98486
# 4: A 4 13 33 11.69149 47.74486 61.98766
# 5: A 5 14 34 24.05648 46.10051 82.90167
# 6: A 6 15 35 24.51312 44.85710 89.21053
# 7: A 7 16 36 34.37208 47.85151 113.37492
# 8: A 8 17 37 21.10962 48.40977 74.79483
# 9: A 9 18 38 26.39676 46.74548 90.34187
#10: A 10 19 39 15.10786 45.38862 75.07002
#11: B 1 20 40 28.74989 46.44153 100.54666
#12: B 2 21 41 36.46497 48.64253 125.34773
#13: B 3 22 42 18.41062 45.74346 81.70062
#14: B 4 23 43 21.95464 48.77079 81.20773
#15: B 5 24 44 32.87653 47.47637 115.95097
#16: B 6 25 45 30.07065 48.44727 101.10688
#17: B 7 26 46 16.13836 44.90204 84.31080
#18: B 8 27 47 20.72575 47.14695 87.00805
#19: B 9 28 48 20.78425 48.94782 84.25406
#20: B 10 29 49 30.70872 44.65144 128.39415
Great (double) answer from #akrun.
Just a suggestion for your future analysis as you mentioned "it's an example of a bigger problem". Obviously, if you are really interested in building models rowwise then you'll create more and more columns as your age and h observations increase. If you get N observations you'll have to use 2xN columns for those 2 variables only.
I'd suggest to use a long data format in order to increase your rows instead of your columns.
Something like:
exp[1,] # how your first row (model building info) looks like
# exp re age1 age2 h h2
# 1 A 1 10 30 19.23298 46.67906
reshape(exp[1,], # how your model building info is transformed
varying = list(c("age1","age2"),
c("h","h2")),
v.names = c("age_value","h_value"),
direction = "long")
# exp re time age_value h_value id
# 1.1 A 1 1 10 19.23298 1
# 1.2 A 1 2 30 46.67906 1
Apologies if the "bigger problem" refers to something else and this answer is irrelevant.
With base R, the function sprintf can help us create formulas. And lapply carries out the calculation.
strings <- sprintf("c(%f,%f) ~ c(%f,%f)", exp$age1, exp$age2, exp$h, exp$h2)
lst <- lapply(strings, function(x) {model <- lm(as.formula(x));coef(model)[1] + 100 * coef(model)[2]})
exp$modelh <- unlist(lst)
exp
# exp re age1 age2 h h2 modelh
# 1 A 1 10 30 19.23298 46.67906 68.85506
# 2 A 2 11 31 17.73018 47.55402 66.17050
# 3 A 3 12 32 26.56967 46.69174 84.98486
# 4 A 4 13 33 11.69149 47.74486 61.98766
# 5 A 5 14 34 24.05648 46.10051 82.90167
# 6 A 6 15 35 24.51312 44.85710 89.21053
# 7 A 7 16 36 34.37208 47.85151 113.37493
# 8 A 8 17 37 21.10962 48.40977 74.79483
# 9 A 9 18 38 26.39676 46.74548 90.34187
# 10 A 10 19 39 15.10786 45.38862 75.07002
# 11 B 1 20 40 28.74989 46.44153 100.54666
# 12 B 2 21 41 36.46497 48.64253 125.34773
# 13 B 3 22 42 18.41062 45.74346 81.70062
# 14 B 4 23 43 21.95464 48.77079 81.20773
# 15 B 5 24 44 32.87653 47.47637 115.95097
# 16 B 6 25 45 30.07065 48.44727 101.10688
# 17 B 7 26 46 16.13836 44.90204 84.31080
# 18 B 8 27 47 20.72575 47.14695 87.00805
# 19 B 9 28 48 20.78425 48.94782 84.25406
# 20 B 10 29 49 30.70872 44.65144 128.39416
In the lapply function the expression as.formula(x) is what converts the formulas created in the first line into a format usable by the lm function.
Benchmark
library(dplyr)
library(microbenchmark)
set.seed(100)
big.exp <- data.frame(age1=sample(30, 1e4, T),
age2=sample(30:50, 1e4, T),
h=runif(1e4, 10, 40),
h2= 40 + runif(1e4,4,9))
microbenchmark(
plafort = {strings <- sprintf("c(%f,%f) ~ c(%f,%f)", big.exp$age1, big.exp$age2, big.exp$h, big.exp$h2)
lst <- lapply(strings, function(x) {model <- lm(as.formula(x));coef(model)[1] + 100 * coef(model)[2]})
big.exp$modelh <- unlist(lst)},
akdplyr = {big.exp %>%
rowwise() %>%
do({
age <- c(.$age1, .$age2)
h <- c(.$h, .$h2)
model <- lm(age ~ h)
data.frame(., modelh = coef(model)[1] + 100*coef(model)[2])
} )}
,times=5)
t: seconds
expr min lq mean median uq max neval cld
plafort 13.00605 13.41113 13.92165 13.56927 14.53814 15.08366 5 a
akdplyr 26.95064 27.64240 29.40892 27.86258 31.02955 33.55940 5 b
(Note: I downloaded the newest 1.9.5 devel version of data.table today, but continued to receive errors when trying to test it.
The results also differ fractionally (1.93 x 10^-8). Rounding likely accounts for the difference.)
all.equal(pl, ak)
[1] "Attributes: < Component “class”: Lengths (1, 3) differ (string compare on first 1) >"
[2] "Attributes: < Component “class”: 1 string mismatch >"
[3] "Component “modelh”: Mean relative difference: 1.933893e-08"
Conclusion
The lapply approach seems to perform well compared to dplyr with respect to speed, but it's 5 digit rounding may be an issue. Improvements may be possible. Perhaps using apply after converting to matrix to increase speed and efficiency.

How to sum a function over a specific range in R?

Here are three columns:
indx vehID LocalY
1 2 35.381
2 2 39.381
3 2 43.381
4 2 47.38
5 2 51.381
6 2 55.381
7 2 59.381
8 2 63.379
9 2 67.383
10 2 71.398
11 2 75.401
12 2 79.349
13 2 83.233
14 2 87.043
15 2 90.829
16 2 94.683
17 2 98.611
18 2 102.56
19 2 106.385
20 2 110.079
21 2 113.628
22 2 117.118
23 2 120.6
24 2 124.096
25 2 127.597
26 2 131.099
27 2 134.595
28 2 138.081
29 2 141.578
30 2 145.131
31 2 148.784
32 2 152.559
33 2 156.449
34 2 160.379
35 2 164.277
36 2 168.15
37 2 172.044
38 2 176
39 2 179.959
40 2 183.862
41 2 187.716
42 2 191.561
43 2 195.455
44 2 199.414
45 2 203.417
46 2 207.43
47 2 211.431
48 2 215.428
49 2 219.427
50 2 223.462
51 2 227.422
52 2 231.231
53 2 235.001
54 2 238.909
55 2 242.958
56 2 247.137
57 2 251.247
58 2 255.292
59 2 259.31
60 2 263.372
61 2 267.54
62 2 271.842
63 2 276.256
64 2 280.724
65 2 285.172
I want to create a new column called 'Smoothed Y' by applying the following formula:
D=15, delta (triangular symbol) = 5, i = indx, x_alpha(tk) = LocalY, x_alpha(ti) = smoothed value
I have tried using following code for first calculating Z: (Kernel below means the exp function)
t <- 0.5
dt <- 0.1
delta <- t/dt
d <- 3*delta
indx <- a$indx
for (i in indx) {
initial <- i-d
end <- i+d
k <- c(initial:end)
for (n in k) {
kernel <- exp(-abs(i-n)/delta)
z <- sum(kernel)
}
}
a$z <- z
print (a)
NOTE: 'a' is the imported data frame containing the three columns above.
Although the values of computed function are fine but it doesn't sum up the values in variable z. How can I do summation over the range i-d to i+d for every indx value i?
You can use the convolve function. One thing you need to decide is what to do for indices closer to either end of the array than width of the convolution kernel. One option is to simply use the partial kernel, rescaled so the weights still sum to 1.
smooth<-function(x,D,delta){
z<-exp(-abs(-D:D)/delta)
r<-convolve(x,z,type="open")/convolve(rep(1,length(x)),z,type="open")
r<-head(tail(r,-D),-D)
r
}
With your array as y, the result is this:
> yy<-smooth(y,15,5)
> yy
[1] 50.70804 52.10837 54.04788 56.33651 58.87682 61.61121 64.50214
[8] 67.52265 70.65186 73.87197 77.16683 80.52193 83.92574 87.36969
[15] 90.84850 94.35809 98.15750 101.93317 105.67833 109.38989 113.06889
[22] 116.72139 120.35510 123.97707 127.59293 131.20786 134.82720 138.45720
[29] 142.10507 145.77820 149.48224 153.21934 156.98794 160.78322 164.60057
[36] 168.43699 172.29076 176.15989 180.04104 183.93127 187.83046 191.74004
[43] 195.66223 199.59781 203.54565 207.50342 211.46888 215.44064 219.41764
[50] 223.39908 227.05822 230.66813 234.22890 237.74176 241.20236 244.60039
[57] 247.91917 251.14346 254.25876 257.24891 260.09121 262.74910 265.16057
[64] 267.21598 268.70276
Of course, the problem with this is that the kernel ends up non-centered at the edges. This is a well-known problem, and there are ways to deal with it but it complicates the problem. Plotting the data will show you the effects of this non-centering:
plot(y)
lines(yy)

plot plate layout heatmap in r

I am trying to plot a plate layout heatmap in R. The plate layout is simply 8 (row) x 12 (column) circles (wells). Rows are labeled by alphabets and columns by numbers. Each well need to be filled with some color intensity depends upon a qualitative or quantitative variable. The plate layout look like this:
Here is small dataset:
set.seed (123)
platelay <- data.frame (rown = rep (letters[1:8], 12), coln = rep (1:12, each = 8),
colorvar = rnorm (96, 0.3, 0.2))
rown coln colorvar
1 a 1 0.187904871
2 b 1 0.253964502
3 c 1 0.611741663
4 d 1 0.314101678
5 e 1 0.325857547
6 f 1 0.643012997
7 g 1 0.392183241
8 h 1 0.046987753
9 a 2 0.162629430
10 b 2 0.210867606
11 c 2 0.544816359
12 d 2 0.371962765
13 e 2 0.380154290
14 f 2 0.322136543
15 g 2 0.188831773
16 h 2 0.657382627
17 a 3 0.399570096
18 b 3 -0.093323431
19 c 3 0.440271180
20 d 3 0.205441718
21 e 3 0.086435259
22 f 3 0.256405017
23 g 3 0.094799110
24 h 3 0.154221754
25 a 4 0.174992146
26 b 4 -0.037338662
27 c 4 0.467557409
28 d 4 0.330674624
29 e 4 0.072372613
30 f 4 0.550762984
31 g 4 0.385292844
32 h 4 0.240985703
33 a 5 0.479025132
34 b 5 0.475626698
35 c 5 0.464316216
36 d 5 0.437728051
37 e 5 0.410783531
38 f 5 0.287617658
39 g 5 0.238807467
40 h 5 0.223905800
41 a 6 0.161058604
42 b 6 0.258416544
43 c 6 0.046920730
44 d 6 0.733791193
45 e 6 0.541592400
46 f 6 0.075378283
47 g 6 0.219423033
48 h 6 0.206668929
49 a 7 0.455993024
50 b 7 0.283326187
51 c 7 0.350663703
52 d 7 0.294290649
53 e 7 0.291425909
54 f 7 0.573720457
55 g 7 0.254845803
56 h 7 0.603294121
57 a 8 -0.009750561
58 b 8 0.416922750
59 c 8 0.324770849
60 d 8 0.343188314
61 e 8 0.375927897
62 f 8 0.199535309
63 g 8 0.233358523
64 h 8 0.096284923
65 a 9 0.085641755
66 b 9 0.360705728
67 c 9 0.389641956
68 d 9 0.310600845
69 e 9 0.484453494
70 f 9 0.710016937
71 g 9 0.201793767
72 h 9 -0.161833775
73 a 10 0.501147705
74 b 10 0.158159847
75 c 10 0.162398277
76 d 10 0.505114274
77 e 10 0.243045399
78 f 10 0.055856458
79 g 10 0.336260696
80 h 10 0.272221728
81 a 11 0.301152837
82 b 11 0.377056080
83 c 11 0.225867994
84 d 11 0.428875310
85 e 11 0.255902688
86 f 11 0.366356393
87 g 11 0.519367803
88 h 11 0.387036298
89 a 12 0.234813683
90 b 12 0.529761524
91 c 12 0.498700771
92 d 12 0.409679392
93 e 12 0.347746347
94 f 12 0.174418785
95 g 12 0.572130490
96 h 12 0.179948083
Is there is package that can readily do it ? Is it possible write a function in base or ggplot2 or other package that can achieve this target.
Changing the colour of points of sufficient size, with ggplot2. Note I've implemeted #TylerRinkler's suggestion, but within the call to ggplot. I've also removed the axis labels
ggplot(platelay, aes(y = factor(rown, rev(levels(rown))),x = factor(coln))) +
geom_point(aes(colour = colorvar), size =18) +theme_bw() +
labs(x=NULL, y = NULL)
And a base graphics approach, which will let you have the x axis above the plot
# plot with grey colour dictated by rank, no axes or labels
with(platelay, plot( x=as.numeric(coln), y= rev(as.numeric(rown)), pch= 19, cex = 2,
col = grey(rank(platelay[['colorvar']] ) / nrow(platelay)), axes = F, xlab= '', ylab = ''))
# add circular outline
with(platelay, points( x=as.numeric(coln), y= rev(as.numeric(rown)), pch= 21, cex = 2))
# add the axes
axis(3, at =1:12, labels = 1:12)
axis(2, at = 1:8, labels = LETTERS[8:1])
# the background grid
grid()
# and a box around the outside
box()
And for giggles and Christmas cheer, here is a version using base R plotting functions.
Though there is very possibly a better solution.
dev.new(width=6,height=4)
rown <- unique(platelay$rown)
coln <- unique(platelay$coln)
plot(NA,ylim=c(0.5,length(rown)+0.5),xlim=c(0.5,length(coln)+0.5),ann=FALSE,axes=FALSE)
box()
axis(2,at=seq_along(rown),labels=rev(rown),las=2)
axis(3,at=seq_along(coln),labels=coln)
colgrp <- findInterval(platelay$colorvar,seq(min(platelay$colorvar),max(platelay$colorvar),length.out=10))
colfunc <- colorRampPalette(c("green", "blue"))
collist <- colfunc(length(unique(colgrp)))
symbols(platelay$coln,
factor(platelay$rown, rev(levels(platelay$rown))),
circles=rep(0.2,nrow(platelay)),
add=TRUE,
inches=FALSE,
bg=collist[colgrp])
And the resulting image:
here a solution using ggplot2 solution of #mnel and grid solution
here the code of given solution
d <- ggplot(platelay, aes(y=rown,x=factor(coln))) +
geom_point(aes(colour = colorvar), size =18) + theme_bw()
I use the data generated by ggplot
data <- ggplot_build(d)$data[[1]]
x <- data$x
y <- data$y
grid.newpage()
pushViewport(plotViewport(c(4, 4, 2, 2)),
dataViewport(x, y))
grid hase an ellipse geom
grid.ellipse(x, y,size=20, ar = 2,angle=0,gp =gpar(fill=data$colour))
grid.xaxis(at=c(labels=1:12,ticks=NA),gp=gpar(cex=2))
grid.yaxis(at = 1:8,label=rev(LETTERS[1:8]),gp=gpar(cex=2))
grid.roundrect(gp=gpar(fill=NA))
I add grid :
gpgrid <- gpar(col='grey',lty=2,col='white')
grid.segments(unit(1:12, "native") ,unit(0, "npc"), unit(1:12, "native"),unit(1, "npc"),gp=gpgrid)
grid.segments(unit(0, "npc"), unit(1:8, "native"), unit(1, "npc"),unit(1:8, "native"),gp=gpgrid)
upViewport()
This answer is an add on for #thelatemail answer which explains the platemap for (8,12) = 96 format.
To construct (32,48) = 1536 format, single digits of A-Z is insufficent. Hence one needs to expand letters such as AA, AB, AC, AD ... ZZ and it can be expanded to three or more digits by concatenating LETTERS to the levels variable as below.
levels = c(LETTERS, c(t(outer(LETTERS, LETTERS, paste, sep = "")))))
#thelatemail answer can be improved for letters in double digits for 1536 plate format as below
rown = rep (c(LETTERS, c(t(outer(LETTERS[1], LETTERS[1:6], paste, sep = "")))),
symbols(platelay$coln,
factor(platelay$rown,
levels = rev(c(LETTERS, c(t(outer(LETTERS[1], LETTERS[1:6], paste, sep = "")))))),
circles=rep(0.45,nrow(platelay)),
add=TRUE,
inches=FALSE,
bg=collist[colgrp])
The levels variable inside symbols function should have characters with alphabetically sorted single, then double, then triple ... and so on digits.
For example, if you have below incorrect order of levels inside the symbols function, then it will plot with incorrect color representation.
Incorrect order:
A, AA, AB, AC, AD, AE, AF, B, C,D, ...Z
Correct order:
A, B, C, D, E, .....Z, AA, AB, AC, AD, AE, AF

Resources