plot plate layout heatmap in r - r

I am trying to plot a plate layout heatmap in R. The plate layout is simply 8 (row) x 12 (column) circles (wells). Rows are labeled by alphabets and columns by numbers. Each well need to be filled with some color intensity depends upon a qualitative or quantitative variable. The plate layout look like this:
Here is small dataset:
set.seed (123)
platelay <- data.frame (rown = rep (letters[1:8], 12), coln = rep (1:12, each = 8),
colorvar = rnorm (96, 0.3, 0.2))
rown coln colorvar
1 a 1 0.187904871
2 b 1 0.253964502
3 c 1 0.611741663
4 d 1 0.314101678
5 e 1 0.325857547
6 f 1 0.643012997
7 g 1 0.392183241
8 h 1 0.046987753
9 a 2 0.162629430
10 b 2 0.210867606
11 c 2 0.544816359
12 d 2 0.371962765
13 e 2 0.380154290
14 f 2 0.322136543
15 g 2 0.188831773
16 h 2 0.657382627
17 a 3 0.399570096
18 b 3 -0.093323431
19 c 3 0.440271180
20 d 3 0.205441718
21 e 3 0.086435259
22 f 3 0.256405017
23 g 3 0.094799110
24 h 3 0.154221754
25 a 4 0.174992146
26 b 4 -0.037338662
27 c 4 0.467557409
28 d 4 0.330674624
29 e 4 0.072372613
30 f 4 0.550762984
31 g 4 0.385292844
32 h 4 0.240985703
33 a 5 0.479025132
34 b 5 0.475626698
35 c 5 0.464316216
36 d 5 0.437728051
37 e 5 0.410783531
38 f 5 0.287617658
39 g 5 0.238807467
40 h 5 0.223905800
41 a 6 0.161058604
42 b 6 0.258416544
43 c 6 0.046920730
44 d 6 0.733791193
45 e 6 0.541592400
46 f 6 0.075378283
47 g 6 0.219423033
48 h 6 0.206668929
49 a 7 0.455993024
50 b 7 0.283326187
51 c 7 0.350663703
52 d 7 0.294290649
53 e 7 0.291425909
54 f 7 0.573720457
55 g 7 0.254845803
56 h 7 0.603294121
57 a 8 -0.009750561
58 b 8 0.416922750
59 c 8 0.324770849
60 d 8 0.343188314
61 e 8 0.375927897
62 f 8 0.199535309
63 g 8 0.233358523
64 h 8 0.096284923
65 a 9 0.085641755
66 b 9 0.360705728
67 c 9 0.389641956
68 d 9 0.310600845
69 e 9 0.484453494
70 f 9 0.710016937
71 g 9 0.201793767
72 h 9 -0.161833775
73 a 10 0.501147705
74 b 10 0.158159847
75 c 10 0.162398277
76 d 10 0.505114274
77 e 10 0.243045399
78 f 10 0.055856458
79 g 10 0.336260696
80 h 10 0.272221728
81 a 11 0.301152837
82 b 11 0.377056080
83 c 11 0.225867994
84 d 11 0.428875310
85 e 11 0.255902688
86 f 11 0.366356393
87 g 11 0.519367803
88 h 11 0.387036298
89 a 12 0.234813683
90 b 12 0.529761524
91 c 12 0.498700771
92 d 12 0.409679392
93 e 12 0.347746347
94 f 12 0.174418785
95 g 12 0.572130490
96 h 12 0.179948083
Is there is package that can readily do it ? Is it possible write a function in base or ggplot2 or other package that can achieve this target.

Changing the colour of points of sufficient size, with ggplot2. Note I've implemeted #TylerRinkler's suggestion, but within the call to ggplot. I've also removed the axis labels
ggplot(platelay, aes(y = factor(rown, rev(levels(rown))),x = factor(coln))) +
geom_point(aes(colour = colorvar), size =18) +theme_bw() +
labs(x=NULL, y = NULL)
And a base graphics approach, which will let you have the x axis above the plot
# plot with grey colour dictated by rank, no axes or labels
with(platelay, plot( x=as.numeric(coln), y= rev(as.numeric(rown)), pch= 19, cex = 2,
col = grey(rank(platelay[['colorvar']] ) / nrow(platelay)), axes = F, xlab= '', ylab = ''))
# add circular outline
with(platelay, points( x=as.numeric(coln), y= rev(as.numeric(rown)), pch= 21, cex = 2))
# add the axes
axis(3, at =1:12, labels = 1:12)
axis(2, at = 1:8, labels = LETTERS[8:1])
# the background grid
grid()
# and a box around the outside
box()

And for giggles and Christmas cheer, here is a version using base R plotting functions.
Though there is very possibly a better solution.
dev.new(width=6,height=4)
rown <- unique(platelay$rown)
coln <- unique(platelay$coln)
plot(NA,ylim=c(0.5,length(rown)+0.5),xlim=c(0.5,length(coln)+0.5),ann=FALSE,axes=FALSE)
box()
axis(2,at=seq_along(rown),labels=rev(rown),las=2)
axis(3,at=seq_along(coln),labels=coln)
colgrp <- findInterval(platelay$colorvar,seq(min(platelay$colorvar),max(platelay$colorvar),length.out=10))
colfunc <- colorRampPalette(c("green", "blue"))
collist <- colfunc(length(unique(colgrp)))
symbols(platelay$coln,
factor(platelay$rown, rev(levels(platelay$rown))),
circles=rep(0.2,nrow(platelay)),
add=TRUE,
inches=FALSE,
bg=collist[colgrp])
And the resulting image:

here a solution using ggplot2 solution of #mnel and grid solution
here the code of given solution
d <- ggplot(platelay, aes(y=rown,x=factor(coln))) +
geom_point(aes(colour = colorvar), size =18) + theme_bw()
I use the data generated by ggplot
data <- ggplot_build(d)$data[[1]]
x <- data$x
y <- data$y
grid.newpage()
pushViewport(plotViewport(c(4, 4, 2, 2)),
dataViewport(x, y))
grid hase an ellipse geom
grid.ellipse(x, y,size=20, ar = 2,angle=0,gp =gpar(fill=data$colour))
grid.xaxis(at=c(labels=1:12,ticks=NA),gp=gpar(cex=2))
grid.yaxis(at = 1:8,label=rev(LETTERS[1:8]),gp=gpar(cex=2))
grid.roundrect(gp=gpar(fill=NA))
I add grid :
gpgrid <- gpar(col='grey',lty=2,col='white')
grid.segments(unit(1:12, "native") ,unit(0, "npc"), unit(1:12, "native"),unit(1, "npc"),gp=gpgrid)
grid.segments(unit(0, "npc"), unit(1:8, "native"), unit(1, "npc"),unit(1:8, "native"),gp=gpgrid)
upViewport()

This answer is an add on for #thelatemail answer which explains the platemap for (8,12) = 96 format.
To construct (32,48) = 1536 format, single digits of A-Z is insufficent. Hence one needs to expand letters such as AA, AB, AC, AD ... ZZ and it can be expanded to three or more digits by concatenating LETTERS to the levels variable as below.
levels = c(LETTERS, c(t(outer(LETTERS, LETTERS, paste, sep = "")))))
#thelatemail answer can be improved for letters in double digits for 1536 plate format as below
rown = rep (c(LETTERS, c(t(outer(LETTERS[1], LETTERS[1:6], paste, sep = "")))),
symbols(platelay$coln,
factor(platelay$rown,
levels = rev(c(LETTERS, c(t(outer(LETTERS[1], LETTERS[1:6], paste, sep = "")))))),
circles=rep(0.45,nrow(platelay)),
add=TRUE,
inches=FALSE,
bg=collist[colgrp])
The levels variable inside symbols function should have characters with alphabetically sorted single, then double, then triple ... and so on digits.
For example, if you have below incorrect order of levels inside the symbols function, then it will plot with incorrect color representation.
Incorrect order:
A, AA, AB, AC, AD, AE, AF, B, C,D, ...Z
Correct order:
A, B, C, D, E, .....Z, AA, AB, AC, AD, AE, AF

Related

Remove link between time series and add minor date tick on x_axis in ggplot

I was trying to plot a time series composed of weekly averanges. Here is the plot that I have obtained:
[weekly averages A]
[1]: https://i.stack.imgur.com/XMGMs.png
As you can see the time serie do not cover all the years completely, so, when I have got no data ggplot links two subsequent years. I think I have to group the data in some ways, but I do not understand how. Here is the code:
df4 <- data.frame(df$Date, df$A)
colnames(df4)<- c("date","A")
df4$date <- as.Date(df4$date,"%Y/%m/%d")
df4$week_day <- as.numeric(format(df4$date, format='%w'))
df4$endofweek <- df4$date + (6 - df4$week_day)
week_aveA <- df4 %>%
group_by(endofweek) %>%
summarise_all(list(mean=mean), na.rm=TRUE) %>%
na.omit()
g1 = ggplot() +
geom_step(data=week_aveA, aes(group = 1, x = (endofweek), y = (A_mean)), colour="gray25") +
scale_y_continuous(expand = c(0, 0), limits = c(0, 2500)) +
scale_x_date(breaks="year", labels=date_format("%Y")) +
labs(y = expression(A~ ~index),
x = NULL) +
theme(axis.text.x = element_text(size=10),
axis.title = element_text(size=10))
Here an extraction (the former three years) of the dataset:
endofweek date_mean A_mean week_day_mean
1 20/03/2010 17/03/2010 939,2533437 3
2 27/03/2010 24/03/2010 867,3620121 3
3 03/04/2010 31/03/2010 1426,791222 3
4 10/04/2010 07/04/2010 358,5698314 3
5 17/04/2010 13/04/2010 301,1815352 2
6 24/04/2010 21/04/2010 273,4922895 3,333333333
7 01/05/2010 28/04/2010 128,5989633 3
8 08/05/2010 05/05/2010 447,8858881 3
9 15/05/2010 12/05/2010 387,9828891 3
10 22/05/2010 19/05/2010 138,0770986 3
11 29/05/2010 26/05/2010 370,2147933 3
12 05/06/2010 02/06/2010 139,0451791 3
13 12/06/2010 09/06/2010 217,1286356 3
14 19/06/2010 16/06/2010 72,36972411 3
15 26/06/2010 23/06/2010 282,2911902 3
16 03/07/2010 30/06/2010 324,3215936 3
17 10/07/2010 07/07/2010 210,568691 3
18 17/07/2010 14/07/2010 91,76930829 3
19 24/07/2010 21/07/2010 36,4211218 3,666666667
20 31/07/2010 28/07/2010 37,53981103 3
21 07/08/2010 04/08/2010 91,33282642 3
22 14/08/2010 11/08/2010 28,38587352 3
23 21/08/2010 18/08/2010 58,72836406 3
24 28/08/2010 24/08/2010 102,1050612 2,5
25 04/09/2010 02/09/2010 13,45357513 4,5
26 11/09/2010 08/09/2010 51,24017212 3
27 18/09/2010 15/09/2010 159,7395663 3
28 25/09/2010 21/09/2010 62,71136678 2
29 02/04/2011 31/03/2011 1484,661164 4
30 09/04/2011 06/04/2011 656,1827964 3
31 16/04/2011 13/04/2011 315,3097313 3
32 23/04/2011 20/04/2011 293,2904042 3
33 30/04/2011 26/04/2011 255,7517519 2,4
34 07/05/2011 04/05/2011 360,7035289 3
35 14/05/2011 11/05/2011 342,0902797 3
36 21/05/2011 18/05/2011 386,1380421 3
37 28/05/2011 24/05/2011 418,9624807 2,833333333
38 04/06/2011 01/06/2011 112,7568 3
39 11/06/2011 08/06/2011 85,17855619 3,2
40 18/06/2011 15/06/2011 351,8714638 3
41 25/06/2011 22/06/2011 139,7936898 3
42 02/07/2011 29/06/2011 68,57716191 3,6
43 09/07/2011 06/07/2011 62,31823822 3
44 16/07/2011 13/07/2011 80,7328917 3
45 23/07/2011 20/07/2011 114,9475331 3
46 30/07/2011 27/07/2011 90,13118758 3
47 06/08/2011 03/08/2011 43,29372258 3
48 13/08/2011 10/08/2011 49,39935204 3
49 20/08/2011 16/08/2011 133,746822 2
50 03/09/2011 31/08/2011 76,03928942 3
51 10/09/2011 05/09/2011 27,99834637 1
52 24/03/2012 23/03/2012 366,2625797 5,5
53 31/03/2012 28/03/2012 878,8535513 3
54 07/04/2012 04/04/2012 1029,909052 3
55 14/04/2012 11/04/2012 892,9163416 3
56 21/04/2012 18/04/2012 534,8278693 3
57 28/04/2012 25/04/2012 255,1177585 3
58 05/05/2012 02/05/2012 564,5280546 3
59 12/05/2012 09/05/2012 767,5018168 3
60 19/05/2012 16/05/2012 516,2680148 3
61 26/05/2012 23/05/2012 241,2113073 3
62 02/06/2012 30/05/2012 863,6123397 3
63 09/06/2012 06/06/2012 201,2019288 3
64 16/06/2012 13/06/2012 222,9955486 3
65 23/06/2012 20/06/2012 91,14166632 3
66 30/06/2012 27/06/2012 26,93145693 3
67 07/07/2012 04/07/2012 67,32183278 3
68 14/07/2012 11/07/2012 46,25297513 3
69 21/07/2012 18/07/2012 81,34359825 3,666666667
70 28/07/2012 25/07/2012 49,59130851 3
71 04/08/2012 01/08/2012 44,13438077 3
72 11/08/2012 08/08/2012 30,15773151 3
73 18/08/2012 15/08/2012 57,47256772 3
74 25/08/2012 22/08/2012 31,9109555 3
75 01/09/2012 29/08/2012 52,71058484 3
76 08/09/2012 04/09/2012 24,52495229 2
77 06/04/2013 01/04/2013 1344,388042 1,5
78 13/04/2013 10/04/2013 1304,838687 3
79 20/04/2013 17/04/2013 892,620141 3
80 27/04/2013 24/04/2013 400,1720434 3
81 04/05/2013 01/05/2013 424,8473083 3
82 11/05/2013 08/05/2013 269,2380208 3
83 18/05/2013 15/05/2013 238,9993749 3
84 25/05/2013 22/05/2013 128,4096151 3
85 01/06/2013 29/05/2013 158,5576121 3
86 08/06/2013 05/06/2013 175,2036942 3
87 15/06/2013 12/06/2013 79,20250839 3
88 22/06/2013 19/06/2013 126,9065428 3
89 29/06/2013 26/06/2013 133,7480108 3
90 06/07/2013 03/07/2013 218,0092943 3
91 13/07/2013 10/07/2013 54,08460936 3
92 20/07/2013 17/07/2013 91,54285041 3
93 27/07/2013 24/07/2013 44,64567928 3
94 03/08/2013 31/07/2013 229,5067999 3
95 10/08/2013 07/08/2013 49,70729373 3
96 17/08/2013 14/08/2013 53,38618335 3
97 24/08/2013 21/08/2013 217,2800997 3
98 31/08/2013 28/08/2013 49,43590136 3
99 07/09/2013 04/09/2013 64,88783029 3
100 14/09/2013 11/09/2013 11,04300773 3
So at the end I have one mainly question: how can I eliminated the connection between the years? ... and an aesthetic question: how can I add minor ticks on the x_axis? At least one every 6 months, just to make the plot easy to read.
Thanks in advance for any suggestion!
Edit
This is the code I tried with the suggestion, maybe I mistype some part of it.
library(tidyverse)
library(dplyr)
library(lubridate)
df4 <- data.frame(df$Date, df$A)
colnames(df4)<- c("date","A")
df4$date <- as.Date(df4$date,"%Y/%m/%d")
df4$week_day <- as.numeric(format(df4$date, format='%w'))
df4$endofweek <- df4$date + (6 - df4$week_day)
week_aveA <- df4 %>%
group_by(endofweek) %>%
summarise_all(list(mean=mean), na.rm=TRUE) %>%
na.omit()
week_aveA$endofweek <- as.Date(week_aveA$endofweek,"%d/%m/%Y")
week_aveA$A_mean <- as.numeric(gsub(",", ".", week_aveA$A_mean))
week_aveA$week_day_mean <- as.numeric(gsub(",", ".", week_aveA$week_day_mean))
week_aveA$year <- format(week_aveA$endofweek, "%Y")
library(ggplot2)
library(methods)
library(scales)
mylabel <- function(x) {
ifelse(grepl("-07-01$", x), "", format(x, "%Y"))
}
ggplot() +
geom_step(data=week_aveA, aes(x = endofweek, y = A_mean, group = year), colour="gray25") +
scale_y_continuous(expand = c(0, 0), limits = c(0, 2500)) +
scale_x_date(breaks="6 month", labels = mylabel) +
labs(y = expression(A~ ~index),
x = NULL) +
theme(axis.text.x = element_text(size=10),
axis.title = element_text(size=10))
You have to group by year:
Add a variable with the year to your dataset
Map the year variable on the group aesthetic
For the ticks. Increase the number of the breaks. If you want only ticks but not labels you can use a custom function to get rid of unwanted labels, e.g. my approach below set the breaks to "6 month" but replaces the mid-year labels with an empty string:
week_aveA$endofweek <- as.Date(week_aveA$endofweek,"%d/%m/%Y")
week_aveA$A_mean <- as.numeric(gsub(",", ".", week_aveA$A_mean))
week_aveA$week_day_mean <- as.numeric(gsub(",", ".", week_aveA$week_day_mean))
week_aveA$year <- format(week_aveA$endofweek, "%Y")
library(ggplot2)
mylabel <- function(x) {
ifelse(grepl("-07-01$", x), "", format(x, "%Y"))
}
ggplot() +
geom_step(data=week_aveA, aes(x = endofweek, y = A_mean, group = year), colour="gray25") +
scale_y_continuous(expand = c(0, 0), limits = c(0, 2500)) +
scale_x_date(breaks="6 month", labels = mylabel) +
labs(y = expression(A~ ~index),
x = NULL) +
theme(axis.text.x = element_text(size=10),
axis.title = element_text(size=10))

Issue with Density Plot using GGPLOT2 [closed]

Closed. This question needs details or clarity. It is not currently accepting answers.
Want to improve this question? Add details and clarify the problem by editing this post.
Closed 3 years ago.
Improve this question
I want to plot a density plot for 2 groups and below is my code.
library(ggplot2)
#Sample data
dat <- data.frame(Score = c(myfiles2Best$V2, myfilesL2Best$V2)
, Group = rep(c("T", "L")))
ggplot(dat, aes(x = Score)) +
geom_density(aes(color = Group)) + xlim(0,16)
Below is the image of the output.
and when I change the data frame by changing the location of the column as shown below this is how my plot looks like.
dat <- data.frame(Score = c(myfilesL2Best$V2, myfiles2Best$V2)
, Group = rep(c("L", "T")))
Individually, this is how they look like.
dat <- data.frame(Score = c(myfiles2Best$V2)
, Group = rep(c("T"))
ggplot(dat, aes(x = Score)) +
geom_density(aes(color = Group)) + xlim(0,16)
dat <- data.frame(Score = c(myfilesL2Best$V2)
, Group = rep(c("L"))
ggplot(dat, aes(x = Score)) +
geom_density(aes(color = Group)) + xlim(0,16)
This is totally wrong, anything wrong with my setup
rownumber score group
1 8 T
2 8 L
3 7 T
4 7 L
5 9 T
6 8 L
7 8 T
8 7 L
9 8 T
10 8 L
11 8 T
12 9 L
13 8 T
14 8 L
15 8 T
16 8 L
17 9 T
18 7 L
19 9 T
20 7 L
21 8 T
22 10 L
23 8 T
24 8 L
25 9 T
26 8 L
27 8 T
28 8 L
29 9 T
30 8 L
31 7 T
32 10 L
33 8 T
34 10 L
35 8 T
36 7 L
37 8 T
38 7 L
39 11 T
40 9 L
41 8 T
42 9 L
43 8 T
44 10 L
45 8 T
46 9 L
47 8 T
48 8 L
49 8 T
50 7 L
51 9 T
52 8 L
53 8 T
54 9 L
55 8 T
56 7 L
57 7 T
58 9 L
59 10 T
60 8 L
ggplot2::geom_density uses the base R density function to compute density. (see ?geom_density.) This requires a parameter for smoothing, which by default uses a rule named "nrd0", which was picked for "historical and compatibility reasons." (see ?density.) You will get density plots with different appearances depending on this parameter.
From ?bandwidth:
bw.nrd0 implements a rule-of-thumb for choosing the bandwidth of a Gaussian kernel density estimator. It defaults to 0.9 times the minimum of the standard deviation and the interquartile range divided by 1.34 times the sample size to the negative one-fifth power (= Silverman's ‘rule of thumb’, Silverman (1986, page 48, eqn (3.31))) unless the quartiles coincide when a positive result will be guaranteed.
In your example, the two subgroups look like they have different standard deviations and IQRs, so it makes sense to me that they would look different depending on whether that smoothing parameter is calculated for them collectively (as in the case with the combined plot) or individually.
If you want your density plots to correspond between a grouped and individual basis, specify the bandwidth manually:
ggplot(df, aes(x = score)) +
geom_density(aes(color = group), bw = 0.3) +
xlim(0,16)
ggplot(subset(df, group == "L"), aes(x = score)) +
geom_density(aes(color = group), bw = 0.3) +
xlim(0,16)
ggplot(subset(df, group == "T"), aes(x = score)) +
geom_density(aes(color = group), bw = 0.3) +
xlim(0,16)

test for in-tile for Dirichlet tile, using R

So I can take points and use the R libraries deldir or spatstat::dirichlet to find the dirichlet tesselation of those points.
Now I have a point not in the set, and I want to know the indices of the points forming the dirichlet tile which my not-in-set-point is interior to. I can get there by knowing the tile label (or index).
Are there any libraries or methods to do this? I'm thinking spatstat, but not finding something there yet.
The function cut.ppp() can take a point pattern and find which tesselation
tile each point in the pattern belongs to. Below is the code for a simple
example of a point pattern that only contains a single point (0.5, 0.5).
library(spatstat)
dd <- dirichlet(cells)
plot.tess(dd, do.labels = TRUE)
xx <- ppp(.5, .5, window = Window(dd))
plot(xx, add = TRUE, col = "red", cex = 2, pch = 20)
yy <- cut(xx, dd)
yy
#> Marked planar point pattern: 1 point
#> Multitype, with levels =
#> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
#> 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42
#> window: rectangle = [0, 1] x [0, 1] units
marks(yy)
#> [1] 18
#> 42 Levels: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 ... 42
Created on 2018-12-03 by the reprex package (v0.2.1)
If X is a point pattern and B is a tessellation, then
M <- marks(cut(X, B))
returns a factor (vector of categorical values) identifying which tile contains each of the points of X. Alternatively,
M <- tileindex(X$x, X$y, B)
or
f <- as.function(B)
M <- f(X)

r - How to plot alphabets?

I want a scatter plot which looks like letters of the alphabet. How can I do this with a program? I can just enter co-ordinates and make the plot look like an 'A' or 'S' or whatever. But can it be done in an easier manner?
The pch argument of plot will take arguments that can be used to represent these values. From ?points, values 32-127 are the ASCII character set.
With a little messing around, values 65:90 correspond to capital letters, and values 97:122 correspond to lower case letters.
To illustrate this, try
plot(1:10, 1:10, type="p", pch=97:107)
for example.
Here is a plot of all of the latin alphabet
# blank canvas
plot(1:30, 1:30, type="n")
# upper case
points(1:26, 1:26, pch=65:90)
# lower case
points(1:26, 4:29, pch=97:107)
You could even build a mapping between these values for easier reference.
myRefUpper <- setNames(65:90, LETTERS)
myRefUpper
A B C D E F G H I J K L M N O P Q R S T U V W X Y Z
65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
myRefLower <- setNames(97:107, letters)
myRefLower
a b c d e f g h i j k l m n o p q r s t u v w x y z
97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122
This way, you could refer to specific letters by name. For example, try
plot(1:10, 1:10, type="p", pch=c(myRefLower[c("a", "t", "q")], myRefUpper[LETTERS[10:16]]))
There is now an R package on GitHub that provide coordinates for the Hershey fonts that Ben Bolker mentioned: hershey.
For example, we can get coordinates for the start and end of each stroke (line) in the letter A, for the Roman Simplex font (a simple font using minimal straight lines to create letters):
library(hershey)
coord <- subset(hershey, font == 'rowmans' & char == 'A')
coord
#> x y left right width stroke idx glyph font ascii char
#> 93723 0 12 -9 9 18 0 1 34 rowmans 65 A
#> 93724 -8 -9 -9 9 18 0 2 34 rowmans 65 A
#> 93725 0 12 -9 9 18 1 3 34 rowmans 65 A
#> 93726 8 -9 -9 9 18 1 4 34 rowmans 65 A
#> 93727 -5 -2 -9 9 18 2 5 34 rowmans 65 A
#> 93728 5 -2 -9 9 18 2 6 34 rowmans 65 A
We can use the base approx function to interpolate between the start and end points for each stroke, then plot the result, using the graphical parameter pty to set a square aspect ratio:
op <- par(pty = "s")
plot(coord[, 1:2], type = "n")
for (i in unique(coord$stroke)){
points(approx(subset(coord, stroke == i)))
}
To reset the default graphical parameters:
par(op)
For ggplot2 you can do the interpolation first as below:
library(dplyr)
library(ggplot2)
coord2 <- coord %>%
group_by(stroke) %>%
do(as_tibble(approx(.)))
ggplot(coord2, aes(x, y, group = stroke)) +
geom_point() +
coord_equal() +
theme_minimal()
Created on 2022-01-05 by the reprex package (v2.0.1)
Edit
approx won't work for letters with strokes that have x values with different y values, e.g. vertical strokes or strokes that bend back on themselves. For this we can define our own linear interpolation function:
interp <- function(coord, eps = 0.5) {
y <- coord$y
x <- coord$x
n <- length(x)
x2 <- (x[-1] - x[-n])/eps
y2 <- (y[-1] - y[-n])/eps
p <- pmax(abs(x2), abs(y2))
id <- sequence(p)
list(x = c(x[1], rep(x[-n], p) + rep((x[-1] - x[-n])/p, p)*id),
y = c(y[1], rep(y[-n], p) + rep((y[-1] - y[-n])/p, p)*id))
}
library(hershey)
coord <- subset(hershey, font == 'rowmans' & char == 'C')
op <- par(pty = "s")
plot(coord[, 1:2], type = "n")
for (i in unique(coord$stroke)){
points(interp(subset(coord, stroke == i)))
}
Created on 2022-01-05 by the reprex package (v2.0.1)

Automate regression by rows

I have a data.frame
set.seed(100)
exp <- data.frame(exp = c(rep(LETTERS[1:2], each = 10)), re = c(rep(seq(1, 10, 1), 2)), age1 = seq(10, 29, 1), age2 = seq(30, 49, 1),
h = c(runif(20, 10, 40)), h2 = c(40 + runif(20, 4, 9)))
I'd like to make a lm for each row in a data set (h and h2 ~ age1 and age2)
I do it by loop
exp$modelh <- 0
for (i in 1:length(exp$exp)){
age = c(exp$age1[i], exp$age2[i])
h = c(exp$h[i], exp$h2[i])
model = lm(age ~ h)
exp$modelh[i] = coef(model)[1] + 100 * coef(model)[2]
}
and it works well but takes some time with very large files. Will be grateful for the faster solution f.ex. dplyr
Using dplyr, we can try with rowwise() and do. Inside the do, we concatenate (c) the 'age1', 'age2' to create 'age', likewise, we can create 'h', apply lm, extract the coef to create the column 'modelh'.
library(dplyr)
exp %>%
rowwise() %>%
do({
age <- c(.$age1, .$age2)
h <- c(.$h, .$h2)
model <- lm(age ~ h)
data.frame(., modelh = coef(model)[1] + 100*coef(model)[2])
} )
gives the output
# exp re age1 age2 h h2 modelh
#1 A 1 10 30 19.23298 46.67906 68.85506
#2 A 2 11 31 17.73018 47.55402 66.17050
#3 A 3 12 32 26.56967 46.69174 84.98486
#4 A 4 13 33 11.69149 47.74486 61.98766
#5 A 5 14 34 24.05648 46.10051 82.90167
#6 A 6 15 35 24.51312 44.85710 89.21053
#7 A 7 16 36 34.37208 47.85151 113.37492
#8 A 8 17 37 21.10962 48.40977 74.79483
#9 A 9 18 38 26.39676 46.74548 90.34187
#10 A 10 19 39 15.10786 45.38862 75.07002
#11 B 1 20 40 28.74989 46.44153 100.54666
#12 B 2 21 41 36.46497 48.64253 125.34773
#13 B 3 22 42 18.41062 45.74346 81.70062
#14 B 4 23 43 21.95464 48.77079 81.20773
#15 B 5 24 44 32.87653 47.47637 115.95097
#16 B 6 25 45 30.07065 48.44727 101.10688
#17 B 7 26 46 16.13836 44.90204 84.31080
#18 B 8 27 47 20.72575 47.14695 87.00805
#19 B 9 28 48 20.78425 48.94782 84.25406
#20 B 10 29 49 30.70872 44.65144 128.39415
We could do this with the devel version of data.table i.e. v1.9.5. Instructions to install the devel version are here.
We convert the 'data.frame' to 'data.table' (setDT), create a column 'rn' with the option keep.rownames=TRUE. We melt the dataset by specifying the patterns in the measure to convert from 'wide' to 'long' format. Grouped by 'rn', we do the lm and get the coef. This can be assigned as a new column in the original dataset ('exp') while removing the unwanted 'rn' column by assigning (:=) it to NULL.
library(data.table)#v1.9.5+
modelh <- melt(setDT(exp, keep.rownames=TRUE), measure=patterns('^age', '^h'),
value.name=c('age', 'h'))[, {model <- lm(age ~h)
coef(model)[1] + 100 * coef(model)[2]},rn]$V1
exp[, modelh:= modelh][, rn := NULL]
exp
# exp re age1 age2 h h2 modelh
# 1: A 1 10 30 19.23298 46.67906 68.85506
# 2: A 2 11 31 17.73018 47.55402 66.17050
# 3: A 3 12 32 26.56967 46.69174 84.98486
# 4: A 4 13 33 11.69149 47.74486 61.98766
# 5: A 5 14 34 24.05648 46.10051 82.90167
# 6: A 6 15 35 24.51312 44.85710 89.21053
# 7: A 7 16 36 34.37208 47.85151 113.37492
# 8: A 8 17 37 21.10962 48.40977 74.79483
# 9: A 9 18 38 26.39676 46.74548 90.34187
#10: A 10 19 39 15.10786 45.38862 75.07002
#11: B 1 20 40 28.74989 46.44153 100.54666
#12: B 2 21 41 36.46497 48.64253 125.34773
#13: B 3 22 42 18.41062 45.74346 81.70062
#14: B 4 23 43 21.95464 48.77079 81.20773
#15: B 5 24 44 32.87653 47.47637 115.95097
#16: B 6 25 45 30.07065 48.44727 101.10688
#17: B 7 26 46 16.13836 44.90204 84.31080
#18: B 8 27 47 20.72575 47.14695 87.00805
#19: B 9 28 48 20.78425 48.94782 84.25406
#20: B 10 29 49 30.70872 44.65144 128.39415
Great (double) answer from #akrun.
Just a suggestion for your future analysis as you mentioned "it's an example of a bigger problem". Obviously, if you are really interested in building models rowwise then you'll create more and more columns as your age and h observations increase. If you get N observations you'll have to use 2xN columns for those 2 variables only.
I'd suggest to use a long data format in order to increase your rows instead of your columns.
Something like:
exp[1,] # how your first row (model building info) looks like
# exp re age1 age2 h h2
# 1 A 1 10 30 19.23298 46.67906
reshape(exp[1,], # how your model building info is transformed
varying = list(c("age1","age2"),
c("h","h2")),
v.names = c("age_value","h_value"),
direction = "long")
# exp re time age_value h_value id
# 1.1 A 1 1 10 19.23298 1
# 1.2 A 1 2 30 46.67906 1
Apologies if the "bigger problem" refers to something else and this answer is irrelevant.
With base R, the function sprintf can help us create formulas. And lapply carries out the calculation.
strings <- sprintf("c(%f,%f) ~ c(%f,%f)", exp$age1, exp$age2, exp$h, exp$h2)
lst <- lapply(strings, function(x) {model <- lm(as.formula(x));coef(model)[1] + 100 * coef(model)[2]})
exp$modelh <- unlist(lst)
exp
# exp re age1 age2 h h2 modelh
# 1 A 1 10 30 19.23298 46.67906 68.85506
# 2 A 2 11 31 17.73018 47.55402 66.17050
# 3 A 3 12 32 26.56967 46.69174 84.98486
# 4 A 4 13 33 11.69149 47.74486 61.98766
# 5 A 5 14 34 24.05648 46.10051 82.90167
# 6 A 6 15 35 24.51312 44.85710 89.21053
# 7 A 7 16 36 34.37208 47.85151 113.37493
# 8 A 8 17 37 21.10962 48.40977 74.79483
# 9 A 9 18 38 26.39676 46.74548 90.34187
# 10 A 10 19 39 15.10786 45.38862 75.07002
# 11 B 1 20 40 28.74989 46.44153 100.54666
# 12 B 2 21 41 36.46497 48.64253 125.34773
# 13 B 3 22 42 18.41062 45.74346 81.70062
# 14 B 4 23 43 21.95464 48.77079 81.20773
# 15 B 5 24 44 32.87653 47.47637 115.95097
# 16 B 6 25 45 30.07065 48.44727 101.10688
# 17 B 7 26 46 16.13836 44.90204 84.31080
# 18 B 8 27 47 20.72575 47.14695 87.00805
# 19 B 9 28 48 20.78425 48.94782 84.25406
# 20 B 10 29 49 30.70872 44.65144 128.39416
In the lapply function the expression as.formula(x) is what converts the formulas created in the first line into a format usable by the lm function.
Benchmark
library(dplyr)
library(microbenchmark)
set.seed(100)
big.exp <- data.frame(age1=sample(30, 1e4, T),
age2=sample(30:50, 1e4, T),
h=runif(1e4, 10, 40),
h2= 40 + runif(1e4,4,9))
microbenchmark(
plafort = {strings <- sprintf("c(%f,%f) ~ c(%f,%f)", big.exp$age1, big.exp$age2, big.exp$h, big.exp$h2)
lst <- lapply(strings, function(x) {model <- lm(as.formula(x));coef(model)[1] + 100 * coef(model)[2]})
big.exp$modelh <- unlist(lst)},
akdplyr = {big.exp %>%
rowwise() %>%
do({
age <- c(.$age1, .$age2)
h <- c(.$h, .$h2)
model <- lm(age ~ h)
data.frame(., modelh = coef(model)[1] + 100*coef(model)[2])
} )}
,times=5)
t: seconds
expr min lq mean median uq max neval cld
plafort 13.00605 13.41113 13.92165 13.56927 14.53814 15.08366 5 a
akdplyr 26.95064 27.64240 29.40892 27.86258 31.02955 33.55940 5 b
(Note: I downloaded the newest 1.9.5 devel version of data.table today, but continued to receive errors when trying to test it.
The results also differ fractionally (1.93 x 10^-8). Rounding likely accounts for the difference.)
all.equal(pl, ak)
[1] "Attributes: < Component “class”: Lengths (1, 3) differ (string compare on first 1) >"
[2] "Attributes: < Component “class”: 1 string mismatch >"
[3] "Component “modelh”: Mean relative difference: 1.933893e-08"
Conclusion
The lapply approach seems to perform well compared to dplyr with respect to speed, but it's 5 digit rounding may be an issue. Improvements may be possible. Perhaps using apply after converting to matrix to increase speed and efficiency.

Resources