Repeatedly sample a data.frame with dplyr

I have this data.frame:
x <- rnorm(1000, 3, 2)
groups <- rep(c("GroupA", "GroupB"), each = 500)
df <- data.frame(x, groups)
Using dplyr, I can sample 100 rows of df then calculate the difference between the means of GroupA and GroupB:
df_difference_means <- df %>%
  add_rownames %>%
  filter(rowname %in% sample(1:1000, 100)) %>%
  group_by(groups) %>%
  summarise(mean.x = mean(x)) %>%
  as.data.frame %>%
  summarise(difference.mean.x = mean.x[2] - mean.x[1]) %>%
  mutate(.replicate = 1) %>%
  as.data.frame
difference.mean.x .replicate
1 -0.7258672 1
How, using dplyr, can I repeat this process 100 times and output the results in a data.frame? The resultant data.frame should look like df_difference_means_100:
difference.mean.x <- rnorm(100, -0.72, 2)
.replicate <- 1:100
df_difference_means_100 <- data.frame(difference.mean.x, .replicate)
df_difference_means_100
difference.mean.x .replicate
1 -1.74745341 1
2 -1.60671744 2
3 -0.73216685 3
4 2.53595482 4
5 -2.13187162 5
6 0.42921334 6
7 -1.23031115 7
8 2.66900128 8
9 -0.26267355 9
10 0.97573805 10
11 4.38242693 11
12 -2.09175166 12
13 1.17403184 13
14 0.77553541 14
15 -3.61322099 15
16 1.85055915 16
17 0.06395296 17
18 -1.42459781 18
19 2.90383461 19
20 -1.79359430 20
21 -0.43856161 21
22 1.81433832 22
23 3.15741676 23
24 -1.14643453 24
25 -2.14220126 25
26 -0.32972133 26
27 -0.27037302 27
28 2.20310891 28
29 3.05937838 29
30 0.11348566 30
31 0.09080867 31
32 -2.11559132 32
33 -0.50134470 33
34 0.31628255 34
35 0.96801232 35
36 3.42165046 36
37 2.47089399 37
38 -1.34196912 38
39 -1.11181326 39
40 -3.48664556 40
41 -2.49013457 41
42 3.67952537 42
43 -3.80781570 43
44 0.68793508 44
45 0.05869912 45
46 5.25205269 46
47 -3.00920009 47
48 -2.48109066 48
49 -0.22790952 49
50 1.41952375 50
51 0.79675613 51
52 1.13585093 52
53 0.63646903 53
54 0.56779986 54
55 -1.48099201 55
56 -0.24586261 56
57 3.16075196 57
58 -0.55765459 58
59 1.78498217 59
60 3.38490948 60
61 -0.09666898 61
62 -2.38897557 62
63 -0.50976285 63
64 4.25219676 64
65 -1.57526334 65
66 0.58006652 66
67 0.89549514 67
68 -0.17842015 68
69 -2.57422568 69
70 4.14008849 70
71 -3.48424762 71
72 -3.48788857 72
73 -4.22862573 73
74 1.98098272 74
75 0.73889898 75
76 -2.78759887 76
77 -0.75359051 77
78 -0.24062074 78
79 -0.39441863 79
80 -0.58710463 80
81 -2.95208480 81
82 -0.18225793 82
83 0.98356501 83
84 0.77963590 84
85 -1.21736133 85
86 1.36733389 86
87 -0.41273956 87
88 4.58347146 88
89 0.37946472 89
90 -5.02405002 90
91 -0.09883054 91
92 -1.99874326 92
93 -0.77896124 93
94 -0.05878099 94
95 0.82023492 95
96 2.29944232 96
97 -2.24368129 97
98 1.39608682 98
99 -0.61909894 99
100 0.74170204 100

Here's a possible approach using dplyr combined with replicate and lapply:
# define a custom function:
my_func <- function(df) {
  df %>%
    summarise(difference.mean.x = mean(x[groups == "GroupA"]) -
                                  mean(x[groups == "GroupB"]))
}
# sample repeatedly (100 times) 100 rows of df and store in a list
# apply the custom function to each sample in the list,
# bind rows together and create an index column, all in a "pipe":
replicate(100, sample_n(df, 100), simplify = FALSE) %>%
  lapply(., my_func) %>%
  bind_rows %>%
  mutate(replicate = 1:n())
#Source: local data frame [100 x 2]
#
# difference.mean.x replicate
#1 0.2531246 1
#2 -0.1595892 2
#3 0.1759745 3
#4 -0.1119139 4
#5 -0.1332090 5
#6 -0.8790818 6
#7 0.2170683 7
#8 -0.3484234 8
#9 0.2238635 9
#10 -0.4445486 10
#.. ... ...
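As a side note, the lapply/bind_rows/mutate steps could likely be collapsed with purrr's map_dfr, which row-binds the results and can add the replicate index via its .id argument (a sketch, assuming purrr is installed and my_func is defined as above):
library(dplyr)
library(purrr)
set.seed(123)  # optional, for reproducible samples
replicate(100, sample_n(df, 100), simplify = FALSE) %>%
  map_dfr(my_func, .id = "replicate") %>%
  mutate(replicate = as.integer(replicate))  # .id is returned as character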

Here's a way to put all the logic in a single dplyr pipeline, but it comes at the expense of copying df 100 times at the outset:
set.seed(111)
rep_df <- lapply(1:100, function(rep) {
  df[['replicate']] <- rep
  df
})
rep_df <- do.call(rbind, rep_df)
rep_df %>%
  group_by(replicate) %>%
  sample_n(100) %>%
  group_by(replicate, groups) %>%
  summarize(mean_x = mean(x)) %>%
  summarize(mean_x_group_diff = diff(mean_x)) -> rep_df
str(rep_df)
PS. You could also use a similar pipeline within an lapply call. This is more compact and doesn't copy df 100 times, but may be less readable:
set.seed(111)
output <- lapply(1:100, function(rep) {
  sample_n(df, 100) %>%
    group_by(groups) %>%
    summarize(mean_x = mean(x)) %>%
    summarize(mean_x_group_diff = diff(mean_x)) %>%
    mutate(replicate = rep)
})
rep_df <- do.call(rbind, output)
str(rep_df)

You can create a dplyr-friendly row-repetition function using
row_rep <- function(df, n) {
  df[rep(1:nrow(df), times = n), ]
}
Sourced from https://gist.github.com/mdlincoln/528a53939538b07ade86
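As a sketch of how row_rep could replace the lapply/do.call(rbind, ...) step in the pipeline above (assuming the df from the question and 100 replicates; not a definitive implementation):
library(dplyr)
df %>%
  row_rep(100) %>%                                    # 100 stacked copies of df
  mutate(replicate = rep(1:100, each = nrow(df))) %>% # label each copy
  group_by(replicate) %>%
  sample_n(100) %>%                                   # sample 100 rows per copy
  group_by(replicate, groups) %>%
  summarise(mean_x = mean(x)) %>%
  summarise(difference.mean.x = diff(mean_x))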

Related

Put the first row as the column names of my dataframe with dplyr in R

This is my dataframe:
x <- data.frame(A = c(letters[1:10]), M1 = c(11:20), M2 = c(31:40), M3 = c(41:50))
colnames(x) <- NULL
I want to transpose (t(x)) and use the first column of x as the colnames of the new data frame t(x).
I also need the column names of t(x) to be treated as words/letters (i.e. as character values).
Is it possible to do this with the dplyr package?
Any help is appreciated.
The {janitor} package is good for this and is flexible enough to be able to select any row to push to column names:
library(tidyverse)
library(janitor)
x <- x %>% row_to_names(row_number = 1)
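Since the question also asks for a transpose, one way to combine this with row_to_names is to transpose first, so the original first column (the letters) becomes the first row (a sketch; note that t() on a data frame with a character column returns everything as character):
library(tidyverse)
library(janitor)
x %>%
  t() %>%
  as.data.frame() %>%
  row_to_names(row_number = 1)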
You can do this easily in base R. Just make the first column of x be the row names, then remove the first column and transpose.
row.names(x) <- x[, 1]
x <- t(x[, -1])
x
a b c d e f g h i j
M1 11 12 13 14 15 16 17 18 19 20
M2 31 32 33 34 35 36 37 38 39 40
M3 41 42 43 44 45 46 47 48 49 50
Try this:
library(dplyr)
library(tidyr)
x <- data.frame(
  A = c(letters[1:10]),
  M1 = c(11:20),
  M2 = c(31:40),
  M3 = c(41:50))
x %>%
  gather(key = key, value = value, 2:ncol(x)) %>%
  spread(key = names(x)[1], value = "value")
key a b c d e f g h i j
1 M1 11 12 13 14 15 16 17 18 19 20
2 M2 31 32 33 34 35 36 37 38 39 40
3 M3 41 42 43 44 45 46 47 48 49 50
I think column_to_rownames from the tibble package would be your simplest solution. Use it before you transpose with t.
library(magrittr)
library(tibble)
x %>%
  column_to_rownames("A") %>%
  t
#> a b c d e f g h i j
#> M1 11 12 13 14 15 16 17 18 19 20
#> M2 31 32 33 34 35 36 37 38 39 40
#> M3 41 42 43 44 45 46 47 48 49 50
The "M1", "M2", "M3" above are row names. If you want to keep them inside (as a column), you can add rownames_to_column from the same package.
x %>%
  column_to_rownames("A") %>%
  t %>%
  as.data.frame %>%
  rownames_to_column("key")
#> key a b c d e f g h i j
#> 1 M1 11 12 13 14 15 16 17 18 19 20
#> 2 M2 31 32 33 34 35 36 37 38 39 40
#> 3 M3 41 42 43 44 45 46 47 48 49 50
Essentially,
column_to_rownames("A") converts column "A" in x to row names,
t transposes the data.frame (now a matrix),
as.data.frame reclassifies it back as a data.frame (which is necessary for the next function), and
rownames_to_column("key") converts the row names into a new column called "key".
Using rownames_to_column() from the tibble package
library(magrittr)
library(tibble)
x %>%
  t() %>%
  as.data.frame(stringsAsFactors = FALSE) %>%
  rownames_to_column() %>%
  `colnames<-`(.[1, ]) %>%
  .[-1, ] %>%
  `rownames<-`(NULL)
#> A a b c d e f g h i j
#> 1 M1 11 12 13 14 15 16 17 18 19 20
#> 2 M2 31 32 33 34 35 36 37 38 39 40
#> 3 M3 41 42 43 44 45 46 47 48 49 50
x %>%
  `row.names<-`(.[, 1]) %>%
  t() %>%
  as.data.frame(stringsAsFactors = FALSE) %>%
  .[-1, ]
#> a b c d e f g h i j
#> M1 11 12 13 14 15 16 17 18 19 20
#> M2 31 32 33 34 35 36 37 38 39 40
#> M3 41 42 43 44 45 46 47 48 49 50
Created on 2018-10-06 by the reprex package (v0.2.1.9000)

Apply succinct function over subsets of columns of data frames in a list

I have a list (28 items) of dataframes (12 columns, 8 rows) named "n.l.df".
Statistics need to be applied row-wise on columns 1:3, 4:6, 7:9, 10:12, separately, within each dataframe. I am iterating through the list, calculating stats by doing the following:
library(tidyverse)
avgs <- n.l.df
avgs <- lapply(avgs, function(x) {
  x[1,1] <- mean(as.numeric(x[1,1:3]))
  x[2,1] <- mean(as.numeric(x[2,1:3]))
  x[3,1] <- mean(as.numeric(x[3,1:3]))
  x[4,1] <- mean(as.numeric(x[4,1:3]))
  x[5,1] <- mean(as.numeric(x[5,1:3]))
  x[6,1] <- mean(as.numeric(x[6,1:3]))
  x[7,1] <- mean(as.numeric(x[7,1:3]))
  x[8,1] <- mean(as.numeric(x[8,1:3]))
  x[1,4] <- mean(as.numeric(x[1,4:6]))
  x[2,4] <- mean(as.numeric(x[2,4:6]))
  x[3,4] <- mean(as.numeric(x[3,4:6]))
  x[4,4] <- mean(as.numeric(x[4,4:6]))
  x[5,4] <- mean(as.numeric(x[5,4:6]))
  x[6,4] <- mean(as.numeric(x[6,4:6]))
  x[7,4] <- mean(as.numeric(x[7,4:6]))
  x[8,4] <- mean(as.numeric(x[8,4:6]))
  x[1,7] <- mean(as.numeric(x[1,7:9]))
  x[2,7] <- mean(as.numeric(x[2,7:9]))
  x[3,7] <- mean(as.numeric(x[3,7:9]))
  x[4,7] <- mean(as.numeric(x[4,7:9]))
  x[5,7] <- mean(as.numeric(x[5,7:9]))
  x[6,7] <- mean(as.numeric(x[6,7:9]))
  x[7,7] <- mean(as.numeric(x[7,7:9]))
  x[8,7] <- mean(as.numeric(x[8,7:9]))
  x[1,10] <- mean(as.numeric(x[1,10:12]))
  x[2,10] <- mean(as.numeric(x[2,10:12]))
  x[3,10] <- mean(as.numeric(x[3,10:12]))
  x[4,10] <- mean(as.numeric(x[4,10:12]))
  x[5,10] <- mean(as.numeric(x[5,10:12]))
  x[6,10] <- mean(as.numeric(x[6,10:12]))
  x[7,10] <- mean(as.numeric(x[7,10:12]))
  x[8,10] <- mean(as.numeric(x[8,10:12]))
  return(x)
})
This works nicely, and I can strip out the unnecessary values in columns 2, 3, 5, 6, 8, 9, 11, and 12 when needed. I like that I do not have to gather the data frames into long form; keeping them as a list of data frames is preferable.
Clearly, this way is too repetitive and I think there must be a way to do a nested lapply/apply, but it is beyond my level. How can I simplify and shorten this code?
Thanks.
library(tidyverse)
# For reproducibility
set.seed(100)
# list of 28 random data frames
df_list <- rerun(28, data.frame(replicate(12,sample(1:100,8))))
# Use map to iterate over the list, using rowMeans and select to get means of the selected columns.
map(df_list, ~ mutate(., rm_1_3 = rowMeans(select(., 1:3)),
                      rm_4_6 = rowMeans(select(., 4:6)),
                      rm_7_9 = rowMeans(select(., 7:9)),
                      rm_10_12 = rowMeans(select(., 10:12))))
[[1]]
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 rm_1_3 rm_4_6 rm_7_9 rm_10_12
1 31 55 21 43 35 34 21 13 45 58 46 31 35.66667 37.33333 26.33333 45.00000
2 26 17 36 17 95 86 31 23 36 96 60 73 26.33333 66.00000 30.00000 76.33333
3 55 62 99 76 69 77 33 59 100 65 91 89 72.00000 74.00000 64.00000 81.66667
4 6 86 67 86 87 81 20 21 44 61 96 21 53.00000 84.66667 28.33333 59.33333
5 45 27 52 53 18 58 23 45 24 83 4 35 41.33333 43.00000 30.66667 40.66667
6 46 38 68 27 60 47 27 62 66 74 55 43 50.66667 44.66667 51.66667 57.33333
7 77 72 51 46 94 74 56 91 39 79 69 86 66.66667 71.33333 62.00000 78.00000
8 35 63 70 87 13 83 24 63 31 9 24 37 56.00000 61.00000 39.33333 23.33333
This will give you a list of 28 data frames, with 4 columns of statistics added to each. If you just want the means, then substitute transmute for mutate.
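If the column blocks might change, one possible way to avoid hardcoding the four select() calls is to describe the blocks as a named list and build the new columns with map_dfc (a sketch; the list and column names here are made up):
library(tidyverse)
col_groups <- list(rm_1_3 = 1:3, rm_4_6 = 4:6, rm_7_9 = 7:9, rm_10_12 = 10:12)
map(df_list, function(d) {
  # one rowMeans per block, bound back onto the original data frame
  bind_cols(d, map_dfc(col_groups, ~ rowMeans(d[.x])))
})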

Group data.frame by column and select number of rows based on numeric vector

Let's say I have got a data.frame like the following:
df = read.table(text = 'A B
11 98
11 87
11 999
11 22
12 34
12 34
12 44
12 98
17 77
17 67
17 87
17 66
33 6
33 45
33 12
33 10', header = TRUE)
I need to group df by col A and select only a given number of rows based on the following vector:
n_rows = c(2, 3, 4, 2)
So that the first group will have only 2 rows (no matter their order), the second group 3 rows, etc...
Here is my expected output:
A B
11 98
11 87
12 34
12 34
12 44
17 77
17 67
17 87
17 66
33 6
33 45
I tried to do the trick with dplyr by doing the following:
df %>%
  group_by(A) %>%
  top_n(n = n_rows, wt = B)
but I got the following error:
Error: n must be a scalar integer
Any suggestions?
Thanks.
Another base R option,
do.call(rbind, Map(function(x, y) x[seq(y),], split(df, df$A), n_rows))
which gives,
A B
11.1 11 98
11.2 11 87
12.5 12 34
12.6 12 34
12.7 12 44
17.9 17 77
17.10 17 67
17.11 17 87
17.12 17 66
33.13 33 6
33.14 33 45
Here's a possibility: split the data.frame first, then use map2:
library(dplyr)
library(purrr)
df %>%
  split(.$A) %>%
  map2_dfr(n_rows, head)
# A B
# 1 11 98
# 2 11 87
# 3 12 34
# 4 12 34
# 5 12 44
# 6 17 77
# 7 17 67
# 8 17 87
# 9 17 66
# 10 33 6
# 11 33 45
If order doesn't matter, you don't need top_n; head works just fine (and is faster). Otherwise, just replace head with top_n.
EDIT:
Here is also a tidy solution. It is a few characters longer, but maybe more satisfying, as you don't separate things of the same "kind" but rather work completely inside the data.frame (same output).
df %>%
  nest(B) %>%
  mutate(data = map2(data, n_rows, head)) %>%
  unnest
In base R, you can do something like:
df2 <- data.frame()
for (i in seq_along(unique(df$A))) {
  df2 <- rbind(df2, df[df$A == unique(df$A)[i], ][1:n_rows[i], ])
}
> df2
A B
1 11 98
2 11 87
5 12 34
6 12 34
7 12 44
9 17 77
10 17 67
11 17 87
12 17 66
13 33 6
14 33 45
Here is an option with top_n
library(tidyverse)
df %>%
  split(., .$A) %>%
  map2_df(., n_rows, ~ top_n(., .y, wt = .$B))
If we are not looking for top_n, then another option is slice
df %>%
  group_by(A) %>%
  nest(B) %>%
  mutate(newcol = map2(data, n_rows, ~ .x %>% slice(seq(.y)))) %>%
  select(-data) %>%
  unnest
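In more recent dplyr (>= 1.0.0) a similar result can likely be had without nesting, by filtering on the row number within each group (a sketch; it assumes n_rows is ordered to match the sorted values of A):
df %>%
  group_by(A) %>%
  filter(row_number() <= n_rows[cur_group_id()]) %>%
  ungroup()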

Transpose and rearrange rows in a matrix

I have several files with the following structure:
data <- matrix(c(1:100000), nrow=1000, ncol=100)
The first 500 rows are X coordinates and the final 500 rows are Y coordinates of several object contours. Row # 1 (X) and row 501 (Y) correspond to coordinates of the same object. I need to:
transpose the whole matrix and arrange it so that row 1 becomes column 1 and row 501 becomes column 2, giving paired x, y coordinates in contiguous columns. Row 2 and row 502 should go into columns 1 and 2 below the data of the previous object.
ideally, have an extra column with filename info.
Thanks.
Simpler version:
Transpose the matrix, then create a vector with the column indices and subset with them:
mat <- matrix(1:100, nrow = 10)
mat2 <- t(mat)
cols <- unlist(lapply(1:(ncol(mat2)/2), function(i) c(i, i + ncol(mat2)/2)))
mat3 <- mat2[,cols]
Then just make it a dataframe as below.
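For instance, that last step might look like this (a sketch; the file name is just a placeholder):
out <- as.data.frame(mat3)
out$fname <- "contours_01.txt"  # placeholder file-name column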
You can subset pairs of rows separated by nrow/2, make them a 2-column matrix and then cbind them all:
df <- as.data.frame(do.call(cbind, lapply(1:(nrow(mat)/2), function(i) {
  matrix(mat[c(i, nrow(mat)/2 + i), ], ncol = 2, byrow = TRUE)
})))
df
# V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 fname
# 1 1 6 2 7 3 8 4 9 5 10 a
# 2 11 16 12 17 13 18 14 19 15 20 e
# 3 21 26 22 27 23 28 24 29 25 30 e
# 4 31 36 32 37 33 38 34 39 35 40 o
# 5 41 46 42 47 43 48 44 49 45 50 y
# 6 51 56 52 57 53 58 54 59 55 60 q
# 7 61 66 62 67 63 68 64 69 65 70 v
# 8 71 76 72 77 73 78 74 79 75 80 b
# 9 81 86 82 87 83 88 84 89 85 90 v
# 10 91 96 92 97 93 98 94 99 95 100 y
Then just add the new column as necessary, since it's now a dataframe:
df$fname <- sample(letters, nrow(df), TRUE)
What about
n <- 500
df <- data.frame(col1 = data[1:n, ],
                 col2 = data[(nrow(data) - 500):nrow(data), ],
                 fileinfo = "this is the name of the file...")
Try David's answer, but this way:
n <- 500
df <- data.frame(col1 = data[1:n, ],
                 col2 = data[(nrow(data) - (n - 1)):nrow(data), ],
                 fileinfo = "this is the name of the file...")

Plot a list of variable length vectors in R

I have a list which has multiple vectors (total 80) of various lengths. On the x-axis I want the names of these vectors. On the y-axis I want to plot the values corresponding to each vector. How can I do it in R?
One way to do this is to reshape the data using reshape2::melt or some other method. Please try and make a reproducible example. I think this is the gist of what you are after:
set.seed(4)
mylist <- list(a = sample(1:50, 10, T),
               b = sample(25:40, 15, T),
               c = sample(51:75, 20, T))
mylist
# $a
# [1] 30 1 15 14 41 14 37 46 48 4
#
# $b
# [1] 37 29 26 40 31 32 40 34 40 37 36 40 33 32 35
#
# $c
# [1] 71 63 72 63 64 65 56 72 67 63 75 62 66 60 51 74 57 65 55 73
library(ggplot2)
library(reshape2)
df <- melt(mylist)
head(df)
# value L1
# 1 30 a
# 2 1 a
# 3 15 a
# 4 14 a
# 5 41 a
# 6 14 a
ggplot(df, aes(x = factor(L1), y = value)) + geom_point()
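If you'd rather avoid reshape2, base R's stack() builds a similar long data frame from a named list (a sketch, using the mylist and ggplot2 objects above):
df2 <- stack(mylist)  # columns: values, ind (the list element names)
ggplot(df2, aes(x = ind, y = values)) + geom_point()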
