I have this data.frame:
x <- rnorm(1000, 3, 2)
groups <- rep(c("GroupA", "GroupB"), each = 500)
df <- data.frame(x, groups)
Using dplyr, I can sample 100 rows of df then calculate the difference between the means of GroupA and GroupB:
df_difference_means <- df %>%
add_rownames %>%
filter(rowname %in% sample(1:1000, 100)) %>%
group_by(groups) %>%
summarise(mean.x = mean(x)) %>%
as.data.frame %>%
summarise(difference.mean.x = mean.x[2] - mean.x[1]) %>%
mutate(.replicate = 1) %>%
as.data.frame
difference.mean.x .replicate
1 -0.7258672 1
How, using dplyr, can I repeat this process 100 times and output the results in a data.frame. The resultant data.frame should look like df_difference_means_100:
difference.mean.x <- rnorm(100, -0.72, 2)
.replicate <- 1:100
df_difference_means_100 <- data.frame(difference.mean.x, .replicate)
df_difference_means_100
difference.mean.x .replicate
1 -1.74745341 1
2 -1.60671744 2
3 -0.73216685 3
4 2.53595482 4
5 -2.13187162 5
6 0.42921334 6
7 -1.23031115 7
8 2.66900128 8
9 -0.26267355 9
10 0.97573805 10
11 4.38242693 11
12 -2.09175166 12
13 1.17403184 13
14 0.77553541 14
15 -3.61322099 15
16 1.85055915 16
17 0.06395296 17
18 -1.42459781 18
19 2.90383461 19
20 -1.79359430 20
21 -0.43856161 21
22 1.81433832 22
23 3.15741676 23
24 -1.14643453 24
25 -2.14220126 25
26 -0.32972133 26
27 -0.27037302 27
28 2.20310891 28
29 3.05937838 29
30 0.11348566 30
31 0.09080867 31
32 -2.11559132 32
33 -0.50134470 33
34 0.31628255 34
35 0.96801232 35
36 3.42165046 36
37 2.47089399 37
38 -1.34196912 38
39 -1.11181326 39
40 -3.48664556 40
41 -2.49013457 41
42 3.67952537 42
43 -3.80781570 43
44 0.68793508 44
45 0.05869912 45
46 5.25205269 46
47 -3.00920009 47
48 -2.48109066 48
49 -0.22790952 49
50 1.41952375 50
51 0.79675613 51
52 1.13585093 52
53 0.63646903 53
54 0.56779986 54
55 -1.48099201 55
56 -0.24586261 56
57 3.16075196 57
58 -0.55765459 58
59 1.78498217 59
60 3.38490948 60
61 -0.09666898 61
62 -2.38897557 62
63 -0.50976285 63
64 4.25219676 64
65 -1.57526334 65
66 0.58006652 66
67 0.89549514 67
68 -0.17842015 68
69 -2.57422568 69
70 4.14008849 70
71 -3.48424762 71
72 -3.48788857 72
73 -4.22862573 73
74 1.98098272 74
75 0.73889898 75
76 -2.78759887 76
77 -0.75359051 77
78 -0.24062074 78
79 -0.39441863 79
80 -0.58710463 80
81 -2.95208480 81
82 -0.18225793 82
83 0.98356501 83
84 0.77963590 84
85 -1.21736133 85
86 1.36733389 86
87 -0.41273956 87
88 4.58347146 88
89 0.37946472 89
90 -5.02405002 90
91 -0.09883054 91
92 -1.99874326 92
93 -0.77896124 93
94 -0.05878099 94
95 0.82023492 95
96 2.29944232 96
97 -2.24368129 97
98 1.39608682 98
99 -0.61909894 99
100 0.74170204 100
Here's a possible approach using dplyr combined with replicate and lapply:
# define a custom function:
my_func <- function(df) {
df %>%
summarise(difference.mean.x = mean(x[groups == "GroupA"]) -
mean(x[groups == "GroupB"]))
}
# sample repeatedly (100 times) 100 rows of df and store in a list
# apply the custom function to each sample in the list,
# bind rows together and create an index column, all in a "pipe":
replicate(100, sample_n(df, 100), simplify = FALSE) %>%
lapply(., my_func) %>%
bind_rows %>%
mutate(replicate = 1:n())
#Source: local data frame [100 x 2]
#
# difference.mean.x replicate
#1 0.2531246 1
#2 -0.1595892 2
#3 0.1759745 3
#4 -0.1119139 4
#5 -0.1332090 5
#6 -0.8790818 6
#7 0.2170683 7
#8 -0.3484234 8
#9 0.2238635 9
#10 -0.4445486 10
#.. ... ...
Here's a way to put all the logic in a single dplyr pipeline, but it comes at the expense of copying df 100 times at the outset:
set.seed(111)
rep_df <- lapply(1:100, function(rep) {
df[['replicate']]=rep
df
})
rep_df <- do.call(rbind, rep_df)
rep_df %>%
group_by(replicate) %>%
sample_n(100) %>%
group_by(replicate, groups) %>%
summarize(mean_x = mean(x)) %>%
summarize(mean_x_group_diff = diff(mean_x)) -> rep_df
str(rep_df)
PS. You could also use a similar pipeline within an lapply call. This is more compact and doesn't copy df 100 times, but may be less readable:
set.seed(111)
output <- lapply(1:100, function(rep) {
sample_n(df, 100) %>%
group_by(groups) %>%
summarize(mean_x = mean(x)) %>%
summarize(mean_x_group_diff = diff(mean_x)) %>%
mutate(replicate=rep)
})
rep_df <- do.call(rbind, output)
str(rep_df)
You can create a dplyr-friendly formula using
row_rep <- function(df, n) {
df[rep(1:nrow(df), times = n),]
}
Sourced from https://gist.github.com/mdlincoln/528a53939538b07ade86
Related
This is my dataframe:
x<-data.frame(A = c(letters[1:10]), M1 = c(11:20), M2 = c(31:40), M3 = c(41:50))
colnames(x)<-NULL
I want to tranpose (t(x)) and consider the first column of x as the colnames of the new dataframe t(x).
Also I need them (the colnames of t(x)) to be identified as words/letters (as character right?)
Is it possible to do this with dplyr package?
Any help?
The {janitor} package is good for this and is flexible enough to be able to select any row to push to column names:
library(tidyverse)
library(janitor)
x <- x %>% row_to_names(row_number = 1)
You can do this easily in base R. Just make the first column of x be the row names, then remove the first column and transpose.
row.names(x) = x[,1]
x = t(x[,-1])
x
a b c d e f g h i j
M1 11 12 13 14 15 16 17 18 19 20
M2 31 32 33 34 35 36 37 38 39 40
M3 41 42 43 44 45 46 47 48 49 50
Try this:
library(dplyr)
library(tidyr)
x <- data.frame(
A = c(letters[1:10]),
M1 = c(11:20),
M2 = c(31:40),
M3 = c(41:50))
x %>%
gather(key = key, value = value, 2:ncol(x)) %>%
spread(key = names(x)[1], value = "value")
key a b c d e f g h i j
1 M1 11 12 13 14 15 16 17 18 19 20
2 M2 31 32 33 34 35 36 37 38 39 40
3 M3 41 42 43 44 45 46 47 48 49 50
I think column_to_rownames from the tibble package would be your simplest solution. Use it before you transpose with t.
library(magrittr)
library(tibble)
x %>%
column_to_rownames("A") %>%
t
#> a b c d e f g h i j
#> M1 11 12 13 14 15 16 17 18 19 20
#> M2 31 32 33 34 35 36 37 38 39 40
#> M3 41 42 43 44 45 46 47 48 49 50
The "M1", "M2", "M3" above are row names. If you want to keep them inside (as a column), you can add rownames_to_column from the same package.
x %>%
column_to_rownames("A") %>%
t %>%
as.data.frame %>%
rownames_to_column("key")
#> key a b c d e f g h i j
#> 1 M1 11 12 13 14 15 16 17 18 19 20
#> 2 M2 31 32 33 34 35 36 37 38 39 40
#> 3 M3 41 42 43 44 45 46 47 48 49 50
Essentially,
column_to_rownames("A") converts column "A" in x to row names,
t transposes the data.frame (now a matrix),
as.data.frame reclassifies it back as a data.frame (which is necessary for the next function), and
rownames_to_column("key") converts the row names into a new column called "key".
Using rownames_to_column() from the tibble package
library(magrittr)
library(tibble)
x %>%
t() %>%
as.data.frame(stringsAsFactors = FALSE) %>%
rownames_to_column() %>%
`colnames<-`(.[1,]) %>%
.[-1,] %>%
`rownames<-`(NULL)
#> A a b c d e f g h i j
#> 1 M1 11 12 13 14 15 16 17 18 19 20
#> 2 M2 31 32 33 34 35 36 37 38 39 40
#> 3 M3 41 42 43 44 45 46 47 48 49 50
x %>%
`row.names<-`(.[, 1]) %>%
t() %>%
as.data.frame(stringsAsFactors = FALSE) %>%
.[-1,]
#> a b c d e f g h i j
#> M1 11 12 13 14 15 16 17 18 19 20
#> M2 31 32 33 34 35 36 37 38 39 40
#> M3 41 42 43 44 45 46 47 48 49 50
Created on 2018-10-06 by the reprex package (v0.2.1.9000)
I have a list (28 items) of dataframes (12 columns, 8 rows) named "n.l.df".
Statistics need to be applied row-wise on columns 1:3, 4:6, 7:9, 10:12, separately, within each dataframe. I am iterating through the list, calculating stats by doing the following:
library(tidyverse)
avgs <- n.l.df
avgs <- lapply(avgs, function(x) {
x[1,1] <-mean(as.numeric(x[1,1:3]))
x[2,1] <-mean(as.numeric(x[2,1:3]))
x[3,1] <-mean(as.numeric(x[3,1:3]))
x[4,1] <-mean(as.numeric(x[4,1:3]))
x[5,1] <-mean(as.numeric(x[5,1:3]))
x[6,1] <-mean(as.numeric(x[6,1:3]))
x[7,1] <-mean(as.numeric(x[7,1:3]))
x[8,1] <-mean(as.numeric(x[8,1:3]))
x[1,4] <-mean(as.numeric(x[1,4:6]))
x[2,4] <-mean(as.numeric(x[2,4:6]))
x[3,4] <-mean(as.numeric(x[3,4:6]))
x[4,4] <-mean(as.numeric(x[4,4:6]))
x[5,4] <-mean(as.numeric(x[5,4:6]))
x[6,4] <-mean(as.numeric(x[6,4:6]))
x[7,4] <-mean(as.numeric(x[7,4:6]))
x[8,4] <-mean(as.numeric(x[8,4:6]))
x[1,7] <-mean(as.numeric(x[1,7:9]))
x[2,7] <-mean(as.numeric(x[2,7:9]))
x[3,7] <-mean(as.numeric(x[3,7:9]))
x[4,7] <-mean(as.numeric(x[4,7:9]))
x[5,7] <-mean(as.numeric(x[5,7:9]))
x[6,7] <-mean(as.numeric(x[6,7:9]))
x[7,7] <-mean(as.numeric(x[7,7:9]))
x[8,7] <-mean(as.numeric(x[8,7:9]))
x[1,10] <-mean(as.numeric(x[1,10:12]))
x[2,10] <-mean(as.numeric(x[2,10:12]))
x[3,10] <-mean(as.numeric(x[3,10:12]))
x[4,10] <-mean(as.numeric(x[4,10:12]))
x[5,10] <-mean(as.numeric(x[5,10:12]))
x[6,10] <-mean(as.numeric(x[6,10:12]))
x[7,10] <-mean(as.numeric(x[7,10:12]))
x[8,10] <-mean(as.numeric(x[8,10:12]))
return(x)
})
This works nicely and I can strip out the unnecessary values in columns 2,3,5,6,8,9,11, and 12 when needed. I like that I do not have to gather the dataframes into long form and keeping it as a list of dataframes is preferable.
Clearly, this way is too repetitive and I think there must be a way to do a nested lapply/apply, but it is beyond my level. How can I simplify and shorten this code?
Thanks.
library(tidyverse)
# For reproducibility
set.seed(100)
# list of 28 random data frames
df_list <- rerun(28, data.frame(replicate(12,sample(1:100,8))))
# Use map to iterate over the list, using rowMeans and select to get means of select columns.
map(df_list, ~mutate(., rm_1_3 = rowMeans(select(., 1:3)),
rm_4_6 = rowMeans(select(., 4:6)),
rm_7_9 = rowMeans(select(., 7:9)),
rm_10_12 = rowMeans(select(., 10:12))))
[[1]]
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 rm_1_3 rm_4_6 rm_7_9 rm_10_12
1 31 55 21 43 35 34 21 13 45 58 46 31 35.66667 37.33333 26.33333 45.00000
2 26 17 36 17 95 86 31 23 36 96 60 73 26.33333 66.00000 30.00000 76.33333
3 55 62 99 76 69 77 33 59 100 65 91 89 72.00000 74.00000 64.00000 81.66667
4 6 86 67 86 87 81 20 21 44 61 96 21 53.00000 84.66667 28.33333 59.33333
5 45 27 52 53 18 58 23 45 24 83 4 35 41.33333 43.00000 30.66667 40.66667
6 46 38 68 27 60 47 27 62 66 74 55 43 50.66667 44.66667 51.66667 57.33333
7 77 72 51 46 94 74 56 91 39 79 69 86 66.66667 71.33333 62.00000 78.00000
8 35 63 70 87 13 83 24 63 31 9 24 37 56.00000 61.00000 39.33333 23.33333
This will give you a list of 28 data frames, with 4 columns of statistics added to each. If you just want the means, then substitute transmute for mutate
Let's say I have got a data.frame like the following:
df = read.table(text = 'A B
11 98
11 87
11 999
11 22
12 34
12 34
12 44
12 98
17 77
17 67
17 87
17 66
33 6
33 45
33 12
33 10', header = TRUE)
I need to group df by col A and select only a given number of rows based on the following vector:
n_rows = c(2, 3, 4, 2)
So that the first group will have only 2 rows (no matter their order), the second group 3 rows, etc...
Here my expected output:
A B
11 98
11 87
12 34
12 34
12 44
17 77
17 67
17 87
17 66
33 6
33 45
I tried to do the trick with dplyr by doing the following:
df %>%
group_by(A) %>%
top_n(n = n_rows, wt =B)
but I got the following error:
Error: n must be a scalar integer
Any suggestion?
thanks
Another base R option,
do.call(rbind, Map(function(x, y) x[seq(y),], split(df, df$A), n_rows))
which gives,
A B
11.1 11 98
11.2 11 87
12.5 12 34
12.6 12 34
12.7 12 44
17.9 17 77
17.10 17 67
17.11 17 87
17.12 17 66
33.13 33 6
33.14 33 45
Here's a possibility, splitting first the data.frame then using map2:
library(dplyr)
library(purr)
df %>% split(.$A) %>%
map2_dfr(n_rows,head)
# A B
# 1 11 98
# 2 11 87
# 3 12 34
# 4 12 34
# 5 12 44
# 6 17 77
# 7 17 67
# 8 17 87
# 9 17 66
# 10 33 6
# 11 33 45
If order doesn't matter you don't need top_n, head works just fine (and faster), else just replace head with top_n.
EDIT:
Here is also a tidy solution, a few characters longer but maybe more satisfying as you don't separate things of the same "kind" but rather work completely inside of the data.frame (same output).
df %>% nest(B) %>%
mutate(data = map2(data,n_rows,head)) %>%
unnest
In base R, you can do something like:
df2 <- data.frame()
for (i in seq_along(unique(df$A))) {
df2 <- rbind(df2, df[df$A == unique(df$A)[i], ][1:n_rows[i], ])
}
> df2
A B
1 11 98
2 11 87
5 12 34
6 12 34
7 12 44
9 17 77
10 17 67
11 17 87
12 17 66
13 33 6
14 33 45
Here is an option with top_n
library(tidyverse)
df %>%
split(., .$A) %>%
map2_df(., n_rows, ~ top_n(., .y, wt = .$B))
If we are not looking for top_n, then another option is slice
df %>%
group_by(A) %>%
nest(B) %>%
mutate(newcol = map2(data, n_rows, ~ .x %>% slice(seq(.y)))) %>%
select(-data) %>%
unnest
I have several files with the following structure:
data <- matrix(c(1:100000), nrow=1000, ncol=100)
The first 500 rows are X coordinates and the final 500 rows are Y coordinates of several object contours. Row # 1 (X) and row 501 (Y) correspond to coordinates of the same object. I need to:
transpose the whole matrix and arrange it so now row 1 is column 1 and row 501 is column 2 and have paired x, y coordinates in contiguous columns. Row 2 and row 502 should be in column 1 and column 2 below the data of previous object.
ideally, have an extra column with filename info.
thanks.
Simpler version:
Transpose the matrix, then create a vector with the column indices and subset with them:
mat <- matrix(1:100, nrow = 10)
mat2 <- t(mat)
cols <- unlist(lapply(1:(nrow(mat2)/2), function(i) c(i, i+nrow(mat2)/2)))
mat3 <- mat2[,cols]
Then just make it a dataframe as below.
You can subset pairs of rows separated by nrow/2, make them a 2-column matrix and then cbind them all:
df <- as.data.frame(do.call(cbind, lapply(1:(nrow(mat)/2), function(i) {
matrix(mat[c(i, nrow(mat)/2 + i),], ncol = 2, byrow = TRUE)
})))
df
# V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 fname
# 1 1 6 2 7 3 8 4 9 5 10 a
# 2 11 16 12 17 13 18 14 19 15 20 e
# 3 21 26 22 27 23 28 24 29 25 30 e
# 4 31 36 32 37 33 38 34 39 35 40 o
# 5 41 46 42 47 43 48 44 49 45 50 y
# 6 51 56 52 57 53 58 54 59 55 60 q
# 7 61 66 62 67 63 68 64 69 65 70 v
# 8 71 76 72 77 73 78 74 79 75 80 b
# 9 81 86 82 87 83 88 84 89 85 90 v
# 10 91 96 92 97 93 98 94 99 95 100 y
Then just add the new column as necessary, since it's now a dataframe:
df$fname <- sample(letters, nrow(df), TRUE)
What about
n <- 500
df <- data.frame(col1 = data[1:n, ],
col2 = data[(nrow(data) - 500):nrow(data), ],
fileinfo = "this is the name of the file...")
Try David's answer, but this way:
n <- 500
df <- data.frame(col1 = data[1:n, ],
col2 = data[(nrow(data) - (n-1)):nrow(data), ],
fileinfo = "this is the name of the file...")
I have a list which has multiple vectors (total 80) of various lengths. On the x-axis I want the names of these vectors. On the y-axis I want to plot the values corresponding to each vector. How can I do it in R?
One way to do this is to reshape the data using reshape2::melt or some other method. Please try and make a reproducible example. I think this is the gist of what you are after:
set.seed(4)
mylist <- list(a = sample(1:50, 10, T),
b = sample(25:40, 15, T),
c = sample(51:75, 20, T))
mylist
# $a
# [1] 30 1 15 14 41 14 37 46 48 4
#
# $b
# [1] 37 29 26 40 31 32 40 34 40 37 36 40 33 32 35
#
# $c
# [1] 71 63 72 63 64 65 56 72 67 63 75 62 66 60 51 74 57 65 55 73
library(ggplot2)
library(reshape2)
df <- melt(mylist)
head(df)
# value L1
# 1 30 a
# 2 1 a
# 3 15 a
# 4 14 a
# 5 41 a
# 6 14 a
ggplot(df, aes(x = factor(L1), y = value)) + geom_point()