I have a large dataframe which is effectively combined output from a nested list using do.call(rbind, nested_list)
The output has the same number of rows for each list element (e.g. 5 rows per list) and I need to add a column which has a unique numeric code for each list (or group). How can I write a loop to reproduce the group column I have included in the example below, e.g. the five rows have a group value == 1, rows 6 to 10 have a group value == 2, rows 11 to 15 have a group value == 3
df <- data.frame("ID" = 1:15)
df$Var_A <- c(1,3,5,7,9,11,13,15,17,19,21,23,25,27,29)
df$Var_B <- c(10,0,0,0,12,12,12,12,0,14,NA_real_,14,16,16,16)
df$Var_C <- c(10,12,14,16,10,12,14,16,10,12,14,16,10,12,14)
df$Var_D <- c(10,12,14,16,10,12,14,16,10,12,14,16,10,12,14)
df$New_A <- c(2,5,5,8,11,14,15,17,20,21,22,23,25,25,27)
df$New_B <- c(10,0,0,0,12,12,12,12,0,14,NA_real_,14,16,16,16)
df$New_C <- c(10,12,14,16,10,12,14,16,10,12,14,16,10,12,14)
df$New_D <- c(10,12,14,16,10,12,14,16,10,12,14,16,10,12,14)
df$Group <- c(1,1,1,1,1,2,2,2,2,2,3,3,3,3,3)
ID Var_A Var_B Var_C Var_D New_A New_B New_C New_D Group
1 1 1 10 10 10 2 10 10 10 1
2 2 3 0 12 12 5 0 12 12 1
3 3 5 0 14 14 5 0 14 14 1
4 4 7 0 16 16 8 0 16 16 1
5 5 9 12 10 10 11 12 10 10 1
6 6 11 12 12 12 14 12 12 12 2
7 7 13 12 14 14 15 12 14 14 2
8 8 15 12 16 16 17 12 16 16 2
9 9 17 0 10 10 20 0 10 10 2
10 10 19 14 12 12 21 14 12 12 2
11 11 21 NA 14 14 22 NA 14 14 3
12 12 23 14 16 16 23 14 16 16 3
13 13 25 16 10 10 25 16 10 10 3
14 14 27 16 12 12 25 16 12 12 3
15 15 29 16 14 14 27 16 14 14 3
You can use the ceiling function:
df <- data.frame("ID" = 1:15)
df$Var_A <- c(1,3,5,7,9,11,13,15,17,19,21,23,25,27,29)
df$Var_B <- c(10,0,0,0,12,12,12,12,0,14,NA_real_,14,16,16,16)
df$Var_C <- c(10,12,14,16,10,12,14,16,10,12,14,16,10,12,14)
df$Var_D <- c(10,12,14,16,10,12,14,16,10,12,14,16,10,12,14)
df$New_A <- c(2,5,5,8,11,14,15,17,20,21,22,23,25,25,27)
df$New_B <- c(10,0,0,0,12,12,12,12,0,14,NA_real_,14,16,16,16)
df$New_C <- c(10,12,14,16,10,12,14,16,10,12,14,16,10,12,14)
df$New_D <- c(10,12,14,16,10,12,14,16,10,12,14,16,10,12,14)
df$Group <- ceiling(as.numeric(df$ID)/5)
df
# ID Var_A Var_B Var_C Var_D New_A New_B New_C New_D Group
# 1 1 1 10 10 10 2 10 10 10 1
# 2 2 3 0 12 12 5 0 12 12 1
# 3 3 5 0 14 14 5 0 14 14 1
# 4 4 7 0 16 16 8 0 16 16 1
# 5 5 9 12 10 10 11 12 10 10 1
# 6 6 11 12 12 12 14 12 12 12 2
# 7 7 13 12 14 14 15 12 14 14 2
# 8 8 15 12 16 16 17 12 16 16 2
# 9 9 17 0 10 10 20 0 10 10 2
# 10 10 19 14 12 12 21 14 12 12 2
# 11 11 21 NA 14 14 22 NA 14 14 3
# 12 12 23 14 16 16 23 14 16 16 3
# 13 13 25 16 10 10 25 16 10 10 3
# 14 14 27 16 12 12 25 16 12 12 3
# 15 15 29 16 14 14 27 16 14 14 3
Without adding an ID or rownums we can do this using nrow and knowledge of the group length.
group_len <- 5
groups <- nrow(df)/group_len
df$group <- rep(1:groups, each = group_len)
# Example:
# rep(1:3, each = 5)
# 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3
I'd use dplyr::mutate with dplyr::row_number:
library(dplyr)
df %>%
mutate(Group=ceiling(row_number() / 5))
Output:
ID Var_A Var_B Var_C Var_D New_A New_B New_C New_D Group
1 1 1 10 10 10 2 10 10 10 1
2 2 3 0 12 12 5 0 12 12 1
3 3 5 0 14 14 5 0 14 14 1
4 4 7 0 16 16 8 0 16 16 1
5 5 9 12 10 10 11 12 10 10 1
6 6 11 12 12 12 14 12 12 12 2
7 7 13 12 14 14 15 12 14 14 2
8 8 15 12 16 16 17 12 16 16 2
9 9 17 0 10 10 20 0 10 10 2
10 10 19 14 12 12 21 14 12 12 2
11 11 21 NA 14 14 22 NA 14 14 3
12 12 23 14 16 16 23 14 16 16 3
13 13 25 16 10 10 25 16 10 10 3
14 14 27 16 12 12 25 16 12 12 3
15 15 29 16 14 14 27 16 14 14 3
An option would be to combine cumsum with rep.
cumsum(rep_len(c(TRUE, rep(FALSE, 4)), nrow(df)))
#cumsum(rep_len(c(TRUE, FALSE, FALSE, FALSE, FALSE), nrow(df))) #Alternative
# [1] 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3
Or making use of auto repeat.
df$Group <- c(TRUE, rep(FALSE, 4))
df$Group <- cumsum(df$Group)
df$Group
# [1] 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3
Or create a sequence with length of nrow and make an integer division %/%.
0:(nrow(df)-1) %/% 5
#seq(0, nrow(df)-1) %/% 5 #Alternative
#(seq_len(nrow(df))-1) %/% 5 #Alternative
# [1] 0 0 0 0 0 1 1 1 1 1 2 2 2 2 2
Or using rep:
rep(1:ceiling(nrow(df)/5), each=5, length.out=nrow(df))
# [1] 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3
You could use the cut function with labels = FALSE to return an integer to use for the group.
n_per_group <- 5
df$group <- cut(x = df$ID, breaks = nrow(df) / n_per_group, labels = FALSE)
df$group
#[1] 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3
We are preparing for a program where 18 people should discuss topics in a way that in each round they form pairs, and then they switch until everyone has talked to everyone. It means 153 discussions, 9 pairs talking parallelly in each round, for 17 rounds. I tried to formulate a matrix showing who should talk to whom in order to avoid the chaos, but could not succeed. For the sake of simplicity everyone is given a number, so the bottom line is, i would need all pairs of combinations of the numbers from 1 to 18 (did that with combn function), but then these pairs should be rearranged for the 17 round so that each number only appears once per round. Any ideas?
Let's first look at a simpler problem with 6 persons. The following matrix lists who (rows) is talking to whom (columns) in which round (entry):
So for example in round 1 (yellow) we have the following pairs:
(1-2), (3-5), (4-6)
For round 2 (green) we would have:
(1-3), (2-6), (4-5)
and so on.
Thus, basically we are looking for a symmetric latin square (i.e. in each row and in each column each entry appears only once, cf. Latin Squares on Wikipedia).
The latin square in the box can be easily generated via an addition table:
inner_ls <- function(k) {
res <- outer(0:(k-1), 0:(k-1), function(i, j) (i + j) %% k)
## replace zeros by k
res[res == 0] <- k
## replace diagonal by NA
diag(res) <- NA
res
}
inner_ls(5)
# [,1] [,2] [,3] [,4] [,5]
# [1,] NA 1 2 3 4
# [2,] 1 NA 3 4 5
# [3,] 2 3 NA 5 1
# [4,] 3 4 5 NA 2
# [5,] 4 5 1 2 NA
So all is left to append the last row (column) with the missing round number:
full_ls <- function(k) {
i_ls <- inner_ls(k - 1)
last_row <- apply(i_ls, 1, function(row) {
rounds <- 1:(k - 1)
rounds[!rounds %in% row]
})
res <- cbind(rbind(i_ls, last_row), c(last_row, NA))
rownames(res) <- colnames(res) <- 1:k
res
}
full_ls(6)
# 1 2 3 4 5 6
# 1 NA 1 2 3 4 5
# 2 1 NA 3 4 5 2
# 3 2 3 NA 5 1 4
# 4 3 4 5 NA 2 1
# 5 4 5 1 2 NA 3
# 6 5 2 4 1 3 NA
With that you get your assignment matrix as follows:
full_ls(18)
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
# 1 NA 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
# 2 1 NA 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 2
# 3 2 3 NA 5 6 7 8 9 10 11 12 13 14 15 16 17 1 4
# 4 3 4 5 NA 7 8 9 10 11 12 13 14 15 16 17 1 2 6
# 5 4 5 6 7 NA 9 10 11 12 13 14 15 16 17 1 2 3 8
# 6 5 6 7 8 9 NA 11 12 13 14 15 16 17 1 2 3 4 10
# 7 6 7 8 9 10 11 NA 13 14 15 16 17 1 2 3 4 5 12
# 8 7 8 9 10 11 12 13 NA 15 16 17 1 2 3 4 5 6 14
# 9 8 9 10 11 12 13 14 15 NA 17 1 2 3 4 5 6 7 16
# 10 9 10 11 12 13 14 15 16 17 NA 2 3 4 5 6 7 8 1
# 11 10 11 12 13 14 15 16 17 1 2 NA 4 5 6 7 8 9 3
# 12 11 12 13 14 15 16 17 1 2 3 4 NA 6 7 8 9 10 5
# 13 12 13 14 15 16 17 1 2 3 4 5 6 NA 8 9 10 11 7
# 14 13 14 15 16 17 1 2 3 4 5 6 7 8 NA 10 11 12 9
# 15 14 15 16 17 1 2 3 4 5 6 7 8 9 10 NA 12 13 11
# 16 15 16 17 1 2 3 4 5 6 7 8 9 10 11 12 NA 14 13
# 17 16 17 1 2 3 4 5 6 7 8 9 10 11 12 13 14 NA 15
# 18 17 2 4 6 8 10 12 14 16 1 3 5 7 9 11 13 15 NA
For example, I have a toy dataset as the one I created below,
a1<-1:10
a2<-11:20
v<-c(1,2,1,NA,2,1,2,1,2,1)
data<-data.frame(a1,a2,v,stringsAsFactors = F)
Then I want to create a new variable y which will be assigned the value a1 or a2 or NA based on the value of variable v. Therefore, the 'y'
should equals to 1 12 3 NA 15 6 17 8 19 10.
I want to generate it with the command similar to the ones I list below, It doesn't work, I guess it's because of the vectorization issue, then how can I fix it?
In reality, I have several as, say 10 and the actual values are characters instead of numeric ones.
data$y[!is.na(data$v)]<-data[,paste0('a',data$v)]
or
data%>%
mutate(y=ifelse(!is.na(v),get(paste0('a',v)),NA))
You could use standard indexing with cbind for that:
dat$y <- dat[cbind(1:nrow(dat), dat$v)]
The result:
> dat
a1 a2 v y
1 1 11 1 1
2 2 12 2 12
3 3 13 1 3
4 4 14 NA NA
5 5 15 2 15
6 6 16 1 6
7 7 17 2 17
8 8 18 1 8
9 9 19 2 19
10 10 20 1 10
(I used dat instead of data, because it is not wise to call a dataframe the same as a function; see ?data)
Only idea that comes to my mind:
data%>%
mutate(y=ifelse(!is.na(v),paste0('a',v),NA)) %>%
mutate(z=ifelse(!is.na(y),(ifelse(y=="a1",get("a1"),get("a2"))),NA))
a1 a2 v y z
1 1 11 1 a1 1
2 2 12 2 a2 12
3 3 13 1 a1 3
4 4 14 NA <NA> NA
5 5 15 2 a2 15
6 6 16 1 a1 6
7 7 17 2 a2 17
8 8 18 1 a1 8
9 9 19 2 a2 19
10 10 20 1 a1 10
or more directly:
data%>%
mutate(y=ifelse(!is.na(v),(ifelse(v==1, get("a1"),get("a2"))),NA))
a1 a2 v y
1 1 11 1 1
2 2 12 2 12
3 3 13 1 3
4 4 14 NA NA
5 5 15 2 15
6 6 16 1 6
7 7 17 2 17
8 8 18 1 8
9 9 19 2 19
10 10 20 1 10
still based on ifelse :(
You need to use a matrix accessor:
# Get the indices of missing values
ind <- which(!is.na(data$v))
# Transform colnames to indices
tab <- structure(match(c("a1", "a2"), names(data)), .Names = c("a1", "a2"))
# Access data with a matrix accessor
data$y[ind] <- data[cbind(ind, tab[paste0('a', data$v[ind])])]
I've a data frame like this
w<-c(0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0)
i would like an index position starting after value 1.
output : NA,NA,NA,NA,NA,1,2,3,4,5,6,7,1,2,3,4,5,1,2,3,4,5,6,7,8,9
ideally applicable to a data frame.
Thanks
edit : w is a data frame,
roughly this function
m<-as.data.frame(w)
m[m!=1] <- row(m)[m!=1]
m
w
1 1
2 2
3 3
4 4
5 5
6 1
7 7
8 8
9 9
10 10
11 11
12 12
13 1
14 14
15 15
16 16
17 17
18 1
19 19
20 20
21 21
22 22
23 23
24 24
25 25
26 26
but with a return to 1 when value 1 is matching.
> m
w wanted
1 1 NA
2 2 NA
3 3 NA
4 4 NA
5 5 NA
6 1 1
7 7 2
8 8 3
9 9 4
10 10 5
11 11 6
12 12 7
13 1 1
14 14 2
15 15 3
16 16 4
17 17 5
18 1 1
19 19 2
20 20 3
21 21 4
22 22 5
23 23 6
24 24 7
25 25 8
26 26 9
Thanks
This assumes that the data is ordered in the way shown in example.
m$wanted <- with(m, ave(w, cumsum(c(TRUE,diff(w) <0)), FUN=seq_along))
m$wanted
#[1] 1 2 3 4 5 1 2 3 4 5 6 7 1 2 3 4 5 1 2 3 4 5 6 7 8 9
For the given data including repeated 1's and non-sequential input, the following works:
m[9,1] <- 100
m[3,1] <- 55
m[14,1] <- 60
m[14,1] <- 60
m[25,1] <- 1
m[19,1] <- 1
m$result <- 1:nrow(m) - which(m$w == 1)[cumsum(m$w == 1)] + 1
But if the data does not start on 1:
m[1,1] <- 2
Then this works:
firstone <- which(m$w == 1)[1]
subindex <- m[firstone:nrow(m),'w'] == 1
m$result <- c(rep(NA,firstone-1),1:length(subindex) - which(subindex)[cumsum(subindex)] + 1)