join columns recursively in R - r

Hello I have a data frame of 245 columns but to add some sets and generate new columns try to do it recursively as follows
cl1<-sample(1:4,10,replace=TRUE)
cl2<-sample(1:4,10,replace=TRUE)
cl3<-sample(1:4,10,replace=TRUE)
cl4<-sample(1:4,10,replace=TRUE)
cl5<-sample(1:4,10,replace=TRUE)
cl6<-sample(1:4,10,replace=TRUE)
dat<-data.frame(cl1,cl2,cl3,cl4,cl5,cl6)
my intention is to add column 1 with column 3 and 5, likewise column 2 with 4 and 6 and in the end obtain a dataframe with two columns
and you should pay me something like that
I have programmed the following code
revisar<- function(a){
todos = list()
i=1
j=3
l=5
k=1
while(i<=2 ){
cl<-a[,i]
cl2<-a[,j]
cl3<-a[,l]
cl[is.na(cl)] <- 0
cl2[is.na(cl2)] <- 0
cl3[is.na(cl3)] <- 0
colu<-cl+cl2+cl3
col<-cbind(colu,colu)
i<-i+1
j<-j+1
l<-l+1
k<-k+1
}
return(col)
}
it turns out that it only returns column 2 repeated twice and I must replicate the same thing to join those 245 columns.7
I would like to know what is failing the example

base R
Literal programming:
with(dat, data.frame(s1 = cl1+cl3+cl5, s2 = cl2+cl4+cl6))
# s1 s2
# 1 7 11
# 2 7 7
# 3 4 11
# 4 4 10
# 5 9 8
# 6 12 5
# 7 7 6
# 8 7 10
# 9 4 9
# 10 6 5
Programmatically,
L <- list(s1 = c(1,3,5), s2 = c(2,4,6))
out <- data.frame(lapply(L, function(z) do.call(rowSums, list(as.matrix(dat[,z])))))
out
# s1 s2
# 1 7 11
# 2 7 7
# 3 4 11
# 4 4 10
# 5 9 8
# 6 12 5
# 7 7 6
# 8 7 10
# 9 4 9
# 10 6 5
dplyr
library(dplyr)
dat %>%
transmute(
s1 = rowSums(cbind(cl1, cl3, cl5)),
s2 = rowSums(cbind(cl2, cl4, cl6))
)
or programmatically using purrr:
purrr::map_dfc(L, ~ rowSums(dat[, .]))
Data
set.seed(42)
# your `dat` above

Here is an alternative general approach:
Here we sum all uneven columns -> s1 and
all even columns -> s2:
library(dplyr)
dat %>%
rowwise() %>%
mutate(s1 = sum(c_across(seq(1,ncol(dat),2)), na.rm = TRUE),
s2 = sum(c_across(seq(2,ncol(dat),2)), na.rm = TRUE))
cl1 cl2 cl3 cl4 cl5 cl6 s1 s2
<int> <int> <int> <int> <int> <int> <int> <int>
1 1 1 3 2 3 2 7 5
2 2 4 1 4 2 3 5 11
3 2 2 2 2 1 3 5 7
4 2 4 4 3 1 4 7 11
5 2 4 4 3 2 2 8 9
6 3 3 3 2 2 2 8 7
7 2 1 1 2 1 4 4 7
8 2 4 1 3 2 3 5 10
9 3 1 1 2 3 4 7 7
10 2 4 1 3 4 4 7 11

Related

Is there an R function which can pass elements of lists as arguments without specifying individual elements

Is there an R function which can pass all the elements of a list as the arguments of a function?
library(tidyr)
a <- c(1,2,3)
b <- c(4,5,6)
c <- c(7,8,9)
d <- list(a,b,c)
crossing(d[[1]],d[[2]],d[[3]])
Instead of specifying d[[1]],d[[2]],d[[3]], i'd like to just include d
Expected result:
> crossing(d[[1]],d[[2]],d[[3]])
# A tibble: 27 x 3
`d[[1]]` `d[[2]]` `d[[3]]`
<dbl> <dbl> <dbl>
1 1 4 7
2 1 4 8
3 1 4 9
4 1 5 7
5 1 5 8
6 1 5 9
7 1 6 7
8 1 6 8
9 1 6 9
10 2 4 7
# ... with 17 more rows
You can use do.call to executes a function call and a list of arguments to be passed to it.
c(d[[1]],d[[2]],d[[3]])
#[1] 1 2 3 4 5 6 7 8 9
do.call("c", d)
#[1] 1 2 3 4 5 6 7 8 9
And for crossing, which needs not duplicated Column names:
library(tidyr)
names(d) <- seq_along(d)
do.call(crossing, d)
## A tibble: 27 x 3
# `1` `2` `3`
# <dbl> <dbl> <dbl>
# 1 1 4 7
# 2 1 4 8
# 3 1 4 9
# 4 1 5 7
# 5 1 5 8
# 6 1 5 9
# 7 1 6 7
# 8 1 6 8
# 9 1 6 9
#10 2 4 7
## … with 17 more rows

numbering duplicated rows in dplyr [duplicate]

This question already has answers here:
Using dplyr to get cumulative count by group
(3 answers)
Closed 5 years ago.
I come to an issue with numbering the duplicated rows in data.frame and could not find a similar post.
Let's say we have a data like this
df <- data.frame(gr=gl(7,2),x=c("a","a","b","b","c","c","a","a","c","c","d","d","a","a"))
> df
gr x
1 1 a
2 1 a
3 2 b
4 2 b
5 3 c
6 3 c
7 4 a
8 4 a
9 5 c
10 5 c
11 6 d
12 6 d
13 7 a
14 7 a
and want to add new column called x_dupl to show that first occurrence of x values is numbered as 1 and second time 2 and third time 3 and so on..
thanks in advance!
The expected output
> df
gr x x_dupl
1 1 a 1
2 1 a 1
3 2 b 1
4 2 b 1
5 3 c 1
6 3 c 1
7 4 a 2
8 4 a 2
9 5 c 2
10 5 c 2
11 6 d 1
12 6 d 1
13 7 a 3
14 7 a 3
Your example data (plus rows where gr = 7 as in your output), and named df1, not df:
df1 <- data.frame(gr = gl(7,2),
x = c("a","a","b","b","c","c","a","a","c","c","d","d","a","a"))
library(dplyr)
df1 %>%
group_by(x) %>%
mutate(x_dupl = dense_rank(gr)) %>%
ungroup()
# A tibble: 14 x 3
gr x x_dupl
<fctr> <fctr> <int>
1 1 a 1
2 1 a 1
3 2 b 1
4 2 b 1
5 3 c 1
6 3 c 1
7 4 a 2
8 4 a 2
9 5 c 2
10 5 c 2
11 6 d 1
12 6 d 1
13 7 a 3
14 7 a 3
A base R solution:
df <- data.frame(gr=gl(7,2),x=c("a","a","b","b","c","c","a","a","c","c","d","d","a","a"))
x <- rle(as.numeric(df$x))
x$values <- ave(x$values, x$values, FUN = seq_along)
df$x_dupl <- inverse.rle(x)
# gr x x_dupl
# 1 1 a 1
# 2 1 a 1
# 3 2 b 1
# 4 2 b 1
# 5 3 c 1
# 6 3 c 1
# 7 4 a 2
# 8 4 a 2
# 9 5 c 2
# 10 5 c 2
# 11 6 d 1
# 12 6 d 1
# 13 7 a 3
# 14 7 a 3

Get all combinations of a variable and their corresponding values in a grouped data set

My data looks like this:
mydata <- data.frame(id = c(1,1,1,2,2,3,3,3,3),
subid = c(1,2,3,1,2,1,2,3,4),
time = c(16, 18, 20, 10, 11, 7, 9, 10, 11))
id subid time
1 1 1 16
2 1 2 18
3 1 3 20
4 2 1 10
5 2 2 11
6 3 1 7
7 3 2 9
8 3 3 10
9 3 4 11
My goal is to transform the data to:
newdata <- data.frame(id = c(1,1,1,2,3,3,3,3,3,3),
subid.1 = c(1,1,2,1,1,1,1,2,2,3),
subid.2 = c(2,3,3,2,2,3,4,3,4,4),
time.1 = c(16,16,18,10,7,7,7,9,9,10),
time.2 = c(18,20,20,11,9,10,11,10,11,11))
id subid.1 subid.2 time.1 time.2
1 1 1 2 16 18
2 1 1 3 16 20
3 1 2 3 18 20
4 2 1 2 10 11
5 3 1 2 7 9
6 3 1 3 7 10
7 3 1 4 7 11
8 3 2 3 9 10
9 3 2 4 9 11
10 3 3 4 10 11
So it's not a simple reshape from long-to-wide procedure: The idea is, within groups defined by id, to take all possible combinations of
subid's and their corresponding time values, and get those into a wide format.
I know I can get all possible combinations using, for example gtools::combinations. The first group consists of 3 rows, so
gtools::combinations(n=3, r=2)
gives me the matrix of the new subid.1 and subid.2 pair for group id==1:
[,1] [,2]
[1,] 1 2
[2,] 1 3
[3,] 2 3
But then I don't know how to proceed (neither to reshape the group with id==1 to this format, nor how to do that separately for each group). Thank you!
with base R:
subset(merge(mydata, mydata, by="id", suffix=c(".1",".2")), subid.1 < subid.2)
# id subid.1 time.1 subid.2 time.2
# 1 1 1 16 2 18
# 2 1 1 16 3 20
# 3 1 2 18 3 20
# 4 2 1 10 2 11
# 5 3 1 7 2 9
# 6 3 1 7 3 10
# 7 3 1 7 4 11
# 8 3 2 9 3 10
# 9 3 2 9 4 11
# 10 3 3 10 4 11
dplyr version:
mydata %>% inner_join(.,.,by="id",suffix=c(".1",".2")) %>% filter(subid.1 < subid.2)
data.table version :
setDT(mydata)
mydata[mydata, on="id", allow.cartesian=TRUE][subid < i.subid]
# id subid time i.subid i.time
# 1: 1 1 16 2 18
# 2: 1 1 16 3 20
# 3: 1 2 18 3 20
# 4: 2 1 10 2 11
# 5: 3 1 7 2 9
# 6: 3 1 7 3 10
# 7: 3 2 9 3 10
# 8: 3 1 7 4 11
# 9: 3 2 9 4 11
# 10: 3 3 10 4 11
or to get your column names right, but it kills the fun of a short solution :).
merge(mydata, mydata, by="id", suffix=c(".1",".2"), allow.cartesian=TRUE)[subid.1 < subid.2]
Forgot to state that I came up with this rather lame 4-step solution:
step1 <- lapply(unique(mydata$id), function(x) {
nrows <- nrow(mydata[which(mydata$id == x), ])
combos <- gtools::combinations(n=nrows, r=2)
return(as.data.frame(cbind(x, combos)))
})
step2 <- dplyr::bind_rows(step1)
step3a <- merge(step2, mydata, by.x = c("x", "V2"), by.y = c("id", "subid"))
step3b <- merge(step3a, mydata, by.x = c("x", "V3"), by.y = c("id", "subid"))
step4 <- step3b[, c(1, 3, 2, 4, 5)]
names(step4) <- c("id", "subid.1", "subid.2", "time.1", "time.2")
It's ugly but works.
Using the data.table-package:
library(data.table)
setDT(mydata)[, .(subid = c(t(combn(subid, 2)))), by = id
][, grp := rep(1:2, each = .N/2), by = id
][mydata, on = .(id, subid), time := time
][, dcast(.SD, id + rowid(grp) ~ grp, value.var = list('subid','time'), sep = '.')]
which gives you:
id grp subid.1 subid.2 time.1 time.2
1: 1 1 1 2 16 18
2: 1 2 1 3 16 20
3: 1 3 2 3 18 20
4: 2 4 1 2 10 11
5: 3 5 1 2 7 9
6: 3 6 1 3 7 10
7: 3 7 1 4 7 11
8: 3 8 2 3 9 10
9: 3 9 2 4 9 11
10: 3 10 3 4 10 11

Creating a new column using for/nested loop in r

Just getting started using R and I need some help in understanding the application of for/nested loop.
StudyID<-c(1:5)
SubjectID<-c(1:5)
df<-data.frame(StudyID=rep(StudyID, each=5), SubjectID=rep(SubjectID, each=1))
How can I create a new column called as ID, which would use the combination of studyID and subjectID to create a unique ID ?
So for this data, unique ID should be from 1:25.
So the final data looks like this:
UniqueID<- c(1:25)
df<-cbind(df,UniqueID)
View(df)
Is there any other way which is faster and more efficient that looping ?
Using the dplyr package, you could do:
library(dplyr)
df$Id = group_indices(df,StudyID,SubjectID)
This returns:
#StudyID SubjectID Id
# 1 1 1
# 1 2 2
# 1 3 3
# 1 4 4
# 1 5 5
# 2 1 6
# 2 2 7
# 2 3 8
# 2 4 9
# 2 5 10
# 3 1 11
# 3 3 13
# 3 4 14
# 3 5 15
# 4 1 16
# 4 2 17
# 4 3 18
# 4 4 19
# 4 5 20
# 5 1 21
# 5 2 22
# 5 3 23
# 5 4 24
# 5 5 25
Another method to achieve that without loading any library (base R) would be this (assuming data frame is sorted based on the two columns):
StudyID<-c(1:5)
SubjectID<-c(1:5)
df<-data.frame(StudyID=rep(StudyID, each=5), SubjectID=rep(SubjectID, each=1))
df$uniqueID <- cumsum(!duplicated(df[1:2]))
or you can use this solution, mentioned in the comments (I prefer this over the first solution):
df$uniqueID <- as.numeric(factor(do.call(paste, df)))
The output would be:
> print(df, row.names = FALSE)
#StudyID SubjectID uniqueID
# 1 1 1
# 1 2 2
# 1 3 3
# 1 4 4
# 1 5 5
# 2 1 6
# 2 2 7
# 2 3 8
# 2 4 9
# 2 5 10
# 3 1 11
# 3 2 12
# 3 3 13
# 3 4 14
# 3 5 15
# 4 1 16
# 4 2 17
# 4 3 18
# 4 4 19
# 4 5 20
# 5 1 21
# 5 2 22
# 5 3 23
# 5 4 24
# 5 5 25
You could go for interaction in base R:
df$uniqueID <- with(df, as.integer(interaction(StudyID,SubjectID)))
For example (this example expresses better what you are after):
set.seed(10)
df <- data.frame(StudyID=sample(5,10,replace = T), SubjectID=rep(1:5,times=2))
df$uniqueID <- with(df, as.integer(interaction(StudyID,SubjectID)))
# StudyID SubjectID uniqueID
# 1 3 1 3
# 2 2 2 6
# 3 3 3 11
# 4 4 4 16
# 5 1 5 17
# 6 2 1 2
# 7 2 2 6
# 8 2 3 10
# 9 4 4 16
# 10 3 5 19

remove i+1th term if reoccuring

Say we have the following data
A <- c(1,2,2,2,3,4,8,6,6,1,2,3,4)
B <- c(1,2,3,4,5,1,2,3,4,5,1,2,3)
data <- data.frame(A,B)
How would one write a function so that for A, if we have the same value in the i+1th position, then the reoccuring row is removed.
Therefore the output should like like
data.frame(c(1,2,3,4,8,6,1,2,3,4), c(1,2,5,1,2,3,5,1,2,3))
My best guess would be using a for statement, however I have no experience in these
You can try
data[c(TRUE, data[-1,1]!= data[-nrow(data), 1]),]
Another option, dplyr-esque:
library(dplyr)
dat1 <- data.frame(A=c(1,2,2,2,3,4,8,6,6,1,2,3,4),
B=c(1,2,3,4,5,1,2,3,4,5,1,2,3))
dat1 %>% filter(A != lag(A, default=FALSE))
## A B
## 1 1 1
## 2 2 2
## 3 3 5
## 4 4 1
## 5 8 2
## 6 6 3
## 7 1 5
## 8 2 1
## 9 3 2
## 10 4 3
using diff, which calculates the pairwise differences with a lag of 1:
data[c( TRUE, diff(data[,1]) != 0), ]
output:
A B
1 1 1
2 2 2
5 3 5
6 4 1
7 8 2
8 6 3
10 1 5
11 2 1
12 3 2
13 4 3
Using rle
A <- c(1,2,2,2,3,4,8,6,6,1,2,3,4)
B <- c(1,2,3,4,5,1,2,3,4,5,1,2,3)
data <- data.frame(A,B)
X <- rle(data$A)
Y <- cumsum(c(1, X$lengths[-length(X$lengths)]))
View(data[Y, ])
row.names A B
1 1 1 1
2 2 2 2
3 5 3 5
4 6 4 1
5 7 8 2
6 8 6 3
7 10 1 5
8 11 2 1
9 12 3 2
10 13 4 3

Resources