Get all combinations of a variable and their corresponding values in a grouped data set - r

My data looks like this:
mydata <- data.frame(id = c(1,1,1,2,2,3,3,3,3),
subid = c(1,2,3,1,2,1,2,3,4),
time = c(16, 18, 20, 10, 11, 7, 9, 10, 11))
id subid time
1 1 1 16
2 1 2 18
3 1 3 20
4 2 1 10
5 2 2 11
6 3 1 7
7 3 2 9
8 3 3 10
9 3 4 11
My goal is to transform the data to:
newdata <- data.frame(id = c(1,1,1,2,3,3,3,3,3,3),
subid.1 = c(1,1,2,1,1,1,1,2,2,3),
subid.2 = c(2,3,3,2,2,3,4,3,4,4),
time.1 = c(16,16,18,10,7,7,7,9,9,10),
time.2 = c(18,20,20,11,9,10,11,10,11,11))
id subid.1 subid.2 time.1 time.2
1 1 1 2 16 18
2 1 1 3 16 20
3 1 2 3 18 20
4 2 1 2 10 11
5 3 1 2 7 9
6 3 1 3 7 10
7 3 1 4 7 11
8 3 2 3 9 10
9 3 2 4 9 11
10 3 3 4 10 11
So it's not a simple reshape from long-to-wide procedure: The idea is, within groups defined by id, to take all possible combinations of
subid's and their corresponding time values, and get those into a wide format.
I know I can get all possible combinations using, for example gtools::combinations. The first group consists of 3 rows, so
gtools::combinations(n=3, r=2)
gives me the matrix of the new subid.1 and subid.2 pair for group id==1:
[,1] [,2]
[1,] 1 2
[2,] 1 3
[3,] 2 3
But then I don't know how to proceed (neither to reshape the group with id==1 to this format, nor how to do that separately for each group). Thank you!

with base R:
subset(merge(mydata, mydata, by="id", suffix=c(".1",".2")), subid.1 < subid.2)
# id subid.1 time.1 subid.2 time.2
# 1 1 1 16 2 18
# 2 1 1 16 3 20
# 3 1 2 18 3 20
# 4 2 1 10 2 11
# 5 3 1 7 2 9
# 6 3 1 7 3 10
# 7 3 1 7 4 11
# 8 3 2 9 3 10
# 9 3 2 9 4 11
# 10 3 3 10 4 11
dplyr version:
mydata %>% inner_join(.,.,by="id",suffix=c(".1",".2")) %>% filter(subid.1 < subid.2)
data.table version :
setDT(mydata)
mydata[mydata, on="id", allow.cartesian=TRUE][subid < i.subid]
# id subid time i.subid i.time
# 1: 1 1 16 2 18
# 2: 1 1 16 3 20
# 3: 1 2 18 3 20
# 4: 2 1 10 2 11
# 5: 3 1 7 2 9
# 6: 3 1 7 3 10
# 7: 3 2 9 3 10
# 8: 3 1 7 4 11
# 9: 3 2 9 4 11
# 10: 3 3 10 4 11
or to get your column names right, but it kills the fun of a short solution :).
merge(mydata, mydata, by="id", suffix=c(".1",".2"), allow.cartesian=TRUE)[subid.1 < subid.2]

Forgot to state that I came up with this rather lame 4-step solution:
step1 <- lapply(unique(mydata$id), function(x) {
nrows <- nrow(mydata[which(mydata$id == x), ])
combos <- gtools::combinations(n=nrows, r=2)
return(as.data.frame(cbind(x, combos)))
})
step2 <- dplyr::bind_rows(step1)
step3a <- merge(step2, mydata, by.x = c("x", "V2"), by.y = c("id", "subid"))
step3b <- merge(step3a, mydata, by.x = c("x", "V3"), by.y = c("id", "subid"))
step4 <- step3b[, c(1, 3, 2, 4, 5)]
names(step4) <- c("id", "subid.1", "subid.2", "time.1", "time.2")
It's ugly but works.

Using the data.table-package:
library(data.table)
setDT(mydata)[, .(subid = c(t(combn(subid, 2)))), by = id
][, grp := rep(1:2, each = .N/2), by = id
][mydata, on = .(id, subid), time := time
][, dcast(.SD, id + rowid(grp) ~ grp, value.var = list('subid','time'), sep = '.')]
which gives you:
id grp subid.1 subid.2 time.1 time.2
1: 1 1 1 2 16 18
2: 1 2 1 3 16 20
3: 1 3 2 3 18 20
4: 2 4 1 2 10 11
5: 3 5 1 2 7 9
6: 3 6 1 3 7 10
7: 3 7 1 4 7 11
8: 3 8 2 3 9 10
9: 3 9 2 4 9 11
10: 3 10 3 4 10 11

Related

join columns recursively in R

Hello, I have a data frame of 245 columns, and to add up some sets of columns and generate new columns I am trying to do it recursively, as follows:
cl1<-sample(1:4,10,replace=TRUE)
cl2<-sample(1:4,10,replace=TRUE)
cl3<-sample(1:4,10,replace=TRUE)
cl4<-sample(1:4,10,replace=TRUE)
cl5<-sample(1:4,10,replace=TRUE)
cl6<-sample(1:4,10,replace=TRUE)
dat<-data.frame(cl1,cl2,cl3,cl4,cl5,cl6)
My intention is to add column 1 to columns 3 and 5, and likewise column 2 to columns 4 and 6, so that in the end I obtain a data frame with two columns,
and it should give me something like that.
I have programmed the following code
# Asker's attempt: for i = 1 and i = 2, adds columns i, i + 2 and i + 4 of `a`
# element-wise (NA counted as 0) and returns a two-column matrix.
# As written, `col` is rebuilt on every pass, so only the last pass survives.
revisar<- function(a){
# `todos` is created here but never read or filled afterwards.
todos = list()
# i, j and l are the positions of the three columns summed in the current pass.
i=1
j=3
l=5
# `k` is incremented in the loop but never used for anything else.
k=1
while(i<=2 ){
cl<-a[,i]
cl2<-a[,j]
cl3<-a[,l]
# Replace missing values with 0 so they do not turn the sum into NA.
cl[is.na(cl)] <- 0
cl2[is.na(cl2)] <- 0
cl3[is.na(cl3)] <- 0
colu<-cl+cl2+cl3
# Overwrites `col` with the current sum bound to itself; nothing from the
# previous pass is kept.
col<-cbind(colu,colu)
# Advance all three column positions by one for the next pass.
i<-i+1
j<-j+1
l<-l+1
k<-k+1
}
# Holds the sum from the final pass (columns 2, 4 and 6) in both columns.
return(col)
}
It turns out that it only returns column 2 repeated twice, and I must replicate the same thing to join those 245 columns.
I would like to know what is failing in the example.
base R
Literal programming:
with(dat, data.frame(s1 = cl1+cl3+cl5, s2 = cl2+cl4+cl6))
# s1 s2
# 1 7 11
# 2 7 7
# 3 4 11
# 4 4 10
# 5 9 8
# 6 12 5
# 7 7 6
# 8 7 10
# 9 4 9
# 10 6 5
Programmatically,
# Map each output column name to the positions of the input columns it sums.
L <- list(s1 = c(1, 3, 5), s2 = c(2, 4, 6))
# For every entry of L, take the selected columns of `dat` as a matrix and
# sum across each row; the named list of sums becomes the result data frame.
out <- data.frame(
  lapply(L, function(z) rowSums(as.matrix(dat[, z])))
)
out
# s1 s2
# 1 7 11
# 2 7 7
# 3 4 11
# 4 4 10
# 5 9 8
# 6 12 5
# 7 7 6
# 8 7 10
# 9 4 9
# 10 6 5
dplyr
library(dplyr)
dat %>%
transmute(
s1 = rowSums(cbind(cl1, cl3, cl5)),
s2 = rowSums(cbind(cl2, cl4, cl6))
)
or programmatically using purrr:
purrr::map_dfc(L, ~ rowSums(dat[, .]))
Data
set.seed(42)
# your `dat` above
Here is an alternative general approach:
Here we sum all odd-numbered columns -> s1 and
all even-numbered columns -> s2:
library(dplyr)
dat %>%
rowwise() %>%
mutate(s1 = sum(c_across(seq(1,ncol(dat),2)), na.rm = TRUE),
s2 = sum(c_across(seq(2,ncol(dat),2)), na.rm = TRUE))
cl1 cl2 cl3 cl4 cl5 cl6 s1 s2
<int> <int> <int> <int> <int> <int> <int> <int>
1 1 1 3 2 3 2 7 5
2 2 4 1 4 2 3 5 11
3 2 2 2 2 1 3 5 7
4 2 4 4 3 1 4 7 11
5 2 4 4 3 2 2 8 9
6 3 3 3 2 2 2 8 7
7 2 1 1 2 1 4 4 7
8 2 4 1 3 2 3 5 10
9 3 1 1 2 3 4 7 7
10 2 4 1 3 4 4 7 11

Keep rows with specific string and the following row

This is my data frame
df <- data.frame(
id = 1:14,
group_id = c(rep(1:2, each = 3), rep(3:4, each = 4)),
type = rep("A", 14), stringsAsFactors = FALSE)
df[c(2,4,8,12),"type"] <- "B"
id group_id type
1 1 1 A
2 2 1 B
3 3 1 A
4 4 2 B
5 5 2 A
6 6 2 A
7 7 3 A
8 8 3 B
9 9 3 A
10 10 3 A
11 11 4 A
12 12 4 B
13 13 4 A
14 14 4 A
I'd like to keep all rows with type B as well as the following row.
I could do...
B <- which(df$type=="B")
afterB <- B+1
df_sel <- df[c(B, afterB), ]
df_sel <- df_sel[order(df_sel$id),]
df_sel
...to get what I want.
id group_id type
2 2 1 B
3 3 1 A
4 4 2 B
5 5 2 A
8 8 3 B
9 9 3 A
12 12 4 B
13 13 4 A
How can this be done in a more generic way.
Another way, very similar to what you do but in one step and without the need to reorder:
# Repeat each "B" row index twice and add c(0, 1) (recycled), which yields
# every "B" index directly followed by the index of the row after it.
# `each` is spelled out instead of relying on partial matching of `e`.
# NOTE: a "B" in the last row produces an index past the end of df, which
# shows up as an all-NA row in the result.
df_sel <- df[rep(which(df$type == "B"), each = 2) + c(0, 1), ]
df_sel
# id group_id type
# 2 2 1 B
# 3 3 1 A
# 4 4 2 B
# 5 5 2 A
# 8 8 3 B
# 9 9 3 A
# 12 12 4 B
# 13 13 4 A
Using lag from dplyr
library(dplyr)
df[df$type == "B" | lag(df$type == "B", default = FALSE), ]
# id group_id type
#2 2 1 B
#3 3 1 A
#4 4 2 B
#5 5 2 A
#8 8 3 B
#9 9 3 A
#12 12 4 B
#13 13 4 A
Using grep gives the row indices of all rows where type contains B (rows); concatenating (c()) these with rows + 1 and using the sorted result to select from df will work.
rows <- grep("B", df[, "type"])
df[sort(c(rows, rows + 1)), ]
gives:
id group_id type
2 2 1 B
3 3 1 A
4 4 2 B
5 5 2 A
8 8 3 B
9 9 3 A
12 12 4 B
13 13 4 A

Creating a new column using for/nested loop in r

Just getting started using R and I need some help in understanding the application of for/nested loop.
StudyID<-c(1:5)
SubjectID<-c(1:5)
df<-data.frame(StudyID=rep(StudyID, each=5), SubjectID=rep(SubjectID, each=1))
How can I create a new column called ID, which would use the combination of StudyID and SubjectID to create a unique ID?
So for this data, unique ID should be from 1:25.
So the final data looks like this:
UniqueID<- c(1:25)
df<-cbind(df,UniqueID)
View(df)
Is there any other way which is faster and more efficient than looping?
Using the dplyr package, you could do:
library(dplyr)
df$Id = group_indices(df,StudyID,SubjectID)
This returns:
#StudyID SubjectID Id
# 1 1 1
# 1 2 2
# 1 3 3
# 1 4 4
# 1 5 5
# 2 1 6
# 2 2 7
# 2 3 8
# 2 4 9
# 2 5 10
# 3 1 11
# 3 2 12
# 3 3 13
# 3 4 14
# 3 5 15
# 4 1 16
# 4 2 17
# 4 3 18
# 4 4 19
# 4 5 20
# 5 1 21
# 5 2 22
# 5 3 23
# 5 4 24
# 5 5 25
Another method to achieve that without loading any library (base R) would be this (assuming data frame is sorted based on the two columns):
# Rebuild the example data: five studies with five subjects each.
StudyID <- 1:5
SubjectID <- 1:5
df <- data.frame(
  StudyID = rep(StudyID, each = 5),
  SubjectID = rep(SubjectID, times = 5)
)
# A row counts as "new" when its (StudyID, SubjectID) pair has not appeared in
# an earlier row; the running total of new rows is used as the ID, so IDs are
# handed out in order of first appearance.
df$uniqueID <- cumsum(!duplicated(df[, c("StudyID", "SubjectID")]))
or you can use this solution, mentioned in the comments (I prefer this over the first solution):
df$uniqueID <- as.numeric(factor(do.call(paste, df)))
The output would be:
> print(df, row.names = FALSE)
#StudyID SubjectID uniqueID
# 1 1 1
# 1 2 2
# 1 3 3
# 1 4 4
# 1 5 5
# 2 1 6
# 2 2 7
# 2 3 8
# 2 4 9
# 2 5 10
# 3 1 11
# 3 2 12
# 3 3 13
# 3 4 14
# 3 5 15
# 4 1 16
# 4 2 17
# 4 3 18
# 4 4 19
# 4 5 20
# 5 1 21
# 5 2 22
# 5 3 23
# 5 4 24
# 5 5 25
You could go for interaction in base R:
df$uniqueID <- with(df, as.integer(interaction(StudyID,SubjectID)))
For example (this example expresses better what you are after):
# Example with repeated and unordered StudyID values (seeded for repeatability).
set.seed(10)
df <- data.frame(
  StudyID = sample(5, 10, replace = TRUE),
  SubjectID = rep(1:5, times = 2)
)
# interaction() builds one factor level per StudyID/SubjectID combination;
# its integer code serves as the ID, so identical pairs share the same value.
df$uniqueID <- as.integer(interaction(df$StudyID, df$SubjectID))
# StudyID SubjectID uniqueID
# 1 3 1 3
# 2 2 2 6
# 3 3 3 11
# 4 4 4 16
# 5 1 5 17
# 6 2 1 2
# 7 2 2 6
# 8 2 3 10
# 9 4 4 16
# 10 3 5 19

calculate number of rows in a dataframe above threshold as a function of other column factors

I would like to find the number of rows for each subject for each day where the value is greater than 11 and output these in a data frame for analysis. The data set is large (5000 rows) so I need a function for this.
subject = c(rep("A", 12), rep("B", 12))
day = c(1,1,1,1,2,2,2,2,3,3,3,3,1,1,1,1,2,2,2,2,3,3,3,3)
value = c(13,14,15,5,12,9,6,14,4,2,1,2,13,14,15,5,12,9,6,14,2,2,2,3)
df = data.frame(subject, day, value)
df
subject day value
1 A 1 13
2 A 1 14
3 A 1 15
4 A 1 5
5 A 2 12
6 A 2 9
7 A 2 6
8 A 2 14
9 A 3 4
10 A 3 2
11 A 3 1
12 A 3 2
13 B 1 13
14 B 1 14
15 B 1 15
16 B 1 5
17 B 2 12
18 B 2 9
19 B 2 6
20 B 2 14
21 B 3 2
22 B 3 2
23 B 3 2
24 B 3 3
The output I would like would be
subject.agg = c(rep("A", 3), rep("B", 3))
day.agg = as.factor(c(1,2,3,1,2,3))
highvalues = (c(3,2,0,3,2,0))
df.agg = data.frame(subject.agg,day.agg,highvalues)
df.agg
subject.agg day.agg highvalues
1 A 1 3
2 A 2 2
3 A 3 0
4 B 1 3
5 B 2 2
6 B 3 0
One option is aggregate from base R
aggregate(cbind(highvalues=value>11)~., df, sum)
Or with data.table
library(data.table)
# Count inside `j` rather than filtering in `i`: subsetting with `value > 11`
# first would drop every subject/day group that has no qualifying row, so the
# zero counts asked for in the question would be missing from the result.
setDT(df)[, .(highvalues = sum(value > 11)), by = .(subject, day)]
# subject day highvalues
#1: A 1 3
#2: A 2 2
#3: A 3 0
#4: B 1 3
#5: B 2 2
#6: B 3 0
You can go the tidyverse way:
library(dplyr)
# Summarise per group instead of filtering first, so that subject/day
# combinations without any value above 11 are kept with a count of 0.
df %>%
  group_by(subject, day) %>%
  summarise(highvalue = sum(value > 11)) %>%
  ungroup()
library(data.table)
dt = setDT(df)
dt[, sum(value>11),by = .(subject,day)]
subject day V1
1: A 1 3
2: A 2 2
3: A 3 0
4: B 1 3
5: B 2 2
6: B 3 0

How to refer to a column of a data.table by its position, in a sum() statement

I've googled the issue as many ways as my brain is capable of and I still can't find the answer. I'm new to R so there are some things that confuse me a little bit.
Let's say I have a data table like this:
x y z 100 200 300
1: 1 1 a 1 1 1
2: 1 1 b 2 3 4
3: 1 2 c 3 5 7
4: 1 2 d 4 7 0
5: 2 1 e 5 9 3
6: 2 1 f 6 1 6
7: 2 2 g 7 3 9
8: 2 2 h 8 5 2
This can be created with this piece of code:
DT = setDT(structure(list(c(1, 1, 1, 1, 2, 2, 2, 2),
c(1, 1, 2, 2, 1, 1, 2, 2),
c("a","b","c","d","e","f","g","h"),
c(1,2,3,4,5,6,7,8),
c(1,3,5,7,9,1,3,5),
c(1,4,7,0,3,6,9,2)),
.Names = c("x", "y", "z", 100, 200, 300), row.names = c(NA, -8L), class = "data.frame"))
However, in my actual code, the last three columns were auto-generated using another function (dcast), so the total number of columns of the data.table is not static. Also, you may notice that the names of those three last columns are numeric, which might be a problem at some point.
What I need is to create one additional column for each "extra" column (the ones right after column "z"). I need the code to work as in this example: first, it creates column "100s", then for each row, it calculates the sum of column "100", considering only the rows with the same combination of x,y as the row in question. And so on for "200s" and "300s". Like this:
x y z 100 200 300 100s 200s 300s
1: 1 1 a 1 1 1 3 4 5
2: 1 1 b 2 3 4 3 4 5
3: 1 2 c 3 5 7 7 12 7
4: 1 2 d 4 7 0 7 12 7
5: 2 1 e 5 9 3 11 10 9
6: 2 1 f 6 1 6 11 10 9
7: 2 2 g 7 3 9 15 8 11
8: 2 2 h 8 5 2 15 8 11
I've tried with several modifications of this idea of a code:
for (i in 3:(dim(DT)[2])) {
DT <- DT[,paste(colnames(DT)[i], "s", sep=""):=sum(i),
by=c("x","y")]
}
This gives me the following result:
x y z 100 200 300 100s 200s 300s
1: 1 1 a 1 1 1 4 5 6
2: 1 1 b 2 3 4 4 5 6
3: 1 2 c 3 5 7 4 5 6
4: 1 2 d 4 7 0 4 5 6
5: 2 1 e 5 9 3 4 5 6
6: 2 1 f 6 1 6 4 5 6
7: 2 2 g 7 3 9 4 5 6
8: 2 2 h 8 5 2 4 5 6
Of course, R is not recognizing the numeric value of i as the position of the column it should consider for the sum; instead it's taking it as a raw number. I can't figure out how to address a specific column by its position, because when it comes to sum(), that "with=FALSE" thing fails to save the day.
Any help will be appreciated.
There is no need for using a for loop in this case to get the desired result. You can update DT by reference with:
DT[, paste0(colnames(DT)[3:5],'s') := lapply(.SD, sum), by = .(x,y)]
which will give you the desired result:
> DT
x y 100 200 300 100s 200s 300s
1: 1 1 1 1 1 3 4 5
2: 1 1 2 3 4 3 4 5
3: 1 2 3 5 7 7 12 7
4: 1 2 4 7 0 7 12 7
5: 2 1 5 9 3 11 10 9
6: 2 1 6 1 6 11 10 9
7: 2 2 7 3 9 15 8 11
8: 2 2 8 5 2 15 8 11
When you don't know exactly which columns to sum, you could use one of the following methods:
# method 1:
DT[, paste0(colnames(DT)[3:ncol(DT)],'s') := lapply(.SD, sum), by = .(x,y)]
# method 2:
DT[, paste0(setdiff(colnames(DT), c('x','y')),'s') := lapply(.SD, sum), by = .(x,y)]
With the updated example, probably the best way to do is:
cols <- setdiff(colnames(DT), c('x','y','z'))
DT[, paste0(cols,'s') := lapply(.SD, sum), by = .(x,y), .SDcols = cols]
which gives:
> DT
x y z 100 200 300 100s 200s 300s
1: 1 1 a 1 1 1 3 4 5
2: 1 1 b 2 3 4 3 4 5
3: 1 2 c 3 5 7 7 12 7
4: 1 2 d 4 7 0 7 12 7
5: 2 1 e 5 9 3 11 10 9
6: 2 1 f 6 1 6 11 10 9
7: 2 2 g 7 3 9 15 8 11
8: 2 2 h 8 5 2 15 8 11

Resources