xx is a sample data. it contains variables dep1,dep2,dep3,bet1,bet2,bet3. I want to select all possible 2 columns combinations but not the ones with the same name (except,number) . In this examples there are
9 such combos {dep1:bet1,dep1:bet2,dep1:bet3,dep2:bet1...........}
Below is code which I want to run for all combinations ( I did it just for one) also in last line
I added a code to keep track which variables were included in calculations. I believe the regex will help
to understand. help appreciated !
xx<-data.frame(id=1:10,
category=c(rep("A",5),rep("B",5)),
dep1=sample(1:5,10,replace = T),
dep2=sample(1:5,10,replace = T),
dep3=sample(1:5,10,replace = T),
bet1=sample(1:5,10,replace = T),
bet2=sample(1:5,10,replace = T),
bet3=sample(1:5,10,replace = T))
xx%>%select(2,dep1,bet1)%>%
mutate(vdep=if_else(dep1>3,1,0),
vbet=if_else(bet1>3,1,0))%>%
group_by(category)%>%
summarise(vdep=mean(vdep),
vbet=mean(vbet))%>%ungroup()%>%
gather(variable,value,-category)%>%
mutate(variable=as.factor(variable))%>%
unite(variable,category,col = "new")%>%
spread(new,value)%>%
mutate(first="dep1",second="bet1")
If I understand you correctly, something like the following should do it:
# the data
xx<-data.frame(id=1:10,
category=c(rep("A",5),rep("B",5)),
dep1=sample(1:5,10,replace = T),
dep2=sample(1:5,10,replace = T),
dep3=sample(1:5,10,replace = T),
bet1=sample(1:5,10,replace = T),
bet2=sample(1:5,10,replace = T),
bet3=sample(1:5,10,replace = T))
# Getting the column names with "dep" or "bet"
cols = names(xx)[grepl("dep|bet", names(xx))]
deps = cols[grepl("dep", cols)]
bets = cols[grepl("bet", cols)]
# Getting all possible combinations of these columns
comb = expand.grid(deps, bets)
comb
# Var1 Var2
# 1 dep1 bet1
# 2 dep2 bet1
# 3 dep3 bet1
# 4 dep1 bet2
# 5 dep2 bet2
# 6 dep3 bet2
# 7 dep1 bet3
# 8 dep2 bet3
# 9 dep3 bet3
# Transposing the dataframe containing these combinations, so that
# we can directly use sapply / lapply on the columns latter
comb = data.frame(t(comb), stringsAsFactors = FALSE)
# For each combination, subset the dataframe xx
result = sapply(comb, function(x){
xx[, x]
}, simplify = FALSE)
result
# $X1
# dep1 bet1
# 1 1 5
# 2 1 5
# 3 2 2
# 4 2 2
# 5 1 5
# 6 3 3
# 7 1 1
# 8 2 2
# 9 3 2
# 10 1 5
#
# $X2
# dep2 bet1
# 1 1 5
# 2 2 5
# 3 4 2
# 4 5 2
# 5 1 5
# 6 5 3
# 7 2 1
# 8 1 2
# 9 4 2
# 10 4 5
#
# $X3
# dep3 bet1
# 1 3 5
# 2 2 5
# 3 4 2
# 4 3 2
# 5 3 5
# 6 2 3
# 7 1 1
# 8 4 2
# 9 5 2
# 10 5 5
#
# $X4
# dep1 bet2
# 1 1 5
# 2 1 1
# 3 2 1
# 4 2 2
# 5 1 2
# 6 3 2
# 7 1 3
# 8 2 3
# 9 3 5
# 10 1 1
#
# $X5
# dep2 bet2
# 1 1 5
# 2 2 1
# 3 4 1
# 4 5 2
# 5 1 2
# 6 5 2
# 7 2 3
# 8 1 3
# 9 4 5
# 10 4 1
#
# $X6
# dep3 bet2
# 1 3 5
# 2 2 1
# 3 4 1
# 4 3 2
# 5 3 2
# 6 2 2
# 7 1 3
# 8 4 3
# 9 5 5
# 10 5 1
#
# $X7
# dep1 bet3
# 1 1 3
# 2 1 2
# 3 2 5
# 4 2 1
# 5 1 3
# 6 3 2
# 7 1 4
# 8 2 1
# 9 3 1
# 10 1 3
#
# $X8
# dep2 bet3
# 1 1 3
# 2 2 2
# 3 4 5
# 4 5 1
# 5 1 3
# 6 5 2
# 7 2 4
# 8 1 1
# 9 4 1
# 10 4 3
#
# $X9
# dep3 bet3
# 1 3 3
# 2 2 2
# 3 4 5
# 4 3 1
# 5 3 3
# 6 2 2
# 7 1 4
# 8 4 1
# 9 5 1
# 10 5 3
Related
Hello I have a data frame of 245 columns but to add some sets and generate new columns try to do it recursively as follows
cl1<-sample(1:4,10,replace=TRUE)
cl2<-sample(1:4,10,replace=TRUE)
cl3<-sample(1:4,10,replace=TRUE)
cl4<-sample(1:4,10,replace=TRUE)
cl5<-sample(1:4,10,replace=TRUE)
cl6<-sample(1:4,10,replace=TRUE)
dat<-data.frame(cl1,cl2,cl3,cl4,cl5,cl6)
my intention is to add column 1 with column 3 and 5, likewise column 2 with 4 and 6 and in the end obtain a dataframe with two columns
and you should pay me something like that
I have programmed the following code
revisar<- function(a){
todos = list()
i=1
j=3
l=5
k=1
while(i<=2 ){
cl<-a[,i]
cl2<-a[,j]
cl3<-a[,l]
cl[is.na(cl)] <- 0
cl2[is.na(cl2)] <- 0
cl3[is.na(cl3)] <- 0
colu<-cl+cl2+cl3
col<-cbind(colu,colu)
i<-i+1
j<-j+1
l<-l+1
k<-k+1
}
return(col)
}
it turns out that it only returns column 2 repeated twice and I must replicate the same thing to join those 245 columns.7
I would like to know what is failing the example
base R
Literal programming:
with(dat, data.frame(s1 = cl1+cl3+cl5, s2 = cl2+cl4+cl6))
# s1 s2
# 1 7 11
# 2 7 7
# 3 4 11
# 4 4 10
# 5 9 8
# 6 12 5
# 7 7 6
# 8 7 10
# 9 4 9
# 10 6 5
Programmatically,
L <- list(s1 = c(1,3,5), s2 = c(2,4,6))
out <- data.frame(lapply(L, function(z) do.call(rowSums, list(as.matrix(dat[,z])))))
out
# s1 s2
# 1 7 11
# 2 7 7
# 3 4 11
# 4 4 10
# 5 9 8
# 6 12 5
# 7 7 6
# 8 7 10
# 9 4 9
# 10 6 5
dplyr
library(dplyr)
dat %>%
transmute(
s1 = rowSums(cbind(cl1, cl3, cl5)),
s2 = rowSums(cbind(cl2, cl4, cl6))
)
or programmatically using purrr:
purrr::map_dfc(L, ~ rowSums(dat[, .]))
Data
set.seed(42)
# your `dat` above
Here is an alternative general approach:
Here we sum all uneven columns -> s1 and
all even columns -> s2:
library(dplyr)
dat %>%
rowwise() %>%
mutate(s1 = sum(c_across(seq(1,ncol(dat),2)), na.rm = TRUE),
s2 = sum(c_across(seq(2,ncol(dat),2)), na.rm = TRUE))
cl1 cl2 cl3 cl4 cl5 cl6 s1 s2
<int> <int> <int> <int> <int> <int> <int> <int>
1 1 1 3 2 3 2 7 5
2 2 4 1 4 2 3 5 11
3 2 2 2 2 1 3 5 7
4 2 4 4 3 1 4 7 11
5 2 4 4 3 2 2 8 9
6 3 3 3 2 2 2 8 7
7 2 1 1 2 1 4 4 7
8 2 4 1 3 2 3 5 10
9 3 1 1 2 3 4 7 7
10 2 4 1 3 4 4 7 11
I have a dataframe like this.
data <- data.frame(Condition = c(1,1,2,3,1,1,2,2,2,3,1,1,2,3,3))
I want to populate a new variable Sequence which identifies whenever Condition starts again from 1.
So the new dataframe would look like this.
Thanks in advance for the help!
data <- data.frame(Condition = c(1,1,2,3,1,1,2,2,2,3,1,1,2,3,3),
Sequence = c(1,1,1,1,2,2,2,2,2,2,3,3,3,3,3))
base R
data$Sequence2 <- cumsum(c(TRUE, data$Condition[-1] == 1 & data$Condition[-nrow(data)] != 1))
data
# Condition Sequence Sequence2
# 1 1 1 1
# 2 1 1 1
# 3 2 1 1
# 4 3 1 1
# 5 1 2 2
# 6 1 2 2
# 7 2 2 2
# 8 2 2 2
# 9 2 2 2
# 10 3 2 2
# 11 1 3 3
# 12 1 3 3
# 13 2 3 3
# 14 3 3 3
# 15 3 3 3
dplyr
library(dplyr)
data %>%
mutate(
Sequence2 = cumsum(Condition == 1 & lag(Condition != 1, default = TRUE))
)
# Condition Sequence Sequence2
# 1 1 1 1
# 2 1 1 1
# 3 2 1 1
# 4 3 1 1
# 5 1 2 2
# 6 1 2 2
# 7 2 2 2
# 8 2 2 2
# 9 2 2 2
# 10 3 2 2
# 11 1 3 3
# 12 1 3 3
# 13 2 3 3
# 14 3 3 3
# 15 3 3 3
This took a while. Finally I find this solution:
library(dplyr)
data %>%
group_by(Sequnce = cumsum(
ifelse(Condition==1, lead(Condition)+1, Condition)
- Condition==1)
)
Condition Sequnce
<dbl> <int>
1 1 1
2 1 1
3 2 1
4 3 1
5 1 2
6 1 2
7 2 2
8 2 2
9 2 2
10 3 2
11 1 3
12 1 3
13 2 3
14 3 3
15 3 3
Unfortunately, I have spotted a weird inconsistency in the colnames when cbind different 2 particular objects: tibbles that has been by_group()ed and matrix. I writing this here because I would understand what is going on under the hood with the cbind operation and these 2 objects.
Consider the following objects:
Simple tibble
library(tidyverse)
tbl <- tibble(tbl_name = seq(1,8))
# # A tibble: 8 x 1
# tbl_name
# <int>
# 1 1
# 2 2
# 3 3
# 4 4
# 5 5
# 6 6
# 7 7
# 8 8
Simple data.frame
df <- data.frame(df_name = seq(1,8))
df
# df_name
# 1 1
# 2 2
# 3 3
# 4 4
# 5 5
# 6 6
# 7 7
# 8 8
Simple matrix
mtx <- matrix(seq(1,8), nrow = 8)
colnames(mtx) <- "mtx_name"
# mtx_name
# [1,] 1
# [2,] 2
# [3,] 3
# [4,] 4
# [5,] 5
# [6,] 6
# [7,] 7
# [8,] 8
by_grouped tibble
tb2 <- tibble(tbl2_name = seq(1,8),
tbl_group_by = c("a","b","b","c","d","d","d","d"))
tb2 <- tb2 %>%
group_by(tbl_group_by) %>%
mutate(N_by_group = n())
# A tibble: 8 x 3
# Groups: tbl_group_by [4]
# tbl2_name tbl_group_by N_by_group
# <int> <chr> <int>
# 1 1 a 1
# 2 2 b 2
# 3 3 b 2
# 4 4 c 1
# 5 5 d 4
# 6 6 d 4
# 7 7 d 4
# 8 8 d 4
When cbind them:
>This works (a.k.a: keeps the correct names)
# Comparison
# tibble & data.frame: OK
cbind(tbl,df)
# tbl_name df_name
# 1 1 1
# 2 2 2
# 3 3 3
# 4 4 4
# 5 5 5
# 6 6 6
# 7 7 7
# 8 8 8
# matrix & data.frame: OK
cbind(mtx,df)
# mtx_name df_name
# 1 1 1
# 2 2 2
# 3 3 3
# 4 4 4
# 5 5 5
# 6 6 6
# 7 7 7
# 8 8 8
# tibble & matrix: OK
cbind(tbl,mtx)
# tbl_name mtx_name
# 1 1 1
# 2 2 2
# 3 3 3
# 4 4 4
# 5 5 5
# 6 6 6
# 7 7 7
# 8 8 8
This doesn't work (a.k.a: destroyed the colname of the matrix)
# tibble(group_by()) & matrix: oops!!!!
cbind(tb2,mtx)
# New names:
# * NA -> ...4
# # A tibble: 8 x 4
# # Groups: tbl_group_by [4]
# tbl2_name tbl_group_by N_by_group ...4[,"mtx_name"]
# <int> <chr> <int> <int>
# 1 1 a 1 1
# 2 2 b 2 2
# 3 3 b 2 3
# 4 4 c 1 4
# 5 5 d 4 5
# 6 6 d 4 6
# 7 7 d 4 7
# 8 8 d 4 8
Any intuition of what's happening or how to prevent it, is very welcome. Thank you in advance.
We can remove the group attributes with ungroup and now cbind should work
library(dplyr)
cbind(ungroup(tb2), mtx)
-output
# tbl2_name tbl_group_by N_by_group mtx_name
#1 1 a 1 1
#2 2 b 2 2
#3 3 b 2 3
#4 4 c 1 4
#5 5 d 4 5
#6 6 d 4 6
#7 7 d 4 7
#8 8 d 4 8
Or specifically use cbind.data.frame because by default it may use cbind.matrix
cbind.data.frame(tb2, mtx)
When we create the 'tb2', after grouping, make sure to ungroup to prevent this kind of issues
tb2 <- tb2 %>%
group_by(tbl_group_by) %>%
mutate(N_by_group = n()) %>%
ungroup
Or make use of is_grouped_df to find if the data is grouped or not and then ungroup
f1 <- function(dat) {
if(dplyr::is_grouped_df(dat)) {
dat <- ungroup(dat)
}
dat
}
cbind(f1(tb2), mtx)
Just getting started using R and I need some help in understanding the application of for/nested loop.
StudyID<-c(1:5)
SubjectID<-c(1:5)
df<-data.frame(StudyID=rep(StudyID, each=5), SubjectID=rep(SubjectID, each=1))
How can I create a new column called as ID, which would use the combination of studyID and subjectID to create a unique ID ?
So for this data, unique ID should be from 1:25.
So the final data looks like this:
UniqueID<- c(1:25)
df<-cbind(df,UniqueID)
View(df)
Is there any other way which is faster and more efficient that looping ?
Using the dplyr package, you could do:
library(dplyr)
df$Id = group_indices(df,StudyID,SubjectID)
This returns:
#StudyID SubjectID Id
# 1 1 1
# 1 2 2
# 1 3 3
# 1 4 4
# 1 5 5
# 2 1 6
# 2 2 7
# 2 3 8
# 2 4 9
# 2 5 10
# 3 1 11
# 3 3 13
# 3 4 14
# 3 5 15
# 4 1 16
# 4 2 17
# 4 3 18
# 4 4 19
# 4 5 20
# 5 1 21
# 5 2 22
# 5 3 23
# 5 4 24
# 5 5 25
Another method to achieve that without loading any library (base R) would be this (assuming data frame is sorted based on the two columns):
StudyID<-c(1:5)
SubjectID<-c(1:5)
df<-data.frame(StudyID=rep(StudyID, each=5), SubjectID=rep(SubjectID, each=1))
df$uniqueID <- cumsum(!duplicated(df[1:2]))
or you can use this solution, mentioned in the comments (I prefer this over the first solution):
df$uniqueID <- as.numeric(factor(do.call(paste, df)))
The output would be:
> print(df, row.names = FALSE)
#StudyID SubjectID uniqueID
# 1 1 1
# 1 2 2
# 1 3 3
# 1 4 4
# 1 5 5
# 2 1 6
# 2 2 7
# 2 3 8
# 2 4 9
# 2 5 10
# 3 1 11
# 3 2 12
# 3 3 13
# 3 4 14
# 3 5 15
# 4 1 16
# 4 2 17
# 4 3 18
# 4 4 19
# 4 5 20
# 5 1 21
# 5 2 22
# 5 3 23
# 5 4 24
# 5 5 25
You could go for interaction in base R:
df$uniqueID <- with(df, as.integer(interaction(StudyID,SubjectID)))
For example (this example expresses better what you are after):
set.seed(10)
df <- data.frame(StudyID=sample(5,10,replace = T), SubjectID=rep(1:5,times=2))
df$uniqueID <- with(df, as.integer(interaction(StudyID,SubjectID)))
# StudyID SubjectID uniqueID
# 1 3 1 3
# 2 2 2 6
# 3 3 3 11
# 4 4 4 16
# 5 1 5 17
# 6 2 1 2
# 7 2 2 6
# 8 2 3 10
# 9 4 4 16
# 10 3 5 19
I try to count triplets; for this I use three vectors that are packed in a dataframe:
X=c(4,4,4,4,4,4,4,4,1,1,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,3,3)
Y=c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,3,4,2,2,2,2,3,4,1,1,2,2,3,3,4,4)
Z=c(4,4,5,4,4,4,4,4,6,1,1,1,1,1,1,1,2,2,2,2,7,2,3,3,3,3,3,3,3,3)
Count_Frame=data.frame(matrix(NA, nrow=(length(X)), ncol=3))
Count_Frame[1]=X
Count_Frame[2]=Y
Count_Frame[3]=Z
Counts=data.frame(table(Count_Frame))
There is the following problem: if I increase the value range in the vectors or use even more vectors the "Counts" dataframe quickly approaches its size limit due to the many 0-counts. Is there a way to exclude the 0-counts while generating "Counts"?
We can use data.table. Convert the 'data.frame' to 'data.table' (setDT(Count_Frame)), grouped by all the columns (.(X, Y, Z)), we get the number or rows (.N).
library(data.table)
setDT(Count_Frame)[,.N ,.(X, Y, Z)]
# X Y Z N
# 1: 4 1 4 7
# 2: 4 1 5 1
# 3: 1 1 6 1
# 4: 1 1 1 3
# 5: 1 2 1 2
# 6: 1 3 1 1
# 7: 1 4 1 1
# 8: 2 2 2 4
# 9: 2 3 7 1
#10: 2 4 2 1
#11: 3 1 3 2
#12: 3 2 3 2
#13: 3 3 3 2
#14: 3 4 3 2
Instead of naming all the columns, we can use names(Count_Frame) as well (if there are many columns)
setDT(Count_Frame)[,.N , names(Count_Frame)]
You can accomplish this with aggregate:
Count_Frame$one <- 1
aggregate(one ~ X1 + X2 + X3, data=Count_Frame, FUN=sum)
This will calculate the positive instances of table, but will not list the zero counts.
One solution is to create a combination of the column values and count those instead:
library(tidyr)
as.data.frame(table(unite(Count_Frame, tmp, X1, X2, X3))) %>%
separate(Var1, c('X1', 'X2', 'X3'))
Resulting output is:
X1 X2 X3 Freq
1 1 1 1 3
2 1 1 6 1
3 1 2 1 2
4 1 3 1 1
5 1 4 1 1
6 2 2 2 4
7 2 3 7 1
8 2 4 2 1
9 3 1 3 2
10 3 2 3 2
11 3 3 3 2
12 3 4 3 2
13 4 1 4 7
14 4 1 5 1
Or using plyr:
library(plyr)
count(Count_Frame, colnames(Count_Frame))
output
# > count(Count_Frame, colnames(Count_Frame))
# X1 X2 X3 freq
# 1 1 1 1 3
# 2 1 1 6 1
# 3 1 2 1 2
# 4 1 3 1 1
# 5 1 4 1 1
# 6 2 2 2 4
# 7 2 3 7 1
# 8 2 4 2 1
# 9 3 1 3 2
# 10 3 2 3 2
# 11 3 3 3 2
# 12 3 4 3 2
# 13 4 1 4 7
# 14 4 1 5 1