I have a data.frame called fd with 406 rows and 48 columns. For each row in fd I want to compute ntiles (sixtiles). I do this the following way:
quant <- apply(fd, 1, function(x) quantile(t(x), probs = c(1/6, 2/6, 0.5, 4/6, 5/6), na.rm = TRUE ))
What I now want to do, is split my original data into 6 new dataframes, i.e. fd1 to fd6, where in fd1 I have all the observations of the first sixtile, in fd2 I have all the observations of the second sixtile and so on. Again, I want to do this rowwise. Meaning, I want my algorithm/function to look at each row of fd and do the following:
Take all the observations of the first sixtile, in the first row of fd, and store them into the first row of fd1, then take the first sixtile of the second row of fd and store them in the second row of fd1.
Important to note: I do not have observations for each row and column, so in some I have missing data (NA)
Could anybody give hints on how I can achieve this?
Thanks in advance.
Old school solution using matrix, list and nested loops.
# some artifical data with missings
set.seed(123)
fd <- data.frame(matrix(rnorm(406*48), nrow = 406, ncol = 48))
diag(fd) <- NA
# quant
quant <- apply(fd, 1, function(x)
quantile(t(x), probs = (0:6)/6, na.rm = TRUE, type = 6)
)
#matrix with selection
res <- list()
for (i in 1:6) {
mm <- matrix(NA, nrow = nrow(fd), ncol = ncol(fd)/6)
for (j in 1:nrow(fd)) {
lwr <- (quant[(i),j] < fd[j,])
upr <- (fd[j,] <= quant[(i+1),j])
if (i == 1)
z_j <- fd[j,][ upr ]
else
z_j <- fd[j,][ lwr & upr ]
z_j <- z_j[!is.na(z_j)]
mm[j,1:length(z_j)] <- sort(z_j)
}
res[[i]] <- mm
}
rm(i, mm, j, lwr, upr)
fd1 <- res[[1]]
Here is a relatively shorter way of achieving this using purrr and dplyr packages:
library(dplyr)
library(purrr)
# some random example
df <- data.frame(matrix(runif(48),405,48))
df[3,5] <- NA
df[10,25:26] <- NA
quant <- apply(df, 1, function(x) aa <- quantile(t(x), probs = c(1/6, 2/6, 3/6, 4/6, 5/6), na.rm = TRUE ))
aa <- as.data.frame(t(df))
fd1 <- map2(quant[1,],aa,function(x,y) y[y <= x] %>% .[!is.na(.)]) %>%
do.call(rbind,.)%>% as.data.frame(.)
fd2 <- pmap(list(quant[1,],quant[2,],aa),function(x,y,z) z[z > x & z <= y] %>% .[!is.na(.)]) %>%
do.call(rbind,.) %>% as.data.frame(.)
fd3 <- pmap(list(quant[2,],quant[3,],aa),function(x,y,z) z[z > x & z <= y] %>% .[!is.na(.)]) %>%
do.call(rbind,.) %>% as.data.frame(.)
fd4 <- pmap(list(quant[3,],quant[4,],aa),function(x,y,z) z[z > x & z <= y] %>% .[!is.na(.)]) %>%
do.call(rbind,.) %>% as.data.frame(.)
fd5 <- pmap(list(quant[4,],quant[5,],aa),function(x,y,z) z[z > x & z <= y] %>% .[!is.na(.)]) %>%
do.call(rbind,.) %>% as.data.frame(.)
fd6 <- map2(quant[5,],aa,function(x,y) y[y > x & y <= max(y)] %>% .[!is.na(.)]) %>%
do.call(rbind,.) %>% as.data.frame(.)
NB: There are some duplicate values in the final fd1 - fd6 data frames (which is not, by the way, the best format to store values for this type of problem) but you can always filter them out by using for example unique.
Hope this helps. Any modification to the answer is welcomed.
Related
I have a large data frame.
As you can see, a pattern exists code below:
data_1<-data_1
data_2<-data_2 %>% filter(rowSums(data_2[,1:1])==0)
data_3<-data_3 %>% filter(rowSums(data_3[,1:2])==0)
data_4<-data_4 %>% filter(rowSums(data_4[,1:3])==0)
data_5<-data_5 %>% filter(rowSums(data_5[,1:4])==0)
data_6<-data_6 %>% filter(rowSums(data_6[,1:5])==0)
data_7<-data_7 %>% filter(rowSums(data_7[,1:6])==0)
data_8<-data_8 %>% filter(rowSums(data_8[,1:7])==0)
data_9<-data_9 %>% filter(rowSums(data_9[,1:8])==0)
data_10<-data_10 %>% filter(rowSums(data_10[,1:9])==0)
data_11<-data_11 %>% filter(rowSums(data_11[,1:10])==0)
data_12<-data_12 %>% filter(rowSums(data_12[,1:11])==0)
data_13<-data_13 %>% filter(rowSums(data_13[,1:12])==0)
data_14<-data_14 %>% filter(rowSums(data_14[,1:13])==0)
data_15<-data_15 %>% filter(rowSums(data_15[,1:14])==0)
data_16<-data_16 %>% filter(rowSums(data_16[,1:15])==0)
data_17<-data_17 %>% filter(rowSums(data_17[,1:16])==0)
data_18<-data_18 %>% filter(rowSums(data_18[,1:17])==0)
data_19<-data_19 %>% filter(rowSums(data_19[,1:18])==0)
data_20<-data_20 %>% filter(rowSums(data_20[,1:19])==0)
data_21<-data_21 %>% filter(rowSums(data_21[,1:20])==0)
I tried to make loop like this
for(i in 1:21){
data_i <- data_i %>% filter(rowSums(data_i[,1:i-1])==0)
but, data_i is far away from my intention.
how do I solve this problem?
1) for We use the test data in the Note at the end based on the built in anscombe data frame that comes with R. It is best to keep related data frames in a list so we first create such a list L and then iterate over it producing a new list L2 so that we don't overwrite the original list. Keeping the input and output separate makes it easier to debug.
We could alternately write seq_along(L)[-1] as seq(2, length(L)) and we could alternately write seq_len(i-1) as seq(1, i-1). Note that if DF is a data frame then DF[, 1] is the first column as a column vector but DF[, 1, drop = FALSE] is a one column data frame.
No packages are used.
L <- mget(ls(pattern = "^data_\\d+$"))
L2 <- L
for(i in seq_along(L)[-1]) {
Li <- L[[i]]
Sum <- rowSums(Li[, seq_len(i-1), drop = FALSE])
L2[[i]] <- Li[Sum == 0, ]
}
2) lapply Alternately we could use lapply:
L <- mget(ls(pattern = "^data_\\d+$"))
L2 <- L
L2[-1] <- lapply(seq_along(L)[-1], function(i) {
Li <- L[[i]]
Sum <- rowSums(Li[, seq_len(i-1), drop = FALSE])
Li[Sum == 0, ]
})
3) Map or use Map
L3 <- L
f3 <- function(d, i) {
Sum <- rowSums(d[, seq_len(i-1), drop = FALSE])
d[Sum == 0, ]
}
L3[-1] <- Map(f3, L[-1], seq_along(L)[-1])
or special case the first element like this. Note that it will take the component names from the first argument to Map after the function so it is important that f4 be defined so that that argument is L.
f4 <- function(d, i) {
if (i == 1) d
else {
Sum <- rowSums(d[, seq_len(i-1), drop = FALSE])
d[Sum == 0, ]
}
}
L4 <- Map(f4, L, seq_along(L))
Note
# create test data
data_1 <- anscombe
data_1[1, 1] <- 0
data_2 <- 10 * anscombe
data_2[2, 1:2] <- 0
data_3 <- 100 * anscombe
data_3[3, 1:3] <- 0
I currently have a dataframe consisting of 17 columns. 10 of the columns have continuous variables and the remaining 7 are binary variables that take values 0 and 1. For each of 10 continuous variables, I wish to calculate the mean, for each case where each of the binary variables equals 0. So I'd like to calculate and store 10*7 =70 means.
How would I do this in R? I have tried using the apply family but am unable to get the desired result. Have replicated my problem below--
df <- data.frame(matrix(ncol = 5, nrow = 5))
df$X1 <- c(1:5)
df$X2 <- c(24:28)
df$X3 <- c(5:10)
df$X4 <- rbinom(5, 1, 0.5)
df$X5 <- rbinom(5, 1, 0.5)
#What is the easiest way to list all of the means like below?
mean1 <- mean(df$X1[which(df$X4==0)])
mean2 <- mean(df$X2[which(df$X4==0)])
mean3 <- mean(df$X3[which(df$X4==0)])
mean4 <- mean(df$X1[which(df$X5==0)])
mean5 <- mean(df$X2[which(df$X5==0)])
mean6 <- mean(df$X3[which(df$X5==0)])
#I have tried--
list1 <- c("df$X1", "df$X2", "df$X3")
list2 <- c("df$X4", "df$X5")
mapply(mean, list1, list2)
Given
lst1 <- c("X1", "X2", "X3")
lst2 <- c("X4", "X5")
one straightforward way is to use nested sapply
sapply(
lst1,
function(i) {
sapply(
lst2,
function(j) mean(df[, i][which(df[, j] == 0)])
)
}
)
or a compact version
t(sapply(lst2,function(k) colMeans(subset(df[lst1],df[,k]==0))))
An option with tidyverse
library(dplyr)
library(tidyr)
df %>%
pivot_longer(cols = X4:X5) %>%
filter(value == 0) %>%
group_by(value, name) %>%
summarise(across(everything(), mean, na.rm = TRUE))
In base R, you could use stack + aggregate as follows:
aggregate(.~values + ind, cbind(df[1:3], stack(df[4:5])), mean, subset = values == 0)
values ind X1 X2 X3
1 0 X4 4 27 9
2 0 X5 3 26 8
Compare the results you obtain with the one you have above
Here is the original dataframe:
set.seed(100)
toydata <- data.frame(A = sample(1:50,50,replace = T),
B = sample(1:50,50,replace = T),
C = sample(1:50,50,replace = T)
)
Below is the function which can swap values:
derangement <- function(x){
if(max(table(x)) > length(x)/2) return(NA)
while(TRUE){
y <- sample(x)
if(sum(y == x)<3) return(y)
}
}
swapFun <- function(x, n = 10){
inx <- which(x < n)
y <- derangement(x[inx])
if(length(y) == 1) return(NA)
x[inx] <- y
x
}
toy is the new dataframe after swapping
toy <- toydata # Work with a copy
toy[] <- lapply(toydata, swapFun)
I want to compare the contingency tables of these two dataframe by the difference of sum, which means:
table1<-table(toydata$A,toydata$B)
table2<-table(toy$A,toy$B)
SUM1<-sum(abs(table1-table2))
table3<-table(toydata$A,toydata$C)
table4<-table(toy$A,toy$C)
SUM2<-sum(abs(table3-table4))
table5<-table(toydata$B,toydata$C)
table6<-table(toy$C,toy$C)
SUM3<-sum(abs(table5-table6))
SUM1+SUM2+SUM3 is what I want to have. Can I get it more conviniently because sometimes the dataframe has many columns.
How to solve it? Thanks.
library(dplyr)
# your function to compare contingency tables
f = function(x,y){
table1<-table(toydata[,x],toydata[,y])
table2<-table(toy[,x],toy[,y])
sum(abs(table1-table2))
}
# vectorise your function
f = Vectorize(f)
combn(x=names(toydata),
y=names(toydata), 2) %>% # create all combinations of your column names
t() %>% # transpose
data.frame(., stringsAsFactors = F) %>% # save as dataframe
filter(X1 != X2) %>% # exclude pairs of same column
mutate(SumAbs = f(X1,X2)) # apply function
# X1 X2 SumAbs
# 1 A B 14
# 2 A C 26
# 3 B C 22
Below is my data
set.seed(100)
toydata <- data.frame(A = sample(1:50,50,replace = T),
B = sample(1:50,50,replace = T),
C = sample(1:50,50,replace = T)
)
Below is my swapping function
derangement <- function(x){
if(max(table(x)) > length(x)/2) return(NA)
while(TRUE){
y <- sample(x)
if(all(y != x)) return(y)
}
}
swapFun <- function(x, n = 10){
inx <- which(x < n)
y <- derangement(x[inx])
if(length(y) == 1) return(NA)
x[inx] <- y
x
}
In the first case,I get the new data toy by swapping the entire dataframe. The code is below:
toydata<-as.matrix(toydata)
toy<-swapFun(toydata)
toy<-as.data.frame(toy)
In the second case, I get the new data toy by swapping each column respectively. Below is the code:
toydata<-as.data.frame(toydata)
toy2 <- toydata # Work with a copy
toy2[] <- lapply(toydata, swapFun)
toy<-toy2
Below is the function that can output the difference of contigency table after swapping.
# the function to compare contingency tables
f = function(x,y){
table1<-table(toydata[,x],toydata[,y])
table2<-table(toy[,x],toy[,y])
sum(abs(table1-table2))
}
# vectorise your function
f = Vectorize(f)
combn(x=names(toydata),
y=names(toydata), 2) %>%# create all combinations of your column names
t() %>% # transpose
data.frame(., stringsAsFactors = F) %>% # save as dataframe
filter(X1 != X2) %>% # exclude pairs of same
# column
mutate(SumAbs = f(X1,X2)) # apply function
In the second case, this mutate function works.
But in the first case, this mutatefunction does not work. It says:
+ filter(X1 != X2) %>% # exclude pairs of same column
+ mutate(SumAbs = f(X1,X2)) # apply function
Error in combn(x = names(toydata), y = names(toydata), 2) : n < m
However in the two cases, the toy data are all dataframes with the same dimension, the same row names and the same column names. I feel confused.
How can I fix it? Thanks.
Could you please help me do the filtering in the last command below, using dplyr instead of apply?
I was trying to solve the problem posted here
library(gtools)
n <- 8
dt <- permutations(n+1,6,v=0:n,repeats.allowed=TRUE)
SmplMode <- function(x) {
tabSmpl <- tabulate(x)
SmplMode <- which(tabSmpl == max(tabSmpl))
if (sum(tabSmpl == max(tabSmpl)) > 1)
SmplMode <- 0
return(SmplMode)
}
res <- dt[apply(dt,1,function(x) {
y <- rep(c(1,2,3,4,5,6),c(x[1],x[2],x[3],x[4],x[5],x[6]))
return(mean(y)==3 & diff(range(y))==4 & median(y)==3.5 & SmplMode(y)==4)
}),]
Operations with rowwise is slow, so filtering out SmplMode(y), mean(y), diff(range(y)) conditions early on with the help of row-wise operations from matrixStats package speeds the things up nicely. Following runs about 0.4 sec on my laptop, while both your original solution and #shadow's solution runs about 70secs.
library(dplyr)
library(matrixStats)
df <- data.frame(dt)
df$m <- rowMaxs(dt) #for SmplMode(y)
S <- matrix(1:6, ncol=ncol(dt), nrow=nrow(dt), byrow=T)
Z <- S*(dt!=0)
Z[Z==0] <- NA
df$Range <- rowMaxs(Z, na.rm=TRUE)-rowMins(Z, na.rm=TRUE) #for diff(rang(y))
df$Mean <- rowSums(S*dt)/rowSums(dt) #for mean(y)
res <- df %>%
filter(X4 == m, (X1==m)+(X2==m)+(X3==m)+(X4==m)+(X5==m)+(X6==m)==1,
Range == 4, # range condition here
Mean == 3) %>% #mean condition here
rowwise() %>%
mutate(Med = median(rep(c(1,2,3,4,5,6), c(X1, X2, X3, X4, X5, X6)))) %>%
filter(Med == 3.5) %>% #median condition here
select(-m, -Range, -Mean, -Med) %>% # get rid of newcols
as.matrix
You can use rowwise to do rowwise operations. Then use mutate to determine if the condition is satisfied and filter to filter by the condition.
res <- dt %>%
data.frame %>% # convert to data.frame, so you can use dplyr
rowwise %>% # for rowwise calculations
mutate(cond = {y = rep(1:6, c(X1, X2, X3, X4, X5, X6)) # calculate condition
mean(y)==3 & diff(range(y))==4 & median(y)==3.5 & SmplMode(y)==4}) %>%
filter(cond) %>% # filter by condition
mutate(cond = NULL) %>% # remove condition
as.matrix # convert back to matrix