Use a user-defined function on grouped data in R

I have a function that counts the zeros in each column of a large data frame. Now I want to count the zeros in each column after grouping by a category.
Here is the example:
zero_rate <- function(df) {
  z_rate_list <- sapply(df, function(x) {
    data.frame(
      n_zero = length(which(x == 0)),              # zeros (which() drops NAs)
      n      = length(x),
      z_rate = length(which(x == 0)) / length(x))
  })
  d <- data.frame(z_rate_list)
  d <- sapply(d, unlist)
  d <- as.data.frame(d)
  return(d)
}
library(dplyr)

df <- data.frame(var1 = c(1, 0, NA, 4, NA, 6, 7, 0, 0, 10),
                 var2 = c(11, NA, NA, 0, NA, 16, 0, NA, 19, NA))
df1 <- data.frame(cat = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2), df)
zero_rate_df <- df1 %>% group_by(cat) %>% do(zero_rate(.))
Here zero_rate(df) works just as I expected. But when I group the data by cat and compute the zero rate per column within each category, the result is not what I expected.
I expect something like this:
cat           var1  var2
1   n_zero     1     1
    n          5     5
    z_rate     0.2   0.2
2   n_zero     2     1
    n          5     5
    z_rate     0.4   0.2
Any suggestion? Thank you.

I came up with the following code. .[-1] is used to remove the grouping column:
zero_rate <- function(df) {
  res <- lapply(df, function(x) {
    y <- c(sum(x == 0, na.rm = TRUE), length(x))
    c(y, y[1] / y[2])
  })
  res <- do.call(cbind.data.frame, res)
  res$vars <- c('n_zero', 'n', 'z_rate')
  res
}

df1 %>% group_by(cat) %>% do(zero_rate(.[-1]))
# cat var1 var2 vars
# <dbl> <dbl> <dbl> <chr>
# 1 1 1.0 1.0 n_zero
# 2 1 5.0 5.0 n
# 3 1 0.2 0.2 z_rate
# 4 2 2.0 1.0 n_zero
# 5 2 5.0 5.0 n
# 6 2 0.4 0.2 z_rate
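For reference, newer dplyr (>= 1.0) can express the same per-group counts with summarise() and across(), avoiding the superseded do(). A sketch; note it returns one wide row per cat (columns var1_n_zero, var1_n, and so on) rather than the stacked layout above:
library(dplyr)

df1 %>%
  group_by(cat) %>%
  summarise(across(c(var1, var2),
                   list(n_zero = ~ sum(.x == 0, na.rm = TRUE),
                        n      = ~ length(.x),
                        z_rate = ~ sum(.x == 0, na.rm = TRUE) / length(.x))))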

Related

Subsetting and transposing table iterating in R

I have this table (inputdf):
Number  Value
1       0.2
1       0.3
1       0.4
2       0.2
2       0.7
3       0.1
and I want to obtain this (outputdf):
Number1  Number2  Number3
0.2      0.2      0.1
0.3      0.7      NA
0.4      NA       NA
I have tried iterating with a for loop through the numbers in column 1, then subsetting the data frame by that number, but I am having trouble appending the result to an output data frame:
inputdf <- read.table("input.txt", sep = "\t", header = TRUE)
outputdf <- data.frame()
i <- 1
total <- 3   ### user has to modify it
for (i in seq(1:total)) {
  cat("Collecting values for number", i, "\n")
  values <- subset(inputdf, Number == i, select = c(Value))
  cbind(outputdf, NewColumn = values)   # the cbind() result is never assigned, which is where this fails
  names(outputdf)[names(outputdf) == "NewColumn"] <- paste0("Number", i)
}
Any help or hint will be very welcome. Thanks in advance!
In the tidyverse, you can create an id for each element of the groups and then use tidyr::pivot_wider:
library(tidyverse)
dat %>%
  group_by(Number) %>%
  mutate(id = row_number()) %>%
  pivot_wider(names_from = Number, names_prefix = "Number", values_from = "Value")
# A tibble: 3 × 4
     id Number1 Number2 Number3
  <int>   <dbl>   <dbl>   <dbl>
1     1     0.2     0.2     0.1
2     2     0.3     0.7    NA
3     3     0.4    NA      NA
In base R, same idea: create the id column and then reshape to wide:
transform(dat, id = with(dat, ave(rep(1, nrow(dat)), Number, FUN = seq_along))) |>
  reshape(direction = "wide", timevar = "Number")
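(The answers above assume dat holds the question's inputdf; for a self-contained run it can be rebuilt from the table:)
library(tibble)
dat <- tibble(Number = c(1, 1, 1, 2, 2, 3),
              Value  = c(0.2, 0.3, 0.4, 0.2, 0.7, 0.1))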

How to show values as percentage of column total in R like in excel pivot table? [duplicate]

I am trying to divide each cell in a data frame by the sum of the column. For example, I have a data frame df:
sample a b c
a2 1 4 6
a3 5 5 4
I would like to create a new data frame that takes each cell in and divides by the sum of the column, like so:
sample a b c
a2 .167 .444 .6
a3 .833 .556 .4
I have seen answers using sweep(), but that looks like it's for matrices, and I have data frames. I understand how to use colSums(), but I'm not sure how to write a function that loops through every cell in the column and then divides by the column sum. Thanks for the help!
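(A quick aside before the answers: sweep() is not actually limited to matrices; it accepts data frames as well, since they have dim() and element-wise division. A minimal sketch on the example's numeric columns:)
dat2 <- dat
dat2[-1] <- sweep(dat2[-1], 2, colSums(dat2[-1]), "/")
dat2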
Solution 1
Here are two dplyr solutions. We can use mutate_at or mutate_if to specify which columns the operation applies to, or under what condition it applies.
library(dplyr)
# Apply the operation to all column except sample
dat2 <- dat %>%
  mutate_at(vars(-sample), funs(. / sum(.)))
dat2
# sample a b c
# 1 a2 0.1666667 0.4444444 0.6
# 2 a3 0.8333333 0.5555556 0.4
# Apply the operation if the column is numeric
dat2 <- dat %>%
  mutate_if(is.numeric, funs(. / sum(.)))
dat2
# sample a b c
# 1 a2 0.1666667 0.4444444 0.6
# 2 a3 0.8333333 0.5555556 0.4
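A side note not in the original answer: funs() was deprecated in dplyr 0.8 and later removed; on dplyr >= 1.0 the equivalent would be a sketch like:
dat2 <- dat %>%
  mutate(across(where(is.numeric), ~ .x / sum(.x)))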
Solution 2
We can also use the map_at and map_if functions from the purrr package. However, since the output is a list, we need as.data.frame from base R or as_data_frame from dplyr to convert the list to a data frame.
library(dplyr)
library(purrr)
# Apply the operation to column a, b, and c
dat2 <- dat %>%
  map_at(c("a", "b", "c"), ~ . / sum(.)) %>%
  as_data_frame()
dat2
# # A tibble: 2 x 4
# sample a b c
# <chr> <dbl> <dbl> <dbl>
# 1 a2 0.167 0.444 0.600
# 2 a3 0.833 0.556 0.400
# Apply the operation if the column is numeric
dat2 <- dat %>%
  map_if(is.numeric, ~ . / sum(.)) %>%
  as_data_frame()
dat2
# # A tibble: 2 x 4
# sample a b c
# <chr> <dbl> <dbl> <dbl>
# 1 a2 0.167 0.444 0.600
# 2 a3 0.833 0.556 0.400
Solution 3
We can also use .SD and .SDcols from the data.table package.
library(data.table)
# Convert to data.table
setDT(dat)
dat2 <- copy(dat)
dat2[, (c("a", "b", "c")) := lapply(.SD, function(x) x/sum(x)), .SDcols = c("a", "b", "c")]
dat2[]
# sample a b c
# 1: a2 0.1666667 0.4444444 0.6
# 2: a3 0.8333333 0.5555556 0.4
Solution 4
We can also use the lapply function to loop through all columns except the first to perform the operation.
dat2 <- dat
dat2[, -1] <- lapply(dat2[, -1], function(x) x/sum(x))
dat2
# sample a b c
# 1 a2 0.1666667 0.4444444 0.6
# 2 a3 0.8333333 0.5555556 0.4
We can also use lapply to loop through all columns, adding an if-else statement in the function to make sure the operation is only performed on numeric columns.
dat2 <- dat
dat2[] <- lapply(dat2[], function(x) {
  # Check if the column is numeric
  if (is.numeric(x)) {
    return(x / sum(x))
  } else {
    return(x)
  }
})
dat2
# sample a b c
# 1 a2 0.1666667 0.4444444 0.6
# 2 a3 0.8333333 0.5555556 0.4
Solution 5
A dplyr and tidyr solution based on gather and spread.
library(dplyr)
library(tidyr)
dat2 <- dat %>%
  gather(Column, Value, -sample) %>%
  group_by(Column) %>%
  mutate(Value = Value / sum(Value)) %>%
  spread(Column, Value)
dat2
# # A tibble: 2 x 4
# sample a b c
# * <chr> <dbl> <dbl> <dbl>
# 1 a2 0.167 0.444 0.600
# 2 a3 0.833 0.556 0.400
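gather() and spread() are likewise superseded; with tidyr >= 1.0 the same pipeline would look roughly like this:
dat2 <- dat %>%
  pivot_longer(-sample, names_to = "Column", values_to = "Value") %>%
  group_by(Column) %>%
  mutate(Value = Value / sum(Value)) %>%
  ungroup() %>%
  pivot_wider(names_from = Column, values_from = Value)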
Performance Evaluation
I was curious about which method has the best performance, so I conducted the following performance evaluation using the microbenchmark package, with a data frame having the same column names as the OP's example but with 1,000,000 rows.
library(dplyr)
library(tidyr)
library(purrr)
library(data.table)
library(microbenchmark)
set.seed(100)
dat <- data_frame(sample = paste0("a", 1:1000000),
                  a = rpois(1000000, lambda = 3),
                  b = rpois(1000000, lambda = 3),
                  c = rpois(1000000, lambda = 3))
# Convert the data frame to a data.table for later performance evaluation
dat_dt <- as.data.table(dat)
head(dat)
# # A tibble: 6 x 4
# sample a b c
# <chr> <int> <int> <int>
# 1 a1 2 5 2
# 2 a2 2 5 5
# 3 a3 3 2 4
# 4 a4 1 2 2
# 5 a5 3 3 1
# 6 a6 3 6 1
In addition to all the methods I proposed, I was also interested in two other methods proposed by others: the prop.table method suggested by Henrik in the comments, and the apply method by Spacedman. I labeled my solutions m1_1, m1_2, m2_1, ... through m5, using _ to separate two methods within one solution. I labeled the prop.table method m6 and the apply method m7. Notice that I modified m6 to output a data frame so that all the methods produce data frame, tibble, or data.table output.
Here is the code I used to assess the performance.
per <- microbenchmark(
  m1_1 = {dat2 <- dat %>% mutate_at(vars(-sample), funs(./sum(.)))},
  m1_2 = {dat2 <- dat %>% mutate_if(is.numeric, funs(./sum(.)))},
  m2_1 = {dat2 <- dat %>%
    map_at(c("a", "b", "c"), ~./sum(.)) %>%
    as_data_frame()},
  m2_2 = {dat2 <- dat %>%
    map_if(is.numeric, ~./sum(.)) %>%
    as_data_frame()},
  m3 = {dat_dt2 <- copy(dat_dt)
    dat_dt2[, c("a", "b", "c") := lapply(.SD, function(x) x/sum(x)),
            .SDcols = c("a", "b", "c")]},
  m4_1 = {dat2 <- dat
    dat2[, -1] <- lapply(dat2[, -1], function(x) x/sum(x))},
  m4_2 = {dat2 <- dat
    dat2[] <- lapply(dat2[], function(x) {
      if (is.numeric(x)) {
        return(x/sum(x))
      } else {
        return(x)
      }
    })},
  m5 = {dat2 <- dat %>%
    gather(Column, Value, -sample) %>%
    group_by(Column) %>%
    mutate(Value = Value/sum(Value)) %>%
    spread(Column, Value)},
  m6 = {dat2 <- dat
    dat2[-1] <- prop.table(as.matrix(dat2[-1]), margin = 2)},
  m7 = {dat2 <- dat
    dat2[, -1] = apply(dat2[, -1], 2, function(x) {x/sum(x)})}
)
print(per)
# Unit: milliseconds
# expr min lq mean median uq max neval
# m1_1 23.335600 24.326445 28.71934 25.134798 27.465017 75.06974 100
# m1_2 20.373093 21.202780 29.73477 21.967439 24.897305 216.27853 100
# m2_1 9.452987 9.817967 17.83030 10.052634 11.056073 175.00184 100
# m2_2 10.009197 10.342819 16.43832 10.679270 11.846692 163.62731 100
# m3 16.195868 17.154327 34.40433 18.975886 46.521868 190.50681 100
# m4_1 8.100504 8.342882 12.66035 8.778545 9.348634 181.45273 100
# m4_2 8.130833 8.499926 15.84080 8.766979 9.732891 172.79242 100
# m5 5373.395308 5652.938528 5791.73180 5737.383894 5825.141584 6660.35354 100
# m6 117.038355 150.688502 191.43501 166.665125 218.837502 325.58701 100
# m7 119.680606 155.743991 199.59313 174.007653 215.295395 357.02775 100
library(ggplot2)
autoplot(per)
The results show that the lapply-based methods (m4_1 and m4_2) are the fastest, while the tidyr approach (m5) is the slowest, indicating that the gather-and-spread method is not a good idea when the number of rows is large.
DATA
dat <- read.table(text = "sample a b c
a2 1 4 6
a3 5 5 4",
                  header = TRUE, stringsAsFactors = FALSE)
Given this:
> d = data.frame(sample=c("a2","a3"),a=c(1,5),b=c(4,5),c=c(6,4))
> d
sample a b c
1 a2 1 4 6
2 a3 5 5 4
You can replace every column other than the first by applying over the rest:
> d[,-1] = apply(d[,-1],2,function(x){x/sum(x)})
> d
sample a b c
1 a2 0.1666667 0.4444444 0.6
2 a3 0.8333333 0.5555556 0.4
If you don't want d to be stomped on, make a copy beforehand.
You could do this in dplyr as well.
sample <- c("a2", "a3")
a <- c(1, 5)
b <- c(4, 5)
c <- c(6, 4)
dat <- data.frame(sample, a, b, c)
dat
library(dplyr)
dat %>%
  mutate(
    a.PCT = round(a/sum(a), 3),
    b.PCT = round(b/sum(b), 3),
    c.PCT = round(c/sum(c), 3))
sample a b c a.PCT b.PCT c.PCT
1 a2 1 4 6 0.167 0.444 0.6
2 a3 5 5 4 0.833 0.556 0.4
You can use the transpose of the matrix and then transpose again:
t(t(as.matrix(df))/colSums(df))
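A caveat with this trick: it returns a matrix and assumes all columns are numeric. Applied to the example data (which has a character sample column), a sketch would exclude that column and reassemble:
res <- cbind(dat[1], as.data.frame(t(t(as.matrix(dat[-1])) / colSums(dat[-1]))))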
try apply:
mat <- matrix(1:6, ncol=3)
apply(mat,2, function(x) x / sum(x))
Okay, if you have non-numeric values in your columns, you can coerce them to numeric (entries that cannot be parsed become NA, with a warning):
df <- data.frame( a=c('a', 'b'), b=c(3,4), d=c(1,6))
apply(df, 2, function(x) {
  x <- as.numeric(x)
  x / sum(x)
})

How to divide a long-format R dataframe by a factor and put the factor before the divided data?

I'm trying to split a long-formatted dataframe by a factor (e.g., by subject) and then put the factor (subject) before each subject's data as a label. The simplified dataframe looks like this; columns X and Y are numeric, column Subject is a factor. The real dataset actually has hundreds of subjects.
library(tibble)

X <- c(1, 1, 2, 2)
Y <- c(0.2, 0.3, 1, 0.5)
Subject <- as.factor(c("A", "A", "B", "B"))
M <- tibble(X, Y, Subject)
> M
# A tibble: 4 x 3
X Y Subject
<dbl> <dbl> <fct>
1 1 0.2 A
2 1 0.3 A
3 2 1 B
4 2 0.5 B
The resulting dataframe should look like this:
> M_trans
A
1 0.2
1 0.3
B
2 1
2 0.5
Thank you for your help!
I tried this code and it produces output like below. I couldn't find a way to introduce the factor labels directly, since everything in R works in vector format. If you find a better solution, post it for us.
X <- c(1, 1, 2, 2, 3, 3)
Y <- c(0.2, 0.3, 1, 0.5, 0.2, 0.9)
Subject <- as.factor(c("A", "A", "B", "B", "C", "C"))
M <- tibble(X, Y, Subject)

unq_subjects <- unique(Subject)
final <- data.frame()
for (i in 1:length(unique(Subject))) {
  sub <- unq_subjects[i]
  tmp <- as.data.frame(M %>% filter(Subject == sub) %>%
                         select(-Subject) %>%
                         add_row(X = sub, Y = NA) %>%
                         arrange(desc(X)))
  final <- union_all(tmp, final)
}
final now contains:
X Y
1 C NA
2 3 0.2
3 3 0.9
4 B NA
5 2 1.0
6 2 0.5
7 A NA
8 1 0.2
9 1 0.3
Does it answer your question now?
Using dplyr and tidyr
library(dplyr)
library(tidyr)

M %>%
  group_by(Subject) %>%
  nest()
Hope this helps!
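Building on that nest() idea, here is a sketch (the formatting choices are my own) that prints the nested tibble in the layout the question asks for, using purrr::pwalk:
library(purrr)

M %>%
  group_by(Subject) %>%
  nest() %>%
  pwalk(function(Subject, data) {
    cat(as.character(Subject), "\n")                         # label row
    write.table(data, row.names = FALSE, col.names = FALSE)  # group rows
  })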
Here is an inelegant solution that worked for me, inspired by Bertil Baron's answer. I would be happy to get any simpler code...
trans_output <- function(M) {
  M1 <- M %>%
    group_by(Subject) %>%
    nest()
  df <- NULL
  for (i in 1:2)   # hard-coded for the two subjects in the example
  {
    output2 <- M1$data[[i]]
    df_sub <- rbind(as.character(M1$Subject[[i]]),  # subject ID
                    output2)                        # output data
    idx <- c(1L)
    df_sub <- df_sub %>%
      mutate(Y = ifelse(row_number() %in% idx, NA, Y)) %>%  # blank out Y in the label row
      transmute(X = X,
                Y = as.numeric(Y))
    df <- rbind(df, df_sub)
    rm(df_sub)
  }
  return(df)
}
M_trans <- trans_output(M)
The output looks like this:
> M_trans
# A tibble: 6 x 2
X Y
<chr> <dbl>
1 A NA
2 1 0.2
3 2 0.3
4 B NA
5 3 1
6 4 0.5

R dplyr join by range or virtual column

I want to join two tibbles by a range or a virtual column, but it seems the by parameter only accepts a character or character vector of existing column names.
In my example I have a tibble d with a column value, and a tibble r with a from and a to column.
library(dplyr)

d <- tibble(value = seq(1, 6, by = 0.2))
r <- tibble(from = seq(1, 6), to = c(seq(2, 6), Inf), class = LETTERS[seq(1, 6)])
> d
# A tibble: 26 x 1
value
<dbl>
1 1.0
2 1.2
3 1.4
4 1.6
5 1.8
6 2.0
7 2.2
8 2.4
9 2.6
10 2.8
# ... with 16 more rows
> r
# A tibble: 6 x 3
from to class
<int> <dbl> <chr>
1 1 2 A
2 2 3 B
3 3 4 C
4 4 5 D
5 5 6 E
6 6 Inf F
Now I want to join the value column in d within the range of from and to in r:
d %>% inner_join(r, by = "value between from and to") # >= and <
I can't find a way to do this, so I decided to join the floor of value in d with the from column in r:
d %>% inner_join(r, by = c("floor(value)" = "from"))
Of course I can create a second column to solve that:
d %>%
  mutate(join_value = floor(value)) %>%
  inner_join(r, by = c("join_value" = "from")) %>%
  select(value, class)
# A tibble: 26 x 2
value class
<dbl> <chr>
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 B
7 2.2 B
8 2.4 B
9 2.6 B
10 2.8 B
# ... with 16 more rows
but isn't there a more comfortable way?
Thanks
I don't think inequality joins are implemented in dplyr yet, or ever will be (see this discussion on Join on inequality constraints), but this is a good situation to use an SQL join:
library(tibble)
library(sqldf)

as.tibble(sqldf("select d.value, r.class from d
                 join r on d.value >= r.'from' and
                           d.value < r.'to'"))
Alternatively, if you want to integrate the join into your dplyr chain, you can use fuzzyjoin::fuzzy_join:
library(dplyr)
library(fuzzyjoin)

d %>%
  fuzzy_join(r, by = c("value" = "from", "value" = "to"),
             match_fun = list(`>=`, `<`)) %>%
  select(value, class)
Result:
# A tibble: 31 x 2
value class
<dbl> <chr>
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 A
7 2.0 B
8 2.2 B
9 2.4 B
10 2.6 B
# ... with 21 more rows
Notice I added single quotes around from and to since those are reserved words for the SQL language.
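A note for current readers: since dplyr 1.1.0, inequality joins are built in via join_by(), so this particular problem no longer needs a workaround:
# Requires dplyr >= 1.1.0
d %>%
  inner_join(r, by = join_by(value >= from, value < to)) %>%
  select(value, class)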
OK, thanks for the advice, this was pretty interesting. I finally wrote a function range_join (inspired by #ycw's code) and compared all the described solutions by runtime.
I like fuzzy_join, but with only 50k rows in d it needs more than 40 seconds. That's too slow.
Here are the results with 5k rows in d:
library(dplyr)
library(fuzzyjoin)
library(sqldf)

# join by range by #WiWeber
range_join <- function(x, y, value, left, right) {
  x_result <- tibble()
  for (y_ in split(y, 1:nrow(y)))
    x_result <- x_result %>%
      bind_rows(x[x[[value]] >= y_[[left]] & x[[value]] < y_[[right]], ] %>% cbind(y_))
  return(x_result)
}

# dynamic join by #ycw
dynamic_join <- function(d, r) {
  d$type <- NA_character_
  for (r_ in split(r, r$type))
    d <- d %>% mutate(type = ifelse(value >= r_$from & value < r_$to, r_$type, type))
  return(d)
}

d <- tibble(value = seq(1, 6, by = 0.001), join = TRUE)
r <- tibble(from = seq(1, 6), to = c(seq(2, 6), Inf), type = LETTERS[seq(1, 6)], join = TRUE)

# #useR sqldf - fast and intuitive but extra library with horrible code
start <- Sys.time()
d2 <- tbl_df(sqldf("select d.value, r.type from d
                    join r on d.value >= r.'from' and
                              d.value < r.'to'"))
Sys.time() - start

# #useR fuzzy_join .... very cool but veeeeeeeeeeeeeeeery slow
start <- Sys.time()
d2 <- d %>%
  fuzzy_join(r, by = c("value" = "from", "value" = "to"), match_fun = list(`>=`, `<`)) %>%
  select(value, type)
Sys.time() - start

# #jonathande4 cut - pretty fast
start <- Sys.time()
d2 <- d
d2$type <- cut(d$value, unique(c(r$from, r$to)), r$type, right = FALSE)
Sys.time() - start

# #WiWeber floor
start <- Sys.time()
d2 <- d %>%
  mutate(join_value = floor(value)) %>%
  inner_join(r, by = c("join_value" = "from")) %>%
  select(value, type)
Sys.time() - start

# #WiWeber cross join - filter
start <- Sys.time()
d2 <- d %>%
  inner_join(r, by = "join") %>%
  filter(value >= from, value < to) %>%
  select(value, type)
Sys.time() - start

# #hardik-gupta sapply
start <- Sys.time()
d2 <- d %>%
  mutate(type = unlist(sapply(value, function(x) r[which(x >= r$from & x < r$to), "type"]))) %>%
  select(value, type)
Sys.time() - start

# #ycw re-dynamic join
start <- Sys.time()
d2 <- d %>% dynamic_join(r)
Sys.time() - start

# #WiWeber range_join
start <- Sys.time()
d2 <- d %>%
  range_join(r, "value", "from", "to") %>%
  select(value, type)
Sys.time() - start
Results:
# #useR sqldf - fast and intuitive but extra library with horrible code
Time difference of 0.06221986 secs
# #useR fuzzy_join .... very cool but veeeeeeeeeeeeeeeery slow
Time difference of 4.765595 secs
# #jonathande4 cut pretty fast
Time difference of 0.004637003 secs
# #WiWeber floor
Time difference of 0.02223396 secs
# #WiWeber cross join - filter
Time difference of 0.0201931 secs
# #hardik-gupta sapply
Time difference of 5.166633 secs
# #ycw dynamic join
Time difference of 0.03124094 secs
# #WiWeber range_join
Time difference of 0.02691698 secs
Greetings, WiWeber
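(For completeness, data.table can also solve this with a non-equi join; a sketch, assuming d and r are first converted with setDT() and use the column names from the benchmark above:)
library(data.table)
setDT(d); setDT(r)
# For each row of d, find the r row with from <= value < to
r[d, on = .(from <= value, to > value), .(value = i.value, type = x.type)]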
You can use the cut function to create a "class" column in object d and then use a left join.
d <- tibble(value = seq(1,6, by = 0.2))
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])
d[["class"]] <- cut(d[["value"]], c(0,2,3,4,5,6,Inf), c('A',"B", "C", "D", "E", "F"), right = FALSE)
d <- left_join(d, r)
To get the right buckets, you just need to work with the cut function to get what you want.
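The hard-coded breaks can also be derived from r, so the bucketing stays in sync with the lookup table (a sketch, assuming r is sorted by from as in the example):
d[["class"]] <- cut(d[["value"]], breaks = c(r$from, Inf), labels = r$class, right = FALSE)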
We can use sapply for this
library(tibble)

d <- tibble(value = seq(1, 6, by = 0.2))
r <- tibble(from = seq(1, 6), to = c(seq(2, 6), Inf), class = LETTERS[seq(1, 6)])

d <- cbind(d, data.frame(class = unlist(sapply(d$value, function(x) r[which(x >= r$from & x < r$to), "class"]))))
d
value class
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 B
7 2.2 B
8 2.4 B
9 2.6 B
10 2.8 B
11 3.0 C
12 3.2 C
13 3.4 C
14 3.6 C
15 3.8 C
16 4.0 D
17 4.2 D
18 4.4 D
19 4.6 D
20 4.8 D
21 5.0 E
22 5.2 E
23 5.4 E
24 5.6 E
25 5.8 E
26 6.0 F
We can use mutate and case_when from dplyr.
library(dplyr)

d2 <- d %>%
  mutate(class = case_when(
    value >= 1 & value < 2 ~ "A",
    value >= 2 & value < 3 ~ "B",
    value >= 3 & value < 4 ~ "C",
    value >= 4 & value < 5 ~ "D",
    value >= 5 & value < 6 ~ "E",
    value >= 6 ~ "F"
  ))
d2
# A tibble: 26 x 2
value class
<dbl> <chr>
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 B
7 2.2 B
8 2.4 B
9 2.6 B
10 2.8 B
# ... with 16 more rows
Update
Here is a workaround by defining a function for this task.
d <- tibble(value = seq(1, 6, by = 0.2))
r <- tibble(from = seq(1, 6), to = c(seq(2, 6), Inf), class = LETTERS[seq(1, 6)])

library(dplyr)

# Define a function for dynamic join
dynamic_join <- function(d, r) {
  if (!("class" %in% colnames(d))) {
    d[["class"]] <- NA_character_
  }
  d <- d %>%
    mutate(class = ifelse(value >= r$from & value < r$to, r$class, class))
  return(d)
}

re_dynamic_join <- function(d, r) {
  r_list <- split(r, r$class)
  for (i in 1:length(r_list)) {
    d <- dynamic_join(d, r_list[[i]])
  }
  return(d)
}

# Apply the function
d2 <- d %>% re_dynamic_join(r)
d2
# A tibble: 26 x 2
value class
<dbl> <chr>
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 B
7 2.2 B
8 2.4 B
9 2.6 B
10 2.8 B
# ... with 16 more rows
I really liked #WiWeber's range_join function, but it gives an error if no record falls within a range. Here's a modification (note that it also matches the right bound with <=, where the original used <):
library(dplyr)

d <- tibble(value = c(seq(1, 4, by = 0.2), 9))
r <- tibble(from = seq(1, 5), to = c(seq(2, 5), 8), class = LETTERS[seq(1, 5)])

range_join <- function(x, y, value, left, right) {
  all_matches <- tibble()
  x <- as.data.frame(x)
  y <- as.data.frame(y)
  x$index <- x[, value]
  for (i in 1:nrow(y)) {
    matches <- x %>% filter(index >= y[i, left] & index <= y[i, right])
    if (nrow(matches) > 0) {
      all_matches <- all_matches %>% bind_rows(matches %>% cbind(y[i, ]))
    }
  }
  all_matches <- all_matches %>% select(-index)
  return(all_matches)
}

data <- d %>%
  range_join(r, "value", "from", "to")
data
