Related
I am trying to divide each cell in a data frame by the sum of the column. For example, I have a data frame df:
sample a b c
a2 1 4 6
a3 5 5 4
I would like to create a new data frame that takes each cell in and divides by the sum of the column, like so:
sample a b c
a2 .167 .444 .6
a3 .833 .556 .4
I have seen answers using sweep(), but that looks like its for matrices, and I have data frames. I understand how to use colSums(), but I'm not sure how to write a function that loops through every cell in the column, and then divides by the column sum. Thanks for the help!
Solution 1
Here are two dplyr solutions. We can use mutate_at or mutate_if to efficiently specify which column we want to apply an operation, or under what condition we want to apply an operation.
library(dplyr)
# Apply the operation to all column except sample
dat2 <- dat %>%
mutate_at(vars(-sample), funs(./sum(.)))
dat2
# sample a b c
# 1 a2 0.1666667 0.4444444 0.6
# 2 a3 0.8333333 0.5555556 0.4
# Apply the operation if the column is numeric
dat2 <- dat %>%
mutate_if(is.numeric, funs(./sum(.)))
dat2
# sample a b c
# 1 a2 0.1666667 0.4444444 0.6
# 2 a3 0.8333333 0.5555556 0.4
Solution 2
We can also use the map_at and map_if function from the purrr package. However, since the output is a list, we will need as.data.frame from base R or as_data_frame from dplyr to convert the list to a data frame.
library(dplyr)
library(purrr)
# Apply the operation to column a, b, and c
dat2 <- dat %>%
map_at(c("a", "b", "c"), ~./sum(.)) %>%
as_data_frame()
dat2
# # A tibble: 2 x 4
# sample a b c
# <chr> <dbl> <dbl> <dbl>
# 1 a2 0.167 0.444 0.600
# 2 a3 0.833 0.556 0.400
# Apply the operation if the column is numeric
dat2 <- dat %>%
map_if(is.numeric, ~./sum(.)) %>%
as_data_frame()
dat2
# # A tibble: 2 x 4
# sample a b c
# <chr> <dbl> <dbl> <dbl>
# 1 a2 0.167 0.444 0.600
# 2 a3 0.833 0.556 0.400
Solution 3
We can also use the .SD and .SDcols from the data.table package.
library(data.table)
# Convert to data.table
setDT(dat)
dat2 <- copy(dat)
dat2[, (c("a", "b", "c")) := lapply(.SD, function(x) x/sum(x)), .SDcols = c("a", "b", "c")]
dat2[]
# sample a b c
# 1: a2 0.1666667 0.4444444 0.6
# 2: a3 0.8333333 0.5555556 0.4
Solution 4
We can also use the lapply function to loop through all column except the first column to perform the operation.
dat2 <- dat
dat2[, -1] <- lapply(dat2[, -1], function(x) x/sum(x))
dat2
# sample a b c
# 1 a2 0.1666667 0.4444444 0.6
# 2 a3 0.8333333 0.5555556 0.4
We can also use apply to loop through all columns but add an if-else statement in the function to make sure only perform the operation on the numeric columns.
dat2 <- dat
dat2[] <- lapply(dat2[], function(x){
# Check if the column is numeric
if (is.numeric(x)){
return(x/sum(x))
} else{
return(x)
}
})
dat2
# sample a b c
# 1 a2 0.1666667 0.4444444 0.6
# 2 a3 0.8333333 0.5555556 0.4
Solution 5
A dplyr and tidyr solution based on gather and spread.
library(dplyr)
library(tidyr)
dat2 <- dat %>%
gather(Column, Value, -sample) %>%
group_by(Column) %>%
mutate(Value = Value/sum(Value)) %>%
spread(Column, Value)
dat2
# # A tibble: 2 x 4
# sample a b c
# * <chr> <dbl> <dbl> <dbl>
# 1 a2 0.167 0.444 0.600
# 2 a3 0.833 0.556 0.400
Performance Evaluation
I am curious about which method has the best performance. So I conduct the following performance evaluation using the microbenchmark package with a data frame having the same column names as OP's example but with 1000000 rows.
library(dplyr)
library(tidyr)
library(purrr)
library(data.table)
library(microbenchmark)
set.seed(100)
dat <- data_frame(sample = paste0("a", 1:1000000),
a = rpois(1000000, lambda = 3),
b = rpois(1000000, lambda = 3),
c = rpois(1000000, lambda = 3))
# Convert the data frame to a data.table for later perofrmance evaluation
dat_dt <- as.data.table(dat)
head(dat)
# # A tibble: 6 x 4
# sample a b c
# <chr> <int> <int> <int>
# 1 a1 2 5 2
# 2 a2 2 5 5
# 3 a3 3 2 4
# 4 a4 1 2 2
# 5 a5 3 3 1
# 6 a6 3 6 1
In addition to all the methods I proposed, I also interested two other methods proposed by others: the prop.table method proposed by Henrik in the comments, and the apply method by Spacedman. I called all my solutions with m1_1, m1_2, m2_1, ... to m5. If there are two methods in one solution, I used _ to separate them. I also called the prop.table method as m6 and the apply method as m7. Notice that I modified m6 to have an output as a data frame so that all the methods can have data frame, tibble, or data.table output.
Here is the code I used to assess the performance.
per <- microbenchmark(m1_1 = {dat2 <- dat %>% mutate_at(vars(-sample), funs(./sum(.)))},
m1_2 = {dat2 <- dat %>% mutate_if(is.numeric, funs(./sum(.)))},
m2_1 = {dat2 <- dat %>%
map_at(c("a", "b", "c"), ~./sum(.)) %>%
as_data_frame()
},
m2_2 = {dat2 <- dat %>%
map_if(is.numeric, ~./sum(.)) %>%
as_data_frame()},
m3 = {dat_dt2 <- copy(dat_dt)
dat_dt2[, c("a", "b", "c") := lapply(.SD, function(x) x/sum(x)),
.SDcols = c("a", "b", "c")]},
m4_1 = {dat2 <- dat
dat2[, -1] <- lapply(dat2[, -1], function(x) x/sum(x))},
m4_2 = {dat2 <- dat
dat2[] <- lapply(dat2[], function(x){
if (is.numeric(x)){
return(x/sum(x))
} else{
return(x)
}
})},
m5 = {dat2 <- dat %>%
gather(Column, Value, -sample) %>%
group_by(Column) %>%
mutate(Value = Value/sum(Value)) %>%
spread(Column, Value)},
m6 = {dat2 <- dat
dat2[-1] <- prop.table(as.matrix(dat2[-1]), margin = 2)},
m7 = {dat2 <- dat
dat2[, -1] = apply(dat2[, -1], 2, function(x) {x/sum(x)})}
)
print(per)
# Unit: milliseconds
# expr min lq mean median uq max neval
# m1_1 23.335600 24.326445 28.71934 25.134798 27.465017 75.06974 100
# m1_2 20.373093 21.202780 29.73477 21.967439 24.897305 216.27853 100
# m2_1 9.452987 9.817967 17.83030 10.052634 11.056073 175.00184 100
# m2_2 10.009197 10.342819 16.43832 10.679270 11.846692 163.62731 100
# m3 16.195868 17.154327 34.40433 18.975886 46.521868 190.50681 100
# m4_1 8.100504 8.342882 12.66035 8.778545 9.348634 181.45273 100
# m4_2 8.130833 8.499926 15.84080 8.766979 9.732891 172.79242 100
# m5 5373.395308 5652.938528 5791.73180 5737.383894 5825.141584 6660.35354 100
# m6 117.038355 150.688502 191.43501 166.665125 218.837502 325.58701 100
# m7 119.680606 155.743991 199.59313 174.007653 215.295395 357.02775 100
library(ggplot2)
autoplot(per)
The result shows that methods based on lapply (m4_1 and m4_2) are the fastest, while the tidyr approach (m5) is the slowest, indicating that when row numbers are large it is not a good idea to use the gather and spread method.
DATA
dat <- read.table(text = "sample a b c
a2 1 4 6
a3 5 5 4",
header = TRUE, stringsAsFactors = FALSE)
Given this:
> d = data.frame(sample=c("a2","a3"),a=c(1,5),b=c(4,5),c=c(6,4))
> d
sample a b c
1 a2 1 4 6
2 a3 5 5 4
You can replace every column other than the first by applying over the rest:
> d[,-1] = apply(d[,-1],2,function(x){x/sum(x)})
> d
sample a b c
1 a2 0.1666667 0.4444444 0.6
2 a3 0.8333333 0.5555556 0.4
If you don't want d being stomped on make a copy beforehand.
You could do this in dplyr as well.
sample <- c("a2", "a3")
a <- c(1, 5)
b <- c(4, 5)
c <- c(6, 4)
dat <- data.frame(sample, a, b, c)
dat
library(dplyr)
dat %>%
mutate(
a.PCT = round(a/sum(a), 3),
b.PCT = round(b/sum(b), 3),
c.PCT = round(c/sum(c), 3))
sample a b c a.PCT b.PCT c.PCT
1 a2 1 4 6 0.167 0.444 0.6
2 a3 5 5 4 0.833 0.556 0.4
You can use the transpose of the matrix and then transpose again:
t(t(as.matrix(df))/colSums(df))
try apply:
mat <- matrix(1:6, ncol=3)
apply(mat,2, function(x) x / sum(x))
okay, if you have not numeric values in you columns you can force them to be numeric:
df <- data.frame( a=c('a', 'b'), b=c(3,4), d=c(1,6))
apply(df,2, function(x) {
x <- as.numeric(x)
x / sum(x)
})
I would like to ask if there is a way of removing a group from dataframe using dplyr (or anz other way in that matter) in the following way. Lets say I have a dataframe in the following form grouped by variable 1:
Variable 1 Variable 2
1 a
1 b
2 a
2 a
2 b
3 a
3 c
3 a
... ...
I would like to remove only groups that have in Variable 2 two consecutive same values. That is in table above it would remove group 2 because there are values a,a,b but not group c where is a,c,a. So I would get the table bellow?
Variable 1 Variable 2
1 a
1 b
3 a
3 c
3 a
... ...
To test for consecutive identical values, you can compare a value to the previous value in that column. In dplyr, this is possible with lag. (You could do the same thing with comparing to the next value, using lead. Result comes out the same.)
Group the data by variable1, get the lag of variable2, then add up how many of these duplicates there are in that group. Then filter for just the groups with no duplicates. After that, feel free to remove the dupesInGroup column.
library(tidyverse)
df %>%
group_by(variable1) %>%
mutate(dupesInGroup = sum(variable2 == lag(variable2), na.rm = T)) %>%
filter(dupesInGroup == 0)
#> # A tibble: 5 x 3
#> # Groups: variable1 [2]
#> variable1 variable2 dupesInGroup
#> <int> <chr> <int>
#> 1 1 a 0
#> 2 1 b 0
#> 3 3 a 0
#> 4 3 c 0
#> 5 3 a 0
Created on 2018-05-10 by the reprex package (v0.2.0).
prepare data frame:
df <- data.frame("Variable 1" = c(1, 1, 2, 2, 2, 3, 3, 3), "Variable 2" = unlist(strsplit("abaabaca", "")))
write functions to test if consecutive repetitions are there or not:
any.consecutive.p <- function(v) {
for (i in 1:(length(v) - 1)) {
if (v[i] == v[i + 1]) {
return(TRUE)
}
}
return(FALSE)
}
any.consecutive.in.col.p <- function(df, col) {
any.consecutive.p(df[, col])
}
any.consecutive.p returns TRUE if it finds first consecutive repetition in a vector (v).
any.consecutive.in.col.p() looks for consecutive repetitions in a column of a data frame.
split data frame by values of Variable.1
df.l <- split(df, df$Variable.1)
df.l
$`1`
Variable.1 Variable.2
1 1 a
2 1 b
$`2`
Variable.1 Variable.2
3 2 a
4 2 a
5 2 b
$`3`
Variable.1 Variable.2
6 3 a
7 3 c
8 3 a
Finally go over this data.frame list and test for each data frame, if it contains consecutive duplicates in Variable.2 column.
If found, don't collect it.
Bind the collected data frames by rows.
Reduce(rbind, lapply(df.l, function(df) if(!any.consecutive.in.col.p(df, "Variable.2")) {df}))
Variable.1 Variable.2
1 1 a
2 1 b
6 3 a
7 3 c
8 3 a
Say you want to remove all groups of df, grouped by a, where the column b has repeated values. You can do that as below.
set.seed(0)
df <- data.frame(a = rep(1:3, rep(3, 3)), b = sample(1:5, 9, T))
# dplyr
library(dplyr)
df %>%
group_by(a) %>%
filter(all(b != lag(b), na.rm = T))
#data.table
library(data.table)
setDT(df)
df[, if(all(b != shift(b), na.rm = T)) .SD, by = a]
Benchmark shows data.table is faster
#Results
# Unit: milliseconds
# expr min lq mean median uq max neval
# use_dplyr() 141.46819 165.03761 201.0975 179.48334 205.82301 539.5643 100
# use_DT() 36.27936 50.23011 64.9218 53.87114 66.73943 345.2863 100
# Method
set.seed(0)
df <- data.table(a = rep(1:2000, rep(1e3, 2000)), b = sample(1:1e3, 2e6, T))
use_dplyr <- function(x){
df %>%
group_by(a) %>%
filter(all(b != lag(b), na.rm = T))
}
use_DT <- function(x){
df[, if (all(b != shift(b), na.rm = T)) .SD, a]
}
microbenchmark(use_dplyr(), use_DT())
I have a dataframe df1 which contains 6 columns, two of which (var1 & var3) I am using to split df1 by, resulting in a list of dataframes ls1.
For each sub dataframe in ls1 I want to sample() x$var2, x$num times with x$probs probabilities as follows:
Create data:
var1 <- rep(LETTERS[seq( from = 1, to = 3 )], each = 6)
var2 <- rep(LETTERS[seq( from = 1, to = 3 )], 6)
var3 <- rep(1:2,3, each = 3)
num <- rep(c(10, 11, 13, 8, 20, 5), each = 3)
probs <- round(runif(18), 2)
df1 <- as.data.frame(cbind(var1, var2, var3, num, probs))
ls1 <- split(df1, list(df1$var1, df1$var3))
have a look at the first couple list elements:
$A.1
var1 var2 var3 num probs
1 A A 1 10 0.06
2 A B 1 10 0.27
3 A C 1 10 0.23
$B.1
var1 var2 var3 num probs
7 B A 1 13 0.93
8 B B 1 13 0.36
9 B C 1 13 0.04
lapply over ls1:
ls1 <- lapply(ls1, function(x) {
res <- table(sample(x$var2, size = as.numeric(as.character(x$num)),
replace = TRUE, prob = as.numeric(as.character(x$probs))))
res <- as.data.frame(res)
cbind(x, res = res$Freq)
})
df2 <- do.call("rbind", ls1)
df2
Have a look at the first couple list elements of the result:
$A.1
var1 var2 var3 num probs res
1 A A 1 10 0.06 2
2 A B 1 10 0.27 4
3 A C 1 10 0.23 4
$B.1
var1 var2 var3 num probs res
7 B A 1 13 0.93 10
8 B B 1 13 0.36 3
9 B C 1 13 0.04 0
So for each dataframe a new variable res is created, the sum of res equals num and the elements of var2 are represented in res in proportions relating to probs. This does what I want but it becomes very slow when there is a lot of data.
My Question: is there a way to replace the lapply piece of code with something more efficient/faster?
I am just beginning to learn about vectorization and am guessing this could be vectorized? but I am unsure of how to achieve it.
ls1 is eventually returned to a dataframe structure so if it doesn't need to become a list to begin with all the better (although it doesn't really matter how the data is structured for this step).
Any help would be much appreciated.
First, you should create df1 using data.frame() rather than converting from a matrix, because the matrix forces all data types to the be the same even though you have both numeric and character variables.
df1 <- data.frame(var1, var2, var3, num, probs)
Next, instead of using the sample function, the rmultinom function is much more efficient because it directly outputs the number of draws for each value in x$var2:
ls1 <- lapply(ls1, function(x) {
x$res <- rmultinom(1, x$num[1], x$probs)
x
})
This should be noticeably faster than using the sample approach.
Rather than splitting your data frame in groups, I would use package {dplyr} with a group_by+mutate:
library(dplyr)
df1 %>%
mutate_at(vars(num, probs), as.numeric) %>%
group_by(var1, var3) %>%
mutate(res = c(rmultinom(1, num[1], probs)))
This should be fast and you can keep the original data structure.
Learn more there.
I am trying to divide each cell in a data frame by the sum of the column. For example, I have a data frame df:
sample a b c
a2 1 4 6
a3 5 5 4
I would like to create a new data frame that takes each cell in and divides by the sum of the column, like so:
sample a b c
a2 .167 .444 .6
a3 .833 .556 .4
I have seen answers using sweep(), but that looks like its for matrices, and I have data frames. I understand how to use colSums(), but I'm not sure how to write a function that loops through every cell in the column, and then divides by the column sum. Thanks for the help!
Solution 1
Here are two dplyr solutions. We can use mutate_at or mutate_if to efficiently specify which column we want to apply an operation, or under what condition we want to apply an operation.
library(dplyr)
# Apply the operation to all column except sample
dat2 <- dat %>%
mutate_at(vars(-sample), funs(./sum(.)))
dat2
# sample a b c
# 1 a2 0.1666667 0.4444444 0.6
# 2 a3 0.8333333 0.5555556 0.4
# Apply the operation if the column is numeric
dat2 <- dat %>%
mutate_if(is.numeric, funs(./sum(.)))
dat2
# sample a b c
# 1 a2 0.1666667 0.4444444 0.6
# 2 a3 0.8333333 0.5555556 0.4
Solution 2
We can also use the map_at and map_if function from the purrr package. However, since the output is a list, we will need as.data.frame from base R or as_data_frame from dplyr to convert the list to a data frame.
library(dplyr)
library(purrr)
# Apply the operation to column a, b, and c
dat2 <- dat %>%
map_at(c("a", "b", "c"), ~./sum(.)) %>%
as_data_frame()
dat2
# # A tibble: 2 x 4
# sample a b c
# <chr> <dbl> <dbl> <dbl>
# 1 a2 0.167 0.444 0.600
# 2 a3 0.833 0.556 0.400
# Apply the operation if the column is numeric
dat2 <- dat %>%
map_if(is.numeric, ~./sum(.)) %>%
as_data_frame()
dat2
# # A tibble: 2 x 4
# sample a b c
# <chr> <dbl> <dbl> <dbl>
# 1 a2 0.167 0.444 0.600
# 2 a3 0.833 0.556 0.400
Solution 3
We can also use the .SD and .SDcols from the data.table package.
library(data.table)
# Convert to data.table
setDT(dat)
dat2 <- copy(dat)
dat2[, (c("a", "b", "c")) := lapply(.SD, function(x) x/sum(x)), .SDcols = c("a", "b", "c")]
dat2[]
# sample a b c
# 1: a2 0.1666667 0.4444444 0.6
# 2: a3 0.8333333 0.5555556 0.4
Solution 4
We can also use the lapply function to loop through all column except the first column to perform the operation.
dat2 <- dat
dat2[, -1] <- lapply(dat2[, -1], function(x) x/sum(x))
dat2
# sample a b c
# 1 a2 0.1666667 0.4444444 0.6
# 2 a3 0.8333333 0.5555556 0.4
We can also use apply to loop through all columns but add an if-else statement in the function to make sure only perform the operation on the numeric columns.
dat2 <- dat
dat2[] <- lapply(dat2[], function(x){
# Check if the column is numeric
if (is.numeric(x)){
return(x/sum(x))
} else{
return(x)
}
})
dat2
# sample a b c
# 1 a2 0.1666667 0.4444444 0.6
# 2 a3 0.8333333 0.5555556 0.4
Solution 5
A dplyr and tidyr solution based on gather and spread.
library(dplyr)
library(tidyr)
dat2 <- dat %>%
gather(Column, Value, -sample) %>%
group_by(Column) %>%
mutate(Value = Value/sum(Value)) %>%
spread(Column, Value)
dat2
# # A tibble: 2 x 4
# sample a b c
# * <chr> <dbl> <dbl> <dbl>
# 1 a2 0.167 0.444 0.600
# 2 a3 0.833 0.556 0.400
Performance Evaluation
I am curious about which method has the best performance. So I conduct the following performance evaluation using the microbenchmark package with a data frame having the same column names as OP's example but with 1000000 rows.
library(dplyr)
library(tidyr)
library(purrr)
library(data.table)
library(microbenchmark)
set.seed(100)
dat <- data_frame(sample = paste0("a", 1:1000000),
a = rpois(1000000, lambda = 3),
b = rpois(1000000, lambda = 3),
c = rpois(1000000, lambda = 3))
# Convert the data frame to a data.table for later perofrmance evaluation
dat_dt <- as.data.table(dat)
head(dat)
# # A tibble: 6 x 4
# sample a b c
# <chr> <int> <int> <int>
# 1 a1 2 5 2
# 2 a2 2 5 5
# 3 a3 3 2 4
# 4 a4 1 2 2
# 5 a5 3 3 1
# 6 a6 3 6 1
In addition to all the methods I proposed, I also interested two other methods proposed by others: the prop.table method proposed by Henrik in the comments, and the apply method by Spacedman. I called all my solutions with m1_1, m1_2, m2_1, ... to m5. If there are two methods in one solution, I used _ to separate them. I also called the prop.table method as m6 and the apply method as m7. Notice that I modified m6 to have an output as a data frame so that all the methods can have data frame, tibble, or data.table output.
Here is the code I used to assess the performance.
per <- microbenchmark(m1_1 = {dat2 <- dat %>% mutate_at(vars(-sample), funs(./sum(.)))},
m1_2 = {dat2 <- dat %>% mutate_if(is.numeric, funs(./sum(.)))},
m2_1 = {dat2 <- dat %>%
map_at(c("a", "b", "c"), ~./sum(.)) %>%
as_data_frame()
},
m2_2 = {dat2 <- dat %>%
map_if(is.numeric, ~./sum(.)) %>%
as_data_frame()},
m3 = {dat_dt2 <- copy(dat_dt)
dat_dt2[, c("a", "b", "c") := lapply(.SD, function(x) x/sum(x)),
.SDcols = c("a", "b", "c")]},
m4_1 = {dat2 <- dat
dat2[, -1] <- lapply(dat2[, -1], function(x) x/sum(x))},
m4_2 = {dat2 <- dat
dat2[] <- lapply(dat2[], function(x){
if (is.numeric(x)){
return(x/sum(x))
} else{
return(x)
}
})},
m5 = {dat2 <- dat %>%
gather(Column, Value, -sample) %>%
group_by(Column) %>%
mutate(Value = Value/sum(Value)) %>%
spread(Column, Value)},
m6 = {dat2 <- dat
dat2[-1] <- prop.table(as.matrix(dat2[-1]), margin = 2)},
m7 = {dat2 <- dat
dat2[, -1] = apply(dat2[, -1], 2, function(x) {x/sum(x)})}
)
print(per)
# Unit: milliseconds
# expr min lq mean median uq max neval
# m1_1 23.335600 24.326445 28.71934 25.134798 27.465017 75.06974 100
# m1_2 20.373093 21.202780 29.73477 21.967439 24.897305 216.27853 100
# m2_1 9.452987 9.817967 17.83030 10.052634 11.056073 175.00184 100
# m2_2 10.009197 10.342819 16.43832 10.679270 11.846692 163.62731 100
# m3 16.195868 17.154327 34.40433 18.975886 46.521868 190.50681 100
# m4_1 8.100504 8.342882 12.66035 8.778545 9.348634 181.45273 100
# m4_2 8.130833 8.499926 15.84080 8.766979 9.732891 172.79242 100
# m5 5373.395308 5652.938528 5791.73180 5737.383894 5825.141584 6660.35354 100
# m6 117.038355 150.688502 191.43501 166.665125 218.837502 325.58701 100
# m7 119.680606 155.743991 199.59313 174.007653 215.295395 357.02775 100
library(ggplot2)
autoplot(per)
The result shows that methods based on lapply (m4_1 and m4_2) are the fastest, while the tidyr approach (m5) is the slowest, indicating that when row numbers are large it is not a good idea to use the gather and spread method.
DATA
dat <- read.table(text = "sample a b c
a2 1 4 6
a3 5 5 4",
header = TRUE, stringsAsFactors = FALSE)
Given this:
> d = data.frame(sample=c("a2","a3"),a=c(1,5),b=c(4,5),c=c(6,4))
> d
sample a b c
1 a2 1 4 6
2 a3 5 5 4
You can replace every column other than the first by applying over the rest:
> d[,-1] = apply(d[,-1],2,function(x){x/sum(x)})
> d
sample a b c
1 a2 0.1666667 0.4444444 0.6
2 a3 0.8333333 0.5555556 0.4
If you don't want d being stomped on make a copy beforehand.
You could do this in dplyr as well.
sample <- c("a2", "a3")
a <- c(1, 5)
b <- c(4, 5)
c <- c(6, 4)
dat <- data.frame(sample, a, b, c)
dat
library(dplyr)
dat %>%
mutate(
a.PCT = round(a/sum(a), 3),
b.PCT = round(b/sum(b), 3),
c.PCT = round(c/sum(c), 3))
sample a b c a.PCT b.PCT c.PCT
1 a2 1 4 6 0.167 0.444 0.6
2 a3 5 5 4 0.833 0.556 0.4
You can use the transpose of the matrix and then transpose again:
t(t(as.matrix(df))/colSums(df))
try apply:
mat <- matrix(1:6, ncol=3)
apply(mat,2, function(x) x / sum(x))
okay, if you have not numeric values in you columns you can force them to be numeric:
df <- data.frame( a=c('a', 'b'), b=c(3,4), d=c(1,6))
apply(df,2, function(x) {
x <- as.numeric(x)
x / sum(x)
})
I need to find out how many factor levels reach values of a continuous variable.
The code below produces the desired result for the example data, but it is rather an awkward work around.
My real dataframe is much larger and the real plot should show more values (or is continuous) on the x-axis. I would appreciate an applicable code a lot.
set.seed(5)
df <- data.frame(ID = factor(c("a","a","b","c","d","e","e")),values = runif(7,0,6))
seq <- 1:5
length.unique <- function(x) length(unique(x))
sub1 <- df[which(df$values >= 1), ]
sub2 <- df[which(df$values >= 2), ]
sub3 <- df[which(df$values >= 3), ]
sub4 <- df[which(df$values >= 4), ]
sub5 <- df[which(df$values >= 5), ]
N_IDs <- c(length.unique(sub1$ID),length.unique(sub2$ID),length.unique(sub3$ID),length.unique(sub4$ID),length.unique(sub5$ID))
plot(N_IDs ~ seq, type="b")
Using tidyverse, you can save some time by first calculating the max value for each ID,
library(tidyverse)
idmax <- df %>% group_by(ID) %>% summarize(max=max(values)) %>% pull(max)
Then for each cut point, return the count that pass
map_df(1:5, ~data.frame(cut=., count=sum(idmax >.)))
# cut count
# 1 1 4
# 2 2 3
# 3 3 3
# 4 4 3
# 5 5 1
Using non-equi joins:
library(data.table)
setDT(df)
df[.(seq = 1:5), on = .(values >= seq), allow = T, .(N_IDs = uniqueN(ID)), by = .EACHI]
# values N_IDs
#1: 1 4
#2: 2 3
#3: 3 3
#4: 4 3
#5: 5 1