For each 0 in x, I want to randomly insert a number between 1:10 but i'm looking for an efficent way to do this in dplyr and/or data.table as I have a very large dataset (10m rows).
library(tidyverse)
df <- data.frame(x = 1:10)
df[4, 1] = 0
df[6, 1] = 0
df
# x
# 1 1
# 2 2
# 3 3
# 4 0
# 5 5
# 6 0
# 7 7
# 8 8
# 9 9
# 10 10
This doesnt work as it replaces each year with the same value:
set.seed(1)
df %>%
mutate(x2 = ifelse(x == 0, sample(1:10, 1), x))
# x x2
# 1 1 1
# 2 2 2
# 3 3 3
# 4 0 9
# 5 5 5
# 6 0 9
# 7 7 7
# 8 8 8
# 9 9 9
# 10 10 10
It can be achieved though with rowwise but is slow on a large dataset:
set.seed(1)
#use rowwise
df %>%
rowwise() %>%
mutate(x2 = ifelse(x == 0, sample(1:10, 1), x))
# x x2
# <dbl> <dbl>
# 1 1 1
# 2 2 2
# 3 3 3
# 4 0 9
# 5 5 5
# 6 0 4
# 7 7 7
# 8 8 8
# 9 9 9
# 10 10 10
Any suggestions to speed this up?
Thanks
Not in tidyverse, but you could just do something like this:
is_zero <- (df$x == 0)
replacements <- sample(1:10, sum(is_zero))
df$x[is_zero] <- replacements
Of course, you can collapse that down if you'd like.
df$x[df$x == 0] <- sample(1:10, sum(df$x == 0))
Using the above solutions and microbenchmark and a slight modification to the dataset for setup:
library(data.table)
library(tidyverse)
df <- data.frame(x = 1:100000, y = rbinom(100000, size = 1, 0.5)) %>%
mutate(x = ifelse(y == 0, 0, x)) %>%
dplyr::select(-y)
dt <- setDT(df)
test <- microbenchmark::microbenchmark(
base1 = {
df$x[df$x == 0] <- sample(1:10, sum(df$x == 0), replace = T)
},
dplyr1 = {
df %>%
mutate(x2 = replace(x, which(x == 0), sample(1:10, sum(x == 0), replace = T)))
},
dplyr2 = {
df %>% group_by(id=row_number()) %>%
mutate(across(c(x),.fns = list(x2 = ~ ifelse(.==0, sample(1:10, 1, replace = T), .)) )) %>%
ungroup() %>% select(-id)
},
data.table = {
dt[x == 0, x := sample(1:10, .N, replace = T)]
},
times = 500L
)
test
# Unit: microseconds
# expr min lq mean median uq max neval cld
# base1 733.7 785.9 979.0938 897.25 1137.0 1839.4 500 a
# dplyr1 5207.1 5542.1 6129.2276 5967.85 6476.0 21790.7 500 a
# dplyr2 15963406.4 16156889.2 16367969.8704 16395715.00 16518252.9 19276215.5 500 b
# data.table 1547.4 2229.3 2422.1278 2455.60 2573.7 15076.0 500 a
I thought data.table would be quickest but the base solution seems best (assuming I've set up the mircobenchmark correctly?).
EDIT based on #chinsoon12 comment
1e5 rows:
Unit: microseconds
expr min lq mean median uq max neval cld
base1 730.4 839.30 1380.465 1238.00 1322.85 28977.3 500 a
data.table 1394.8 1831.85 2030.215 1946.95 2060.40 29821.9 500 b
1e6 rows:
Unit: milliseconds
expr min lq mean median uq max neval cld
base1 9.8703 11.6596 16.030715 11.76195 12.04145 326.0118 500 b
data.table 2.3772 2.7939 3.855672 3.04700 3.25900 61.4083 500 a
data.table is the quickest
Maybe try with across() from dplyr in this way:
library(tidyverse)
#Data
df <- data.frame(x = 1:10)
df[4, 1] = 0
df[6, 1] = 0
#Code
df %>% group_by(id=row_number()) %>%
mutate(across(c(x),.fns = list(x2 = ~ ifelse(.==0, sample(1:10, 1), .)) )) %>%
ungroup() %>% select(-id)
Output:
# A tibble: 10 x 2
x x_x2
<dbl> <dbl>
1 1 1
2 2 2
3 3 3
4 0 5
5 5 5
6 0 6
7 7 7
8 8 8
9 9 9
10 10 10
I am adding a different answer because there are already votes on the base option I provided. But here can be a dplyr way using replace.
library(dplyr)
df %>%
mutate(x2 = replace(x, which(x == 0), sample(1:10, sum(x == 0))))
Here is a data.table option using similar logic to Adam's answer. This filters for rows that meet your criteria: x == 0, and then samples 1:10 .N times (which, without a grouping variable, is the number of rows of the filtered data.table).
library(data.table)
set.seed(1)
setDT(df)[x == 0, x := sample(1:10, .N)]
df
x
1: 1
2: 2
3: 3
4: 9
5: 5
6: 4
7: 7
8: 8
9: 9
10: 10
Related
I have data with a grouping variable "ID" and some values:
ID, Value
1, 1
1, 2
1, 3
1, 4
2, 5
2, 6
2, 7
2, 8
Within each group, I want to append the first row after the last row.
Desired result:
ID, Value
1, 1
1, 2
1, 3
1, 4
1, 1 # First row of ID 1 inserted as last row in the group
2, 5
2, 6
2, 7
2, 8
2, 5 # First row of ID 2 inserted as last row in the group
I have 5000 row of this.
Within tidyverse, you could use add_row with do (now deprecated) or group_modify (experimental):
dat |>
group_by(ID) |>
do(add_row(., ID = unique(.$ID), Value = first(.$Value))) |>
ungroup()
dat |>
group_by(ID) |>
group_modify(~ add_row(., Value = first(.$Value))) |>
ungroup()
Or bind_rows with summarize (my variation of #Gregor Thomas, thanks):
dat |>
group_by(ID) |>
summarize(bind_rows(cur_data(), head(cur_data(), 1))) |>
ungroup()
Or by applying the same logic of #Henrik using bind_rows, filter, and arrange:
dat |>
bind_rows(dat |> filter(!duplicated(ID))) |>
arrange(ID)
Output:
# A tibble: 10 × 2
ID Value
<int> <dbl>
1 1 1
2 1 2
3 1 3
4 1 4
5 1 1
6 2 5
7 2 6
8 2 7
9 2 8
10 2 5
Thanks to #SamR for the data.
And with data.table:
library(data.table)
dt <- data.table(ID = rep(1:2, each = 4), Value = 1:8)
dt[,.(Value = c(Value, first(Value))), ID]
#> ID Value
#> 1: 1 1
#> 2: 1 2
#> 3: 1 3
#> 4: 1 4
#> 5: 1 1
#> 6: 2 5
#> 7: 2 6
#> 8: 2 7
#> 9: 2 8
#> 10: 2 5
Benchmarking with a 5000-row table:
library(dplyr)
dt <- data.table(ID = rep(1:1250, each = 4), Value = 1:5e3)
f1 <- function(dt) dt[,.(Value = c(Value, first(Value))), ID]
# base R
f2 <- function(dt) do.call(rbind, lapply(split(dt, by = "ID"), function(x) rbind(x, x[1,])))
# tidyverse
f3 <- function(dt) {
dt %>%
group_by(ID) %>%
do(add_row(., ID = unique(.$ID), Value = first(.$Value))) %>%
ungroup()
}
f4 <- function(dt) {
dt %>%
group_by(ID) %>%
group_modify(~ add_row(., Value = first(.$Value))) %>%
ungroup()
}
microbenchmark::microbenchmark(data.table = f1(dt),
"base R" = f2(dt),
tidyverse1 = f3(dt),
tidyverse2 = f4(dt),
times = 10)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> data.table 3.4989 3.6844 4.16619 4.3085 4.4623 5.3020 10
#> base R 245.1397 263.1636 283.61131 284.8053 307.7105 310.3901 10
#> tidyverse1 761.7097 773.3705 791.05115 787.9463 808.5416 821.5321 10
#> tidyverse2 711.9593 716.4959 752.20273 728.2170 782.6474 837.1926 10
If speed is really important, this simple Rcpp function provides a very fast solution:
Rcpp::cppFunction(
"IntegerVector firstLast(const IntegerVector& x) {
const int n = x.size();
IntegerVector idxOut(2*n);
int i0 = 1;
int idx = 0;
idxOut(0) = 1;
for (int i = 1; i < n; i++) {
if (x(i) != x(i - 1)) {
idxOut(++idx) = i0;
i0 = i + 1;
}
idxOut(++idx) = i + 1;
}
idxOut(++idx) = i0;
return idxOut[Rcpp::Range(0, idx)];
}"
)
Benchmarking against the fastest solution from this answer (on a much larger dataset):
dt = data.table(ID = rep(1:125e4, each = 4), Value = 1:5e6)
microbenchmark::microbenchmark(
f_uniq_dt = setorder(rbindlist(list(dt, unique(dt, by = "ID"))), ID),
f_Rcpp = dt[firstLast(dt$ID)],
check = "equal"
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> f_uniq_dt 78.6056 83.71345 95.42876 85.80720 90.03685 175.8867 100
#> f_Rcpp 49.1485 53.38275 60.96322 55.44925 58.01485 121.3637 100
Use !duplicated to get first row by "ID" (more efficient than "by group" operations). rbind with original data and order result:
df = data.frame(ID = rep(1:2, each = 4), Value = 1:8)
d2 = rbind(df, df[!duplicated(df$ID), ])
d2[order(d2$ID), ]
# ID Value
# 1 1 1
# 2 1 2
# 3 1 3
# 4 1 4
# 11 1 1
# 5 2 5
# 6 2 6
# 7 2 7
# 8 2 8
# 51 2 5
Same idea with data.table::duplicated:
d = as.data.table(df)
d2 = rbindlist(list(d, d[!duplicated(d, by = "ID")]))
setorder(d2, ID)
More straightforward with data.table::unique:
d2 = rbindlist(list(d, unique(d, by = "ID")))
setorder(d2, ID)
Also data.table::rowid:
d2 = rbindlist(list(d, d[rowid(ID) == 1]))
setorder(d2, ID)
By avoiding "by group" operations, !duplicated, unique and rowid alternatives are all faster than the clear winner in the previous benchmark, data.table solution which uses by, on a 5000 row data set (see OP):
df = data.frame(ID = rep(1:1250, each = 4), Value = 1:5e3)
d = as.data.table(d)
microbenchmark(
f_by = {
d1 = d[ , .(Value = c(Value, first(Value))), by = ID]
},
f_dupl_df = {
d2 = rbind(df, df[!duplicated(df$ID), ])
d2 = d2[order(d2$ID), ]
},
f_dupl_dt = {
d3 = rbindlist(list(d, d[!duplicated(d, by = "ID")]))
setorder(d3, ID)
},
f_uniq_dt = {
d4 = rbindlist(list(d, unique(d, by = "ID")))
setorder(d4, ID)
},
f_rowid = {
d5 = rbindlist(list(d, d[rowid(ID) == 1]))
setorder(d5, ID)
},
times = 10L)
# Unit: milliseconds
# expr min lq mean median uq max
# f_by 8.5167 9.1397 11.01410 9.90925 12.3327 15.9134
# f_dupl_df 6.8337 7.0901 8.31057 7.56810 8.4899 13.9278
# f_dupl_dt 2.4742 2.6687 3.24932 3.18670 3.7993 4.3318
# f_uniq_dt 2.2059 2.4225 3.50756 3.36250 4.4590 5.6632
# f_rowid 2.2963 2.4295 3.43876 2.74345 4.8035 5.9278
all.equal(d1, as.data.table(d2))
all.equal(d1, d3)
all.equal(d1, d4)
all.equal(d1, d5)
# [1] TRUE
However, a sub-second benchmark isn't very informative, so try on larger data with many groups. The base solution loses ground. data.table::duplicated, unique and rowid scale better, and are now about 20 times faster, data.table::unique being fastest.
df = data.frame(ID = rep(1:1250000, each = 4), Value = 1:5e6)
d = as.data.table(df)
# Unit: milliseconds
# expr min lq mean median uq max neval
# f_by 6834.5959 7157.1686 12273.2399 7775.3919 8850.5324 35339.0262 10
# f_dupl_df 10732.1536 11035.4886 19440.4964 11691.5347 37956.6961 38387.4927 10
# f_dupl_dt 174.5640 183.8399 391.8605 381.8920 401.4929 962.4948 10
# f_uniq_dt 156.1267 161.9555 212.3472 180.7912 209.3905 406.7780 10
# f_rowid 192.1106 197.1564 380.0023 234.5851 474.5024 1172.6529 10
For completeness, a binary search, with mult = "first" to select the first match:
d[.(unique(ID)), on = .(ID), mult = "first"]
However, in both timings above, it ended on twice the time compared to the unique alternative.
Here is a base R method:
do.call(rbind,
lapply(split(dat, dat$ID), \(id_df) rbind(id_df, id_df[1,]))
)
# ID Value
# 1.1 1 1
# 1.2 1 2
# 1.3 1 3
# 1.4 1 4
# 1.5 1 1
# 2.5 2 5
# 2.6 2 6
# 2.7 2 7
# 2.8 2 8
# 2.51 2 5
It does give you slightly strange row names - if you care about that you can wrap it in (or pipe it to) tibble::as_tibble(), which removes row names altogether.
Alternatively you could do data.table::rbindlist(lapply(split(dat, dat$ID), \(id_df) rbind(id_df, id_df[1,]))), becausedata.table also does not use row names.
Data
dat <- read.csv(text = "ID, Value
1, 1
1, 2
1, 3
1, 4
2, 5
2, 6
2, 7
2, 8", h=T)
You can try the following data.table option
> setDT(df)[, .SD[(seq(1 + .N) - 1) %% .N + 1], ID]
ID Value
1: 1 1
2: 1 2
3: 1 3
4: 1 4
5: 1 1
6: 2 5
7: 2 6
8: 2 7
9: 2 8
10: 2 5
I want to know the best way to iterate over rows of a data frame when the value of a variable at row n depends on the value of variable(s) at row n-1 and/or n-2. Ideally I would like to do this in a "tidyverse" way, perhaps with purrr::pmap().
For example, say I have this data frame:
library(dplyr)
x <- tibble(t = c(1:10),
a = c(seq(100, 140, 10), rep(NA_real_, 5)),
b = c(runif(5), rep(NA_real_, 5)),
c = c(runif(5), rep(NA_real_, 5)))
x
#> # A tibble: 10 x 4
#> t a b c
#> <int> <dbl> <dbl> <dbl>
#> 1 1 100 0.750 0.900
#> 2 2 110 0.898 0.657
#> 3 3 120 0.731 0.000137
#> 4 4 130 0.208 0.696
#> 5 5 140 0.670 0.882
#> 6 6 NA NA NA
#> 7 7 NA NA NA
#> 8 8 NA NA NA
#> 9 9 NA NA NA
#> 10 10 NA NA NA
I have known values up to time (t) = 5. Beyond that, I wish to project values, using the following formulae:
a = lag(a) * 1.1
b = a * lag(b)
c = b * lag(a, 2)
This code achieves the desired output, but it's a clunky, horrible for loop that scales poorly to larger datasets:
for(i in 1:nrow(x)) {
x <- x %>%
mutate(a = if_else(!is.na(a), a, lag(a, 1) * 1.1),
b = if_else(!is.na(b), b, a * lag(b, 1)),
c = if_else(!is.na(c), c, b * lag(a, 2)))
}
x
#> # A tibble: 10 x 4
#> t a b c
#> <int> <dbl> <dbl> <dbl>
#> 1 1 100 7.50e- 1 9.00e- 1
#> 2 2 110 8.98e- 1 6.57e- 1
#> 3 3 120 7.31e- 1 1.37e- 4
#> 4 4 130 2.08e- 1 6.96e- 1
#> 5 5 140 6.70e- 1 8.82e- 1
#> 6 6 154 1.03e+ 2 1.34e+ 4
#> 7 7 169. 1.75e+ 4 2.45e+ 6
#> 8 8 186. 3.26e+ 6 5.02e+ 8
#> 9 9 205. 6.68e+ 8 1.13e+11
#> 10 10 225. 1.51e+11 2.80e+13
I think that for this sort of intrinsically iterative process it is genuinely hard to beat a for loop. The method proposed by #Shree depends on the NAs being continuous and starting in a known spot.
Here is my mild improvement on your loop, which I think is more readable and about 2.5 times the speed and will probably scale up better than your approach which combines vectorized operations with the loop. By moving out of the tidyverse altogether and embracing a rowwise loop that really works on each row one at a time, we get some efficiencies on both counts:
method_peter <- function(x){
for(i in 2:nrow(x)){
x[i, "a"] <- ifelse(is.na(x[i, "a"]), x[i - 1, "a"] * 1.1, x[i, "a"])
x[i, "b"] <- ifelse(is.na(x[i, "b"]), x[i, "a"] * x[i - 1, "b"], x[i, "b"])
x[i, "c"] <- ifelse(is.na(x[i, "c"]), x[i, "b"] * x[i - 2, "a"], x[i, "c"])
}
return(x)
}
There's doubtless more efficiencies possible, and of course this is an ideal candidate to rewrite it in C++ :).
This is about twice as fast as your method as seen by this:
method_matt <- function(x){
for(i in 1:nrow(x)) {
x <- x %>%
mutate(a = if_else(!is.na(a), a, lag(a, 1) * 1.1),
b = if_else(!is.na(b), b, a * lag(b, 1)),
c = if_else(!is.na(c), c, b * lag(a, 2)))
}
return(x)
}
set.seed(123)
x <- tibble(t = c(1:10),
a = c(seq(100, 140, 10), rep(NA_real_, 5)),
b = c(runif(5), rep(NA_real_, 5)),
c = c(runif(5), rep(NA_real_, 5)))
stopifnot(identical(method_matt(x), method_peter(x)))
library(microbenchmark)
microbenchmark(
method_matt(x),
method_peter(x)
)
which returns:
Unit: milliseconds
expr min lq mean median uq max neval
method_matt(x) 24.1975 25.50925 30.64438 26.33310 31.8681 74.5093 100
method_peter(x) 10.0005 10.56050 13.33751 11.06495 13.5913 42.0568 100
#Shree's method is much faster again and is ideal for the example data, but I'm not sure it is flexible enough to work in all your use cases.
I would like to see a tidyverse solution if there is one.
Edit: Added tidyverse approach
Here's a readable and flexible tidyverse approach. The downside is that it is very slow.
accumutate <- function(df, ...){
df %>% group_by(row_number()) %>%
nest() %>%
pull(data) %>%
accumulate(function(x,y){bind_rows(x,y) %>% mutate(!!!enquos(...)) }) %>%
.[[length(.)]]
}
x %>%
accumutate(a = ifelse(is.na(a), 1.1 * lag(a,1), a)) %>%
accumutate(b = ifelse(is.na(b), a * lag(b), b)) %>%
accumutate(c = ifelse(is.na(c),b * lag(a, 2), c))
#> # A tibble: 10 x 4
#> t a b c
#> <int> <dbl> <dbl> <dbl>
#> 1 1 100 2.88e- 1 4.56e- 2
#> 2 2 110 7.88e- 1 5.28e- 1
#> 3 3 120 4.09e- 1 8.92e- 1
#> 4 4 130 8.83e- 1 5.51e- 1
#> 5 5 140 9.40e- 1 4.57e- 1
#> 6 6 154 1.45e+ 2 1.88e+ 4
#> 7 7 169. 2.45e+ 4 3.43e+ 6
#> 8 8 186. 4.57e+ 6 7.04e+ 8
#> 9 9 205. 9.37e+ 8 1.59e+11
#> 10 10 225. 2.11e+11 3.94e+13
Created on 2020-10-07 by the reprex package (v0.3.0)
Here's another approach that you might find interesting. It's not concise or especially readable, but it's tidyverse (or at least functionally) inspired. And it performs fairly well.
It uses a semigroup pattern, converting the mutate expressions into binary functions, creating corresponding lists and then using accumulate.
library(tidyverse)
library(dplyr)
library(microbenchmark)
options(width =100)
set.seed(123)
# Create the data frame
x <- tibble(t = c(1:100),
a = c(seq(100, 140, 10), rep(NA_real_,100- 5)),
b = c(runif(5), rep(NA_real_, 100-5)),
c = c(runif(5), rep(NA_real_, 100-5)))
a_mappend <- function(a1, a2) {
ifelse(is.na(a2), a1 * 1.1, a2)
}
b_mappend <- function(ab1, ab2) {
list(a = ab2$a, b = ifelse(is.na(ab2$b), ab2$a * ab1$b,ab2$b))
}
c_mappend <- function(abc12, abc23) {
list(abc1 = list(a = abc12$abc2$a, b = abc12$abc2$b, c = abc12$abc2$c),
abc2 = list(a = abc23$abc2$a, b = abc23$abc2$b, c = ifelse(is.na(abc23$abc2$c),abc12$abc1$a * abc23$abc2$b,abc23$abc2$c)))
}
method_ian <- function(x) {
x %>%
mutate(a = accumulate(a, a_mappend)) %>%
mutate(b = list(a, b) %>%
pmap(~ list(a = .x, b = .y)) %>%
accumulate(b_mappend) %>% map_dbl(~ .x$b)) %>%
mutate(c = list(a, b, c, c(a[-1], NA), c(b[-1], NA), c(c[-1], NA)) %>%
pmap(~ list(abc1 = list(a = ..1, b = ..2, c = ..3),
abc2 = list(a = ..4, b = ..5, c = ..6))) %>%
accumulate(c_mappend) %>% map_dbl(~ .x$abc1$c))
}
method_matt <- function(x){
for(i in 1:nrow(x)) {
x <- x %>%
mutate(a = if_else(!is.na(a), a, lag(a, 1) * 1.1),
b = if_else(!is.na(b), b, a * lag(b, 1)),
c = if_else(!is.na(c), c, b * lag(a, 2)))
}
return(x)
}
method_peter <- function(x){
for(i in 2:nrow(x)){
x[i, "a"] <- ifelse(is.na(x[i, "a"]), x[i - 1, "a"] * 1.1, x[i, "a"])
x[i, "b"] <- ifelse(is.na(x[i, "b"]), x[i, "a"] * x[i - 1, "b"], x[i, "b"])
x[i, "c"] <- ifelse(is.na(x[i, "c"]), x[i, "b"] * x[i - 2, "a"], x[i, "c"])
}
return(x)
}
stopifnot(identical(method_matt(x), method_ian(x)))
microbenchmark( method_matt(x), method_peter(x), method_ian(x))
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> method_matt(x) 324.90086 330.93192 337.46518 334.55447 338.38461 426.30457 100
#> method_peter(x) 208.27498 211.60526 213.59438 212.66088 214.36421 242.59854 100
#> method_ian(x) 13.06774 13.43105 14.30003 13.86428 14.32263 19.54843 100
Created on 2020-10-06 by the reprex package (v0.3.0)
I don't think there's any simple way in tidyverse to do calculations with row-dependencies. Something with Reduce or gather + spread could be possible but I don't expect them to score poits on readability.
Anyways, on the bright side, your calculations are vectorizable using dplyr and zoo packages -
x %>%
mutate(
a = ifelse(is.na(a), na.locf(a) * 1.1^(t-5), a),
b = ifelse(is.na(b), na.locf(b) * c(rep(1, 5), cumprod(a[6:n()])), b),
c = ifelse(is.na(c), b * lag(a, 2), c)
)
# A tibble: 10 x 4
t a b c
<int> <dbl> <dbl> <dbl>
1 1 100 1.85e- 1 9.43e- 1
2 2 110 7.02e- 1 1.29e- 1
3 3 120 5.73e- 1 8.33e- 1
4 4 130 1.68e- 1 4.68e- 1
5 5 140 9.44e- 1 5.50e- 1
6 6 154 1.45e+ 2 1.89e+ 4
7 7 169. 2.46e+ 4 3.45e+ 6
8 8 186. 4.59e+ 6 7.07e+ 8
9 9 205. 9.40e+ 8 1.59e+11
10 10 225. 2.12e+11 3.95e+13
Data -
set.seed(2)
x <- tibble(t = c(1:10),
a = c(seq(100, 140, 10), rep(NA_real_, 5)),
b = c(runif(5), rep(NA_real_, 5)),
c = c(runif(5), rep(NA_real_, 5)))
Hei, I learn R and I try to count how many zeros I have within the melted data. So, I want to know how many zeros corresponds to column a and b and print two results out.
I generated an example:
library(reshape)
library(plyr)
library(dplyr)
id = c(1,2,3,4,5,6,7,8,9,10)
b = c(0,0,5,6,3,7,2,8,1,8)
c = c(0,4,9,87,0,87,0,4,5,0)
test = data.frame(id,b,c)
test_melt = melt(test, id.vars = "id")
test_melt
I imagine for that I should create an if statement. Something with
if (test$value == 0){print()}, but how can I tell R to count zeros for a columns that have been melted?
With your data:
test_melt %>%
group_by(variable) %>%
summarize(zeroes = sum(value == 0))
# # A tibble: 2 x 2
# variable zeroes
# <fctr> <int>
# 1 b 2
# 2 c 4
Base R:
aggregate(test_melt$value, by = list(variable = test_melt$variable),
FUN = function(x) sum(x == 0))
# variable x
# 1 b 2
# 2 c 4
... and for curiosity:
library(microbenchmark)
microbenchmark(
dplyr = group_by(test_melt, variable) %>% summarize(zeroes = sum(value == 0)),
base1 = aggregate(test_melt$value, by = list(variable = test_melt$variable), FUN = function(x) sum(x == 0)),
# #PankajKaundal's suggested "formula" notation reads easier
base2 = aggregate(value ~ variable, test_melt, function(x) sum(x == 0))
)
# Unit: microseconds
# expr min lq mean median uq max neval
# dplyr 916.421 986.985 1069.7000 1022.1760 1094.7460 2272.636 100
# base1 647.658 682.302 783.2065 715.3045 765.9940 1905.411 100
# base2 813.219 867.737 950.3247 897.0930 959.8175 2017.001 100
sum(test_melt$value==0)
This should do it.
This might help . Is this what you're looking for ?
> test_melt[4] <- 1
> test_melt2 <- aggregate(V4 ~ value + variable, test_melt, sum)
> test_melt2
value variable V4
1 0 b 2
2 1 b 1
3 2 b 1
4 3 b 1
5 5 b 1
6 6 b 1
7 7 b 1
8 8 b 2
9 0 c 4
10 4 c 2
11 5 c 1
12 9 c 1
13 87 c 2
V4 is the count
This question already has answers here:
How do I get a contingency table?
(6 answers)
Faster ways to calculate frequencies and cast from long to wide
(4 answers)
Closed 4 years ago.
Have a data frame like this
other=data.frame(name=c("a","b","a","c","d"),result=c("Y","N","Y","Y","N"))
How can I use spread function in tidyr or other function to get the count of result Y or N as column header like this
name Y N
a 2 0
b 0 1
Thanks
These are a few ways of many to go about it:
1) With library dplyr, you can simply group things and count into the format needed:
library(dplyr)
other %>% group_by(name) %>% summarise(N = sum(result == 'N'), Y = sum(result == 'Y'))
Source: local data frame [4 x 3]
name N Y
<fctr> <int> <int>
1 a 0 2
2 b 1 0
3 c 0 1
4 d 1 0
2) You can use a combination of table and tidyr spread as follows:
library(tidyr)
spread(as.data.frame(table(other)), result, Freq)
name N Y
1 a 0 2
2 b 1 0
3 c 0 1
4 d 1 0
3) You can use a combination of dplyr and tidyr to do as follows:
library(dplyr)
library(tidyr)
spread(count(other, name, result), result, n, fill = 0)
Source: local data frame [4 x 3]
Groups: name [4]
name N Y
<fctr> <dbl> <dbl>
1 a 0 2
2 b 1 0
3 c 0 1
4 d 1 0
Here is another option using dcast from data.table
library(data.table)
dcast(setDT(other), name~result, length)
# name N Y
#1: a 0 2
#2: b 1 0
#3: c 0 1
#4: d 1 0
Although, table(other) would be a compact option (from #mtoto's comments), for large datasets, it may be more efficient to use dcast. Some benchmarks are given below
set.seed(24)
other1 <- data.frame(name = sample(letters, 1e6, replace=TRUE),
result = sample(c("Y", "N"), 1e6, replace=TRUE), stringsAsFactors=FALSE)
other2 <- copy(other1)
gopala1 <- function() other1 %>%
group_by(name) %>%
summarise(N = sum(result == 'N'), Y = sum(result == 'Y'))
gopala2 <- function() spread(as.data.frame(table(other1)), result, Freq)
gopala3 <- function() spread(count(other1, name, result), result, n, fill = 0)
akrun <- function() dcast(as.data.table(other2), name~result, length)
library(microbenchmark)
microbenchmark(gopala1(), gopala2(), gopala3(),
akrun(), unit='relative', times = 20L)
# expr min lq mean median uq max neval
# gopala1() 2.710561 2.331915 2.142183 2.325167 2.134399 1.513725 20
# gopala2() 2.859464 2.564126 2.531130 2.683804 2.720833 1.982760 20
# gopala3() 2.345062 2.076400 1.953136 2.027599 1.882079 1.947759 20
# akrun() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 20
I have a data frame that looks as follows:
> df <- data_frame(g = c('A', 'A', 'B', 'B', 'B', 'C'), x = c(7, 3, 5, 9, 2, 4))
> df
Source: local data frame [6 x 2]
g x
1 A 7
2 A 3
3 B 5
4 B 9
5 B 2
6 C 4
I know how to add a column with the maximum x value for each group g:
> df %>% group_by(g) %>% mutate(x_max = max(x))
Source: local data frame [6 x 3]
Groups: g
g x x_max
1 A 7 7
2 A 3 7
3 B 5 9
4 B 9 9
5 B 2 9
6 C 4 4
But what I would like is to get is the maximum x value for each group g, excluding the x value in each row.
For the given example, the desired output would look like this:
Source: local data frame [6 x 3]
Groups: g
g x x_max x_max_exclude
1 A 7 7 3
2 A 3 7 7
3 B 5 9 9
4 B 9 9 5
5 B 2 9 9
6 C 4 4 NA
I thought I might be able to use row_number() to remove particular elements and take the max of what remained, but hit warning messages and got incorrect -Inf output:
> df %>% group_by(g) %>% mutate(x_max = max(x), r = row_number(), x_max_exclude = max(x[-r]))
Source: local data frame [6 x 5]
Groups: g
g x x_max r x_max_exclude
1 A 7 7 1 -Inf
2 A 3 7 2 -Inf
3 B 5 9 1 -Inf
4 B 9 9 2 -Inf
5 B 2 9 3 -Inf
6 C 4 4 1 -Inf
Warning messages:
1: In max(c(4, 9, 2)[-1:3]) :
no non-missing arguments to max; returning -Inf
2: In max(c(4, 9, 2)[-1:3]) :
no non-missing arguments to max; returning -Inf
3: In max(c(4, 9, 2)[-1:3]) :
no non-missing arguments to max; returning -Inf
What is the most {readable, concise, efficient} way to get this output in dplyr? Any insight into why my attempt using row_number() doesn't work would also be much appreciated. Thanks for the help.
You could try:
df %>%
group_by(g) %>%
arrange(desc(x)) %>%
mutate(max = ifelse(x == max(x), x[2], max(x)))
Which gives:
#Source: local data frame [6 x 3]
#Groups: g
#
# g x max
#1 A 7 3
#2 A 3 7
#3 B 9 5
#4 B 5 9
#5 B 2 9
#6 C 4 NA
Benchmark
I've tried the solutions so far on the benchmark:
df <- data.frame(g = sample(LETTERS, 10e5, replace = TRUE),
x = sample(1:10, 10e5, replace = TRUE))
library(microbenchmark)
mbm <- microbenchmark(
steven = df %>%
group_by(g) %>%
arrange(desc(x)) %>%
mutate(max = ifelse(x == max(x), x[2], max(x))),
eric = df %>%
group_by(g) %>%
mutate(x_max = max(x),
x_max2 = sort(x, decreasing = TRUE)[2],
x_max_exclude = ifelse(x == x_max, x_max2, x_max)) %>%
select(-x_max2),
arun = setDT(df)[order(x), x_max_exclude := c(rep(x[.N], .N-1L), x[.N-1L]), by=g],
times = 50
)
#Arun's data.table solution is the fastest:
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# steven 158.58083 163.82669 197.28946 210.54179 212.1517 260.1448 50 b
# eric 223.37877 228.98313 262.01623 274.74702 277.1431 284.5170 50 c
# arun 44.48639 46.17961 54.65824 47.74142 48.9884 102.3830 50 a
Interesting problem. Here's one way using data.table:
require(data.table)
setDT(df)[order(x), x_max_exclude := c(rep(x[.N], .N-1L), x[.N-1L]), by=g]
The idea is to order by column x and on those indices, we group by g. Since we've the ordered indices, for the first .N-1 rows, the max value is the value at .N. And for the .Nth row, it's the value at .N-1th row.
.N is a special variable that holds the number of observations in each group.
I'll leave it to you and/or the dplyr experts to translate this (or answer with another approach).
This is the best I've come up with so far. Not sure if there's a better way.
df %>%
group_by(g) %>%
mutate(x_max = max(x),
x_max2 = sort(x, decreasing = TRUE)[2],
x_max_exclude = ifelse(x == x_max, x_max2, x_max)) %>%
select(-x_max2)
Another way with a functional:
df %>% group_by(g) %>% mutate(x_max_exclude = max_exclude(x))
Source: local data frame [6 x 3]
Groups: g
g x x_max_exclude
1 A 7 3
2 A 3 7
3 B 5 9
4 B 9 5
5 B 2 9
6 C 4 NA
We write a function called max_exclude that does the operation that you describe.
max_exclude <- function(v) {
res <- c()
for(i in seq_along(v)) {
res[i] <- suppressWarnings(max(v[-i]))
}
res <- ifelse(!is.finite(res), NA, res)
as.numeric(res)
}
It works with base R too:
df$x_max_exclude <- with(df, ave(x, g, FUN=max_exclude))
Source: local data frame [6 x 3]
g x x_max_exclude
1 A 7 3
2 A 3 7
3 B 5 9
4 B 9 5
5 B 2 9
6 C 4 NA
Benchmark
Here's a lesson kids, beware of for loops!
big.df <- data.frame(g=rep(LETTERS[1:4], each=1e3), x=sample(10, 4e3, replace=T))
microbenchmark(
plafort_dplyr = big.df %>% group_by(g) %>% mutate(x_max_exclude = max_exclude(x)),
plafort_ave = big.df$x_max_exclude <- with(big.df, ave(x, g, FUN=max_exclude)),
StevenB = (big.df %>%
group_by(g) %>%
mutate(max = ifelse(row_number(desc(x)) == 1, x[row_number(desc(x)) == 2], max(x)))
),
Eric = df %>%
group_by(g) %>%
mutate(x_max = max(x),
x_max2 = sort(x, decreasing = TRUE)[2],
x_max_exclude = ifelse(x == x_max, x_max2, x_max)) %>%
select(-x_max2),
Arun = setDT(df)[order(x), x_max_exclude := c(rep(x[.N], .N-1L), x[.N-1L]), by=g]
)
Unit: milliseconds
expr min lq mean median uq max neval
plafort_dplyr 75.219042 85.207442 89.247409 88.203225 90.627663 179.553166 100
plafort_ave 75.907798 84.604180 87.136122 86.961251 89.431884 104.884294 100
StevenB 4.436973 4.699226 5.207548 4.931484 5.364242 11.893306 100
Eric 7.233057 8.034092 8.921904 8.414720 9.060488 15.946281 100
Arun 1.789097 2.037235 2.410915 2.226988 2.423638 9.326272 100