Counting amount of zeros within a "melted" data frame - r

Hei, I learn R and I try to count how many zeros I have within the melted data. So, I want to know how many zeros corresponds to column a and b and print two results out.
I generated an example:
library(reshape)
library(plyr)
library(dplyr)
id = c(1,2,3,4,5,6,7,8,9,10)
b = c(0,0,5,6,3,7,2,8,1,8)
c = c(0,4,9,87,0,87,0,4,5,0)
test = data.frame(id,b,c)
test_melt = melt(test, id.vars = "id")
test_melt
I imagine for that I should create an if statement. Something with
if (test$value == 0){print()}, but how can I tell R to count zeros for a columns that have been melted?

With your data:
test_melt %>%
group_by(variable) %>%
summarize(zeroes = sum(value == 0))
# # A tibble: 2 x 2
# variable zeroes
# <fctr> <int>
# 1 b 2
# 2 c 4
Base R:
aggregate(test_melt$value, by = list(variable = test_melt$variable),
FUN = function(x) sum(x == 0))
# variable x
# 1 b 2
# 2 c 4
... and for curiosity:
library(microbenchmark)
microbenchmark(
dplyr = group_by(test_melt, variable) %>% summarize(zeroes = sum(value == 0)),
base1 = aggregate(test_melt$value, by = list(variable = test_melt$variable), FUN = function(x) sum(x == 0)),
# #PankajKaundal's suggested "formula" notation reads easier
base2 = aggregate(value ~ variable, test_melt, function(x) sum(x == 0))
)
# Unit: microseconds
# expr min lq mean median uq max neval
# dplyr 916.421 986.985 1069.7000 1022.1760 1094.7460 2272.636 100
# base1 647.658 682.302 783.2065 715.3045 765.9940 1905.411 100
# base2 813.219 867.737 950.3247 897.0930 959.8175 2017.001 100

sum(test_melt$value==0)
This should do it.

This might help . Is this what you're looking for ?
> test_melt[4] <- 1
> test_melt2 <- aggregate(V4 ~ value + variable, test_melt, sum)
> test_melt2
value variable V4
1 0 b 2
2 1 b 1
3 2 b 1
4 3 b 1
5 5 b 1
6 6 b 1
7 7 b 1
8 8 b 2
9 0 c 4
10 4 c 2
11 5 c 1
12 9 c 1
13 87 c 2
V4 is the count

Related

Append first row by group to original data

I have data with a grouping variable "ID" and some values:
ID, Value
1, 1
1, 2
1, 3
1, 4
2, 5
2, 6
2, 7
2, 8
Within each group, I want to append the first row after the last row.
Desired result:
ID, Value
1, 1
1, 2
1, 3
1, 4
1, 1 # First row of ID 1 inserted as last row in the group
2, 5
2, 6
2, 7
2, 8
2, 5 # First row of ID 2 inserted as last row in the group
I have 5000 row of this.
Within tidyverse, you could use add_row with do (now deprecated) or group_modify (experimental):
dat |>
group_by(ID) |>
do(add_row(., ID = unique(.$ID), Value = first(.$Value))) |>
ungroup()
dat |>
group_by(ID) |>
group_modify(~ add_row(., Value = first(.$Value))) |>
ungroup()
Or bind_rows with summarize (my variation of #Gregor Thomas, thanks):
dat |>
group_by(ID) |>
summarize(bind_rows(cur_data(), head(cur_data(), 1))) |>
ungroup()
Or by applying the same logic of #Henrik using bind_rows, filter, and arrange:
dat |>
bind_rows(dat |> filter(!duplicated(ID))) |>
arrange(ID)
Output:
# A tibble: 10 × 2
ID Value
<int> <dbl>
1 1 1
2 1 2
3 1 3
4 1 4
5 1 1
6 2 5
7 2 6
8 2 7
9 2 8
10 2 5
Thanks to #SamR for the data.
And with data.table:
library(data.table)
dt <- data.table(ID = rep(1:2, each = 4), Value = 1:8)
dt[,.(Value = c(Value, first(Value))), ID]
#> ID Value
#> 1: 1 1
#> 2: 1 2
#> 3: 1 3
#> 4: 1 4
#> 5: 1 1
#> 6: 2 5
#> 7: 2 6
#> 8: 2 7
#> 9: 2 8
#> 10: 2 5
Benchmarking with a 5000-row table:
library(dplyr)
dt <- data.table(ID = rep(1:1250, each = 4), Value = 1:5e3)
f1 <- function(dt) dt[,.(Value = c(Value, first(Value))), ID]
# base R
f2 <- function(dt) do.call(rbind, lapply(split(dt, by = "ID"), function(x) rbind(x, x[1,])))
# tidyverse
f3 <- function(dt) {
dt %>%
group_by(ID) %>%
do(add_row(., ID = unique(.$ID), Value = first(.$Value))) %>%
ungroup()
}
f4 <- function(dt) {
dt %>%
group_by(ID) %>%
group_modify(~ add_row(., Value = first(.$Value))) %>%
ungroup()
}
microbenchmark::microbenchmark(data.table = f1(dt),
"base R" = f2(dt),
tidyverse1 = f3(dt),
tidyverse2 = f4(dt),
times = 10)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> data.table 3.4989 3.6844 4.16619 4.3085 4.4623 5.3020 10
#> base R 245.1397 263.1636 283.61131 284.8053 307.7105 310.3901 10
#> tidyverse1 761.7097 773.3705 791.05115 787.9463 808.5416 821.5321 10
#> tidyverse2 711.9593 716.4959 752.20273 728.2170 782.6474 837.1926 10
If speed is really important, this simple Rcpp function provides a very fast solution:
Rcpp::cppFunction(
"IntegerVector firstLast(const IntegerVector& x) {
const int n = x.size();
IntegerVector idxOut(2*n);
int i0 = 1;
int idx = 0;
idxOut(0) = 1;
for (int i = 1; i < n; i++) {
if (x(i) != x(i - 1)) {
idxOut(++idx) = i0;
i0 = i + 1;
}
idxOut(++idx) = i + 1;
}
idxOut(++idx) = i0;
return idxOut[Rcpp::Range(0, idx)];
}"
)
Benchmarking against the fastest solution from this answer (on a much larger dataset):
dt = data.table(ID = rep(1:125e4, each = 4), Value = 1:5e6)
microbenchmark::microbenchmark(
f_uniq_dt = setorder(rbindlist(list(dt, unique(dt, by = "ID"))), ID),
f_Rcpp = dt[firstLast(dt$ID)],
check = "equal"
)
#> Unit: milliseconds
#> expr min lq mean median uq max neval
#> f_uniq_dt 78.6056 83.71345 95.42876 85.80720 90.03685 175.8867 100
#> f_Rcpp 49.1485 53.38275 60.96322 55.44925 58.01485 121.3637 100
Use !duplicated to get first row by "ID" (more efficient than "by group" operations). rbind with original data and order result:
df = data.frame(ID = rep(1:2, each = 4), Value = 1:8)
d2 = rbind(df, df[!duplicated(df$ID), ])
d2[order(d2$ID), ]
# ID Value
# 1 1 1
# 2 1 2
# 3 1 3
# 4 1 4
# 11 1 1
# 5 2 5
# 6 2 6
# 7 2 7
# 8 2 8
# 51 2 5
Same idea with data.table::duplicated:
d = as.data.table(df)
d2 = rbindlist(list(d, d[!duplicated(d, by = "ID")]))
setorder(d2, ID)
More straightforward with data.table::unique:
d2 = rbindlist(list(d, unique(d, by = "ID")))
setorder(d2, ID)
Also data.table::rowid:
d2 = rbindlist(list(d, d[rowid(ID) == 1]))
setorder(d2, ID)
By avoiding "by group" operations, !duplicated, unique and rowid alternatives are all faster than the clear winner in the previous benchmark, data.table solution which uses by, on a 5000 row data set (see OP):
df = data.frame(ID = rep(1:1250, each = 4), Value = 1:5e3)
d = as.data.table(d)
microbenchmark(
f_by = {
d1 = d[ , .(Value = c(Value, first(Value))), by = ID]
},
f_dupl_df = {
d2 = rbind(df, df[!duplicated(df$ID), ])
d2 = d2[order(d2$ID), ]
},
f_dupl_dt = {
d3 = rbindlist(list(d, d[!duplicated(d, by = "ID")]))
setorder(d3, ID)
},
f_uniq_dt = {
d4 = rbindlist(list(d, unique(d, by = "ID")))
setorder(d4, ID)
},
f_rowid = {
d5 = rbindlist(list(d, d[rowid(ID) == 1]))
setorder(d5, ID)
},
times = 10L)
# Unit: milliseconds
# expr min lq mean median uq max
# f_by 8.5167 9.1397 11.01410 9.90925 12.3327 15.9134
# f_dupl_df 6.8337 7.0901 8.31057 7.56810 8.4899 13.9278
# f_dupl_dt 2.4742 2.6687 3.24932 3.18670 3.7993 4.3318
# f_uniq_dt 2.2059 2.4225 3.50756 3.36250 4.4590 5.6632
# f_rowid 2.2963 2.4295 3.43876 2.74345 4.8035 5.9278
all.equal(d1, as.data.table(d2))
all.equal(d1, d3)
all.equal(d1, d4)
all.equal(d1, d5)
# [1] TRUE
However, a sub-second benchmark isn't very informative, so try on larger data with many groups. The base solution loses ground. data.table::duplicated, unique and rowid scale better, and are now about 20 times faster, data.table::unique being fastest.
df = data.frame(ID = rep(1:1250000, each = 4), Value = 1:5e6)
d = as.data.table(df)
# Unit: milliseconds
# expr min lq mean median uq max neval
# f_by 6834.5959 7157.1686 12273.2399 7775.3919 8850.5324 35339.0262 10
# f_dupl_df 10732.1536 11035.4886 19440.4964 11691.5347 37956.6961 38387.4927 10
# f_dupl_dt 174.5640 183.8399 391.8605 381.8920 401.4929 962.4948 10
# f_uniq_dt 156.1267 161.9555 212.3472 180.7912 209.3905 406.7780 10
# f_rowid 192.1106 197.1564 380.0023 234.5851 474.5024 1172.6529 10
For completeness, a binary search, with mult = "first" to select the first match:
d[.(unique(ID)), on = .(ID), mult = "first"]
However, in both timings above, it ended on twice the time compared to the unique alternative.
Here is a base R method:
do.call(rbind,
lapply(split(dat, dat$ID), \(id_df) rbind(id_df, id_df[1,]))
)
# ID Value
# 1.1 1 1
# 1.2 1 2
# 1.3 1 3
# 1.4 1 4
# 1.5 1 1
# 2.5 2 5
# 2.6 2 6
# 2.7 2 7
# 2.8 2 8
# 2.51 2 5
It does give you slightly strange row names - if you care about that you can wrap it in (or pipe it to) tibble::as_tibble(), which removes row names altogether.
Alternatively you could do data.table::rbindlist(lapply(split(dat, dat$ID), \(id_df) rbind(id_df, id_df[1,]))), becausedata.table also does not use row names.
Data
dat <- read.csv(text = "ID, Value
1, 1
1, 2
1, 3
1, 4
2, 5
2, 6
2, 7
2, 8", h=T)
You can try the following data.table option
> setDT(df)[, .SD[(seq(1 + .N) - 1) %% .N + 1], ID]
ID Value
1: 1 1
2: 1 2
3: 1 3
4: 1 4
5: 1 1
6: 2 5
7: 2 6
8: 2 7
9: 2 8
10: 2 5

efficient way to rowwise mutate with sample

For each 0 in x, I want to randomly insert a number between 1:10 but i'm looking for an efficent way to do this in dplyr and/or data.table as I have a very large dataset (10m rows).
library(tidyverse)
df <- data.frame(x = 1:10)
df[4, 1] = 0
df[6, 1] = 0
df
# x
# 1 1
# 2 2
# 3 3
# 4 0
# 5 5
# 6 0
# 7 7
# 8 8
# 9 9
# 10 10
This doesnt work as it replaces each year with the same value:
set.seed(1)
df %>%
mutate(x2 = ifelse(x == 0, sample(1:10, 1), x))
# x x2
# 1 1 1
# 2 2 2
# 3 3 3
# 4 0 9
# 5 5 5
# 6 0 9
# 7 7 7
# 8 8 8
# 9 9 9
# 10 10 10
It can be achieved though with rowwise but is slow on a large dataset:
set.seed(1)
#use rowwise
df %>%
rowwise() %>%
mutate(x2 = ifelse(x == 0, sample(1:10, 1), x))
# x x2
# <dbl> <dbl>
# 1 1 1
# 2 2 2
# 3 3 3
# 4 0 9
# 5 5 5
# 6 0 4
# 7 7 7
# 8 8 8
# 9 9 9
# 10 10 10
Any suggestions to speed this up?
Thanks
Not in tidyverse, but you could just do something like this:
is_zero <- (df$x == 0)
replacements <- sample(1:10, sum(is_zero))
df$x[is_zero] <- replacements
Of course, you can collapse that down if you'd like.
df$x[df$x == 0] <- sample(1:10, sum(df$x == 0))
Using the above solutions and microbenchmark and a slight modification to the dataset for setup:
library(data.table)
library(tidyverse)
df <- data.frame(x = 1:100000, y = rbinom(100000, size = 1, 0.5)) %>%
mutate(x = ifelse(y == 0, 0, x)) %>%
dplyr::select(-y)
dt <- setDT(df)
test <- microbenchmark::microbenchmark(
base1 = {
df$x[df$x == 0] <- sample(1:10, sum(df$x == 0), replace = T)
},
dplyr1 = {
df %>%
mutate(x2 = replace(x, which(x == 0), sample(1:10, sum(x == 0), replace = T)))
},
dplyr2 = {
df %>% group_by(id=row_number()) %>%
mutate(across(c(x),.fns = list(x2 = ~ ifelse(.==0, sample(1:10, 1, replace = T), .)) )) %>%
ungroup() %>% select(-id)
},
data.table = {
dt[x == 0, x := sample(1:10, .N, replace = T)]
},
times = 500L
)
test
# Unit: microseconds
# expr min lq mean median uq max neval cld
# base1 733.7 785.9 979.0938 897.25 1137.0 1839.4 500 a
# dplyr1 5207.1 5542.1 6129.2276 5967.85 6476.0 21790.7 500 a
# dplyr2 15963406.4 16156889.2 16367969.8704 16395715.00 16518252.9 19276215.5 500 b
# data.table 1547.4 2229.3 2422.1278 2455.60 2573.7 15076.0 500 a
I thought data.table would be quickest but the base solution seems best (assuming I've set up the mircobenchmark correctly?).
EDIT based on #chinsoon12 comment
1e5 rows:
Unit: microseconds
expr min lq mean median uq max neval cld
base1 730.4 839.30 1380.465 1238.00 1322.85 28977.3 500 a
data.table 1394.8 1831.85 2030.215 1946.95 2060.40 29821.9 500 b
1e6 rows:
Unit: milliseconds
expr min lq mean median uq max neval cld
base1 9.8703 11.6596 16.030715 11.76195 12.04145 326.0118 500 b
data.table 2.3772 2.7939 3.855672 3.04700 3.25900 61.4083 500 a
data.table is the quickest
Maybe try with across() from dplyr in this way:
library(tidyverse)
#Data
df <- data.frame(x = 1:10)
df[4, 1] = 0
df[6, 1] = 0
#Code
df %>% group_by(id=row_number()) %>%
mutate(across(c(x),.fns = list(x2 = ~ ifelse(.==0, sample(1:10, 1), .)) )) %>%
ungroup() %>% select(-id)
Output:
# A tibble: 10 x 2
x x_x2
<dbl> <dbl>
1 1 1
2 2 2
3 3 3
4 0 5
5 5 5
6 0 6
7 7 7
8 8 8
9 9 9
10 10 10
I am adding a different answer because there are already votes on the base option I provided. But here can be a dplyr way using replace.
library(dplyr)
df %>%
mutate(x2 = replace(x, which(x == 0), sample(1:10, sum(x == 0))))
Here is a data.table option using similar logic to Adam's answer. This filters for rows that meet your criteria: x == 0, and then samples 1:10 .N times (which, without a grouping variable, is the number of rows of the filtered data.table).
library(data.table)
set.seed(1)
setDT(df)[x == 0, x := sample(1:10, .N)]
df
x
1: 1
2: 2
3: 3
4: 9
5: 5
6: 4
7: 7
8: 8
9: 9
10: 10

Row Minimum except certain columns

I have a data frame below. I need to find the the row min and max except few column that are characters.
df
x y z
1 1 1 a
2 2 5 b
3 7 4 c
I need
df
x y z Min Max
1 1 1 a 1 1
2 2 5 b 2 5
3 7 4 c 4 7
Another dplyr possibility could be:
df %>%
mutate(Max = do.call(pmax, select_if(., is.numeric)),
Min = do.call(pmin, select_if(., is.numeric)))
x y z Max Min
1 1 1 a 1 1
2 2 5 b 5 2
3 7 4 c 7 4
Or a variation proposed be #G. Grothendieck:
df %>%
mutate(Min = pmin(!!!select_if(., is.numeric)),
Max = pmax(!!!select_if(., is.numeric)))
Another base R solution. Subset only the columns with numbers and then use apply in each row to get the minimum and maximum value with range.
cbind(df, t(apply(df[sapply(df, is.numeric)], 1, function(x)
setNames(range(x, na.rm = TRUE), c("min", "max")))))
# x y z min max
#1 1 1 a 1 1
#2 2 5 b 2 5
#3 7 4 c 4 7
1) This one-liner uses no packages:
transform(df, min = pmin(x, y), max = pmax(x, y))
giving:
x y z min max
1 1 1 a 1 1
2 2 5 b 2 5
3 7 4 c 4 7
2) If you have many columns and don't want to list them all or determine yourself which are numeric then this also uses no packages.
ix <- sapply(df, is.numeric)
transform(df, min = apply(df[ix], 1, min), max = apply(df[ix], 1, max))
If your actual data has NAs and if you want to ignore them when taking the min or max then min, max, pmin and pmax all take an optional na.rm = TRUE argument.
Note
Lines <- "x y z
1 1 1 a
2 2 5 b
3 7 4 c"
df <- read.table(text = Lines)
1) We can use select_if. Here, we can use select_if to select the columns that are numeric, then with pmin, pmax get the rowwise min and max and bind it with the original dataset
library(dplyr)
library(purrr)
df %>%
select_if(is.numeric) %>%
transmute(Min = reduce(., pmin, na.rm = TRUE),
Max = reduce(., pmax, na.rm = TRUE)) %>%
bind_cols(df, .)
# x y z Min Max
#1 1 1 a 1 1
#2 2 5 b 2 5
#3 7 4 c 4 7
NOTE: Here, we use only a single expression of select_if
2) The same can be done in base R (no packages used)
i1 <- names(which(sapply(df, is.numeric)))
df['Min'] <- do.call(pmin, c(df[i1], na.rm = TRUE))
df['Max'] <- do.call(pmax, c(df[i1], na.rm = TRUE))
Also, as stated in the comments, this is generalized option. If it is only for two columns, just doing pmin(x, y) or pmax(x,y) is possible and that wouldn't check if the columns are numeric or not and it is not a general solution
NOTE: All of the solutions mentioned here are either answered first or from the comments with the OP
data
df <- structure(list(x = c(1L, 2L, 7L), y = c(1L, 5L, 4L), z = c("a",
"b", "c")), class = "data.frame", row.names = c("1", "2", "3"
))

Removing groups from dataframe if variable has repeated values

I would like to ask if there is a way of removing a group from dataframe using dplyr (or anz other way in that matter) in the following way. Lets say I have a dataframe in the following form grouped by variable 1:
Variable 1 Variable 2
1 a
1 b
2 a
2 a
2 b
3 a
3 c
3 a
... ...
I would like to remove only groups that have in Variable 2 two consecutive same values. That is in table above it would remove group 2 because there are values a,a,b but not group c where is a,c,a. So I would get the table bellow?
Variable 1 Variable 2
1 a
1 b
3 a
3 c
3 a
... ...
To test for consecutive identical values, you can compare a value to the previous value in that column. In dplyr, this is possible with lag. (You could do the same thing with comparing to the next value, using lead. Result comes out the same.)
Group the data by variable1, get the lag of variable2, then add up how many of these duplicates there are in that group. Then filter for just the groups with no duplicates. After that, feel free to remove the dupesInGroup column.
library(tidyverse)
df %>%
group_by(variable1) %>%
mutate(dupesInGroup = sum(variable2 == lag(variable2), na.rm = T)) %>%
filter(dupesInGroup == 0)
#> # A tibble: 5 x 3
#> # Groups: variable1 [2]
#> variable1 variable2 dupesInGroup
#> <int> <chr> <int>
#> 1 1 a 0
#> 2 1 b 0
#> 3 3 a 0
#> 4 3 c 0
#> 5 3 a 0
Created on 2018-05-10 by the reprex package (v0.2.0).
prepare data frame:
df <- data.frame("Variable 1" = c(1, 1, 2, 2, 2, 3, 3, 3), "Variable 2" = unlist(strsplit("abaabaca", "")))
write functions to test if consecutive repetitions are there or not:
any.consecutive.p <- function(v) {
for (i in 1:(length(v) - 1)) {
if (v[i] == v[i + 1]) {
return(TRUE)
}
}
return(FALSE)
}
any.consecutive.in.col.p <- function(df, col) {
any.consecutive.p(df[, col])
}
any.consecutive.p returns TRUE if it finds first consecutive repetition in a vector (v).
any.consecutive.in.col.p() looks for consecutive repetitions in a column of a data frame.
split data frame by values of Variable.1
df.l <- split(df, df$Variable.1)
df.l
$`1`
Variable.1 Variable.2
1 1 a
2 1 b
$`2`
Variable.1 Variable.2
3 2 a
4 2 a
5 2 b
$`3`
Variable.1 Variable.2
6 3 a
7 3 c
8 3 a
Finally go over this data.frame list and test for each data frame, if it contains consecutive duplicates in Variable.2 column.
If found, don't collect it.
Bind the collected data frames by rows.
Reduce(rbind, lapply(df.l, function(df) if(!any.consecutive.in.col.p(df, "Variable.2")) {df}))
Variable.1 Variable.2
1 1 a
2 1 b
6 3 a
7 3 c
8 3 a
Say you want to remove all groups of df, grouped by a, where the column b has repeated values. You can do that as below.
set.seed(0)
df <- data.frame(a = rep(1:3, rep(3, 3)), b = sample(1:5, 9, T))
# dplyr
library(dplyr)
df %>%
group_by(a) %>%
filter(all(b != lag(b), na.rm = T))
#data.table
library(data.table)
setDT(df)
df[, if(all(b != shift(b), na.rm = T)) .SD, by = a]
Benchmark shows data.table is faster
#Results
# Unit: milliseconds
# expr min lq mean median uq max neval
# use_dplyr() 141.46819 165.03761 201.0975 179.48334 205.82301 539.5643 100
# use_DT() 36.27936 50.23011 64.9218 53.87114 66.73943 345.2863 100
# Method
set.seed(0)
df <- data.table(a = rep(1:2000, rep(1e3, 2000)), b = sample(1:1e3, 2e6, T))
use_dplyr <- function(x){
df %>%
group_by(a) %>%
filter(all(b != lag(b), na.rm = T))
}
use_DT <- function(x){
df[, if (all(b != shift(b), na.rm = T)) .SD, a]
}
microbenchmark(use_dplyr(), use_DT())

Tidyr how to spread into count of occurrence [duplicate]

This question already has answers here:
How do I get a contingency table?
(6 answers)
Faster ways to calculate frequencies and cast from long to wide
(4 answers)
Closed 4 years ago.
Have a data frame like this
other=data.frame(name=c("a","b","a","c","d"),result=c("Y","N","Y","Y","N"))
How can I use spread function in tidyr or other function to get the count of result Y or N as column header like this
name Y N
a 2 0
b 0 1
Thanks
These are a few ways of many to go about it:
1) With library dplyr, you can simply group things and count into the format needed:
library(dplyr)
other %>% group_by(name) %>% summarise(N = sum(result == 'N'), Y = sum(result == 'Y'))
Source: local data frame [4 x 3]
name N Y
<fctr> <int> <int>
1 a 0 2
2 b 1 0
3 c 0 1
4 d 1 0
2) You can use a combination of table and tidyr spread as follows:
library(tidyr)
spread(as.data.frame(table(other)), result, Freq)
name N Y
1 a 0 2
2 b 1 0
3 c 0 1
4 d 1 0
3) You can use a combination of dplyr and tidyr to do as follows:
library(dplyr)
library(tidyr)
spread(count(other, name, result), result, n, fill = 0)
Source: local data frame [4 x 3]
Groups: name [4]
name N Y
<fctr> <dbl> <dbl>
1 a 0 2
2 b 1 0
3 c 0 1
4 d 1 0
Here is another option using dcast from data.table
library(data.table)
dcast(setDT(other), name~result, length)
# name N Y
#1: a 0 2
#2: b 1 0
#3: c 0 1
#4: d 1 0
Although, table(other) would be a compact option (from #mtoto's comments), for large datasets, it may be more efficient to use dcast. Some benchmarks are given below
set.seed(24)
other1 <- data.frame(name = sample(letters, 1e6, replace=TRUE),
result = sample(c("Y", "N"), 1e6, replace=TRUE), stringsAsFactors=FALSE)
other2 <- copy(other1)
gopala1 <- function() other1 %>%
group_by(name) %>%
summarise(N = sum(result == 'N'), Y = sum(result == 'Y'))
gopala2 <- function() spread(as.data.frame(table(other1)), result, Freq)
gopala3 <- function() spread(count(other1, name, result), result, n, fill = 0)
akrun <- function() dcast(as.data.table(other2), name~result, length)
library(microbenchmark)
microbenchmark(gopala1(), gopala2(), gopala3(),
akrun(), unit='relative', times = 20L)
# expr min lq mean median uq max neval
# gopala1() 2.710561 2.331915 2.142183 2.325167 2.134399 1.513725 20
# gopala2() 2.859464 2.564126 2.531130 2.683804 2.720833 1.982760 20
# gopala3() 2.345062 2.076400 1.953136 2.027599 1.882079 1.947759 20
# akrun() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 20

Resources