Number of non-NA records by column, grouped - R

I have a data.table that looks something like this:
> dt <- data.table(
    group1 = c("a", "a", "a", "b", "b", "b", "b"),
    group2 = c("x", "x", "y", "y", "z", "z", "z"),
    data1  = c(NA, rep(TRUE, 3), rep(FALSE, 2), "sometimes"),
    data2  = c("sometimes", rep(FALSE, 3), rep(TRUE, 2), NA))
> dt
   group1 group2     data1     data2
1:      a      x        NA sometimes
2:      a      x      TRUE     FALSE
3:      a      y      TRUE     FALSE
4:      b      y      TRUE     FALSE
5:      b      z     FALSE      TRUE
6:      b      z     FALSE      TRUE
7:      b      z sometimes        NA
My goal is to find the number of non-NA records in each data column, grouped by group1 and group2.
   group1 group2 data1 data2
1:      a      x     1     2
2:      a      y     1     1
3:      b      y     1     1
4:      b      z     3     2
I have this code left over from dealing with another part of the dataset, which had no NAs and was logical:
dt[
  ,
  lapply(.SD, sum),
  by = list(group1, group2),
  .SDcols = c("data3", "data4")
]
But it won't work with NA values or with non-logical columns.

dt[, lapply(.SD, function(x) sum(!is.na(x))), by = .(group1, group2)]
#    group1 group2 data1 data2
# 1:      a      x     1     2
# 2:      a      y     1     1
# 3:      b      y     1     1
# 4:      b      z     3     2
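For comparison, the same counts in base R (a sketch; na.action = na.pass is needed so the formula interface doesn't drop the NA rows before they can be counted):
aggregate(cbind(data1, data2) ~ group1 + group2, data = dt,
          FUN = function(x) sum(!is.na(x)), na.action = na.pass)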

Another alternative is to melt/dcast in order to avoid the per-column operation. na.rm = TRUE drops the NAs, and dcast then defaults to length as the aggregation function:
dcast(melt(dt, id = c("group1", "group2"), na.rm = TRUE), group1 + group2 ~ variable)
# Aggregate function missing, defaulting to 'length'
#    group1 group2 data1 data2
# 1:      a      x     1     2
# 2:      a      y     1     1
# 3:      b      y     1     1
# 4:      b      z     3     2

Using dplyr (with some help from David Arenburg & eddi):
library(dplyr)
dt %>% group_by(group1, group2) %>% summarise_each(funs(sum(!is.na(.))))
Source: local data table [4 x 4]
Groups: group1

  group1 group2 data1 data2
1      a      x     1     2
2      a      y     1     1
3      b      y     1     1
4      b      z     3     2
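summarise_each() and funs() have since been deprecated; with dplyr >= 1.0.0 the equivalent uses across():
library(dplyr)
dt %>%
  group_by(group1, group2) %>%
  summarise(across(everything(), ~ sum(!is.na(.x))), .groups = "drop")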

grouping multiple columns and bringing values into character string

This is an example we can work with:
library(tibble)
df <- tibble(
  y = c("a", "a", "a", "a", "a", "a"),
  z = c("b", "b", "b", "b", "b", "b"),
  a = c("aaa", "aaa", "aaa", "bbb", "bbb", "bbb"),
  b = c(1, 2, 3, 1, 2, 3),
  c = c(5, 10, 15, 100, 95, 90)
)
df
# A tibble: 6 x 5
  y     z     a         b     c
  <chr> <chr> <chr> <dbl> <dbl>
1 a     b     aaa       1     5
2 a     b     aaa       2    10
3 a     b     aaa       3    15
4 a     b     bbb       1   100
5 a     b     bbb       2    95
6 a     b     bbb       3    90
I want to group by columns y, z and a, and combine columns b and c into a single string. The final result should look exactly like this:
# A tibble: 2 x 4
  y     z     a     result
  <chr> <chr> <chr> <chr>
1 a     b     aaa   {"1":5,"2":10,"3":15}
2 a     b     bbb   {"1":100,"2":95,"3":90}
Which I can almost achieve with:
b <- by(df[-1:-3], df$a, function(x)
  sprintf("{%s}", toString(Reduce(paste0, c(x, "\"", "\":")[c(3, 1, 4, 2)]))))
data.frame(a = unique(df$a), result = do.call(rbind, as.list(b)), row.names = NULL)
    a                     result
1 aaa    {"1":5, "2":10, "3":15}
2 bbb  {"1":100, "2":95, "3":90}
This only groups by column a, though, and not by all three columns (y, z and a). I got the hint that I can fix it with the aggregate function, but I am having a hard time applying it.
Using dplyr you can make use of sprintf/paste0:
library(dplyr)
df %>%
  group_by(y, z, a) %>%
  summarise(result = paste0('{', paste(sprintf('"%d":%d', b, c), collapse = ','), '}')) %>%
  ungroup() %>%
  data.frame()
#   y z   a                  result
# 1 a b aaa   {"1":5,"2":10,"3":15}
# 2 a b bbb {"1":100,"2":95,"3":90}
Using by, this can be written as:
do.call(rbind, by(df, list(df$y, df$z, df$a), function(x)
  cbind(unique(x[1:3]),
        result = paste0('{', paste(sprintf('"%d":%d', x$b, x$c), collapse = ','), '}'))))
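If the strings are meant to be valid JSON, jsonlite can build them without hand-rolled quoting. A sketch, assuming the jsonlite package is installed and dplyr >= 1.0.0 for the .groups argument:
library(dplyr)
library(jsonlite)

df %>%
  group_by(y, z, a) %>%
  # name each value of c by the matching value of b, then serialize the pairs
  summarise(result = as.character(toJSON(setNames(as.list(c), as.character(b)),
                                         auto_unbox = TRUE)),
            .groups = "drop")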

unique rows in data frame based on if's

I have a large data table (millions of rows) that I need to trim down to one row per ID. The rule: if an ID contains any art other than "X", the "X" rows should be dropped; if an ID contains only "X", the "X" row stays.
Test dataset:
dt <- data.table(
  ID     = c(1, 1, 1, 2, 2, 3, 4, 4),
  art    = c("X", "Y", "X", "X", "X", "X", "Z", "X"),
  redskb = c("a", "Y", "a", "b", "b", "c", "k", "n")
)
   ID art redskb
1:  1   X      a
2:  1   Y      Y
3:  1   X      a
4:  2   X      b
5:  2   X      b
6:  3   X      c
7:  4   Z      k
8:  4   X      n
Required output:
   ID art redskb
1:  1   Y      Y
2:  2   X      b
3:  3   X      c
4:  4   Z      k
I tried with
unique(dt, by = c("ID"))
but could not get it to work efficiently with the if conditions.
I'd try something like this:
unique(dt)[, flag := if (.N == 1) TRUE else art != "X", by = ID][(flag)]
##    ID art redskb flag
## 1:  1   Y      Y TRUE
## 2:  2   X      b TRUE
## 3:  3   X      c TRUE
## 4:  4   Z      k TRUE
data.table:
dt[order(ID, art == "X"), .SD[1], by = ID]
or @Frank's version:
unique(dt[order(ID, art == "X")], by = "ID")
#    ID art redskb
# 1:  1   Y      Y
# 2:  2   X      b
# 3:  3   X      c
# 4:  4   Z      k
dplyr:
library(dplyr)
dt %>% group_by(ID) %>% slice(which.max(art != "X"))
# # A tibble: 4 x 3
# # Groups: ID [4]
#      ID art   redskb
#   <dbl> <chr> <chr>
# 1     1 Y     Y
# 2     2 X     b
# 3     3 X     c
# 4     4 Z     k
We can do this in a single data.table call. Grouped by ID, .I collects the row numbers to keep: every non-"X" row when the ID mixes "X" with another art, otherwise just the first row:
dt[dt[, .I[if (uniqueN(art) > 1 & any(art == "X")) art != "X" else seq_len(.N) == 1], by = ID]$V1]
#    ID art redskb
# 1:  1   Y      Y
# 2:  2   X      b
# 3:  3   X      c
# 4:  4   Z      k
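For readability, here is the same rule in base R, as a sketch rather than something to run on millions of rows (the data.table answers above scale much better):
# split by ID, apply the keep rule per group, stack the pieces back together
do.call(rbind, lapply(split(dt, dt$ID), function(g) {
  if (all(g$art == "X")) g[1, ] else unique(g[g$art != "X", ])
}))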

Filling missing value in group

I have data frame where some of the values are missing
A 1
A NA
A NA
B NA
B 2
B NA
C NA
C NA
C NA
How can I fill in groups where I have data?
You can also use fill from tidyr:
library(dplyr)
library(tidyr)
df1 %>%
group_by(ID) %>%
fill(v1) %>%
fill(v1, .direction = "up")
Result:
# A tibble: 9 x 2
# Groups: ID [3]
ID v1
<chr> <int>
1 A 1
2 A 1
3 A 1
4 B 2
5 B 2
6 B 2
7 C NA
8 C NA
9 C NA
Credits to @akrun for the dput.
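Since tidyr 1.0.0 the two fill() calls can be collapsed into one:
df1 %>%
  group_by(ID) %>%
  fill(v1, .direction = "downup")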
Alternative solution, though perhaps a bit flawed in how many assumptions it makes:
library(dplyr)
df1 %>%
  group_by(ID) %>%
  arrange(v1, .by_group = TRUE) %>%
  mutate(v1 = v1[1])
# # A tibble: 9 x 2
# # Groups: ID [3]
#   ID       v1
#   <chr> <int>
# 1 A         1
# 2 A         1
# 3 A         1
# 4 B         2
# 5 B         2
# 6 B         2
# 7 C        NA
# 8 C        NA
# 9 C        NA
We can use data.table: convert the data.frame to a data.table with setDT(df1), then, grouped by ID, assign (:=) the first non-NA value to v1.
library(data.table)
setDT(df1)[, v1 := v1[!is.na(v1)][1L], by = ID]
df1
#    ID v1
# 1:  A  1
# 2:  A  1
# 3:  A  1
# 4:  B  2
# 5:  B  2
# 6:  B  2
# 7:  C NA
# 8:  C NA
# 9:  C NA
Or using only base R:
with(df1, ave(v1, ID, FUN = function(x)
  replace(x, is.na(x), x[!is.na(x)][1L])))
# [1]  1  1  1  2  2  2 NA NA NA
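The same first-non-NA idea carries over to dplyr directly (a sketch):
library(dplyr)
df1 %>%
  group_by(ID) %>%
  # subsetting a zero-length vector at [1] yields NA, so all-NA group C stays NA
  mutate(v1 = v1[!is.na(v1)][1])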
data
df1 <- structure(list(ID = c("A", "A", "A", "B", "B", "B", "C", "C",
"C"), v1 = c(1L, NA, NA, NA, 2L, NA, NA, NA, NA)), .Names = c("ID",
"v1"), class = "data.frame", row.names = c(NA, -9L))

Finding "complete" groups in R

I have a large dataset of groups and subgroups. I want to filter the data according to the "completeness" of the groups, i.e. within each group all levels of the sub-group variables (a and b) should occur.
A small example
group <- rep(c("A", "B", "C"), each=5)
a <- c(1,1,2,2,3,1,1,1,3,3,1,2,2,3,3)
b <- c("a", "a", "a", "b", "c", "a", "a", "a", "b", "c", "a", "b", "b", "b", "b")
df <- data.frame(group, a, b)
   group a b
1      A 1 a
2      A 1 a
3      A 2 a
4      A 2 b
5      A 3 c
6      B 1 a
7      B 1 a
8      B 1 a
9      B 3 b
10     B 3 c
11     C 1 a
12     C 2 b
13     C 2 b
14     C 3 b
15     C 3 b
So here only A would be considered complete, because all levels of a and b occur within it. Is there an efficient (and flexible) way to filter on these conditions?
I would do something like this (note that a and b here are the full vectors defined above, i.e. the complete level sets):
sapply(split(df, df$group), function(x) all(a %in% x$a) & all(b %in% x$b))
##     A     B     C
##  TRUE FALSE FALSE
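To turn that logical vector into the filtered data frame:
complete <- sapply(split(df, df$group), function(x) all(a %in% x$a) & all(b %in% x$b))
df[df$group %in% names(complete)[complete], ]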
Here is a dplyr solution:
library(dplyr)
df %>%
  group_by(group) %>%
  mutate(
    a_complete = all(unique(df$a) %in% a),  # df$a is the full column, a the group's values
    b_complete = all(unique(df$b) %in% b)
  ) %>%
  filter(a_complete, b_complete) %>%
  select(-ends_with("complete"))
I would try data.table, something like
library(data.table)
setDT(df)[, indx := length(unique(a)) + length(unique(b))]
df[, indx2 := length(unique(a)) + length(unique(b)), by = group]
df[indx == indx2]
#    group a b indx indx2
# 1:     A 1 a    6     6
# 2:     A 1 a    6     6
# 3:     A 2 a    6     6
# 4:     A 2 b    6     6
# 5:     A 3 c    6     6
Or for a more general solution, you can specify the column names and then use .SDcols, something like
cols <- c("a", "b")
setDT(df)[, indx := Reduce(sum, lapply(.SD, function(x) length(unique(x)))), .SDcols = cols]
df[, indx2 := Reduce(sum, lapply(.SD, function(x) length(unique(x)))), .SDcols = cols, by = group]
df[indx == indx2]
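A compact variant of the same idea with data.table's uniqueN (a sketch, starting from the fresh df before the indx columns were added): keep a group only when its per-group distinct counts match the table-wide ones.
library(data.table)
# df$a and df$b inside j refer to the full columns, a and b to the group's values
setDT(df)[, if (uniqueN(a) == uniqueN(df$a) && uniqueN(b) == uniqueN(df$b)) .SD, by = group]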
