Group by and count using specified columns in a list - R

I am having some difficulty creating a function that groups by column P and counts all the "Yes" values in the columns saved in a list such as list_col. I am pretty sure I would use a function from the apply family, but I am not sure how to group by a particular column (P in this case).
P <- as.character(c("A", "A", "A", "A", "A", "B", "B", "B", "B", "B"))
a <- as.character(c(NA,"Yes","Yes",NA,NA,NA,"Yes","Yes","Yes",NA))
b <- as.character(c(NA,"Yes",NA,NA,"Yes",NA,"Yes","Yes",NA,NA))
c <- as.character(c(NA,NA,NA,NA,"Yes",NA,"Yes",NA,NA,NA))
df_sample <- as.data.frame(cbind(P, a, b, c))
df_sample
list_col <- c("a","b","c")
Ideally, with the sample data frame above, I would be looking for the answer below, with the columns changing according to list_col:
P a b c
A 2 2 1
B 3 2 1
Any help would be appreciated

Here is an approach via dplyr:
library(dplyr)
df_sample %>%
  group_by(P) %>%
  select(list_col) %>%
  summarise_all(funs(sum(. == 'Yes', na.rm = TRUE)))
#Adding missing grouping variables: `P`
# A tibble: 2 x 4
# P a b c
# <fctr> <int> <int> <int>
#1 A 2 2 1
#2 B 3 2 1
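Note that funs() and summarise_all() have since been superseded in dplyr. On dplyr 1.0 or later, a sketch of the equivalent call using across(), assuming the same df_sample and list_col:
library(dplyr)
df_sample %>%
  group_by(P) %>%
  summarise(across(all_of(list_col), ~ sum(.x == "Yes", na.rm = TRUE)))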

In data.table:
library(data.table)
list_col <- c("a", "b", "c")
sampleDT <- as.data.table(df_sample)
sampleDT[, lapply(.SD, function(x) sum(x == "Yes", na.rm = TRUE)),
         by = P, .SDcols = list_col]
# P a b c
#1: A 2 2 1
#2: B 3 2 1
Alternatively, a base R solution still using lapply:
res <- lapply(split(df_sample[, list_col], df_sample$P),
              function(x) colSums(x == "Yes", na.rm = TRUE))
do.call(rbind, res)
# a b c
#A 2 2 1
#B 3 2 1
For what it's worth, a microbenchmark on my machine:
microbenchmark::microbenchmark(
  splitlapply = do.call(rbind, lapply(split(df_sample[, list_col], df_sample$P),
                                      function(x) colSums(x == "Yes", na.rm = TRUE))),
  dt = sampleDT[, lapply(.SD, function(x) sum(x == "Yes", na.rm = TRUE)),
                by = P, .SDcols = list_col])
Unit: microseconds
expr min lq mean median uq max neval
splitlapply 455.841 505.0715 546.6699 529.3225 561.2315 889.436 100
dt 861.722 1052.9920 1114.2752 1111.7040 1166.7695 1707.761 100
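A closely related base option (my variant, not part of the benchmark above) keeps P as a proper column by letting aggregate() do the split-apply-combine:
aggregate(df_sample[list_col], by = list(P = df_sample$P),
          FUN = function(x) sum(x == "Yes", na.rm = TRUE))
#  P a b c
#1 A 2 2 1
#2 B 3 2 1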

Using melt from reshape
library(reshape)
df <- na.omit(melt(df_sample, id.vars = 'P'))
table(df$P, df$variable)
a b c
A 2 2 1
B 3 2 1
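The same melt-then-tabulate idea in current tidyverse terms (a sketch; reshape's melt is superseded by tidyr::pivot_longer):
library(dplyr)
library(tidyr)
df_sample %>%
  pivot_longer(all_of(list_col), names_to = "variable") %>%
  filter(value == "Yes") %>%
  count(P, variable) %>%
  pivot_wider(names_from = variable, values_from = n, values_fill = 0)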

Another trick: rebuild df_sample with character columns, overwrite each "Yes" with that row's value of P, and then tabulate each column.
df_sample <- as.data.frame(cbind(P, a, b, c), stringsAsFactors = FALSE)
for (i in list_col) {
  # rows where column i is "Yes" (and not NA)
  df_r <- df_sample[, i] == 'Yes' & !is.na(df_sample[, i])
  # replace "Yes" with the group label so table() counts per group
  df_sample[df_r, i] <- df_sample$P[df_r]
}
sapply(df_sample[, list_col], table)
#  a b c
#A 2 2 1
#B 3 2 1

Related

Count occurrence of a categorical variable, when grouping and summarising by a different variable in R

I have a table df that looks like this:
a <- c(10,20, 20, 20, 30)
b <- c("u", "u", "u", "r", "r")
c <- c("a", "a", "b", "b", "b")
df <- data.frame(a,b,c)
I would like to create a new table that contains the mean of column a, grouped by variable c, and I would like a column with the counts of each b type within each c group.
I would therefore like the result table to look like df2:
a_m <- c(15, 23.3)
c <- c("a", "b")
counts_b <-c("2 u", "1 u, 2 r")
df2 <- data.frame(a_m, c, counts_b)
What I have so far is:
df2 <- df %>% group_by(c) %>% summarise(a_m = mean(a, na.rm = TRUE))
I do not know how to add the column counts_b in the example df2.
Here's a way using a little table magic:
df %>%
  group_by(c) %>%
  summarise(a_mean = mean(a),
            b_list = paste(names(table(b)), table(b), collapse = ', '))
# A tibble: 2 x 3
c a_mean b_list
<fct> <dbl> <chr>
1 a 15.0 r 0, u 2
2 b 23.3 r 2, u 1
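If the zero entries (such as "r 0") are unwanted, the table can be filtered before pasting, which gets closer to the asker's "2 u" format, up to ordering (a sketch):
df %>%
  group_by(c) %>%
  summarise(a_mean = mean(a),
            b_list = {
              tb <- table(b)
              tb <- tb[tb > 0]   # drop levels with zero count
              paste(tb, names(tb), collapse = ', ')
            })
# For group b this yields "2 r, 1 u".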
Here is another solution using reshape2. The output format may be more convenient to work with: each value of b gets its own column with the number of occurrences.
library(reshape2)
out1 <- dcast(df, c ~ b, value.var = "c", fun.aggregate = length)
c r u
1 a 0 2
2 b 2 1
out2 <- df %>% group_by(c) %>% summarise(a_m = mean(a))
# A tibble: 2 x 2
c a_m
<fctr> <dbl>
1 a 15.00000
2 b 23.33333
df2 <- merge(out1, out2, by = "c")
c r u a_m
1 a 0 2 15.00000
2 b 2 1 23.33333
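The merge step can also be folded into a single pipeline; a rough tidyverse sketch of the same idea:
library(dplyr)
library(tidyr)
df %>%
  count(c, b) %>%
  pivot_wider(names_from = b, values_from = n, values_fill = 0) %>%
  left_join(df %>% group_by(c) %>% summarise(a_m = mean(a)), by = "c")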

Counting amount of zeros within a "melted" data frame

Hi, I am learning R and I am trying to count how many zeros there are within the melted data. That is, I want to know how many zeros correspond to columns b and c, and print the two results out.
I generated an example:
library(reshape)
library(plyr)
library(dplyr)
id = c(1,2,3,4,5,6,7,8,9,10)
b = c(0,0,5,6,3,7,2,8,1,8)
c = c(0,4,9,87,0,87,0,4,5,0)
test = data.frame(id,b,c)
test_melt = melt(test, id.vars = "id")
test_melt
I imagine I should create an if statement for that, something like if (test$value == 0) {print()}, but how can I tell R to count the zeros for columns that have been melted?
With your data:
test_melt %>%
  group_by(variable) %>%
  summarize(zeroes = sum(value == 0))
# # A tibble: 2 x 2
# variable zeroes
# <fctr> <int>
# 1 b 2
# 2 c 4
Base R:
aggregate(test_melt$value, by = list(variable = test_melt$variable),
FUN = function(x) sum(x == 0))
# variable x
# 1 b 2
# 2 c 4
... and for curiosity:
library(microbenchmark)
microbenchmark(
dplyr = group_by(test_melt, variable) %>% summarize(zeroes = sum(value == 0)),
base1 = aggregate(test_melt$value, by = list(variable = test_melt$variable), FUN = function(x) sum(x == 0)),
# @PankajKaundal's suggested "formula" notation reads easier
base2 = aggregate(value ~ variable, test_melt, function(x) sum(x == 0))
)
# Unit: microseconds
# expr min lq mean median uq max neval
# dplyr 916.421 986.985 1069.7000 1022.1760 1094.7460 2272.636 100
# base1 647.658 682.302 783.2065 715.3045 765.9940 1905.411 100
# base2 813.219 867.737 950.3247 897.0930 959.8175 2017.001 100
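For completeness, another compact base option (mine, not part of the benchmark above), which tabulates the variable column for the rows where value is zero:
table(test_melt$variable[test_melt$value == 0])
# b c
# 2 4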
sum(test_melt$value==0)
This should do it.
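That returns the grand total (6 here). To split the count by melted column, a tapply() sketch:
tapply(test_melt$value == 0, test_melt$variable, sum)
# b c
# 2 4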
This might help. Is this what you're looking for?
test_melt[4] <- 1
test_melt2 <- aggregate(V4 ~ value + variable, test_melt, sum)
test_melt2
value variable V4
1 0 b 2
2 1 b 1
3 2 b 1
4 3 b 1
5 5 b 1
6 6 b 1
7 7 b 1
8 8 b 2
9 0 c 4
10 4 c 2
11 5 c 1
12 9 c 1
13 87 c 2
V4 is the count

Count unique values of a column by pairwise combinations of another column in R

Let's say I have the following data frame:
ID Code
1 1 A
2 1 B
3 1 C
4 2 B
5 2 C
6 2 D
7 3 C
8 3 A
9 3 D
10 3 B
11 4 D
12 4 B
I would like to get the count of unique values of the column "ID" by pairwise combinations of the column "Code":
Code.Combinations Count.of.ID
1 A, B 2
2 A, C 2
3 A, D 1
4 B, C 3
5 B, D 3
6 C, D 2
I have searched for solutions online but so far haven't been able to achieve the desired result.
Any help would be appreciated. Thanks!
Here is a data.table way to solve the problem. Use the combn function to generate all pairwise combinations of Code, then count the IDs for each unique CodeComb:
library(data.table)
setDT(df)[, .(CodeComb = sapply(combn(Code, 2, simplify = FALSE),
                                function(cmb) paste(sort(cmb), collapse = ", "))),
          by = ID                              # list all combinations of Code for each ID
          ][, .(IdCount = .N), by = CodeComb]  # count the number of IDs per combination
# count number of unique id for each code combination
# CodeComb IdCount
# 1: A, B 2
# 2: A, C 2
# 3: B, C 3
# 4: B, D 3
# 5: C, D 2
# 6: A, D 1
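One caveat worth noting (my addition): combn(Code, 2) errors for an ID that has only one row. If that can happen in your data, returning NULL for such groups skips them safely:
setDT(df)[, if (.N >= 2) .(CodeComb = sapply(combn(Code, 2, simplify = FALSE),
                                             function(cmb) paste(sort(cmb), collapse = ", "))),
          by = ID][, .(IdCount = .N), by = CodeComb]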
Assuming your data.frame is named df, here is a dplyr approach: join df with itself, then count the unique IDs within each pair of codes.
library(dplyr)
df %>%
  full_join(df, by = "ID") %>%
  group_by(Code.x, Code.y) %>%
  summarise(length(unique(ID))) %>%
  filter(Code.x != Code.y)
Below makes use of combinations from the gtools package as well as count from the plyr package.
library(gtools)
library(plyr)
PairWiseCombo <- function(df) {
  myID <- df$ID
  BreakDown <- rle(myID)
  Unis <- BreakDown$values
  numUnis <- BreakDown$lengths
  Len <- length(Unis)
  e <- cumsum(numUnis)
  s <- c(1L, e + 1L)
  ## More efficient to generate the combinations outside of the
  ## "do.call(c, lapply(..." below. This allows referencing a particular
  ## combination rather than re-generating the same combination multiple times
  myCombs <- lapply(2:max(numUnis), function(x) combinations(x, 2L))
  tempDF <- plyr::count(do.call(c, lapply(1:Len, function(i) {
    myRange <- s[i]:e[i]
    combs <- myCombs[[numUnis[i] - 1L]]
    vapply(1:nrow(combs), function(j)
      paste(sort(df$Code[myRange[combs[j, ]]]), collapse = ","), "A,D")
  })))
  names(tempDF) <- c("Code.Combinations", "Count.of.ID")
  tempDF
}
Below are some metrics. I didn't test the solution by @Carl as it was giving results different from the other solutions.
set.seed(537)
ID <- do.call(c, lapply(1:100, function(x) rep(x, sample(2:26,1))))
temp <- rle(ID)
Code <- do.call(c, lapply(1:100, function(x) LETTERS[sample(temp$lengths[x])]))
TestDF <- data.frame(ID, Code, stringsAsFactors = FALSE)
system.time(t1 <- Noah(TestDF))
user system elapsed
97.05 0.31 97.42
system.time(t2 <- DTSolution(TestDF))
user system elapsed
0.43 0.00 0.42
system.time(t3 <- PairWiseCombo(TestDF))
user system elapsed
0.42 0.00 0.42
identical(sort(t3[,2]),sort(t2$IdCount))
TRUE
identical(sort(t3[,2]),sort(t1[,2]))
TRUE
Using microbenchmark we have:
library(microbenchmark)
microbenchmark(Joseph = PairWiseCombo(TestDF), Psidom = DTSolution(TestDF), times = 10L)
Unit: milliseconds
expr min lq mean median uq max neval
Joseph 420.1090 433.9471 442.0133 446.4880 450.4420 452.7852 10
Psidom 396.8444 413.4933 416.3315 418.5573 420.9669 423.6303 10
Overall, the data.table solution provided by @Psidom was the fastest (not surprisingly). Both my solution and the data.table solution performed similarly on really large examples. However, the solution provided by @Noah is extremely memory intensive and couldn't be tested on larger data frames.
sessionInfo()
R version 3.3.0 (2016-05-03)
Platform: x86_64-w64-mingw32/x64 (64-bit)
Running under: Windows 7 x64 (build 7601) Service Pack 1
Update
After tweaking @Carl's solution, the dplyr approach is by far the fastest. Below is the code (you will see which parts I altered):
DPLYRSolution <- function(df) {
  df <- df %>%
    full_join(df, by = "ID") %>%
    group_by(Code.x, Code.y) %>%
    summarise(length(unique(ID))) %>%
    filter(Code.x != Code.y)
  ## These two lines were added by me to remove "duplicate" rows
  df <- mutate(df, Code = ifelse(Code.x < Code.y,
                                 paste(Code.x, Code.y),
                                 paste(Code.y, Code.x)))
  df[which(!duplicated(df$Code)), ]
}
Below are the new metrics:
system.time(t4 <- DPLYRSolution(TestDF))
user system elapsed
0.03 0.00 0.03 ### Wow!!! really fast
microbenchmark(Joseph = PairWiseCombo(TestDF), Psidom = DTSolution(TestDF),
Carl = DPLYRSolution(TestDF), times = 10L)
Unit: milliseconds
expr min lq mean median uq max neval
Joseph 437.87235 442.7348 450.91085 452.77204 457.09465 461.85035 10
Psidom 407.81519 416.9444 422.62793 425.26041 429.02064 434.38881 10
Carl 44.33698 44.8066 48.39051 45.35073 54.06513 59.35653 10
## Equality Check
identical(sort(c(t4[,3])[[1]]), sort(t1[,2]))
[1] TRUE
Using base only:
df <- data.frame(ID=c(1,1,1,2,2,2,3,3,3,3,4,4),
code=c("A", "B", "C", "B", "C", "D", "C", "A", "D", "B", "D", "B"), stringsAsFactors =FALSE)
# Create data.frame of unique combinations of codes
e <- expand.grid(df$code, df$code)
e <- e[e[,1]!=e[,2],]
e1 <- as.data.frame(unique(t(apply(e, 1, sort))), stringsAsFactors = FALSE)
# Count the occurrence of each code combination across IDs
e1$count <- apply(e1, 1, function(y)
sum(sapply(unique(df$ID), function(x)
sum(y[1] %in% df$code[df$ID==x] & y[2] %in% df$code[df$ID==x]))))
# Turn the codes into a string and print output
out <- data.frame(Code.Combinations=do.call(paste, c(e1[,1:2], sep=", ")),
Count.of.ID=e1$count, stringsAsFactors = FALSE)
out
# Code.Combinations Count.of.ID
# 1 A, B 2
# 2 A, C 2
# 3 A, D 1
# 4 B, C 3
# 5 B, D 3
# 6 C, D 2

Create column with minimum (first) level of factors by group r

I have a data frame with a mixture of patient and appointment information. Each patient may have attended multiple appointments. Some patient information is collected at each appointment, with the result that some patient information is conflicting and some is missing.
I want to fill in the missing patient information for appointments where it wasn't recorded, based upon data recorded at other appointments. And (this is where I'm stuck) I want to take the 'minimum' level (earliest in the order of levels) recorded for factors that have conflicting information for the same patient. (In the example below the factor levels are in alphabetical order, but this isn't always the case.)
This is similar to this question, but I am using factors instead of characters, and I have multiple factors that I want the minimum for, so I can't filter by row.
eg. I have
df.have <- data.frame(
grp_id = rep(1:3, each = 2),
grpvar1 = factor(c("B", "A", "B", "C", NA, "A")),
grpvar2 = factor(c("a", "b", "c", NA, NA, "x")),
appt_id = 1:6)
I want
grp_id grpvar1 grpvar2 appt_id
1 A a 1
1 A a 2
2 B c 3
2 B c 4
3 A x 5
3 A x 6
or at least
grp_id grpvar1 grpvar1.1
1 A a
2 B c
3 A x
We can try with summarise_each. As we need the first level of 'grpvar' variables, we need to make sure that the unused levels are dropped before we do that (using droplevels).
df.have %>%
  group_by(grp_id) %>%
  summarise_each(funs(first(levels(droplevels(.)))), grpvar1:grpvar2)
# grp_id grpvar1 grpvar2
# <int> <chr> <chr>
#1 1 A a
#2 2 B c
#3 3 A x
Or, if we use mutate_each, we get the first desired output:
df.have %>%
  group_by(grp_id) %>%
  mutate_each(funs(levels(droplevels(.))[1]), grpvar1:grpvar2)
# grp_id grpvar1 grpvar2 appt_id
# <int> <chr> <chr> <int>
#1 1 A a 1
#2 1 A a 2
#3 2 B c 3
#4 2 B c 4
#5 3 A x 5
#6 3 A x 6
If we need the output as factor columns for the 'grpvar's:
df.have %>%
  group_by(grp_id) %>%
  mutate_each(funs(factor(levels(droplevels(.))[1])), grpvar1:grpvar2)
# grp_id grpvar1 grpvar2 appt_id
# <int> <fctr> <fctr> <int>
#1 1 A a 1
#2 1 A a 2
#3 2 B c 3
#4 2 B c 4
#5 3 A x 5
#6 3 A x 6
Or using data.table
library(data.table)
setDT(df.have)[, lapply(.SD, function(x) levels(droplevels(x))[1]),
               by = grp_id, .SDcols = grpvar1:grpvar2]
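summarise_each() and mutate_each() have since been deprecated; on current dplyr the same logic reads as follows (a sketch):
library(dplyr)
df.have %>%
  group_by(grp_id) %>%
  mutate(across(grpvar1:grpvar2, ~ factor(levels(droplevels(.x))[1])))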
While revisiting this for a different project with a larger dataset I realised that it was much more efficient (though required more keystrokes) to convert the factors to numbers, take the minimum, then reconvert back to factors.
library(data.table)
library(dplyr)
set.seed(1)
n <- 100L
dat <- data.table(
  grp_id = rep(1:(n/10), each = 10),
  grpvar1 = factor(sample(c(LETTERS, NA), n, replace = TRUE), levels = LETTERS),
  grpvar2 = factor(sample(c(letters, NA), n, replace = TRUE), levels = letters),
  appt_id = 1:n)
cols <- c("grpvar1","grpvar2")
dplyr_fct <- function(data, cols) {
  data %>%
    group_by(grp_id) %>%
    mutate_each(funs(factor(levels(droplevels(.))[1])), one_of(cols))
}
dt_fct <- function(data, cols) {
  data[, lapply(.SD, function(x) levels(droplevels(x))[1]), grp_id, .SDcols = cols]
}
dt_nmbr <- function(data, cols) {
  dat_out <- copy(data)
  v_lvl <- lapply(dat_out[, .SD, .SDcols = cols], levels)
  # Convert factors to numeric
  for (col in cols) set(dat_out, j = col, value = as.numeric(dat_out[[col]]))
  # Select the lowest value (i.e. the first level) per group
  dat_out[, (cols) := lapply(.SD, min, na.rm = TRUE), by = grp_id, .SDcols = cols]
  # Convert back to factor
  for (col in cols) set(dat_out, j = col,
                        value = factor(dat_out[[col]], levels = 1:length(v_lvl[[col]]),
                                       labels = v_lvl[[col]]))
  dat_out
}
mbm <- microbenchmark::microbenchmark(
mbm_dplyr_fct = dplyr_fct(dat, cols),
mbm_dt_fct = dt_fct(dat, cols),
mbm_dt_nmbr = dt_nmbr(dat, cols)
)
mbm
Unit: milliseconds
expr min lq mean median uq max neval cld
mbm_dplyr_fct 84.487484 85.829834 90.988740 87.015878 91.159178 120.22171 100 c
mbm_dt_fct 56.768529 58.007094 60.988083 58.831850 60.269427 87.11799 100 b
mbm_dt_nmbr 4.181538 4.406392 4.540248 4.557948 4.619757 6.04197 100 a

Combine tables of aggregated values with summarised variables from 'parent' data set

I have a data set along these lines:
df<-data.frame(sp=c(100, 100, 100, 101, 101, 101, 102, 102, 102),
type=c("C","C","C","H","H","H","C","C","C"),
country=c("A", "A", "A", "B", "B", "B", "C", "C", "C"),
vals=c(1,2,3,4,5,6,7,8,9)
)
I want to aggregate df$vals and bring the other variables through as well
At the moment I'm doing it like this:
multi.func <- function(x) {
  c(n = length(x),
    min = min(x, na.rm = TRUE),
    max = max(x, na.rm = TRUE),
    mean = mean(x, na.rm = TRUE))
}
aggVals <- as.data.frame(do.call(rbind, by(df$vals, df$sp, FUN = multi.func, simplify = TRUE)))
aggVals$sp <- row.names(aggVals)
aggDescrip <- aggregate(cbind(as.character(type), as.character(country)) ~ sp, data = df, FUN = unique)
result <- merge(aggDescrip, aggVals)
This works well enough but I wondered if there's an easier way.
Thanks
Perhaps you should look into the data.table package.
library(data.table)
DT <- data.table(df, key="sp")
DT[, list(type = unique(as.character(type)),
          country = unique(as.character(country)),
          n = .N, min = min(vals), max = max(vals),
          mean = mean(vals)), by = key(DT)]
# sp type country n min max mean
# 1: 100 C A 3 1 3 2
# 2: 101 H B 3 4 6 5
# 3: 102 C C 3 7 9 8
If you want to stick with base R, here is another approach that might be of use (though aggregate is probably more common):
unique(within(df, {
  mean <- ave(vals, sp, FUN = mean)
  max <- ave(vals, sp, FUN = max)
  min <- ave(vals, sp, FUN = min)
  n <- ave(vals, sp, FUN = length)
  rm(vals)
}))
# sp type country n min max mean
# 1 100 C A 3 1 3 2
# 4 101 H B 3 4 6 5
# 7 102 C C 3 7 9 8
Update: A variation on your initial attempt
I would suggest sticking with data.table if possible, because the resulting code is easy to follow and the process of aggregation is quick.
However, with a little bit of modification, you can have (yet another) base R approach that is somewhat more direct.
First, modify your function so that it uses data.frame() instead of c(). Also, add an argument that specifies which column needs to be aggregated.
multi.func <- function(x, value_column) {
  data.frame(
    n = length(x[[value_column]]),
    min = min(x[[value_column]], na.rm = TRUE),
    max = max(x[[value_column]], na.rm = TRUE),
    mean = mean(x[[value_column]], na.rm = TRUE))
}
Second, use lapply on your dataset, split up by your grouping variable, merge the output with your original dataset, and return the unique values.
unique(merge(df[-4],
             do.call(rbind, lapply(split(df, df$sp),
                                   multi.func, value_column = "vals")),
             by.x = "sp", by.y = "row.names"))
Using just aggregate:
result <- aggregate(vals ~ type + sp + country, df,
                    function(x) c(length(x), min(x), max(x), mean(x)))
result
type sp country vals.1 vals.2 vals.3 vals.4
1 C 100 A 3 1 3 2
2 H 101 B 3 4 6 5
3 C 102 C 3 7 9 8
colnames(result)
[1] "type" "sp" "country" "vals"
The above seems to create a weird "multi-value" column. However, summaryBy from the doBy package is similar to aggregate but allows an output with multiple columns:
library(doBy)
result <- summaryBy(vals ~ type + sp + country, df,
                    FUN = function(x) c(n = length(x), min = min(x),
                                        max = max(x), mean = mean(x)))
result
type sp country vals.n vals.min vals.max vals.mean
1 C 100 A 3 1 3 2
2 C 102 C 3 7 9 8
3 H 101 B 3 4 6 5
colnames(result)
[1] "type" "sp" "country" "vals.n" "vals.min" "vals.max"
[7] "vals.mean"
