Counting the average of duplicates per id in R

My data looks like this:
id date
1  a
1  a
1  b
1  c
1  c
1  c
2  z
2  z
2  e
2  x
I want to calculate the average number of duplicates per id, i.e. for id = 1 we have 2 a's, 1 b and 3 c's, so the output should be 2.
The result should be like this:
id mean
1  2
2  1.333

You can use mean(table(date)) to get the average of the counts, applying it for each id value.
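As a quick check of the idea on the id = 1 values from the question (dates_id1 is just an illustrative name):
dates_id1 <- c("a", "a", "b", "c", "c", "c")  # date values for id 1
table(dates_id1)        # a: 2, b: 1, c: 3
mean(table(dates_id1))  # 2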
Using dplyr -
library(dplyr)
df %>%
  group_by(id) %>%
  summarise(mean = mean(table(date)))
# id mean
# <int> <dbl>
#1 1 2
#2 2 1.33
Or with base R aggregate.
aggregate(date~id, df, function(x) mean(table(x)))

You can try a tidyverse approach:
library(tidyverse)
d %>%
  group_by(id) %>%
  count(date) %>%
  summarise(mean = mean(n))
# A tibble: 2 x 2
id mean
<int> <dbl>
1 1 2
2 2 1.33
Using base R you can try
foo <- function(x) mean(rle(x)$length)
aggregate(d$date, by=list(d$id), foo)
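Note that rle() counts runs of adjacent equal values, so this relies on the duplicates sitting next to each other within each id (as they do in the example data d below). A sketch that sorts first, so the result does not depend on row order (foo2 is just an illustrative name):
foo2 <- function(x) mean(rle(sort(as.character(x)))$lengths)
aggregate(d$date, by = list(d$id), foo2)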
The data
d <- read.table(text ="id date
1 a
1 a
1 b
1 c
1 c
1 c
2 a
2 a
2 e
2 z", header=T)

Using the data.table package:
library(data.table)
# dt <- as.data.table(your_data_frame)  # convert your data.frame to a data.table
dt[, .(N=.N), by = .(id,date)][, .(mean = mean(N)), by = id]

Another data.table option
> setDT(df)[, .(Mean = .N / uniqueN(date)), id]
id Mean
1: 1 2.000000
2: 2 1.333333
or
dcast(setDT(df), id ~ date, fill = NA)[, .(Mean = rowMeans(.SD, na.rm = TRUE)), id]
gives
id Mean
1: 1 2.000000
2: 2 1.333333

We can use
library(dplyr)
df1 %>%
  group_by(id) %>%
  summarise(Mean = count(cur_data(), date) %>%
              pull(n) %>%
              mean)
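If you prefer to avoid cur_data() (which newer dplyr releases deprecate in favour of pick()), a sketch that counts first and then averages, assuming the same df1:
df1 %>%
  count(id, date) %>%   # one row per id/date combination with its count n
  group_by(id) %>%
  summarise(Mean = mean(n))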

A table approach using the base package:
at <- table(a$id, a$date)
apply(at, 1, function(x) sum(x) / sum(x != 0))
# 1 2
#2.000000 1.333333
The dataset:
a = data.frame('id'=c(1,1,1,1,1,1,2,2,2,2),'date'=c('a','a','b','c','c','c','a','a','e','z'))

Here is a package-free solution:
a = cbind(c(1,1,1,1,1,1,2,2,2,2), c('a','a','b','c','c','c','a','a','e','z'))
b = matrix(ncol = 2)[-1, ]
for (i in unique(a[, 1])) {
  b = rbind(b, c(i, sum(table(a[a[, 1] == i, 2])) / length(table(a[a[, 1] == i, 2]))))
}
The output:
[,1] [,2]
[1,] "1" "2"
[2,] "2" "1.33333333333333"

Related

Simultaneous Count and Sort in R

I am trying to obtain counts of a certain categorical variable in 2 separate columns, with each column reflecting the presence or absence of an indicator variable. This is for a very large data frame. Here is an example data frame to further illustrate what I'm trying to do.
X <- (1:10)
Y <- c('a','b','a','c','b','b','a','a','c','c')
Z <- c(0,1,1,1,0,1,0,1,1,1)
test_df <- data.frame(X,Y,Z)
I would like to make a new DF grouped by 'a', 'b', and 'c' with 2 columns to the right: one with the count of the letter for Z==1 and one with the count of that letter for Z==0.
The dplyr way:
library(dplyr)
library(tidyr)
#Code
res <- test_df %>%
  group_by(Y, Z) %>%
  summarise(N = n()) %>%
  pivot_wider(names_from = Z, values_from = N,
              values_fill = 0)
Output:
# A tibble: 3 x 3
# Groups: Y [3]
Y `0` `1`
<chr> <int> <int>
1 a 2 2
2 b 1 2
3 c 0 3
We can use values_fn in pivot_wider to do this in a single step
library(dplyr)
library(tidyr)
test_df %>%
  pivot_wider(names_from = Z, values_from = X,
              values_fn = length, values_fill = 0)
# A tibble: 3 x 3
# Y `0` `1`
# <chr> <int> <int>
#1 a 2 2
#2 b 1 2
#3 c 0 3
A base R option using aggregate + reshape
replace(
  u <- reshape(
    aggregate(X ~ ., test_df, length),
    idvar = "Y",
    timevar = "Z",
    direction = "wide"
  ),
  is.na(u),
  0
)
giving
Y X.0 X.1
1 a 2 2
2 b 1 2
5 c 0 3
One way with data.table:
library(data.table)
setDT(test_df)
test_df[ , z1 := sum(Z==1), by=Y]
test_df[ , z0 := sum(Z==0), by=Y]
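This adds the counts to every row of test_df; if you want one row per letter, as in the other answers, you can then take the unique combinations:
unique(test_df[, .(Y, z0, z1)])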
In base R you can use table:
table(test_df$Y, test_df$Z)
# 0 1
# a 2 2
# b 1 2
# c 0 3
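If you need that result as a data frame rather than a table object, a small follow-up (assuming the same test_df):
as.data.frame.matrix(table(test_df$Y, test_df$Z))  # rows are the letters, columns the Z values 0 and 1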

Comparing all rows within a dataframe and apply a function conditionally

I have this data frame and I want to cross-compare all the values inside it.
dat <- tibble::tibble(
name = c("a","b","c"),
value = c(1,2,3))
I want to compare all the row pairs inside this dataframe and in this case I want to divide the smaller number by the bigger number.
The final dataframe should look like this:
a,b,0.5
a,c,0.33
b,c,0.66
Is there a method to achieve this?
Using the data.table package, we can join dat with itself on the condition that one value is less than the other, and compute the ratio with the columns of the joined table.
library(data.table)
setDT(dat)
out <- dat[dat, on = .(value < value),
           .(name1 = x.name,
             name2 = i.name,
             ratio = x.value / i.value)]
out <- out[!is.na(ratio)]
out
# name1 name2 ratio
# 1: a b 0.5000000
# 2: a c 0.3333333
# 3: b c 0.6666667
One option would be
v1 <- setNames(dat$value, dat$name)
do.call(rbind, combn(v1, 2, FUN = function(x)
  setNames(data.frame(as.list(names(x)), round(Reduce(`/`, x[order(x)]), 2)),
           c("col1", "col2", "val")), simplify = FALSE))
# col1 col2 val
#1 a b 0.50
#2 a c 0.33
#3 b c 0.67
Or an option with fuzzyjoin (inspired from #IceCreamToucan's post)
library(fuzzyjoin)
fuzzy_inner_join(dat, dat, by = "name", match_fun = list(`<`)) %>%
transmute(col1 = name.x, col2 = name.y, val = value.x/value.y)
# A tibble: 3 x 3
# col1 col2 val
# <chr> <chr> <dbl>
#1 a b 0.5
#2 a c 0.333
#3 b c 0.667
We can use tidyverse:
library(tidyverse)
dat %>%
  expand(name, name) %>%
  cbind(expand(dat, value, value)) %>%
  filter(value1 > value) %>%
  mutate(ratio = value / value1)
#> name name1 value value1 ratio
#> 1 a b 1 2 0.5000000
#> 2 a c 1 3 0.3333333
#> 3 b c 2 3 0.6666667
Or just a doodle in base R:
df <- cbind(expand.grid(dat$name,dat$name), expand.grid(dat$value, dat$value))
df <- df[order(df[,3], -df[,4]),]
df <- df[df[,3] < df[,4],]
df$ratio <- df[,3] / df[,4]
df[,-c(3,4)] -> df
df
#> Var1 Var2 ratio
#> 7 a c 0.3333333
#> 4 a b 0.5000000
#> 8 b c 0.6666667

Group data by factor level, then transform to data frame with colname being levels?

Here is my problem, which I can't solve:
Data:
df <- data.frame(f1=c("a", "a", "b", "b", "c", "c", "c"),
v1=c(10, 11, 4, 5, 0, 1, 2))
In the data frame, f1 is a factor:
f1 v1
a 10
a 11
b 4
b 5
c 0
c 1
c 2
# What I want is (for example): fetch the data for levels that have exactly 2 elements, then convert to a data frame with the levels as column names:
a b
10 4
11 5
Thanks in advance!
I might be missing something simple here, but the approach below using dplyr works.
library(dplyr)
library(tidyr)
nlevels = 2
df1 <- df %>%
  add_count(f1) %>%
  filter(n == nlevels) %>%
  select(-n) %>%
  mutate(rn = row_number()) %>%
  spread(f1, v1) %>%
  select(-rn)
This gives
# a b
# <int> <int>
#1 10 NA
#2 11 NA
#3 NA 4
#4 NA 5
Now, if you want to remove NA's we can do
do.call("cbind.data.frame", lapply(df1, function(x) x[!is.na(x)]))
# a b
#1 10 4
#2 11 5
As we have filtered the data frame to levels that have exactly nlevels observations, we will have the same number of rows for each column in the final data frame.
split might be useful here to split df$v1 into parts corresponding to df$f1. Since you are always extracting equal-length chunks, it can then simply be combined back into a data.frame:
spl <- split(df$v1, df$f1)
data.frame(spl[lengths(spl)==2])
# a b
#1 10 4
#2 11 5
Or do it all in one call by combining this with Filter:
data.frame(Filter(function(x) length(x)==2, split(df$v1, df$f1)))
# a b
#1 10 4
#2 11 5
Here is a solution using unstack :
unstack(
droplevels(df[ave(df$v1, df$f1, FUN = function(x) length(x) == 2)==1,]),
v1 ~ f1)
# a b
# 1 10 4
# 2 11 5
A variant, similar to #thelatemail's solution :
data.frame(Filter(function(x) length(x) == 2, unstack(df,v1 ~ f1)))
My tidyverse solution would be:
library(tidyverse)
df %>%
  group_by(f1) %>%
  filter(n() == 2) %>%
  mutate(i = row_number()) %>%
  spread(f1, v1) %>%
  select(-i)
# # A tibble: 2 x 2
# a b
# * <dbl> <dbl>
# 1 10 4
# 2 11 5
Or mixing approaches:
as_tibble(keep(unstack(df,v1 ~ f1), ~length(.x) == 2))
Using all base functions (but you should use tidyverse)
x <- df  # work on a copy of the example data
# Add count of instances
x$len <- ave(x$v1, x$f1, FUN = length)
# Filter, drop the count
x <- x[x$len == 2, c('f1', 'v1')]
# Hacky pivot
result <- data.frame(
  lapply(unique(x$f1), FUN = function(y) x$v1[x$f1 == y])
)
colnames(result) <- unique(x$f1)
> result
a b
1 10 4
2 11 5
I'd code it like this; maybe it helps:
library(reshape2)
library(dplyr)
aa = data.frame(v1=c('a','a','b','b','c','c','c'), f1=c(10,11,4,5,0,1,2))
cc = aa %>% group_by(v1) %>% summarise(id = length(v1))
dd = merge(aa, cc)                  # attach the level counts
ee = dd[dd$id == 2, ]               # keep levels with exactly 2 observations
ee$id = rep(c(1, 2), nrow(ee) / 2)  # reset index like (1,2,1,2)
dcast(ee, id ~ v1, value.var = 'f1')
all done!

Getting rows whose value are greater than the group mean

I have a data frame where column "A" has 6 distinct values. Column "B" has float values. By using dplyr, I can group by column "A" and find mean of column "B" of each group as follows:
mydf %>% group_by(A) %>% summarize(Mean = mean(B, na.rm=TRUE))
My ultimate aim is to find rows in each group whose "B" values are higher than the group average. How can I achieve this (using base R or dplyr)?
A simple alternative with base R ave would be
df[df$b > ave(df$b, df$a) , ]
# a b
#4 1 4
#5 1 5
#9 2 9
#10 2 10
The default function applied by ave is mean, so there is no need to mention it explicitly. If there are NA values present in b, modify it to
df[df$b > ave(df$b, df$a, FUN = function(x) mean(x,na.rm = TRUE)) , ]
Another solution with subset and ave as suggested by #Onyambu
subset(df,b>ave(b,a))
# a b
#4 1 4
#5 1 5
#9 2 9
#10 2 10
data
df <- data.frame(a = rep(c(1, 2), each = 5), b = 1:10)
df
# a b
#1 1 1
#2 1 2
#3 1 3
#4 1 4
#5 1 5
#6 2 6
#7 2 7
#8 2 8
#9 2 9
#10 2 10
You can just group and then filter:
mydf %>%
  group_by(A) %>%
  filter(B > mean(B, na.rm = TRUE)) %>%
  ungroup()
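If you also want to keep the group mean that each row is compared against, a variant of the same idea (the column name grp_mean is just illustrative):
mydf %>%
  group_by(A) %>%
  mutate(grp_mean = mean(B, na.rm = TRUE)) %>%
  filter(B > grp_mean) %>%
  ungroup()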
Using base R, I would go for this. It is not as elegant as dplyr.
mean.df <- aggregate(mydf$b, by =list(a = mydf$a), FUN = mean)
names(mean.df)[2] <- "mean"
mydf <- merge(mydf, mean.df, by = "a")
# Rows whose values are higher than mean
new.df <- subset(mydf, b > mean, select = -mean)
I like working with data.tables, so a data.table solution would be:
mydt <- data.table(mydf)
mydt[, mean := mean(b), by = a]
new.dt <- mydt[b > mean, -c("mean"), with = TRUE]
Another way to do it using base R and tapply:
mydf = cbind.data.frame(A=sample(6,20,rep=T),B=runif(20))
mydf.ave = tapply(mydf$B,mydf$A,mean)
newdf = mydf[mydf$B > mydf.ave[as.character(mydf$A)],]
(Thus the one-liner would be: mydf[mydf$B > tapply(mydf$B, mydf$A, mean)[as.character(mydf$A)], ])

Reshape2: multiple observations for variable

I have the following sample data:
d <- data.frame(id=c(1,1,1,2,2), time=c(1,1,1,1,1), var=runif(5))
id time var
1 1 1 0.373448545
2 1 1 0.007007124
3 1 1 0.840572603
4 2 1 0.684893481
5 2 1 0.822581501
I want to reshape this data.frame to wide format using dcast such that the output is the following:
id var.1 var.2 var.3
1 1 0.3734485 0.007007124 0.8405726
2 2 0.6848935 0.822581501 NA
Does anyone have some ideas?
Create a sequence column, seq, by id and then use dcast:
library(reshape2)
set.seed(123)
d <- data.frame(id=c(1,1,1,2,2), time=c(1,1,1,1,1), var=runif(5))
d2 <- transform(d, seq = ave(id, id, FUN = seq_along))
dcast(d2, id ~ seq, value.var = "var")
giving:
id 1 2 3
1 1 0.28758 0.78831 0.40898
2 2 0.88302 0.94047 NaN
A dplyr/tidyr option with spread would be
library(dplyr)
library(tidyr)
d %>%
  group_by(id) %>%
  mutate(n1 = paste0("var.", row_number())) %>%
  spread(n1, var) %>%
  select(-time)
# id var.1 var.2 var.3
# (int) (dbl) (dbl) (dbl)
#1 1 0.3734485 0.007007124 0.8405726
#2 2 0.6848935 0.822581501 NA
Ok - here's a working solution. The key is to add a counting variable. My solution for this is a bit complicated - maybe you can come up with something better.
library(dplyr)
library(magrittr)
library(reshape2)
d <- data.frame(id=c(1,1,1,2,2,3,3,3,3), time=c(1,1,1,1,1,1,1,1,1), var=runif(9))
group_by(d, id) %>%
  summarise(n = n()) %>%
  data.frame() -> count
f <- c()
for (i in 1:nrow(count)) {
  f <- c(f, 1:count$n[i])
}
d <- data.frame(d, f)
dcast(d, id ~ f, value.var = "var")
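A shorter way to build the counting variable, assuming the same d, is ave() (the same trick the first answer in this thread uses):
d$f <- ave(seq_along(d$id), d$id, FUN = seq_along)  # 1, 2, 3, ... within each id
dcast(d, id ~ f, value.var = "var")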
