I was wondering if there is a way to compute the mean excluding outliers using the dplyr package in R? I was trying to do something like this, but did not work:
library(dplyr)
w = rep("months", 4)
value = c(1, 10, 12, 9)
df = data.frame(w, value)
output = df %>% group_by(w) %>% summarise(m = mean(value, na.rm = T, outlier = T))
So in above example, output should be 10.333 (mean of 10, 12, & 9) instead of 8 (mean of 1, 10, 12, 9)
Thanks!
One way would be something like this using the outlier package.
library(outliers) #containing function outlier
library(dplyr)
df %>%
group_by(w) %>%
filter(!value %in% c(outlier(value))) %>%
summarise(m = mean(value, na.rm = TRUE))
# w m
#1 months 10.33333
Related
I need to sum the values for about 40 variables by the same group.
This is an example dataset. So I wanted to sum the values of score1-score5 by region and department.
region <- rep(c("south", "east", "west", "north"),times=10)
department <- rep(c("A", "B","C","D","E"),times=8)
score1 <- rnorm(n = 40, mean = 0, sd = 1)
score2 <-rnorm(n = 40, mean = 3, sd = 1.5)
score3 <-rnorm(n = 40, mean = 2, sd = 1)
score4 <-rnorm(n = 40, mean = 1, sd = 1.5)
score5 <-rnorm(n = 40, mean = 5, sd = 1.5)
df <- data.frame(region, department, score1, score2, score3, score4, score5)
This is the code that would lead to the resutls I wanted to have but is there any easier ways to do this:
df %>% group_by(region, department) %>%
summarise(score1=sum(score1),
score2=sum(score2),
score3=sum(score3),
score4=sum(score4),
score5=sum(score5))
I tried to use a loop but this didn't work:
vlist<-c("score1", "score2", "score3", "score4", "score5")
for (var in vlist) {
df<-df %>% group_by(region, department) %>%
summarise(var=sum(.[[var]]))
}
Is there any other ways or what is wrong with my loop?
Thanks!
Use across - loop across the columns that starts_with 'score' and get the sum
library(dplyr)
out1 <- df %>%
group_by(region, department) %>%
summarise(across(starts_with('score'), sum), .groups = 'drop')
In the for loop, the issue is that df is getting updated (df <-..) in each iteration and summarise returns only the columns provided in the group by and the summarised output. Thus, after the first iteration, 'df' wouldn't have the 'score' columns at all. If we want to use a for loop, get the output in a list and then reduce with a join
library(purrr)
out_list <- vector('list', length(vlist))
names(out_list) <- vlist
for (var in vlist) {
out_list[[var]] <- df %>%
group_by(region, department) %>%
summarise(!!var := sum(cur_data()[[var]]), .groups = 'drop')
}
out2 <- reduce(out_list, full_join, by = c('region', 'department'))
-checking the outputs
> identical(out1, out2)
[1] TRUE
Below is my data where columns group, score and prob represent treatment groups, scores outcome and probability of occurrence of the score outcome
data1 <- tibble(group = rep(c('A', 'B'), each = 5),
score = c(0, 1, 2, 3, 4, 0, 1, 2, 3, 4),
prob = c(.08, .8, .1, .02, 0, 0, 0, .4, .4, .2))
data1
I want to use this data and simulate the scores R times so I expand as follows
R <- 1:10 # number of simulations
data2 <- data1 %>%
nest(data = c(score, prob)) %>%
unnest_wider(col = data) %>%
mutate(size = 15)
data3 <- data2 %>%
group_by(group, score, prob, size) %>%
tidyr::expand(iter = R) %>%
ungroup
data3
My function to simulate the data is as follows where I use multinomial distribution
sim_data <- function(dt, n, size, prob){
d1 <- dt %>%
mutate(sim_score = pmap(list(n = 1,
size = size,
prob = prob),
rmultinom)) %>%
select(-prob) %>%
unnest(cols = c(sim_score, score))
# create real scores from counts above
d2 <- d1 %>%
rowwise %>%
mutate(outcome = map(score, rep, times = sim_score)) %>%
unnest(cols = outcome) %>%
select(size, contains('iter'), group, outcome) %>%
nest(data = c(group, outcome))
return(d2)
}
data4 <- sim_data(data3, n, size, prob)
data4
This works fine however I want to omit those instances where each group has less than or equal to 2 unique scores. In this case, I modify my function sim_data to do the check as follows using repeat so as to ignore cases that do not meet the above mentioned requirement
sim_check <- function(my_data, n, size, prob){
repeat{
# simulate data
dt = sim_data(my_data, n, size, prob)
# check requirement
check <- dt %>%
select(data) %>%
unnest(cols = c(data)) %>%
group_by(group) %>%
distinct(outcome) %>%
mutate(id = 1:n()) %>%
summarise(n = n()) %>%
distinct(n) %>%
pull(n)
if(all(check > 2)) break
}
return(dt)
}
data5 <- sim_check(data3, n, size, prob)
This however does not do the checking as I see observations with 2 unique scores.
I can achieve my objective with a for loop as follows but I want to avoid the for loop. Can someone point out where I'm doing wrong
out <- list()
for(i in 1:10){
data6 <- sim_check(data2, n, size, prob)
out[[i]] <- data6 %>%
pull(data)
}
#Ronak thanks for the observation. I just corrected it
is it possible for the top_n() command to return both max and min value at the same time?
Using the example from the reference page https://dplyr.tidyverse.org/reference/top_n.html
I tried the following
df <- data.frame(x = c(10, 4, 1, 6, 3, 1, 1))
df %>% top_n(c(1,-1)) ## returns an error
df <- data.frame(x = c(10, 4, 1, 6, 3, 1, 1))
df %>% top_n(1) %>% top_n(-1) ## returns only max value
Thanks
Not really involving top_n(), but you can try:
df %>%
arrange(x) %>%
slice(c(1, n()))
x
1 1
2 10
Or:
df %>%
slice(which(x == max(x) | x == min(x))) %>%
distinct()
Or (provided by #Gregor):
df %>%
slice(c(which.min(x), which.max(x)))
Or using filter():
df %>%
filter(x %in% range(x) & !duplicated(x))
Idea similar to #Jakub's answer with purrr::map_dfr
library(tidyverse) # dplyr and purrrr for map_dfr
df %>%
map_dfr(c(1, -1), top_n, wt = x, x = .)
# x
# 1 10
# 2 1
# 3 1
# 4 1
Here is an option with top_n where we pass a logical vector based that returns TRUE for min/max using range and then get the distinct rows as there are ties for range i.e duplicate elements are present
library(dplyr)
df %>%
top_n(x %in% range(x), 1) %>%
distinct
# x
#1 10
#2 1
I like #tmfmnk's answer. If you want to use top_n function, you can do this:
df <- data.frame(x = c(10, 4, 1, 6, 3, 1, 1))
bind_rows(
df %>% top_n(1),
df %>% top_n(-1)
)
# this solution addresses the specification in comments
df %>%
group_by(y) %>%
summarise(min = min(x),
max = max(x),
average = mean(x))
I want to calculate total number of negative values for each observation, using previous 10 observations. I used the following code, but it does not work -
funda_addit <- funda_addit %>%
group_by(TICKER) %>%
arrange(year) %>%
mutate(NEG_EARN = rollapply(ni, 10, sum (ni<0), partial=TRUE)) %>%
ungroup()
Actually I want to create the new variable "NEG_EARN", which is the number of negative values of previous 10 observations(10 years in my data) for the variable "ni". I also use the following code, but it does not work -
funda_addit <- funda_addit %>%
group_by(TICKER) %>%
arrange(year) %>%
mutate(NEG_EARN = rollapply(ni, 10, length(which(ni<0)), partial=TRUE)) %>%
ungroup()
You could create a vector cumsum(ni < 0) and then subtract a lagged version of that vector from it
funda_addit %>%
group_by(TICKER) %>%
arrange(year) %>%
mutate(neg_earn = {cs <- cumsum(ni < 0)
cs - lag(cs, 10, default = 0)})
This is equivalent to akrun's answer if you change rollapply to rollapplyr (tested using akrun's example data)
use_cumsum <-
funda_addit %>%
group_by(TICKER) %>%
arrange(year) %>%
mutate(neg_earn = {cs <- cumsum(ni < 0)
cs - lag(cs, 10, default = 0)})
use_rollapply <-
funda_addit %>%
group_by(TICKER) %>%
arrange(year) %>%
mutate(neg_earn = rollapplyr(ni, 10, FUN = f1, partial=TRUE))
all(use_cumsum == use_rollapply)
# [1] TRUE
We can use anonymous function call (or create a new function) instead of the whole column 'ni'
library(dplyr)
library(zoo)
f1 <- function(x) sum(x < 0)
funda_addit %>%
group_by(TICKER) %>%
arrange(year) %>%
mutate(NEG_EARN = rollapplyr(ni, 10, FUN = f1, partial=TRUE)) %>%
ungroup()
EDIT: changed rollapply to rollapplyr (based on comments from #IceCreamToucan)
data
set.seed(24)
funda_addit <- data.frame(TICKER = rep(LETTERS[1:3], each = 20),
year = 1921:1940, ni = rnorm(60))
I have an issue understanding how to use the dplyr bootstrap function properly.
What I want is to generate a bootstrap distribution from two randomly assigned groups and compute the difference in means, like this for example :
library(dplyr)
library(broom)
data(mtcars)
mtcars %>%
mutate(treat = sample(c(0, 1), 32, replace = T)) %>%
group_by(treat) %>%
summarise(m = mean(disp)) %>%
summarise(m = m[treat == 1] - m[treat == 0])
The issue is that I need to repeat this operation 100, 1000, or more times.
Using replicate, I can do
frep = function(mtcars) mtcars %>%
mutate(treat = sample(c(0, 1), 32, replace = T)) %>%
group_by(treat) %>%
summarise(m = mean(disp)) %>%
summarise(m = m[treat == 1] - m[treat == 0])
replicate(1000, frep(mtcars = mtcars), simplify = T) %>% unlist()
and get the distribution
I don't really get how to use bootstraphere. How should I start ?
mtcars %>%
bootstrap(10) %>%
mutate(treat = sample(c(0, 1), 32, replace = T))
mtcars %>%
bootstrap(10) %>%
do(tidy(treat = sample(c(0, 1), 32, replace = T)))
It's not really working. Where should I put the bootstrap pip ?
Thanks.
In the do step, we wrap with data.frame and create the 'treat' column, then we can group by 'replicate' and 'treat' to get the summarised output column
mtcars %>%
bootstrap(10) %>%
do(data.frame(., treat = sample(c(0,1), 32, replace=TRUE))) %>%
group_by(replicate, treat) %>%
summarise(m = mean(disp)) %>%
summarise(m = m[treat == 1] - m[treat == 0])
#or as 1 occurs second and 0 second, we can also use
#summarise(m = last(m) - first(m))