Using tidyverse, sum values conditionally on distributions within each subset

I have an example dataframe below in which precip is recorded for each day of the month.
set.seed(560)
df <- data.frame(month = rep(1:4, each = 30),
                 precip = rep(c(rnorm(30, 20, 10), rnorm(30, 10, 2),
                                rnorm(30, 50, 1), rnorm(30, 15, 3))))
For each month's subset, I wish to count the number of instances in which a value lies more than 2 standard deviations (sd) above or below the mean of that month's precip values. Essentially I need to find values at the extremes of the distribution (i.e. in its tails). This result column will be called count.
The output would appear as follows for this example dataset:
set.seed(560)
output <- data.frame(month = 1:4, count = c(1, 2, 2, 1))
Notice for month 1, values above 35.969 or below 2.61 fall outside +/- 2 sd of the mean. One value (precip = 41.1) fits this requirement. Proof:
sub1 <- subset(df, month == 1)
v1 <- mean(sub1$precip) + 2*sd(sub1$precip)  # 35.969
v2 <- mean(sub1$precip) - 2*sd(sub1$precip)  # 2.61
sub2 <- subset(df, month == 2)
v3 <- mean(sub2$precip) + 2*sd(sub2$precip)  # 13.89
v4 <- mean(sub2$precip) - 2*sd(sub2$precip)  # 7.35
sub3 <- subset(df, month == 3)
v5 <- mean(sub3$precip) + 2*sd(sub3$precip)  # 51.83
v6 <- mean(sub3$precip) - 2*sd(sub3$precip)  # 48.308
sub4 <- subset(df, month == 4)
v7 <- mean(sub4$precip) + 2*sd(sub4$precip)  # 18.69
v8 <- mean(sub4$precip) - 2*sd(sub4$precip)  # 9.39
I have tried:
output <-
  df %>%
  group_by(month) %>%
  summarise(count = sum(precip > (mean(precip) + 2*sd(precip)) &
                        precip < (mean(precip) - 2*sd(precip))))

Very simple fix: change your logical AND (&) to OR (|), since no value can satisfy both conditions at once.
output <-
  df %>%
  group_by(month) %>%
  summarise(count = sum(precip > (mean(precip) + 2*sd(precip)) |
                        precip < (mean(precip) - 2*sd(precip))))
output
# A tibble: 4 x 2
#   month count
#   <int> <int>
# 1     1     1
# 2     2     2
# 3     3     2
# 4     4     1
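As a side note (an equivalent reformulation, not one of the posted answers), the two tail conditions can be collapsed into a single comparison on the absolute deviation from the group mean:
df %>%
  group_by(month) %>%
  summarise(count = sum(abs(precip - mean(precip)) > 2*sd(precip)))
This is algebraically the same test: |x - mean| > 2 sd holds exactly when x lies above mean + 2 sd or below mean - 2 sd.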
And to add a base R solution using by() (the counterpart to dplyr::group_by()):
do.call(rbind,
        by(df, df$month, FUN = function(i) {
          tmp <- i[i$precip < mean(i$precip) - 2*sd(i$precip) |
                     i$precip > mean(i$precip) + 2*sd(i$precip), ]
          return(data.frame(month = i$month[[1]], count = nrow(tmp)))
        })
)
#   month count
# 1     1     1
# 2     2     2
# 3     3     2
# 4     4     1
Alternatively, with ave, ifelse, and aggregate:
df$count <- ifelse(df$precip > ave(df$precip, df$month, FUN = function(g) mean(g) + 2*sd(g)) |
                   df$precip < ave(df$precip, df$month, FUN = function(g) mean(g) - 2*sd(g)), 1, 0)
aggregate(count ~ month, df, FUN = sum)
#   month count
# 1     1     1
# 2     2     2
# 3     3     2
# 4     4     1

In base R
tapply(df$precip, df$month, function(a) sum(abs(scale(a)) >= 2))
Output:
1 2 3 4
1 2 2 1
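scale() centers each group's values and divides by their standard deviation, so abs(scale(a)) measures how many sd each value sits from its group mean; with continuous data a value landing exactly on the 2 sd boundary is vanishingly unlikely, so the >= 2 here agrees in practice with the strict > used above. For comparison, a minimal dplyr sketch of the same z-score idea (my addition, not part of this answer):
df %>%
  group_by(month) %>%
  summarise(count = sum(abs(scale(precip)) >= 2))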


Find minimum and maximum value for each group based on another variable in R

I would like to know how I can find the minimum and maximum day of year (DoY) based on water temperature (Wtemp) for each site (siteID).
Example Dataset:
df1 <- data.frame(matrix(ncol = 4, nrow = 20))
x <- c("siteID", "Date", "DoY", "Wtemp")
colnames(df1) <- x
df1$siteID <- c(101,101,101,101,101,
                102,102,102,102,102,
                103,103,103,103,103,
                104,104,104,104,104)
df1$Date <- rep(seq(from = as.Date("2020-01-01"), to = as.Date("2020-01-05"), by = 1), 4)
df1$DoY <- rep(seq(from = 1, to = 5, by = 1), 4)
df1$Wtemp <- c(10,2,6,12,15,
               20,15,5,10,16,
               2,4,6,8,10,
               12,14,16,18,20)
The output should look like this:
  siteID DoY_MaxWtemp DoY_MinWtemp
1    101            5            2
2    102            1            3
3    103            5            1
4    104            5            1
We can group by 'siteID', get the index of the max and min values of 'Wtemp' with which.max and which.min respectively, and use those indices to extract the corresponding 'DoY' values inside summarise:
library(dplyr)
df1 %>%
  group_by(siteID) %>%
  summarise(Doy_MaxWtemp = DoY[which.max(Wtemp)],
            Doy_MinWtemp = DoY[which.min(Wtemp)], .groups = 'drop')
Output:
# A tibble: 4 x 3
#   siteID Doy_MaxWtemp Doy_MinWtemp
# * <dbl>         <dbl>        <dbl>
# 1    101            5            2
# 2    102            1            3
# 3    103            5            1
# 4    104            5            1
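As a dependency-free alternative (a sketch of mine, not part of the original answer), the same which.max()/which.min() indexing works per group with base R's by():
do.call(rbind,
        by(df1, df1$siteID, function(g)
          data.frame(siteID       = g$siteID[1],
                     DoY_MaxWtemp = g$DoY[which.max(g$Wtemp)],
                     DoY_MinWtemp = g$DoY[which.min(g$Wtemp)])))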

Create ranges by accumulating values

I have the data frame DF1:
KEY <- c(11, 12, 22, 33, 44, 55, 66, 77, 88, 99, 1010, 1111, 1212,
         1313, 1414, 1515, 1616, 1717, 1818, 1919, 2020)
PRICE <- c(0, 0, 1, 5, 7, 10, 20, 80, 110, 111, 200, 1000, 2500,
           2799, 3215, 4999, 7896, 8968, 58914, 78422, 96352)
DF1 <- data.frame(KEY, PRICE)
I want to group DF1 into ranges, accumulating the values of the two columns (counting the KEY column and summing the PRICE column). This is the result I hope for:
INTERVAL <- c('0', 'UP_TO_10', 'UP_TO_100', 'UP_TO_1000',
              'UP_TO_5000', 'UP_TO_10000', 'UP_TO_100000')
COUNT_KEY <- c(2, 6, 8, 12, 16, 18, 21)
SUM_PRICE <- c(0, 23, 123, 1544, 15057, 31921, 265609)
DF2 <- data.frame(INTERVAL, COUNT_KEY, SUM_PRICE)
How do I make this table?
If you have a vector of limits or thresholds, such as:
LIMITS <- c(0, 10, 100, 1000, 5000, 10000, 100000)
You could obtain a count of rows where PRICE is at or below each limit:
unlist(lapply(LIMITS, function(x) sum(DF1$PRICE <= x)))
[1] 2 6 8 12 16 18 21
And a sum of these prices as well:
unlist(lapply(LIMITS, function(x) sum(DF1$PRICE[DF1$PRICE <= x])))
[1] 0 23 123 1544 15057 31921 265609
Is this what you had in mind?
This is everything all together:
LIMITS <- c(0, 10, 100, 1000, 5000, 10000, 100000)
COUNT_KEY <- unlist(lapply(LIMITS, function(x) sum(DF1$PRICE <= x)))
SUM_PRICE <- unlist(lapply(LIMITS, function(x) sum(DF1$PRICE[DF1$PRICE <= x])))
data.frame(INTERVAL = c(0, paste("UP_TO", LIMITS[-1], sep="_")), COUNT_KEY, SUM_PRICE)
      INTERVAL COUNT_KEY SUM_PRICE
1            0         2         0
2     UP_TO_10         6        23
3    UP_TO_100         8       123
4   UP_TO_1000        12      1544
5   UP_TO_5000        16     15057
6  UP_TO_10000        18     31921
7 UP_TO_100000        21    265609
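A small stylistic aside (my addition, not from the answer): sapply() simplifies to a vector directly, so the unlist(lapply(...)) calls above can be shortened to:
COUNT_KEY <- sapply(LIMITS, function(x) sum(DF1$PRICE <= x))
SUM_PRICE <- sapply(LIMITS, function(x) sum(DF1$PRICE[DF1$PRICE <= x]))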
You have to manually define your boundaries first:
X <- c(-Inf, 0, 10, 100, 1000, 5000, 10000, 100000)
Then you use cut to assign entries to your labels, and first summarise the counts and total price within each interval.
library(dplyr)
DF1 %>%
  mutate(LABELS = cut(DF1$PRICE, X, INTERVAL, include.lowest = TRUE)) %>%
  group_by(LABELS) %>%
  summarise(COUNT_KEY = n(), SUM_PRICE = sum(PRICE))
# A tibble: 7 x 3
  LABELS       COUNT_KEY SUM_PRICE
  <fct>            <int>     <dbl>
1 0                    2         0
2 UP_TO_10             4        23
3 UP_TO_100            2       100
4 UP_TO_1000           4      1421
5 UP_TO_5000           4     13513
6 UP_TO_10000          2     16864
7 UP_TO_100000         3    233688
This is close to what you want, except that the counts and SUM_PRICE should be cumulative. That can be achieved with mutate_if(is.numeric, cumsum):
DF1 %>%
  mutate(LABELS = cut(DF1$PRICE, X, INTERVAL, include.lowest = TRUE)) %>%
  group_by(LABELS) %>%
  summarise(COUNT_KEY = n(), SUM_PRICE = sum(PRICE)) %>%
  mutate_if(is.numeric, cumsum)
To give:
# A tibble: 7 x 3
  LABELS       COUNT_KEY SUM_PRICE
  <fct>            <int>     <dbl>
1 0                    2         0
2 UP_TO_10             6        23
3 UP_TO_100            8       123
4 UP_TO_1000          12      1544
5 UP_TO_5000          16     15057
6 UP_TO_10000         18     31921
7 UP_TO_100000        21    265609
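Side note (an addition beyond the original answer): in dplyr 1.0.0 and later, mutate_if() is superseded by across(), so the cumulative step can equivalently be written as:
mutate(across(where(is.numeric), cumsum))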
Okay, here's an all-in-one, tidy way to handle this using dplyr ;)
library(dplyr)
DF1 %>%
  mutate(
    INTERVAL =
      factor(
        case_when(               # create discrete variable
          PRICE == 0      ~ '0',
          PRICE <= 10     ~ 'UP_TO_10',
          PRICE <= 100    ~ 'UP_TO_100',
          PRICE <= 1000   ~ 'UP_TO_1000',
          PRICE <= 5000   ~ 'UP_TO_5000',
          PRICE <= 10000  ~ 'UP_TO_10000',
          PRICE <= 100000 ~ 'UP_TO_100000'
        ),
        levels =                 # set the factor levels
          c(
            '0',
            'UP_TO_10',
            'UP_TO_100',
            'UP_TO_1000',
            'UP_TO_5000',
            'UP_TO_10000',
            'UP_TO_100000'
          )
      )
  ) %>%
  group_by(INTERVAL) %>%         # create desired group
  summarise(                     # and summary variables
    COUNT_KEY = n(),
    SUM_PRICE = sum(PRICE)
  ) %>%
  mutate(                        # cumulative totals
    COUNT_KEY_CUM = cumsum(COUNT_KEY),
    SUM_PRICE_CUM = cumsum(SUM_PRICE)
  )
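If the list of labels grows, it could also be generated from the limits instead of typed out twice; a small sketch (my addition, using format() because paste0() can render 100000 as "1e+05"):
LIMITS <- c(0, 10, 100, 1000, 5000, 10000, 100000)
INTERVAL <- c('0', paste0('UP_TO_', format(LIMITS[-1], scientific = FALSE, trim = TRUE)))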

Filter rows based on multiple conditions using dplyr

df <- data.frame(loc.id = rep(1:2,each = 10), threshold = rep(1:10,times = 2))
For each loc.id, I want to extract the first row where threshold >= 2 and the first row where threshold >= 4. I did this:
df %>%
  group_by(loc.id) %>%
  dplyr::filter(row_number() == which.max(threshold >= 2),
                row_number() == which.max(threshold >= 4))
I expected a dataframe like this:
loc.id threshold
     1         2
     1         4
     2         2
     2         4
But it returns an empty dataframe.
Based on the condition, we can slice the rows by concatenating the two which.max indices and taking the unique result (if a group's threshold values are all greater than or equal to 4, both conditions return the same index, so unique drops the duplicate):
df %>%
  group_by(loc.id) %>%
  filter(any(threshold >= 2)) %>%  # additional check
  # slice(unique(c(which.max(threshold > 2), which.max(threshold > 4))))
  # based on the expected output
  slice(unique(c(which.max(threshold >= 2), which.max(threshold >= 4))))
# A tibble: 4 x 2
# Groups:   loc.id [2]
#   loc.id threshold
#    <int>     <int>
# 1      1         2
# 2      1         4
# 3      2         2
# 4      2         4
Note that there can be groups with no threshold values greater than or equal to 2; the filter(any(threshold >= 2)) step keeps only the groups that have at least one.
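To see why that guard matters (an illustration, not part of the original answer): which.max() on an all-FALSE logical vector still returns 1, so without the any() check a group with no qualifying rows would silently contribute its first row.
which.max(c(FALSE, FALSE, FALSE))
# [1] 1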
If this isn't what you want, assign the df below a name and use it to filter your dataset.
df %>%
  distinct() %>%
  filter(threshold == 2 | threshold == 4)
#>   loc.id threshold
#> 1      1         2
#> 2      1         4
#> 3      2         2
#> 4      2         4

Factor data frame values into quartile/decile ranges

I'm trying to create decile factors corresponding to my dataframe's values. I would like the factors to appear as a range e.g. if the value is "164" then the factored result should be "160 - 166".
In the past I would do this:
quantile(countries.Imported$Imported, seq(0,1, 0.1), na.rm = T) # display deciles
Imported.levels <- c(0, 1000, 10000, 20000, 30000, 50000, 80000) # create levels from observed deciles
Imported.labels <- c('< 1,000t', '1,000t - 10,000t', '10,000t - 20,000t', etc) # create corresponding labels
colfunc <- colorRampPalette(c('#E5E4E2', '#8290af','#512888'))
# apply factor function
Imported.colors <- colfunc(10)
names(Imported.colors) <- Imported.labels
countries.Imported$Imported.fc <- factor(
cut(countries.Imported$Imported, Imported.levels),labels = Imported.labels)
Instead, I would like to apply a function that will factor the values into decile ranges. I want to avoid manually setting factor labels since I will be running many queries and plotting maps that have discrete legends. I've created a column called Value.fc, but I cannot reformat it from "(160,166]" to "160 - 166". Please see the problematic code below:
corn_df <- corn_df %>%
  mutate(Value.fc = gtools::quantcut(Value, 10))
corn_df %>%
  select(Value, unit_desc, domain_desc, Value.fc) %>%
  head(6)
# A tibble: 6 x 4
  Value unit_desc domain_desc Value.fc
  <dbl> <chr>     <chr>       <fct>
1  164. BU / ACRE TOTAL       (160,166]
2  196. BU / ACRE TOTAL       (191,200]
3  203. BU / ACRE TOTAL       (200,230]
4  205. BU / ACRE TOTAL       (200,230]
5  172. BU / ACRE TOTAL       (171,178]
6  213. BU / ACRE TOTAL       (200,230]
You can try to use dplyr::ntile() or Hmisc::cut2().
If you're interested in where each decile of the variable starts and ends, you can use Hmisc::cut2() and stringr::str_extract_all():
require(tidyverse)
require(Hmisc)
require(stringr)
df <- data.frame(value = 1:100) %>%
  mutate(decile = cut2(value, g = 10),
         decile = factor(sapply(str_extract_all(decile, "\\d+"),
                                function(x) paste(x, collapse = "-"))))
head(df)
  value decile
1     1   1-11
2     2   1-11
3     3   1-11
4     4   1-11
5     5   1-11
6     6   1-11
If you're looking only for the decile number itself, you can use dplyr::ntile():
require(tidyverse)
df <- data.frame(value = 1:100) %>%
  mutate(decile = ntile(value, 10))
head(df)
  value decile
1     1      1
2     2      1
3     3      1
4     4      1
5     5      1
6     6      1
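To get labels in exactly the "160 - 166" style the question asks for, one could also build them directly from the decile breaks and pass them to cut(). This is a sketch of mine, assuming the numeric column is called value as above; with heavily tied data the breaks may need unique():
breaks <- quantile(df$value, probs = seq(0, 1, 0.1), na.rm = TRUE)
labs <- paste(head(breaks, -1), tail(breaks, -1), sep = " - ")  # "low - high" per decile
df$decile.fc <- cut(df$value, breaks, labels = labs, include.lowest = TRUE)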

How to apply a set of functions to each group of a grouping variable in R data.frame

I need to reshape a data.frame in R in one step.
In short, the change in values of the objects (x1 to x6) is recorded row by row (from 1990 to 1995):
> tab1[1:10, ]  # raw data; see plot for tab1
   id value year
1  x1     7 1990
2  x1    10 1991
3  x1    11 1992
4  x1     7 1993
5  x1     3 1994
6  x1     1 1995
7  x2     6 1990
8  x2     7 1991
9  x2     9 1992
10 x2     5 1993
I am able to do the reshaping step by step; does anybody know how to do it in one step?
Original data
Table 1 - note that the minimal value across all time series is 0.
Step 1:
Table 2 - rescale each time series so that its minimal value equals 0
(every series falls down onto the x-axis).
Step 2:
Table 3 - apply the diff() function to each time series.
Step 3:
Table 4 - apply the sort() function to each time series.
I hope the pictures are clear enough to follow each step.
So the final table looks like this:
> tab4[1:10, ]
   id value time
1  x1    -4    1
2  x1    -4    2
3  x1    -2    3
4  x1     1    4
5  x1     3    5
6  x2    -4    1
7  x2    -3    2
8  x2     1    3
9  x2     1    4
10 x2     2    5
# Source data:
tab1 <- data.frame(id = rep(c("x1","x2","x3","x4","x5","x6"), each = 6),
                   value = c(7,10,11,7,3,1,  6,7,9,5,2,3,  11,9,7,9,1,0,
                             1,2,2,4,7,4,  2,3,1,6,4,2,  3,5,4,3,5,6),
                   year = rep(1990:1995, times = 6))
tab2 <- data.frame(id = rep(c("x1","x2","x3","x4","x5","x6"), each = 6),
                   value = c(6,9,10,6,2,0,  4,5,7,3,0,1,  11,9,7,9,1,0,
                             0,1,1,3,6,3,  1,2,0,5,3,1,  0,2,1,0,2,3),
                   year = rep(1990:1995, times = 6))
tab3 <- data.frame(id = rep(c("x1","x2","x3","x4","x5","x6"), each = 5),
                   value = c(3,1,-4,-4,-2,  1,2,-4,-3,1,  -2,-2,2,-8,-1,
                             1,0,2,3,-3,  1,-2,5,-2,-2,  2,-1,-1,2,1),
                   time = rep(1:5, times = 6))
tab4 <- data.frame(id = rep(c("x1","x2","x3","x4","x5","x6"), each = 5),
                   value = c(-4,-4,-2,1,3,  -4,-3,1,1,2,  -8,-2,-2,-1,2,
                             -3,0,1,2,3,  -2,-2,-2,1,5,  -1,-1,1,2,2),
                   time = rep(1:5, times = 6))
Using data.table, this is simply:
require(data.table)  ## 1.9.2
ans <- setDT(tab1)[, list(value = diff(value)), by = id]  ## aggregation
setkey(ans, id, value)[, time := seq_len(.N), by = id]    ## order + add 'time' column
Note that your 'step 1' is unnecessary: your second step computes differences, and subtracting a per-series constant leaves the differences unchanged, so it is skipped here.
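A quick check of that claim (my illustration), using the first series from tab1; subtracting a constant shifts every value equally, so the successive differences are untouched:
x <- c(7, 10, 11, 7, 3, 1)
identical(diff(x - min(x)), diff(x))
# [1] TRUE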
It sounds like you want to apply a set of functions to each group of a grouping variable. There are many ways to do this in R (from base R by and tapply to add-on packages like plyr, data.table, and dplyr). I've been learning how to use package dplyr, and came up with the following solution.
require(dplyr)
tab4 <- tab1 %>%
  group_by(id) %>%                        # group by id
  mutate(value = value - min(value),      # shift group minimum to 0
         value = value - lag(value)) %>%  # lag-1 difference
  na.omit %>%                             # remove NA caused by lag-1 differencing
  arrange(id, value) %>%                  # order by value within each id
  mutate(time = 1:length(value)) %>%      # time variable from 1 to 5 based on current order
  select(-year)                           # remove year column to match final OP output
