How to Calculate A Daily Max for Different Locations in R?

So currently I am able to calculate a daily max for one site using the following code:
library(tidyverse)   # attaches dplyr and tidyr
library(data.table)
library(lubridate)

# Function arguments can't contain `$`; take the two datetime vectors directly
funcVolume <- function(enter_yard, exit_yard) {
  # +1 at every entry time, -1 at every exit time
  vecOnes <- rep(1, length(enter_yard))
  vecTime <- c(enter_yard, exit_yard)
  vecCount <- c(vecOnes, -vecOnes)
  data.frame(T = vecTime, Count = vecCount) %>%
    arrange(T) %>%
    mutate(Volume = cumsum(Count))  # running count of objects on site
}

df_test <- funcVolume(max_data$enter_yard, max_data$exit_yard)

# Attach the calendar date, then keep the daily maximum volume
df_test2 <- df_test
df_test2$date <- as.Date(format(df_test2$T, "%Y-%m-%d"))
df_test3 <- tibble(x = df_test2$Volume, y = df_test2$date) %>%
  arrange(y)
dataset <- df_test3 %>%
  group_by(y) %>%
  dplyr::filter(x == max(x)) %>%
  distinct(x, .keep_all = TRUE) %>%
  ungroup()
However, I would like to do this for multiple locations. In my original dataframe, I have a column that lists the name of the site, and two columns recording when an object enters or leaves a site. The name is just a general text column, and the other two are datetime columns. Ideally, I would want an output that looks like the following:
Date | Max Count | Site
x    | y         | z
x    | a         | b
I also have a couple million rows of data, so I need something that can run in a reasonable time frame.
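
One way to extend this to multiple sites is to build the same +1/-1 event stream per site and group by both site and date; a minimal sketch, assuming the original data frame is called max_data and the site column is named site (that column name is an assumption):

library(dplyr)

# Sketch: one event per entry (+1) and exit (-1), tagged with its site
daily_max <- bind_rows(
  max_data %>% transmute(site, T = enter_yard, Count = 1),
  max_data %>% transmute(site, T = exit_yard, Count = -1)
) %>%
  arrange(site, T) %>%
  group_by(site) %>%
  mutate(Volume = cumsum(Count)) %>%   # running count per site
  group_by(site, date = as.Date(T)) %>%
  summarise(max_count = max(Volume), .groups = "drop")

Since everything stays in one grouped pipeline rather than a per-site loop, this should remain reasonable on a few million rows.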

Related

In a longitudinal dataframe, I found the max value for each row. Now I need to retrieve the columns associated with the max, in R

I have a longitudinal dataframe that looks like this:
ID <- c(1,2,3,4,5,6)
Grade_06 <- c(1000,900,850,1200,450,600)
Age_06 <- c(17,25,36,12,22,44)
Grade_07 <- c(900,1010,423,1500,125,564)
Age_07 <- c(18,26,37,13,23,45)
Grade_08 <- c(847,1111,568,450,1300,641)
Age_08 <- c(19,27,38,14,24,46)
Grade_09 <- c(1451,820,650,740,910,1000)
Age_09 <- c(20,28,39,15,25,47)
# data.frame(), not as.data.frame(), to combine the vectors into columns
df <- data.frame(ID, Grade_06, Age_06, Grade_07, Age_07, Grade_08, Age_08, Grade_09, Age_09)
I found the max of each row using :
df <- within(df, {Best_grade=pmax(Grade_06, Grade_07, Grade_08, Grade_09, na.rm = TRUE)})
Now that the max has been found, I need to associate the max of each row with the person's age when they reached their max grade.
I really struggle with this. Can you help?
The following gives you a data frame with the ID and the Grade column name where the grade equals Best_grade.
library(tidyverse)

df %>%
  mutate(across(contains('Grade_'), function(x) x == Best_grade)) %>%
  split(df$ID) %>%
  map_dfr(function(x) {
    x_grades_only <- x %>% select(contains('Grade_'))
    tibble(ID = x$ID, Grade = names(x_grades_only)[as.logical(x_grades_only)])
  })
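
If you also need the age at the best grade rather than just the column name, reshaping to long form is one option; a minimal sketch, assuming the Grade_*/Age_* naming above:

library(tidyverse)

# Sketch: one row per (ID, year) with Grade and Age side by side,
# then keep the row with the highest grade per ID
df %>%
  pivot_longer(matches("^(Grade|Age)_"),
               names_to = c(".value", "year"),
               names_sep = "_") %>%
  group_by(ID) %>%
  slice_max(Grade, n = 1, with_ties = FALSE) %>%
  ungroup()

The Age column of the result holds each person's age in the year of their best grade.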

Reduce a data frame by combining like rows according to two qualitative factors

I have a dataframe like the following:
observations <- data.frame(
  X = c("00KS089001","00KS089001","00KS089002","00KS089002","00KS089003","00KS089003",
        "00KS105001","00KS105001","00KS177011","00KS177011","00P0006","00P006","00P006","00P006"),
  hzdept = c(0,20,0,15,0,13,0,20,0,16,0,6,13,29),
  hzdepb = c(20,30,15,30,13,30,20,30,16,30,6,13,29,30),
  Y = c("Red","White","Red","White","Green","Red","Red","Blue","Black","Black","Red","White","White","White"),
  Z = c(0.67,0.33,0.5,0.5,0.43,0.57,0.67,0.33,0.53,0.47,0.2,0.23,0.53,0.04))
I want to be able to reduce this so that any time X and Y are the same for two rows, the observations are combined, i.e.
data.frame(
  X = c("00KS089001","00KS089001","00KS089002","00KS089002","00KS089003","00KS089003",
        "00KS105001","00KS105001","00KS177011","00P0006","00P006"),
  hzdept = c(0,20,0,15,0,13,0,20,0,0,6),
  hzdepb = c(20,30,15,30,13,30,20,30,30,6,30),
  Y = c("Red","White","Red","White","Green","Red","Red","Blue","Black","Red","White"),
  Z = c(0.67,0.33,0.5,0.5,0.43,0.57,0.67,0.33,1.00,0.20,0.80))
Any suggestions on how to best go about this?
Edit: OK, now that I see how hzdept and hzdepb are supposed to be combined from your comment above:
library(tidyverse)

# Z is summed within each (X, Y) pair
df <- observations %>% count(X, Y, wt = Z, name = "Z")

# hzdept: keep the smallest top depth per (X, Y)
df_hzdept <- observations %>%
  arrange(hzdept) %>%
  distinct(X, Y, .keep_all = TRUE) %>%
  select(X, Y, hzdept)

# hzdepb: keep the largest bottom depth per (X, Y)
df_hzdepb <- observations %>%
  arrange(desc(hzdepb)) %>%
  distinct(X, Y, .keep_all = TRUE) %>%
  select(X, Y, hzdepb)

df <- df %>% left_join(df_hzdept) %>% left_join(df_hzdepb)
Using dplyr
Here is how you would group by two columns and summarise the other columns in a dataframe using the minimum, maximum, and sum:
library(magrittr) # For the pipe: %>%

observations %>%
  dplyr::group_by(X, Y) %>%
  dplyr::summarise(hzdept = min(hzdept),
                   hzdepb = max(hzdepb),
                   Z = sum(Z),
                   .groups = 'drop')
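
If the table is large, the same aggregation can be expressed in data.table, which tends to scale better; a sketch under that assumption:

library(data.table)

# Same group-and-summarise, data.table style
obs_dt <- as.data.table(observations)
obs_dt[, .(hzdept = min(hzdept),
           hzdepb = max(hzdepb),
           Z = sum(Z)),
       by = .(X, Y)]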

Find two points farthest apart using group_by() in sf

Haven't been able to find an exact Q/A to match this problem, though there are several related ones. I'm trying to calculate a distance matrix for all point groups defined by an ID column, then select the two points that are farthest apart in each group, retaining the original group id. The number of points in each group is 2, 4, or 6.
My sf df:
library(sf)
library(dplyr)

df <- data.frame(x = runif(12), y = runif(12), id = rep(1:3, each = 4)) %>%
  st_as_sf(coords = c("x", "y"), crs = 27700)
I've tried code such as:
a <- df %>%
  group_by(id) %>%
  st_distance(.)
Though this just returns a distance matrix of all points.
The below gives me what I want, though I fear it would be slow on large datasets:
maxMin <- do.call(rbind, lapply(unique(allInts$id), function(x) {
  df <- allInts %>% filter(id == x)
  d <- st_distance(df)
  df %>% slice(unique(as.vector(which(d == max(d), arr.ind = TRUE))))
}))
You can use dplyr::group_split to split your data frame into a list per group. You can then apply whatever function you want to that list using map/lapply.
The script below keeps the two points that are furthest apart in each group.
library(sf)
library(tidyverse)

# dummy data
data <- data.frame(x = runif(12), y = runif(12), id = rep(1:3, each = 4)) %>%
  st_as_sf(coords = c("x", "y"), crs = 27700)

# split it into a list per ID
data_group <- data %>%
  group_by(id) %>%
  group_split()

# apply a function to each list element
distance_per_group <- map(data_group, function(x) {
  distance_matrix <- st_distance(x)
  # row/column indices of one of the largest pairwise distances
  biggest_distance <- as.numeric(which(distance_matrix == max(distance_matrix), arr.ind = TRUE)[1, ])
  x[biggest_distance, ]
})
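
Because map() returns a list of sf objects, the per-group results can then be stacked back into a single sf data frame, for example:

# Recombine the per-group pairs, keeping the id column
farthest_points <- do.call(rbind, distance_per_group)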

Sample multiple different sample sizes using crossing() and sample_n() to create a single df

I am attempting to sample a dataframe using sample_n(). I know that sample_n() usually takes a single size= argument at a time; however, I would like to sample sizes from 2 up to the number of rows in the df. Unfortunately, the code I have compiled below does not do the job. The needed output would be a dataframe with an id column, or a list split by the id column from crossing().
df <- data.frame(Date = 1:15,
                 grp = rep(1:3, each = 5),
                 frq = rep(c(3, 2, 4), each = 5))

data_sampled_by_stratum <- df %>%
  group_by(Date) %>%
  crossing(id = seq(500)) %>% # repeat dataframes
  group_by(id) %>%
  sample_n(size = c(2:15)) %>%
  group_by(CLUSTER_ID, Date) %>%
  filter(n() > 2)
If you had a column with different sites, you could do this.
data_sampled_by_stratum <- data_grouped_by_stratum %>%
  group_by(siteid, Date) %>%
  crossing(id = seq(500)) %>% # repeat dataframes
  sample_n(rbinom(1, sum(siteid == i), (1 - s)^2)) # i and s assumed defined elsewhere
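
For the sizes-2-to-n sampling as originally described, a small purrr sketch along these lines should work; slice_sample() is the current dplyr spelling of sample_n():

library(dplyr)
library(purrr)

# Sketch: one draw of each size from 2 to nrow(df); the id column
# records the sample size each row came from
sizes <- set_names(2:nrow(df))
data_sampled <- map_dfr(sizes,
                        function(n) slice_sample(df, n = n),
                        .id = "id")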

Summarise dataframe to obtain diff (lagged difference)

I have a dataframe that I want to group and then obtain the median of the diff (lagged difference) in consistent units. It is very similar to the example below. As you can see by running the code below, I have problems because diff() has a units attribute that is not taken into account by my summarise() call.
library(tidyverse)

# Initialise random data
t <- Sys.time()
rnd <- sample(1:10000, 10, replace = FALSE)
add <- rnd[order(rnd)]

# Create 2 dataframes
time1 <- data.frame(datetime = t + add)
time2 <- data.frame(datetime = t + add * 1000)

# Bind dataframes together
mydata <- bind_rows(time1, time2, .id = "group")

# Trying to get a summary table
mydata %>%
  group_by(group) %>%
  summarise(elapsed = median(diff(datetime[order(datetime)])))

# These are the values that I should get in my summary table
median(diff(time1$datetime))
median(diff(time2$datetime))
What about using difftime and setting the units?
mydata %>%
  group_by(group) %>%
  # lag() assumes rows are already in time order within each group
  summarise(elapsed = median(difftime(datetime, lag(datetime), units = "mins"), na.rm = TRUE))
Here's one option, which will show all results in seconds. Use dminutes(1) or dhours(1) or ddays(1) if more appropriate.
mydata %>%
  group_by(group) %>%
  summarise(elapsed = median(diff(datetime[order(datetime)])) / lubridate::dseconds(1))
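
To check against the reference values, the same division puts them in seconds as well:

# Should match the grouped summary above
median(diff(time1$datetime)) / lubridate::dseconds(1)
median(diff(time2$datetime)) / lubridate::dseconds(1)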
