R: Comparing Subgroups From Different Datasets

I am working with the R programming language.
I have the following dataset that contains the heights (cm) and weights (kg) of people from Canada. I split weight into bins based on ntiles and calculated the average height within each ntile bin:
library(dplyr)
library(gtools)
set.seed(123)
canada = data.frame(height = rnorm(10000,150,10), weight = rnorm(10000,90, 10))
Part_1 = canada %>%
  mutate(quants = quantcut(weight, 100),
         rank = as.numeric(quants)) %>%
  group_by(quants) %>%
  mutate(min = min(weight), max = max(weight), count = n(), avg_height = mean(height))
Part_1 = Part_1 %>% distinct(rank, .keep_all = TRUE)
> Part_1
# A tibble: 100 x 8
# Groups: quants [100]
height weight quants rank min max count avg_height
<dbl> <dbl> <fct> <dbl> <dbl> <dbl> <int> <dbl>
1 144. 114. (110.2,113.9] 99 110. 114. 100 150.
2 148. 88.3 (88.12,88.38] 44 88.1 88.4 100 149.
3 166. 99.3 (99.1,99.52] 83 99.1 99.5 100 152.
4 151. 84.3 (84.14,84.44] 29 84.1 84.4 100 150.
For example, I see that there are 100 people in the weight range of 110.2 - 113.9 kg, and the average height of these people is 150 cm.
Now, suppose I have a similar dataset for people from the USA:
set.seed(124)
usa = data.frame(height = rnorm(10000,150,10), weight = rnorm(10000,90, 10))
My Question: Based on the weight ranges I calculated using the Canada dataset, I want to find out how many people from the USA fall within these Canadian ranges, and what the average height of the Americans within these Canadian ranges is.
For example:
In the Canada dataset, I saw that there are 100 people in the weight range of 110.2 - 113.9 kg and the average height of these people is 150 cm.
How many Americans are in the weight range of 110.2 - 113.9 kg, and what is the average height of these Americans?
I know that I can do this manually for each rank:
americans_in_canadian_rank99 = usa %>%
filter(weight > 110.2 & weight < 113.9) %>%
group_by() %>%
summarize(count = n(), avg_height = mean(height))
americans_in_canadian_rank44 = usa %>%
filter(weight > 88.1 & weight < 88.4) %>%
group_by() %>%
summarize(count = n(), avg_height = mean(height))
In the end, I would be looking for a desired output like this:
# number of rows should be = number of unique ranks
canadian_rank min_weight max_weight canadian_count canadian_avg_height american_count american_avg_height
1 99 110.2 113.9 100 150 116 150
2 44 88.1 88.4 100 149 154 150
Can someone please help me figure out a better way to do this?
Thanks!

Note: updated to match the desired output format combining the two data sets.
This can be done in a straightforward manner using the non-equi join functionality of data.table.
library(data.table)
library(gtools)
set.seed(123)
canada = data.table(height = rnorm(10000,150,10), weight = rnorm(10000,90, 10))
set.seed(124)
usa = data.table(height = rnorm(10000,150,10), weight = rnorm(10000,90, 10))
## You can also use data.table to generate your Part_1 summary table
Part_1 <- canada[, .(min = min(weight),
                     max = max(weight),
                     count = .N,
                     avg_height = mean(height)),
                 keyby = .(quants = quantcut(weight, 100))]
Part_1[, rank := as.numeric(quants)]
## Join using a non-equi join to combine data sets
usa[Part_1, on = .(weight >= min,
                   weight < max)
    ## On the join result, compute the same summary stats by quants & rank
    ][, .(usa_count = .N,
          usa_avg_height = mean(height)),
      keyby = .(rank,
                quants,
                ## whenever we do a non-equi join, the foreign key values, in this case min/max,
                ## overwrite the local keys. Since we used weight twice, the Canadian min/max
                ## will show up in the join result table as weight and weight.1
                min_weight = weight,
                max_weight = weight.1,
                ## To keep both sets of results distinct, we can rename columns in our "by" statement
                canadian_count = count,
                canadian_avg_height = avg_height)]
Gives results as follows:
rank quants min_weight max_weight canadian_count canadian_avg_height usa_count usa_avg_height
1: 1 [55.11,66.71] 55.11266 66.69011 100 149.2101 114 149.8116
2: 2 (66.71,69.48] 66.70575 69.46055 100 149.0639 119 148.6486
3: 3 (69.48,71.15] 69.48011 71.13895 100 150.5331 94 148.4336
4: 4 (71.15,72.44] 71.14747 72.43042 100 150.4779 104 149.8926
Another option is to assign the usa result columns directly back to your Part_1 summary table in place.
## This is a two-part nested join
Part_1[
  ## Start by creating a result that matches Part_1 ranks to all usa data
  Part_1[usa, on = .(min <= weight,
                     max > weight)
         ## Compute aggregated results on the join table result
         ][, .(usa_count = .N,
               usa_avg_height = mean(height)), by = .(rank)],
  ## Finally, assign results back to the Part_1 summary table joined by rank
  c("usa_count",
    "usa_avg_height") := .(usa_count,
                           usa_avg_height),
  on = .(rank)]
Gives the following
quants min max count avg_height rank usa_count usa_avg_height
1: [55.11,66.71] 55.11266 66.69011 100 149.2101 1 114 149.8116
2: (66.71,69.48] 66.70575 69.46055 100 149.0639 2 119 148.6486
3: (69.48,71.15] 69.48011 71.13895 100 150.5331 3 94 148.4336
4: (71.15,72.44] 71.14747 72.43042 100 150.4779 4 104 149.8926
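As a quick sanity check against the manual filter from the question, a small sketch (assuming the Part_1 data.table built above) reproduces the rank 99 numbers directly:
## count and average height of Americans falling inside Canadian rank 99 (sketch)
usa[weight >= Part_1[rank == 99, min] & weight < Part_1[rank == 99, max],
    .(count = .N, avg_height = mean(height))]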

With data.table you can do this:
library(data.table)
library(dplyr)   # needed for the %>% pipe and select() used below
library(stringr)
dt1 <- as.data.table(usa)
dt1 <- dt1[, c("min", "max") := weight]
dt2 <- as.data.table(Part_1 %>% select("quants", "rank"))
dt2 <- cbind(dt2[, .(rank)],
             setDT(tstrsplit(str_sub(dt2$quants, 2, -2), ",", fixed = TRUE,
                             names = c("min", "max"))))
dt2 <- dt2[, lapply(.SD, as.numeric)]
setkey(dt2, min, max)
dt1 <- dt1[, rank := dt2$rank[foverlaps(dt1, dt2,
                                        by.x = c("min", "max"),
                                        by.y = c("min", "max"),
                                        which = TRUE)$yid]] %>%
  select(-c("min", "max"))
EDIT
Totally missed the last part of the question. If you want that combined summary, it is relatively straightforward to build from the result above (you can use dplyr for this step if you prefer):
dt3 <- rbind(canada %>%
               mutate(quants = quantcut(weight, 100),
                      rank = as.numeric(quants),
                      country = "Canada") %>%
               as.data.table(),
             copy(dt1)[, country := "USA"],
             fill = TRUE)
dt3 <- dt3[, .(count = .N, avg_height = mean(height)), by = c("rank", "country")] %>%
  dcast(rank ~ country, value.var = c("count", "avg_height")) %>%
  merge(dt2 %>% rename("min_weight" = "min", "max_weight" = "max"),
        by = c("rank"), all.x = TRUE)
EDIT 2
Alternatively, you could do something similar using the cut() function, without having to learn any data.table:
rank_breaks <- Part_1 %>%
  mutate(breaks = sub(",.*", "", str_sub(quants, 2)) %>% as.numeric()) %>%
  arrange(rank) %>%
  pull(breaks)
# Here I change the minimum and maximum of groups 1 and 100 to -Inf and Inf respectively.
# If you do not wish to do so, you can disregard it and run `rank_breaks <- c(rank_breaks, max(canada$weight))` instead
rank_breaks[1] <- -Inf
rank_breaks <- c(rank_breaks, Inf)
usa <- usa %>%
  mutate(rank = cut(weight, breaks = rank_breaks, labels = c(1:100)))
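To get from there to the combined summary, one more aggregation and join is needed. A minimal sketch (assuming Part_1 from the question and the rank column created above; the american_*/canadian_* column names are only illustrative):
usa_summary <- usa %>%
  group_by(rank) %>%
  summarize(american_count = n(), american_avg_height = mean(height)) %>%
  mutate(rank = as.numeric(as.character(rank)))  # cut() returns a factor
Part_1 %>%
  ungroup() %>%
  distinct(rank, min_weight = min, max_weight = max,
           canadian_count = count, canadian_avg_height = avg_height) %>%
  left_join(usa_summary, by = "rank")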

You can use fuzzyjoin for this.
library(fuzzyjoin)
# take percentile ranges and join US data
us_by_canadian_quantiles <- Part_1 |>
  ungroup() |>
  distinct(rank, min, max, height_avg_can = avg_height) |>
  fuzzy_full_join(usa, by = c(min = "weight", max = "weight"), match_fun = c(`<`, `>=`))
# get count and height average per bin
us_by_canadian_quantiles |>
  group_by(rank) |>
  summarize(n_us = n(),
            height_avg_us = mean(height),
            height_avg_can = first(height_avg_can))
#> # A tibble: 101 × 4
#> rank n_us height_avg_us height_avg_can
#> <dbl> <int> <dbl> <dbl>
#> 1 1 114 150. 149.
#> 2 2 119 149. 149.
#> 3 3 94 148. 151.
#> 4 4 104 150. 150.
#> 5 5 115 152. 150.
#> 6 6 88 150. 149.
#> 7 7 86 150. 150.
#> 8 8 86 150. 151.
#> 9 9 102 151. 151.
#> 10 10 81 152. 150.
#> # … with 91 more rows
Note that there are a number of cases in the US frame which fall outside of the Canadian percentile ranges. They are grouped together here with rank being NA, but you could also add ranks 0 and 101 if you wanted to distinguish them.
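If you did want to separate them, a minimal sketch (assuming the objects above; the 0/101 labels are only illustrative) could compare the unmatched weights against the overall Canadian range:
canadian_lo <- min(Part_1$min)
canadian_hi <- max(Part_1$max)
us_by_canadian_quantiles |>
  mutate(rank = case_when(
    !is.na(rank)          ~ rank,
    weight <  canadian_lo ~ 0,    # below the lowest Canadian bin
    weight >= canadian_hi ~ 101   # above the highest Canadian bin
  ))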
I should note that fuzzyjoin tends to be much slower than data.table. But since you have already gotten a data.table solution, this might be more to your liking.

Related

Aggregating a dataframe with dplyr in R based on several dummy variables

I am using dplyr to aggregate my dataframe, so it shows percentages of people choosing specific protein design tasks by company size. I have different dummy variables for protein design tasks, because this was a multiple choice question in a survey.
I figured out a way to do this, but my code is very long, because I aggregate the data per task and then join all these separate dataframes together into one. I’m curious whether there is a more elegant (shorter) way to do this?
library(tidyverse)
EarlyAccess <- read_csv("https://dropbox.com/s/antzwk1jh4ldrhi/EarlyAccess_anon.csv?dl=1")
#################### STABILITY ################################################
Proportions_tasks_stability <- EarlyAccess %>%
select(size, Improving.stability..generic..thermal..pH.) %>%
group_by(size, Improving.stability..generic..thermal..pH.) %>%
summarise(count_var_stability=n())%>%
mutate(total_group_by_size = sum(count_var_stability)) %>%
mutate(pc_var_stability=count_var_stability/sum(count_var_stability)*100) %>%
filter(Improving.stability..generic..thermal..pH.=="Improving stability (generic, thermal, pH)") %>%
select(size, Improving.stability..generic..thermal..pH., pc_var_stability)
######################## ACTIVITY #############################################
Proportions_tasks_activity <- EarlyAccess %>%
select(size, Improving.activity ) %>%
group_by(size, Improving.activity) %>%
summarise(count_var_activity=n())%>%
mutate(total_group_by_size = sum(count_var_activity)) %>%
mutate(pc_var_activity=count_var_activity/sum(count_var_activity)*100) %>%
filter(Improving.activity=="Improving activity") %>%
select(size, Improving.activity, pc_var_activity)
######################## BINDING AFFINITY ######################################
Proportions_tasks_binding.affinity<- EarlyAccess %>%
select(size, Improving.binding.affinity ) %>%
group_by(size, Improving.binding.affinity) %>%
summarise(count_var_binding.affinity=n())%>%
mutate(total_group_by_size = sum(count_var_binding.affinity)) %>%
mutate(pc_var_binding.affinity=count_var_binding.affinity/sum(count_var_binding.affinity)*100) %>%
filter(Improving.binding.affinity=="Improving binding affinity") %>%
select(size, Improving.binding.affinity, pc_var_binding.affinity)
# Then join them
Protein_design_tasks <- Proportions_tasks_stability %>%
inner_join(Proportions_tasks_activity, by = "size") %>%
inner_join(Proportions_tasks_binding.affinity, by = "size")
Using the datafile you provided, this should give the percentages of the selected category within each column for each size:
library(tidyverse)
df <-
read_csv("https://dropbox.com/s/antzwk1jh4ldrhi/EarlyAccess_anon.csv?dl=1")
df |>
  group_by(size) |>
  summarise(
    pc_var_stability = sum(
      Improving.stability..generic..thermal..pH. == "Improving stability (generic, thermal, pH)",
      na.rm = TRUE
    ) / n() * 100,
    pc_var_activity = sum(Improving.activity == "Improving activity",
                          na.rm = TRUE) / n() * 100,
    pc_var_binding.affinity = sum(
      Improving.binding.affinity == "Improving binding affinity",
      na.rm = TRUE
    ) / n() * 100
  )
#> # A tibble: 7 × 4
#> size pc_var_stability pc_var_activity pc_var_binding.affinity
#> <chr> <dbl> <dbl> <dbl>
#> 1 1000-10000 43.5 47.8 34.8
#> 2 10000+ 65 65 70
#> 3 11-50 53.8 53.8 46.2
#> 4 2-10 51.1 46.8 46.8
#> 5 200-1000 64.7 52.9 52.9
#> 6 50-200 42.1 42.1 36.8
#> 7 Just me 48.5 39.4 54.5
Looking at your data, each column has either the string value you're testing for or NA, so you could make it even shorter/tidier just by counting non-NAs in relevant columns:
df |>
group_by(size) |>
summarise(across(
c(
Improving.stability..generic..thermal..pH.,
Improving.activity,
Improving.binding.affinity
),
\(val) 100 * sum(!is.na(val)) / n()
))
If what you're aiming to do is summarise across all columns, the latter method may work best; there are several ways of specifying which columns you want, so you don't necessarily need to type out every name and value. You might also find it clearest to make calculating and formatting the percentages a named function to call:
library(tidyverse)
df <-
read_csv("https://dropbox.com/s/antzwk1jh4ldrhi/EarlyAccess_anon.csv?dl=1",
show_col_types = FALSE)
perc_nonmissing <- function(val) {
sprintf("%.1f%%", 100 * sum(!is.na(val)) / n())
}
df |>
group_by(size) |>
summarise(across(-c(1:2), perc_nonmissing))
#> # A tibble: 7 × 12
#> size Disco…¹ Searc…² Under…³ Impro…⁴ Impro…⁵ Impro…⁶ Impro…⁷ Impro…⁸ Impro…⁹
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 1000-… 21.7% 17.4% 43.5% 47.8% 39.1% 43.5% 30.4% 39.1% 39.1%
#> 2 10000+ 40.0% 55.0% 55.0% 65.0% 70.0% 65.0% 20.0% 30.0% 40.0%
#> 3 11-50 30.8% 26.9% 42.3% 53.8% 38.5% 53.8% 15.4% 30.8% 38.5%
#> 4 2-10 38.3% 40.4% 48.9% 46.8% 36.2% 51.1% 23.4% 31.9% 42.6%
# etc.
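For instance, assuming the task columns all share the Improving. prefix (an assumption based on the truncated names above), a tidyselect helper avoids typing them out:
df |>
  group_by(size) |>
  summarise(across(starts_with("Improving"), perc_nonmissing))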

Sum of column based on a condition in R

I would like to print out the total amount for each date, so that my new dataframe will have date and total amount columns.
My data frame looks like this:
permitnum   amount
6/1/2022    na
ascas       30.00
olic        40.41
6/2/2022    na
avrey       17.32
fev         32.18
grey        12.20
Any advice on how to go about this will be appreciated.
Here is another tidyverse option: convert permitnum to a date (and reformat it), fill the date downward so it can be used to group, then get the sum for each date.
library(tidyverse)
df %>%
  mutate(permitnum = format(as.Date(permitnum, "%m/%d/%Y"), "%m/%d/%Y")) %>%
  fill(permitnum, .direction = "down") %>%
  group_by(permitnum) %>%
  summarise(total_amount = sum(as.numeric(amount), na.rm = TRUE))
Output
permitnum total_amount
<chr> <dbl>
1 06/01/2022 70.4
2 06/02/2022 61.7
Data
df <- structure(list(permitnum = c("6/1/2022", "ascas", "olic", "6/2/2022",
"avrey", "fev", "grey"), amount = c("na", "30.00", "40.41", "na",
"17.32", "32.18", "12.20")), class = "data.frame", row.names = c(NA,
-7L))
Here is an option: split the data at each row whose permitnum contains a date (detected by the presence of digits), then summarise the amount within each group and combine it with that group's date.
library(tidyverse)
dat <- read_table("permitnum amount
6/1/2022 na
ascas 30.00
olic 40.41
6/2/2022 na
avrey 17.32
fev 32.18
grey 12.20")
dat |>
group_split(id = cumsum(grepl("\\d", permitnum))) |>
map_dfr(\(x){
date <- x$permitnum[[1]]
x |>
slice(-1) |>
summarise(date = date,
total_amount = sum(as.numeric(amount)))
})
#> # A tibble: 2 x 2
#> date total_amount
#> <chr> <dbl>
#> 1 6/1/2022 70.4
#> 2 6/2/2022 61.7

Efficiently summarize data over many different time periods by additional criteria (person and location) in R

I have pollution measures by day and location. I have a population of people for which I want to measure pollution exposure. Each person has a location and a period of time during which they were in that location.
For each person in my dataset, I need to sum up the pollution values from their location over their time period, and also count the number of missing pollution measurements.
The table structures are the following:
ids start_dates end_dates zips
1 1 2000-10-10 2001-02-18 45108
2 2 2000-11-11 2001-04-07 45190
3 3 2000-03-05 2000-06-27 45117
4 4 2001-02-04 2001-06-09 45142
5 5 2000-03-16 2000-07-13 45197
6 6 1999-12-15 2000-04-27 45060
exposure_day exposure_zip exposure_value
1 1999-06-26 45108 14
2 1999-06-27 45108 27
3 1999-06-28 45108 22
4 1999-06-29 45108 4
5 1999-06-30 45108 26
6 1999-07-01 45108 20
Desired output:
ids start_dates end_dates zips exposure_sum na_count
1: 1 2000-10-10 2001-02-18 45108 3188 5
2: 2 2000-11-11 2001-04-07 45190 3789 1
3: 3 2000-03-05 2000-06-27 45117 2917 3
4: 4 2001-02-04 2001-06-09 45142 2969 2
5: 5 2000-03-16 2000-07-13 45197 2860 3
6: 6 1999-12-15 2000-04-27 45060 3497 2
My current solution is quite slow. I would like to find a more efficient solution, so that I can efficiently perform this calculation for roughly 1,000,000 people.
Below is the code to simulate my data and my current solution.
set.seed(123)
library(lubridate)
library(data.table)
# Make person dataframe
n = 1000 # sample size
ids = c(1:n)
end_dates = sample(seq(as.Date('2000-01-01'), as.Date('2002-01-01'), by="day"), n, replace = T)
time_intervals = sample(seq(100, 200), n, replace = T)
start_dates = end_dates - time_intervals
zips = sample(seq(45000, 45200), n, replace = T)
person_df = data.frame(ids, start_dates, end_dates, zips)
# Make exposure dataframe
ziplist = unique(zips)
nzips = length(ziplist)
ndays = as.numeric(as.Date(max(person_df$end_dates)) - as.Date(min(person_df$start_dates)) + 1)
exposure_dates = seq(as.Date(min(person_df$start_dates)), as.Date(max(person_df$end_dates)), by = 'day')
exposure_day = rep(exposure_dates, nzips)
exposure_zip = rep(ziplist, each = ndays)
exposure_value = sample(c(NA, 1:50), length(exposure_day), replace = T)
exposure_df = data.frame(exposure_day, exposure_zip, exposure_value)
# convert to datatable
person_dt = data.table(person_df)
exposure_dt = data.table(exposure_df)
#summarize
summary_dt = person_dt[, ":="(exposure_sum = .(sum(exposure_dt[exposure_day >= start_dates & exposure_day <= end_dates & exposure_zip == zips, exposure_value], na.rm = T)),
                              na_count = .(sum(is.na(exposure_dt[exposure_day >= start_dates & exposure_day <= end_dates & exposure_zip == zips, exposure_value])))),
                       by = 'ids'][]
EDIT --- added a variation of #langtang's clever approach, which brings an n = 1M run down to about 4 seconds with dplyr.
This dplyr approach is about 40x faster at n=1000, 50x at n=10k, and 60x faster at n=100k, with the same output. The main gain is from translating the non-equi join into a left join by expanding person_df to have one row for each exposure_day in each ids range. That upfront step of expanding all the dates can be done once to make the subsequent joins dramatically faster.
It takes about 2 minutes when I run this for n=1,000,000, which I presume would take around 2 hours using the original code. I imagine further improvements could be made by porting to data.table or collapse if that isn't fast enough.
person_df %>%
  group_by(ids, exposure_zip = zips) %>%
  summarize(exposure_day = seq.Date(start_dates, end_dates, by = "day"), .groups = "drop") %>%
  left_join(exposure_df) %>%
  group_by(ids) %>%
  summarize(exposure_sum = sum(exposure_value, na.rm = TRUE),
            na_count = sum(is.na(exposure_value))) %>%
  # optional: add the start_dates, end_dates, zips columns back
  left_join(person_df)
Update: porting to data.table using dtplyr improved the 1M row test slightly, to 100 seconds on my machine.
library(dtplyr)
person_df %>%
lazy_dt() %>%
group_by(ids, exposure_zip = zips) %>%
summarize(exposure_day = seq.Date(start_dates, end_dates, by = "day"), .groups = "drop") %>%
left_join(exposure_df) %>%
group_by(ids) %>%
summarize(exposure_sum = sum(exposure_value, na.rm = TRUE),
na_count = sum(is.na(exposure_value)), .groups = "drop") %>%
left_join(person_df) %>%
collect()
#langtang's approach is clever, by recognizing that the sum over each id's range can be done more efficiently by subtracting the cumulative value at the end of the range from the cumulative value the day before the start of the range. This improves the time for n=1M to 4 seconds, even using dplyr's slower aggregate calculations.
exposure_df_cuml <- exposure_df %>%
  group_by(zips = exposure_zip) %>%
  transmute(value = exposure_day,
            expo_cuml = cumsum(coalesce(exposure_value, 0)),
            expo_na_cuml = cumsum(is.na(exposure_value))) %>%
  ungroup()
person_df %>%
  tidyr::pivot_longer(ends_with("dates")) %>%
  mutate(value = value - if_else(name == "start_dates", 1, 0)) %>%
  left_join(exposure_df_cuml) %>%
  mutate(across(starts_with("expo"), ~if_else(name == "start_dates", -.x, .x))) %>%
  group_by(ids, zips) %>%
  summarize(across(starts_with("expo"), sum), .groups = "drop")
You should be able to get the runtime for one million ids down to a couple of seconds.
The trick here is to use cumulative sums across the entire range of days, and then subtract the cumulative sum at the end date for an id from the cumulative sum at the start date for an id. This is quite fast, because it doesn’t require any expansion of rows and apart from the direct merge, doesn’t require any grouping by ids:
Step 1: Create the cumulative sum of exposure values (cval) and number of NA values (nas)
exposure_dt[order(exposure_day), `:=`(
  cval = cumsum(fifelse(is.na(exposure_value), 0, exposure_value)),
  nas  = cumsum(is.na(exposure_value))
), exposure_zip]
Step 2: Simply melt the person_dt frame, and do direct merge on the exposure_dt frame. Make sure to subtract the exposure value from the cumulative sum if this is a start day and exposure value is not NA; similarly subtract one from nas if this is a start day and exposure value is NA.
k <- melt(person_dt, id.vars = c("ids", "zips")) %>%
  .[exposure_dt, on = .(zips == exposure_zip, value = exposure_day), nomatch = 0] %>%
  .[variable == "start_dates" & is.na(exposure_value), nas := nas - 1] %>%
  .[variable == "start_dates" & !is.na(exposure_value), cval := cval - exposure_value] %>%
  .[order(ids, value)]
Step 3: Simply subtract the odd rows from the even rows, and cbind the result to the person_dt
cbind(
person_dt,
k[seq(2,.N,2),.(cval,nas)] - k[seq(1,.N,2),.(cval,nas)]
)
All of this takes 0.08 seconds on my machine with the original 1000 ids dataset. If I set n=1000000, it takes about 1.1 seconds.
Output:
ids start_dates end_dates zips cval nas
<int> <Date> <Date> <int> <num> <int>
1: 1 2000-10-10 2001-02-18 45108 3188 5
2: 2 2000-11-11 2001-04-07 45190 3789 1
3: 3 2000-03-05 2000-06-27 45117 2917 3
4: 4 2001-02-04 2001-06-09 45142 2969 2
5: 5 2000-03-16 2000-07-13 45197 2860 3
---
996: 996 2000-02-21 2000-07-29 45139 4250 2
997: 997 2000-02-02 2000-07-15 45074 4407 4
998: 998 2001-07-29 2001-11-15 45139 2686 3
999: 999 2001-09-10 2001-12-20 45127 2581 1
1000: 1000 2000-10-15 2001-05-01 45010 4941 2

Calculating distance between all locations to first location, by group

I have GPS locations from several seabird tracks, each starting from colony x. Therefore the individual tracks all have similar first locations. For each track, I would like to calculate the beeline distance between each GPS location and either (a) a specified location that represents the location of colony x, or (b) the first GPS point of a given track which represents the location of colony x. For (b), I would look to use the first location of each new track ID (track_id).
I have looked for appropriate functions in geosphere, sp, raster, adehabitatLT, move, ... and just cannot seem to find what I am looking for.
I can calculate the distance between successive GPS points, but that is not what I need.
library(dplyr)
library(geosphere)   # for distVincentyEllipsoid / distHaversine
df %>%
  group_by(ID) %>%
  mutate(lat_prev = lag(Lat, 1), lon_prev = lag(Lon, 1)) %>%
  mutate(dist = distVincentyEllipsoid(matrix(c(lon_prev, lat_prev), ncol = 2), # or use distHaversine
                                      matrix(c(Lon, Lat), ncol = 2)))
#example data:
df <- data.frame(Lon = c(-96.8, -96.60861, -96.86875, -96.14351, -92.82518, -90.86053, -90.14208, -84.64081, -83.7, -82, -80, -88.52732, -94.46049,-94.30, -88.60, -80.50, -81.70, -83.90, -84.60, -90.10, -90.80, -92.70, -96.10, -96.55, -96.50, -96.00),
Lat = c(25.38657, 25.90644, 26.57339, 27.63348, 29.03572, 28.16380, 28.21235, 26.71302, 25.12554, 24.50031, 24.89052, 30.16034, 29.34550, 29.34550, 30.16034, 24.89052, 24.50031, 25.12554, 26.71302, 28.21235, 28.16380, 29.03572, 27.63348, 26.57339, 25.80000, 25.30000),
ID = c(rep("ID1", 13), rep("ID2", 13)))
Grateful for any pointers.
You were pretty close. The key is that you want to calculate the distance from the first observation in each track. Therefore you first need to add the observation order within each track (easy to do with dplyr::row_number()). Then, for the distance calculation, make the reference observation always the first one by subsetting with order == 1.
library(tidyverse)
library(geosphere)
df <- data.frame(Lon = c(-96.8, -96.60861, -96.86875, -96.14351, -92.82518, -90.86053, -90.14208, -84.64081, -83.7, -82, -80, -88.52732, -94.46049,-94.30, -88.60, -80.50, -81.70, -83.90, -84.60, -90.10, -90.80, -92.70, -96.10, -96.55, -96.50, -96.00),
Lat = c(25.38657, 25.90644, 26.57339, 27.63348, 29.03572, 28.16380, 28.21235, 26.71302, 25.12554, 24.50031, 24.89052, 30.16034, 29.34550, 29.34550, 30.16034, 24.89052, 24.50031, 25.12554, 26.71302, 28.21235, 28.16380, 29.03572, 27.63348, 26.57339, 25.80000, 25.30000),
ID = c(rep("ID1", 13), rep("ID2", 13)))
df %>%
group_by(ID) %>%
mutate(order = row_number()) %>%
mutate(dist = distVincentyEllipsoid(matrix(c(Lon[order == 1], Lat[order == 1]), ncol = 2),
matrix(c(Lon, Lat), ncol = 2)))
#> # A tibble: 26 x 5
#> # Groups: ID [2]
#> Lon Lat ID order dist
#> <dbl> <dbl> <chr> <int> <dbl>
#> 1 -96.8 25.4 ID1 1 0
#> 2 -96.6 25.9 ID1 2 60714.
#> 3 -96.9 26.6 ID1 3 131665.
#> 4 -96.1 27.6 ID1 4 257404.
#> 5 -92.8 29.0 ID1 5 564320.
#> 6 -90.9 28.2 ID1 6 665898.
#> 7 -90.1 28.2 ID1 7 732131.
#> 8 -84.6 26.7 ID1 8 1225193.
#> 9 -83.7 25.1 ID1 9 1319482.
#> 10 -82 24.5 ID1 10 1497199.
#> # ... with 16 more rows
Created on 2022-01-09 by the reprex package (v2.0.1)
This also seems to work (sent to me by a friend) - very similar to Dan's suggestion above, but slightly different
library(geosphere)
library(dplyr)
df %>%
group_by(ID) %>%
mutate(Dist_to_col = distHaversine(c(Lon[1], Lat[1]),cbind(Lon,Lat)))
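For option (a), a fixed colony location, a minimal sketch along the same lines (colony_lon and colony_lat are placeholder values you would replace with the real colony coordinates):
library(geosphere)
library(dplyr)
colony_lon <- -96.8      # placeholder: longitude of colony x
colony_lat <- 25.38657   # placeholder: latitude of colony x
df %>%
  mutate(dist_to_colony = distHaversine(c(colony_lon, colony_lat), cbind(Lon, Lat)))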

How can I delineate multiple watersheds in R using the streamstats package?

There is an R package in development that I would like to use called streamstats. What it does is delineate a watershed (within the USA) for a latitude & longitude point along a body of water and provides watershed characteristics such as drainage area and proportions of various land covers. What I would like to do is extract some watershed characteristics of interest from a data frame of several lat & long positions.
I can get the package to do what I want for one point
devtools::install_github("markwh/streamstats")
library(streamstats)
setTimeout(120)
dat1 <- data.frame(matrix(ncol = 3, nrow = 3))
x <- c("state","lat","long")
colnames(dat1) <- x
dat1$state <- c("NJ","NY","VA")
dat1$lat <- c(40.99194,42.02458,38.04235)
dat1$long <- c(-74.28000,-75.11928,-79.88144)
test_dat <- dat1[1,]
ws1 <- delineateWatershed(xlocation = test_dat$long, ylocation = test_dat$lat, crs = 4326,
includeparameters = "true", includeflowtypes = "true")
chars1 <- computeChars(workspaceID = ws1$workspaceID, rcode = "MA")
chars1$parameters
However what I would like is to be able to give the delineateWatershed function several watersheds at once (i.e., all 3 locations found in dat1) and combine the chars1$parameters output variables DRNAREA,FOREST,LC11DEV, and LC11IMP into a data frame. Maybe this could be achieved with a for loop?
The ideal output would look like this
state lat long DRNAREA FOREST LC11DEV LC11IMP
1 NJ 40.99194 -74.28000 160 66.2 26.20 5.50
2 NY 42.02458 -75.11928 457 89.3 2.52 0.18
3 VA 38.04235 -79.88144 158 NA 4.63 0.20
I would put what you have in a function then use purrr::pmap_df() to loop through each row in dat1 then bind all the results together. See also this answer
library(dplyr)
library(purrr)
library(tidyr)
library(streamstats)
setTimeout(120)
dat1 <- data.frame(matrix(ncol = 3, nrow = 2))
colnames(dat1) <- c("state", "lat", "long")
dat1$state <- c("NJ", "NY")
dat1$lat <- c(40.99194, 42.02458)
dat1$long <- c(-74.28000, -75.11928)
dat1
#> state lat long
#> 1 NJ 40.99194 -74.28000
#> 2 NY 42.02458 -75.11928
Define a function for catchment delineation
catchment_delineation <- function(rcode_in, lat_y, long_x) {
  print(paste0("Processing for lat = ", lat_y, " and long = ", long_x))
  ws <- delineateWatershed(xlocation = long_x, ylocation = lat_y, crs = 4326,
                           includeparameters = "true", includeflowtypes = "true")
  ws_properties <- computeChars(workspaceID = ws$workspaceID, rcode = rcode_in)
  # keep only what we need
  ws_properties_df <- ws_properties$parameters %>%
    filter(code %in% c("DRNAREA", "FOREST", "LC11DEV", "LC11IMP")) %>%
    mutate(ID = ws$workspaceID,
           state = rcode_in,
           long = long_x,
           lat = lat_y)
  return(ws_properties_df)
}
Apply the function to each row in dat1 data frame
catchment_df <- pmap_df(dat1, ~ catchment_delineation(..1, ..2, ..3))
#> https://streamstats.usgs.gov/streamstatsservices/watershed.geojson?rcode=NJ&xlocation=-74.28&ylocation=40.99194&includeparameters=true&includeflowtypes=true&includefeatures=true&crs=4326https://streamstats.usgs.gov/streamstatsservices/parameters.json?rcode=NJ&workspaceID=NJ20210923064141811000&includeparameters=truehttps://streamstats.usgs.gov/streamstatsservices/watershed.geojson?rcode=NY&xlocation=-75.11928&ylocation=42.02458&includeparameters=true&includeflowtypes=true&includefeatures=true&crs=4326https://streamstats.usgs.gov/streamstatsservices/parameters.json?rcode=NY&workspaceID=NY20210923064248530000&includeparameters=true
catchment_df
#> ID name
#> 1 NJ20210923064141811000 Drainage Area
#> 2 NJ20210923064141811000 Percent Forest
#> 3 NJ20210923064141811000 Percent Developed from NLCD2011
#> 4 NJ20210923064141811000 Percent_Impervious_NLCD2011
#> 5 NY20210923064248530000 Drainage Area
#> 6 NY20210923064248530000 Percent Forest
#> 7 NY20210923064248530000 Percent Developed from NLCD2011
#> 8 NY20210923064248530000 Percent_Impervious_NLCD2011
#> description
#> 1 Area that drains to a point on a stream
#> 2 Percentage of area covered by forest
#> 3 Percentage of developed (urban) land from NLCD 2011 classes 21-24
#> 4 Average percentage of impervious area determined from NLCD 2011 impervious dataset
#> 5 Area that drains to a point on a stream
#> 6 Percentage of area covered by forest
#> 7 Percentage of developed (urban) land from NLCD 2011 classes 21-24
#> 8 Average percentage of impervious area determined from NLCD 2011 impervious dataset
#> code unit value state long lat
#> 1 DRNAREA square miles 160.00 NJ -74.28000 40.99194
#> 2 FOREST percent 66.20 NJ -74.28000 40.99194
#> 3 LC11DEV percent 26.20 NJ -74.28000 40.99194
#> 4 LC11IMP percent 5.50 NJ -74.28000 40.99194
#> 5 DRNAREA square miles 457.00 NY -75.11928 42.02458
#> 6 FOREST percent 89.30 NY -75.11928 42.02458
#> 7 LC11DEV percent 2.52 NY -75.11928 42.02458
#> 8 LC11IMP percent 0.18 NY -75.11928 42.02458
Reshape the result to desired format
catchment_reshape <- catchment_df %>%
select(state, long, lat, code, value) %>%
pivot_wider(names_from = code,
values_from = value)
catchment_reshape
#> # A tibble: 2 x 7
#> state long lat DRNAREA FOREST LC11DEV LC11IMP
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 NJ -74.3 41.0 160 66.2 26.2 5.5
#> 2 NY -75.1 42.0 457 89.3 2.52 0.18
Created on 2021-09-22 by the reprex package (v2.0.1)
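Since delineateWatershed calls a web service, individual requests can fail or time out. An optional sketch (not part of the original answer) wraps the helper with purrr::possibly() so one failed request doesn't abort the whole loop:
## optional: return NULL for failed requests instead of stopping (sketch)
safe_delineation <- possibly(catchment_delineation, otherwise = NULL)
catchment_df <- pmap_df(dat1, ~ safe_delineation(..1, ..2, ..3))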
Since you mentioned the use of a for loop, I thought why not make a solution out of it.
Here is your data:
library(dplyr)
library(purrr)
library(tidyr)
library(streamstats)
setTimeout(120)
dat1 <- data.frame(matrix(ncol = 3, nrow = 2))
colnames(dat1) <- c("state", "lat", "long")
dat1$state <- c("NJ", "NY")
dat1$lat <- c(40.99194, 42.02458)
dat1$long <- c(-74.28000, -75.11928)
dat1
Create an empty list to store the watershed characteristics:
water_shed <- list()
Loop through the dat1 and return the properties for each respective longitude and latitude:
for(i in 1:nrow(dat1)){
water_shed[[i]] <-
delineateWatershed(xlocation = dat1$long[i], ylocation = dat1$lat[i], crs = 4326,
includeparameters = "true", includeflowtypes = "true")
}
Now create a list to store the watershed properties:
ws_properties <- list()
Loop through the water_shed returning the parameters of each location:
for(i in 1:length(water_shed)){
  ws_properties[[i]] <- computeChars(workspaceID = water_shed[[i]][[1]], rcode = dat1$state[i])
}
Finally, create a dataframe for your desired outputs then append the properties for each location looping through the list of watershed properties:
# data frame:
ws_properties_df <- data.frame(state=character(),long=integer(), lat=integer(),
DRNAREA = integer(), FOREST = integer(), LC11DEV = integer(), LC11IMP = integer(),
stringsAsFactors=FALSE)
# append properties for each location
for(i in 1:length(ws_properties)){
ws_properties_df[i,] <- ws_properties[[i]]$parameters %>%
filter(code %in% c("DRNAREA", "FOREST", "LC11DEV", "LC11IMP")) %>%
mutate(state = dat1$state[i],
long = dat1$long[i],
lat = dat1$lat[i]) %>%
select(state, long, lat, code, value) %>%
pivot_wider(names_from = code,
values_from = value)
}
Desired output: a data frame with columns state, long, lat, DRNAREA, FOREST, LC11DEV, and LC11IMP, matching the format shown in the question.
