Truncate dataset when variable falls below threshold in R

I wish to find the day on which the variable hb first falls below 90 for each id. I can find the day on which hb reaches its minimum:
f <- function(x) 1:length(x) <= which.min(x)
ind <- as.logical(ave(df$hb, df$id, FUN = f))
dfhb <- df[ind, ]
maxday <- dfhb %>% group_by(id) %>% summarise(daymax = last(day))
However, I can't get the same approach to work for hb < 90:
f2 <- function(x) 1:length(x) <= which(x<=90)
ind <- as.logical(ave(df$hb, df$id, FUN=f2))
dfhb <- df[ind, ]
maxday <- dfhb %>% group_by(id) %>% summarise(daymax = last(day))
summary(maxday$daymax)
I would be very grateful for your advice, bw Annemarie
Sample data:
id day hb
1 1 95
1 2 NA
1 3 91
1 4 89
2 1 98
2 2 87
2 3 84
3 1 89
3 2 92
3 3 89

Here is one way to obtain, for each id, the first observation that satisfies your threshold:
df %>%
  group_by(id) %>%
  summarise(daymax = which(hb < 90)[1])
# A tibble: 3 × 2
# id daymax
# <int> <int>
#1 1 4
#2 2 2
#3 3 1
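Note that which(hb < 90)[1] returns the row position within each group; it equals the day here only because day runs 1, 2, 3, … without gaps, so day[which(hb < 90)[1]] is the safer choice in general. As for the truncation attempt, f2 fails because which(x <= 90) returns every matching index rather than just the first. A minimal sketch of a corrected version, assuming ids that never drop below 90 should be excluded entirely:

f2 <- function(x) {
  first_low <- which(x < 90)[1]  # position of first value below 90 (NA if none)
  if (is.na(first_low)) rep(FALSE, length(x)) else seq_along(x) <= first_low
}
ind <- as.logical(ave(df$hb, df$id, FUN = f2))
dfhb <- df[ind, ]  # rows up to and including the first hb < 90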

Related

Binding rows based on common id

I have a very simple case where I want to combine several data frames into one, based on a common id column and keeping only the ids present in one particular data frame.
Example:
id <- c(1, 2, 3)
x <- c(10, 12, 14)
data1 <- data.frame(id, x)
id <- c(2, 3)
x <- c(20, 22)
data2 <- data.frame(id, x)
id <- c(1, 3)
x <- c(30, 32)
data3 <- data.frame(id, x)
Which gives us,
$data1
id x
1 1 10
2 2 12
3 3 14
$data2
id x
1 2 20
2 3 22
$data3
id x
1 1 30
2 3 32
Now, I want to combine all three data frames based on the ids of data3. The expected output should look like
> comb
id x
1 1 10
2 1 NA
3 1 30
4 3 14
5 3 22
6 3 32
I am trying the following, but not getting the expected output.
library(dplyr)
library(tidyr)
combined <- bind_rows(data1, data2, data3, .id = "id") %>% arrange(id)
Any idea how to get the expected output?
Does this work:
library(dplyr)
library(tidyr)
data1 %>%
  full_join(data2, by = 'id') %>%
  full_join(data3, by = 'id') %>%
  arrange(id) %>%
  right_join(data3, by = 'id') %>%
  pivot_longer(cols = -id) %>%
  select(-name) %>%
  distinct()
# A tibble: 6 x 2
id value
<dbl> <dbl>
1 1 10
2 1 NA
3 1 30
4 3 14
5 3 22
6 3 32
Combine the three data frames into one and use filter to keep only the ids present in the third data frame.
library(dplyr)
library(tidyr)
bind_rows(data1, data2, data3, .id = "new_id") %>%
  filter(id %in% id[new_id == 3]) %>%
  complete(new_id, id)
# new_id id x
# <chr> <dbl> <dbl>
#1 1 1 10
#2 1 3 14
#3 2 1 NA
#4 2 3 22
#5 3 1 30
#6 3 3 32
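To match the expected two-column output exactly, the helper column can be dropped at the end (a small addition to the answer above):

bind_rows(data1, data2, data3, .id = "new_id") %>%
  filter(id %in% id[new_id == 3]) %>%
  complete(new_id, id) %>%
  select(id, x)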
A pure base R solution also works:
lst <- list(data1, data2, data3)
reshape(
subset(
reshape(
do.call(rbind, Map(cbind, lst, grp = seq_along(lst))),
idvar = "id",
timevar = "grp",
direction = "wide"
),
id %in% lst[[3]]$id
),
idvar = "id",
varying = -1,
direction = "long"
)[c("id", "x")]
which gives
id x
1.1 1 10
3.1 3 14
1.2 1 NA
3.2 3 22
1.3 1 30
3.3 3 32
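To see why the double reshape works: the inner reshape() spreads x into one column per source data frame (x.1, x.2, x.3), subset() keeps only the ids of data3, and the outer reshape() stacks those columns back into long form. With the example data, the wide intermediate before subsetting should look like:

  id x.1 x.2 x.3
1  1  10  NA  30
2  2  12  20  NA
3  3  14  22  32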
Using base R
do.call(rbind, unname(lapply(mget(ls(pattern = "^data\\d+$")), \(x) {
  x1 <- subset(x, id %in% data3$id)
  v1 <- setdiff(data3$id, x1$id)
  if (length(v1) > 0) rbind(x1, cbind(id = v1, x = NA)) else x1
})))
Output:
id x
1 1 10
3 3 14
2 3 22
11 1 NA
12 1 30
21 3 32
bind_rows(data1, data2, data3, .id = 'grp') %>%
  complete(id, grp) %>%
  select(-grp) %>%
  filter(id %in% data3$id)
# A tibble: 6 x 2
id x
<dbl> <dbl>
1 1 10
2 1 NA
3 1 30
4 3 14
5 3 22
6 3 32

Regex colnames and grouping in R?

I have this data frame:
id <- c(0,1,2,3,4)
groupA_sample1_values <- c(10,11,12,13,14)
groupA_sample2_values <- c(20,21,22,23,24)
groupA_sample3_values <- c(30,31,32,33,34)
groupB_sample1_values <- c(40,41,42,43,44)
groupB_sample2_values <- c(50,51,52,53,54)
groupB_sample3_values <- c(60,61,62,63,64)
df <- data.frame(id,
groupA_sample1_values,
groupA_sample2_values,
groupA_sample3_values,
groupB_sample1_values,
groupB_sample2_values,
groupB_sample3_values)
df
and I am trying to obtain another table with these columns:
id, group, sample, value.
I believe I would have to extract the group name (groupA/groupB) with regex, do the same for the sample number, and then melt everything into a new data frame, but I'm not sure how to approach it.
Any help?
try
library(tidyverse)
df %>%
  pivot_longer(-id,
               names_to = c("group", "sample"),
               names_pattern = "group(.)_sample(.)_values",
               values_to = "value")
# # A tibble: 30 x 4
# id group sample value
# <dbl> <chr> <chr> <dbl>
# 1 0 A 1 10
# 2 0 A 2 20
# 3 0 A 3 30
# 4 0 B 1 40
# 5 0 B 2 50
# 6 0 B 3 60
# 7 1 A 1 11
# 8 1 A 2 21
# 9 1 A 3 31
#10 1 B 1 41
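The (.) capture groups match exactly one character each, which is enough for this example. If the group or sample labels can be longer than one character (say groupAB or sample10), a greedier pattern is safer; a hedged variant of the same call:

df %>%
  pivot_longer(-id,
               names_to = c("group", "sample"),
               names_pattern = "group([^_]+)_sample([^_]+)_values",
               values_to = "value")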

How to make bins for each row of a dataframe using only the non-NA values?

I have a dataframe which contains values and NAs. Some rows have the NAs at the start, others at the end.
# like this way
df <- data.frame(A = c(1, 5, 6, 1, NA, NA),
                 B = c(1, 2, 3, 2, NA, NA),
                 C = c(1, 3, NA, 4, 3, NA),
                 D = c(1, 1, NA, 6, 10, NA),
                 E = c(1, NA, NA, 1, 1, 1),
                 F = c(1, NA, NA, 1, 1, 1))
Now I would like to build two bins for each row based on the non-NA values and sum up each bin.
#expected output
Sum Bin
3 1
3 2
7 1
5 2
6 1
3 2
...
What I did first is separate the dataframe into two, based on whether the row starts or ends with NAs. Then I use a loop for the calculation.
df_bin <- data.frame(Sum = 0, Bin = 0)
bin <- 2  # set the number of bins for the calculation
for (i in 1:nrow(df)) {
  l <- sum(!is.na(df[i, ]))
  ll <- as.integer(l / bin)
  s <- c()
  j <- 1
  while (j <= (bin - 1)) {
    k <- sum(df[i, (j * ll - ll + 1):(j * ll)])
    s <- c(s, k)
    j <- j + 1
  }
  k <- sum(df[i, (j * (bin - 1) + 1):l])
  s <- c(s, k)
  df2 <- data.frame(Sum = s, Bin = 1:bin)
  df_bin <- rbind(df_bin, df2)
}
But it runs very slow, I was wondering if there is a more elegant way to do it. Thank you in advance : )
A pure tidyverse solution using pivoting:
library(dplyr)
library(tidyr)

df %>%
  mutate(orig_row = 1:n()) %>%
  pivot_longer(-orig_row) %>%
  filter(!is.na(value)) %>%
  group_by(orig_row) %>%
  mutate(Bin = round(1 + seq(0, n() - 1) / n())) %>%
  group_by(orig_row, Bin) %>%
  summarise(Sum = sum(value)) %>%
  ungroup() %>%
  select(-orig_row)
Result:
# A tibble: 12 x 2
Bin Sum
<dbl> <dbl>
1 1 3
2 2 3
3 1 7
4 2 4
5 1 6
6 2 3
7 1 7
8 2 8
9 1 13
10 2 2
11 1 1
12 2 1
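The expression round(1 + seq(0, n() - 1) / n()) assigns the first half of each row's non-NA values to bin 1 and the rest to bin 2. If an arbitrary number of bins is needed, ceiling(bin * seq_len(n()) / n()) is a natural generalization (my extension, not part of the original answer):

bin <- 3  # illustrative bin count

df %>%
  mutate(orig_row = 1:n()) %>%
  pivot_longer(-orig_row) %>%
  filter(!is.na(value)) %>%
  group_by(orig_row) %>%
  mutate(Bin = ceiling(bin * seq_len(n()) / n())) %>%
  group_by(orig_row, Bin) %>%
  summarise(Sum = sum(value), .groups = "drop") %>%
  select(-orig_row)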
You can try using apply:
do.call(rbind, apply(df, 1, function(x) {
  # Remove NA values
  x1 <- na.omit(x)
  # Length of the non-NA values
  n <- length(x1)
  # Midpoint splitting the values into two bins
  half_len <- round(n / 2)
  # Data frame with the sum of the two bin values
  data.frame(Sum = c(sum(x1[1:half_len]), sum(x1[(half_len + 1):n])),
             Bin = 1:2)
}))
# Sum Bin
#1 3 1
#2 3 2
#3 7 1
#4 4 2
#5 6 1
#6 3 2
#7 7 1
#8 8 2
#9 13 1
#10 2 2
#11 1 1
#12 1 2

How to reduce factor levels depending on other attribute?

I have a dataframe of two columns, id and result, and I want to assign factor levels to result depending on id, so that for id "1", result c("a","b","c","d") will have factor levels 1,2,3,4.
For id "2", result c("22","23","24") will have factor levels 1,2,3.
id <- c(1,1,1,1,2,2,2)
result <- c("a","b","c","d","22","23","24")
I tried grouping them with split, but that returns a list instead of a data frame, which causes a length problem for modeling. Can you help, please?
Though the question was closed as a duplicate by user @Ronak Shah, I don't believe it is the same question.
After numbering the row by group the new column must be coerced to class "factor".
library(dplyr)
id <- c(1,1,1,1,2,2,2)
result <- c("a","b","c","d","22","23","24")
df <- data.frame(id, result)
df %>%
  group_by(id) %>%
  mutate(fac = row_number()) %>%
  ungroup() %>%
  mutate(fac = factor(fac))
# A tibble: 7 x 3
# id result fac
# <dbl> <fct> <fct>
#1 1 a 1
#2 1 b 2
#3 1 c 3
#4 1 d 4
#5 2 22 1
#6 2 23 2
#7 2 24 3
Edit.
If there are repeated values in result, coerce with as.integer(factor(.)) to get the numbers, then coerce those numbers to factor.
id2 <- c(1,1,1,1,2,2,2,2)
result2 <- c("a","b","c","d","22", "22","23","24")
df2 <- data.frame(id = id2, result = result2)
df2 %>%
  group_by(id) %>%
  mutate(fac = as.integer(factor(result))) %>%
  ungroup() %>%
  mutate(fac = factor(fac))
# A tibble: 8 x 3
# id result fac
# <dbl> <fct> <fct>
#1 1 a 1
#2 1 b 2
#3 1 c 3
#4 1 d 4
#5 2 22 1
#6 2 22 1
#7 2 23 2
#8 2 24 3
After grouping by id, we can use match with unique to assign a unique number to each result. Using @Rui Barradas' dataframe df2:
library(dplyr)
df2 %>%
  group_by(id) %>%
  mutate(ans = match(result, unique(result))) %>%
  ungroup %>%
  mutate(ans = factor(ans))
# id result ans
# <dbl> <fct> <fct>
#1 1 a 1
#2 1 b 2
#3 1 c 3
#4 1 d 4
#5 2 22 1
#6 2 22 1
#7 2 23 2
#8 2 24 3
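Note that the two answers number the values differently: as.integer(factor(.)) numbers by the sorted order of the levels, while match(., unique(.)) numbers by order of first appearance. They coincide here because result happens to be sorted within each id. A small illustration:

x <- c("b", "a", "b")
as.integer(factor(x))  # 2 1 2 (alphabetical level order)
match(x, unique(x))    # 1 2 1 (order of first appearance)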

Computing minimum distance between a row and all previous rows in R

I want to compute the minimum distance between the current row and every row before it within each group. My data frame has several groups, and each group has multiple dates with longitude and latitude. I use a Haversine function to compute distance, and I need to apply this function as described above. The data frame looks like the following:
grp date long lat rowid
1 1 1995-07-01 11 12 1
2 1 1995-07-05 3 0 2
3 1 1995-07-09 13 4 3
4 1 1995-07-13 4 25 4
5 2 1995-03-07 12 6 1
6 2 1995-03-10 3 27 2
7 2 1995-03-13 34 8 3
8 2 1995-03-16 25 9 4
My current attempt uses purrrlyr::by_row, but the method is too slow. In practice, each group has thousands of dates and geographic positions. Here is part of my current attempt:
calc_min_distance <- function(df, grp.name, row) {
  df %>%
    filter(grp_name == grp.name) %>%
    filter(row_number() <= row) %>%
    mutate(
      last.lat = last(lat),
      last.long = last(long),
      rowid = 1:n()
    ) %>%
    group_by(rowid) %>%
    purrrlyr::by_row(
      ~haversinedistance.fnct(.$last.long, .$last.lat, .$long, .$lat),
      .collate = 'rows',
      .to = 'min.distance'
    ) %>%
    filter(row_number() < n()) %>%
    summarise(min = min(min.distance)) %>%
    .$min
}
df_dist <- df %>%
  group_by(grp_name) %>%
  mutate(rowid = 1:n()) %>%
  group_by(grp_name, rowid) %>%
  purrrlyr::by_row(
    ~calc_min_distance(df, .$grp_name, .$rowid),
    .collate = 'rows',
    .to = 'min.distance'
  ) %>%
  ungroup %>%
  select(-rowid)
Suppose, to keep the example reproducible, that distance is defined as (lat + long) of the reference row minus (lat + long) of each row before it. My expected output for grp 1 is then the following:
grp date long lat rowid min.distance
1 1 1995-07-01 11 12 1 0
2 1 1995-07-05 3 0 2 -20
3 1 1995-07-09 13 4 3 -6
4 1 1995-07-13 4 25 4 6
How can I quickly compute the minimum distance between the current rowid and all rowids before it?
Here's how I would go about it. You need to calculate all the within-group pairwise distances anyway, so we'll use geosphere::distm, which is designed to do just that. I'd suggest stepping through my function line by line and looking at what it does; I think it will make sense.
library(geosphere)
find_min_dist_above = function(long, lat, fun = distHaversine) {
  # full pairwise distance matrix for the group
  d = distm(x = cbind(long, lat), fun = fun)
  # blank out the diagonal and lower triangle, so column j only keeps
  # distances between row j and the rows above it (i < j)
  d[lower.tri(d, diag = TRUE)] = NA
  # the first row has no rows above it
  d[1, 1] = 0
  return(apply(d, MAR = 2, min, na.rm = TRUE))
}
df %>%
  group_by(grp) %>%
  mutate(min.distance = find_min_dist_above(long, lat))
# # A tibble: 8 x 6
# # Groups: grp [2]
# grp date long lat rowid min.distance
# <int> <fct> <int> <int> <int> <dbl>
# 1 1 1995-07-01 11 12 1 0
# 2 1 1995-07-05 3 0 2 1601842.
# 3 1 1995-07-09 13 4 3 917395.
# 4 1 1995-07-13 4 25 4 1623922.
# 5 2 1995-03-07 12 6 1 0
# 6 2 1995-03-10 3 27 2 2524759.
# 7 2 1995-03-13 34 8 3 2440596.
# 8 2 1995-03-16 25 9 4 997069.
Using this data:
df = read.table(text = ' grp date long lat rowid
1 1 1995-07-01 11 12 1
2 1 1995-07-05 3 0 2
3 1 1995-07-09 13 4 3
4 1 1995-07-13 4 25 4
5 2 1995-03-07 12 6 1
6 2 1995-03-10 3 27 2
7 2 1995-03-13 34 8 3
8 2 1995-03-16 25 9 4', h = TRUE)
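distHaversine returns distances in metres, which is why the min.distance values above run into the millions. Because the fun argument is passed straight through to distm, swapping in another geosphere metric is a one-line change; for example, the slower but more accurate ellipsoidal distGeo (an illustrative variation, not part of the original answer):

df %>%
  group_by(grp) %>%
  mutate(min.distance = find_min_dist_above(long, lat, fun = distGeo))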
