R Recode Variables In A Loop - r

Ciao,
Here is a replicate able example.
df <- data.frame("STUDENT"=c(1,2,3,4,5),
"TEST1"=c(6,88,17,5,18),
"TEST2"=c(34,NA,87,88,82),
"TEST3"=c(87,62,13,8,71),
"TEST1NEW"=c(0,1,0,0,0),
"TEST2NEW"=c(0,NA,1,1,1),
"TEST3NEW"=c(1,1,0,0,1)
If I have data frame df with STUDENT, TEST1, TEST2, TEST3 I want to make TEST1NEW TEST2NEW and TEST3NEW such that the new variables are equal to 1 when old variable TEST is more than or equals to 50 and the NEW TEST variables should be equal to 0 when the old TEST variable is below 50. I made an attempt here below but this is insufficient and also I believe this may require a loop.
COLUMNS <- c("TEST1", "TEST2", "TEST3")
df[paste0(COLUMNS)] <- replace(df[COLUMNS],df[COLUMNS] < 50, 0 , 1, NA)

You could do
df[, paste0("TEST", 1:3, "_NEW")] <- as.integer(df[,-1] >= 50)
df
# STUDENT TEST1 TEST2 TEST3 TEST1_NEW TEST2_NEW TEST3_NEW
#1 1 6 34 87 0 0 1
#2 2 88 NA 62 1 NA 1
#3 3 17 87 13 0 1 0
#4 4 5 88 8 0 1 0
#5 5 18 82 71 0 1 1
data
df <- data.frame(
"STUDENT" = c(1, 2, 3, 4, 5),
"TEST1" = c(6, 88, 17, 5, 18),
"TEST2" = c(34, NA, 87, 88, 82),
"TEST3" = c(87, 62, 13, 8, 71)
)
In case where the assignment is more complex we can make use of dplyr::case_when
library(dplyr)
df[, paste0("TEST", 1:3, "_NEW")] <- case_when(df[,-1] < 20 ~ 4L,
df[,-1] >= 65 ~ 8L,
is.na(df[,-1]) ~ NA_integer_,
TRUE ~ 7L)

Related

Replace values outside range with NA using replace_with_na function

I have the following dataset
structure(list(a = c(2, 1, 9, 2, 9, 8), b = c(4, 5, 1, 9, 12,
NA), c = c(50, 34, 77, 88, 33, 60)), class = "data.frame", row.names = c(NA,
-6L))
a b c
1 2 4 50
2 1 5 34
3 9 1 77
4 2 9 88
5 9 12 33
6 8 NA 60
From column b I only want values between 4-9. Column c between 50-80. Replacing the values outside the range with NA, resulting in
structure(list(a = c(2, 1, 9, 2, 9, 8), b = c(4, 5, NA, 9, NA,
NA), c = c(50, NA, 77, NA, NA, 60)), class = "data.frame", row.names = c(NA,
-6L))
a b c
1 2 4 50
2 1 5 NA
3 9 NA 77
4 2 9 NA
5 9 NA NA
6 8 NA 60
I've tried several things with replace_with_na_at function where this seemed most logical:
test <- replace_with_na_at(data = test, .vars="c",
condition = ~.x < 2 & ~.x > 2)
However, nothing I tried works. Does somebody know why? Thanks in advance! :)
You can subset with a logical vector testing your conditions.
x$b[x$b < 4 | x$b > 9] <- NA
x$c[x$c < 50 | x$c > 80] <- NA
x
# a b c
#1 2 4 50
#2 1 5 NA
#3 9 NA 77
#4 2 9 NA
#5 9 NA NA
#6 8 NA 60
Data:
x <- structure(list(a = c(2, 1, 9, 2, 9, 8), b = c(4, 5, 1, 9, 12,
NA), c = c(50, 34, 77, 88, 33, 60)), class = "data.frame", row.names = c(NA,
-6L))
Yet another base R solution, this time with function is.na<-
is.na(test$b) <- with(test, b < 4 | b > 9)
is.na(test$c) <- with(test, c < 50 | c > 80)
A package naniar solution with a pipe could be
library(naniar)
library(magrittr)
test %>%
replace_with_na_at(
.vars = 'b',
condition = ~(.x < 4 | .x > 9)
) %>%
replace_with_na_at(
.vars = 'c',
condition = ~(.x < 50 | .x > 80)
)
You should mention the packages you are using. From googling, i'm guessing you are using naniar. The problem appears to be that you did not properly specify the condition, but the following should work:
library(naniar)
test <- structure(list(a = c(2, 1, 9, 2, 9, 8),
b = c(4, 5, 1, 9, 12, NA),
c = c(50, 34, 77, 88, 33, 60)),
class = "data.frame",
row.names = c(NA, -6L))
replace_with_na_at(test, "c", ~.x < 50 | .x > 80)
#> a b c
#> 1 2 4 50
#> 2 1 5 NA
#> 3 9 1 77
#> 4 2 9 NA
#> 5 9 12 NA
#> 6 8 NA 60
Created on 2020-06-02 by the reprex package (v0.3.0)
You simply could use Map to replace your values with NA.
dat[2:3] <- Map(function(x, y) {x[!x %in% y] <- NA;x}, dat[2:3], list(4:9, 50:80))
dat
# a b c
# 1 2 4 50
# 2 1 5 NA
# 3 9 NA 77
# 4 2 9 NA
# 5 9 NA NA
# 6 8 NA 60
Data:
dat <- structure(list(a = c(2, 1, 9, 2, 9, 8), b = c(4, 5, 1, 9, 12,
NA), c = c(50, 34, 77, 88, 33, 60)), class = "data.frame", row.names = c(NA,
-6L))
We can use map2
library(purrr)
library(dplyr)
df1[c('b', 'c')] <- map2(df1 %>%
select(b, c), list(c(4, 9), c(50,80)), ~
replace(.x, .x < .y[1]|.x > .y[2], NA))

Is there a way to create new columns in R based on manipulations from multiple data frames?

Does anyone know if it is possible to use a variable in one dataframe (in my case the "deploy" dataframe) to create a variable in another dataframe?
For example, I have two dataframes:
df1:
deploy <- data.frame(ID = c("20180101_HH1_1_1", "20180101_HH1_1_2", "20180101_HH1_1_3"),
Site_Depth = c(42, 93, 40), Num_Depth_Bins_Required = c(5, 100, 4),
Percent_Column_in_each_bin = c(20, 10, 25))
df2:
sp.c <- data.frame(species = c("RR", "GS", "GT", "BR", "RS", "BA", "GS", "RS", "SH", "RR"),
ct = c(25, 66, 1, 12, 30, 6, 1, 22, 500, 6),
percent_dist_from_surf = c(11, 15, 33, 68, 71, 100, 2, 65, 5, 42))
I want to create new columns in df2 that assigns each species and count to a bin based on the Percent_Column_in_each_bin for each ID. For example, in 20180101_HH1_1_3 there would be 4 bins that each make up 25% of the column and all species that are within 0-25% of the column (in df2) would be in bin 1 and species within 25-50% of the column would be in depth bin 2, and so on. What I'm imagining this looking like is:
i.want.this <- data.frame(species = c("RR", "GS", "GT", "BR", "RS", "BA", "GS", "RS", "SH", "RR"),
ct = c(25, 66, 1, 12, 30, 6, 1, 22, 500, 6),
percent_dist_from_surf = c(11, 15, 33, 68, 71, 100, 2, 65, 5, 42),
'20180101_HH1_1_1_Bin' = c(1, 1, 2, 4, 4, 5, 1, 4, 1, 3),
'20180101_HH1_1_2_Bin' = c(2, 2, 4, 7, 8, 10, 1, 7, 1, 5),
'20180101_HH1_1_3_Bin' = c(1, 1, 2, 3, 3, 4, 1, 3, 1, 2))
I am pretty new to R and I'm not sure how to make this happen. I need to do this for over 100 IDs (all with different depths, number of depth bins, and percent of the column in each bin) so I was hoping that I don't need to do them all by hand. I have tried mutate in dplyr but I can't get it to pull from two different dataframes. I have also tried ifelse statements, but I would need to run the ifelse statement for each ID individually.
I don't know if what I am trying to do is possible but I appreciate the feedback. Thank you in advance!
Edit: my end goal is to find the max count (max ct) for each species within each bin for each ID. What I've been doing to find this (using the bins generated with suggestions from #Ben) is using dplyr to slice and find the max ID like this:
20180101_HH1_1_1 <- sp.c %>%
group_by(20180101_HH1_1_1, species) %>%
arrange(desc(ct)) %>%
slice(1) %>%
group_by(20180101_HH1_1_1) %>%
mutate(Count_Total_Per_Bin = sum(ct)) %>%
group_by(species, add=TRUE) %>%
mutate(species_percent_of_total_in_bin =
paste0((100*ct/Count_Total_Per_Bin) %>%
mutate(ID= "20180101_HH1_1_1 ") %>%
ungroup()
but I have to do this for over 100 IDs. My desired output would be something like:
end.goal <- data.frame(ID = c(rep("20180101_HH1_1_1", 8)),
species = c("RR", "GS", "SH", "GT", "RR", "BR", "RS", "BA"),
bin = c(1, 1, 1, 2, 3, 4, 4, 5),
Max_count_of_each_species_in_each_bin = c(11, 66, 500, 1, 6, 12, 30, 6),
percent_dist_from_surf = c(11, 15, 5, 33, 42, 68, 71, 100),
percent_each_species_max_in_each_bin = c((11/577)*100, (66/577)*100, (500/577)*100, 100, 100, (12/42)*100, (30/42)*100, 100))
I was thinking that by answering the original question I could get to this but I see now that there's still a lot you have to do to get this for each ID.
Here is another approach, which does not require a loop.
Using sapply you can cut to determine bins for each percent_dist_from_surf value in your deploy dataframe.
res <- sapply(deploy$Percent_Column_in_each_bin, function(x) {
cut(sp.c$percent_dist_from_surf, seq(0, 100, by = x), include.lowest = TRUE, labels = 1:(100/x))
})
colnames(res) <- deploy$ID
cbind(sp.c, res)
Or using purrr:
library(purrr)
cbind(sp.c, imap(setNames(deploy$Percent_Column_in_each_bin, deploy$ID),
~ cut(sp.c$percent_dist_from_surf, seq(0, 100, by = .x), include.lowest = TRUE, labels = 1:(100/.x))
))
Output
species ct percent_dist_from_surf 20180101_HH1_1_1 20180101_HH1_1_2 20180101_HH1_1_3
1 RR 25 11 1 2 1
2 GS 66 15 1 2 1
3 GT 1 33 2 4 2
4 BR 12 68 4 7 3
5 RS 30 71 4 8 3
6 BA 6 100 5 10 4
7 GS 1 2 1 1 1
8 RS 22 65 4 7 3
9 SH 500 5 1 1 1
10 RR 6 42 3 5 2
Edit:
To determine the maximum ct value for each species, site, and bin, put the result of above into a dataframe called res and do the following.
First would put into long form with pivot_longer. Then you can group_by species, site, and bin, and determine the maximum ct for this combination.
library(tidyverse)
res %>%
pivot_longer(cols = starts_with("2018"), names_to = "site", values_to = "bin") %>%
group_by(species, site, bin) %>%
summarise(max_ct = max(ct)) %>%
arrange(site, bin)
Output
# A tibble: 26 x 4
# Groups: species, site [21]
species site bin max_ct
<fct> <chr> <fct> <dbl>
1 GS 20180101_HH1_1_1 1 66
2 RR 20180101_HH1_1_1 1 25
3 SH 20180101_HH1_1_1 1 500
4 GT 20180101_HH1_1_1 2 1
5 RR 20180101_HH1_1_1 3 6
6 BR 20180101_HH1_1_1 4 12
7 RS 20180101_HH1_1_1 4 30
8 BA 20180101_HH1_1_1 5 6
9 GS 20180101_HH1_1_2 1 1
10 SH 20180101_HH1_1_2 1 500
11 GS 20180101_HH1_1_2 2 66
12 RR 20180101_HH1_1_2 2 25
13 GT 20180101_HH1_1_2 4 1
14 RR 20180101_HH1_1_2 5 6
15 BR 20180101_HH1_1_2 7 12
16 RS 20180101_HH1_1_2 7 22
17 RS 20180101_HH1_1_2 8 30
18 BA 20180101_HH1_1_2 10 6
19 GS 20180101_HH1_1_3 1 66
20 RR 20180101_HH1_1_3 1 25
21 SH 20180101_HH1_1_3 1 500
22 GT 20180101_HH1_1_3 2 1
23 RR 20180101_HH1_1_3 2 6
24 BR 20180101_HH1_1_3 3 12
25 RS 20180101_HH1_1_3 3 30
26 BA 20180101_HH1_1_3 4 6
It is helpful to distinguish between the contents of your two dataframes.
df2 appears to contain measurements from some sites
df1 appears to contain parameters by which you want to process/summarise the measurements in df2
Given these different purposes of the two dataframes, your best approach is probably to loop over all the rows of df1 each time adding a column to df2. Something like the following:
max_dist = max(df2$percent_dist_from_surf)
for(ii in 1:nrow(df1)){
# extract parameters
this_ID = df1[[ii,"ID"]]
this_depth = df1[[ii,"Site_Depth"]]
this_bins = df1[[ii,"Num_Depth_Bins_Required"]]
this_percent = df1[[ii,"Percent_Column_in_each_bin"]]
# add column to df2
df2 = df2 %>%
mutate(!!sym(this_ID) := insert_your_calculation_here)
}
The !!sym(this_ID) := part of the code is to allow dynamic naming of your output columns.
And as best I can determine the formula you want for insert_your_calculation_here is ceil(percent_dist_from_surf / max_dist * this_bins)

Find last non-zero element in column for each group, fill different column

I am trying to create a for loop that does the following:
for (i in 2:length(Exampledata$Levels)) {
if(is.na(Exampledata$Levels[i]) == "TRUE" {
find the last instance where
is.na(Exampledata$Levels) == "FALSE"
for that same ID, and input
the day from that row into last_entry[i]
}
}
Example data:
ID<-c("QYZ","MMM","QYZ","bb2","gm6","gm6","YOU","LLL","LLL","LLL")
day<-c(1,2,3,4,5,6,7,8,9,10)
values<-c(1,2,4,5,5,6,8,9,6,4)
Levels<-c("A","","A","C",'D','D',"C","y","","")
last_entry<-c(0,0,0,0,0,0,0,0,0,0)
What data currently looks like:
ID values Levels day last_entry
1 QYZ 1 A 1 0
2 MMM 2 2 0
3 QYZ 4 A 3 0
4 bb2 5 C 4 0
5 gm6 5 D 5 0
6 gm6 6 D 6 0
7 YOU 8 C 7 0
8 LLL 9 y 8 0
9 LLL 6 9 0
10 LLL 4 10 0
What I want it to look like:
ID values Levels day last_entry
1 QYZ 1 A 1 0
2 MMM 2 2 0
3 QYZ 4 A 3 0
4 bb2 5 C 4 0
5 gm6 5 D 5 0
6 gm6 6 D 6 0
7 YOU 8 C 7 0
8 LLL 9 y 8 0
9 LLL 6 9 8
10 LLL 4 10 8
I have seen a lot of code that looks for last non-zero elements or last is.na=FALSE, but none that can do it by ID, and extract a value from that row. I also need to ignore cases where there is no entry for that ID.
Essentially I want to know the last day that a level was entered for that ID.
Here's a solution using data.table:
library('data.table')
dt <- data.table(ID = c("QYZ","MMM","QYZ","bb2","gm6","gm6","YOU","LLL","LLL","LLL"),
Day = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
values = c(1, 2, 4, 5, 5, 6, 8, 9, 6, 4),
Levels = c("A", NA, "A", "C", "D", "D", "C", "y", NA, NA),
last_entry = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0))
func <- function(days, levels){
if(!any(is.na(levels)) | all(is.na(levels))) return(0)
return(last(days[which(!is.na(levels))]))
}
dt[, last_entry := ifelse(!is.na(Levels), 0, func(Day, Levels)), by = ID]
But if you're set on using a for loop:
ID <- c("QYZ","MMM","QYZ","bb2","gm6","gm6","YOU","LLL","LLL","LLL")
Day <- c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
Levels <- c("A", NA, "A", "C", "D", "D", "C", "y", NA, NA)
last_entry <- c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
i.na <- which(is.na(Levels))
for(id in unique(ID)){
i.id <- which(ID == id)
if(all(is.na(Levels[i.id])) | !any(is.na(Levels[i.id]))) next
day <- last(Day[i.id[!(i.id %in% i.na)]])
last_entry[i.na[i.na %in% i.id]] <- day
}
Here is one way using tidyr::fill. We replace the last_entry columns with NA where the Levels are empty, then use fill to replace those NA's with latest non-NA values and turn last_entry value of all non-empty Levels to 0.
library(dplyr)
df %>%
mutate(last_entry = ifelse(Levels != "", day, NA)) %>%
group_by(ID) %>%
tidyr::fill(last_entry) %>%
mutate(last_entry = replace(last_entry, Levels != "" | n() == 1, 0))
# ID day values Levels last_entry
# <fct> <dbl> <dbl> <fct> <dbl>
# 1 QYZ 1 1 A 0
# 2 MMM 2 2 "" 0
# 3 QYZ 3 4 A 0
# 4 bb2 4 5 C 0
# 5 gm6 5 5 D 0
# 6 gm6 6 6 D 0
# 7 YOU 7 8 C 0
# 8 LLL 8 9 y 0
# 9 LLL 9 6 "" 8
#10 LLL 10 4 "" 8
We can also do
df %>%
group_by(ID) %>%
mutate(last_entry = purrr::map_dbl(row_number(), ~if (Levels[.x] == "" & n() > 1)
day[max(which(Levels[1:.x] != ""))] else 0))
data
ID<-c("QYZ","MMM","QYZ","bb2","gm6","gm6","YOU","LLL","LLL","LLL")
day<-c(1,2,3,4,5,6,7,8,9,10)
values<-c(1,2,4,5,5,6,8,9,6,4)
Levels<-c("A","","A","C",'D','D',"C","y","","")
last_entry<-c(0,0,0,0,0,0,0,0,0,0)
df <- data.frame(ID, day, values, Levels, last_entry)
If you want to do it properly, you may want to code "empty" cells to NA beforehand.
Exampledata[Exampledata == ""] <- NA
Then you may use by from base R to look up "day" of the last !is.na entry of "Levels" in the by "ID" splitted data.
res <- do.call(rbind, by(Exampledata, Exampledata$ID, function(x) {
x$last_entry <- ifelse(is.na(x$Levels), x$day[tail(which(!is.na(x$Levels)), 1)], 0)
x
}))
Since the rbinded result comes out ordered alphabetically by "ID" we can re-order it by day.
res <- res[order(res$day), ]
res
# ID day values Levels last_entry
# QYZ.1 QYZ 1 1 A 0
# MMM MMM 2 2 <NA> NA
# QYZ.3 QYZ 3 4 A 0
# bb2 bb2 4 5 C 0
# gm6.5 gm6 5 5 D 0
# gm6.6 gm6 6 6 D 0
# YOU YOU 7 8 C 0
# LLL.8 LLL 8 9 y 0
# LLL.9 LLL 9 6 <NA> 8
# LLL.10 LLL 10 4 <NA> 8
Now there are the desired last entries for the "LLL" level, and an NA for MMM what it logically should have since "Levels" is NA and it has no last entry.
Data
Exampledata <- structure(list(ID = structure(c(5L, 4L, 5L, 1L, 2L, 2L, 6L, 3L,
3L, 3L), .Label = c("bb2", "gm6", "LLL", "MMM", "QYZ", "YOU"), class = "factor"),
day = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), values = c(1, 2,
4, 5, 5, 6, 8, 9, 6, 4), Levels = structure(c(2L, NA, 2L,
3L, 4L, 4L, 3L, 5L, NA, NA), .Label = c("", "A", "C", "D",
"y"), class = "factor"), last_entry = c(0, 0, 0, 0, 0, 0,
0, 0, 0, 0)), row.names = c(NA, -10L), class = "data.frame")

Compare one variable to other variables by group in R

I have the following data frame:
data.frame(id = c("a", "a", "a", "d", "d"),
value = c(5, 46, 12, 14, 32),
low = c(46, 8, NA, 0, 34),
high = c(56, 20, NA, 12, 60))
id value low high
1 a 5 46 56
2 a 46 8 20
3 a 12 NA NA
4 d 14 0 12
5 d 32 34 60
I need to set a new variable to TRUE if value is out of every intervals defined by low and high for each line with the same id.
My desired dataframe would be:
id value low high result
1 a 5 45 56 TRUE # 5 not in 45-56, 8-20
2 a 46 8 20 FALSE # 46 in 45-56
3 a 12 NA NA FALSE # 12 in 8-20
4 d 14 0 12 TRUE # 14 not in 0-12, 34-60
5 d 32 34 60 TRUE # 32 not in 0-12, 34-60
How can I do it in base R? I work in a restrictive environment where I only have access to base R.
I figured out an ugly and not optimized solution but it works ! Here is the code :
df <- data.frame(id = c("a", "a", "a", "d", "d"),
value = c(5, 46, 12, 14, 32),
low = c(46, 8, NA, 0, 34),
high = c(56, 20, NA, 12, 60))
list.inter <- list()
for(i in 1:nrow(df)){
if(is.na(df$low[i]) | is.na(df$low[i])) {
list.inter[[i]] <- NA
}else{
list.inter[[i]] <- seq(from = df$low[i], to = df$high[i])
}
}
result <- c()
for(i in 1:nrow(df)){
result[i] <- ! df$value[i] %in% unlist(list.inter[which(df$id[i]==df$id)])
}
df$result <- result
I hope it helps and I am curious to see some optimized code from other users!
Without apply, sapply and map function:
isInDataframe <- function(data = data, value = "value", from = "low", to = "high", id = "id"){
result <- c()
for (i in 1:length(data[,1])) {
deeta <- data[data[id] == as.character(data[id][i,1]),]
subresult <- c()
for (j in 1:nrow(deeta)) {
subresult[j] <- (data[value][i,1] >= deeta[from][j,1] & data[value][i,1] <= deeta[to][j,1])
}
result[i] <- !any(subresult,na.rm = T)
}
data$result <- result
return(data)
}
isInDataframe(data = data, value = "value", from = "low", to = "high", id = "id")
id value low high result
1 a 5 46 56 TRUE
2 a 46 8 20 FALSE
3 a 12 NA NA FALSE
4 d 14 0 12 TRUE
5 d 32 34 60 TRUE
I finally choose to separate id and value in a data frame and id, low and high in another data frame for this analysis.
However, here is a solution highly inspired from the solutions suggested for this new approach:
df <- data.frame(id = c("a", "a", "a", "d", "d"),
value = c(5, 46, 12, 14, 32),
low = c(46, 8, NA, 0, 34),
high = c(56, 20, NA, 12, 60))
temp <- merge(x = df[c("id",
"value")],
y = df[c("id",
"low",
"high")])
temp$result <- temp$value < temp$low | temp$value > temp$high
merge(x = df,
y = aggregate(formula = result ~ id + value,
data = temp,
FUN = all))
id value low high result
1 a 12 NA NA FALSE
2 a 46 8 20 FALSE
3 a 5 46 56 TRUE
4 d 14 0 12 TRUE
5 d 32 34 60 TRUE

Efficient way to build data frame of (current state, next state) in R

I am working with a data set of patients' health state over time.
I would like to compute the data frame of transitions
from the current health state to the next health state.
Here is an example where the health state is measured
only by AFP level and weight.
The health state measurements might look like the following:
x <- data.frame(id = c(1, 1, 1, 2, 2, 2),
day = c(1, 2, 3, 1, 2, 3),
event = c('status', 'status', 'death', 'status', 'status', 'status'),
afp = c(10, 50, NA, 20, 30, 40),
weight = c(100, 105, NA, 200, 200, 200))
The desired output looks like the following:
y <- data.frame(id = c(1, 1, 2, 2),
current_afp = c(10, 50, 20, 30),
current_weight = c(100, 105, 200, 200),
next_event = c('status', 'death', 'status', 'status'),
next_afp = c(50, NA, 30, 40),
next_weight = c(105, NA, 200, 200))
One inefficient way to obtain the output is:
take the cross product of the measurements data frame with itself
keep only rows with matching ids, and day.x + 1 = day.y
rename the columns
Is there a more efficient way to obtain the output?
Note: The real measurements data frame can have more than 10 columns,
so it is not very efficient from a lines of code perspective
to explicitly write
current_afp = x$afp[1:(n-1)],
next_afp = x$afp[2:n]
...
and so on.
You could try:
library(dplyr)
x %>%
mutate_each(funs(lead(.)), -id, -day) %>%
full_join(x, ., by = c("id", "day")) %>%
select(-event.x) %>%
setNames(c(names(.)[1:2],
paste0("current_", sub("\\..*","", names(.)[3:4])),
paste0("next_", sub("\\..*","", names(.)[5:7])))) %>%
group_by(id) %>%
filter(day != last(day))
Which gives:
# id day current_afp current_weight next_event next_afp next_weight
#1 1 1 10 100 status 50 105
#2 1 2 50 105 death NA NA
#3 2 1 20 200 status 30 200
#4 2 2 30 200 status 40 200
Using base R with a split-apply-combine approach
res <- lapply(split(x[-2], x$id), function(y) {
xx <- cbind(y[1:(nrow(y)-1), ], y[2:nrow(y), -1])
colnames(xx) <- c("id", paste("current", colnames(y)[-1], sep="_"),
paste("next", colnames(y)[-1], sep="_"))
xx[, which(colnames(xx) != "current_event")]
})
do.call(rbind, res)
id current_afp current_weight next_event next_afp next_weight
1 1 10 100 status 50 105
2 1 50 105 death NA NA
3 2 20 200 status 30 200
4 2 30 200 status 40 200
Or, an example where not all days are in sequence
x <- data.frame(id = c(1, 1, 1, 2, 2, 2),
day = c(1, 2, 3, 1, 2, 4),
event = c('status', 'status', 'death', 'status', 'status', 'status'),
afp = c(10, 50, NA, 20, 30, 40),
weight = c(100, 105, NA, 200, 200, 200))
x
id day event afp weight
1 1 1 status 10 100
2 1 2 status 50 105
3 1 3 death NA NA
4 2 1 status 20 200
5 2 2 status 30 200
6 2 4 status 40 200
Some of the transitions are NA, which could be removed if desired.
res <- lapply(split(x, x$id), function(y) {
y <- merge(data.frame(id=unique(y$id), day = 1:max(y$day)), y,
by = c("id", "day"), all.x=TRUE)[, -2]
xx <- cbind(y[1:(nrow(y)-1), ], y[2:nrow(y), -1])
colnames(xx) <- c("id", paste("current", colnames(y)[-1], sep="_"),
paste("next", colnames(y)[-1], sep="_"))
xx[, which(colnames(xx) != "current_event")]
})
do.call(rbind, res)
id current_afp current_weight next_event next_afp next_weight
1.1 1 10 100 status 50 105
1.2 1 50 105 death NA NA
2.1 2 20 200 status 30 200
2.2 2 30 200 <NA> NA NA
2.3 2 NA NA status 40 200

Resources