Perform a series of mutations to columns in a dataframe in R

I am trying to replace some text in my dataframe (a few rows given below)
> dput(Henry.longer[1:4,])
structure(list(N_l = c(4, 4, 4, 4), UG = c("100", "100", "100",
"100"), S = c(12, 12, 12, 12), Sample = c(NA, NA, NA, NA), EQ = c("Henry",
"Henry", "Henry", "Henry"), DF = c(0.798545454545455, 0.798545454545455,
0.798545454545455, 0.798545454545455), meow = c("Henry.Exterior.single",
"Multi", "Henry.Exterior.multi", "Henry.Interior.single"), Girder = c("Henry.Exterior.single",
"Henry.Interior.multi", "Henry.Exterior.multi", "Interior")), row.names = c(NA,
-4L), groups = structure(list(UG = "100", S = 12, .rows = list(
1:4)), row.names = c(NA, -1L), class = c("tbl_df", "tbl",
"data.frame"), .drop = FALSE), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"))
I try to mutate the dataframe as:
Henry.longer <- Henry.longer %>%
mutate(Loading = str_replace(meow, "Henry.Exterior.single", "Single")) %>%
mutate(Loading = str_replace(meow, "Henry.Exterior.multi", "Multi")) %>%
mutate(Loading = str_replace(meow, "Henry.Interior.single", "Single")) %>%
mutate(Loading = str_replace(meow, "Henry.Interior.multi", "Multi")) %>%
mutate(Girder = str_replace(meow, "Henry.Exterior.multi", "Exterior")) %>%
mutate(Girder = str_replace(meow, "Henry.Exterior.single", "Exterior")) %>%
mutate(Girder = str_replace(meow, "Henry.Interior.multi", "Interior")) %>%
mutate(Girder = str_replace(meow, "Henry.Interior.single", "Interior")) %>%
select(-meow)
But for some reason the replacements do not get applied to all the rows; only some are changed:
N_l UG S Sample EQ DF Loading Girder
1 4 100 12 NA Henry 0.799 Henry.Exterior.single Henry.Exterior.single
2 4 100 12 NA Henry 0.799 Multi Henry.Interior.multi
3 4 100 12 NA Henry 0.799 Henry.Exterior.multi Henry.Exterior.multi
4 4 100 12 NA Henry 0.799 Henry.Interior.single Interior

We can use lookup vectors for this, which is easy and arguably safer when static string lookups suffice:
tr_vec <- c(Henry.Exterior.single = "Single", Henry.Exterior.multi = "Multi", Henry.Interior.single = "Single", Henry.Interior.multi = "Multi")
tr_vec2 <- c(Henry.Exterior.multi = "Exterior", Henry.Exterior.single = "Exterior", Henry.Interior.multi = "Interior", Henry.Interior.single = "Interior")
Henry.longer %>%
mutate(
Loading = coalesce(tr_vec[Loading], Loading),
Girder = coalesce(tr_vec2[Girder], Girder)
)
# # A tibble: 4 x 8
# # Groups: UG, S [1]
# N_l UG S Sample EQ DF Loading Girder
# <dbl> <chr> <dbl> <lgl> <chr> <dbl> <chr> <chr>
# 1 4 100 12 NA Henry 0.799 Single Exterior
# 2 4 100 12 NA Henry 0.799 Multi Interior
# 3 4 100 12 NA Henry 0.799 Multi Exterior
# 4 4 100 12 NA Henry 0.799 Single Interior
The advantage of RonakShah's regex solution is that it can very easily handle many of the types of substrings you appear to need. Regexes do carry a little risk, though, in that they may mismatch (unlikely in that answer, but possible).
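For example: the unescaped dots in the str_replace patterns above are regex metacharacters that match any character, so a pattern like "Henry.Exterior.single" can match more than the literal string. Wrapping the pattern in stringr::fixed() avoids that; a minimal illustration with a made-up input:
library(stringr)
# "." matches any character, so this (unintended) input matches and is replaced:
str_replace("HenryXExteriorYsingle", "Henry.Exterior.single", "Single")
# [1] "Single"
# fixed() treats the pattern as a literal string, so nothing matches:
str_replace("HenryXExteriorYsingle", fixed("Henry.Exterior.single"), "Single")
# [1] "HenryXExteriorYsingle"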

Instead of using str_replace I guess it would be easier to extract what you want using regex.
library(dplyr)
Henry.longer %>%
mutate(Loading = sub('.*\\.', '', meow),
Girder = sub('.*\\.(\\w+)\\..*', '\\1', meow))
where
Loading - removes everything up to and including the last dot
Girder - extracts the word between the two dots.
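As a quick check on the sample values (note the extracted labels keep their original case, "single"/"multi", so mapping them to the desired "Single"/"Multi" would still need a recode):
x <- c("Henry.Exterior.single", "Henry.Interior.multi")
sub('.*\\.', '', x)
# [1] "single" "multi"
sub('.*\\.(\\w+)\\..*', '\\1', x)
# [1] "Exterior" "Interior"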

Oh boy, looks like you've got some answers here already but here's a super-simple one that uses stringr::str_extract:
Henry.longer <- Henry.longer %>%
mutate(Loading = str_extract(meow, "single|multi")) %>%
mutate(Girder = str_extract(meow, "Interior|Exterior"))
It's worth noting that the demo data has a weird entry for meow in one row ("Multi", with no Henry. prefix), so it didn't run perfectly on my machine.
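If that stray "Multi" value is the only irregularity, a case-insensitive pattern plus str_to_title would cover it as well; a sketch under that assumption:
library(dplyr)
library(stringr)
Henry.longer <- Henry.longer %>%
  mutate(Loading = str_to_title(str_extract(meow, regex("single|multi", ignore_case = TRUE))),
         Girder = str_extract(meow, "Interior|Exterior"))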

Calculate year-to-year absolute change in R

Given a dataframe df as follows:
df <- structure(list(year = c(2001, 2002, 2003, 2004), `1` = c(22.0775,
24.2460714285714, 29.4039285714286, 27.7110714285714), `2` = c(27.2535714285714,
35.9996428571429, 26.39, 27.8557142857143), `3` = c(24.7710714285714,
25.4428571428571, 15.1142857142857, 19.9657142857143)), row.names = c(NA,
-4L), groups = structure(list(year = c(2001, 2002, 2003, 2004
), .rows = structure(list(1L, 2L, 3L, 4L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, 4L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
Out:
year 1 2 3
0 2001 22.07750 27.25357 24.77107
1 2002 24.24607 35.99964 25.44286
2 2003 29.40393 26.39000 15.11429
3 2004 27.71107 27.85571 19.96571
For columns 1, 2 and 3, how could I calculate the year-to-year absolute change?
The expected result will look like this:
year 1 2 3
0 2002 2.16857 8.74607 0.67179
1 2003 5.15786 9.60964 10.32857
2 2004 1.69286 1.46571 4.85142
The final objective is to compare the values of columns 1, 2 and 3 across all years and find the year and column with the largest change; in this example, that is 2003 and column 3.
How could I do that in R? Thanks.
You can use:
library(dplyr)
data <- df %>% ungroup %>% summarise(across(-1, ~abs(diff(.))))
data
# A tibble: 3 x 3
# `1` `2` `3`
# <dbl> <dbl> <dbl>
#1 2.17 8.75 0.672
#2 5.16 9.61 10.3
#3 1.69 1.47 4.85
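One compatibility note, assuming a recent dplyr: summarise() returning more than one row per group was deprecated in dplyr 1.1.0, so on newer versions the same idea is spelled with reframe():
data <- df %>% ungroup() %>% reframe(across(-1, ~ abs(diff(.x))))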
To get the max change (the + 1 below is because diff() drops the first year, so rows of data are offset by one relative to df):
mat <- which(data == max(data), arr.ind = TRUE)
mat
# row col
#[1,] 2 3
#Year name
df$year[mat[, 1] + 1]
#[1] 2003
#Column name
mat[, 2]
#col
# 3
You can try:
library(reshape2)
library(dplyr)
#Melt
Melted <- reshape2::melt(df,id.vars = 'year')
#Group
Melted %>% group_by(variable) %>% mutate(Diff=c(0,abs(diff(value)))) %>% ungroup() %>%
filter(Diff==max(Diff))
# A tibble: 1 x 4
year variable value Diff
<dbl> <fct> <dbl> <dbl>
1 2003 3 15.1 10.3
We can apply diff on the entire dataset by converting the numeric columns of interest to a matrix in base R
cbind(year = df$year[-1], abs(diff(as.matrix(df[-1]))))
# year 1 2 3
#[1,] 2002 2.168571 8.746071 0.6717857
#[2,] 2003 5.157857 9.609643 10.3285714
#[3,] 2004 1.692857 1.465714 4.8514286
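If you also want the year and column of the largest change from this base R result, the which(arr.ind = TRUE) idea from the first answer carries over; a small sketch:
m <- abs(diff(as.matrix(df[-1])))
idx <- which(m == max(m), arr.ind = TRUE)
df$year[-1][idx[, "row"]]
#[1] 2003
colnames(m)[idx[, "col"]]
#[1] "3"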

Merge two data frames based on multiple columns in R

I have two data frames that look like this
data frame 1:
P.X value
OOPA 5
POKA 4
JKIO 3
KOPP 1
data frame 2:
P.X.1 P.X.2 P.X.3 P.X.4 mass
      JKIO  UIX   HOP     56
            CX    OOPA    44
EDD         POKA          13
KOPP  FOSI              11
and I want to merge the two data frames based on df1's P.X and df2's P.X.1, P.X.2, P.X.3, P.X.4. So if, say, the JKIO in P.X.2 appears in P.X, then merge them into the same row of a new data frame, giving JKIO, 3, 56 as below:
data frame new:
P.X value mass
OOPA 5 44
POKA 4 13
JKIO 3 56
KOPP 1 11
Do you know how I can do it, maybe with something like
merge(df1, df2, by = (P.X == P.X.1 | P.X.2 | P.X.3 | P.X.4))
?
The following is one way to achieve your goal. You want to convert df2 to long format and keep only the rows whose P.X value is non-empty. Once you have this data, you merge df1 with the updated df2.
library(dplyr)
library(tidyr)
left_join(df1,
          pivot_longer(df2, cols = P.X.1:P.X.4, names_to = "foo",
                       values_to = "P.X") %>%
            filter(nchar(P.X) > 0),
          by = "P.X") %>%
  select(-foo)
P.X value mass
1 OOPA 5 44
2 POKA 4 13
3 JKIO 3 56
4 KOPP 1 11
DATA
df1 <- structure(list(P.X = c("OOPA", "POKA", "JKIO", "KOPP"), value = c(5L,
4L, 3L, 1L)), class = "data.frame", row.names = c(NA, -4L))
df2 <- structure(list(P.X.1 = c("", "", "EDD", "KOPP"), P.X.2 = c("JKIO",
"", "", "FOSI"), P.X.3 = c("UIX", "CX", "POKA", ""), P.X.4 = c("HOP",
"OOPA", "", ""), mass = c(56, 44, 13, 11)), row.names = c(NA,
-4L), class = c("tbl_df", "tbl", "data.frame"))
You could also just do:
df_new <- cbind(df1, df2[,5])
but only if the rows of the two frames already line up; in the example data they do not (df1's first row is OOPA, while df2's first mass, 56, belongs to JKIO), so here this would pair the wrong values.
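For comparison, here is a base R sketch of the same long-format idea, using the df1/df2 from the DATA block (note that merge() sorts the result by P.X):
long2 <- data.frame(P.X = unlist(df2[1:4]), mass = rep(df2$mass, 4))
merge(df1, long2[long2$P.X != "", ], by = "P.X")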

applying a function across columns by extracting similar column names

My data looks like:
[[1]]
date germany france germany_mean france_mean germany_sd france_sd
1 2016-01-01 17 25 21.29429 48.57103 30.03026 47.05169
What I am trying to do is compute the following calculation over all the list elements using map.
germany_calc = (germany - germany_mean) / germany_sd
france_calc = (france - france_mean) / france_sd
However the number of columns can change - here there are two categories/countries but in another list there could be 1 or 3 or N. The countries always follow the same structure. That is,
"country1", "country2", ... , "countryN", "country1_mean", "country2_mean", ... , "countryN_mean", "country1_sd", "country2_sd", ... , "countryN_sd".
Expected Output (for the first list):
Germany: -0.1429988 = (17 - 21.29429) / 30.03026
France: -0.5009603 = (25 - 48.57103) / 47.05169
EDIT: Apologies - expected output:
-0.1429988
-0.5009603
Function:
Scale_Me <- function(x){
(x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
}
Data:
my_list <- list(structure(list(date = structure(16801, class = "Date"),
germany = 17, france = 25, germany_mean = 21.2942922374429,
france_mean = 48.5710301846855, germany_sd = 30.030258443028,
france_sd = 47.0516928425878), class = "data.frame", row.names = c(NA,
-1L)), structure(list(date = structure(16802, class = "Date"),
germany = 9, france = 29, germany_mean = 21.2993150684932,
france_mean = 48.5605316914534, germany_sd = 30.0286190461173,
france_sd = 47.0543871206842), class = "data.frame", row.names = c(NA,
-1L)), structure(list(date = structure(16803, class = "Date"),
germany = 8, france = 18, germany_mean = 21.2947488584475,
france_mean = 48.551889593794, germany_sd = 30.0297291333284,
france_sd = 47.0562416513092), class = "data.frame", row.names = c(NA,
-1L)), structure(list(date = structure(16804, class = "Date"),
germany = 3, france = 11, germany_mean = 21.2778538812785,
france_mean = 48.5382545766386, germany_sd = 30.0267943793948,
france_sd = 47.0607680244109), class = "data.frame", row.names = c(NA,
-1L)), structure(list(date = structure(16805, class = "Date"),
germany = 4, france = 13, germany_mean = 21.2614155251142,
france_mean = 48.5214531240057, germany_sd = 30.0269420596686,
france_sd = 47.0676011750263), class = "data.frame", row.names = c(NA,
-1L)), structure(list(date = structure(16806, class = "Date"),
germany = 4, france = 9, germany_mean = 21.253196347032,
france_mean = 48.5055948249362, germany_sd = 30.0292032528186,
france_sd = 47.0737183354519), class = "data.frame", row.names = c(NA,
-1L)))
Why not just rbind the thing?
with(do.call(rbind, my_list),
cbind(germany=(germany - germany_mean) / germany_sd,
france=(france - france_mean) / france_sd))
# germany france
# [1,] -0.1429988 -0.5009603
# [2,] -0.4095864 -0.4157005
# [3,] -0.4427196 -0.6492633
# [4,] -0.6087181 -0.7976550
# [5,] -0.5748642 -0.7546901
# [6,] -0.5745473 -0.8392283
The question is unclear on the exact form of output so we assume that what is wanted is a data frame with a column for date and a column for each country in which the country value is normalized. In this case it means we want 3 columns in the output.
1) pivot_longer/_wider Bind the my_list list components together creating a data frame with a row from each component. Then for each bare country name among the columns append _root to it so that every column name except date is of the form country_suffix. Then convert to long form, perform the normalization and convert back to wide form:
library(dplyr)
library(tidyr)
library(purrr)
my_list %>%
  bind_rows %>%
  set_names(names(.)[1], sub("^([^_]*)$", "\\1_root", names(.)[-1])) %>%
  pivot_longer(-date, names_to = c("country", ".value"), names_sep = "_") %>%
  mutate(root = (root - mean) / sd) %>%
  pivot_wider(id_cols = "date", names_from = "country", values_from = "root")
giving:
# A tibble: 6 x 3
date germany france
<date> <dbl> <dbl>
1 2016-01-01 -0.143 -0.501
2 2016-01-02 -0.410 -0.416
3 2016-01-03 -0.443 -0.649
4 2016-01-04 -0.609 -0.798
5 2016-01-05 -0.575 -0.755
6 2016-01-06 -0.575 -0.839
2) Base R
After rbinding the list components together, giving d, we pick out the country names, nms, as those names not containing an underscore, excluding the first such name (which is date). Then we perform the normalization and cbind the date column to the result.
d <- do.call("rbind", my_list)
nms <- grep("_", names(d), invert = TRUE, value = TRUE)[-1]
cbind(d[1], (d[nms] - d[paste0(nms, "_mean")]) / d[paste0(nms, "_sd")])
giving:
date germany france
1 2016-01-01 -0.1429988 -0.5009603
2 2016-01-02 -0.4095864 -0.4157005
3 2016-01-03 -0.4427196 -0.6492633
4 2016-01-04 -0.6087181 -0.7976550
5 2016-01-05 -0.5748642 -0.7546901
6 2016-01-06 -0.5745473 -0.8392283
Do you have to use map?
Here I get your desired output using two for loops instead of map
Result_list = vector("list", length(my_list))
for (i in 1:length(my_list))
{
  df = my_list[[i]]
  # identify the countries
  countries = colnames(df)[grep('mean', colnames(df))]
  countries = gsub("_mean", "", countries)
  df_result = NULL
  for (j in 1:length(countries))
  {
    country = countries[j]
    value_country = df[1, match(country, colnames(df))]
    mean_country = df[1, match(paste0(country, "_mean"), colnames(df))]
    sd_country = df[1, match(paste0(country, "_sd"), colnames(df))]
    result_country = (value_country - mean_country) / sd_country
    Sentence = paste0(country, ": ", round(result_country, 5), " = (", value_country, " - ", round(mean_country, 5), ") / ", round(sd_country, 5))
    df_result = c(df_result, Sentence)
  }
  Result_list[[i]] = df_result
}
And the output Result_list looks like:
> Result_list
[[1]]
[1] "germany: -0.143 = (17 - 21.29429) / 30.03026"
[2] "france: -0.50096 = (25 - 48.57103) / 47.05169"
[[2]]
[1] "germany: -0.40959 = (9 - 21.29932) / 30.02862"
[2] "france: -0.4157 = (29 - 48.56053) / 47.05439"
[[3]]
[1] "germany: -0.44272 = (8 - 21.29475) / 30.02973"
[2] "france: -0.64926 = (18 - 48.55189) / 47.05624"
[[4]]
[1] "germany: -0.60872 = (3 - 21.27785) / 30.02679"
[2] "france: -0.79765 = (11 - 48.53825) / 47.06077"
[[5]]
[1] "germany: -0.57486 = (4 - 21.26142) / 30.02694"
[2] "france: -0.75469 = (13 - 48.52145) / 47.0676"
[[6]]
[1] "germany: -0.57455 = (4 - 21.2532) / 30.0292"
[2] "france: -0.83923 = (9 - 48.50559) / 47.07372"
Is this what you are looking for?
EDIT: Extracting only results
For extracting only result values, you can do the following:
Df_result_value = NULL
for (i in 1:length(my_list))
{
  df = my_list[[i]]
  # identify the countries
  countries = colnames(df)[grep('mean', colnames(df))]
  countries = gsub("_mean", "", countries)
  for (j in 1:length(countries))
  {
    country = countries[j]
    value_country = df[1, match(country, colnames(df))]
    mean_country = df[1, match(paste0(country, "_mean"), colnames(df))]
    sd_country = df[1, match(paste0(country, "_sd"), colnames(df))]
    result_country = (value_country - mean_country) / sd_country
    Df_result_value = rbind(Df_result_value, c(country, result_country))
  }
}
Df_result_value = data.frame(Df_result_value)
colnames(Df_result_value) = c("Country", "Result")
And get this output:
> Df_result_value
Country Result
1 germany -0.142998843835787
2 france -0.500960300483614
3 germany -0.409586436512588
4 france -0.415700488060442
5 germany -0.442719572974515
6 france -0.649263275639099
7 germany -0.608718121899195
8 france -0.797654950237258
9 germany -0.574864249939699
10 france -0.754690110335453
11 germany -0.574547256608035
12 france -0.839228262008441
We can use transform as well in base R
transform(do.call(rbind, my_list),
germany = (germany - germany_mean)/germany_sd,
france = (france - france_mean)/france_sd)[c('date', 'germany', 'france')]
# date germany france
#1 2016-01-01 -0.1429988 -0.5009603
#2 2016-01-02 -0.4095864 -0.4157005
#3 2016-01-03 -0.4427196 -0.6492633
#4 2016-01-04 -0.6087181 -0.7976550
#5 2016-01-05 -0.5748642 -0.7546901
#6 2016-01-06 -0.5745473 -0.8392283
Or in dplyr, without any reshaping, this can be done
library(dplyr)
bind_rows(my_list) %>%
transmute(date,
germany = (germany - germany_mean)/germany_sd,
france = (france - france_mean)/france_sd)
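And if you do want the map-based version the question asked about, the base R indexing idea from (2) above drops straight into purrr::map; a sketch, assuming every list element follows the country/country_mean/country_sd naming pattern:
library(purrr)
map(my_list, function(d) {
  nms <- grep("_", names(d), invert = TRUE, value = TRUE)[-1]  # bare country names
  cbind(d["date"], (d[nms] - d[paste0(nms, "_mean")]) / d[paste0(nms, "_sd")])
})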

grouping a character vector into new groups using dplyr

I have a data frame that looks like this:
# A tibble: 5 x 5
# Groups: Trial [1]
GID Trial pop `1A-1145442` `1A-1158042`
<chr> <chr> <chr> <int> <int>
GID421213 ES1 ES1-5 12 11
GID419903 ES1 ES1-5 22 12
GID3881 ES1 ES1-5 22 22
GID13646 ES1 ES1-5 12 12
GID418846 ES1 ES1-5 22 11
Here is a dput of it :
structure(list(GID = c("GID421213", "GID419903", "GID3881", "GID13646",
"GID418846"), Trial = c("ES1", "ES1", "ES1", "ES1", "ES1"), pop = c("ES1-5",
"ES1-5", "ES1-5", "ES1-5", "ES1-5"), `1A-1145442` = c(12L, 22L,
22L, 12L, 22L), `1A-1158042` = c(11L, 12L, 22L, 12L, 11L)), row.names =
c(NA, -5L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), vars =
"Trial", drop = TRUE, indices = list(0:4), group_sizes = 5L,
biggest_group_size = 5L, labels = structure(list(Trial = "ES1"), row.names
= c(NA, -1L), class = "data.frame", vars = "Trial", drop = TRUE))
I want to perform a regrouping transformation into a new column from the Trial column, just as I did in the past with the pop column using regex operations, but now with dplyr. The Trial column consists of ES values from 1 to 38: I would like to group them in this fashion: ES1-3, ES4-6, ES7-9 and so forth, using the dplyr package. I know I could start with df %>% group_by(Trial) but from there on I have no idea how I could operate.
library(dplyr)
df %>%
mutate(pop2 = case_when(
Trial == "ES1" | Trial == "ES2" | Trial == "ES3" ~ "ES1-3",
Trial == "ES4" | Trial == "ES5" | Trial == "ES6" ~ "ES4-6"
))
Will return
# A tibble: 5 x 6
# Groups: Trial [1]
GID Trial pop `1A-1145442` `1A-1158042` pop2
<chr> <chr> <chr> <int> <int> <chr>
1 GID421213 ES1 ES1-5 12 11 ES1-3
2 GID419903 ES1 ES1-5 22 12 ES1-3
3 GID3881 ES1 ES1-5 22 22 ES1-3
4 GID13646 ES1 ES1-5 12 12 ES1-3
5 GID418846 ES1 ES1-5 22 11 ES1-3
Given
(df <- data.frame(Trial = paste0("ES", 1:10)))
# Trial
# 1 ES1
# 2 ES2
# 3 ES3
# 4 ES4
# 5 ES5
# 6 ES6
# 7 ES7
# 8 ES8
# 9 ES9
# 10 ES10
We may, using base R, do
size <- 3
groups <- (as.numeric(substring(df$Trial, 3)) - 1) %/% size
(df$newCol <- sprintf("ES%d-%d", 1 + groups * size, size * (1 + groups)))
# [1] "ES1-3" "ES1-3" "ES1-3" "ES4-6" "ES4-6" "ES4-6" "ES7-9" "ES7-9"
# [9] "ES7-9" "ES10-12"
Here as.numeric(substring(df$Trial, 3)) gets the numeric part of df$Trial and converts it to a numeric vector. Subtracting 1 and using %/% then returns the group number for each element of df$Trial, starting from 0. Given a group number, we can easily construct a new column with sprintf.
size is the size of groups. E.g., setting size <- 5 would give values ES1-5, ES6-10, and so on.
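For instance, with the df above:
size <- 5
groups <- (as.numeric(substring(df$Trial, 3)) - 1) %/% size
sprintf("ES%d-%d", 1 + groups * size, size * (1 + groups))
# [1] "ES1-5"  "ES1-5"  "ES1-5"  "ES1-5"  "ES1-5"  "ES6-10" "ES6-10" "ES6-10"
# [9] "ES6-10" "ES6-10"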
Here's a solution that uses parse_number from readr.
library(readr)
df %>%
  mutate(grp = cut(parse_number(Trial),
                   breaks = seq(1, 38, by = 3),
                   right = FALSE)) %>%
  group_by(grp)
This pulls out the number from Trial then cuts to create a grouping variable, which it then groups by. right=FALSE indicates that the interval is closed on the left.
An edit based on a comment below.
df %>%
  mutate(grp = cut(parse_number(Trial),
                   breaks = c(seq(1, 34, by = 3), 38),
                   right = FALSE,
                   include.lowest = TRUE)) %>%
  group_by(grp)
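A quick sanity check on toy data (a sketch, assuming dplyr and readr are loaded):
toy <- data.frame(Trial = paste0("ES", 1:10))
toy %>%
  mutate(grp = cut(parse_number(Trial),
                   breaks = c(seq(1, 34, by = 3), 38),
                   right = FALSE, include.lowest = TRUE)) %>%
  count(grp)
# grp       n
# [1,4)     3
# [4,7)     3
# [7,10)    3
# [10,13)   1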

compute an average of the last two columns which differ for all subjects

I'm an R beginner and this is my first post here. I'm struggling with a problem and would love your advice. Basically, I have a dataset with 3 sets of columns that I need to manipulate together to obtain the desired outcome, which is an average of the 2 most recent high-quality observations, where those observations must occur after a cutoff date (say, 3/15/2018). What makes it complex is that the columns that go into the average differ for each case.
The first set of data columns has to do with the number of observations each case has, so subject one has 2 observations, subject two has 3, etc.
The second set of columns describes the data quality of each observation. So, for example, subject 1 has two good observations, whereas subject 2 has bad data quality for the first observation and good data quality for the two later ones, and subject 3 has three observations of good quality and one observation (obs_3) of bad data quality.
The third set of columns specify the dates of the observations.
subject_id obs_1 obs_2 obs_3 obs_4 obs_1_dq obs_2_dq obs_3_dq obs_4_dq obs_1_date obs_2_date obs_3_date obs_4_date desired.average
1 1 5 6 NA NA TRUE TRUE NA NA 2018-02-01 2018-03-16 <NA> <NA> NA
2 2 6 8 11 NA FALSE TRUE TRUE NA 2018-02-18 2018-03-16 2018-04-10 <NA> 9.5
3 3 7 9 12 15 TRUE TRUE FALSE TRUE 2018-02-15 2018-03-18 2018-04-02 2018-04-10 12.0
4 4 3 4 8 15 TRUE TRUE TRUE TRUE 2018-02-16 2018-03-08 2018-03-10 2018-03-15 NA
In order to compute an average of the TWO latest observations that are of good data quality:
I must first decide which observations are of good quality,
Then compute an average of the two most recent such observations (it has to be an average of exactly 2 observations), and they must occur after 3/15.
Below is my sample dataset. I've tried to do this manually in Excel and it was really painstaking. I'm hoping to do this in R and would very much appreciate your feedback. Thank you!
Here is my sample dataset:
> dput(head(df,5))
structure(list(subject_id = c(1, 2, 3, 4), obs_1 = c(5, 6, 7,
3), obs_2 = c(6, 8, 9, 4), obs_3 = c(NA, 11, 12, 8), obs_4 = c(NA,
NA, 15, 15), obs_1_dq = c(TRUE, FALSE, TRUE, TRUE), obs_2_dq = c(TRUE,
TRUE, TRUE, TRUE), obs_3_dq = c(NA, TRUE, FALSE, TRUE), obs_4_dq = c(NA,
NA, TRUE, TRUE), obs_1_date = structure(c(17563, 17580, 17577,
17578), class = "Date"), obs_2_date = structure(c(17606, 17606,
17608, 17598), class = "Date"), obs_3_date = structure(c(NA,
17631, 17623, 17600), class = "Date"), obs_4_date = structure(c(NA,
NA, 17631, 17605), class = "Date"), desired.average = c(NA, 9.5,
12, NA)), .Names = c("subject_id", "obs_1", "obs_2", "obs_3",
"obs_4", "obs_1_dq", "obs_2_dq", "obs_3_dq", "obs_4_dq", "obs_1_date",
"obs_2_date", "obs_3_date", "obs_4_date", "desired.average"), row.names
= c(NA,
4L), class = "data.frame")
This should also work, and though a bit verbose it doesn't rely on column indices, so should be robust:
library(dplyr)
library(tidyr)
num_date <- as.numeric(as.Date("2018-03-15"))
df <- df[,-ncol(df)]
df_join <- df %>%
  gather(Obs, value, 2:ncol(df)) %>%
  mutate(
    nr = as.numeric(gsub("[^\\d]", "", Obs, perl = TRUE))
  ) %>%
  group_by(subject_id, nr) %>%
  filter(!(is.na(value) | (grepl("_dq", Obs) & value == 0) | any(value[grepl("_date", Obs)] <= num_date))) %>%
  ungroup() %>%
  group_by(subject_id, Obs) %>%
  filter(!row_number() < (max(row_number() - 1))) %>%
  ungroup() %>%
  group_by(subject_id) %>%
  mutate(
    desired.average = mean(value[grepl("_date|_dq", Obs) == FALSE], na.rm = TRUE)
  ) %>%
  filter(!max(row_number()) == 3) %>%
  distinct(subject_id, desired.average)
df <- left_join(df, df_join)
Result:
subject_id obs_1 obs_2 obs_3 obs_4 obs_1_dq obs_2_dq obs_3_dq obs_4_dq obs_1_date obs_2_date
1 1 5 6 NA NA TRUE TRUE NA NA 2018-02-01 2018-03-16
2 2 6 8 11 NA FALSE TRUE TRUE NA 2018-02-18 2018-03-16
3 3 7 9 12 15 TRUE TRUE FALSE TRUE 2018-02-15 2018-03-18
4 4 3 4 8 15 TRUE TRUE TRUE TRUE 2018-02-16 2018-03-08
obs_3_date obs_4_date desired.average
1 <NA> <NA> NA
2 2018-04-10 <NA> 9.5
3 2018-04-02 2018-04-10 12.0
4 2018-03-10 2018-03-15 NA
See if this works for you. Code is annotated briefly.
df=structure(list(subject_id = c(1, 2, 3, 4), obs_1 = c(5, 6, 7,
3), obs_2 = c(6, 8, 9, 4), obs_3 = c(NA, 11, 12, 8), obs_4 = c(NA,
NA, 15, 15), obs_1_dq = c(TRUE, FALSE, TRUE, TRUE), obs_2_dq = c(TRUE,
TRUE, TRUE, TRUE), obs_3_dq = c(NA, TRUE, FALSE, TRUE), obs_4_dq = c(NA, NA, TRUE, TRUE), obs_1_date = structure(c(17563, 17580, 17577,
17578), class = "Date"), obs_2_date = structure(c(17606, 17606,
17608, 17598), class = "Date"), obs_3_date = structure(c(NA,
17631, 17623, 17600), class = "Date"), obs_4_date = structure(c(NA,
NA, 17631, 17605), class = "Date"), desired.average = c(NA, 9.5,
12, NA)), .Names = c("subject_id", "obs_1", "obs_2", "obs_3",
"obs_4", "obs_1_dq", "obs_2_dq", "obs_3_dq", "obs_4_dq", "obs_1_date",
"obs_2_date", "obs_3_date", "obs_4_date", "desired.average"), row.names
= c(NA, 4L), class = "data.frame")
# separate each section
obs=df[,2:5]
dq=df[, 6:9]
dt=sapply(df[, 10:13], as.numeric) # for easier calculations
# remove bad quality
obs[dq==F]=NA
# remove dates on or before 2018-03-15
obs[dt - as.numeric(as.Date("2018-03-15")) <= 0] = NA
# only leave two most recent dates
dt[is.na(obs)]=NA
dt=t(apply(dt,1,function(x){x[x<max(x[x!=max(x, na.rm=T)],na.rm=T)]=NA;x}))
obs[is.na(dt)]=NA
# average
df$avg=apply(obs,1,function(x)ifelse(sum(!is.na(x))>=2, mean(x,na.rm=T), NA))
df
Edits:
Explanation
dt=t(apply(dt,1, function(x){x[x<max(x[x!=max(x, na.rm=T)],na.rm=T)]=NA;x}))
I think x[x < max(x[x != max(x, na.rm=T)], na.rm=T)] = NA might be a little confusing. na.rm=T means NA values are ignored. max(x[x != max(x)]) is the second-largest number. So x[x < 2nd_largest_num] = NA removes every number except the largest and the second-largest. This function is then applied to the data frame row-wise. The final result is that dt contains only the two largest numbers in each row (the most recent dates in numeric format). All "discarded" values (NA in dt) are then removed from obs in the next line, obs[is.na(dt)]=NA. After all this, obs only contains the two most recent values in each row.
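A tiny illustration of that line on a single toy row:
x <- c(17563, 17606, 17631, 17580)
x[x < max(x[x != max(x, na.rm = TRUE)], na.rm = TRUE)] <- NA
x
# [1]    NA 17606 17631    NA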
