Difference between first non-NA and last non-NA in each row in R

I have a data frame with up to 5 measurements (x) and their corresponding time:
df = structure(list(x1 = c(92.9595722286402, 54.2085219673818,
46.3227062573019,
NA, 65.1501442134141, 49.736451235317), time1 = c(43.2715277777778,
336.625, 483.975694444444, NA, 988.10625, 510.072916666667),
x2 = c(82.8368681534474, 53.7981639701784, 12.9993531230419,
NA, 64.5678816290574, 55.331442940348), time2 = c(47.8166666666667,
732, 506.747222222222, NA, 1455.25486111111, 958.976388888889
), x3 = c(83.5433119686794, 65.723072881366, 19.0147593408309,
NA, 65.1989838202356, 36.7000828457705), time3 = c(86.5888888888889,
1069.02083333333, 510.275, NA, 1644.21527777778, 1154.95694444444
), x4 = c(NA, 66.008102917677, 40.6243513885846, NA, 62.1694420909955,
29.0078249523063), time4 = c(NA, 1379.22986111111, 520.726388888889,
NA, 2057.20833333333, 1179.86805555556), x5 = c(NA, 61.0047472617535,
45.324715258421, NA, 59.862110645527, 45.883161439362), time5 = c(NA,
1825.33055555556, 523.163888888889, NA, 3352.26944444444,
1364.99513888889)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -6L))
"NA" means that the person (row) didn't have a measurement.
I would like to calculate the difference between the last existing measurement and the first one.
So for the first row it would be x3 minus x1 (-9.4), for the second 6.8, and so on.
I tried something like this, which didn't work (head() and tail() ignore the na.rm argument and return whole vectors rather than single values):
df$diff = apply(df %>% select(., contains("x")), 1, function(x) head(x,
na.rm = T) - tail(x, na.rm=T))
Any suggestions? Also, is apply/rowwise the most efficient way, or is there a vectorized function to do that?

A vectorized way would be to use max.col(), where we get the positions of the "first" and "last" non-NA values per row via the ties.method argument:
# get the "x" columns, then the column index of the first and last non-NA value per row
x_cols <- grep("^x", names(df))
first_col <- max.col(!is.na(df[x_cols]), ties.method = "first")
last_col <- max.col(!is.na(df[x_cols]), ties.method = "last")
# subset the dataframe to include only the "x" cols
new_df <- as.data.frame(df[x_cols])
# subtract the first non-NA value from the last one
df$new_calc <- new_df[cbind(1:nrow(df), last_col)] -
new_df[cbind(1:nrow(df), first_col)]
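With the sample data this yields the following differences (rounded; they match the new_col column in the tidyverse answer further down):
round(df$new_calc, 2)
# [1] -9.42  6.80 -1.00    NA -5.29 -3.85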
Using apply() you could do:
x_cols <- grep("^x", names(df))
df$new_calc <- apply(df[x_cols], 1, function(x) {
  new_x <- x[!is.na(x)]
  if (length(new_x) > 0) new_x[length(new_x)] - new_x[1L] else NA
})

We can use tidyverse methods on the tbl_df: create a row-names column (rownames_to_column), gather the 'x' columns into 'long' format while removing the NA elements (na.rm = TRUE), group by the row name, take the difference between the first and last 'val'ues, and bind the extracted column back to the original dataset 'df':
library(tidyverse)
rownames_to_column(df, 'rn') %>%
  select(rn, starts_with('x')) %>%
  gather(key, val, -rn, na.rm = TRUE) %>%
  group_by(rn) %>%
  summarise(Diff = diff(c(first(val), last(val)))) %>%
  mutate(rn = as.numeric(rn)) %>%
  complete(rn = min(rn):max(rn)) %>%
  pull(Diff) %>%
  bind_cols(df, new_col = .)
# A tibble: 6 x 11
# x1 time1 x2 time2 x3 time3 x4 time4 x5 time5 new_col
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 93.0 43.3 82.8 47.8 83.5 86.6 NA NA NA NA -9.42
#2 54.2 337. 53.8 732 65.7 1069. 66.0 1379. 61.0 1825. 6.80
#3 46.3 484. 13.0 507. 19.0 510. 40.6 521. 45.3 523. -0.998
#4 NA NA NA NA NA NA NA NA NA NA NA
#5 65.2 988. 64.6 1455. 65.2 1644. 62.2 2057. 59.9 3352. -5.29
#6 49.7 510. 55.3 959. 36.7 1155. 29.0 1180. 45.9 1365. -3.85

Related

How to cut a dataframe within a list after a certain marker in R?

I would like to cut my dataframes after a certain marker. That is, after the first time TRUE shows up 3 or more times in a row in V1 (= the marker), I would like to cut each dataframe within the list and take the following 4 rows as my new dataframe within the list.
library(dplyr)
library(tibble)      # add_column()
library(purrr)       # map()
library(data.table)  # rbindlist(), rleid()
set.seed(94756)
mat1 <- matrix(sample(seq(-1,100, 0.11),70, replace = TRUE),ncol = 5)
mat1 <- as_tibble(mat1)
mat2 <- matrix(sample(seq(-1,100, 0.11),70, replace = TRUE),ncol = 5)
mat2 <- as_tibble(mat2)
mat2[3,1] <- NA
mat2[6,1] <- NA
mat3 <- matrix(sample(seq(-1,100, 0.11), 70,replace = TRUE),ncol = 5)
mat3 <- as_tibble(mat3)
mat3[4,1] <- NA
data <- list(mat1, mat2, mat3)
data1 <- map(data, ~add_column(., V1_logical = between(.$V1, 20, 80), .after = 'V1'))
r_pre <- lapply(data1, "[", 2)
Maybe it is helpful to add an ID column for each dataframe within the list
r_pre1 <- rbindlist(r_pre, idcol = "ID")
r_pre1 <- split(r_pre1, r_pre1$ID)
So the result should be like:
mat1re <- data.frame(V1 = c(93.16, 47.18, 12.86, 38.71),
V2 = c(56.75, 57.85, 18.69, 3.18),
V3 = c(-0.01, 14.95, 46.08, 96.46),
V4 = c(20.89, 32.55, 91.73, 58.73),
V5 = c(66.54, 56.75, 92.94, 77.54))
mat2re <- data.frame(V1 = c(87.99, 53.23, 40.36, 0.65),
V2 = c(89.42, 81.28, 36.84, 73.58),
V3 = c(89.86, 78.75, 76.77, 61.81),
V4 = c(47.18, 22.98, 34.64, 25.18),
V5 = c(18.69, 77.21, 58.29, 94.04))
mat3re <- data.frame(V1 = c(81.50, 43.55, 54.55, 9.45),
V2 = c(33.21, 70.83, 21.66, 88.10),
V3 = c(72.15, -0.45, 11.65, 15.06),
V4 = c(47.07, 47.95, 88.10, 81.50),
V5 = c(80.07, 67.75, 14.84, 10.33))
result <- list(mat1re, mat2re, mat3re)
What I've tried already:
data2 <- lapply(data1, function(x) {x$V1_logical[x$V1_logical== TRUE] <- 1; x})
data3 <- lapply(data2, function(x) {x$V1_logical[x$V1_logical== FALSE] <- 0; x})
data4 <- map(data3, ~add_column(., ind = rleid(.$V1_logical), .after = "V1_logical"))
So in data4 the task is to find the marker ($V1_logical == 1 and an $ind value that shows up >= 3 times consecutively, e.g. 5, 5, 5) and cut away everything before and including the marker; in other words, start the new dataframes right after the marker.
The following code is also close, but it doesn't cut out the beginning (including the marker) when NAs are present in the data. Have a look at the second list element here: the beginning and the marker are not removed.
matrix_final <- map(data, ~ .x %>%
mutate(V1_logical = between(V1, 20, 80), ind = rleid(V1_logical), .after = "V1") %>%
group_by(ind) %>%
mutate(rn = if(n() >=3 && first(V1_logical)) row_number() else NA_integer_) %>%
ungroup %>%
slice(seq(max(which.max(rn) + 1, 1, replace_na = TRUE), length.out = 4)) %>%
select(-ind, -rn) %>%
mutate(across(everything(), round, digits = 2)))
print(matrix_final[[2]])
Thanks in advance!
We may loop over the list with map(), create the logical column on 'V1' with between(), create a grouping column with rleid() (which returns a sequence that increments whenever the value changes between adjacent elements), and slice the rows based on the condition.
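For instance, a small illustration of what rleid() returns for a logical vector, before the full pipeline below:
data.table::rleid(c(TRUE, TRUE, FALSE, TRUE, TRUE, TRUE))
# [1] 1 1 2 3 3 3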
library(dplyr)
library(purrr)
library(data.table)
library(tidyr)
map(data, ~ .x %>%
  mutate(V1_logical = replace_na(between(V1, 20, 80), FALSE),
         ind = rleid(V1_logical), .after = "V1") %>%
  group_by(ind) %>%
  mutate(rn = if (n() >= 3 && first(V1_logical)) row_number() else NA_integer_) %>%
  ungroup %>%
  slice(seq(max(which.max(rn) + 1, 1, na.rm = TRUE), length.out = 4)) %>%
  select(-ind, -rn, -V1_logical) %>%
  mutate(across(everything(), round, digits = 2)))
Output:
[[1]]
# A tibble: 4 × 5
V1 V2 V3 V4 V5
<dbl> <dbl> <dbl> <dbl> <dbl>
1 93.2 56.8 -0.0100 20.9 66.5
2 47.2 57.8 15.0 32.6 56.8
3 12.9 18.7 46.1 91.7 92.9
4 38.7 3.18 96.5 58.7 77.5
[[2]]
# A tibble: 4 × 5
V1 V2 V3 V4 V5
<dbl> <dbl> <dbl> <dbl> <dbl>
1 88.0 89.4 89.9 47.2 18.7
2 53.2 81.3 78.8 23.0 77.2
3 40.4 36.8 76.8 34.6 58.3
4 0.65 73.6 61.8 25.2 94.0
[[3]]
# A tibble: 4 × 5
V1 V2 V3 V4 V5
<dbl> <dbl> <dbl> <dbl> <dbl>
1 81.5 33.2 72.2 47.1 80.1
2 43.6 70.8 -0.45 48.0 67.8
3 54.6 21.7 11.6 88.1 14.8
4 9.45 88.1 15.1 81.5 10.3
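As a quick sanity check (a sketch, assuming the map() pipeline above has been assigned to res), the cleaned list can be compared element by element against the expected result list from the question; with the question's seed, each comparison should come out TRUE:
mapply(function(got, want) {
  isTRUE(all.equal(as.data.frame(got), want, check.attributes = FALSE))
}, res, result)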

Simple but not easy merge task

I have two incomplete dataframes (df_a, df_b): columns are missing or contain NA values. "by" is the merge key, and df_a has "priority" over df_b.
df_a = structure(list(Datum = structure(c(1635163200, 1635166800, 1635170400, 1635174000), class = c("POSIXct", "POSIXt")), Vorhersage = c(10.297922, 10.155121, 10.044135, 9.699513), Export = c("10.912", "10.47", NA, NA), color = c("rgb(0,128,0)", "rgb(0,128,0)", NA, NA), Status = c("ok", "ok", NA, NA), Plausibilität = c("4", "4", NA, NA), min = c(7.93000000000001, 9.4, 8.7, 8.3), max = c(12.31715325, 12.42822725, 12.51326325, 12.28620625)), row.names = c(NA, -4L), class = "data.frame")
df_b = structure(list(Datum = structure(c(1632510000, 1632513600, 1632517200, 1632520800), class = c("POSIXct", "POSIXt")), Vorhersage = c(14.821988, 14.832919, 14.706179, 14.573527), Referenz = c(16.6, 16.2, 15.9, 16), DWD_Name = c("Elpersbüttel", "Elpersbüttel", "Elpersbüttel", "Elpersbüttel"), Export = c(17.198, 16.713, 16.378, 16.358), color = c("rgb(0,128,0)", "rgb(0,128,0)", "rgb(0,128,0)", "rgb(0,128,0)"), Status = c("ok", "ok", "ok", "ok"), Plausibilität = c(4, 4, 4, 4), min = c(13.05, 12.808, 11.631891, 12.312), max = c(17, 17, 16.9, 16.7)), row.names = c(NA, -4L), class = "data.frame")
desired output is:
Datum Vorhersage Export color Status Plausibilität min max Referenz
1 2021-10-25 14:00:00 10.3 10.912 rgb(0,128,0) ok 4 7.9 12 NA
2 2021-10-25 15:00:00 10.2 10.47 rgb(0,128,0) ok 4 9.4 12 NA
3 2021-10-25 16:00:00 10.0 <NA> <NA> <NA> <NA> 8.7 13 NA
4 2021-10-25 17:00:00 9.7 <NA> <NA> <NA> <NA> 8.3 12 NA
5 2021-09-24 21:00:00 14.8 17.198 rgb(0,128,0) ok 4 13.1 17 17
6 2021-09-24 22:00:00 14.8 16.713 rgb(0,128,0) ok 4 12.8 17 16
7 2021-09-24 23:00:00 14.7 16.378 rgb(0,128,0) ok 4 11.6 17 16
8 2021-09-25 00:00:00 14.6 16.358 rgb(0,128,0) ok 4 12.3 17 16
DWD_Name
1 <NA>
2 <NA>
3 <NA>
4 <NA>
5 Elpersbüttel
6 Elpersbüttel
7 Elpersbüttel
8 Elpersbüttel
# for rebuild:
structure(list(Datum = structure(c(1635163200, 1635166800, 1635170400,
1635174000, 1632510000, 1632513600, 1632517200, 1632520800), class = c("POSIXct",
"POSIXt")), Vorhersage = c(10.297922, 10.155121, 10.044135, 9.699513,
14.821988, 14.832919, 14.706179, 14.573527), Export = c("10.912",
"10.47", NA, NA, "17.198", "16.713", "16.378", "16.358"), color = c("rgb(0,128,0)",
"rgb(0,128,0)", NA, NA, "rgb(0,128,0)", "rgb(0,128,0)", "rgb(0,128,0)",
"rgb(0,128,0)"), Status = c("ok", "ok", NA, NA, "ok", "ok", "ok",
"ok"), Plausibilität = c("4", "4", NA, NA, "4", "4", "4", "4"
), min = c(7.93000000000001, 9.4, 8.7, 8.3, 13.05, 12.808, 11.631891,
12.312), max = c(12.31715325, 12.42822725, 12.51326325, 12.28620625,
17, 17, 16.9, 16.7), Referenz = c(NA, NA, NA, NA, 16.6, 16.2,
15.9, 16), DWD_Name = c(NA, NA, NA, NA, "Elpersbüttel", "Elpersbüttel",
"Elpersbüttel", "Elpersbüttel")), row.names = c(NA, -8L), class = "data.frame")
Thanks to the help of @r2evans I tried the following:
by = "Datum"
library(data.table)
colnms <- setdiff(intersect(names(df_a), names(df_b)), by)
setDT(df_a)
setDT(df_b)
merge(df_a, df_b, by = by, all = TRUE
)[, (colnms) := lapply(colnms, function(nm) fcoalesce(.SD[[paste0(nm, ".x")]], .SD[[paste0(nm, ".y")]]))
][, c(outer(colnms, c(".x", ".y"), paste0)) := NULL ][]
but I get the following error:
Error in fcoalesce(.SD[[paste0(nm, ".x")]], .SD[[paste0(nm, ".y")]]) :
Item 2 is type double but the first item is type character. Please coerce
Most of the other answers are good, but many either over-complicate the result (in my opinion) or they perform a left or right join, not the full join as expected in the OP.
Here's a quick solution that uses dynamic column names, illustrated on a minimal example whose key column is literally named "by" and whose only shared data column is "a".
library(data.table)
colnms <- setdiff(intersect(names(df_a), names(df_b)), "by")
colnms
# [1] "a"
setDT(df_a)
setDT(df_b)
merge(df_a, df_b, by = "by", all = TRUE
)[, (colnms) := lapply(colnms, function(nm) fcoalesce(.SD[[paste0(nm, ".x")]], .SD[[paste0(nm, ".y")]]))
][, c(outer(colnms, c(".x", ".y"), paste0)) := NULL ][]
# by b c a
# <num> <num> <num> <num>
# 1: 1 1 NA 1
# 2: 2 NA 2 2
# 3: 3 3 3 3
# 4: 4 NA 4 4
Notes:
the normal data.table X[Y] bracket join is one-sided only, so we need data.table's merge() method in order to get a full join with all = TRUE;
because it's using merge, the repeated columns get the .x and .y suffixes, something we can easily capitalize on;
the canonical and most performant way when using (colnms) := ... is to also include .SDcols = colnms, but that won't work here since we need the suffixed columns, not the colnms columns themselves; this is a slight performance penalty but certainly not an anti-pattern (I believe) given what we need to do. And since we could have more than one duplicated column, we have to be careful to handle one pair at a time, not all of them at once;
the last [-block (using outer) removes the duplicated columns; without it, the output would have the column names c("by", "a.x", "b", "a.y", "c", "a"). It uses outer() because that's a straightforward way to take one or more colnms and append .x and .y to each of them; it then uses data.table's := NULL shortcut for removing one or more columns.
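Applied to the question's real data, the OP's error ("Item 2 is type double but the first item is type character") comes from mixed column types: Export and Plausibilität are character in df_a but numeric in df_b, so fcoalesce() refuses to combine them. A minimal sketch of the same approach that first coerces such mismatched pairs to a common type (character is assumed acceptable here) could look like this:
library(data.table)
by <- "Datum"
colnms <- setdiff(intersect(names(df_a), names(df_b)), by)
setDT(df_a); setDT(df_b)
# align types: coerce a shared column to character in both tables
# whenever it is character in one table but not the other
for (nm in colnms) {
  if (is.character(df_a[[nm]]) != is.character(df_b[[nm]])) {
    set(df_a, j = nm, value = as.character(df_a[[nm]]))
    set(df_b, j = nm, value = as.character(df_b[[nm]]))
  }
}
out <- merge(df_a, df_b, by = by, all = TRUE)
out[, (colnms) := lapply(colnms, function(nm)
  fcoalesce(.SD[[paste0(nm, ".x")]], .SD[[paste0(nm, ".y")]]))]
out[, c(outer(colnms, c(".x", ".y"), paste0)) := NULL][]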
This isn't the most elegant, but you can make a function that applies your rule to coalesce the values if they occur in both data frames.
# find the unique column names (not called "by")
cols <- union(names(df_a),names(df_b))
cols <- cols[!(cols == "by")]
# merge the data sets
df_merge <- merge(df_a, df_b, by = "by", all = TRUE)
# function to check for the base column names that now have a '.x' and
# a '.y' version. for the columns, fill in the NAs from '.x' with the
# value from '.y'
col_val <- function(col_base, df) {
x <- names(df)
if (all(paste0(col_base, c(".x", ".y")) %in% x)) {
na.x <- is.na(df[[paste0(col_base, ".x")]])
df[[paste0(col_base, ".x")]][na.x] <- df[[paste0(col_base, ".y")]][na.x]
df[[paste0(col_base, ".x")]]
} else {
df[[col_base]]
}
}
# apply this function to every column
cbind(df_merge["by"], sapply(cols, col_val, df = df_merge))
This will give the following result.
by a b c
1 1 1 1 NA
2 2 2 NA 2
3 3 3 3 3
4 4 4 NA 4
I know you specified base, but the natural_join() function is worth mentioning.
library(rqdatatable)
natural_join(df_a, df_b, by = "by", jointype = "FULL")
This gives exactly what you want.
by a b c
1 1 1 1 NA
2 2 2 NA 2
3 3 3 3 3
4 4 4 NA 4
Not a base R answer, but here is one possible solution with the data.table package.
library(data.table)
setDT(df_a)
setDT(df_b)
df_a <- rbind(df_a, list(4, NA, NA))
df_b <- rbind(list(1, NA, NA), df_b)
df_a[df_b, `:=` (a = fifelse(is.na(a), i.a, a), c = c), on = .(by)][]
#> by a b c
#> 1: 1 1 1 NA
#> 2: 2 2 NA 2
#> 3: 3 3 3 3
#> 4: 4 4 NA 4
Edit, with the help of @r2evans: a much more elegant and efficient solution (the i. prefix refers to columns of df_b, the i table in the join):
df_a[df_b, `:=` (a = fcoalesce(a, i.a), c = c), on = .(by)][]
#> by a b c
#> 1: 1 1 1 NA
#> 2: 2 2 NA 2
#> 3: 3 3 3 3
#> 4: 4 4 NA 4
Created on 2021-10-19 by the reprex package (v2.0.1)
Here is a dynamic solution. Not bad, but maybe someone knows how to speed it up.
get_complete_df <- function(df_a, df_b, by = "by") {
  df_a <- unique(df_a)
  df_b <- unique(df_b)
  nam_a <- names(df_a)[!(names(df_a) == by)]
  nam_b <- names(df_b)[!(names(df_b) == by)]
  nums_a <- unlist(lapply(df_a, is.numeric))
  nums_b <- unlist(lapply(df_b, is.numeric))
  nums <- unique(c(names(df_a)[nums_a], names(df_b)[nums_b]))
  ## try to supplement NAs in df_a with values from df_b
  x <- df_b[[by]][df_b[[by]] %in% df_a[[by]]]
  y <- nam_b[nam_b %in% nam_a]
  vna <- is.na(df_a[df_a[, 1] %in% x, y])
  df_a[df_a[, 1] %in% x, y][vna] <- df_b[df_b[, 1] %in% x, y][vna]
  ## get the complete set of columns and keys
  all_names <- c(by, unique(c(nam_a, nam_b)))
  all_by <- na.omit(unique(c(df_a[[by]], df_b[[by]])))
  ## build the empty output frame
  df_o <- as.data.frame(matrix(nrow = length(all_by), ncol = length(all_names)))
  names(df_o) <- all_names
  df_o[[by]] <- all_by
  ## fill in content
  df_o[df_o[, 1] %in% df_b[, 1], names(df_b)] <- df_b
  df_o[df_o[, 1] %in% df_a[, 1], names(df_a)] <- df_a  ## df_a has priority!
  ## fix numerics: some num fields get coerced to chr during the fill above
  df_o[, nums] <- as.data.frame(apply(df_o[, nums], 2, as.numeric))
  df_o
}
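Hypothetical usage with the question's data (assuming fresh data.frames, i.e. before any earlier setDT() calls, and Datum as the key):
df_full <- get_complete_df(df_a, df_b, by = "Datum")
head(df_full)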

R Aggregate over multiple columns

I'm currently working with a large dataframe of 75 columns and roughly 9,500 rows. This dataframe contains observations for every day from 1995 to 2019 for several observation points.
Edit: the output of dput(head(df)):
> dput(head(df))
structure(list(date = structure(c(9131, 9132, 9133, 9134, 9135,
9136), class = "Date"), x1 = c(50.75, 62.625, 57.25, 56.571,
36.75, 39.125), x2 = c(62.25, 58.714, 49.875, 56.375, 43.25,
41.625), x3 = c(90.25, NA, 70.125, 75.75, 83.286, 98.5),
x4 = c(60, 72, 68.375, 65.5, 63.25, 55.875), x5 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), xn = c(53.25,
61.143, 56.571, 58.571, 36.25, 44.375), year = c(1995, 1995, 1995, 1995,
1995, 1995), month = c(1, 1, 1, 1, 1, 1), day = c(1, 2, 3,
4, 5, 6)), row.names = c(NA, -6L), class = c("tbl_df", "tbl",
"data.frame"))
A sample from the dataframe looks like this:
date x1 x2 x3 x4 x5 xn year month day
<date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1995-01-01 50.8 62.2 90.2 60 NA 53.2 1995 1 1
2 1999-08-02 62.6 58.7 NA 72 NA 61.1 1999 8 2
3 2001-09-03 57.2 49.9 70.1 68.4 NA 56.6 2001 9 3
4 2008-05-04 56.6 56.4 75.8 65.5 NA 58.6 2008 5 4
5 2012-04-05 36.8 43.2 83.3 63.2 NA 36.2 2012 4 5
6 2019-12-31 39.1 41.6 98.5 55.9 NA 44.4 2019 12 31
str(df)
tibble [9,131 x 75] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
$ date : Date[1:9131], format: "1995-01-01" "1995-01-02" ...
$ x1 : num [1:9131] 50.8 62.6 57.2 56.6 36.8 ...
$ x2 : num [1:9131] 62.2 58.7 49.9 56.4 43.2 ...
 ... ($x3 through $xn omitted) ...
$ year : num [1:9131] 1995 1995 1995 1995 1995 ...
$ month : num [1:9131] 1 1 1 1 1 1 1 1 1 1 ...
$ day : num [1:9131] 1 2 3 4 5 6 7 8 9 10 ...
My goal is to get, for every observation point xn, the count of all observations per year that cross a certain limit.
So far I have tried to reach this with the aggregate() function.
To get the mean of every year I used the following command:
aggregate(list(df), by=list(year=df$year), mean, na.rm=TRUE)
This works perfectly: I get the mean for every year for every observation point.
To get the sum for one station I used the following code
aggregate(list(x1=df$x1), by=list(year=df$year), function(x) sum(rle(x)$values>120, na.rm=TRUE))
which results in this print:
year x1
1 1995 52
2 1996 43
3 1997 44
4 1998 42
5 1999 38
6 2000 76
7 2001 52
8 2002 58
9 2003 110
10 2004 34
11 2005 64
12 2006 46
13 2007 46
14 2008 17
15 2009 41
16 2010 30
17 2011 40
18 2012 47
19 2013 40
20 2014 21
21 2015 56
22 2016 27
23 2017 45
24 2018 22
25 2019 45
So far, so good. I know I could expand the code by adding (.., x2 = data$x2, x3 = data$x3, .. xn) to the list argument in the code above, which I tried, and it works.
But how do I get them all at once?
I tried the following codes:
aggregate(.~(date, year, month, day), by=list(year=df$year), function(x) sum(rle(x)$values>120, na.rm=TRUE))
Error: unexpected ',' in "aggregate(.~(date,"
aggregate(.~date+year+month+day, by=list(year=df$year), function(x) sum(rle(x)$values>120, na.rm=TRUE))
Error in as.data.frame.default(data, optional = TRUE) :
cannot coerce class ‘"function"’ to a data.frame
aggregate(. ~ date + year + month + day, data = df,by=list(year=df$year), function(x) sum(rle(x)$values>120, na.rm=TRUE))
Error in aggregate.data.frame(lhs, mf[-1L], FUN = FUN, ...) :
arguments must have the same length
But unfortunately none of them work. Could someone please give me a hint as to where my mistake is?
Here is an answer that uses base R. Since none of the values in the example data are above 120, we use a criterion of above 70 instead.
data <- structure(
list(
date = structure(c(9131, 9132, 9133, 9134, 9135,
9136), class = "Date"),
x1 = c(50.75, 62.625, 57.25, 56.571,
36.75, 39.125),
x2 = c(62.25, 58.714, 49.875, 56.375, 43.25,
41.625),
x3 = c(90.25, NA, 70.125, 75.75, 83.286, 98.5),
x4 = c(60, 72, 68.375, 65.5, 63.25, 55.875),
x5 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
xn = c(53.25,
61.143, 56.571, 58.571, 36.25, 44.375),
year = c(1995, 1995, 1995, 1995,
1995, 1995),
month = c(1, 1, 1, 1, 1, 1),
day = c(1, 2, 3,
4, 5, 6)
),
row.names = c(NA,-6L),
class = c("tbl_df", "tbl",
"data.frame"
))
First, we create a subset of the data that contains all columns containing x, and set them to TRUE or FALSE based on whether the value is greater than 70.
theCols <- data[,colnames(data)[grepl("x",colnames(data))]]
Second, we cbind() the year onto the matrix of logical values.
x_logical <- cbind(year = data$year,as.data.frame(apply(theCols,2,function(x) x > 70)))
Finally, we use aggregate across all columns other than year and sum the columns.
aggregate(x_logical[2:ncol(x_logical)],by = list(x_logical$year),sum,na.rm=TRUE)
...and the output:
Group.1 x1 x2 x3 x4 x5 xn
1 1995 0 0 5 1 0 0
Note that by using colnames() to extract the columns whose names contain x, and ncol() in the aggregate() call, we make this a general solution that will handle a varying number of x locations.
Two tidyverse solutions
A tidyverse solution to the same problem is as follows. It includes the following steps.
Use mutate() with across() to create the TRUE / FALSE versions of the x variables. Note that across() requires dplyr 1.0.0, which is currently in development but due for production release the week of May 25th.
Use pivot_longer() to allow us to summarise() multiple measures without a lot of complicated code.
Use pivot_wider() to convert the data back to one column for each x measurement.
...and the code is:
devtools::install_github("tidyverse/dplyr") # needed for across()
library(dplyr)
library(tidyr)
library(lubridate)
data %>%
mutate(.,across(starts_with("x"),~if_else(. > 70,TRUE,FALSE))) %>%
select(-year,-month,-day) %>% group_by(date) %>%
pivot_longer(starts_with("x"),names_to = "measure",values_to = "value") %>%
mutate(year = year(date)) %>% group_by(year,measure) %>%
select(-date) %>%
summarise(value = sum(value,na.rm=TRUE)) %>%
pivot_wider(id_cols = year,names_from = "measure",
values_from = value)
...and the output, which matches the Base R solution that I originally posted:
`summarise()` regrouping output by 'year' (override with `.groups` argument)
# A tibble: 1 x 7
# Groups: year [1]
year x1 x2 x3 x4 x5 xn
<dbl> <int> <int> <int> <int> <int> <int>
1 1995 0 0 5 1 0 0
...and here's an edited version of the other answer that will also produce the same results as above. This solution implements pivot_longer() before creating the logical variable for exceeding the threshold, so it does not require the across() function. Also note that since this uses 120 as the threshold value and none of the data meets this threshold, the sums are all 0.
df_example %>%
pivot_longer(x1:x5) %>%
mutate(greater_120 = value > 120) %>%
group_by(year,name) %>%
summarise(sum_120 = sum(greater_120,na.rm = TRUE)) %>%
pivot_wider(id_cols = year,names_from = "name", values_from = sum_120)
...and the output:
`summarise()` regrouping output by 'year' (override with `.groups` argument)
# A tibble: 1 x 6
# Groups: year [1]
year x1 x2 x3 x4 x5
<dbl> <int> <int> <int> <int> <int>
1 1995 0 0 0 0 0
Conclusions
As usual, there are many ways to accomplish a given task in R. Depending on one's preferences, the problem can be solved with Base R or the tidyverse. One of the quirks of the tidyverse is that some operations such as summarise() are much easier to perform on narrow format tidy data than on wide format data. Therefore, it's important to be proficient with tidyr::pivot_longer() and pivot_wider() when working in the tidyverse.
That said, with the production release of dplyr 1.0.0, the team at RStudio continues to add features that facilitate working with wide format data.
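For instance, once dplyr 1.0.0 is available, the same counts can be computed directly on the wide data in one step (a sketch, reusing the 70 threshold from the base R answer above):
library(dplyr)
data %>%
  group_by(year) %>%
  summarise(across(starts_with("x"), ~ sum(.x > 70, na.rm = TRUE)))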
This should solve your problem
library(tidyverse)
library(lubridate)
df_example <- structure(list(date = structure(c(9131, 9132, 9133, 9134, 9135,
9136), class = "Date"), x1 = c(50.75, 62.625, 57.25, 56.571,
36.75, 39.125), x2 = c(62.25, 58.714, 49.875, 56.375, 43.25,
41.625), x3 = c(90.25, NA, 70.125, 75.75, 83.286, 98.5),
x4 = c(60, 72, 68.375, 65.5, 63.25, 55.875), x5 = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), xn = c(53.25,
61.143, 56.571, 58.571, 36.25, 44.375), year = c(1995, 1995, 1995, 1995,
1995, 1995), month = c(1, 1, 1, 1, 1, 1), day = c(1, 2, 3,
4, 5, 6)), row.names = c(NA, -6L), class = c("tbl_df", "tbl",
"data.frame"))
df_example %>%
pivot_longer(x1:x5) %>%
mutate(greater_120 = value > 120) %>%
group_by(year(date)) %>%
summarise(sum_120 = sum(greater_120,na.rm = TRUE))

Iterate through columns' suffixes in a for loop. R

I am trying to modify my dataset with a for loop. I want to modify certain cells of some columns depending on the value of their "paired" column. My dataset could be:
data1989 <- data.frame("date" = c("1987-01-01", "1987-01-03", "1987-01-19"),
"NDVI_1" = c(NA, 0.589, 0.120),
"NDVI_3" = c(NA, 0.447, NA),
"NDVI_4" = c(NA, NA, NA),
"pixelQA_1" = c(NA, 66.897,90.599),
"pixelQA_3" = c(NA, 66.097,NA),
"pixelQA_4" = c(NA, NA, NA),
stringsAsFactors = FALSE)
> data1989
date NDVI_1 NDVI_3 NDVI_4 pixelQA_1 pixelQA_3 pixelQA_4
1 1987-01-01 NA NA NA NA NA NA
2 1987-01-03 0.589 0.447 NA 66.897 66.097 NA
3 1987-01-19 0.120 NA NA 90.599 NA NA
Columns are "paired" by the suffix of each column, so NDVI_1 is paired with pixelQA_1, and so on. I want to modify the values under NDVI's columns depending on it's "paired" values on pixelQA column, following:
if PixelQa is NA -> then NDVI should be also NA.
if Pixel Qa is 66±0.5 OR 130±0.5 -> then NDVI remains the same value.
if Pixel Qa is different to 66±0.5 OR 130±0.5 -> then NDVI value is set to NA (this is bad quality data which needs to be ignored).
Applying these very simple rules my data should look like:
data1989clean <- data.frame("date" = c("1987-01-01", "1987-01-03", "1987-01-19"),
"NDVI_1" = c(NA, NA, NA),
"NDVI_3" = c(NA, 0.447, NA),
"NDVI_4" = c(NA, NA, NA),
"pixelQA_1" = c(NA, 66.897,90.599),
"pixelQA_3" = c(NA, 66.097,NA),
"pixelQA_4" = c(NA, NA, NA),
stringsAsFactors = FALSE)
> data1989clean
date NDVI_1 NDVI_3 NDVI_4 pixelQA_1 pixelQA_3 pixelQA_4
1 1987-01-01 NA NA NA NA NA NA
2 1987-01-03 NA 0.447 NA 66.897 66.097 NA
3 1987-01-19 NA NA NA 90.599 NA NA
To reach my goal I am trying the following for loop:
for(i in 1:4){
data1989$NDVI_[i] <- ifelse(data1989$pixelQA_[i] < 66.5 & data1989$pixelQA_[i] > 65.5 |
data1989$pixelQA_[i] < 130.5 & data1989$pixelQA_[i] > 129.5,
data1989$NDVI_[i], NA)
}
But so far it is not working: the dataset output looks exactly the same as the original one. Any suggestions will be welcome.
As suggested by @George Savva, you can achieve this by pivoting longer, correcting the data, and pivoting back wider. So, using the tidyverse, that gives:
library(tidyverse)
newdd1 <- data1989 %>%
  pivot_longer(cols = -date,
               names_to = c(".value", "set"),
               names_sep = "_") %>%
  mutate(NDVI = case_when(is.na(pixelQA) ~ NA_real_,
                          between(pixelQA, 65.5, 66.5) ~ NDVI,
                          between(pixelQA, 129.5, 130.5) ~ NDVI,
                          TRUE ~ NA_real_)) %>%
  pivot_wider(names_from = set,
              values_from = c(NDVI, pixelQA))

Divide by last row in mutate in Tidyverse

So this is a relatively simple problem: I have a dataset as below.
df <- structure(list(term = c("(Intercept)", "overall_quality", "overall_costs",
"wwpf"), estimate = c(0.388607224137536, 0.456477162621961, 0.485612564501229,
NA), std.error = c(0.499812263278414, 0.0987819420575201, 0.108042289289401,
NA), statistic = c(0.777506381273137, 4.62105879995918, 4.49465267438447,
NA), p.value = c(0.440597919486169, 0.0000279867005591494, 0.0000426773877613654,
NA), average = c(NA, 8.09615384615385, 7.86538461538461, 7.90384615384615
), Elasticity = c(NA, 3.69570933584318, 3.81952959386543, NA)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -4L))
I am trying to use the following:
df %>% mutate(Elasticity= average*estimate/average[nrow(df)])
Expected output: https://ibb.co/42ptLXx
Basically, divide by the last row's value. Since I am trying to incorporate this in a function, I need the method to be dynamic and not use a hard-coded value.
Please help!
We can use n() to return the index of the last row for subsetting the value of that column.
library(dplyr)
df %>%
mutate(Elasticity= average*estimate/average[n()])
If we need a function (using rlang >= 0.4.0), we can make use of {{..}} (curly-curly) for evaluation.
f1 <- function(dat, col1, col2) {
dat %>%
mutate(Elasticity = {{col1}} * {{col2}}/{{col1}}[n()])
}
f1(df, average, estimate)
# A tibble: 4 x 7
# term estimate std.error statistic p.value average Elasticity
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 (Intercept) 0.389 0.500 0.778 0.441 NA NA
#2 overall_quality 0.456 0.0988 4.62 0.0000280 8.10 0.468
#3 overall_costs 0.486 0.108 4.49 0.0000427 7.87 0.483
#4 wwpf NA NA NA NA 7.90 NA
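Equivalently, dplyr::last() can stand in for the [n()] indexing; a small variant of the same idea:
df %>%
  mutate(Elasticity = average * estimate / last(average))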
