Creating a new dataframes based on two conditions from two other dataframes

Creating a new dataframes based on two conditions from two other dataframes - r

I'm fairly new to coding languages and have been asked to create a new dataframe based on two existing dataframes. Dataframe 1 is the original and dataframe 2 is a subset of the original. The new data frame needs to be a copy of the original with certain scores removed if they meet certain conditions from
df2, ie identify the task type in df2 and remove the corresponding value from df1, if the sample ID matches.
E.g.Dataframe1:
sample_id Low Mid High
13420 NA 2.4 3.7
43905 7.5 NA NA
52078 5.6 3.2 5.6
43292 10 NA 1.9
79327 5.7 3.2 NA
Dataframe2:
Sample Task type
13420 High
52078 Low
52078 Mid
43292 High
79327 Low
New dataframe:
13420 NA 2.4 NA
43905 7.5 NA NA
52078 NA NA 5.6
43292 10 NA NA
79327 NA 3.2 NA
Can anyone help, please? I've tried a few conditional statements, but have had no luck.

sample data
df1 <- data.table::fread("sample_id Low Mid High
13420 NA 2.4 3.7
43905 7.5 NA NA
52078 5.6 3.2 5.6
43292 10 NA 1.9
79327 5.7 3.2 NA")
df2 <- data.table::fread("Sample Tasktype
13420 High
52078 Low
52078 Mid
43292 High
79327 Low")
code
library( data.table )
#or make data.frames a data.table
data.table::setDT(df1);data.table::setDT(df2)
#melt df1 to long format
df1.melt <- melt( df1, id.vars = "sample_id" )
#update join the molten dataset with df2, updating the value with NA
df1.melt[ df2, value := NA, on = .(sample_id = Sample, variable = Tasktype) ]
#and cast df1 wit the new values back to wide format
dcast( df1.melt, sample_id ~ variable, value.var = "value" )
output
# sample_id Low Mid High
# 1: 13420 NA 2.4 NA
# 2: 43292 10.0 NA NA
# 3: 43905 7.5 NA NA
# 4: 52078 NA NA 5.6
# 5: 79327 NA 3.2 NA

Here is an approach with dplyr and tidyr:
library(dplyr)
library(tidyr)
df1 %>%
pivot_longer(-sample_id) %>%
left_join(df2, by = c("sample_id" = "Sample",
"name" = "Task.type"),
keep = TRUE) %>%
mutate(value = ifelse(!is.na(Task.type) & Task.type == name,
NA_real_, value)) %>%
dplyr::select(-c(Sample,Task.type)) %>%
pivot_wider(id_cols = c("sample_id"))
## A tibble: 5 x 4
# sample_id Low Mid High
# <int> <dbl> <dbl> <dbl>
#1 13420 NA 2.4 NA
#2 43905 7.5 NA NA
#3 52078 NA NA 5.6
#4 43292 10 NA NA
#5 79327 NA 3.2 NA
Sample Data
df1<-structure(list(sample_id = c(13420L, 43905L, 52078L, 43292L,
79327L), Low = c(NA, 7.5, 5.6, 10, 5.7), Mid = c(2.4, NA, 3.2,
NA, 3.2), High = c(3.7, NA, 5.6, 1.9, NA)), class = "data.frame", row.names = c(NA,
-5L))
df2 <- structure(list(Sample = c(13420L, 52078L, 52078L, 43292L, 79327L
), Task.type = structure(c(1L, 2L, 3L, 1L, 2L), .Label = c("High",
"Low", "Mid"), class = "factor")), class = "data.frame", row.names = c(NA,
-5L))

Related

Simple but not easy merge task

I have two incomplete dataframes (df_a, df_b): Columns are missing or NA values. "by" is the merge index and df_a has "priority" over df_b.
df_a = structure(list(Datum = structure(c(1635163200, 1635166800, 1635170400, 1635174000), class = c("POSIXct", "POSIXt")), Vorhersage = c(10.297922, 10.155121, 10.044135, 9.699513), Export = c("10.912", "10.47", NA, NA), color = c("rgb(0,128,0)", "rgb(0,128,0)", NA, NA), Status = c("ok", "ok", NA, NA), Plausibilität = c("4", "4", NA, NA), min = c(7.93000000000001, 9.4, 8.7, 8.3), max = c(12.31715325, 12.42822725, 12.51326325, 12.28620625)), row.names = c(NA, -4L), class = "data.frame")
df_b = structure(list(Datum = structure(c(1632510000, 1632513600, 1632517200, 1632520800), class = c("POSIXct", "POSIXt")), Vorhersage = c(14.821988, 14.832919, 14.706179, 14.573527), Referenz = c(16.6, 16.2, 15.9, 16), DWD_Name = c("Elpersbüttel", "Elpersbüttel", "Elpersbüttel", "Elpersbüttel"), Export = c(17.198, 16.713, 16.378, 16.358), color = c("rgb(0,128,0)", "rgb(0,128,0)", "rgb(0,128,0)", "rgb(0,128,0)"), Status = c("ok", "ok", "ok", "ok"), Plausibilität = c(4, 4, 4, 4), min = c(13.05, 12.808, 11.631891, 12.312), max = c(17, 17, 16.9, 16.7)), row.names = c(NA, -4L), class = "data.frame")
desired output is:
Datum Vorhersage Export color Status Plausibilität min max Referenz
1 2021-10-25 14:00:00 10.3 10.912 rgb(0,128,0) ok 4 7.9 12 NA
2 2021-10-25 15:00:00 10.2 10.47 rgb(0,128,0) ok 4 9.4 12 NA
3 2021-10-25 16:00:00 10.0 <NA> <NA> <NA> <NA> 8.7 13 NA
4 2021-10-25 17:00:00 9.7 <NA> <NA> <NA> <NA> 8.3 12 NA
5 2021-09-24 21:00:00 14.8 17.198 rgb(0,128,0) ok 4 13.1 17 17
6 2021-09-24 22:00:00 14.8 16.713 rgb(0,128,0) ok 4 12.8 17 16
7 2021-09-24 23:00:00 14.7 16.378 rgb(0,128,0) ok 4 11.6 17 16
8 2021-09-25 00:00:00 14.6 16.358 rgb(0,128,0) ok 4 12.3 17 16
DWD_Name
1 <NA>
2 <NA>
3 <NA>
4 <NA>
5 Elpersbüttel
6 Elpersbüttel
7 Elpersbüttel
8 Elpersbüttel
# for rebuild:
structure(list(Datum = structure(c(1635163200, 1635166800, 1635170400,
1635174000, 1632510000, 1632513600, 1632517200, 1632520800), class = c("POSIXct",
"POSIXt")), Vorhersage = c(10.297922, 10.155121, 10.044135, 9.699513,
14.821988, 14.832919, 14.706179, 14.573527), Export = c("10.912",
"10.47", NA, NA, "17.198", "16.713", "16.378", "16.358"), color = c("rgb(0,128,0)",
"rgb(0,128,0)", NA, NA, "rgb(0,128,0)", "rgb(0,128,0)", "rgb(0,128,0)",
"rgb(0,128,0)"), Status = c("ok", "ok", NA, NA, "ok", "ok", "ok",
"ok"), Plausibilität = c("4", "4", NA, NA, "4", "4", "4", "4"
), min = c(7.93000000000001, 9.4, 8.7, 8.3, 13.05, 12.808, 11.631891,
12.312), max = c(12.31715325, 12.42822725, 12.51326325, 12.28620625,
17, 17, 16.9, 16.7), Referenz = c(NA, NA, NA, NA, 16.6, 16.2,
15.9, 16), DWD_Name = c(NA, NA, NA, NA, "Elpersbüttel", "Elpersbüttel",
"Elpersbüttel", "Elpersbüttel")), row.names = c(NA, -8L), class = "data.frame")
Thanks to the help of #r2evans I tried the following:
by = "Datum"
library(data.table)
colnms <- setdiff(intersect(names(df_a), names(df_b)), by)
setDT(df_a)
setDT(df_b)
merge(df_a, df_b, by = by, all = TRUE
)[, (colnms) := lapply(colnms, function(nm) fcoalesce(.SD[[paste0(nm, ".x")]], .SD[[paste0(nm, ".y")]]))
][, c(outer(colnms, c(".x", ".y"), paste0)) := NULL ][]
but I get the following error:
Error in fcoalesce(.SD[[paste0(nm, ".x")]], .SD[[paste0(nm, ".y")]]) :
Item 2 is type double but the first item is type character. Please coerce

Most of the other answers are good, but many either over-complicate the result (in my opinion) or they perform a left or right join, not the full join as expected in the OP.
Here's a quick solution that uses dynamic column names.
library(data.table)
colnms <- setdiff(intersect(names(df_a), names(df_b)), "by")
colnms
# [1] "a"
setDT(df_a)
setDT(df_b)
merge(df_a, df_b, by = "by", all = TRUE
)[, (colnms) := lapply(colnms, function(nm) fcoalesce(.SD[[paste0(nm, ".x")]], .SD[[paste0(nm, ".y")]]))
][, c(outer(colnms, c(".x", ".y"), paste0)) := NULL ][]
# by b c a
# <num> <num> <num> <num>
# 1: 1 1 NA 1
# 2: 2 NA 2 2
# 3: 3 3 3 3
# 4: 4 NA 4 4
Notes:
the normal data.table::[ merge is a left-join only, so we need to use data.table::merge in order to be able to get a full-join with all=TRUE;
because it's using merge, the repeated columns get the .x and .y suffixes, something we can easily capitalize on;
the canonical and most-performant way when using (colnms) := ... is to also include .SDcols=colnms, but that won't work as well here since we need the suffixed columns, not the colnms columns themselves; this is a slight performance penalty but certainly not an anti-pattern (I believe) given what we need to do; and since we could have more than one duplicate column, we have to be careful to do it with each pair at a time, not all of them at once;
the last [-block (using outer) is for removing the duplicate columns; without it, the output would have column names c("by", "a.x", "b", "a.y", "c", "a"). It uses outer because that's a straight-forward way to get 1-or-more colnms and combine .x and .y to each of them; it then uses data.table's := NULL shortcut for removing one-or-more columns.

This isn't the most elegant, but you can make a function that applies your rule to coalesce the values if they occur in both data frames.
# find the unique column names (not called "by")
cols <- union(names(df_a),names(df_b))
cols <- cols[!(cols == "by")]
# merge the data sets
df_merge <- merge(df_a, df_b, by = "by", all = TRUE)
# function to check for the base column names that now have a '.x' and
# a '.y' version. for the columns, fill in the NAs from '.x' with the
# value from '.y'
col_val <- function(col_base, df) {
x <- names(df)
if (all(paste0(col_base, c(".x", ".y")) %in% x)) {
na.x <- is.na(df[[paste0(col_base, ".x")]])
df[[paste0(col_base, ".x")]][na.x] <- df[[paste0(col_base, ".y")]][na.x]
df[[paste0(col_base, ".x")]]
} else {
df[[col_base]]
}
}
# apply this function to every column
cbind(df_merge["by"], sapply(cols, col_val, df = df_merge))
This will give the following result.
by a b c
1 1 1 1 NA
2 2 2 NA 2
3 3 3 3 3
4 4 4 NA 4
I know you specified base, by the natural_join() function is worth mentioning.
library(rqdatatable)
natural_join(df_a, df_b, by = "by", jointype = "FULL")
This gives exactly what you want.
by a b c
1 1 1 1 NA
2 2 2 NA 2
3 3 3 3 3
4 4 4 NA 4

Not the answer with R base. But one possible solution with the package data.table
library(data.table)
setDT(df_a)
setDT(df_b)
df_a <- rbind(df_a, list(4, NA, NA))
df_b <- rbind(list(1, NA, NA), df_b)
df_a[df_b, `:=` (a = fifelse(is.na(a), i.a, a), c = c), on = .(by)][]
#> by a b c
#> 1: 1 1 1 NA
#> 2: 2 2 NA 2
#> 3: 3 3 3 3
#> 4: 4 4 NA 4
Edit with the help of #r2evans, A much more elegant and efficient solution:
df_a[df_b, `:=` (a = fcoalesce(a, i.a), c = c), on = .(by)][]
#> by a b c
#> 1: 1 1 1 NA
#> 2: 2 2 NA 2
#> 3: 3 3 3 3
#> 4: 4 4 NA 4
Created on 2021-10-19 by the reprex package (v2.0.1)

here a dynamic solution.. not bad, but maybe someone knows how to speed it up.
get_complete_df<-function(df_a,df_b, by = "by"){
df_a = unique(df_a)
df_b = unique(df_b)
nam_a = names(df_a)[!(names(df_a) == by)]
nam_b = names(df_b)[!(names(df_b) == by)]
nums_a = unlist(lapply(df_a, is.numeric))
nums_b = unlist(lapply(df_b, is.numeric))
nums = unique(names(df_a)[nums_a],names(df_b)[nums_b])
## try to supplement NAs
x = df_b[[by]][df_b[[by]] %in% df_a[[by]]]
y = nam_b[nam_b %in% nam_a]
vna = is.na(df_a[df_a[,1] %in% x,y])
df_a[df_a[,1] %in% x ,y][vna] = df_b[df_b[,1] %in% x,y][vna]
## get complete df
all_names = c(nam_a,nam_b )
all_names = c(by, unique(all_names))
all_by = na.omit(unique(c(df_a[[by]],df_b[[by]]) ))
## build
df_o = as.data.frame(matrix(nrow = length(all_by),ncol = length(all_names)))
names(df_o) = all_names
df_o[[by]] = all_by
## fill in content
df_o[df_o[,1] %in% df_b[,1],names(df_b)] = df_b
df_o[df_o[,1] %in% df_a[,1],names(df_a)] = df_a ## df_a has priority!
# fix numeric:
# why did some(!) num fields changed to chr ?
df_o[,nums] = as.data.frame(apply(df_o[,nums], 2, as.numeric))
df_o
}

Replacing certain values in a column with words

My goal is to replace a specific column's numeric values into certain words based off of a range to use in a future categorical test. Im trying to change this dataframe below:
Lets call this data frame as DF
SubjectID
ColumnA
ColumnB
Column C
Subject1
38
2.3
2.1
Subject2
12
2.1
2.0
Subject3
1
1.1
1.9
Subject4
34
3.2
1.5
Subject5
1
1.7
1.5
Subject6
56
3.9
1.7
To achieve a dataframe such as the one here:
SubjectID
ColumnA
ColumnB
Column C
Subject1
Mid
2.3
2.1
Subject2
Low
2.1
2.0
Subject3
Low
1.1
1.9
Subject4
Mid
3.2
1.5
Subject5
Low
1.7
1.5
Subject6
High
3.9
1.7
So in this case, I want to only change columnA's value names based off of a specific range the data values lie in.
For this example:
A value of Low represents a value lower than 30.
A value of Mid represents a value between 30 and 50
A value of High represents a value higher than 50
What would be the best way to do this?

We could use case_when
library(dplyr)
DF <- DF %>%
mutate(ColumnA = case_when(ColumnA < 30 ~ "Low",
between(ColumnA, 30, 50) ~ "Mid", TRUE ~ "High"))
DF
SubjectID ColumnA ColumnB ColumnC
1 Subject1 Mid 2.3 2.1
2 Subject2 Low 2.1 2.0
3 Subject3 Low 1.1 1.9
4 Subject4 Mid 3.2 1.5
5 Subject5 Low 1.7 1.5
6 Subject6 High 3.9 1.7
Another convenient option without doing multiple expressions is cut from base R
cut(DF$ColumnA, breaks = c(-Inf, 30, 50, Inf), labels = c("Low", "Mid", "High"))
[1] Mid Low Low Mid Low High
Levels: Low Mid High
data
DF <- structure(list(SubjectID = c("Subject1", "Subject2", "Subject3",
"Subject4", "Subject5", "Subject6"), ColumnA = c(38L, 12L, 1L,
34L, 1L, 56L), ColumnB = c(2.3, 2.1, 1.1, 3.2, 1.7, 3.9), ColumnC = c(2.1,
2, 1.9, 1.5, 1.5, 1.7)), class = "data.frame", row.names = c(NA,
-6L))

If you prefer a base R solution you can use nested ifelse:
DF$ColumnA <- ifelse(DF$ColumnA < 30, "Low",
ifelse(DF$ColumnA >= 30 & DF$ColumnA <= 50, "Mid", "High"))
Result:
DF
SubjectID ColumnA ColumnB ColumnC
1 Subject1 Mid 2.3 2.1
2 Subject2 Low 2.1 2.0
3 Subject3 Low 1.1 1.9
4 Subject4 Mid 3.2 1.5
5 Subject5 Low 1.7 1.5
6 Subject6 High 3.9 1.7

Sum row of list of dataframe in r

I have a list of dataframes. Each dataframe is a Stock quote whose row names are dates and column names are buy price, sell price, shares and PL.
I want to obtain a column that contains the percentage of every positive PL contribution to the total daily PL.
Making it simplier. I have the following Data:
mylist= structure(list(`1` = structure(list(ID = c(35L, '2009-01-03', '2009-01-04', '2009-01-05'), Income = c(100, 200, 300, 400)), .Names = c("Date", "Income"), row.names = c(1L, 2L, 3L, 4L), class = "data.frame"), `2` = structure(list(ID = c('2009-01-02', '2009-01-03', '2009-01-04', '2009-01-05'), Income = c(500, -600, 700, 800)), .Names = c("Date", "Income"), row.names = c(1L, 2L, 3L, 4L), class = "data.frame"), `3` = structure(list(ID = c('2009-01-02', '2009-01-03', '2009-01-04'), Income = c(100, 200, 300)), .Names = c("Date", "Income"), row.names = c(1L, 2L, 3L), class = "data.frame")))
Which looks like this:
$`1`
Date Income
1 2009-01-01 100
2 2009-01-03 200
3 2009-01-04 300
4 2009-01-05 400
$`2`
Date Income
1 2009-01-02 500
2 2009-01-03 -600
3 2009-01-04 700
4 2009-01-05 800
$`3`
Date Income
1 2009-01-02 100
2 2009-01-03 200
3 2009-01-04 300
I want to obtain something that looks like this:
$`1`
Date Income Perc
1 2009-01-03 100 1.00
2 2009-01-03 200 0.20
3 2009-01-04 300 0.23
4 2009-01-05 400 0.33
$`2`
Date Income Perc
1 2009-01-02 500 0.83
2 2009-01-03 600 -1.50
3 2009-01-04 700 0.54
4 2009-01-05 800 0.67
$`3`
Date Income Perc
1 2009-01-02 100 0.17
2 2009-01-03 200 0.20
3 2009-01-04 300 0.23

I have two solutions for your problem. I highly recommend combining your data frame in one master data frame in order to reduce the complexity of the code if at all possible. I am sure there are better solutions to the "Separate Data Frame" problem, but most of them will involve multiple loops and thus negatively impact performance.
Data
mylist= structure(list(`1` = structure(list(ID = c('2009-01-02', '2009-01-03', '2009-01-04', '2009-01-05'), Income = c(100, 200, 300, 400)), .Names = c("Date", "Income"), row.names = c(1L, 2L, 3L, 4L), class = "data.frame"), `2` = structure(list(ID = c('2009-01-02', '2009-01-03', '2009-01-04', '2009-01-05'), Income = c(500, -600, 700, 800)), .Names = c("Date", "Income"), row.names = c(1L, 2L, 3L, 4L), class = "data.frame"), `3` = structure(list(ID = c('2009-01-02', '2009-01-03', '2009-01-04'), Income = c(100, 200, 300)), .Names = c("Date", "Income"), row.names = c(1L, 2L, 3L), class = "data.frame")))
Combined Data Frame
library(dplyr)
# add an ID to each data frame
for(i in 1:length(mylist)){
mylist[[i]] <- cbind(mylist[[i]], stock_id = names(mylist)[i])
}
# create data frame with all observations
my_data_frame <- do.call(rbind, mylist)
rownames(my_data_frame) <- NULL
my_data_frame %>%
group_by(Date) %>%
mutate(Perc = Income/sum(Income[Income > 0]))
# A tibble: 11 x 4
# Groups: Date [4]
Date Income stock_id Perc
<chr> <dbl> <chr> <dbl>
1 2009-01-02 100 1 0.143
2 2009-01-03 200 1 0.5
3 2009-01-04 300 1 0.231
4 2009-01-05 400 1 0.333
5 2009-01-02 500 2 0.714
6 2009-01-03 -600 2 -1.5
7 2009-01-04 700 2 0.538
8 2009-01-05 800 2 0.667
9 2009-01-02 100 3 0.143
10 2009-01-03 200 3 0.5
11 2009-01-04 300 3 0.231
Separate Data Frames
library(dplyr)
all_dates <- unique(unlist(lapply(mylist, function(x) unique(x$Date))))
for(i in 1:length(mylist)){
mylist[[i]] <- cbind(mylist[[i]], stock_id = names(mylist)[i])
}
perc_all <- list()
for(i in 1:length(all_dates)){
temporary <- lapply(mylist, function(x) x[x$Date == all_dates[i], ])
all_obs_date <- do.call(rbind, temporary)
all_obs_date$Perc <- all_obs_date$Income/sum(all_obs_date$Income[all_obs_date$Income > 0])
perc_all[[i]] <- all_obs_date
}
perc_final <- do.call(rbind, perc_all)
lapply(mylist, function(x) {
left_join(x, perc_final) %>% select(-stock_id)
})
$`1`
Date Income Perc
1 2009-01-02 100 0.1428571
2 2009-01-03 200 0.5000000
3 2009-01-04 300 0.2307692
4 2009-01-05 400 0.3333333
$`2`
Date Income Perc
1 2009-01-02 500 0.7142857
2 2009-01-03 -600 -1.5000000
3 2009-01-04 700 0.5384615
4 2009-01-05 800 0.6666667
$`3`
Date Income Perc
1 2009-01-02 100 0.1428571
2 2009-01-03 200 0.5000000
3 2009-01-04 300 0.2307692

Iterate through columns' suffixes in a for loop. R

I am trying to modify my dataset with a for loop. I want to modify certain cells of some columns depending on the value of its "paired" column. My dataset could be:
data1989 <- data.frame("date" = c("1987-01-01", "1987-01-03", "1987-01-19"),
"NDVI_1" = c(NA, 0.589, 0.120),
"NDVI_3" = c(NA, 0.447, NA),
"NDVI_4" = c(NA, NA, NA),
"pixelQA_1" = c(NA, 66.897,90.599),
"pixelQA_3" = c(NA, 66.097,NA),
"pixelQA_4" = c(NA, NA, NA),
stringsAsFactors = FALSE)
> data1989
date NDVI_1 NDVI_3 NDVI_4 pixelQA_1 pixelQA_3 pixelQA_4
1 1987-01-01 NA NA NA NA NA NA
2 1987-01-03 0.589 0.447 NA 66.897 66.097 NA
3 1987-01-19 0.120 NA NA 90.599 NA NA
Columns are "paired" by the suffix of each column, so NDVI_1 is paired with pixelQA_1, and so on. I want to modify the values under NDVI's columns depending on it's "paired" values on pixelQA column, following:
if PixelQa is NA -> then NDVI should be also NA.
if Pixel Qa is 66±0.5 OR 130±0.5 -> then NDVI remains the same value.
if Pixel Qa is different to 66±0.5 OR 130±0.5 -> then NDVI value is set to NA (this is bad quality data which needs to be ignored).
Applying these very simple rules my data should look like:
data1989clean <- data.frame("date" = c("1987-01-01", "1987-01-03", "1987-01-19"),
"NDVI_1" = c(NA, NA, NA),
"NDVI_3" = c(NA, 0.447, NA),
"NDVI_4" = c(NA, NA, NA),
"pixelQA_1" = c(NA, 66.897,90.599),
"pixelQA_3" = c(NA, 66.097,NA),
"pixelQA_4" = c(NA, NA, NA),
stringsAsFactors = FALSE)
> data1989clean
date NDVI_1 NDVI_3 NDVI_4 pixelQA_1 pixelQA_3 pixelQA_4
1 1987-01-01 NA NA NA NA NA NA
2 1987-01-03 NA 0.447 NA 66.897 66.097 NA
3 1987-01-19 NA NA NA 90.599 NA NA
To reach my goal I am trying the following for loop:
for(i in 1:4){
data1989$NDVI_[i] <- ifelse(data1989$pixelQA_[i] < 66.5 & data1989$pixelQA_[i] > 65.5 |
data1989$pixelQA_[i] < 130.5 & data1989$pixelQA_[i] > 129.5,
data1989$NDVI_[i], NA)
}
But so far it is not working, as the dataset output looks exactly the same as the original one. Any suggestion will be welcomed.

As suggested by #George Savva, you can achieve this by pivoting longer, correcting the data, and pivoting back wider. So, using the tidyverse, that gives:
library(tidyverse)
newdd1 <-
#
data1989 %>%
#
pivot_longer(cols = -date,
names_to = c(".value", "set"),
names_sep = "_") %>%
#
mutate(NDVI = case_when(is.na(pixelQA) ~ NA_real_,
between(pixelQA, 65.5, 66.5) ~ NDVI,
between(pixelQA, 129.5, 130.5) ~ NDVI,
TRUE ~ NA_real_)) %>%
#
pivot_wider(names_from = set,
values_from = c(NDVI, pixelQA))

difference between first non-NA and last non-NA in each row

I have a data frame with up to 5 measurements (x) and their corresponding time:
df = structure(list(x1 = c(92.9595722286402, 54.2085219673818,
46.3227062573019,
NA, 65.1501442134141, 49.736451235317), time1 = c(43.2715277777778,
336.625, 483.975694444444, NA, 988.10625, 510.072916666667),
x2 = c(82.8368681534474, 53.7981639701784, 12.9993531230419,
NA, 64.5678816290574, 55.331442940348), time2 = c(47.8166666666667,
732, 506.747222222222, NA, 1455.25486111111, 958.976388888889
), x3 = c(83.5433119686794, 65.723072881366, 19.0147593408309,
NA, 65.1989838202356, 36.7000828457705), time3 = c(86.5888888888889,
1069.02083333333, 510.275, NA, 1644.21527777778, 1154.95694444444
), x4 = c(NA, 66.008102917677, 40.6243513885846, NA, 62.1694420909955,
29.0078249523063), time4 = c(NA, 1379.22986111111, 520.726388888889,
NA, 2057.20833333333, 1179.86805555556), x5 = c(NA, 61.0047472617535,
45.324715258421, NA, 59.862110645527, 45.883161439362), time5 = c(NA,
1825.33055555556, 523.163888888889, NA, 3352.26944444444,
1364.99513888889)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -6L))
"NA" means that the person (row) didn't have a measurement.
I would like to calculate the difference between the last existing measurement and the first one.
So for the first one it would be x3 minus x1 (6.4), for the second it would be -6.8 and so on.
I tried something like this, which didnt work:
df$diff = apply(df %>% select(., contains("x")), 1, function(x) head(x,
na.rm = T) - tail(x, na.rm=T))
Any suggestions? Also, is apply/rowwise the most efficient way, or is there a vectorized function to do that?

A vectorized way would be using max.col where we get "first" and "last" non-NA value using ties.method parameter
#Get column number of first and last col
first_col <- max.col(!is.na(df[x_cols]), ties.method = "first")
last_col <- max.col(!is.na(df[x_cols]), ties.method = "last")
#subset the dataframe to include only `"x"` cols
new_df <- as.data.frame(df[grep("^x", names(df))])
#Subtract last non-NA value with the first one
df$new_calc <- new_df[cbind(1:nrow(df), last_col)] -
new_df[cbind(1:nrow(df), first_col)]
Using apply you could do
x_cols <- grep("^x", names(df))
df$new_calc <- apply(df[x_cols], 1, function(x) {
new_x <- x[!is.na(x)]
if (length(new_x) > 0)
new_x[length(new_x)] - new_x[1L]
else NA
})

We can use tidyverse methods on the tbl_df. Create a row names column (rownames_to_column), gather the 'x' columns to 'long' format while removing the NA elements (na.rm = TRUE), grouped by row name, get the difference of first and last 'val'ues and bind the extracted column with the original dataset 'df'
library(tidyverse)
rownames_to_column(df, 'rn') %>%
select(rn, starts_with('x')) %>%
gather(key, val, -rn, na.rm = TRUE) %>%
group_by(rn) %>%
summarise(Diff = diff(c(first(val), last(val)))) %>%
mutate(rn = as.numeric(rn)) %>%
complete(rn = min(rn):max(rn)) %>%
pull(Diff) %>%
bind_cols(df, new_col = .)
# A tibble: 6 x 11
# x1 time1 x2 time2 x3 time3 x4 time4 x5 time5 new_col
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 93.0 43.3 82.8 47.8 83.5 86.6 NA NA NA NA -9.42
#2 54.2 337. 53.8 732 65.7 1069. 66.0 1379. 61.0 1825. 6.80
#3 46.3 484. 13.0 507. 19.0 510. 40.6 521. 45.3 523. -0.998
#4 NA NA NA NA NA NA NA NA NA NA NA
#5 65.2 988. 64.6 1455. 65.2 1644. 62.2 2057. 59.9 3352. -5.29
#6 49.7 510. 55.3 959. 36.7 1155. 29.0 1180. 45.9 1365. -3.85

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Creating a new dataframes based on two conditions from two other dataframes - r

Related

Simple but not easy merge task

Replacing certain values in a column with words

Sum row of list of dataframe in r

Iterate through columns' suffixes in a for loop. R

difference between first non-NA and last non-NA in each row

Categories

Resources