Dplyr - Mean for multiple columns - r

I want to calculate the mean for several columns and thus create a new column for the mean using dplyr and without melting + merging.
> head(growth2)
CODE_COUNTRY CODE_PLOT IV12_ha_yr IV23_ha_yr IV34_ha_yr IV14_ha_yr IV24_ha_yr IV13_ha_yr
1 1 6 4.10 6.97 NA NA NA 4.58
2 1 17 9.88 8.75 NA NA NA 8.25
3 1 30 NA NA NA NA NA NA
4 1 37 15.43 15.07 11.89 10.00 12.09 14.33
5 1 41 20.21 15.01 14.72 11.31 13.27 17.09
6 1 46 12.64 14.36 13.65 9.07 12.47 12.36
>
I need a new column within the dataset with the mean of all the IV columns.
I tried this:
growth2 %>%
group_by(CODE_COUNTRY, CODE_PLOT) %>%
summarise(IVmean=mean(IV12_ha_yr:IV13_ha_yr, na.rm=TRUE))
And returned several errors depending on the example used, such as:
Error in NA_real_:NA_real_ : NA/NaN argument
or
Error in if (trim > 0 && n) { : missing value where TRUE/FALSE needed

You don't need to group, just select() and then mutate()
library(dplyr)
mutate(df, IVMean = rowMeans(select(df, starts_with("IV")), na.rm = TRUE))

Use . in dplyr.
library(dplyr)
mutate(df, IVMean = rowMeans(select(., starts_with("IV")), na.rm = TRUE))

Here is a dplyr solution using c_across which is designed for row-wise aggregations. This makes it easy to refer to columns by name, type or position and to apply any function to the selected columns.
library("tidyverse")
df <-
tibble::tribble(
~CODE_COUNTRY, ~CODE_PLOT, ~IV12_ha_yr, ~IV23_ha_yr, ~IV34_ha_yr, ~IV14_ha_yr, ~IV24_ha_yr, ~IV13_ha_yr,
1L, 6L, 4.1, 6.97, NA, NA, NA, 4.58,
1L, 17L, 9.88, 8.75, NA, NA, NA, 8.25,
1L, 30L, NA, NA, NA, NA, NA, NA,
1L, 37L, 15.43, 15.07, 11.89, 10, 12.09, 14.33,
1L, 41L, 20.21, 15.01, 14.72, 11.31, 13.27, 17.09,
1L, 46L, 12.64, 14.36, 13.65, 9.07, 12.47, 12.36
)
df %>%
rowwise() %>%
mutate(
IV_mean = mean(c_across(starts_with("IV")), na.rm = TRUE),
IV_sd = sd(c_across(starts_with("IV")), na.rm = TRUE)
)
#> # A tibble: 6 × 10
#> # Rowwise:
#> CODE_COUNTRY CODE_PLOT IV12_ha_yr IV23_ha_yr IV34_ha_yr IV14_ha_yr IV24_ha_yr
#> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 6 4.1 6.97 NA NA NA
#> 2 1 17 9.88 8.75 NA NA NA
#> 3 1 30 NA NA NA NA NA
#> 4 1 37 15.4 15.1 11.9 10 12.1
#> 5 1 41 20.2 15.0 14.7 11.3 13.3
#> 6 1 46 12.6 14.4 13.6 9.07 12.5
#> # … with 3 more variables: IV13_ha_yr <dbl>, IV_mean <dbl>, IV_sd <dbl>
Created on 2022-06-25 by the reprex package (v2.0.1)

I tried to comment on Rick Scriven's answer but don't have the experience points for it. Anyway, wanted to contribute. His answer said to do this:
library(dplyr)
mutate(df, IVMean = rowMeans(select(df, starts_with("IV")), na.rm = TRUE))
That works, but if all columns don't start with "IV", which was my case, how do you do it? Turns out, that select does not want a logical vector, so you can't use AND or OR. For example, you cannot say "starts_with('X') | starts_with('Y')". You have to build a numeric vector. Here is how it is done.
mutate(df, IVMean = rowMeans(select(df, c(starts_with("IV"), starts_with("IX"))), na.rm = TRUE))

you can use as follows:
your data
data<- structure(list(CODE_COUNTRY = c(1L, 1L, 1L, 1L, 1L, 1L), CODE_PLOT = c(6L,
17L, 30L, 37L, 41L, 46L), IV12_ha_yr = c(4.1, 9.88, NA, 15.43,
20.21, 12.64), IV23_ha_yr = c(6.97, 8.75, NA, 15.07, 15.01, 14.36
), IV34_ha_yr = c(NA, NA, NA, 11.89, 14.72, 13.65), IV14_ha_yr = c(NA,
NA, NA, 10, 11.31, 9.07), IV24_ha_yr = c(NA, NA, NA, 12.09, 13.27,
12.47), IV13_ha_yr = c(4.58, 8.25, NA, 14.33, 17.09, 12.36)), .Names = c("CODE_COUNTRY",
"CODE_PLOT", "IV12_ha_yr", "IV23_ha_yr", "IV34_ha_yr", "IV14_ha_yr",
"IV24_ha_yr", "IV13_ha_yr"), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6"))
mydata <- cbind(data,IVMean=apply(data[,3:8],1,mean, na.rm=TRUE))
you can also do this
mydata <- cbind(data,IVMean=rowMeans(data[3:8], na.rm=TRUE))

Related

Rowsums on two vectors of paired columns but conditional on specific values

I have a dataset that looks like the one below where there are three "pairs" of columns pertaining to the type (datA, datB, datC), and the total for each type (datA_total, datB_total, datC_total):
structure(list(datA = c(1L, NA, 5L, 3L, 8L, NA), datA_total = c(20L,
30L, 40L, 15L, 10L, NA), datB = c(5L, 5L, NA, 6L, 1L, NA), datB_total = c(80L,
10L, 10L, 5L, 4L, NA), datC = c(NA, 4L, 1L, NA, 3L, NA), datC_total = c(NA,
10L, 15L, NA, 20L, NA)), class = "data.frame", row.names = c(NA,
-6L))
# datA datA_total datB datB_total datC datC_total
#1 1 20 5 80 NA NA
#2 NA 30 5 10 4 10
#3 5 40 NA 10 1 15
#4 3 15 6 5 NA NA
#5 8 10 1 4 3 20
#6 NA NA NA NA NA NA
I'm trying to create a rowSums across each row to determine the total visits across each data type conditional on whether they meet a criteria of having ANY score ranging (1-5).
Here is my thought process:
Select only the variables that are the data types (i.e. datA, datB, datC)
Across each row based on EACH data type, determine if that data type meets a criteria (i.e. datA -> does it contain (1,2,3,4,5))
If that data type column does contain one of the 5 values above ^, then look to its paired total variable and ready that value to be rowSummed (i.e. datA -> does it contain (1,2,3,4,5)? -> if yes, then grab datA_total value = 20).
The goal is to end up with a total column like below:
# datA datA_total datB datB_total datC datC_total overall_total
#1 1 20 5 80 NA NA 100
#2 NA 30 5 10 4 10 20
#3 5 40 NA 10 1 15 55
#4 3 15 6 5 NA NA 15
#5 8 10 1 4 3 20 24
#6 NA NA NA NA NA NA 0
You'll notice that row #2 only contained a total of 20 even though there is 30 in datA_total. This is a result of the conditional selection in that datA for row#2 contains "NA" rather than one of the five scores (1,2,3,4,5). Hence, the datA_total of 30 was not included in the rowSums calculation.
My code below shows the vectors I created and my attempt at a conditional rowSums but I end up getting an error regarding mutate... I'm not sure how to integrate the "conditional pairing" portion of this problem:
type_vars <- c("datA", "datB", "datC")
type_scores <- c("1", "2", "3", "4", "5")
type_visits <- c("datA_total", "datB_total", "datC_total")
df <- df %>%
mutate(overall_total = rowSums(all_of(type_visits[type_vars %in% type_scores])))
Any help/tips would be appreciated
dplyr's across should do the job.
library(dplyr)
# copying your tibble
data <-
tibble(
datA = c(1, NA, 5, 3, 8, NA),
datA_total = c(20, 30, 40, 15, 10, NA),
datB = c(5, 5, NA, 6, 1, NA),
datB_total = c(80, 10, 10, 5, 4, NA),
datC = c(NA, 4, 1, NA, 3, NA),
datC_total = c(NA, 10, 15, NA, 20, NA)
)
data %>%
mutate(across(c('A', 'B', 'C') %>% paste0('dat', .), \(x) (x %in% 1:5) * get(cur_column() %>% paste0(., '_total')), .names = "{col}_aux")) %>%
rowwise() %>%
mutate(overall_total = sum(across(ends_with('aux')), na.rm = TRUE)) %>%
select(any_of(c(names(data), 'overall_total')))
# A tibble: 6 × 7
datA datA_total datB datB_total datC datC_total overall_total
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 20 5 80 NA NA 100
2 NA 30 5 10 4 10 20
3 5 40 NA 10 1 15 55
4 3 15 6 5 NA NA 15
5 8 10 1 4 3 20 24
6 NA NA NA NA NA NA 0
First, we create an 'aux' column for each dat. It is 0 if dat is not within 1:5, and dat_total otherwise. Then we sum ignoring NA.

Replace NA with mean of variable grouped by time and treatment

I have a data frame, similar to the one below (see dput), recording responses of a variable to a treatment over time:
df <- structure(list( time = c(0, 0, 0, 0, 0, 0, 14, 14, 14, 14, 14, 14, 33, 33, 33, 33, 33, 33, 90, 90, 90, 90, 90, 90),
trt = structure(c(2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L),
.Label = c("1", "2"), class = "factor"),
A1 = c(6.301, 5.426, 5.6021, NA, NA, NA, 6.1663, 6.426, 6.8239, 2.301, 4.7047, 2.301, 5.8062, 4.97, 4.97, 2.301, 2.301, 2.301, 2.301, 2.301, 2.301, 2.301, 2.301, 2.301),
B1 = c(5.727, 5.727, 5.4472, NA, NA, NA, 6.6021, 7.028, 7.1249, 3.028, 3.1663, 3.6021, 5.727, 5.2711, 5.2389, 3.3554, 3.9031, 4.2389, 3.727, 3.6021, 3.6021, 3.8239, 3.727, 3.426)),
row.names = c(NA, -24L), class = c("tbl_df", "tbl", "data.frame"))
which looks lie this:
time trt A1 B1
<dbl> <fct> <dbl> <dbl>
1 0 2 6.30 5.73
2 0 2 5.43 5.73
3 0 2 5.60 5.45
4 0 1 NA NA
5 0 1 NA NA
6 0 1 NA NA
7 14 2 6.17 6.60
8 14 2 6.43 7.03
9 14 2 6.82 7.12
10 14 1 2.30 3.03
In our experiments, we don’t always record values for all treatments at time == 0. I want to replace any missing values (NA) when (and only when) time == 0 with the mean of the trt ‘2’ group at time == 0. So NA in A1 all become 5.78, and those in B1 become 5.63.
Using answers from here and here, as well as some others, I have been able to come up with the following:
df %>%
mutate_if(is.numeric, funs(if_else(is.na(.),if_else(time == 0, 0, .), .)))
This replaces NA at time == 0 with 0 (this is useful for some of my variables where there is no data in any of the treatments at time == 0, but not what i'm after here). I also tried this:
df %>%
mutate_if(is.numeric, funs(if_else(is.na(.),if_else(time == 0, mean(., na.rm = TRUE), .), .)))
This is closer to what I want, but is averaging the values from the whole column/variable. Can I make it average only those values from treatment ‘2’ when time == 0?
I think I would just use indexing in base R for this:
within(df, {A1[is.na(A1) & time == 0] <- mean(A1[trt == "2" & time == 0])
B1[is.na(B1) & time == 0] <- mean(B1[trt == "2" & time == 0])})
#> # A tibble: 24 x 4
#> time trt A1 B1
#> <dbl> <fct> <dbl> <dbl>
#> 1 0 2 6.30 5.73
#> 2 0 2 5.43 5.73
#> 3 0 2 5.60 5.45
#> 4 0 1 5.78 5.63
#> 5 0 1 5.78 5.63
#> 6 0 1 5.78 5.63
#> 7 14 2 6.17 6.60
#> 8 14 2 6.43 7.03
#> 9 14 2 6.82 7.12
#> 10 14 1 2.30 3.03
#> # ... with 14 more rows
Created on 2020-05-15 by the reprex package (v0.3.0)
If we add group_by(time), we can recode the missing columns to the time-specific mean values for the observations where time == 0 as follows.
df <- structure(list( time = c(0, 0, 0, 0, 0, 0, 14, 14, 14, 14, 14, 14, 33, 33, 33, 33, 33, 33, 90, 90, 90, 90, 90, 90),
trt = structure(c(2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L),
.Label = c("1", "2"), class = "factor"),
A1 = c(6.301, 5.426, 5.6021, NA, NA, NA, 6.1663, 6.426, 6.8239, 2.301, 4.7047, 2.301, 5.8062, 4.97, 4.97, 2.301, 2.301, 2.301, 2.301, 2.301, 2.301, 2.301, 2.301, 2.301),
B1 = c(5.727, 5.727, 5.4472, NA, NA, NA, 6.6021, 7.028, 7.1249, 3.028, 3.1663, 3.6021, 5.727, 5.2711, 5.2389, 3.3554, 3.9031, 4.2389, 3.727, 3.6021, 3.6021, 3.8239, 3.727, 3.426)),
row.names = c(NA, -24L), class = c("tbl_df", "tbl", "data.frame"))
library(dplyr)
df %>% group_by(time) %>%
mutate(A1 = if_else(is.na(A1) & time == 0,mean(A1,na.rm=TRUE),A1),
B1 = if_else(is.na(B1) & time == 0,mean(B1,na.rm=TRUE),B1))
...and the output:
# A tibble: 24 x 4
# Groups: time [4]
time trt A1 B1
<dbl> <fct> <dbl> <dbl>
1 0 2 6.30 5.73
2 0 2 5.43 5.73
3 0 2 5.60 5.45
4 0 1 5.78 5.63
5 0 1 5.78 5.63
6 0 1 5.78 5.63
7 14 2 6.17 6.60
8 14 2 6.43 7.03
9 14 2 6.82 7.12
10 14 1 2.30 3.03
# ... with 14 more rows
>
UPDATE: general solution across multiple columns
Per the comments in my answer, here is a solution that uses the development version of dplyr to access the new across() function.
devtools::install_github("tidyverse/dplyr") # needed for across()
# get all columns except time and trt
theColumns <- colnames(df)[!(colnames(df) %in% c("time","trt"))]
df %>% group_by(time) %>%
mutate(across(theColumns,~if_else(is.na(.) & time == 0,mean(.,na.rm=TRUE),.)))
...and the output:
# Groups: time [4]
time trt A1 B1
<dbl> <fct> <dbl> <dbl>
1 0 2 6.30 5.73
2 0 2 5.43 5.73
3 0 2 5.60 5.45
4 0 1 5.78 5.63
5 0 1 5.78 5.63
6 0 1 5.78 5.63
7 14 2 6.17 6.60
8 14 2 6.43 7.03
9 14 2 6.82 7.12
10 14 1 2.30 3.03
# … with 14 more rows
>
As i was unable to access the development version of dplyr to use the new across() function, I combined elements of both answers above to give the result i wanted:
df %>%
mutate_if(is.numeric, funs(if_else(is.na(.) & time == 0, mean(.[trt == "2" & time == 0]), .)))
It looks like across() is intended to replace the _if functions in the long run (see here), but this solution works in the meantime.

Collapsing columns and removing NAs [duplicate]

This question already has answers here:
How to implement coalesce efficiently in R
(9 answers)
Closed 2 years ago.
I have a data.frame, in this format:
A w x y z
0.23 1 NA NA NA
0.12 NA 2 NA NA
0.45 NA 2 NA NA
0.89 NA NA 3 NA
0.12 NA NA NA 4
And I want to collapse w:x:y:z into a single column, while removing NA's. Desired result:
A Comb
0.23 1
0.12 2
0.45 2
0.89 3
0.12 4
My approach so far is:
df %>% unite("Comb", w:x:y:z, na.rm=TRUE, remove=TRUE)
However, "Comb" is being populated with strings such as 1_NA_NA_NA and NA_NA_NA_4 i.e. it is not removing the NA's. I've tried switching to character NA's, but that leads to bizarre and unpredictable results. What am I doing wrong?
I'd also like to be able to do this when the original data.frame is populated with strings (in place of the numbers). Is there a method for this?
Using dplyr::coalesce we can do the following:
df %>%
mutate(Comb = coalesce(w,x,y,z)) %>%
select(A, Comb)
which gives the following output:
A Comb
<dbl> <dbl>
1 0.23 1
2 0.12 2
3 0.45 2
4 0.89 3
5 0.12 4
In unite, na.rm does not remove integer/factor columns.
Convert them to the character and then use unite.
library(dplyr)
df %>%
mutate_at(vars(w:z), as.character) %>%
tidyr::unite('comb', w:z, na.rm = TRUE)
# A comb
#1 0.23 1
#2 0.12 2
#3 0.45 2
#4 0.89 3
#5 0.12 4
data
df <- structure(list(A = c(0.23, 0.12, 0.45, 0.89, 0.12), w = c(1L,
NA, NA, NA, NA), x = c(NA, 2L, 2L, NA, NA), y = c(NA, NA, NA,
3L, NA), z = c(NA, NA, NA, NA, 4L)), class = "data.frame",
row.names = c(NA, -5L))
Another option is fcoalesce from data.table
library(data.table)
setDT(df)[, .(A, Comb = fcoalesce(w, x, y, z))]
data
df <- structure(list(A = c(0.23, 0.12, 0.45, 0.89, 0.12), w = c(1L,
NA, NA, NA, NA), x = c(NA, 2L, 2L, NA, NA), y = c(NA, NA, NA,
3L, NA), z = c(NA, NA, NA, NA, 4L)), class = "data.frame",
row.names = c(NA, -5L))
Using na.omit.
dat <- transform(dat[1], Comb=apply(dat[-1], 1, na.omit))
# A Comb
# 1 0.23 1
# 2 0.12 2
# 3 0.45 2
# 4 0.89 3
# 5 0.12 4
Data
dat <- structure(list(A = c(0.23, 0.12, 0.45, 0.89, 0.12), w = c(1L,
NA, NA, NA, NA), x = c(NA, 2L, 2L, NA, NA), y = c(NA, NA, NA,
3L, NA), z = c(NA, NA, NA, NA, 4L)), row.names = c(NA, -5L), class = "data.frame")

Dividing data into quintiles using group_by

I am looking for a way to change my way in such a way that it sorts the data into quintiles instead of the top 5 and bottom 5. My current code looks like this:
CombData <- CombData %>%
group_by(Date) %>%
mutate(
R=min_rank(Value),
E_P = case_when(
R < 6 ~ "5w",
R > max(R, na.rm =TRUE) - 5 ~ "5b",
TRUE ~ NA_character_)
) %>%
ungroup() %>%
arrange(Date, E_P)
My dataset is quite large therefore I will just provide sample data. The data I use is more complex and the code should, therefore, allow for varying lengths of the column Date and also for multiple values that are missing (NAs):
df <- data.frame( Date = c(rep("2010-01-31",16), rep("2010-02-28", 14)), Value=c(rep(c(1,2,3,4,5,6,7,8,9,NA,NA,NA,NA,NA,15),2))
Afterward, I would also like to test the minimum size of quintiles i.e. how many data points are minimum in each quintile in the entire dataset.
The expected output would look like this:
structure(list(Date = structure(c(14640, 14640, 14640, 14640,
14640, 14640, 14640, 14640, 14640, 14640, 14640, 14640, 14640,
14640, 14640, 14640, 14668, 14668, 14668, 14668, 14668, 14668,
14668, 14668, 14668, 14668, 14668, 14668, 14668, 14668), class = "Date"),
Value = c(1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 15, NA, NA, NA, NA,
NA, 2, 3, 4, 5, 6, 7, 8, 9, 15, NA, NA, NA, NA, NA), R = c(1L,
1L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, NA, NA, NA, NA,
NA, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, NA, NA, NA, NA, NA
), S_P = c("Worst", "Worst", "Worst", NA, NA, NA, NA, "Best",
"Best", "Best", NA, NA, NA, NA, NA, NA, "Worst", "Worst", NA, NA,
NA, NA, NA, "Best", "Best", NA, NA, NA, NA, NA)), row.names = c(NA,
-30L), class = c("tbl_df", "tbl", "data.frame"))
Probably, you could use something like this with quantile :
library(dplyr)
out <- CombData %>%
group_by(Date) %>%
mutate(S_P = case_when(Value <= quantile(Value, 0.2, na.rm = TRUE) ~ 'Worst',
Value >= quantile(Value, 0.8, na.rm = TRUE) ~ 'Best'))
You could change the value of quantile according to your preference.
To get minimum number of "Best" and "Worst" we can do :
out %>%
count(Date, S_P) %>%
na.omit() %>%
ungroup() %>%
select(-Date) %>%
group_by(S_P) %>%
top_n(-1, n)
# S_P n
# <chr> <int>
#1 Best 2
#2 Worst 2
When I understand you correctly, you want to rank your column 'Value' and mark those with rank below the quantile 20% with "worst" and those above 80% with "best". After that you want a table.
You could use use ave for both, the ranking and the quantile identification. The quantile function yields three groups, that you can identify with findInterval, code as a factor variable and label them at will. I'm not sure, though, which ranks should be included in the quantiles, I therefore make the E_P coding in two separate columns for comparison purposes.
dat2 <- within(dat, {
R <- ave(Value, Date, FUN=function(x) rank(x, na.last="keep"))
E_P <- ave(R, Date, FUN=function(x) {
findInterval(x, quantile(R, c(.2, .8), na.rm=TRUE))
})
E_P.fac <- factor(E_P, labels=c("worst", NA, "best"))
})
dat2 <- dat2[order(dat2$Date, dat2$E_P), ] ## order by date and E_P
Yields:
dat2
# Date Value E_P.fac E_P R
# 1 2010-01-31 1 worst 0 1.5
# 16 2010-01-31 1 worst 0 1.5
# 2 2010-01-31 2 <NA> 1 3.0
# 3 2010-01-31 3 <NA> 1 4.0
# 4 2010-01-31 4 <NA> 1 5.0
# 5 2010-01-31 5 <NA> 1 6.0
# 6 2010-01-31 6 <NA> 1 7.0
# 7 2010-01-31 7 <NA> 1 8.0
# 8 2010-01-31 8 best 2 9.0
# 9 2010-01-31 9 best 2 10.0
# 15 2010-01-31 15 best 2 11.0
# 10 2010-01-31 NA <NA> NA NA
# 11 2010-01-31 NA <NA> NA NA
# 12 2010-01-31 NA <NA> NA NA
# 13 2010-01-31 NA <NA> NA NA
# 14 2010-01-31 NA <NA> NA NA
# 17 2010-02-28 2 worst 0 1.0
# 18 2010-02-28 3 worst 0 2.0
# 19 2010-02-28 4 <NA> 1 3.0
# 20 2010-02-28 5 <NA> 1 4.0
# 21 2010-02-28 6 <NA> 1 5.0
# 22 2010-02-28 7 <NA> 1 6.0
# 23 2010-02-28 8 <NA> 1 7.0
# 24 2010-02-28 9 <NA> 1 8.0
# 30 2010-02-28 15 best 2 9.0
# 25 2010-02-28 NA <NA> NA NA
# 26 2010-02-28 NA <NA> NA NA
# 27 2010-02-28 NA <NA> NA NA
# 28 2010-02-28 NA <NA> NA NA
# 29 2010-02-28 NA <NA> NA NA
When I check the quintiles of the Rank column, it appears to be right.
quantile(dat2$R, c(.2, .8), na.rm=TRUE)
# 20% 80%
# 2.8 8.2
After that you could just make a table to get the numbers of each category.
with(dat2, table(Date, E_P.fac))
# E_P.fac
# Date worst <NA> best
# 2010-01-31 2 6 3
# 2010-02-28 2 6 1
Data
dat <- structure(list(Date = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("2010-01-31", "2010-02-28"
), class = "factor"), Value = c(1, 2, 3, 4, 5, 6, 7, 8, 9, NA,
NA, NA, NA, NA, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, NA, NA, NA, NA,
NA, 15)), row.names = c(NA, -30L), class = "data.frame")

Dynamically subset and mutate data.table?

I have 2 separate DFs, I want to mutate 2 new columns in dat2 ('Avg_of_nonNA', and a 'Cols' to track which column its using) based on the non-NA columns in dat1. I need take a subset of dat2 because the matrix is dense whereas dat1 is sparse (So I can take advantage of the sparse-ness). The only way to match the columns is to match the common elements in the names: (0-1,1-2,2-3,3-4) in my case. The rest of the column names are gibberish. Its requiring string splitting and matching--causing many problems because I can't chain stuff together because each row has a different combination of columns to average (dummy example is simplified). I do have a working solution, but it is painfully slow across my 1M+ rows. Here is that solution:
I'm looking for a way to get rid of the for loop. Any suggestions?
for (z in 1:5) {
relevant_cols=dat1[z,] %>%
select_if(~!all(is.na(.))) %>%
names %>% strsplit(.,'_') %>% map(.,2) %>% unlist()
id=dat1[z,'ID']$`ID`
dat2[`ID`== id,`:=`(Avg_of_nonNA = (mean(as.numeric(.SD))),Cols=paste0(relevant_cols,collapse='/')), .SDcols=names(dat2) %like% paste0(relevant_cols,collapse='|')]
}
Data Below
> dat1
ID gjfkg_0-1_fkjdk_fjdkd jdfsje_1-2_fhks_ejfskj dfjs_2-3_vjskf_wqew gdlkrzc_3-4_rjrkj Avg_of_nonNA_otherDT
1: 1 2.23 1.37 NA NA 1.5
2: 2 1.98 NA NA 1.760 6.5
3: 3 NA 4.45 9.350 3.320 11.0
4: 4 NA NA 6.642 2.019 15.5
5: 5 NA 3.21 3.677 NA 18.5
> dat2
ID ewrwer_0-1_iopi_opop erewtt_1-2_rueiwu_vcvbc erewr_2-3_iirew_rewr mnmn_3-4_cxzxzc_gjd
1: 1 1 2 3 4
2: 2 5 6 7 8
3: 3 9 10 11 12
4: 4 13 14 15 16
5: 5 17 18 19 20
dput(dat1)
structure(list(ID = 1:5, `gjfkg_0-1_fkjdk_fjdkd` = c(2.23, 1.98,
NA, NA, NA), `jdfsje_1-2_fhks_ejfskj` = c(1.37, NA, 4.45, NA,
3.21), `dfjs_2-3_vjskf_wqew` = c(NA, NA, 9.35, 6.642, 3.677),
`gdlkrzc_3-4_rjrkj` = c(NA, 1.76, 3.32, 2.019, NA)), row.names = c(NA, -5L), class = c("data.table",
"data.frame"))
dput(dat2)
structure(list(ID = 1:5, `ewrwer_0-1_iopi_opop` = c(1L, 5L, 9L,
13L, 17L), `erewtt_1-2_rueiwu_vcvbc` = c(2L, 6L, 10L, 14L, 18L
), `erewr_2-3_iirew_rewr` = c(3L, 7L, 11L, 15L, 19L), `mnmn_3-4_cxzxzc_gjd` = c(4L,
8L, 12L, 16L, 20L)), row.names = c(NA, -5L), class = c("data.table",
"data.frame"))
Expected output:
Here is an option:
setDT(dat1)
setDT(dat2)
nm <- sapply(strsplit(names(dat1[, -"ID"]), "_"), `[[`, 2L)
dat2[, c("Avg_of_nonNA_otherDT", "Cols") := {
nas <- is.na(dat1[,-"ID"])
m <- col(nas)
m[] <- nm[m]
m[nas] <- ""
.(rowMeans(.SD * NA^nas, na.rm=TRUE),
gsub("\\s+", "/", trimws(do.call(paste, as.data.frame(m)))))
}, .SDcols=-"ID"]
output:
ID ewrwer_0-1_iopi_opop erewtt_1-2_rueiwu_vcvbc erewr_2-3_iirew_rewr mnmn_3-4_cxzxzc_gjd Avg_of_nonNA_otherDT Cols
1: 1 1 2 3 4 1.5 0-1/1-2
2: 2 5 6 7 8 6.5 0-1/3-4
3: 3 9 10 11 12 11.0 1-2/2-3/3-4
4: 4 13 14 15 16 15.5 2-3/3-4
5: 5 17 18 19 20 18.5 1-2/2-3

Resources