can not use dplyr programming syntax in across() in r - r

I want to use dplyr programming syntax (combine !! and :=) to evaluate a function in .fn argument but failed.
The code like this:
library(zoo)
library(glue)
aa = structure(list(region = c(1, 2, 3, 4), co_mean = c(5, 5, 5, 5
), o3_mean = c(5, 5, 5, 5), pm2.5_mean = c(5, 5, 5, 5)), row.names = c(NA,
-4L), class = c("tbl_df", "tbl", "data.frame"))
for (i in 1:3) {
fun_name_1 = glue('lag{i}')
fun_name_2 = glue('lag0{i}')
aa = aa %>% group_by(region) %>%
mutate(across(.cols = contains('mean'),
.fns = list(!!fun_name_1 := ~lag(., i), # ERROR OCCUR AT HERE
!!fun_name_2 := ~ rollmeanr(., i)),
.names = '{.col}_{.fn}'))
aa
}
I don't know how to solve it.
Any help will be highly appreciated!
======UPDATE========
My new code and new ERROR:
library(zoo)
library(glue)
aa = structure(list(region = c(1, 2, 3, 4), co_mean = c(5, 5, 5, 5
), o3_mean = c(5, 5, 5, 5), pm2.5_mean = c(5, 5, 5, 5)), row.names = c(NA,
-4L), class = c("tbl_df", "tbl", "data.frame"))
for (i in 1:3) {
# i <- 1
fun_name_1 = glue('lag{i}')
fun_name_2 = glue('lag0{i}')
aa %>%
group_by(region) %>%
mutate(across(.cols = contains('mean'),
.fns = setNames(list(~lag(., i),
~ rollmeanr(., i)), c(fun_name_1, fun_name_2)),
.names = '{.col}_{.fn}'))
aa
}
# Error: Problem with `mutate()` input `..1`.
# x 'names' attribute [6] must be the same length as the vector [5]
# i Input `..1` is `across(...)`.
# i The error occurred in group 1: region = 1.
# Run `rlang::last_error()` to see where the error occurred.

It would work as a named list. It makes perfect sense to pass a group by first (assuming that the OP's original example data have multiple rows per group)
i <- 1
fun_name_1 = glue('lag{i}')
fun_name_2 = glue('lag0{i}')
aa %>%
group_by(region) %>%
mutate(across(.cols = contains('mean'),
.fns = setNames(list(~lag(., i),
~ rollmeanr(., i)), c(fun_name_1, fun_name_2)),
.names = '{.col}_{.fn}'))
-output
# A tibble: 4 x 10
# Groups: region [4]
# region co_mean o3_mean pm2.5_mean co_mean_lag1 co_mean_lag01 o3_mean_lag1 o3_mean_lag01 pm2.5_mean_lag1 pm2.5_mean_lag01
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 5 5 5 NA 5 NA 5 NA 5
#2 2 5 5 5 NA 5 NA 5 NA 5
#3 3 5 5 5 NA 5 NA 5 NA 5
#4 4 5 5 5 NA 5 NA 5 NA
Could specify the fill = TRUE in rollmean
aa %>%
group_by(region) %>%
mutate(across(.cols = contains('mean'),
.fns = setNames(list(~lag(., i),
~ rollmeanr(., i, fill = TRUE)), c(fun_name_1, fun_name_2)),
.names = '{.col}_{.fn}'))

First I don't think your data should be grouped, at least for the data shared it doesn't make sense to have only 1 row in the group and then calculate lag value and rolling mean on it.
You can have appropriate column names using .names in across and use map_dfc to combine everything into one dataframe.
library(dplyr)
library(purrr)
library(zoo)
map_dfc(1:3, function(x) {
aa %>%
transmute(across(.cols = contains('mean'),
.fns = list(lag = ~lag(., x),
lag0 = ~rollmeanr(., x, fill = NA)),
.names = sprintf('{fn}_{col}_%d', x)))
})
You can add group_by(Region) if you are trying it on some another dataset.

Related

How to automatically fill in a blank column

I am trying to get the list of sums of two columns from my original data set, from left to right
I have made a loop:
for (i in 1:ncol(df)) {
m = i
n = i + 1
if (i %% 2 != 0) {
df_cum$V1 <- sum(df[,m] + df[,n])
}
}
But, the way to add value to the new list is wrong:
df_cum$V1 <- sum(df[,m] + df[,n])
would be really appreciated if anyone knows how to do that in R
You can try split.default(), i.e.
sapply(split.default(df, gsub('\\d+', '', names(df))), sum)
A B
17 12
A base R option using tapply -
tapply(unlist(df),
rep(1:ncol(df), each = nrow(df) * 2, length.out = nrow(df) * ncol(df)),
sum)
# 1 2 3
#17 12 13
The logic here is to create group of every 2 columns and sum them.
data
It is easier to help if you provide data in a reproducible format
df <- data.frame(A1 = c(0, 3, 2), A2 = c(2, 6, 4),
B1 = c(3, 0, 1), B2 = c(2, 3, 3),
C1 = c(7, 3, 2), C2 = c(1, 0, 0))
We can do this in tidyverse
library(dplyr)
library(tidyr)
df1 %>%
pivot_longer(everything(), names_to = c(".value", "grp"),
names_sep ="(?<=[A-Z])(?=[0-9])") %>%
select(-grp) %>%
summarise(across(everything(), sum, na.rm = TRUE), .groups = 'drop')
-output
# A tibble: 1 x 3
A B C
<dbl> <dbl> <dbl>
1 17 12 13
Or using base R
aggregate(values ~ ., transform(stack(df1),
ind = sub("\\d+", "", ind)), FUN = sum)
ind values
1 A 17
2 B 12
3 C 13
Or another option with rowsum from base R
with(stack(df1), rowsum(values, group = trimws(ind, whitespace = "\\d+")))
[,1]
A 17
B 12
C 13
Or another option is with colSums and rowsum
{tmp <- colSums(df1); rowsum(tmp, group = substr(names(tmp), 1, 1))}
[,1]
A 17
B 12
C 13
data
df1 <- structure(list(A1 = c(0, 3, 2), A2 = c(2, 6, 4), B1 = c(3, 0,
1), B2 = c(2, 3, 3), C1 = c(7, 3, 2), C2 = c(1, 0, 0)),
class = "data.frame", row.names = c(NA,
-3L))

R: how to combine rows using Pivot_wider

I have a table like the following:
A, B, C
1, Yes, 3
1, No, 2
2, Yes, 4
2, No, 6
etc
I want to convert it to:
A, Yes, No
1, 3, 2
2, 4, 6
I have tried using:
dat <- dat %>%
spread(B, C) %>%
group_by(A)
However, now I have a bunch of NA values. Is it possible to use pivot_longer to do this instead?
We can use pivot_wider
library(tidyr)
pivot_wider(dat, names_from = B, values_from = C)
-output
# A tibble: 2 x 3
# A Yes No
# <dbl> <dbl> <dbl>
#1 1 3 2
#2 2 4 6
If there are duplicate rows, then an option is to create a sequence by that column
library(data.table)
library(dplyr)
dat1 <- bind_rows(dat, dat) # // example with duplicates
dat1 %>%
mutate(rn = rowid(B)) %>%
pivot_wider(names_from = B, values_from = C) %>%
select(-rn)
-output
# A tibble: 4 x 3
# A Yes No
# <dbl> <dbl> <dbl>
#1 1 3 2
#2 2 4 6
#3 1 3 2
#4 2 4 6
data
dat <- structure(list(A = c(1, 1, 2, 2), B = c("Yes", "No", "Yes", "No"
), C = c(3, 2, 4, 6)), class = "data.frame", row.names = c(NA,
-4L))

Arranging rows and columns in R

I have an output data frame as below. But I would like to rearrange to achieve the result in df2. Is there a way for me to arrange or group it?
df>
a_test1 b_test1 c_test1 a_test2 b_test2 c_test2
Test Test1 Test1 Test1 Test2 Test2 Test2
Result 10 9 4 4 3 1
df2>
a b c
Test1 10 9 4
Test2 4 3 1
dat <- data.frame(a_test1 = 10,
b_test1 = 9,
c_test1 = 4,
a_test2 = 4,
b_test2 = 3,
c_test2 = 1)
You can achieve this with this code:
library(tidyverse)
dat %>%
pivot_longer(cols = everything(),
names_sep = "_",
names_to = c("prefix", "suffix")) %>%
pivot_wider(names_from = prefix)
which gives:
# A tibble: 2 x 4
suffix a b c
<chr> <dbl> <dbl> <dbl>
1 test1 10 9 4
2 test2 4 3 1
UPDATE:
TO asked if it would still work with different column names that contain several underscores as separator:
dat2 <- data.frame(a_test1_10 = 10,
b_test1_10 = 9,
c_test1_10 = 4,
a_test2_10 = 4,
b_test2_10 = 3,
c_test2_10 = 1)
pivot_spec <- data.frame(.name = colnames(dat2),
.value = c("a", "b", "c", "a", "b", "c"),
test_group = c("test1", "test1", "test1", "test2", "test2", "test2"))
This pivot_spec looks like:
.name .value test_group
1 a_test1_10 a test1
2 b_test1_10 b test1
3 c_test1_10 c test1
4 a_test2_10 a test2
5 b_test2_10 b test2
6 c_test2_10 c test2
and then ou can just continue pivoting. Actually, the whole pivoting now looks much cleaner and you don't need to combine a pivot_longer with a pivot_wider.
dat2 %>%
pivot_longer_spec(pivot_spec)
which gives:
# A tibble: 2 x 4
test_group a b c
<chr> <dbl> <dbl> <dbl>
1 test1 10 9 4
2 test2 4 3 1
As you can see, createing this pivot_spec template makes the whole thing extremely flexible. The .name column contains all your required data columns, the .value column contains the new column names and maps the old column names to the new ones. And the test_group (you can choose whatever name you like) column determines the rows that would be created and which original column should appear in which column.
You can reshape the dat, filter rows and turn column into rownames.
tidyr::pivot_longer(df,
cols = everything(),
names_to = c('.value', 'col'),
names_sep = '_') %>%
dplyr::filter(!grepl('Test', a)) %>%
type.convert(as.is = TRUE) %>%
tibble::column_to_rownames('col')
# a b c
#test1 10 9 4
#test2 4 3 1
data
df <- structure(list(a_test1 = c("Test1", "10"), b_test1 = c("Test1",
"9"), c_test1 = c("Test1", "4"), a_test2 = c("Test2", "4"), b_test2 = c("Test2",
"3"), c_test2 = c("Test2", "1")), class = "data.frame", row.names = c("Test",
"Result"))

how can I use mutate in dplyr to modify variable dynamically?

I wish to dynamically select variables and modify them in tibble dataframe. I have copied a sample problem. Can someone please help with that.
Thanks
df <- tibble(
x = c(1, 2, 3),
y = c(4, 5, 6),
z = c(6, 7, 8))
variables = c("x", "y")
for (var in variables)
{
df <- df %>% mutate(var = var + 1)
}
We can use mutate with across
library(dplyr)
df %>%
mutate(across(all_of(variables), ~ . + 1))
-output
# A tibble: 3 x 3
# x y z
# <dbl> <dbl> <dbl>
#1 2 5 6
#2 3 6 7
#3 4 7 8
data
df <- tibble(
x = c(1, 2, 3),
y = c(4, 5, 6),
z = c(6, 7, 8))
variables = c("x", "y")
Try this. You can use !! from rlang and sym() from dplyr to make the evaluation you want using the operator :=. Here the code:
library(dplyr)
#Data and code
df <- tibble(
x = c(1, 2, 3),
y = c(4, 5, 6),
z = c(6, 7, 8))
variables = c("x", "y")
for (var in variables)
{
var <- sym(var)
df <- df %>% mutate(!!var := !!var + 1)
}
Output:
# A tibble: 3 x 3
x y z
<dbl> <dbl> <dbl>
1 2 5 6
2 3 6 7
3 4 7 8

R Replace NA for all Columns Except *

library(tidyverse)
df <- tibble(Date = c(rep(as.Date("2020-01-01"), 3), NA),
col1 = 1:4,
thisCol = c(NA, 8, NA, 3),
thatCol = 25:28,
col999 = rep(99, 4))
#> # A tibble: 4 x 5
#> Date col1 thisCol thatCol col999
#> <date> <int> <dbl> <int> <dbl>
#> 1 2020-01-01 1 NA 25 99
#> 2 2020-01-01 2 8 26 99
#> 3 2020-01-01 3 NA 27 99
#> 4 NA 4 3 28 99
My actual R data frame has hundreds of columns that aren't neatly named, but can be approximated by the df data frame above.
I want to replace all values of NA with 0, with the exception of several columns (in my example I want to leave out the Date column and the thatCol column. I'd want to do it in this sort of fashion:
df %>% replace(is.na(.), 0)
#> Error: Assigned data `values` must be compatible with existing data.
#> i Error occurred for column `Date`.
#> x Can't convert <double> to <date>.
#> Run `rlang::last_error()` to see where the error occurred.
And my unsuccessful ideas for accomplishing the "everything except" replace NA are shown below.
df %>% replace(is.na(c(., -c(Date, thatCol)), 0))
df %>% replace_na(list([, c(2:3, 5)] = 0))
df %>% replace_na(list(everything(-c(Date, thatCol)) = 0))
Is there a way to select everything BUT in the way I need to? There's hundred of columns, named inconsistently, so typing them one by one is not a practical option.
You can use mutate_at :
library(dplyr)
Remove them by Name
df %>% mutate_at(vars(-c(Date, thatCol)), ~replace(., is.na(.), 0))
Remove them by position
df %>% mutate_at(-c(1,4), ~replace(., is.na(.), 0))
Select them by name
df %>% mutate_at(vars(col1, thisCol, col999), ~replace(., is.na(.), 0))
Select them by position
df %>% mutate_at(c(2, 3, 5), ~replace(., is.na(.), 0))
If you want to use replace_na
df %>% mutate_at(vars(-c(Date, thatCol)), tidyr::replace_na, 0)
Note that mutate_at is soon going to be replaced by across in dplyr 1.0.0.
You have several options here based on data.table.
One of the coolest options: setnafill (version >= 1.12.4):
library(data.table)
setDT(df)
data.table::setnafill(df,fill = 0, cols = colnames(df)[!(colnames(df) %in% c("Date", thatCol)]))
Note that your dataframe is updated by reference.
Another base solution:
to_change<-grep("^(this|col)",names(df))
df[to_change]<- sapply(df[to_change],function(x) replace(x,is.na(x),0))
df
# A tibble: 4 x 5
Date col1 thisCol thatCol col999
<date> <dbl> <dbl> <int> <dbl>
1 2020-01-01 1 0 25 99
2 2020-01-01 2 8 26 99
3 2020-01-01 3 0 27 99
4 NA 0 3 28 99
Data(I changed one value):
df <- structure(list(Date = structure(c(18262, 18262, 18262, NA), class = "Date"),
col1 = c(1L, 2L, 3L, NA), thisCol = c(NA, 8, NA, 3), thatCol = 25:28,
col999 = c(99, 99, 99, 99)), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"))
replace works on a data.frame, so we can just do the replacement by index and update the original dataset
df[-c(1, 4)] <- replace(df[-c(1, 4)], is.na(df[-c(1, 4)]), 0)
Or using replace_na with across (from the new dplyr)
library(dplyr)
library(tidyr)
df %>%
mutate(across(-c(Date, thatCol), ~ replace_na(., 0)))
If you know the ones that you don't want to change, you could do it like this:
df <- tibble(Date = c(rep(as.Date("2020-01-01"), 3), NA),
col1 = 1:4,
thisCol = c(NA, 8, NA, 3),
thatCol = 25:28,
col999 = rep(99, 4))
#dplyr
df_nonreplace <- select(df, c("Date", "thatCol"))
df_replace <- df[ ,!names(df) %in% names(df_nonreplace)]
df_replace[is.na(df_replace)] <- 0
df <- cbind(df_nonreplace, df_replace)
> head(df)
Date thatCol col1 thisCol col999
1 2020-01-01 25 1 0 99
2 2020-01-01 26 2 8 99
3 2020-01-01 27 3 0 99
4 <NA> 28 4 3 99

Resources