"dynamic" naming of column in function - r

I'd like to assign a dynamic name, i.e. the name of the variable I'm passing to the function, as a column name in the dataframe that is being created by the function.
I've tried
- deparse(substitute(x))
- toString(x)
but no success...
Code
a <- (1:3)
b <- (5:7)
df <- data.frame(a,b)
fun <- function(x){
x %>% mutate(c=a+b)
colnames(x)[3] <- deparse(substitute(x))
}
Expected behaviour
after running fun(df):
a b df
1 1 5 6
2 2 6 8
3 3 7 10
instead:
> fun(df)
Error in names(x) <- value :
'names' attribute [3] must be the same length as the vector [2]

We can use := with evaluation (!!)
fun <- function(x){
nm1 <- deparse(substitute(x))
x %>%
mutate(!! nm1 := a+b)
}
fun(df)
# a b df
#1 1 5 6
#2 2 6 8
#3 3 7 10
In the OP's function, the the output of x %>% mutate is not assigned back, therefore, the original dataset have only two columns and not three i.e. if we do
fun <- function(x){
nm1 <- deparse(substitute(x))
x <- x %>% # assign the output after mutate
mutate(c=a+b)
colnames(x)[3] <- nm1
x # return the dataset
}
fun(df)
# a b df
#1 1 5 6
#2 2 6 8
#3 3 7 10

Related

Extract first Non NA value over multiple columns

I'm still learning R and was wondering if I there was an elegant way of manipulating the below df to achieve df2.
I'm not sure if it's a loop that is supposed to be used for this, but basically I want to extract the first Non NA "X_No" Value if the "X_No" value is NA in the first row. This would perhaps be best described through an example from df to the desired df2.
A_ID <- c('A','B','I','N')
A_No <- c(11,NA,15,NA)
B_ID <- c('B','C','D','J')
B_No <- c(NA,NA,12,NA)
C_ID <- c('E','F','G','P')
C_No <- c(NA,13,14,20)
D_ID <- c('J','K','L','M')
D_No <- c(NA,NA,NA,40)
E_ID <- c('W','X','Y','Z')
E_No <- c(50,32,48,40)
df <- data.frame(A_ID,A_No,B_ID,B_No,C_ID,C_No,D_ID,D_No,E_ID,E_No)
ID <- c('A','D','F','M','W')
No <- c(11,12,13,40,50)
df2 <- data.frame(ID,No)
I'm hoping for an elegant solution to this as there are over a 1000 columns similar to the example provided.
I've looked all over the web for a similar example however to no avail that would reproduce the expected result.
Your help is very much appreciated.
Thankyou
I don't know if I'd call it "elegant", but here is a potential solution:
library(tidyverse)
A_ID <- c('A','B','I','N')
A_No <- c(11,NA,15,NA)
B_ID <- c('B','C','D','J')
B_No <- c(NA,NA,12,NA)
C_ID <- c('E','F','G','P')
C_No <- c(NA,13,14,20)
D_ID <- c('J','K','L','M')
D_No <- c(NA,NA,NA,40)
E_ID <- c('W','X','Y','Z')
E_No <- c(50,32,48,40)
df <- data.frame(A_ID,A_No,B_ID,B_No,C_ID,C_No,D_ID,D_No,E_ID,E_No)
ID <- c('A','D','F','M','W')
No <- c(11,12,13,40,50)
df2 <- data.frame(ID,No)
output <- df %>%
pivot_longer(everything(),
names_sep = "_",
names_to = c("Col", ".value")) %>%
drop_na() %>%
group_by(Col) %>%
slice_head(n = 1) %>%
ungroup() %>%
select(-Col)
df2
#> ID No
#> 1 A 11
#> 2 D 12
#> 3 F 13
#> 4 M 40
#> 5 W 50
output
#> # A tibble: 5 × 2
#> ID No
#> <chr> <dbl>
#> 1 A 11
#> 2 D 12
#> 3 F 13
#> 4 M 40
#> 5 W 50
all_equal(df2, output)
#> [1] TRUE
Created on 2023-02-08 with reprex v2.0.2
Using base R with max.col (assuming the columns are alternating with ID, No)
ind <- max.col(!is.na(t(df[c(FALSE, TRUE)])), "first")
m1 <- cbind(seq_along(ind), ind)
data.frame(ID = t(df[c(TRUE, FALSE)])[m1], No = t(df[c(FALSE, TRUE)])[m1])
ID No
1 A 11
2 D 12
3 F 13
4 M 40
5 W 50
Here is a data.table solution that should scale well to a (very) large dataset.
functionally
split the data.frame to a list of chunks of columns, based on their
names. So all columns startting with A_ go to
the first element, all colums startting with B_ to the second
Then, put these list elements on top of each other, using
data.table::rbindlist. Ignure the column-namaes (this only works if
A_ has the same number of columns as B_ has the same number of cols
as n_)
Now get the first non-NA value of each value in the first column
code
library(data.table)
# split based on what comes after the underscore
L <- split.default(df, f = gsub("(.*)_.*", "\\1", names(df)))
# bind together again
DT <- rbindlist(L, use.names = FALSE)
# extract the first value of the non-NA
DT[!is.na(A_No), .(No = A_No[1]), keyby = .(ID = A_ID)]
# ID No
# 1: A 11
# 2: D 12
# 3: F 13
# 4: G 14
# 5: I 15
# 6: M 40
# 7: P 20
# 8: W 50
# 9: X 32
#10: Y 48
#11: Z 40

Change the names of multiple cells in R using a function

Consider the following data frame:
df <- setNames(data.frame(1:5,rep(1,5)), c("id", "value"))
I want to change the names for multiple cells in the column "id". Let's say I want to change the following:
df$id[df$id %In% 2:3] <- 1
df$id[df$id == 4] <- 3
However, instead of using the code above, I want to create a function, where I can do the transformation more "smooth" (because I have a lot of data frames, where I need to change the names for the cells). I want to create a function:
mapping <- function(...) {
...
}
where I afterward can create a simple and smooth mapping function for my df, where I only have to specific the "old" and the "new" names for the cells. Something like this:
df_mapping <- function(...) {
2.1
3.1
4.3
}
And then I can apply the function on my data and specific which column it should do it for, and it will work in the same way as the code with gsub:
df <- df_mapping(df,id)
Is it possible to create that mapping function?
if we need a function, then can have a 'data' argument, column name, values to replace and replacer value, then create the logical condition, subset the column, assign with replacer_val and return the dataset after the assignment
f1 <- function(dat, colnm, values_to_replace, replacer_val) {
dat[[colnm]][dat[[colnm]] %in% values_to_replace] <- replacer_val
return(dat)
}
f1(df, "id", c(2, 3), 1)
-output
# id value
#1 1 1
#2 1 1
#3 1 1
#4 4 1
#5 5 1
To replace values with corresponding sets of replacers,
f2 <- function(dat, colnm, values_to_replace, replacer_vals) {
nm1 <- setNames(replacer_vals, values_to_replace)
v1 <- nm1[as.character(dat[[colnm]])]
i1 <- !is.na(v1)
dat[[colnm]][i1] <- v1[i1]
return(dat)
}
f2(df, "id", c(2, 3), c(5, 6))
# id value
#1 1 1
#2 5 1
#3 6 1
#4 4 1
#5 5 1
Or another option is to create a key/value dataset and use merge or join
library(data.table)
f3 <- function(dat, colnm, values_to_replace, replacer_vals) {
keydat <- data.frame(key = values_to_replace, val = replacer_vals)
names(keydat)[1] <- colnm
dt <- as.data.table(dat)
dt[keydat, (colnm) := val, on = colnm][]
return(dt)
}
f3(df, "id", c(2, 5), c(3, 6))
Maybe a mapping like below could help
mapping <- function(df, id, to_replace, obj_value) {
transform(df, id = replace(id, id %in% to_replace, obj_value))
}
e.g.,
> mapping(df, id, c(2, 3), 1)
id value
1 1 1
2 1 1
3 1 1
4 4 1
5 5 1
You can use dplyr's recode function
mapping <- function(data, col, old, new) {
data[[col]] <- dplyr::recode(data[[col]], !!!setNames(new, old))
data
}
mapping(df, "id", c(2, 3), c(7L, 8L))
# id value
#1 1 1
#2 7 1
#3 8 1
#4 4 1
#5 5 1

Create a data frame with the common columns of a data frame list - R

I need to get the common columns of a data frame list separated in different data frames. Please look at the following example:
df1 <- data.frame(Dates = c('01-01-2020','02-01-2020','03-01-2020'), col1 = c(1,2,3), col2 = c(3,2,1))
df2 <- data.frame(Dates = c('01-01-2020','02-01-2020','03-01-2020'), col1 = c(4,5,6), col2 = c(6,5,4))
df3 <- data.frame(Dates = c('01-01-2020','02-01-2020'), col1 = c(7,8), col2 = c(8,7))
ldf <- list(df1, df2, df3)
The desired output would be the following two data frames:
df_col1:
Date df1 df2 df3
01-01-2020 1 4 7
02-01-2020 2 5 8
03-01-2020 3 6 NA
df_col2:
Date df1 df2 df3
01-01-2020 3 6 8
02-01-2020 2 5 7
03-01-2020 1 4 NA
Of course, ldf is actually way longer, but the number of columns is fixed to 5, so the number of outputs is also fixed (4). This means I wouldn't mind if I use a block of code for each output.
I've tried several things but none seems to work. I'm using base R and hope to find a solution wihtout additional packages.
Thanks a lot for your time!
We bind the list elements with bind_rows from dplyr, then loop over the 'col' columns, along with the common 'Dates', reshape to 'wide' format with pivot_wider and rename if needed
library(dplyr)
library(purrr)
library(tidyr)
library(stringr)
newdf <- bind_rows(ldf)
out <- map(names(newdf)[-1], ~
newdf %>%
select(Dates, .x) %>%
mutate(rn = rowid(Dates)) %>%
pivot_wider(names_from =rn, values_from = !! rlang::sym(.x)) %>%
rename_at(-1, ~ str_c('df', seq_along(.))))
-output
out
#[[1]]
# A tibble: 3 x 4
# Dates df1 df2 df3
# <chr> <dbl> <dbl> <dbl>
#1 01-01-2020 1 4 7
#2 02-01-2020 2 5 8
#3 03-01-2020 3 6 NA
#[[2]]
# A tibble: 3 x 4
# Dates df1 df2 df3
# <chr> <dbl> <dbl> <dbl>
#1 01-01-2020 3 6 8
#2 02-01-2020 2 5 7
#3 03-01-2020 1 4 NA
Or using base R
newdf <- do.call(rbind, ldf)
f1 <- function(dat, colName) {
lst1 <- split(dat[[colName]], dat$Dates)
do.call(rbind, lapply(lst1, `length<-`, max(lengths(lst1))))
}
f1(newdf, 'col1')
f1(newdf, 'col2')
Another Base R option is to do:
m <- Reduce(function(x,y)merge(x, y, by='Dates', all=TRUE), ldf)
lapply(split.default(m[-1], sub("\\..*", "", names(m[-1]))), cbind, m[1])
Another wordy approach using base R:
#Code
names(ldf) <- paste0('df',1:length(ldf))
#Function
myfun <- function(x) {
y <- reshape(x,direction = 'long',
v.names='col',
idvar = 'Dates',varying = list(2:3))
return(y)
}
z <- do.call(rbind,lapply(ldf,myfun))
z$Data <- gsub("\\..*","",rownames(z))
rownames(z) <- NULL
#Reshape
z2 <- reshape(z,idvar = c('Dates','time'),timevar = 'Data')
#List
List <- split(z2,z2$time)
List
Output:
List
$`1`
Dates time col.df1 col.df2 col.df3
1 01-01-2020 1 1 4 7
2 02-01-2020 1 2 5 8
3 03-01-2020 1 3 6 NA
$`2`
Dates time col.df1 col.df2 col.df3
4 01-01-2020 2 3 6 8
5 02-01-2020 2 2 5 7
6 03-01-2020 2 1 4 NA

performing multiple functions on every columns and rows of a list of dataframes using their names stored in a list

DATA
foo <- dplyr::tibble(a=c("a","b",NA),b=c("a","b","c"),colC=NA)
bar <- dplyr::tibble(a=c("a","b",NA),b=c("a","b","c"),colC=NA)
all_tibbles <- c("foo","bar")
lapply(mget(all_list), function(y) sapply(y, function(x) all(is.na(x))))
$foo
# A tibble: 3 x 3
a b colC
<chr> <chr> <lgl>
1 a a NA
2 b b NA
3 NA c NA
$bar
# A tibble: 3 x 3
a b colC
<chr> <chr> <lgl>
1 a a NA
2 b b NA
3 NA c NA
I would like to remove all columns from every data frame in mget(all_list)
This created the logical vector using base apply functions.
lapply(mget(all_tibbles), function(y) sapply(y, function(x) all(is.na(x))))
Then remove all rows with the minimum number of missing values
lapply(mget(all_tibbles),function(x)
x[-which.min(rowSums((!is.na(x)))),])
and then store these back in the same variables foo and bar. I have a large character vector with tibble names btw.
Can I use a tidyr package to simplify things? base functions are fairly complicated, and am trying to avoid for loops
An option is select_if
library(dplyr)
library(purrr)
library(stringr)
out <- mget(all_tibbles) %>%
map(~ .x %>%
select_if(~ any(!is.na(.))))
out
#$foo
# A tibble: 3 x 2
# a b
# <chr> <chr>
#1 a a
#2 b b
#3 <NA> c
#$bar
# A tibble: 3 x 2
# a b
# <chr> <chr>
#1 a a
#2 b b
#3 <NA> c
names(out) <- str_c(names(out), "_edited")
If we need to update "foo", "bar" (not recommended)
list2env(out, .GlobalEnv)
Or using keep
mget(all_tibbles) %>%
map(~ keep(.x, colSums(!is.na(.)) > 0))
For the second case with rows
out1 <- mget(all_tibbles) %>%
map(~ .x %>%
slice(-which.min(rowSums(!is.na(.)))))
names(out2) <- str_c(names(out), "_edited2")
list2env(out2, .GlobalEnv)
Or we can use Filter from base R to remove columns (OP already showed a base R option for removing rows)
lapply(mget(all_tibbles), function(x)
Filter(function(y) any(!is.na(y)), x))

Loop by variable names

I want to create a for loop by variable names.
Each time, I calculte the max between each two variables, and define a new one in data df. New variables look like this:var1_1, var1_2... Here is my code:
df=data.frame(matrix(c(1:6), nrow = 2))
colnames(df) = c("x", "y", "z")
for(i in length(names(df))-1){
df = df %>% mutate(paste0("var", i, "_", i+1) = max(names(df)[i], names(df)[i+1]))
}
But there gives error.
Expected output:
>df
x y z var1_2 var1_3 var2_3
1 3 5 3 5 5
2 4 6 4 6 6
One way via base R,
m1 <- sapply(combn(names(df),2, simplify = FALSE), function(i) do.call(pmax, df[i]))
nms <- combn(ncol(m1), 2, function(i) paste0('Var', i[1], '_', i[2]))
cbind(df, setNames(data.frame(m1), nms))
# x y z Var1_2 Var1_3 Var2_3
#1 1 3 5 3 5 5
#2 2 4 6 4 6 6
If you really want to use a Loop you can try:
ind<-combn(3,2)
for(i in 1:dim(df)[2]){
i <- ind[,i]
name <- paste0("var", i[1], "_", i[2])
val <- names(df)[i[ifelse(sum(df[,i[1]]) > sum(df[,i[2]]),1,2)]]
df <- mutate_(df, .dots= setNames(list(val),name))
}

Resources