Pass vector of column names to paste() within mutate (dplyr) - r

I'm trying to write a function that takes as one of its arguments a vector of column names supplied by the user. The column names specify which columns of the data frame will be pasted together to form a new column within dplyr::mutate. I tried collapsing the elements of the argument vector first and then using the collapsed string in mutate, but this is wrong; see my latest attempt below. I made other attempts, but I don't understand the new quo, enquo, UQ, !!!, !!, and so on within dplyr. Can someone show me what I need to do?
# Example data: year and month stored as strings, plus an unrelated numeric column.
df <- data.frame(
  .yr = c("2000", "2001", "2002"),
  .mo = c("12", "01", "02"),
  .other = rnorm(3)
)
# The first two column names are the ones to be pasted together.
cols <- names(df)[1:2]
# Desired result, written with the column names hard-coded.
do_want <- mutate(df, new = paste(.yr, .mo, sep = "-"))
# First attempt (does not give the desired result).
# dat:  data frame to add the column to.
# vars: character vector of column names to paste together.
my_func <- function(dat, vars){
# Collapses the names into ONE string, e.g. ".yr,.mo" -- not into column references.
.vars <- paste(vars, collapse = ",")
result <- dat %>%
# paste() receives that single string, so `new` holds the string itself on
# every row rather than the pasted column values.
mutate(new = paste(.vars, sep = "-" ))
return(result)
}
my_func(dat = df, vars = cols)
Edit: this is my attempt at using quo and !! in the function definition. The result is a column containing the repeated string ".yr,.mo".
# Second attempt, using quo() and !! (still yields the repeated string ".yr,.mo").
# dat:  data frame to add the column to.
# vars: character vector of column names to paste together.
my_func <- function(dat, vars){
# quo() captures the paste() call itself; evaluating it still produces one
# collapsed string, not the columns named in `vars`.
.vars <- quo(paste(vars, collapse = ","))
result <- dat %>%
mutate(new = paste(!!.vars, sep = "-" ))
return(result)
}

Because you have a list of strings, you can use rlang::syms in your function to take the strings and turn them into symbols. Then you can use !!! to splice the arguments together to put into paste.
# Paste the columns named in `vars` into a new column `new`, separated by "-".
# dat:  data frame containing the columns.
# vars: character vector of column names.
my_func <- function(dat, vars) {
  # Turn each name into a symbol and splice the symbols into paste() as
  # separate arguments.
  mutate(dat, new = paste(!!!rlang::syms(vars), sep = "-"))
}
my_func(dat = df, vars = cols)
.yr .mo .other new
1 2000 12 -0.2663456 2000-12
2 2001 01 0.5463433 2001-01
3 2002 02 -1.3133078 2002-02

Use unite.
# Column names to combine; called `col_names` so that base::names() is not masked.
col_names <- colnames(iris)
iris %>% mutate(new = paste(col_names)) # Error: paste() returns one string per name, not one per row
# all_of() states explicitly that `col_names` is an external character vector.
iris %>% unite("new", all_of(col_names), remove = FALSE) # OK

Use mutate_ instead of mutate & turning the expression into a string worked for me:
# Build the paste() call as a string and let mutate_() evaluate it.
# dat:  data frame containing the columns.
# vars: character vector of column names.
dplyr_solution <- function(dat, vars) {
  col_list <- paste(vars, collapse = ",")
  paste_call <- paste0('paste(', col_list, ', sep="-")')
  mutate_(dat, new = paste_call)
}
dplyr_solution(dat = df, vars = cols)

Related

Looping error with lists: for loop does not work inside purrr::map2 in R

I built a function to use it inside the purrr::map2 function and run it in two lists. When I run the function steps separately it works ok. But apparently in map2 it runs the first time (for the first elements of list .x[[1]] .y[[1]]) and then in the second round throws this error in the for function:
How can I find out why it's not working?
PS: It's hard to provide example data here because the lists have very specific characteristics required by this function. I'm sorry.
The function follows:
# Global list referenced by build_HUW_raster(). Assignments to `df` inside the
# function create a function-local copy; this global stays an empty list.
df <- list()
# Builds one raster from a pair of list elements.
# .x: data with an `id` column, a `date` column, and columns 22 and 23 used as xy.
# .y: data with `id` and `proportion` columns.
# Also reads `r` and `cargo.grid`, which are not defined in this snippet.
build_HUW_raster <- function(.x, .y) {
# One list(t = ..., xy = ...) entry per id.
list.time <- .x %>%
split(.$id) %>%
purrr::map(~list(t=as.matrix(.x$date),
xy=unname(as.matrix(.x[,c(22,23)])))
)
# NOTE(review): the hard-coded 1:50 indexes list.time[[i]] up to 50, so it
# assumes every .x contains at least 50 ids -- confirm for each list element.
for(i in 1:50){
cat(i," ")
path=list.time[[i]]
ctmc=ctmcmove::path2ctmc(path$xy,path$t,r,method="LinearInterp")
df[[i]] <- as.data.frame(do.call(cbind, ctmc))
}
# Mean rt per ec for each path, sorted by descending rt.
df <- df %>% purrr::map(~ group_by(., ec) %>%
summarise(rt = mean(rt)) %>%
arrange(desc(rt))
)
# Join each summary onto cargo.grid by cell, rasterize rt, and stack the layers.
stacktime <- df %>% purrr::map(~ rename(., cell = ec)) %>%
map(~dplyr::left_join(cargo.grid, ., by="cell", copy=T)) %>%
map(~raster::rasterize(., r, field="rt", na.rm=F, background=0)) %>%
raster::stack()
# One rasterized layer of median `proportion` per id in .y.
stackprop <- .y %>%
split(.$id) %>%
purrr::map(~ raster::rasterize(., y = r,
field=.$proportion,
fun=function(x, ...)median(x))) %>%
raster::stack()
# Divide each stacktime value by its sum (NA stays NA), then multiply by stackprop.
stack_huw <- raster::overlay(raster::calc(stacktime, fun=function(x)
ifelse(is.na(x), NA, x/sum(x, na.rm=T))), stackprop, fun=function(x,y)x*y
)
# All layers share index 1, so they are combined into a single "mean" layer.
# This assignment is the last expression, so its value is what the function returns.
raster_mean <- raster::stackApply(stack_huw,
indices = rep(1,raster::nlayers(stack_huw)),
fun = "mean",
na.rm = F
)
}
# NOTE(review): purrr::map2() names its function argument `.f`; confirm that
# `fun=` is matched as intended.
result.list <- purrr::map2 (.x=list1, .y=list2, fun=build_HUW_raster)
The reason lies in the element being looped over. [[ extracts the list element itself and, depending on the class of that element, map then loops over either its individual elements (if it is a vector/matrix) or its columns (in the case of a data.frame), as these are its units. By contrast, [ extracts the element as a list of length one, so map passes the whole element to the function.
list(1, 2, 3)[1]
[[1]]
[1] 1
vs
list(1, 2, 3)[[1]]
[1] 1
When we loop with map and apply a function that requires a specific structure (e.g. colSums requires a matrix/data.frame, i.e. an object with a dim attribute), it fails if we use [[:
> map(replicate(2, data.frame(col1 = 1:5, col2 = 6:10), simplify = FALSE)[[1]], colSums)
Error in .f(.x[[i]], ...) :
'x' must be an array of at least two dimensions
> map(replicate(2, data.frame(col1 = 1:5, col2 = 6:10), simplify = FALSE)[1], colSums)
[[1]]
col1 col2
15 40
Here, we may change the code to
# map2() takes the function to apply as `.f`.
purrr::map2(.x = list1[1], .y = list2[1], .f = build_HUW_raster)

Iterating through dataframes for creating and filling new data frames

I have the following large dataframes:
Jan_Feb2019
Mar_Apr2019
May_Jun2019
Jul_Aug2019
Sep_Oct2019
Nov_Dec2019
Jan_Feb2020
Mar_2020
I use the following code to generate other data frames and fill their columns with the data I want.
# Jan_Feb2019: flatten the nested fields of Jan_Feb2019 into Jan_Feb2019_df.
# Start from the reaction counts; the single column is renamed on the next line.
Jan_Feb2019_df <- as.data.frame(Jan_Feb2019$reactions$summary$total_count)
colnames(Jan_Feb2019_df)[1] <- "Reactions"
Jan_Feb2019_df$Shares <- Jan_Feb2019$shares$count
Jan_Feb2019_df$Comments <- Jan_Feb2019$comments$summary$total_count
Jan_Feb2019_df$Message <- Jan_Feb2019$message
Jan_Feb2019_df$Likes <- Jan_Feb2019$likes$summary$total_count
Jan_Feb2019_df$CreatedDate <- Jan_Feb2019$created_time
Jan_Feb2019_df$PostID <- Jan_Feb2019$id
Jan_Feb2019_df$Love <- Jan_Feb2019$reacts_love$summary$total_count
Jan_Feb2019_df$Angry <- Jan_Feb2019$reacts_angry$summary$total_count
Jan_Feb2019_df$Sad <- Jan_Feb2019$reacts_sad$summary$total_count
Jan_Feb2019_df$HAHA <- Jan_Feb2019$reacts_haha$summary$total_count
Jan_Feb2019_df$WOW <- Jan_Feb2019$reacts_wow$summary$total_count
# Column 6 is CreatedDate (the sixth column added above); convert it with anytime().
Jan_Feb2019_df$CreatedDate <- anytime(Jan_Feb2019_df[,6])
Jan_Feb2019_df$insights.data <- Jan_Feb2019$insights$data
# Unnest the insights, keep the listed columns, and spread name/value into
# one column per name.
Jan_Feb2019_df <- Jan_Feb2019_df %>%
unnest(insights.data) %>%
unnest(values) %>%
select(Message,Shares,Comments,Reactions,Likes,CreatedDate,PostID,Love,Angry,Sad,HAHA,WOW,name,value) %>%
pivot_wider(names_from = name, values_from = value)
Is there a way to iterate over all the above data frames, so I won't have to repeat the process 8 times?
Thanks
The code below is untested. I have tried to follow the code in the question, making it general. There are 2 functions.
fillNewDf takes the old object as only argument and creates and fills the new data frame.
makeNewDf takes the old object name as an argument and calls fillNewDf returning its value.
If the objects are in the global environment then makeNewDf argument envir default value is used.
#' Build the flattened data frame for one of the original objects
#'
#' @param X One of the original objects (e.g. `Jan_Feb2019`), holding the
#'   nested `reactions`, `shares`, `comments`, ... elements read below.
#' @return The result of unnesting `insights.data` and `values`, selecting the
#'   listed columns, and widening `name`/`value` into one column per `name`.
fillNewDf <- function(X) {
  # Shorthand for the repeated X$<field>$summary$total_count lookup.
  total_count <- function(field) X[[field]][["summary"]][["total_count"]]

  Y <- data.frame(Reactions = total_count("reactions"))
  Y[["Shares"]] <- X[["shares"]][["count"]]
  Y[["Comments"]] <- total_count("comments")
  Y[["Message"]] <- X[["message"]]
  Y[["Likes"]] <- total_count("likes")
  Y[["CreatedDate"]] <- X[["created_time"]]
  Y[["PostID"]] <- X[["id"]]
  Y[["Love"]] <- total_count("reacts_love")
  Y[["Angry"]] <- total_count("reacts_angry")
  Y[["Sad"]] <- total_count("reacts_sad")
  Y[["HAHA"]] <- total_count("reacts_haha")
  Y[["WOW"]] <- total_count("reacts_wow")
  # Convert by column name rather than by position 6, so the conversion
  # cannot hit the wrong column if an earlier field is missing.
  Y[["CreatedDate"]] <- anytime(Y[["CreatedDate"]])
  Y[["insights.data"]] <- X[["insights"]][["data"]]

  Y %>%
    unnest(insights.data) %>%
    unnest(values) %>%
    select(
      Message, Shares, Comments, Reactions, Likes, CreatedDate, PostID,
      Love, Angry, Sad, HAHA, WOW, name, value
    ) %>%
    pivot_wider(names_from = name, values_from = value)
}
#' Look up an original object by name and build its new data frame
#'
#' @param X Name of the original object, as a string.
#' @param envir Environment in which to look up `X`; defaults to the global
#'   environment.
#' @return The value returned by `fillNewDf()` for that object.
makeNewDf <- function(X, envir = .GlobalEnv) {
  DF <- get(X, envir = envir)
  fillNewDf(DF)
}
Now get the names of the objects to be processed with ls() and create a list with the new data frames.
# Names of the original objects: everything in the workspace whose name ends
# in four digits.
old_names <- ls(pattern = "\\d{4}$")
# makeNewDf() takes an object name, so iterate over the names themselves.
new_list <- lapply(old_names, makeNewDf)
names(new_list) <- paste(old_names, "df", sep = "_")
If these new data frames are to become objects in the global environment, list2env(new_list) will create them with the same names as the names attribute of new_list.

Pass a data.frame column name to a function that uses purrr::map

I'm working with nested dataframes and want to pass the name of the top level dataframe, and the name of a column containing lower level dataframes, to a function that uses purrr::map to iterate over the lower level data frames.
Here's a toy example.
library(dplyr)
library(purrr)
library(tibble)
library(tidyr)
df1 <- tibble(x = c("a","b","c", "a","b","c"), y = 1:6)
df1 <- df1 %>%
group_by(x) %>%
nest()
# Returns a one-column tibble `min1` holding the minimum of each nested
# `data` element of `df`.
testfunc1 <- function(df) {
  with_min <- mutate(df, out = map(data, min))
  tibble(min1 = with_min$out)
}
# Attempt at passing the column name as a string (does not work as intended).
# df:       nested data frame.
# col_name: name of the list-column, given as a string.
testfunc2 <- function(df, col_name) {
df <- df %>%
# col_name is just the string "data" here, so map() iterates over that string
# instead of over the list-column it names.
mutate(out = map(col_name, min))
tibble(min2 = df$out)
}
df1 <- bind_cols(df1, testfunc1(df1))
df1 <- bind_cols(df1, testfunc2(df1, "data"))
df1$min1
df1$min2
testfunc1 behaves as expected, in this case giving the minimum of each data column in a new column. In testfunc2, where I've tried to pass the column name, a string reading "data" is passed to the new column. I think I understand from the thread here (Pass a data.frame column name to a function) why this doesn't behave as I want, but I haven't been able to figure out how to make it work in this case. Any suggestions would be great.
This should work for you, it uses the tidy eval framework. This assumes col_name is a string.
# String version: `col_name` is the name of the list-column, given as a string.
testfunc2 <- function(df, col_name) {
  # Convert the string to a symbol so that mutate() sees a column reference.
  col_sym <- rlang::sym(col_name)
  with_min <- mutate(df, out = map(!!col_sym, min))
  tibble(min2 = with_min$out)
}
EDIT:
If you'd rather pass a bare column name to the function, instead of a string, use enquo instead of sym.
# Bare-name version: `col_name` is passed unquoted, e.g. testfunc2(df1, data).
testfunc2 <- function(df, col_name) {
  # Capture the caller's expression so it can be unquoted inside mutate().
  captured_col <- enquo(col_name)
  with_min <- mutate(df, out = map(!!captured_col, min))
  tibble(min2 = with_min$out)
}

Construct variable names in select_

I am trying to write a function that will (in part) rename a variable by combining its source dataframe and existing variable name. In essence, I want:
df1 <- data.frame(a = 1, b = 2)
to become:
df1 %>%
rename(df1_a = a)
# df1_a b
#1 1 2
But I want to do this programmatically, something along the lines of:
fun <- function(df, var) {
outdf <- rename_(df, paste(df, var, sep = "_") = var)
return(outdf)
}
This admittedly naive approach obviously doesn't work, but I haven't been able to figure it out. I'm sure the answer is somewhere in the nse vignette (https://cran.r-project.org/web/packages/dplyr/vignettes/nse.html), but that doesn't seem to address constructing variable names.
Not sure if this is the proper dplyr-esque way, but it'll get you going.
# Rename column `var` of `df` to "<name of df>_<name of var>".
# df:  data frame, passed as a bare object name (its name becomes the prefix).
# var: bare name of the column to rename.
fun <- function(df, var) {
  df_name <- deparse(substitute(df))
  var_name <- deparse(substitute(var))
  new_name <- paste(df_name, var_name, sep = "_")
  # Build the list directly rather than inside with(df, ...): with() would
  # look the helper variables up among df's columns first, so a column that
  # shares a helper's name would be used instead.
  rename_(df, .dots = setNames(as.list(var_name), new_name))
}
fun(df1, a)
# df1_a b
# 1 1 2
fun(df1, b)
# a df1_b
# 1 1 2
lazyeval isn't really needed here because the environment of both inputs is known. That being said:
library(lazyeval)
library(dplyr)
library(magrittr)
# Captures both arguments with lazy() and hands the captured objects to fun_().
fun = function(df, var) {
df_ = lazy(df)
var_ = lazy(var)
fun_(df_, var_)
}
# Renames the captured column on the captured data frame.
# df_, var_: the objects produced by lazy() in fun().
fun_ = function(df_, var_) {
# New name: first element of each object's character form, joined by "_".
new_var_string =
paste(df_ %>% as.character %>% extract(1),
var_ %>% as.character %>% extract(1),
sep = "_")
# One-element list holding var_, named with the new column name.
dots = list(var_) %>% setNames(new_var_string)
# Evaluate the captured data frame, then rename using that list.
df_ %>%
lazy_eval %>%
rename_(.dots = dots)
}
fun(df1, a)

How to get the name of a data.frame within a list?

How can I get a data frame's name from a list? Sure, get() gets the object itself, but I want to have its name for use within another function. Here's the use case, in case you would rather suggest a work around:
lapply(somelistOfDataframes, function(X) {
ddply(X, .(idx, bynameofX), summarise, checkSum = sum(value))
})
There is a column in each data frame that goes by the same name as the data frame within the list. How can I get this name bynameofX? names(X) would return the whole vector.
EDIT: Here's a reproducible example:
# Two example data frames of 100 rows each: a numeric `value`, a two-level
# category column named after the list element (cat / cat2), and a repeating
# idx of "a".."d".
df1 <- data.frame(
  value = rnorm(100),
  cat = rep(c(1, 2), each = 50),
  idx = rep(letters[1:4], times = 25)
)
df2 <- data.frame(
  value = rnorm(100, mean = 8),
  cat2 = rep(c(1, 2), each = 50),
  idx = rep(letters[1:4], times = 25)
)
mylist <- list(cat = df1, cat2 = df2)
lapply(mylist, head, n = 5)
I'd use the names of the list in this fashion:
dat1 = data.frame()
dat2 = data.frame()
l = list(dat1 = dat1, dat2 = dat2)
> str(l)
List of 2
$ dat1:'data.frame': 0 obs. of 0 variables
$ dat2:'data.frame': 0 obs. of 0 variables
and then use lapply + ddply like:
lapply(names(l), function(x) {
ddply(l[[x]], c("idx", x), summarise,checkSum = sum(value))
})
This remains untested without a reproducible example, but it should point you in the right direction.
EDIT (ran2): Here's the code using the reproducible example.
# Summarise each data frame by idx and by the column that shares the list
# element's name, keeping the list names on the result.
l <- setNames(
  lapply(names(mylist), function(nm) {
    ddply(mylist[[nm]], c("idx", nm), summarise, checkSum = sum(value))
  }),
  names(mylist)
)
l
Here is the dplyr equivalent
library(dplyr)
# One row per data frame: the data frame itself plus its name in the list.
catalog =
data_frame(
data = someListOfDataframes,
cat = names(someListOfDataframes)) %>%
rowwise %>%
mutate(
# Rename the column whose name is stored in `cat` to the common name "cat".
renamed =
data %>%
rename_(.dots =
cat %>%
as.name %>%
list %>%
setNames("cat")) %>%
list)
# Stack the renamed data frames and sum `value` per data frame, idx and cat.
catalog$renamed %>%
bind_rows(.id = "number") %>%
group_by(number, idx, cat) %>%
summarize(checkSum = sum(value))
You could first store the names with names(list) -> list_name and then use list_name[1], list_name[2], etc. to get each name. (You may also need as.numeric(list_name[x]) if your list names are numbers.)

Resources