R read.csv loop with iterating variable in quotation marks - r

This is my code:
library(data.table)
library(stringr)
parameters <- c("conductivity","calcium","chloride","magnesium","phosphate","potassium","salinity","sodium","sulphate")
for (i in parameters){
i <- read.csv(str_c("./Data/Parameters/",i,".csv"), sep=",", header=FALSE)
i <- unique(i)
i <- subset(i, select=c(1,2,4,6))
i <- setnames(i, c("site","date", str_c("",i,""), "material"))
i[,3] <- as.numeric(i[,3])
i <- subset(i, i > 0)
}
Now, there are two things that don't work here.
The first is in the setnames function: it doesn't understand that it needs to label one of the columns of the CSV with the variable name.
The second is that it doesn't actually call the imported files with 'conductivity', 'calcium', etc but simply calls them all 'i'.
How can I fix this?

We can do this with map
library(stringr)
library(dplyr)
library(purrr)
map(parameters, ~ read_csv(str_c("./Data/Parameters/", .x, ".csv")) %>%
distinct %>%
select(1, 2, 4, 6) %>%
rename_at(3, ~ .x) %>%
mutate(!! .x := !! rlang::sym(.x)) %>%
filter_at(vars(.x), any_vars(. > 0))
)

Another option using data.table:
library(data.table)
parameters <- c("conductivity","calcium","chloride","magnesium","phosphate",
"potassium","salinity","sodium","sulphate")
lapply(parameters, function(x) {
DT <- unique(fread(paste0("./Data/Parameters/", x, ".csv"), header=FALSE))[,
V4 := as.numeric(V4)][
V4 > 0, paste0("V", c(1,2,4,6))]
setnames(DT, names(DT), c("site","date", x, "material"))
})

Related

Looping error with lists: for function not works inside purrr::map2 in R

I built a function to use it inside the purrr::map2 function and run it in two lists. When I run the function steps separately it works ok. But apparently in map2 it runs the first time (for the first elements of list .x[[1]] .y[[1]]) and then in the second round throws this error in the for function:
How can I find out why it's not working?
PS: It's hard to put an example of the data here because they are lists with very specific characteristics for this function. I'm sorrry.
Follow the function:
df <- list()
build_HUW_raster <- function(.x, .y) {
list.time <- .x %>%
split(.$id) %>%
purrr::map(~list(t=as.matrix(.x$date),
xy=unname(as.matrix(.x[,c(22,23)])))
)
for(i in 1:50){
cat(i," ")
path=list.time[[i]]
ctmc=ctmcmove::path2ctmc(path$xy,path$t,r,method="LinearInterp")
df[[i]] <- as.data.frame(do.call(cbind, ctmc))
}
df <- df %>% purrr::map(~ group_by(., ec) %>%
summarise(rt = mean(rt)) %>%
arrange(desc(rt))
)
stacktime <- df %>% purrr::map(~ rename(., cell = ec)) %>%
map(~dplyr::left_join(cargo.grid, ., by="cell", copy=T)) %>%
map(~raster::rasterize(., r, field="rt", na.rm=F, background=0)) %>%
raster::stack()
stackprop <- .y %>%
split(.$id) %>%
purrr::map(~ raster::rasterize(., y = r,
field=.$proportion,
fun=function(x, ...)median(x))) %>%
raster::stack()
stack_huw <- raster::overlay(raster::calc(stacktime, fun=function(x)
ifelse(is.na(x), NA, x/sum(x, na.rm=T))), stackprop, fun=function(x,y)x*y
)
raster_mean <- raster::stackApply(stack_huw,
indices = rep(1,raster::nlayers(stack_huw)),
fun = "mean",
na.rm = F
)
}
result.list <- purrr::map2 (.x=list1, .y=list2, fun=build_HUW_raster)
The reason is based on the element looped. [[ extracts the list element and depending on the class of the element, map loops over either individual elements if it is a vector/matrix or the columns in case of data.frame as these are units. By using [, it extracts the element as a list
list(1, 2, 3)[1]
[[1]]
[1] 1
vs
list(1, 2, 3)[[1]]
[1] 1
When we loop over map and apply some functions that require a specific structure i.e. colSums require a matrix/data.frame ie. with dim attributes, it fails if we use [[
> map(replicate(2, data.frame(col1 = 1:5, col2 = 6:10), simplify = FALSE)[[1]], colSums)
Error in .f(.x[[i]], ...) :
'x' must be an array of at least two dimensions
> map(replicate(2, data.frame(col1 = 1:5, col2 = 6:10), simplify = FALSE)[1], colSums)
[[1]]
col1 col2
15 40
Here, we may change the code to
purrr::map2(.x=list1[1], .y=list2[1], fun=build_HUW_raster)

how to change all POSIXt to character throughout a list of df

I know for a df, I can do df<-df %>% modify_if(is.POSIXt, as.character). If I have a list of df lst, how can I do this through every df in lst? I know I probably need to use map or lapply, but I am not sure how. Could someone give me some guidance?
Thanks.
If we have a list of datasets and wanted to change the column type and return the same data.frame format
library(purrr)
library(dplyr)
library(lubridate)
map(lst, ~ .x %>%
mutate(across(where(is.POSIXt), as.character)))
Or with inherits
map(lst1, ~ .x %>%
mutate(across(where(~ inherits(., 'POSIXct')), as.character )))
You're on the right track:
new.lst <- lapply( lst, \(x) x %>% modify_if( is.POSIXt, as.character) )
For ancient versions of R:
new.lst <- lst %>% map( ~ .x %>% modify_if( is.POSIXt, as.character) )
Or lapply with function(x):
new.lst <- lapply( lst, function(x) x %>% modify_if( is.POSIXt, as.character) )
With minimal data to demonstrate:
library(lubridate)
lst <- list( data.frame(x=now(),y=1), data.frame(z=now()-days(2),t=3) )
lst[[1]]$x
new.lst <- lapply( lst, \(x) x %>% modify_if( is.POSIXt, as.character) )
new.lst[[1]]$x
Output:
> lst[[1]]$x
[1] "2021-05-20 00:48:34 CEST"
> new.lst[[1]]$x
[1] "2021-05-20 00:48:34"
(the CEST part shows that the first output is a date)

Function in tidyverse

I want to create tidyverse with intermediate function.
I have a structure as
temp1 = sapply(df, function(x) .....)
temp2 = sapply(temp1, function(x) .......... )
temp3 = sapply(df, function(x) ..........)
temp = data.frame(temp2/temp3)
And I want to get something like this
sapply(df, function(x) .......) %>% sapply(df, function(x) ....... )
%>% ......
Reproducible example:
df = data.frame(a = c(1,2,3), b = c(1,2,3))
temp1 = sapply(df, function(x) x*3)
temp2 = sapply(temp1, function(x) x+4 )
temp3 = sapply(df, function(x) x/4)
temp = data.frame(temp2/temp3)
Assuming you have more complicated functions to perform on every column than the one shown you could use purrr functions like :
library(purrr)
map2_df(map(df, ~.x * 3 + 4), map(df, ~.x/4), `/`)
# a b
# <dbl> <dbl>
#1 28 28
#2 20 20
#3 17.3 17.3
To the best of my knowledge, the pipe operator do not remember the first block of the chain, only the previous one, so you have to use an intermediate step.
However, you can simplify the first part of your code to a pipeline:
temp1 = df %>% sapply(function(x) x*3) %>% sapply(function(x) x+4)
temp = temp1/sapply(df, function(x) x/4)
You can use brackets to wrap a whole pipe chain and use it as a data frame.
(df %>% sapply(., function(x) x*3) %>% sapply(., function(x) x+4 )) /
(df %>% sapply(., function(x) x/4) )

Construct variable names in select_

I am trying to write a function that will (in part) rename a variable by combining its source dataframe and existing variable name. In essence, I want:
df1 <- data.frame(a = 1, b = 2)
to become:
df1 %>%
rename(df1_a = a)
# df1_a b
#1 1 2
But I want to do this programatically, something along the lines of:
fun <- function(df, var) {
outdf <- rename_(df, paste(df, var, sep = "_") = var)
return(outdf)
}
This admittedly naive approach obviously doesn't work, but I haven't been able to figure it out. I'm sure the answer is somewhere in the nse vignette (https://cran.r-project.org/web/packages/dplyr/vignettes/nse.html), but that doesn't seem to address constructing variable names.
Not sure if this is the proper dplyr-esque way, but it'll get you going.
fun <- function(df, var) {
x <- deparse(substitute(df))
y <- deparse(substitute(var))
rename_(df, .dots = with(df, setNames(as.list(y), paste(x, y, sep = "_"))))
}
fun(df1, a)
# df1_a b
# 1 1 2
fun(df1, b)
# a df1_b
# 1 1 2
lazyeval isn't really needed here because the environment of both inputs is known. That being said:
library(lazyeval)
library(dplyr)
library(magrittr)
fun = function(df, var) {
df_ = lazy(df)
var_ = lazy(var)
fun_(df_, var_)
}
fun_ = function(df_, var_) {
new_var_string =
paste(df_ %>% as.character %>% extract(1),
var_ %>% as.character %>% extract(1),
sep = "_")
dots = list(var_) %>% setNames(new_var_string)
df_ %>%
lazy_eval %>%
rename_(.dots = dots)
}
fun(df1, a)

R-Hmisc impute by cluster result

I want to impute a variable x3 by the its mean corresponding to each cluster calculated considering other 2 variables X1 and X2.
I know that you can pass a function to impute from Hmisc package, like "mean" and it does the work. So I would like to pass a function that does all the following.
I use to write the code to do so:
df1 <- data.frame(x1=runif(1000,0,100),
x2=runif(1000,0,100),
x3=c(runif(900,0,100),rep(NA,100)))
I want to pass a function that does all of this:
clust<-kmeans(df1[,-grep('x3', colnames(df1))], 3)
df1$clust<-clust$cluster
library(plyr)
cc<-ddply(df1, 'clust',summarise, mean=mean(x3, na.rm=TRUE))
df2<-merge(df1,cc, by='clust')
df2$x3imputed2<-ifelse(is.na(df2$x3),df2$mean, df2$x3)
Is there a way to pass all this code as a function and use it in Hmisc? (I had a problem with ddply introducing x3 as a variable).
Something like the following:
ff<-function(i) {
clust<-kmeans(df1[,-grep(i, colnames(df1))], 3)
df1$clust<-clust$cluster
cc<-aggregate(df1[,i], by=list(clust=df1$clust), "mean", na.rm=TRUE)
df2<-merge(df1,cc, by='clust')
df2$x3imputed2<-ifelse(is.na(df2[, i]),df2$x, df2[,i])
}
f1$imputedx3<-with(df1, impute(x3,ff))
But I get an error:
empty cluster: try a better set of initial centers
And when I replace it by x3 I don't get the same error.
Try
library(lazyeval)
library(dplyr)
f1 <- function(dat, cname){
#get the third argument i.e, 'cname'
nm1 <- match.call()[[3]]
#paste 'imputed' for renaming the new column later
nm2 <- paste0(nm1, 'imputed')
#create an numeric column index that will be removed in kmeans calc
indx <- grep(cname, colnames(dat))
#get the 'kmeans' of the columns other than the 'cname'
clust <- kmeans(dat[,-indx],3)$cluster
#group by 'clust' and create new column with 'mutate'
dat %>%
group_by(clust=clust) %>%
mutate_(interp(~ifelse(is.na(v), mean(v, na.rm=TRUE), v),
v=as.name(cname))) %>%
#rename the column
setNames(., c(head(names(.),-1), nm2))
}
f1(df1, 'x3')
Or you could pass it without quotes by using v= lazy(cname)
f2 <- function(dat, cname){
nm1 <- match.call()[[3]]
nm2 <- paste0(nm1, 'imputed')
indx <- grep(nm1, colnames(dat))
clust <- kmeans(dat[,-indx],3)$cluster
dat %>%
group_by(clust=clust) %>%
mutate_(interp(~ifelse(is.na(v), mean(v, na.rm=TRUE), v),
v= lazy(cname))) %>%
setNames(., c(head(names(.),-1), nm2))
}
f2(df1, x3)

Resources