I want to create tidyverse with intermediate function.
I have a structure as
temp1 = sapply(df, function(x) .....)
temp2 = sapply(temp1, function(x) .......... )
temp3 = sapply(df, function(x) ..........)
temp = data.frame(temp2/temp3)
And I want to get something like this
sapply(df, function(x) .......) %>% sapply(df, function(x) ....... )
%>% ......
Reproducible example:
df = data.frame(a = c(1,2,3), b = c(1,2,3))
temp1 = sapply(df, function(x) x*3)
temp2 = sapply(temp1, function(x) x+4 )
temp3 = sapply(df, function(x) x/4)
temp = data.frame(temp2/temp3)
Assuming you have more complicated functions to perform on every column than the one shown you could use purrr functions like :
library(purrr)
map2_df(map(df, ~.x * 3 + 4), map(df, ~.x/4), `/`)
# a b
# <dbl> <dbl>
#1 28 28
#2 20 20
#3 17.3 17.3
To the best of my knowledge, the pipe operator do not remember the first block of the chain, only the previous one, so you have to use an intermediate step.
However, you can simplify the first part of your code to a pipeline:
temp1 = df %>% sapply(function(x) x*3) %>% sapply(function(x) x+4)
temp = temp1/sapply(df, function(x) x/4)
You can use brackets to wrap a whole pipe chain and use it as a data frame.
(df %>% sapply(., function(x) x*3) %>% sapply(., function(x) x+4 )) /
(df %>% sapply(., function(x) x/4) )
Related
I built a function to use it inside the purrr::map2 function and run it in two lists. When I run the function steps separately it works ok. But apparently in map2 it runs the first time (for the first elements of list .x[[1]] .y[[1]]) and then in the second round throws this error in the for function:
How can I find out why it's not working?
PS: It's hard to put an example of the data here because they are lists with very specific characteristics for this function. I'm sorrry.
Follow the function:
df <- list()
build_HUW_raster <- function(.x, .y) {
list.time <- .x %>%
split(.$id) %>%
purrr::map(~list(t=as.matrix(.x$date),
xy=unname(as.matrix(.x[,c(22,23)])))
)
for(i in 1:50){
cat(i," ")
path=list.time[[i]]
ctmc=ctmcmove::path2ctmc(path$xy,path$t,r,method="LinearInterp")
df[[i]] <- as.data.frame(do.call(cbind, ctmc))
}
df <- df %>% purrr::map(~ group_by(., ec) %>%
summarise(rt = mean(rt)) %>%
arrange(desc(rt))
)
stacktime <- df %>% purrr::map(~ rename(., cell = ec)) %>%
map(~dplyr::left_join(cargo.grid, ., by="cell", copy=T)) %>%
map(~raster::rasterize(., r, field="rt", na.rm=F, background=0)) %>%
raster::stack()
stackprop <- .y %>%
split(.$id) %>%
purrr::map(~ raster::rasterize(., y = r,
field=.$proportion,
fun=function(x, ...)median(x))) %>%
raster::stack()
stack_huw <- raster::overlay(raster::calc(stacktime, fun=function(x)
ifelse(is.na(x), NA, x/sum(x, na.rm=T))), stackprop, fun=function(x,y)x*y
)
raster_mean <- raster::stackApply(stack_huw,
indices = rep(1,raster::nlayers(stack_huw)),
fun = "mean",
na.rm = F
)
}
result.list <- purrr::map2 (.x=list1, .y=list2, fun=build_HUW_raster)
The reason is based on the element looped. [[ extracts the list element and depending on the class of the element, map loops over either individual elements if it is a vector/matrix or the columns in case of data.frame as these are units. By using [, it extracts the element as a list
list(1, 2, 3)[1]
[[1]]
[1] 1
vs
list(1, 2, 3)[[1]]
[1] 1
When we loop over map and apply some functions that require a specific structure i.e. colSums require a matrix/data.frame ie. with dim attributes, it fails if we use [[
> map(replicate(2, data.frame(col1 = 1:5, col2 = 6:10), simplify = FALSE)[[1]], colSums)
Error in .f(.x[[i]], ...) :
'x' must be an array of at least two dimensions
> map(replicate(2, data.frame(col1 = 1:5, col2 = 6:10), simplify = FALSE)[1], colSums)
[[1]]
col1 col2
15 40
Here, we may change the code to
purrr::map2(.x=list1[1], .y=list2[1], fun=build_HUW_raster)
Have a data frame with a concatenated column that I want to order numerically with the number after -
df <- data.frame(Order = c("A23_2-A27_3-A40_4-A10_1", "A25_2-A21_3-A11_1", "A9_1", "A33_2-A8_1"))
and want to have a result like this:
df <- data.frame(Order = c("A10A23A27A40", "A11A25A21", "A9", "A8A33"))
tried couple of things with tidyverse but couldn't get a clean result.
df %>%
rowid_to_column() %>%
separate_rows(Order, sep='-') %>%
separate(Order, c('Order', 'v'), convert = TRUE) %>%
arrange(v)%>%
group_by(rowid) %>%
summarise(Order = str_c(Order, collapse = ''))
# A tibble: 4 x 2
rowid Order
<int> <chr>
1 1 A10A23A27A40
2 2 A11A25A21
3 3 A9
4 4 A8A33
Another base R approach:
df$Order <- sapply(strsplit(df$Order, '-'), function(x) {
spl <- strsplit(x, '_') # split by '_'
spl <- do.call(rbind, spl) # create a 2-column matrix
ord <- order(as.numeric(spl[, 2])) # order of numeric parts
paste(spl[ord, 1], collapse='') # concatenate in correct order
})
Here is a base R option:
df$Order <-
sapply(strsplit(df$Order, "-"), function(x)
paste0(gsub("\\_.*", "", x[order(as.numeric(sub("^[^_]*_", "", x)))]), collapse = ""))
Output
Order
1 A10A23A27A40
2 A11A25A21
3 A9
4 A8A33
Or a tidyverse option:
library(tidyverse)
df %>%
mutate(Order = map(str_split(Order, "-"), ~
str_c(
str_replace_all(.x[order(as.numeric(str_replace_all(.x, "^[^_]*_", "")))], "\\_.*", ""), collapse = ""
)))
This is my code:
library(data.table)
library(stringr)
parameters <- c("conductivity","calcium","chloride","magnesium","phosphate","potassium","salinity","sodium","sulphate")
for (i in parameters){
i <- read.csv(str_c("./Data/Parameters/",i,".csv"), sep=",", header=FALSE)
i <- unique(i)
i <- subset(i, select=c(1,2,4,6))
i <- setnames(i, c("site","date", str_c("",i,""), "material"))
i[,3] <- as.numeric(i[,3])
i <- subset(i, i > 0)
}
Now, there are two things that don't work here.
The first is in the setnames function: it doesn't understand that it needs to label one of the columns of the CSV with the variable name.
The second is that it doesn't actually call the imported files with 'conductivity', 'calcium', etc but simply calls them all 'i'.
How can I fix this?
We can do this with map
library(stringr)
library(dplyr)
library(purrr)
map(parameters, ~ read_csv(str_c("./Data/Parameters/", .x, ".csv")) %>%
distinct %>%
select(1, 2, 4, 6) %>%
rename_at(3, ~ .x) %>%
mutate(!! .x := !! rlang::sym(.x)) %>%
filter_at(vars(.x), any_vars(. > 0))
)
Another option using data.table:
library(data.table)
parameters <- c("conductivity","calcium","chloride","magnesium","phosphate",
"potassium","salinity","sodium","sulphate")
lapply(parameters, function(x) {
DT <- unique(fread(paste0("./Data/Parameters/", x, ".csv"), header=FALSE))[,
V4 := as.numeric(V4)][
V4 > 0, paste0("V", c(1,2,4,6))]
setnames(DT, names(DT), c("site","date", x, "material"))
})
I am trying to write a function that will (in part) rename a variable by combining its source dataframe and existing variable name. In essence, I want:
df1 <- data.frame(a = 1, b = 2)
to become:
df1 %>%
rename(df1_a = a)
# df1_a b
#1 1 2
But I want to do this programatically, something along the lines of:
fun <- function(df, var) {
outdf <- rename_(df, paste(df, var, sep = "_") = var)
return(outdf)
}
This admittedly naive approach obviously doesn't work, but I haven't been able to figure it out. I'm sure the answer is somewhere in the nse vignette (https://cran.r-project.org/web/packages/dplyr/vignettes/nse.html), but that doesn't seem to address constructing variable names.
Not sure if this is the proper dplyr-esque way, but it'll get you going.
fun <- function(df, var) {
x <- deparse(substitute(df))
y <- deparse(substitute(var))
rename_(df, .dots = with(df, setNames(as.list(y), paste(x, y, sep = "_"))))
}
fun(df1, a)
# df1_a b
# 1 1 2
fun(df1, b)
# a df1_b
# 1 1 2
lazyeval isn't really needed here because the environment of both inputs is known. That being said:
library(lazyeval)
library(dplyr)
library(magrittr)
fun = function(df, var) {
df_ = lazy(df)
var_ = lazy(var)
fun_(df_, var_)
}
fun_ = function(df_, var_) {
new_var_string =
paste(df_ %>% as.character %>% extract(1),
var_ %>% as.character %>% extract(1),
sep = "_")
dots = list(var_) %>% setNames(new_var_string)
df_ %>%
lazy_eval %>%
rename_(.dots = dots)
}
fun(df1, a)
How can I get a data frame's name from a list? Sure, get() gets the object itself, but I want to have its name for use within another function. Here's the use case, in case you would rather suggest a work around:
lapply(somelistOfDataframes, function(X) {
ddply(X, .(idx, bynameofX), summarise, checkSum = sum(value))
})
There is a column in each data frame that goes by the same name as the data frame within the list. How can I get this name bynameofX? names(X) would return the whole vector.
EDIT: Here's a reproducible example:
df1 <- data.frame(value = rnorm(100), cat = c(rep(1,50),
rep(2,50)), idx = rep(letters[1:4],25))
df2 <- data.frame(value = rnorm(100,8), cat2 = c(rep(1,50),
rep(2,50)), idx = rep(letters[1:4],25))
mylist <- list(cat = df1, cat2 = df2)
lapply(mylist, head, 5)
I'd use the names of the list in this fashion:
dat1 = data.frame()
dat2 = data.frame()
l = list(dat1 = dat1, dat2 = dat2)
> str(l)
List of 2
$ dat1:'data.frame': 0 obs. of 0 variables
$ dat2:'data.frame': 0 obs. of 0 variables
and then use lapply + ddply like:
lapply(names(l), function(x) {
ddply(l[[x]], c("idx", x), summarise,checkSum = sum(value))
})
This remains untested without a reproducible answer. But it should help you in the right direction.
EDIT (ran2): Here's the code using the reproducible example.
l <- lapply(names(mylist), function(x) {
ddply(mylist[[x]], c("idx", x), summarise,checkSum = sum(value))
})
names(l) <- names(mylist); l
Here is the dplyr equivalent
library(dplyr)
catalog =
data_frame(
data = someListOfDataframes,
cat = names(someListOfDataframes)) %>%
rowwise %>%
mutate(
renamed =
data %>%
rename_(.dots =
cat %>%
as.name %>%
list %>%
setNames("cat")) %>%
list)
catalog$renamed %>%
bind_rows(.id = "number") %>%
group_by(number, idx, cat) %>%
summarize(checkSum = sum(value))
you could just firstly use names(list)->list_name and then use list_name[1] , list_name[2] etc. to get each list name. (you may also need as.numeric(list_name[x]) if your list names are numbers.