R for loop with characters list - r

I have a df such as:
name <- rep(c("a","b","c"),5)
QV.low <- runif(15, 2, 5)
QV.med <- runif(15, 5.0, 7.5)
QV.high <- runif(15, 7.5, 10)
df <- as.data.frame(cbind(name, QV.low, QV.med,QV.high))
and a list of names:
name.list <- c("a","b")
I want to do an operation, eg:
df %>%
subset(name %in% name.list) %>%
summarise(.,sum = sum(QV.low))
but I want to for each QV. variable via a loop.
I tried:
QV.list <- c("QV.low", "QV.med", "QV.high")
for(qv in 1:length(QV.list)){
QV <- noquote(QV.list[qv])
print(QV)
df %>%
subset(name %in% name.list) %>%
summarise(.,sum = sum(QV))
}
But it does not work.
How can I "extract" the character value from the QV.list in order to use it as df variable later?

You need to have at least 3 different names in namecol otherwise namecol %in% name.list1 is useless. If there's no filter and no pipe, there's no need for a loop. A simple colSums(df[,-1]) will do the job.
library(tidyverse)
QV.low <- runif(10, 2, 5)
QV.med <- runif(10, 5.0, 7.5)
QV.high <- runif(10, 7.5, 10)
namecol <- sample(c("a","b", "c"), 10, replace = T)
df <- data.frame(namecol, QV.low, QV.med,QV.high)
df
name.list1 <- c("a","b") # select some names
QV.list <- c("QV.low", "QV.med", "QV.high")
for(i in QV.list){
QV <- noquote(i)
print(QV)
qv <- sym(i)
print(df %>%
filter(namecol %in% name.list1) %>%
summarise(sum = sum(!!qv)))
}
will give you
[1] QV.low
sum
1 29.093
[1] QV.med
sum
1 61.07034
[1] QV.high
sum
1 86.02611

if I understood your problem you can resolve with this:
for( name in names(df)){
df[,name]
....
df %>% summarise(.,sum = sum(df[,name]))
}

Related

New column with random boolean values while controlling the ratio of TRUE/FALSE per category

In R I've got a dataset like this one:
df <- data.frame(
ID = c(1:30),
x1 = seq(0, 1, length.out = 30),
x2 = seq(100, 3000, length.out = 30),
category = gl(3, 10, labels = c("NEGATIVE", "NEUTRAL", "POSITIVE"))
)
Now I want to add a new column with randomized boolean values, but inside each category the proportion of TRUE and FALSE values should be the same (i.e. the randomizing process should generate the same count of true and false values, in the above data frame 5 TRUEs and 5 FALSEs in each of the 3 categories). How to do this?
You can sample a vector of "TRUE" and "FALSE" values without replacement so you have a randomized and balanced column in your data-frame.
sample(rep(c("TRUE","FALSE"),each=5),10,replace=FALSE)
Based on Yacine Hajji answer:
addRandomBool <- function(df, p){
n <- ceiling(nrow(df) * p)
df$bool <- sample(rep(c("TRUE","FALSE"), times = c(n, nrow(df) - n)))
df
}
Reduce(rbind, lapply(split(df, df$category), addRandomBool, p = 0.5))
where parametar p determines the proportion of TRUE.
This will sample within each group from a vector of 5 TRUE and 5 FALSE without replacement. It will assume that there are always 10 records per group.
library(dplyr)
library(tidyr)
df <- data.frame(
ID = c(1:30),
x1 = seq(0, 1, length.out = 30),
x2 = seq(100, 3000, length.out = 30),
category = gl(3, 10, labels = c("NEGATIVE", "NEUTRAL", "POSITIVE"))
)
set.seed(pi)
df %>%
group_by(category) %>%
nest() %>%
mutate(data = lapply(data,
function(df){ # Function to saple and assign the new_col
df$new_col <- sample(rep(c(FALSE, TRUE),
each = 5),
size = 10,
replace = FALSE)
df
})) %>%
unnest(cols = "data")
This next example is a little more generalized, but still assumes (approximately) even distribution of TRUE and FALSE within a group. But it can accomodate variable group sizes, and even groups with odd numbers of records (but will favor FALSE for odd numbers of records)
library(dplyr)
library(tidyr)
df <- data.frame(
ID = c(1:30),
x1 = seq(0, 1, length.out = 30),
x2 = seq(100, 3000, length.out = 30),
category = gl(3, 10, labels = c("NEGATIVE", "NEUTRAL", "POSITIVE"))
)
set.seed(pi)
df %>%
group_by(category) %>%
nest() %>%
mutate(data = lapply(data,
function(df){
df$new_col <- sample(rep(c(FALSE, TRUE),
length.out = nrow(df)),
size = nrow(df),
replace = FALSE)
df
})) %>%
unnest(cols = "data")
Maintaining Column Order
A couple of options to maintain the column order:
First, you can save the column order before you do your group_by - nest, and then use select to set the order when you're done.
set.seed(pi)
orig_col <- names(df) # original column order
df %>%
group_by(category) %>%
nest() %>%
mutate(data = lapply(data,
function(df){
df$new_col <- sample(rep(c(FALSE, TRUE),
length.out = nrow(df)),
size = nrow(df),
replace = FALSE)
df
})) %>%
unnest(cols = "data") %>%
select_at(c(orig_col, "new_col")) # Restore the column order
Or you can use a base R solution that doesn't change the column order in the first place
df <- split(df, df["category"])
df <- lapply(df,
function(df){
df$new_col <- sample(rep(c(FALSE, TRUE),
length.out = nrow(df)),
size = nrow(df),
replace = FALSE)
df
})
do.call("rbind", c(df, list(make.row.names = FALSE)))
There are likely a dozen other ways to do this, and probably more efficient ways that I'm not thinking of.

Apply a function to multiple datasets using lapply

I have a large number of datasets for which I want to create the same variable. I would like to create a function to avoid having to repeat the same code many times.
I tried the code below: the first 3 lines describe the creation of the variable that I am trying to apply through the function created below.
data1 <- data1 %>%
dplyr::group_by(id)%>%
dplyr::mutate(new_var = sum(score))
list_data <- c(data1, data2, data3)
my_func <- function(x) {
x <- x %>%
dplyr::group_by(id) %>%
dplyr::mutate(new_var = sum(score))
}
lapply(list_data, my_func)
I obtain the error message
no applicable method for 'group_by' applied to an object of class
"character".
Could you please help me figure this out?
for me this works fine:
my_func <- function(x) {
x <- x %>%
dplyr::group_by(id) %>%
dplyr::mutate(new_var = sum(score))
}
data1 <- data.frame(id = rep(1:3, each = 3), score = 1:9)
data2 <- data.frame(id = rep(1:3, each = 3), score = 11:19)
data3 <- data.frame(id = rep(1:3, each = 3), score = 21:29)
list_data <- list(data1, data2, data3)
lapply(list_data, my_func)

How to delete specific record of one dataframe according to values in another dataframe in R

I have two dataframe (dat1 & dat2). Some records in dat2 need to be deleted, according to if var1 in dat1 is negative. I use the following codes, but I think they are not the best one, because I use an extra temporary dataframe tmp. Could we have a better method?
library(dplyr)
Date1 <- c("1999-12-17", "2005-1-5", "2003-11-2", "2005-6-12", "2005-8-9")
Date1 <- as.POSIXct(Date1, tz = "UTC")
Date2 <- c("2005-1-5", "2005-6-12", "2005-8-9")
Date2 <- as.POSIXct(Date2, tz = "UTC")
var1 <- c(-3, -10, 9, 5, 8)
var2 <- c(0.2, 0.6, 0.15)
dat1 <- data.frame(Date1, var1)
dat2 <- data.frame(Date2, var2)
#Below is what I did
tmp <- inner_join(dat1, dat2, by = c("Date1" = "Date2"))
tmp <- tmp[-tmp$var1 < 0, ]
dat2 <- tmp[, c(1,3)]
Something like this should work:
dat2 %>%
left_join(dat1, by = c("Date2" = "Date1")) %>%
filter(var1 > 0) %>%
mutate(var1 = NULL)
Given you're already using dplyr, why not make better use of pipes, filter, and select as such
library(dplyr)
dat2 %>%
left_join(dat1, by = c("Date2" = "Date1")) %>%
filter(var1 >= 0) %>%
select(-var1)

Apply different data to a function in R

I have the following data frame:
library(tidyverse)
set.seed(1234)
df <- data.frame(
x = seq(1, 100, 1),
y = rnorm(100)
)
Where I apply a smooth spline using different knots:
nknots <- seq(4, 15, 1)
output <- map(nknots, ~ smooth.spline(x = df$x, y = df$y, nknots = .x))
What I need to do now is to apply the same function using 2-point and 3-point averages:
df_2 <- df %>%
group_by(., x = round(.$x/2)*2) %>%
summarise_all(funs(mean))
df_3 <- df %>%
group_by(., x = round(.$x/3)*3) %>%
summarise_all(funs(mean))
In summary, I need to apply the function I used in output with the following data frames:
df
df_2
df_3
Of course, this is a minimal example, so I am looking for a efficient way of doing it. Preferably with the purrr package.
Using lapply, and the library zoo to calculate the moving average in a more simple and elegant manner:
library(zoo)
lapply(1:3,function(roll){
dftemp <- as.data.frame(rollmean(df,roll))
map(nknots, ~ smooth.spline(x = dftemp$x, y = dftemp$y, nknots = .x))
})
Here's one possible solution:
library(tidyverse)
set.seed(1234)
df <- data.frame(x = seq(1, 100, 1),
y = rnorm(100))
# funtion to get v-point averages
GetAverages = function(v) {
df %>%
group_by(., x = round(.$x/v)*v) %>%
summarise_all(funs(mean)) }
# specify nunber of knots
nknots <- seq(4, 15, 1)
dt_res = tibble(v=1:3) %>% # specify v-point averages
mutate(d = map(v, GetAverages)) %>% # get data for each v-point
crossing(., data.frame(nknots=nknots)) %>% # combine each dataset with a knot
mutate(res = map2(d, nknots, ~smooth.spline(x = .x$x, y = .x$y, nknots = .y))) # apply smooth spline
You can use dt_res$res[dt_res$v == 1] to see all results for your original daatset, dt_res$res[dt_res$v == 2] to see results for your 2-point estimate, etc.

rowwise filtering in dplyr

I wanted to use dplyr instead of apply,1 in order to filter a dataset rowwise according to a logical expression, ie for this example I´d like to remove all rows that have one or more values of 99.
However, I was surprised by the poor performance in dplyr. Any ideas if I can speed this up in dplyr? Also, I would have thought that the rowwise function would pipe the individual rows, but apparently not (see below). How can I use the rowwise function?
library(tidyverse)
s <- tibble(rows = seq(from = 250, to = 5000, by = 250)) #my original dataset has 400K rows...
s$num <- map(s$rows, ~ rnorm(.x * 6))
s$num <-
map(s$num, ~ replace(.x, sample(1:length(.x), size = length(.x) / 20), 99))
s$mat <- map(s$num, ~ as_data_frame(matrix(.x, ncol = 6)))
help_an <- function(vec) {
browser()
return(!any(vec == 99))
}
help_dp_t <- function(df) {
clo1 <- proc.time()
a <- as_data_frame(t(df)) %>% summarise_all(help_an)
df2 <- filter(df, t(a)[, 1])
b <- tibble(time = (proc.time() - clo1)[3], df = list(df2))
return(b)
}
s$dplyr <- map(s$mat, ~ dplyr::mutate(help_dp_t(.x)))
help_lap <- function(df) {
clo1 <- proc.time()
a_base <- df[apply(df, 1, function(x)
! any(x == 99)), ]
b <- tibble(time = (proc.time() - clo1)[3], df = list(a_base))
return(b)
}
s$lapply <- map(s$mat, ~ mutate(help_lap(.x)))
s$equal_dplyr_lapply <-
map2_lgl(s$dplyr, s$lapply, ~ all.equal(.x$df, .y$df))
s$dplyr_time <- map_dbl(s$dplyr, "time")
s$lapply_time <- map_dbl(s$lapply, "time")
ggplot(gather(s, ... = c(7, 8)), aes(x = rows, y = value, color = key)) +
geom_line()
I tried the following with rowwise, but the rowwise pipe does not send a vector, but the entire df to the help_an function.
help_dp_r <- function(df) {
clo1 <- proc.time()
df2 <-
df %>% rowwise() %>% mutate(cond = help_an(.)) ### . is not passed on as a vector, but the entire df??
b <- tibble(time = (proc.time() - clo1)[3], df = list(df2))
}
s$dplyr_r <- map(s$mat, ~ dplyr::mutate(help_dp_r(.x)))

Resources