Making a function over a list with tidyverse and sapply - r

I am trying to build a function to obtain a weighted mean at the same variable in different dataframes in a list. The function is not taking some arguments (wage and weight), I believe there is a "" or [[]] problems but I can't seem to make it work.
Here's the reproducible example that gives me the error
set.seed(555)
lista <- list(A = data.frame(wage = (runif(10, min=50, max=100)), weight = (runif(10, min=0, max=1))),
B = data.frame(wage = (runif(10, min=55, max=105)), weight = (runif(10, min=0.1, max=1))))
list
wmeanf <- function(df, x, w) {
mean <- df %>% summarise (weighted.mean(x,w))
mean
}
twmean <- sapply(lista, function (X) wmeanf (df = X, x = wage, w = weight))
Thanks!

There are several ways to accomplish this. Hopefully one of these gets you going in the right direction:
library(tidyverse)
set.seed(555)
lista <- list(A = data.frame(wage = (runif(10, min=50, max=100)), weight = (runif(10, min=0, max=1))),
B = data.frame(wage = (runif(10, min=55, max=105)), weight = (runif(10, min=0.1, max=1))))
map(lista, ~ weighted.mean(x = .$wage, w = .$weight))
#> $A
#> [1] 75.60411
#>
#> $B
#> [1] 70.22652
lapply(lista, function(x) { weighted.mean(x = x$wage, w = x$weight) })
#> $A
#> [1] 75.60411
#>
#> $B
#> [1] 70.22652
sapply(lista, function(x) { weighted.mean(x = x$wage, w = x$weight) })
#> A B
#> 75.60411 70.22652
Created on 2020-05-05 by the reprex package (v0.3.0)

After #Jason's suggestion to look here about Dplyr evaluation and quoting I found a way to make my original intended function work:
set.seed(555)
lista <- list(A = data.frame(wage = (runif(10, min=50, max=100)), weight = (runif(10, min=0, max=1))),
B = data.frame(wage = (runif(10, min=55, max=105)), weight = (runif(10, min=0.1, max=1))))
wmeanf <- function(df, x, w) {
x <- enquo(x)
w <- enquo(w)
mean <- df %>% summarise (weighted.mean(!!x,!!w))
mean
}
sapply(lista, function (X) wmeanf (df = X, x = wage, w = weight))
$`A.weighted.mean(wage, weight)`
[1] 75.6041053069
$`B.weighted.mean(wage, weight)`
[1] 70.2265239366

Related

Extract values from list of named lists in R

Based on the names of sublists with xyz values of a list, I would like to extract a sample of the xyz values from a sublist. Note: the lists do not start at 1.
Example data
set.seed(123)
data <- list('4' = list(x = rnorm(5), y = rnorm(5), z = rnorm(5)),
'5' = list(x = rnorm(5), y = rnorm(5), z = rnorm(5)),
'6' = list(x = rnorm(5), y = rnorm(5), z = rnorm(5)),
'7' = list(x = rnorm(5), y = rnorm(5), z = rnorm(5)),
'8' = list(x = rnorm(5), y = rnorm(5), z = rnorm(5)))
Function to extract random values (derived from here)
I have the following function to sample random xyz values from the list:
get_elements <- function(data, i) {
#select the main list
tmp <- data[[i]]
#Check the length of each sublist, select minimum value
#and sample 1 number from 1 to that number
rand_int <- sample(min(lengths(tmp)), 1)
#select that element from each sub-list
sapply(tmp, `[[`, rand_int)
}
Example of function
# Show list number 8
data[['8']]
#> $x
#> [1] 0.3796395 -0.5023235 -0.3332074 -1.0185754 -1.0717912
#> $y
#> [1] 0.30352864 0.44820978 0.05300423 0.92226747 2.05008469
#> $z
#> [1] -0.4910312 -2.3091689 1.0057385 -0.7092008 -0.6880086
# Extract random combination from list 8
get_elements(data, '8')
#> x y z
#> -0.33320738 0.05300423 1.00573852
Rewrite function
Using the same function as above, I replaced i with 'i':
get_elements <- function(data, i) {
tmp <- data[['i']] # <-- changed i to 'i'
rand_int <- sample(min(lengths(tmp)), 1)
sapply(tmp, `[[`, rand_int)
}
Error & Question
get_elements(data, 8)
Warning in min(lengths(tmp)): no non-missing arguments to min;
returning Inf list()
Suddenly the function breaks, and I do not understand why? What is the reason for this error?
Try this:
get_elements <- function(data, i)
{
tmp <- data[[paste(i)]]
rand_int <- sample(min(lengths(tmp)), 1)
sapply(tmp, `[[`, rand_int)
}
The reason your initial code failed is because "i" is understood as "character i", not as "convert variable i into character". See:
i = 1
print("i") # i
print(i) # 1

mdply (or similar) a function instead of for loop

I want to generate data from a function iterating over a range of values. The setting is best explained in a small example:
myfun <- function(a, b, sims) {
x = 3/a*b
y = mean(a*rnorm(sims))
return(data.frame(x = x, y = y))
}
# Output I want:
d <- data.frame(x = 0, y= 0)
d[1,] <- myfun(a=4, b=2, sims = 100)
d[2,] <- myfun(a=4, b=3, sims = 100)
d[3,] <- myfun(a=4, b=4, sims = 100)
# --> With a for loop this is easy
# Using mdply, however, does not work
a <- expand.grid(a=1:3)
d <- plyr::mdply(a, myfun, b=seq(1,100, length=100), sims = 100)
You can use Map :
data <- expand.grid(a = 1:3, b = 1:100)
result <- do.call(rbind, Map(myfun, data$a, data$b, MoreArgs = list(sims = 100)))
head(result)
# x y
#1 3.0 -0.17846248
#2 1.5 0.06837716
#3 1.0 0.01034184
#4 6.0 -0.02898619
#5 3.0 0.10077290
#6 2.0 0.22321839
A similar way would be if you Vectorize myfun. Vectorize is a wrapper around mapply.
myfun_vec <- Vectorize(myfun)
t(myfun_vec(data$a, data$b, 100))
A purrr option :
result <- purrr::map2_df(data$a, data$b, myfun, sims = 100)

How execute pairwise.t.test into a list with `for` loop?

My list (lt):
df_1 <- data.frame(
x = replicate(
n = 2,
expr = runif(n = 30, min = 20, max = 100)
),
y = sample(
x = 1:3, size = 30, replace = TRUE
)
)
lt <- split(
x = df_1,
f = df_1[['y']]
)
vars <- names(df_1)[1:2]
I try:
for (i in vars) {
for (i in i) {
print(pairwise.t.test(x = lt[, i], g = lt[['y']], p.adj = 'bonferroni'))
}
}
But, the error message is:
Error in lista[, i] : incorrect number of dimensions
What's problem?
We don't need to split
pairwise.t.test(unlist(df_1[1:2]), g = rep(df_1$y, 2), p.adj = 'bonferroni')
#Pairwise comparisons using t tests with pooled SD
#data: unlist(df_1[1:2]) and rep(df_1$y, 2)
# 1 2
#2 1.00 -
#3 0.91 1.00

IDW parameters in R

I want to perform IDW interpolation using R using the idw command from the gstat package. I have this data:
#settings
library(gstat)
library(dplyr)
library(sp)
library(tidyr)
id_rep <- rep(c(1,2), 20)
f <- rep(c(930,930.2), each=20)
perc <- rep(c(90, 80), each=10)
x <- sample(1:50, 40)
y <- sample(50:100, 40)
E <- runif(40)
df <- data.frame(id_rep, perc, x,y, f, E)
df_split <- split(df, list(df$id_rep, df$perc, df$f), drop = TRUE, sep="_")
#grid
x.range <- range(df$x)
y.range <- range(df$y)
grid <- expand.grid(x = seq(x.range[1], x.range[2], by=1),
y = seq(y.range[1], y.range[2], by=1))
coordinates(grid) <- ~x + y
#interpolation
lst_interp_idw <- lapply(df_split, function(X) {
coordinates(X) <- ~x + y
E_idw <- idw(E~ 1, X, grid, idp=1, nmax=3) %>% as.data.frame()
df_interp <- select(E_idw, x,y,E_pred=var1.pred)
df_interp
})
df_interp_idw <- bind_rows(lst_interp_idw, .id = "interact") %>%
separate(interact, c("id_rep", "perc", "f"), sep = "\\_")
Now I want to perform each run with different idp and nmax parameters within certain values​ (idp from 1 to 3 by 0.5, and nmax 3 to 6 by 1) and get out a data frame with columns for each combination of idp and nmax values. I try with two for loops but it doesn't work.
EDIT
the code that doesn't work is:
idp = seq(from = 1, to = 3, by = 0.5)
nmax = seq(from = 3, to = 6, by = 1)
...
for(i in idp) {
for(j in nmax)
{ E_idw= idw(E ~ 1, X, grid, nmax = i, idp = j)
}
}
...
Here is a way how to store the result of every iteration in a list.
#settings
#install.packages("gstat")
library(gstat)
library(dplyr)
library(sp)
library(tidyr)
id_rep <- rep(c(1,2), 20)
f <- rep(c(930,930.2), each=20)
perc <- rep(c(90, 80), each=10)
x <- sample(1:50, 40)
y <- sample(50:100, 40)
E <- runif(40)
df <- data.frame(id_rep, perc, x,y, f, E)
df_split <- split(df, list(df$id_rep, df$perc, df$f), drop = TRUE, sep="_")
#grid
x.range <- range(df$x)
y.range <- range(df$y)
grid <- expand.grid(x = seq(x.range[1], x.range[2], by=1),
y = seq(y.range[1], y.range[2], by=1))
coordinates(grid) <- ~x + y
# ==============================================
# NEW function
# ==============================================
idp = seq(from = 1, to = 3, by = 0.5)
nmax = seq(from = 3, to = 6, by = 1)
#interpolation
lst_interp_idw <- lapply(df_split, function(X) {
coordinates(X) <- ~x + y
df_interp <- vector(length(idp)*length(nmax), mode = "list" )
k <- 0
for(i in idp) {
for(j in nmax) {
print(paste(i, j))
# Iterator
k <- k + 1
E_idw= idw(E ~ 1, X, grid, nmax = i, idp = j) %>% as.data.frame()
df_interp[[k]] <- select(E_idw, x,y,E_pred=var1.pred)
}
}
return(df_interp)
})
# ==============================================
Some plausibility checks (lapply is applied to 8 list elements and 20 variations are calculated):
length(lst_interp_idw) # 8
length(lst_interp_idw[[1]]) #20
length(lst_interp_idw[[1]]) #20
It should be easy for you to adapt the last line of your code
df_interp_idw <- bind_rows(lst_interp_idw, .id = "interact") %>%
separate(interact, c("id_rep", "perc", "f"), sep = "\\_")
to format the output in the desired format. This highly depends on how you want to present the different interpolation alternatives.

Randomly assign teachers to classrooms imposing restrictions

This question is very similar to a question I asked before. The added complication is that I have N schools with G grades and C classrooms. Additionally, I want to assign each of T teachers to 2 classrooms within a single school and grade.
I can generate some fake data with the following code:
library(randomNames)
set.seed(6232015)
n.schools <-20
gen.names <- function(n, which.names = "both", name.order = "last.first"){
names <- unique(randomNames(n=n, which.names = which.names, name.order = name.order))
need <- n - length(names)
while(need>0){
names <- unique(c(randomNames(n=need, which.names = which.names, name.order = name.order), names))
need <- n - length(names)
}
return(names)
}
#Generates the classrooms data frame
grade <- c(3,4,5)
classroom <- c(LETTERS[1:4])
classroom <- expand.grid(grade=c(3,4,5),
classroom=c(LETTERS[1:4]),
School.ID=paste0(gen.names(n = n.schools, which.names = "last"), ' School'))
#Generates teachers data frame
n.teachers=nrow(classroom)/2
gen.teachers <- function(n.teachers){
Teacher.ID <- gen.names(n = n.teachers, name.order = "last.first")
Teacher.exp <- runif(n = n.teachers, min = 1, max = 30)
Teacher.Other <- sample(c(0,1), replace = T, prob = c(0.5, 0.5), size = n.teachers)
Teacher.RE <- rnorm(n = n.teachers, mean = 0, sd = 1)
Teachers <- data.frame(Teacher.ID, Teacher.exp, Teacher.Other, Teacher.RE)
return(Teachers)
}
Teachers <- gen.teachers(n.teachers = n.teachers)
The data frame that I want to create would have 240 rows with 7 variables. Using sample like in the answer to my previous question will not work (I think) because of the restrictions I want to impose. I thought about using group_by() but I don't think that would do the trick...
Thanks!
This works, but I'm hopping to learn a more elegant solution
library(randomNames)
library(dplyr)
set.seed(6232015)
n.schools <-20
n.grades <- 3
n.classrooms <- 4
total.classrooms <- n.classrooms*n.grades*n.schools
gen.names <- function(n, which.names = "both", name.order = "last.first"){
names <- unique(randomNames(n=n, which.names = which.names, name.order = name.order))
need <- n - length(names)
while(need>0){
names <- unique(c(randomNames(n=need, which.names = which.names, name.order = name.order), names))
need <- n - length(names)
}
return(names)
}
#Generates teachers data frame
n.teachers=total.classrooms/2
gen.teachers <- function(n.teachers){
Teacher.ID <- gen.names(n = n.teachers, name.order = "last.first")
Teacher.exp <- runif(n = n.teachers, min = 1, max = 30)
Teacher.Other <- sample(c(0,1), replace = T, prob = c(0.5, 0.5), size = n.teachers)
Teacher.RE <- rnorm(n = n.teachers, mean = 0, sd = 1)
Teachers <- data.frame(Teacher.ID, Teacher.exp, Teacher.Other, Teacher.RE) %>% mutate(Teacher.ID=as.character(Teacher.ID))
return(Teachers)
}
Teachers <- gen.teachers(n.teachers = n.teachers)
str(Teachers$Teacher.ID)
#Make a ‘schoolGrade’ object and then reshape
schoolGrade <- expand.grid(grade = c(3,4,5),
School.ID = paste0(gen.names(n = n.schools, which.names = "last"),
' School'))
# assign each of T teachers to 2 classrooms within a single school and grade
cuttoff1<-n.teachers/2
schoolGrade$A <- Teachers$Teacher.ID[1:cuttoff1]
schoolGrade$B <- Teachers$Teacher.ID[1:cuttoff1]
schoolGrade$C <- Teachers$Teacher.ID[(cuttoff1+1):n.teachers]
schoolGrade$D <- Teachers$Teacher.ID[(cuttoff1+1):n.teachers]
library(tidyr)
schoolGrade <- gather(schoolGrade, Classroom, Teacher.ID, A:D) %>% full_join(Teachers, by="Teacher.ID")
The main problem is if i want to increase n.classrooms from 4 to 20. In that case instead of having 4 lines going from A to D I would have 20, plus the additional cutoffs. Which is very complicated...
This answer allows me to easily set n.classrooms to whatever value, for example 20.
The problem is that this code is painfully slow. Suggestion to improve it are very welcome!
library(dplyr)
library(randomNames)
library(geosphere)
set.seed(7142015)
# Define Parameters
n.Schools <- 20
first.grade<-3
last.grade<-5
n.Grades <-last.grade-first.grade+1
n.Classrooms <- 20 # THIS IS WHAT I WANTED TO BE ABLE TO CHANGE
n.Teachers <- (n.Schools*n.Grades*n.Classrooms)/2 #Two classrooms per teacher
# Define Random names function:
gen.names <- function(n, which.names = "both", name.order = "last.first"){
names <- unique(randomNames(n=n, which.names = which.names, name.order = name.order))
need <- n - length(names)
while(need>0){
names <- unique(c(randomNames(n=need, which.names = which.names, name.order = name.order), names))
need <- n - length(names)
}
return(names)
}
# Generate n.Schools names
gen.schools <- function(n.schools) {
School.ID <-
paste0(gen.names(n = n.schools, which.names = "last"), ' School')
School.long <- rnorm(n = n.schools, mean = 21.7672, sd = 0.025)
School.lat <- rnorm(n = n.schools, mean = 58.8471, sd = 0.025)
School.RE <- rnorm(n = n.schools, mean = 0, sd = 1)
Schools <-
data.frame(School.ID, School.lat, School.long, School.RE) %>%
mutate(School.ID = as.character(School.ID)) %>%
rowwise() %>% mutate (School.distance = distHaversine(
p1 = c(School.long, School.lat),
p2 = c(21.7672, 58.8471), r = 3961
))
return(Schools)
}
Schools <- gen.schools(n.schools = n.Schools)
# Generate Grades
Grades <- c(first.grade:last.grade)
# Generate n.Classrooms
Classrooms <- LETTERS[1:n.Classrooms]
# Group schools and grades
SchGr <- outer(paste0(Schools$School.ID, '-'), paste0(Grades, '-'), FUN="paste")
#head(SchGr)
# Group SchGr and Classrooms
SchGrClss <- outer(SchGr, paste0(Classrooms, '-'), FUN="paste")
#head(SchGrClss)
# These are the combination of School-Grades-Classroom
SchGrClssTmp <- as.matrix(SchGrClss, ncol=1, nrow=length(SchGrClss) )
SchGrClssEnd <- as.data.frame(SchGrClssTmp)
# Assign n.Teachers (2 classroom in a given school-grade)
Allpairs <- as.data.frame(t(combn(SchGrClssTmp, 2)))
AllpairsTmp <- paste(Allpairs$V1, Allpairs$V2, sep=" ")
library(stringr)
xm <- do.call(rbind, str_split(string = AllpairsTmp, pattern = "-"))
separoPairs <- as.data.frame((xm), stringsAsFactors = FALSE)
separoPairs <- separoPairs %>% select(-V7) %>% #Drops empty column
mutate(V1=as.character(V1), V4=as.character(V4), V2=as.numeric(V2), V5=as.numeric(V5)) %>% mutate(V4 = trimws(V4, which = "both"))
#Only the rows with V1=V4 and V2=V5 are valid
validPairs <- separoPairs %>% filter(V1==V4 & V2==V5) %>% select(V1, V2, V3, V6)
# Generate n.Teachers
gen.teachers <- function(n.teachers){
Teacher.ID <- gen.names(n = n.teachers, name.order = "last.first")
Teacher.exp <- runif(n = n.teachers, min = 1, max = 30)
Teacher.Other <- sample(c(0,1), replace = T, prob = c(0.5, 0.5), size = n.teachers)
Teacher.RE <- rnorm(n = n.teachers, mean = 0, sd = 1)
Teachers <- data.frame(Teacher.ID, Teacher.exp, Teacher.Other, Teacher.RE)
return(Teachers)
}
Teachers <- gen.teachers(n.teachers = n.Teachers) %>%
mutate(Teacher.ID = as.character(Teacher.ID))
# Randomly assign n.Teachers teachers to the "ValidPairs"
TmpAssignments <- validPairs[sample(1:nrow(validPairs), n.Teachers), ]
Assignments <- cbind.data.frame(Teachers$Teacher.ID, TmpAssignments)
names(Assignments) <- c("Teacher.ID", "School.ID", "Grade", "Class_1", "Class_2")
# Tidy Data
library(tidyr)
TeacherClassroom <- Assignments %>%
gather(x, Classroom, Class_1,Class_2) %>%
select(-x) %>%
mutate(Teacher.ID = as.character(Teacher.ID))
# Merge
DF_Classrooms <- TeacherClassroom %>% full_join(Teachers, by="Teacher.ID") %>% full_join(Schools, by="School.ID")
rm(list=setdiff(ls(), "DF_Classrooms")) # Clean the work space!
Thanks!

Resources