randomly assign teachers to a school with dplyr or similar? - r

Suppose I have a data frame with 8 schools and its characteristics, and another with 48 teachers and its characteristics. I can generate some fake data with the following code:
library(dplyr)
library(geosphere)
set.seed(6232015)
n.schools <-8
n.teachers <- 48
makeRandomString <- function(pre, n=1, length=12) {
randomString <- c(1:n) # initialize vector
for (i in 1:n) {
randomString[i] <- paste0(pre,'.', paste(sample(c(0:9, letters, LETTERS),
length, replace=TRUE),
collapse=""))
}
return(randomString)
}
gen.teachers <- function(n.teachers){
Teacher.ID <- makeRandomString(pre= 'T', n = n.teachers, length = 20)
Teacher.exp <- runif(n = n.teachers, min = 1, max = 30)
Teacher.Other <- sample(c(0,1), replace = T, prob = c(0.5, 0.5), size = n.teachers)
Teachers <- data.frame(Teacher.ID, Teacher.exp, Teacher.Other)
return(Teachers)
}
gen.schools <- function(n.schools){
School.ID <- makeRandomString(pre= 'S', n = n.schools, length = 20)
School.lat <- runif(n = n.schools, min = -2, max = 2)
School.long <- runif(n = n.schools, min = -2, max = 2)
Schools <- data.frame(School.ID, School.lat, School.long) %>%
rowwise() %>% mutate (School.distance = distHaversine(p1 = c(School.long, School.lat),
p2 = c(0, 0), r = 3961))
return(Schools)
}
Teachers <- gen.teachers(n.teachers = n.teachers)
Schools <- gen.schools(n.schools = n.schools)
To each shool, I want to assign 6 teachers (every teacher get 1 and only 1 school). I could use:
Teachers %>% sample_n(6)
To get a list of 6 teachers assign those to a school, remove them from my original pool and keep going with a loop. My guess/hope is that there is a much easier way of doing this.
Thanks for the help!

In the context of your code
sample(rep(Schools$School.ID, each = 6))
gives a random sequence of schools where each school.id appears 6 times. Set Teachers$AssignedSchool to this sample and each teacher has an assigned school

Related

Using app function from {terra} package on raster stacks? (in parallel)

I have four high resolution rasters for a country. I have split each raster into tiles and done some other processing to them. I now want to apply a function to each cell, of each 'stack' of the raster tiles, to produce one set of output tiles. The function is a little complex. I have tried to synthesise some data below to reproduce my current approach. It works (ish) but I'm convinced that there's a better way to do this. To use parallel processing on my unix box, I simply swap mapply for mcmapply, but I haven't done that in the example below as I presume many will be working on Windows machines. I'd welcome ideas on my approach and particularly optimisation.
library("terra")
library("glue")
## Make some toy data
dir.create("temp_folder")
dir.create("result_folder")
x <- rast(ncols = 10, nrows = 10)
a <- rast(ncol = 100, nrow = 100)
some_values <- as.integer(runif(10000, min = 1, max = 100))
ind <- which(some_values %in% sample(some_values, 15))
some_values[ind] <- NA
values(a) <- some_values
a_tiles <- makeTiles(a, x, glue("temp_folder/tile_a_{1:100}.tif"), overwrite = TRUE)
b <- rast(ncol = 100, nrow = 100)
some_values <- as.integer(runif(10000, min = 1, max = 100))
ind <- which(some_values %in% sample(some_values, 15))
some_values[ind] <- NA
values(b) <- some_values
b_tiles <- makeTiles(b, x, glue("temp_folder/tile_b_{1:100}.tif"), overwrite = TRUE)
c <-rast(ncol = 100, nrow = 100)
some_values <- as.integer(runif(10000, min = 1, max = 100))
ind <- which(some_values %in% sample(some_values, 15))
some_values[ind] <- NA
values(c) <- some_values
c_tiles <- makeTiles(c, x, glue("temp_folder/tile_c_{1:100}.tif"), overwrite = TRUE)
d <- rast(ncol = 100, nrow = 100)
some_values <- as.integer(runif(10000, min = 1, max = 100))
ind <- which(some_values %in% sample(some_values, 15))
some_values[ind] <- NA
values(d) <- some_values
d_tiles <- makeTiles(d, x, glue("temp_folder/tile_d_{1:100}.tif"), overwrite = TRUE)
## Outer function so that this can be used in parallel ? But maybe this is a silly way to do it?
outer_function <- function(a_tiles, b_tiles, c_tiles, d_tiles, output_files) {
one_a_tile <- rast(unlist(a_tiles))
one_b_tile <- rast(unlist(b_tiles))
one_c_tile <- rast(unlist(c_tiles))
one_d_tile <- rast(unlist(d_tiles))
output_file <- output_files
# I replace any NAs with 0 as an NA will break my 'if' statement of the inner_function.
# I get Error in if (z["a"] <= z["b"]) { : missing value where TRUE/FALSE needed
one_a_tile[is.na(one_a_tile)] <- 0
one_b_tile[is.na(one_b_tile)] <- 0
one_c_tile[is.na(one_c_tile)] <- 0
one_d_tile[is.na(one_d_tile)] <- 0
z <- sds(one_a_tile, one_b_tile, one_c_tile, one_d_tile)
## Inner function that actually does the work I want doing
inner_function <- function(z) {
names(z) <- c('a', 'b', 'c', 'd')
if (z['a'] <= z['b']) {
y <- rowSums(cbind((z['c'] + z['a'] * 10),
(z['c'] + z['a'] * 20)))
}
if (z['a'] >= z['b']) {
y <- rowSums(cbind((z['c'] + z['a'] * 40),
(z['c'] + z['a'] * 10)))
}
if (z['a'] == z['b']) {
y <- rowSums(cbind((z['c'] + z['a'] * 60),
(z['c'] + z['a'] * 10)))
}
y <- ifelse(y == 0, NA, y)
return(y)
}
app(z,
inner_function,
filename = output_file,
overwrite = TRUE,
wopt = list(datatype = "INT4U"))
return(output_file)
}
results <- mapply(outer_function,
a_tiles = a_tiles,
b_tiles = b_tiles,
c_tiles = c_tiles,
d_tiles = d_tiles,
output_files = output_files <- glue("result_folder/result_tile_{1:length(d_tiles)}.tif"))
names(results) <- NULL
unlink("temp_folder", recursive = TRUE)
unlink("result_folder", recursive = TRUE)

Why does function return NULL?

A beginner in R over here, so apologies for the basic question.
Why does ATE return a null vector instead of saving the values of the difference of the means?
fun.cluster <- function(M, N){
set.seed(02139)
J <- 1:M # vector J_i
df <- as.data.frame(matrix(data=1:N, nrow = N, ncol = 1)) #data frame of all original values
df$cluster <- cut(df$V1, M, labels = 1:M) #breaking the dataframe into clusters
df$cluster <- as.numeric(df$cluster)
Y1 <- as.vector(sample(J, 5)) # assigning treatment
df$treatment <- ifelse(df$cluster %in% Y1, df$treatment <- 1, df$treatment <- 0)
#Inducing intracluster correlation:
mu_0j <- runif(n = 50, min = -1, max = 1)
df$V1[df$treatment==0] <- mu_0j
mu_1j <- runif(n=50, min = -0.5, max = 1.5)
df$V1[df$treatment==0] <- mu_1j
# drawing values
y_0i <- rnorm(n = 50, mean = mu_0j, sd = 1)
y_1i <- rnorm(n = 50, mean = mu_1j, sd = 1)
D_i <- as.vector(c(y_0i, y_1i))
# calculating ATE:
ATE[i] <- mean(y_1i - y_0i)
}
ATE <- c()
for(i in 1:10){
fun.cluster(M = 10, N = 100)
}

In R & dabestr, how do I get grouped differences correctly?

Using dabestr package I'm trying to get the differences between two sets of control & test data. Moifying slightly example from help file I tried:
library(dabestr)
N <- 70
c1 <- rnorm(N, mean = 50, sd = 20)
t1 <- rnorm(N, mean = 200, sd = 20)
ID <- seq(1:N)
long.data <- tibble::tibble(ID = ID, Control1 = c1, Test1 = t1)
meandiff1 <- long.data %>%
tidyr::gather(key = Group, value = Measurement, Control1:Test1)
ID <- seq(1:N) + N
c2 <- rnorm(N, mean = 100, sd = 70)
t2 <- rnorm(N, mean = 100, sd = 70)
long.data <- tibble::tibble(ID = ID, Control2 = c2, Test2 = t2)
meandiff2 <- long.data %>%
tidyr::gather(key = Group, value = Measurement, Control2:Test2)
meandiff <- dplyr::bind_rows(meandiff1, meandiff2)
paired_mean_diff <-
dabest(meandiff, x = Group, y = Measurement,
idx = c("Control1", "Test1", "Control2", "Test2"),
paired = TRUE,
id.col = ID)
plot(paired_mean_diff)
I get these results:
So not only is everything compared to Control1 but also the paired = TRUE option seems to have no effect. I was hoping to get something similar to examples from the package page:
Any pointers on how to achieve that?
For a paired plot, you want to nest the idx keyword option as such:
paired_mean_diff <-
dabest(meandiff, x = Group, y = Measurement,
idx = list(c("Control1", "Test1"),
c("Control2", "Test2")),
paired = TRUE,
id.col = ID)

randomize observations by groups (blocks) without replacement

This is a follow up question. The answers in the previous question are doing the random sampling with replacement. How can I change the code so that I assign each observation to on of J "urn" without putting the observation back in the 'lottery'?
This is the code I have right now:
set.seed(9782)
I <- 500
g <- 10
library(dplyr)
anon_id <- function(n = 1, lenght = 12) {
randomString <- c(1:n)
for (i in 1:n)
{
randomString[i] <- paste(sample(c(0:9, letters, LETTERS),
lenght, replace = TRUE),
collapse = "")
}
return(randomString)
}
df <- data.frame(id = anon_id(n = I, lenght = 16),
group = sample(1:g, I, T))
J <- 3
p <- c(0.25, 0.5, 0.25)
randomize <- function(data, urns=2, block_id = NULL, p=NULL, seed=9782) {
if(is.null(p)) p <- rep(1/urns, urns)
if(is.null(block_id)){
df1 <- data %>%
mutate(Treatment = sample(x = c(1:urns),
size = n(),
replace = T,
prob = p))
return(df1)
}else{
df1 <- data %>% group_by_(block_id) %>%
mutate(Treatment = sample(x = c(1:urns),
size = n(),
replace = T,
prob = p))
}
}
df1 <- randomize(data = df, urns = J, block_id = "group", p = p, seed = 9782)
If I change replace = T to replace = F I get the following error:
Error: cannot take a sample larger than the population when 'replace = FALSE'
Clarification of my objective:
Suppose that I have 10 classrooms (or villages, or something like that). To keep it simple, suppose each classroom has 20 students (in reality they will have N_j). Classroom per classroom, I want to assign each student to one of J groups, for example J=3. P says the fraction that will be assigned to each group. For example 25% to group 1 40% to group 2 and 35% to group 3.
This solution is based on #Frank's comment. I created one function that does the randomization for block j and another that calls that function for every block.
randomize_block <- function(data, block=NULL, block_name=NULL, urns, p, seed=9782) {
set.seed(seed)
if(!is.null(block)) {
condition <- paste0(block_name,"==",block)
df <- data %>% filter_(condition)
} else df <- data
if(is.null(p)) p <- rep(1/urns, urns)
N <- nrow(df)
Np <- round(N*p,0)
if(sum(Np)!=N) Np[1] <- N - sum(Np[2:length(Np)])
Urns = rep(seq_along(p), Np)
Urns = sample(Urns)
df$urn <- Urns
return(df)
}
randomize <- function(data, block_name=NULL, urns, p, seed=9782) {
if(is.null(p)) p <- rep(1/urns, urns)
if(!is.null(block_name)){
blocks <- unique(data[,block_name])
df <- lapply(blocks, randomize_block,
data = data,
block_name=block_name,
urns = urns,
p = p,
seed=seed)
return(data.table::rbindlist(df))
}else {
df <- randomize_block(data = data,
urns = urns, p = p,
seed=seed)
}
}
test <- randomize(data = df, block_name = "group",
urns = 3, p = c(0.25, 0.5, 0.25),
seed=4222016)
I'm trying to figure out if it is possible to use dplyr to do this, alternative solutions implementing that are more than welcome!
My answer to your other question is without replacement, as can be seen below:
block_rand <- as.tibble(randomizr::block_ra(blocks = df$group, conditions = c("urn_1","urn_2","urn_3")))
df2 <- as.tibble(bind_cols(df, block_rand))
df2 %>% janitor::tabyl(group, value)
df2 %>%
group_by(id) %>%
filter(n()>1) %>%
str()

Randomly assign teachers to classrooms imposing restrictions

This question is very similar to a question I asked before. The added complication is that I have N schools with G grades and C classrooms. Additionally, I want to assign each of T teachers to 2 classrooms within a single school and grade.
I can generate some fake data with the following code:
library(randomNames)
set.seed(6232015)
n.schools <-20
gen.names <- function(n, which.names = "both", name.order = "last.first"){
names <- unique(randomNames(n=n, which.names = which.names, name.order = name.order))
need <- n - length(names)
while(need>0){
names <- unique(c(randomNames(n=need, which.names = which.names, name.order = name.order), names))
need <- n - length(names)
}
return(names)
}
#Generates the classrooms data frame
grade <- c(3,4,5)
classroom <- c(LETTERS[1:4])
classroom <- expand.grid(grade=c(3,4,5),
classroom=c(LETTERS[1:4]),
School.ID=paste0(gen.names(n = n.schools, which.names = "last"), ' School'))
#Generates teachers data frame
n.teachers=nrow(classroom)/2
gen.teachers <- function(n.teachers){
Teacher.ID <- gen.names(n = n.teachers, name.order = "last.first")
Teacher.exp <- runif(n = n.teachers, min = 1, max = 30)
Teacher.Other <- sample(c(0,1), replace = T, prob = c(0.5, 0.5), size = n.teachers)
Teacher.RE <- rnorm(n = n.teachers, mean = 0, sd = 1)
Teachers <- data.frame(Teacher.ID, Teacher.exp, Teacher.Other, Teacher.RE)
return(Teachers)
}
Teachers <- gen.teachers(n.teachers = n.teachers)
The data frame that I want to create would have 240 rows with 7 variables. Using sample like in the answer to my previous question will not work (I think) because of the restrictions I want to impose. I thought about using group_by() but I don't think that would do the trick...
Thanks!
This works, but I'm hopping to learn a more elegant solution
library(randomNames)
library(dplyr)
set.seed(6232015)
n.schools <-20
n.grades <- 3
n.classrooms <- 4
total.classrooms <- n.classrooms*n.grades*n.schools
gen.names <- function(n, which.names = "both", name.order = "last.first"){
names <- unique(randomNames(n=n, which.names = which.names, name.order = name.order))
need <- n - length(names)
while(need>0){
names <- unique(c(randomNames(n=need, which.names = which.names, name.order = name.order), names))
need <- n - length(names)
}
return(names)
}
#Generates teachers data frame
n.teachers=total.classrooms/2
gen.teachers <- function(n.teachers){
Teacher.ID <- gen.names(n = n.teachers, name.order = "last.first")
Teacher.exp <- runif(n = n.teachers, min = 1, max = 30)
Teacher.Other <- sample(c(0,1), replace = T, prob = c(0.5, 0.5), size = n.teachers)
Teacher.RE <- rnorm(n = n.teachers, mean = 0, sd = 1)
Teachers <- data.frame(Teacher.ID, Teacher.exp, Teacher.Other, Teacher.RE) %>% mutate(Teacher.ID=as.character(Teacher.ID))
return(Teachers)
}
Teachers <- gen.teachers(n.teachers = n.teachers)
str(Teachers$Teacher.ID)
#Make a ‘schoolGrade’ object and then reshape
schoolGrade <- expand.grid(grade = c(3,4,5),
School.ID = paste0(gen.names(n = n.schools, which.names = "last"),
' School'))
# assign each of T teachers to 2 classrooms within a single school and grade
cuttoff1<-n.teachers/2
schoolGrade$A <- Teachers$Teacher.ID[1:cuttoff1]
schoolGrade$B <- Teachers$Teacher.ID[1:cuttoff1]
schoolGrade$C <- Teachers$Teacher.ID[(cuttoff1+1):n.teachers]
schoolGrade$D <- Teachers$Teacher.ID[(cuttoff1+1):n.teachers]
library(tidyr)
schoolGrade <- gather(schoolGrade, Classroom, Teacher.ID, A:D) %>% full_join(Teachers, by="Teacher.ID")
The main problem is if i want to increase n.classrooms from 4 to 20. In that case instead of having 4 lines going from A to D I would have 20, plus the additional cutoffs. Which is very complicated...
This answer allows me to easily set n.classrooms to whatever value, for example 20.
The problem is that this code is painfully slow. Suggestion to improve it are very welcome!
library(dplyr)
library(randomNames)
library(geosphere)
set.seed(7142015)
# Define Parameters
n.Schools <- 20
first.grade<-3
last.grade<-5
n.Grades <-last.grade-first.grade+1
n.Classrooms <- 20 # THIS IS WHAT I WANTED TO BE ABLE TO CHANGE
n.Teachers <- (n.Schools*n.Grades*n.Classrooms)/2 #Two classrooms per teacher
# Define Random names function:
gen.names <- function(n, which.names = "both", name.order = "last.first"){
names <- unique(randomNames(n=n, which.names = which.names, name.order = name.order))
need <- n - length(names)
while(need>0){
names <- unique(c(randomNames(n=need, which.names = which.names, name.order = name.order), names))
need <- n - length(names)
}
return(names)
}
# Generate n.Schools names
gen.schools <- function(n.schools) {
School.ID <-
paste0(gen.names(n = n.schools, which.names = "last"), ' School')
School.long <- rnorm(n = n.schools, mean = 21.7672, sd = 0.025)
School.lat <- rnorm(n = n.schools, mean = 58.8471, sd = 0.025)
School.RE <- rnorm(n = n.schools, mean = 0, sd = 1)
Schools <-
data.frame(School.ID, School.lat, School.long, School.RE) %>%
mutate(School.ID = as.character(School.ID)) %>%
rowwise() %>% mutate (School.distance = distHaversine(
p1 = c(School.long, School.lat),
p2 = c(21.7672, 58.8471), r = 3961
))
return(Schools)
}
Schools <- gen.schools(n.schools = n.Schools)
# Generate Grades
Grades <- c(first.grade:last.grade)
# Generate n.Classrooms
Classrooms <- LETTERS[1:n.Classrooms]
# Group schools and grades
SchGr <- outer(paste0(Schools$School.ID, '-'), paste0(Grades, '-'), FUN="paste")
#head(SchGr)
# Group SchGr and Classrooms
SchGrClss <- outer(SchGr, paste0(Classrooms, '-'), FUN="paste")
#head(SchGrClss)
# These are the combination of School-Grades-Classroom
SchGrClssTmp <- as.matrix(SchGrClss, ncol=1, nrow=length(SchGrClss) )
SchGrClssEnd <- as.data.frame(SchGrClssTmp)
# Assign n.Teachers (2 classroom in a given school-grade)
Allpairs <- as.data.frame(t(combn(SchGrClssTmp, 2)))
AllpairsTmp <- paste(Allpairs$V1, Allpairs$V2, sep=" ")
library(stringr)
xm <- do.call(rbind, str_split(string = AllpairsTmp, pattern = "-"))
separoPairs <- as.data.frame((xm), stringsAsFactors = FALSE)
separoPairs <- separoPairs %>% select(-V7) %>% #Drops empty column
mutate(V1=as.character(V1), V4=as.character(V4), V2=as.numeric(V2), V5=as.numeric(V5)) %>% mutate(V4 = trimws(V4, which = "both"))
#Only the rows with V1=V4 and V2=V5 are valid
validPairs <- separoPairs %>% filter(V1==V4 & V2==V5) %>% select(V1, V2, V3, V6)
# Generate n.Teachers
gen.teachers <- function(n.teachers){
Teacher.ID <- gen.names(n = n.teachers, name.order = "last.first")
Teacher.exp <- runif(n = n.teachers, min = 1, max = 30)
Teacher.Other <- sample(c(0,1), replace = T, prob = c(0.5, 0.5), size = n.teachers)
Teacher.RE <- rnorm(n = n.teachers, mean = 0, sd = 1)
Teachers <- data.frame(Teacher.ID, Teacher.exp, Teacher.Other, Teacher.RE)
return(Teachers)
}
Teachers <- gen.teachers(n.teachers = n.Teachers) %>%
mutate(Teacher.ID = as.character(Teacher.ID))
# Randomly assign n.Teachers teachers to the "ValidPairs"
TmpAssignments <- validPairs[sample(1:nrow(validPairs), n.Teachers), ]
Assignments <- cbind.data.frame(Teachers$Teacher.ID, TmpAssignments)
names(Assignments) <- c("Teacher.ID", "School.ID", "Grade", "Class_1", "Class_2")
# Tidy Data
library(tidyr)
TeacherClassroom <- Assignments %>%
gather(x, Classroom, Class_1,Class_2) %>%
select(-x) %>%
mutate(Teacher.ID = as.character(Teacher.ID))
# Merge
DF_Classrooms <- TeacherClassroom %>% full_join(Teachers, by="Teacher.ID") %>% full_join(Schools, by="School.ID")
rm(list=setdiff(ls(), "DF_Classrooms")) # Clean the work space!
Thanks!

Resources