Randomly assign teachers to classrooms imposing restrictions - r

This question is very similar to a question I asked before. The added complication is that I have N schools with G grades and C classrooms. Additionally, I want to assign each of T teachers to 2 classrooms within a single school and grade.
I can generate some fake data with the following code:
library(randomNames)
set.seed(6232015)
n.schools <-20
gen.names <- function(n, which.names = "both", name.order = "last.first"){
names <- unique(randomNames(n=n, which.names = which.names, name.order = name.order))
need <- n - length(names)
while(need>0){
names <- unique(c(randomNames(n=need, which.names = which.names, name.order = name.order), names))
need <- n - length(names)
}
return(names)
}
#Generates the classrooms data frame
grade <- c(3,4,5)
classroom <- c(LETTERS[1:4])
classroom <- expand.grid(grade=c(3,4,5),
classroom=c(LETTERS[1:4]),
School.ID=paste0(gen.names(n = n.schools, which.names = "last"), ' School'))
#Generates teachers data frame
n.teachers=nrow(classroom)/2
gen.teachers <- function(n.teachers){
Teacher.ID <- gen.names(n = n.teachers, name.order = "last.first")
Teacher.exp <- runif(n = n.teachers, min = 1, max = 30)
Teacher.Other <- sample(c(0,1), replace = T, prob = c(0.5, 0.5), size = n.teachers)
Teacher.RE <- rnorm(n = n.teachers, mean = 0, sd = 1)
Teachers <- data.frame(Teacher.ID, Teacher.exp, Teacher.Other, Teacher.RE)
return(Teachers)
}
Teachers <- gen.teachers(n.teachers = n.teachers)
The data frame that I want to create would have 240 rows with 7 variables. Using sample like in the answer to my previous question will not work (I think) because of the restrictions I want to impose. I thought about using group_by() but I don't think that would do the trick...
Thanks!

This works, but I'm hopping to learn a more elegant solution
library(randomNames)
library(dplyr)
set.seed(6232015)
n.schools <-20
n.grades <- 3
n.classrooms <- 4
total.classrooms <- n.classrooms*n.grades*n.schools
gen.names <- function(n, which.names = "both", name.order = "last.first"){
names <- unique(randomNames(n=n, which.names = which.names, name.order = name.order))
need <- n - length(names)
while(need>0){
names <- unique(c(randomNames(n=need, which.names = which.names, name.order = name.order), names))
need <- n - length(names)
}
return(names)
}
#Generates teachers data frame
n.teachers=total.classrooms/2
gen.teachers <- function(n.teachers){
Teacher.ID <- gen.names(n = n.teachers, name.order = "last.first")
Teacher.exp <- runif(n = n.teachers, min = 1, max = 30)
Teacher.Other <- sample(c(0,1), replace = T, prob = c(0.5, 0.5), size = n.teachers)
Teacher.RE <- rnorm(n = n.teachers, mean = 0, sd = 1)
Teachers <- data.frame(Teacher.ID, Teacher.exp, Teacher.Other, Teacher.RE) %>% mutate(Teacher.ID=as.character(Teacher.ID))
return(Teachers)
}
Teachers <- gen.teachers(n.teachers = n.teachers)
str(Teachers$Teacher.ID)
#Make a ‘schoolGrade’ object and then reshape
schoolGrade <- expand.grid(grade = c(3,4,5),
School.ID = paste0(gen.names(n = n.schools, which.names = "last"),
' School'))
# assign each of T teachers to 2 classrooms within a single school and grade
cuttoff1<-n.teachers/2
schoolGrade$A <- Teachers$Teacher.ID[1:cuttoff1]
schoolGrade$B <- Teachers$Teacher.ID[1:cuttoff1]
schoolGrade$C <- Teachers$Teacher.ID[(cuttoff1+1):n.teachers]
schoolGrade$D <- Teachers$Teacher.ID[(cuttoff1+1):n.teachers]
library(tidyr)
schoolGrade <- gather(schoolGrade, Classroom, Teacher.ID, A:D) %>% full_join(Teachers, by="Teacher.ID")
The main problem is if i want to increase n.classrooms from 4 to 20. In that case instead of having 4 lines going from A to D I would have 20, plus the additional cutoffs. Which is very complicated...

This answer allows me to easily set n.classrooms to whatever value, for example 20.
The problem is that this code is painfully slow. Suggestion to improve it are very welcome!
library(dplyr)
library(randomNames)
library(geosphere)
set.seed(7142015)
# Define Parameters
n.Schools <- 20
first.grade<-3
last.grade<-5
n.Grades <-last.grade-first.grade+1
n.Classrooms <- 20 # THIS IS WHAT I WANTED TO BE ABLE TO CHANGE
n.Teachers <- (n.Schools*n.Grades*n.Classrooms)/2 #Two classrooms per teacher
# Define Random names function:
gen.names <- function(n, which.names = "both", name.order = "last.first"){
names <- unique(randomNames(n=n, which.names = which.names, name.order = name.order))
need <- n - length(names)
while(need>0){
names <- unique(c(randomNames(n=need, which.names = which.names, name.order = name.order), names))
need <- n - length(names)
}
return(names)
}
# Generate n.Schools names
gen.schools <- function(n.schools) {
School.ID <-
paste0(gen.names(n = n.schools, which.names = "last"), ' School')
School.long <- rnorm(n = n.schools, mean = 21.7672, sd = 0.025)
School.lat <- rnorm(n = n.schools, mean = 58.8471, sd = 0.025)
School.RE <- rnorm(n = n.schools, mean = 0, sd = 1)
Schools <-
data.frame(School.ID, School.lat, School.long, School.RE) %>%
mutate(School.ID = as.character(School.ID)) %>%
rowwise() %>% mutate (School.distance = distHaversine(
p1 = c(School.long, School.lat),
p2 = c(21.7672, 58.8471), r = 3961
))
return(Schools)
}
Schools <- gen.schools(n.schools = n.Schools)
# Generate Grades
Grades <- c(first.grade:last.grade)
# Generate n.Classrooms
Classrooms <- LETTERS[1:n.Classrooms]
# Group schools and grades
SchGr <- outer(paste0(Schools$School.ID, '-'), paste0(Grades, '-'), FUN="paste")
#head(SchGr)
# Group SchGr and Classrooms
SchGrClss <- outer(SchGr, paste0(Classrooms, '-'), FUN="paste")
#head(SchGrClss)
# These are the combination of School-Grades-Classroom
SchGrClssTmp <- as.matrix(SchGrClss, ncol=1, nrow=length(SchGrClss) )
SchGrClssEnd <- as.data.frame(SchGrClssTmp)
# Assign n.Teachers (2 classroom in a given school-grade)
Allpairs <- as.data.frame(t(combn(SchGrClssTmp, 2)))
AllpairsTmp <- paste(Allpairs$V1, Allpairs$V2, sep=" ")
library(stringr)
xm <- do.call(rbind, str_split(string = AllpairsTmp, pattern = "-"))
separoPairs <- as.data.frame((xm), stringsAsFactors = FALSE)
separoPairs <- separoPairs %>% select(-V7) %>% #Drops empty column
mutate(V1=as.character(V1), V4=as.character(V4), V2=as.numeric(V2), V5=as.numeric(V5)) %>% mutate(V4 = trimws(V4, which = "both"))
#Only the rows with V1=V4 and V2=V5 are valid
validPairs <- separoPairs %>% filter(V1==V4 & V2==V5) %>% select(V1, V2, V3, V6)
# Generate n.Teachers
gen.teachers <- function(n.teachers){
Teacher.ID <- gen.names(n = n.teachers, name.order = "last.first")
Teacher.exp <- runif(n = n.teachers, min = 1, max = 30)
Teacher.Other <- sample(c(0,1), replace = T, prob = c(0.5, 0.5), size = n.teachers)
Teacher.RE <- rnorm(n = n.teachers, mean = 0, sd = 1)
Teachers <- data.frame(Teacher.ID, Teacher.exp, Teacher.Other, Teacher.RE)
return(Teachers)
}
Teachers <- gen.teachers(n.teachers = n.Teachers) %>%
mutate(Teacher.ID = as.character(Teacher.ID))
# Randomly assign n.Teachers teachers to the "ValidPairs"
TmpAssignments <- validPairs[sample(1:nrow(validPairs), n.Teachers), ]
Assignments <- cbind.data.frame(Teachers$Teacher.ID, TmpAssignments)
names(Assignments) <- c("Teacher.ID", "School.ID", "Grade", "Class_1", "Class_2")
# Tidy Data
library(tidyr)
TeacherClassroom <- Assignments %>%
gather(x, Classroom, Class_1,Class_2) %>%
select(-x) %>%
mutate(Teacher.ID = as.character(Teacher.ID))
# Merge
DF_Classrooms <- TeacherClassroom %>% full_join(Teachers, by="Teacher.ID") %>% full_join(Schools, by="School.ID")
rm(list=setdiff(ls(), "DF_Classrooms")) # Clean the work space!
Thanks!

Related

Incorrect number of probabilities dimsensions when optimizing a multi-season elo model

My data frame looks like this:
and the code to compute the initial/optimized elo
# Elo Rating System
library(eurolig)
library(tidyverse)
library(lubridate)
plldf2 <- read_csv('pll_elor.csv',show_col_types = FALSE)
plldf3 <- plldf2[(plldf2$season == 2019),]
# Helpers -----------------------------------------------------------------
# Expected win probability before a game
getExpectedProb <- function(r_team, r_opp, home_adv, s) {
1 / (1 + 10 ^ ((r_opp - r_team - home_adv) / s))
}
# Get Elo rating for next season
getCarryOver <- function(rating, c) {
c * rating + 1505 * (1 - c)
}
# Get margin of victory multiplier
getMovMultiplier <- function(points_diff, elo_diff) {
((points_diff + 3) ^ 0.8) / (7.5 + 0.006 * elo_diff)
}
getEloSummary <- function(df) {
df %>%
pivot_longer(
cols = ends_with("_new"),
names_to = "type",
values_to = "elo"
) %>%
select(
season,
date,
team,
opp,
type,
elo
) %>%
mutate(
team = ifelse(type == "elo_home_new", team, opp),
order = rank(date),
team_id = paste0(team, "-", season)
) %>%
select(season, date, team, elo, order, team_id)
}
# Algorithm ---------------------------------------------------------------
# For a single season
getSeasonElo <- function(df, k, home_adv, s, initial_elo) {
team_ratings <- initial_elo
# Data frame to store the subsequent values obtained by the algorithm
ratings_df <- df %>%
mutate(
home_adv = NA,
win_points_home = NA,
win_points_away = NA,
expected_prob_home = NA,
expected_prob_away = NA,
mov_home = NA,
mov_away = NA,
elo_home_prev = NA,
elo_away_prev = NA,
elo_home_new = NA,
elo_away_new = NA,
prob_pred = NA
)
for (i in 1:nrow(df)) {
team_home <- df$team[i]
team_away <- df$opp[i]
elo_home <- team_ratings[[team_home]]
elo_away <- team_ratings[[team_away]]
# Home advantage set to 0 for Final 4 games
h <- ifelse(df$phase[i] == "ff", 0, home_adv)
# Assign 1 for wins and 0 for losses
win_points_home <- ifelse(
df$score[i] > df$opp_score[i],
1,
0
)
win_points_away <- ifelse(win_points_home == 0, 1, 0)
# Find pre-game win probabilities
expected_prob_home <- getExpectedProb(
r_team = elo_home,
r_opp = elo_away,
home_adv = h,
s = s)
expected_prob_away <- 1 - expected_prob_home
# Margin of victory multiplier
points_diff_abs <- abs(df$score[i] - df$opp_score[i])
elo_diff_home <- elo_home + h - elo_away
elo_diff_away <- elo_away - elo_home - h
mov_home <- getMovMultiplier(points_diff_abs, elo_diff_home)
mov_away <- getMovMultiplier(points_diff_abs, elo_diff_away)
# Update Elo ratings
elo_home_new <- elo_home + k * (win_points_home - expected_prob_home) * mov_home
elo_away_new <- elo_away + k * (win_points_away - expected_prob_away) * mov_away
team_ratings[[team_home]] <- elo_home_new
team_ratings[[team_away]] <- elo_away_new
prob_pred <- sample(
x = c(team_home, team_away),
size = length(list(expected_prob_home,expected_prob_away)),
prob = c(expected_prob_home, expected_prob_away) #ISSUE HERE
)
ratings_df$home_adv[i] <- h
ratings_df$win_points_home[i] <- win_points_home
ratings_df$win_points_away[i] <- win_points_away
ratings_df$expected_prob_home[i] <- expected_prob_home
ratings_df$expected_prob_away[i] <- expected_prob_away
ratings_df$mov_home[i] <- mov_home
ratings_df$mov_away[i] <- mov_away
ratings_df$elo_home_prev[i] <- elo_home
ratings_df$elo_away_prev[i] <- elo_away
ratings_df$elo_home_new[i] <- elo_home_new
ratings_df$elo_away_new[i] <- elo_away_new
ratings_df$prob_pred[i] <- prob_pred
}
ratings_df <- ratings_df %>%
mutate(
winner = ifelse(score > opp_score, team, opp),
winner_pred = ifelse(elo_home_prev + home_adv >= elo_away_prev,
team, opp),
correct_pred = ifelse(winner == winner_pred, TRUE, FALSE)
)
list(ratings_df = ratings_df, team_elo = team_ratings)
}
# Algorithm for several seasons
getElo <- function(df, k, home_adv, s, carry) {
df <- arrange(df, season)
season_results <- split(df, df$season)
# Start with first season
teams <- sort(unique(season_results[[1]]$team))
# Since it is the first season overall, all teams start with 1300 Elo points
initial_ratings <- as.list(rep(1300, length(teams)))
names(initial_ratings) <- teams
first_season_ratings <- getSeasonElo(
season_results[[1]],
k = k,
home_adv = home_adv,
s = s,
initial_elo = initial_ratings
)
# TODO: Use the last recorded Elo rating, not last season
elo_final <- tibble(
season = unique(season_results[[1]]$season),
team = names(first_season_ratings$team_elo),
elo = unlist(first_season_ratings$team_elo)
)
season_ratings <- vector("list", length(season_results))
season_ratings[[1]] <- first_season_ratings
for (i in 2:length(season_ratings)) {
teams <- sort(unique(season_results[[i]]$team))
teams_new <- teams[!teams %in% elo_final$team]
teams_new_elo <- as.list(rep(1300, length(teams_new)))
names(teams_new_elo) <- teams_new
teams_old <- teams[teams %in% elo_final$team]
teams_old_elo <- vector("list", length(teams_old))
names(teams_old_elo) <- teams_old
for (j in seq_along(teams_old)) {
elo_team <- elo_final %>%
filter(team == teams_old[j])
teams_old_elo[[j]] <- elo_team$elo[which.max(elo_team$season)]
}
teams_old_elo <- lapply(teams_old_elo, getCarryOver, c = carry)
initial_elo <- c(teams_new_elo, teams_old_elo)
season_ratings[[i]] <- getSeasonElo(
season_results[[i]],
k = k,
home_adv = home_adv,
s = s,
initial_elo = initial_elo
)
elo_final_season <- tibble(
season = unique(season_results[[i]]$season),
team = names(season_ratings[[i]]$team_elo),
elo = unlist(season_ratings[[i]]$team_elo)
)
elo_final <- bind_rows(elo_final, elo_final_season)
}
output_df <- map_df(season_ratings, function(x) x$ratings_df)
output_df
}
# Tunning -----------------------------------------------------------------
# Grid optimization
k <- seq(10, 50, by = 5)
h <- seq(0, 150, by = 25)
c <- seq(0.5, 1, by = 0.1)
grid_df <- expand_grid(k, h, c)
checkAccuracy <- function(df, k, h, c) {
df <- getElo(df, k, h, s = 400, c)
sum(df$correct_pred) / nrow(df)
}
n <- nrow(grid_df)
accuracy <- numeric(n)
for (i in 1:n) {
acc <- checkAccuracy(
df = results,
k = grid_df$k[i],
h = grid_df$h[i],
c = grid_df$c[i]
)
accuracy[i] <- acc
}
acc_df <- cbind(grid_df, accuracy) %>%
as_tibble() %>%
arrange(desc(accuracy))
# Ratings -----------------------------------------------------------------
elo_df <- getElo(plldf2, k = 25, home_adv = 100, s = 400, carry = 0.8)
elo_summary <- getEloSummary(elo_df) %>%
left_join(teaminfo, by = c("team" = "team", "season"))
tester <- getSeasonElo(
plldf3,
k = 25,
home_adv = 100,
s = 400,
initial_elo = initial_ratings)
getEloSummary(tester)
elo_summary %>%
ggplot(aes(order, elo, group = team_id)) +
geom_line()
but when I attempt to run the 'getElo' function to optimize the model it says there is a incorrect number of probabilities and I get the error below. However when I subset my data and run it for a single season using the 'getSeasonElo' function it computes it without issue. I assumed the problem was coming from the 'size' parameter being set to 1 by default, which I've fixed to account for the multi-season calculation, but the incorrect number is still happening? Not sure what I missed.
prob_pred <- sample(
x = c(team_home, team_away),
size = length(list(expected_prob_home,expected_prob_away)),
prob = c(expected_prob_home, expected_prob_away) #ISSUE HERE
)

Why does function return NULL?

A beginner in R over here, so apologies for the basic question.
Why does ATE return a null vector instead of saving the values of the difference of the means?
fun.cluster <- function(M, N){
set.seed(02139)
J <- 1:M # vector J_i
df <- as.data.frame(matrix(data=1:N, nrow = N, ncol = 1)) #data frame of all original values
df$cluster <- cut(df$V1, M, labels = 1:M) #breaking the dataframe into clusters
df$cluster <- as.numeric(df$cluster)
Y1 <- as.vector(sample(J, 5)) # assigning treatment
df$treatment <- ifelse(df$cluster %in% Y1, df$treatment <- 1, df$treatment <- 0)
#Inducing intracluster correlation:
mu_0j <- runif(n = 50, min = -1, max = 1)
df$V1[df$treatment==0] <- mu_0j
mu_1j <- runif(n=50, min = -0.5, max = 1.5)
df$V1[df$treatment==0] <- mu_1j
# drawing values
y_0i <- rnorm(n = 50, mean = mu_0j, sd = 1)
y_1i <- rnorm(n = 50, mean = mu_1j, sd = 1)
D_i <- as.vector(c(y_0i, y_1i))
# calculating ATE:
ATE[i] <- mean(y_1i - y_0i)
}
ATE <- c()
for(i in 1:10){
fun.cluster(M = 10, N = 100)
}

IDW parameters in R

I want to perform IDW interpolation using R using the idw command from the gstat package. I have this data:
#settings
library(gstat)
library(dplyr)
library(sp)
library(tidyr)
id_rep <- rep(c(1,2), 20)
f <- rep(c(930,930.2), each=20)
perc <- rep(c(90, 80), each=10)
x <- sample(1:50, 40)
y <- sample(50:100, 40)
E <- runif(40)
df <- data.frame(id_rep, perc, x,y, f, E)
df_split <- split(df, list(df$id_rep, df$perc, df$f), drop = TRUE, sep="_")
#grid
x.range <- range(df$x)
y.range <- range(df$y)
grid <- expand.grid(x = seq(x.range[1], x.range[2], by=1),
y = seq(y.range[1], y.range[2], by=1))
coordinates(grid) <- ~x + y
#interpolation
lst_interp_idw <- lapply(df_split, function(X) {
coordinates(X) <- ~x + y
E_idw <- idw(E~ 1, X, grid, idp=1, nmax=3) %>% as.data.frame()
df_interp <- select(E_idw, x,y,E_pred=var1.pred)
df_interp
})
df_interp_idw <- bind_rows(lst_interp_idw, .id = "interact") %>%
separate(interact, c("id_rep", "perc", "f"), sep = "\\_")
Now I want to perform each run with different idp and nmax parameters within certain values​ (idp from 1 to 3 by 0.5, and nmax 3 to 6 by 1) and get out a data frame with columns for each combination of idp and nmax values. I try with two for loops but it doesn't work.
EDIT
the code that doesn't work is:
idp = seq(from = 1, to = 3, by = 0.5)
nmax = seq(from = 3, to = 6, by = 1)
...
for(i in idp) {
for(j in nmax)
{ E_idw= idw(E ~ 1, X, grid, nmax = i, idp = j)
}
}
...
Here is a way how to store the result of every iteration in a list.
#settings
#install.packages("gstat")
library(gstat)
library(dplyr)
library(sp)
library(tidyr)
id_rep <- rep(c(1,2), 20)
f <- rep(c(930,930.2), each=20)
perc <- rep(c(90, 80), each=10)
x <- sample(1:50, 40)
y <- sample(50:100, 40)
E <- runif(40)
df <- data.frame(id_rep, perc, x,y, f, E)
df_split <- split(df, list(df$id_rep, df$perc, df$f), drop = TRUE, sep="_")
#grid
x.range <- range(df$x)
y.range <- range(df$y)
grid <- expand.grid(x = seq(x.range[1], x.range[2], by=1),
y = seq(y.range[1], y.range[2], by=1))
coordinates(grid) <- ~x + y
# ==============================================
# NEW function
# ==============================================
idp = seq(from = 1, to = 3, by = 0.5)
nmax = seq(from = 3, to = 6, by = 1)
#interpolation
lst_interp_idw <- lapply(df_split, function(X) {
coordinates(X) <- ~x + y
df_interp <- vector(length(idp)*length(nmax), mode = "list" )
k <- 0
for(i in idp) {
for(j in nmax) {
print(paste(i, j))
# Iterator
k <- k + 1
E_idw= idw(E ~ 1, X, grid, nmax = i, idp = j) %>% as.data.frame()
df_interp[[k]] <- select(E_idw, x,y,E_pred=var1.pred)
}
}
return(df_interp)
})
# ==============================================
Some plausibility checks (lapply is applied to 8 list elements and 20 variations are calculated):
length(lst_interp_idw) # 8
length(lst_interp_idw[[1]]) #20
length(lst_interp_idw[[1]]) #20
It should be easy for you to adapt the last line of your code
df_interp_idw <- bind_rows(lst_interp_idw, .id = "interact") %>%
separate(interact, c("id_rep", "perc", "f"), sep = "\\_")
to format the output in the desired format. This highly depends on how you want to present the different interpolation alternatives.

randomize observations by groups (blocks) without replacement

This is a follow up question. The answers in the previous question are doing the random sampling with replacement. How can I change the code so that I assign each observation to on of J "urn" without putting the observation back in the 'lottery'?
This is the code I have right now:
set.seed(9782)
I <- 500
g <- 10
library(dplyr)
anon_id <- function(n = 1, lenght = 12) {
randomString <- c(1:n)
for (i in 1:n)
{
randomString[i] <- paste(sample(c(0:9, letters, LETTERS),
lenght, replace = TRUE),
collapse = "")
}
return(randomString)
}
df <- data.frame(id = anon_id(n = I, lenght = 16),
group = sample(1:g, I, T))
J <- 3
p <- c(0.25, 0.5, 0.25)
randomize <- function(data, urns=2, block_id = NULL, p=NULL, seed=9782) {
if(is.null(p)) p <- rep(1/urns, urns)
if(is.null(block_id)){
df1 <- data %>%
mutate(Treatment = sample(x = c(1:urns),
size = n(),
replace = T,
prob = p))
return(df1)
}else{
df1 <- data %>% group_by_(block_id) %>%
mutate(Treatment = sample(x = c(1:urns),
size = n(),
replace = T,
prob = p))
}
}
df1 <- randomize(data = df, urns = J, block_id = "group", p = p, seed = 9782)
If I change replace = T to replace = F I get the following error:
Error: cannot take a sample larger than the population when 'replace = FALSE'
Clarification of my objective:
Suppose that I have 10 classrooms (or villages, or something like that). To keep it simple, suppose each classroom has 20 students (in reality they will have N_j). Classroom per classroom, I want to assign each student to one of J groups, for example J=3. P says the fraction that will be assigned to each group. For example 25% to group 1 40% to group 2 and 35% to group 3.
This solution is based on #Frank's comment. I created one function that does the randomization for block j and another that calls that function for every block.
randomize_block <- function(data, block=NULL, block_name=NULL, urns, p, seed=9782) {
set.seed(seed)
if(!is.null(block)) {
condition <- paste0(block_name,"==",block)
df <- data %>% filter_(condition)
} else df <- data
if(is.null(p)) p <- rep(1/urns, urns)
N <- nrow(df)
Np <- round(N*p,0)
if(sum(Np)!=N) Np[1] <- N - sum(Np[2:length(Np)])
Urns = rep(seq_along(p), Np)
Urns = sample(Urns)
df$urn <- Urns
return(df)
}
randomize <- function(data, block_name=NULL, urns, p, seed=9782) {
if(is.null(p)) p <- rep(1/urns, urns)
if(!is.null(block_name)){
blocks <- unique(data[,block_name])
df <- lapply(blocks, randomize_block,
data = data,
block_name=block_name,
urns = urns,
p = p,
seed=seed)
return(data.table::rbindlist(df))
}else {
df <- randomize_block(data = data,
urns = urns, p = p,
seed=seed)
}
}
test <- randomize(data = df, block_name = "group",
urns = 3, p = c(0.25, 0.5, 0.25),
seed=4222016)
I'm trying to figure out if it is possible to use dplyr to do this, alternative solutions implementing that are more than welcome!
My answer to your other question is without replacement, as can be seen below:
block_rand <- as.tibble(randomizr::block_ra(blocks = df$group, conditions = c("urn_1","urn_2","urn_3")))
df2 <- as.tibble(bind_cols(df, block_rand))
df2 %>% janitor::tabyl(group, value)
df2 %>%
group_by(id) %>%
filter(n()>1) %>%
str()

randomly assign teachers to a school with dplyr or similar?

Suppose I have a data frame with 8 schools and its characteristics, and another with 48 teachers and its characteristics. I can generate some fake data with the following code:
library(dplyr)
library(geosphere)
set.seed(6232015)
n.schools <-8
n.teachers <- 48
makeRandomString <- function(pre, n=1, length=12) {
randomString <- c(1:n) # initialize vector
for (i in 1:n) {
randomString[i] <- paste0(pre,'.', paste(sample(c(0:9, letters, LETTERS),
length, replace=TRUE),
collapse=""))
}
return(randomString)
}
gen.teachers <- function(n.teachers){
Teacher.ID <- makeRandomString(pre= 'T', n = n.teachers, length = 20)
Teacher.exp <- runif(n = n.teachers, min = 1, max = 30)
Teacher.Other <- sample(c(0,1), replace = T, prob = c(0.5, 0.5), size = n.teachers)
Teachers <- data.frame(Teacher.ID, Teacher.exp, Teacher.Other)
return(Teachers)
}
gen.schools <- function(n.schools){
School.ID <- makeRandomString(pre= 'S', n = n.schools, length = 20)
School.lat <- runif(n = n.schools, min = -2, max = 2)
School.long <- runif(n = n.schools, min = -2, max = 2)
Schools <- data.frame(School.ID, School.lat, School.long) %>%
rowwise() %>% mutate (School.distance = distHaversine(p1 = c(School.long, School.lat),
p2 = c(0, 0), r = 3961))
return(Schools)
}
Teachers <- gen.teachers(n.teachers = n.teachers)
Schools <- gen.schools(n.schools = n.schools)
To each shool, I want to assign 6 teachers (every teacher get 1 and only 1 school). I could use:
Teachers %>% sample_n(6)
To get a list of 6 teachers assign those to a school, remove them from my original pool and keep going with a loop. My guess/hope is that there is a much easier way of doing this.
Thanks for the help!
In the context of your code
sample(rep(Schools$School.ID, each = 6))
gives a random sequence of schools where each school.id appears 6 times. Set Teachers$AssignedSchool to this sample and each teacher has an assigned school

Resources