randomize observations by groups (blocks) without replacement - r

This is a follow up question. The answers in the previous question are doing the random sampling with replacement. How can I change the code so that I assign each observation to on of J "urn" without putting the observation back in the 'lottery'?
This is the code I have right now:
set.seed(9782)
I <- 500
g <- 10
library(dplyr)
anon_id <- function(n = 1, lenght = 12) {
randomString <- c(1:n)
for (i in 1:n)
{
randomString[i] <- paste(sample(c(0:9, letters, LETTERS),
lenght, replace = TRUE),
collapse = "")
}
return(randomString)
}
df <- data.frame(id = anon_id(n = I, lenght = 16),
group = sample(1:g, I, T))
J <- 3
p <- c(0.25, 0.5, 0.25)
randomize <- function(data, urns=2, block_id = NULL, p=NULL, seed=9782) {
if(is.null(p)) p <- rep(1/urns, urns)
if(is.null(block_id)){
df1 <- data %>%
mutate(Treatment = sample(x = c(1:urns),
size = n(),
replace = T,
prob = p))
return(df1)
}else{
df1 <- data %>% group_by_(block_id) %>%
mutate(Treatment = sample(x = c(1:urns),
size = n(),
replace = T,
prob = p))
}
}
df1 <- randomize(data = df, urns = J, block_id = "group", p = p, seed = 9782)
If I change replace = T to replace = F I get the following error:
Error: cannot take a sample larger than the population when 'replace = FALSE'
Clarification of my objective:
Suppose that I have 10 classrooms (or villages, or something like that). To keep it simple, suppose each classroom has 20 students (in reality they will have N_j). Classroom per classroom, I want to assign each student to one of J groups, for example J=3. P says the fraction that will be assigned to each group. For example 25% to group 1 40% to group 2 and 35% to group 3.

This solution is based on #Frank's comment. I created one function that does the randomization for block j and another that calls that function for every block.
randomize_block <- function(data, block=NULL, block_name=NULL, urns, p, seed=9782) {
set.seed(seed)
if(!is.null(block)) {
condition <- paste0(block_name,"==",block)
df <- data %>% filter_(condition)
} else df <- data
if(is.null(p)) p <- rep(1/urns, urns)
N <- nrow(df)
Np <- round(N*p,0)
if(sum(Np)!=N) Np[1] <- N - sum(Np[2:length(Np)])
Urns = rep(seq_along(p), Np)
Urns = sample(Urns)
df$urn <- Urns
return(df)
}
randomize <- function(data, block_name=NULL, urns, p, seed=9782) {
if(is.null(p)) p <- rep(1/urns, urns)
if(!is.null(block_name)){
blocks <- unique(data[,block_name])
df <- lapply(blocks, randomize_block,
data = data,
block_name=block_name,
urns = urns,
p = p,
seed=seed)
return(data.table::rbindlist(df))
}else {
df <- randomize_block(data = data,
urns = urns, p = p,
seed=seed)
}
}
test <- randomize(data = df, block_name = "group",
urns = 3, p = c(0.25, 0.5, 0.25),
seed=4222016)
I'm trying to figure out if it is possible to use dplyr to do this, alternative solutions implementing that are more than welcome!

My answer to your other question is without replacement, as can be seen below:
block_rand <- as.tibble(randomizr::block_ra(blocks = df$group, conditions = c("urn_1","urn_2","urn_3")))
df2 <- as.tibble(bind_cols(df, block_rand))
df2 %>% janitor::tabyl(group, value)
df2 %>%
group_by(id) %>%
filter(n()>1) %>%
str()

Related

Incorrect number of probabilities dimsensions when optimizing a multi-season elo model

My data frame looks like this:
and the code to compute the initial/optimized elo
# Elo Rating System
library(eurolig)
library(tidyverse)
library(lubridate)
plldf2 <- read_csv('pll_elor.csv',show_col_types = FALSE)
plldf3 <- plldf2[(plldf2$season == 2019),]
# Helpers -----------------------------------------------------------------
# Expected win probability before a game
getExpectedProb <- function(r_team, r_opp, home_adv, s) {
1 / (1 + 10 ^ ((r_opp - r_team - home_adv) / s))
}
# Get Elo rating for next season
getCarryOver <- function(rating, c) {
c * rating + 1505 * (1 - c)
}
# Get margin of victory multiplier
getMovMultiplier <- function(points_diff, elo_diff) {
((points_diff + 3) ^ 0.8) / (7.5 + 0.006 * elo_diff)
}
getEloSummary <- function(df) {
df %>%
pivot_longer(
cols = ends_with("_new"),
names_to = "type",
values_to = "elo"
) %>%
select(
season,
date,
team,
opp,
type,
elo
) %>%
mutate(
team = ifelse(type == "elo_home_new", team, opp),
order = rank(date),
team_id = paste0(team, "-", season)
) %>%
select(season, date, team, elo, order, team_id)
}
# Algorithm ---------------------------------------------------------------
# For a single season
getSeasonElo <- function(df, k, home_adv, s, initial_elo) {
team_ratings <- initial_elo
# Data frame to store the subsequent values obtained by the algorithm
ratings_df <- df %>%
mutate(
home_adv = NA,
win_points_home = NA,
win_points_away = NA,
expected_prob_home = NA,
expected_prob_away = NA,
mov_home = NA,
mov_away = NA,
elo_home_prev = NA,
elo_away_prev = NA,
elo_home_new = NA,
elo_away_new = NA,
prob_pred = NA
)
for (i in 1:nrow(df)) {
team_home <- df$team[i]
team_away <- df$opp[i]
elo_home <- team_ratings[[team_home]]
elo_away <- team_ratings[[team_away]]
# Home advantage set to 0 for Final 4 games
h <- ifelse(df$phase[i] == "ff", 0, home_adv)
# Assign 1 for wins and 0 for losses
win_points_home <- ifelse(
df$score[i] > df$opp_score[i],
1,
0
)
win_points_away <- ifelse(win_points_home == 0, 1, 0)
# Find pre-game win probabilities
expected_prob_home <- getExpectedProb(
r_team = elo_home,
r_opp = elo_away,
home_adv = h,
s = s)
expected_prob_away <- 1 - expected_prob_home
# Margin of victory multiplier
points_diff_abs <- abs(df$score[i] - df$opp_score[i])
elo_diff_home <- elo_home + h - elo_away
elo_diff_away <- elo_away - elo_home - h
mov_home <- getMovMultiplier(points_diff_abs, elo_diff_home)
mov_away <- getMovMultiplier(points_diff_abs, elo_diff_away)
# Update Elo ratings
elo_home_new <- elo_home + k * (win_points_home - expected_prob_home) * mov_home
elo_away_new <- elo_away + k * (win_points_away - expected_prob_away) * mov_away
team_ratings[[team_home]] <- elo_home_new
team_ratings[[team_away]] <- elo_away_new
prob_pred <- sample(
x = c(team_home, team_away),
size = length(list(expected_prob_home,expected_prob_away)),
prob = c(expected_prob_home, expected_prob_away) #ISSUE HERE
)
ratings_df$home_adv[i] <- h
ratings_df$win_points_home[i] <- win_points_home
ratings_df$win_points_away[i] <- win_points_away
ratings_df$expected_prob_home[i] <- expected_prob_home
ratings_df$expected_prob_away[i] <- expected_prob_away
ratings_df$mov_home[i] <- mov_home
ratings_df$mov_away[i] <- mov_away
ratings_df$elo_home_prev[i] <- elo_home
ratings_df$elo_away_prev[i] <- elo_away
ratings_df$elo_home_new[i] <- elo_home_new
ratings_df$elo_away_new[i] <- elo_away_new
ratings_df$prob_pred[i] <- prob_pred
}
ratings_df <- ratings_df %>%
mutate(
winner = ifelse(score > opp_score, team, opp),
winner_pred = ifelse(elo_home_prev + home_adv >= elo_away_prev,
team, opp),
correct_pred = ifelse(winner == winner_pred, TRUE, FALSE)
)
list(ratings_df = ratings_df, team_elo = team_ratings)
}
# Algorithm for several seasons
getElo <- function(df, k, home_adv, s, carry) {
df <- arrange(df, season)
season_results <- split(df, df$season)
# Start with first season
teams <- sort(unique(season_results[[1]]$team))
# Since it is the first season overall, all teams start with 1300 Elo points
initial_ratings <- as.list(rep(1300, length(teams)))
names(initial_ratings) <- teams
first_season_ratings <- getSeasonElo(
season_results[[1]],
k = k,
home_adv = home_adv,
s = s,
initial_elo = initial_ratings
)
# TODO: Use the last recorded Elo rating, not last season
elo_final <- tibble(
season = unique(season_results[[1]]$season),
team = names(first_season_ratings$team_elo),
elo = unlist(first_season_ratings$team_elo)
)
season_ratings <- vector("list", length(season_results))
season_ratings[[1]] <- first_season_ratings
for (i in 2:length(season_ratings)) {
teams <- sort(unique(season_results[[i]]$team))
teams_new <- teams[!teams %in% elo_final$team]
teams_new_elo <- as.list(rep(1300, length(teams_new)))
names(teams_new_elo) <- teams_new
teams_old <- teams[teams %in% elo_final$team]
teams_old_elo <- vector("list", length(teams_old))
names(teams_old_elo) <- teams_old
for (j in seq_along(teams_old)) {
elo_team <- elo_final %>%
filter(team == teams_old[j])
teams_old_elo[[j]] <- elo_team$elo[which.max(elo_team$season)]
}
teams_old_elo <- lapply(teams_old_elo, getCarryOver, c = carry)
initial_elo <- c(teams_new_elo, teams_old_elo)
season_ratings[[i]] <- getSeasonElo(
season_results[[i]],
k = k,
home_adv = home_adv,
s = s,
initial_elo = initial_elo
)
elo_final_season <- tibble(
season = unique(season_results[[i]]$season),
team = names(season_ratings[[i]]$team_elo),
elo = unlist(season_ratings[[i]]$team_elo)
)
elo_final <- bind_rows(elo_final, elo_final_season)
}
output_df <- map_df(season_ratings, function(x) x$ratings_df)
output_df
}
# Tunning -----------------------------------------------------------------
# Grid optimization
k <- seq(10, 50, by = 5)
h <- seq(0, 150, by = 25)
c <- seq(0.5, 1, by = 0.1)
grid_df <- expand_grid(k, h, c)
checkAccuracy <- function(df, k, h, c) {
df <- getElo(df, k, h, s = 400, c)
sum(df$correct_pred) / nrow(df)
}
n <- nrow(grid_df)
accuracy <- numeric(n)
for (i in 1:n) {
acc <- checkAccuracy(
df = results,
k = grid_df$k[i],
h = grid_df$h[i],
c = grid_df$c[i]
)
accuracy[i] <- acc
}
acc_df <- cbind(grid_df, accuracy) %>%
as_tibble() %>%
arrange(desc(accuracy))
# Ratings -----------------------------------------------------------------
elo_df <- getElo(plldf2, k = 25, home_adv = 100, s = 400, carry = 0.8)
elo_summary <- getEloSummary(elo_df) %>%
left_join(teaminfo, by = c("team" = "team", "season"))
tester <- getSeasonElo(
plldf3,
k = 25,
home_adv = 100,
s = 400,
initial_elo = initial_ratings)
getEloSummary(tester)
elo_summary %>%
ggplot(aes(order, elo, group = team_id)) +
geom_line()
but when I attempt to run the 'getElo' function to optimize the model it says there is a incorrect number of probabilities and I get the error below. However when I subset my data and run it for a single season using the 'getSeasonElo' function it computes it without issue. I assumed the problem was coming from the 'size' parameter being set to 1 by default, which I've fixed to account for the multi-season calculation, but the incorrect number is still happening? Not sure what I missed.
prob_pred <- sample(
x = c(team_home, team_away),
size = length(list(expected_prob_home,expected_prob_away)),
prob = c(expected_prob_home, expected_prob_away) #ISSUE HERE
)

How to map the mean over bootstrapped samples in a tidy data frame

I'm trying to map a function that will calculate p_hat from the bootstrap samples I mapped previously in my data frame. I am having difficulty with my function or my use of map, which I can say is a work in progress.
library(tidyverse)
library(rsample)
ttSample <- data.frame(grad = c(0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1))
bootstrapper <- function(n) {bootstraps(data = ttSample, times = n)}
meanTaker <- function(columnVal)
{
for(i in 1:length(columnVal))
{
meanHolder <- vector("double", nrow(columnVal[[i]]))
for(j in 1:nrow(columnVal[[i]]))
{
meanHolder[[j]] <- mean(as.data.frame(columnVal$splits[[j]])$grad)
}
mean(meanHolder)
}
}
bootFrame <- data.frame(n = rep(c(250, 1000, 5000, 10000), 3),
confLev = rep(c(0.9, 0.95, 0.99))) %>%
arrange(n, confLev) %>%
mutate(alpha = 1 - confLev,
upperCI = confLev + (alpha / 2),
lowerCI = confLev - (alpha / 2),
samples = map(ttSample, list),
boots = map(.x = .$n, .f = bootstrapper)) %>%
mutate(p_hat = map(.x = .$boots, .f = meanTaker))
You need to remember that map applies the function to each element of the input, not the entire input. Having that in mind, we can rewrite meanTaker as
meanTaker <- function(boot) {
grads <- sapply(boot$splits, function (split) split$data$grad)
mean(grads)
}

rgenoud - How to pass parameters to the function?

I have a function that currently plays nice with rgenoud. It has one parameter (xx) and rgenoud will optimize xx perfectly.
However, I would like to add a second parameter to my function that wouldnt be optimized by rgendoud . For example, I would like my function to either fit a model with a gaussian link or a poisson link and to specify that when I call rgenoud.
Any idea?
thanks
edit: here is a minimal working example of what I mean. How would you get the last line to work?
adstock reflect the fact that TV advertising should have an impact on the number of quotes of future weeks.
Adstock[t] = Ads[t] + rate* Ads[t-1] + rate^2*Ads[t-2] + .... + rate^max_memory * Ads[t-max_memory]
We want rgenoud to figure out what rate and max_memory will return the model with the best fit. Best fit is defined as the lowest RMSE.
set.seed(107)
library(fpp)
library(rgenoud)
adstock_k <- function(x, adstock_rate = 0, max_memory = 12){
learn_rates <- rep(adstock_rate, max_memory+1) ^ c(0:max_memory)
adstocked_advertising <- stats::filter(c(rep(0, max_memory), x), learn_rates, method="convolution")
adstocked_advertising <- adstocked_advertising[!is.na(adstocked_advertising)]
return(as.numeric(adstocked_advertising))
}
getRMSE <- function(x, y) {
mean((x-y)^2) %>% sqrt
}
df <- data.frame(insurance) %>%
mutate(Quotes = round (Quotes*1000, digits = 0 ))
df$idu <- as.numeric(rownames(df))
my_f <- function(xx){
adstock_rate <- xx[1]
adstock_memory <- xx[2]
df.temp <- df %>%
mutate(adstock = adstock_k(TV.advert, adstock_rate/100, adstock_memory ))
mod <- lm(data=df.temp, Quotes ~ adstock )
getRMSE( df.temp$Quotes, predict(mod))
}
domaine <- cbind(c(30,1), c(85, 8))
#this works
min_f <- genoud(my_f, nvars = 2, max = F, pop.size=1000, wait.generations=10, Domains = domaine, data.type.int = T)
#here I try to add a second parameter to the function.
my_f2 <- function(xx,first_n_weeks=20){
adstock_rate <- xx[1]
adstock_memory <- xx[2]
df.temp <- df %>%
filter(idu<= first_n_weeks) %>%
mutate(adstock = adstock_k(TV.advert, adstock_rate/100, adstock_memory ))
mod <- lm(data=df.temp, Quotes ~ adstock )
getRMSE( df.temp$Quotes, predict(mod))
}
#this doesnt work
min_f2 <- genoud(my_f2(first_n_week=10), nvars = 2, max = F, pop.size=1000, wait.generations=10, Domains = domaine, data.type.int = T)
Include the argument in the call to genoud, e.g.
genoud(my_f2, nvars = 2, max = F, pop.size=1000, wait.generations=10, Domains = domaine, data.type.int = T, first_n_weeks = 10)

Randomly assign teachers to classrooms imposing restrictions

This question is very similar to a question I asked before. The added complication is that I have N schools with G grades and C classrooms. Additionally, I want to assign each of T teachers to 2 classrooms within a single school and grade.
I can generate some fake data with the following code:
library(randomNames)
set.seed(6232015)
n.schools <-20
gen.names <- function(n, which.names = "both", name.order = "last.first"){
names <- unique(randomNames(n=n, which.names = which.names, name.order = name.order))
need <- n - length(names)
while(need>0){
names <- unique(c(randomNames(n=need, which.names = which.names, name.order = name.order), names))
need <- n - length(names)
}
return(names)
}
#Generates the classrooms data frame
grade <- c(3,4,5)
classroom <- c(LETTERS[1:4])
classroom <- expand.grid(grade=c(3,4,5),
classroom=c(LETTERS[1:4]),
School.ID=paste0(gen.names(n = n.schools, which.names = "last"), ' School'))
#Generates teachers data frame
n.teachers=nrow(classroom)/2
gen.teachers <- function(n.teachers){
Teacher.ID <- gen.names(n = n.teachers, name.order = "last.first")
Teacher.exp <- runif(n = n.teachers, min = 1, max = 30)
Teacher.Other <- sample(c(0,1), replace = T, prob = c(0.5, 0.5), size = n.teachers)
Teacher.RE <- rnorm(n = n.teachers, mean = 0, sd = 1)
Teachers <- data.frame(Teacher.ID, Teacher.exp, Teacher.Other, Teacher.RE)
return(Teachers)
}
Teachers <- gen.teachers(n.teachers = n.teachers)
The data frame that I want to create would have 240 rows with 7 variables. Using sample like in the answer to my previous question will not work (I think) because of the restrictions I want to impose. I thought about using group_by() but I don't think that would do the trick...
Thanks!
This works, but I'm hopping to learn a more elegant solution
library(randomNames)
library(dplyr)
set.seed(6232015)
n.schools <-20
n.grades <- 3
n.classrooms <- 4
total.classrooms <- n.classrooms*n.grades*n.schools
gen.names <- function(n, which.names = "both", name.order = "last.first"){
names <- unique(randomNames(n=n, which.names = which.names, name.order = name.order))
need <- n - length(names)
while(need>0){
names <- unique(c(randomNames(n=need, which.names = which.names, name.order = name.order), names))
need <- n - length(names)
}
return(names)
}
#Generates teachers data frame
n.teachers=total.classrooms/2
gen.teachers <- function(n.teachers){
Teacher.ID <- gen.names(n = n.teachers, name.order = "last.first")
Teacher.exp <- runif(n = n.teachers, min = 1, max = 30)
Teacher.Other <- sample(c(0,1), replace = T, prob = c(0.5, 0.5), size = n.teachers)
Teacher.RE <- rnorm(n = n.teachers, mean = 0, sd = 1)
Teachers <- data.frame(Teacher.ID, Teacher.exp, Teacher.Other, Teacher.RE) %>% mutate(Teacher.ID=as.character(Teacher.ID))
return(Teachers)
}
Teachers <- gen.teachers(n.teachers = n.teachers)
str(Teachers$Teacher.ID)
#Make a ‘schoolGrade’ object and then reshape
schoolGrade <- expand.grid(grade = c(3,4,5),
School.ID = paste0(gen.names(n = n.schools, which.names = "last"),
' School'))
# assign each of T teachers to 2 classrooms within a single school and grade
cuttoff1<-n.teachers/2
schoolGrade$A <- Teachers$Teacher.ID[1:cuttoff1]
schoolGrade$B <- Teachers$Teacher.ID[1:cuttoff1]
schoolGrade$C <- Teachers$Teacher.ID[(cuttoff1+1):n.teachers]
schoolGrade$D <- Teachers$Teacher.ID[(cuttoff1+1):n.teachers]
library(tidyr)
schoolGrade <- gather(schoolGrade, Classroom, Teacher.ID, A:D) %>% full_join(Teachers, by="Teacher.ID")
The main problem is if i want to increase n.classrooms from 4 to 20. In that case instead of having 4 lines going from A to D I would have 20, plus the additional cutoffs. Which is very complicated...
This answer allows me to easily set n.classrooms to whatever value, for example 20.
The problem is that this code is painfully slow. Suggestion to improve it are very welcome!
library(dplyr)
library(randomNames)
library(geosphere)
set.seed(7142015)
# Define Parameters
n.Schools <- 20
first.grade<-3
last.grade<-5
n.Grades <-last.grade-first.grade+1
n.Classrooms <- 20 # THIS IS WHAT I WANTED TO BE ABLE TO CHANGE
n.Teachers <- (n.Schools*n.Grades*n.Classrooms)/2 #Two classrooms per teacher
# Define Random names function:
gen.names <- function(n, which.names = "both", name.order = "last.first"){
names <- unique(randomNames(n=n, which.names = which.names, name.order = name.order))
need <- n - length(names)
while(need>0){
names <- unique(c(randomNames(n=need, which.names = which.names, name.order = name.order), names))
need <- n - length(names)
}
return(names)
}
# Generate n.Schools names
gen.schools <- function(n.schools) {
School.ID <-
paste0(gen.names(n = n.schools, which.names = "last"), ' School')
School.long <- rnorm(n = n.schools, mean = 21.7672, sd = 0.025)
School.lat <- rnorm(n = n.schools, mean = 58.8471, sd = 0.025)
School.RE <- rnorm(n = n.schools, mean = 0, sd = 1)
Schools <-
data.frame(School.ID, School.lat, School.long, School.RE) %>%
mutate(School.ID = as.character(School.ID)) %>%
rowwise() %>% mutate (School.distance = distHaversine(
p1 = c(School.long, School.lat),
p2 = c(21.7672, 58.8471), r = 3961
))
return(Schools)
}
Schools <- gen.schools(n.schools = n.Schools)
# Generate Grades
Grades <- c(first.grade:last.grade)
# Generate n.Classrooms
Classrooms <- LETTERS[1:n.Classrooms]
# Group schools and grades
SchGr <- outer(paste0(Schools$School.ID, '-'), paste0(Grades, '-'), FUN="paste")
#head(SchGr)
# Group SchGr and Classrooms
SchGrClss <- outer(SchGr, paste0(Classrooms, '-'), FUN="paste")
#head(SchGrClss)
# These are the combination of School-Grades-Classroom
SchGrClssTmp <- as.matrix(SchGrClss, ncol=1, nrow=length(SchGrClss) )
SchGrClssEnd <- as.data.frame(SchGrClssTmp)
# Assign n.Teachers (2 classroom in a given school-grade)
Allpairs <- as.data.frame(t(combn(SchGrClssTmp, 2)))
AllpairsTmp <- paste(Allpairs$V1, Allpairs$V2, sep=" ")
library(stringr)
xm <- do.call(rbind, str_split(string = AllpairsTmp, pattern = "-"))
separoPairs <- as.data.frame((xm), stringsAsFactors = FALSE)
separoPairs <- separoPairs %>% select(-V7) %>% #Drops empty column
mutate(V1=as.character(V1), V4=as.character(V4), V2=as.numeric(V2), V5=as.numeric(V5)) %>% mutate(V4 = trimws(V4, which = "both"))
#Only the rows with V1=V4 and V2=V5 are valid
validPairs <- separoPairs %>% filter(V1==V4 & V2==V5) %>% select(V1, V2, V3, V6)
# Generate n.Teachers
gen.teachers <- function(n.teachers){
Teacher.ID <- gen.names(n = n.teachers, name.order = "last.first")
Teacher.exp <- runif(n = n.teachers, min = 1, max = 30)
Teacher.Other <- sample(c(0,1), replace = T, prob = c(0.5, 0.5), size = n.teachers)
Teacher.RE <- rnorm(n = n.teachers, mean = 0, sd = 1)
Teachers <- data.frame(Teacher.ID, Teacher.exp, Teacher.Other, Teacher.RE)
return(Teachers)
}
Teachers <- gen.teachers(n.teachers = n.Teachers) %>%
mutate(Teacher.ID = as.character(Teacher.ID))
# Randomly assign n.Teachers teachers to the "ValidPairs"
TmpAssignments <- validPairs[sample(1:nrow(validPairs), n.Teachers), ]
Assignments <- cbind.data.frame(Teachers$Teacher.ID, TmpAssignments)
names(Assignments) <- c("Teacher.ID", "School.ID", "Grade", "Class_1", "Class_2")
# Tidy Data
library(tidyr)
TeacherClassroom <- Assignments %>%
gather(x, Classroom, Class_1,Class_2) %>%
select(-x) %>%
mutate(Teacher.ID = as.character(Teacher.ID))
# Merge
DF_Classrooms <- TeacherClassroom %>% full_join(Teachers, by="Teacher.ID") %>% full_join(Schools, by="School.ID")
rm(list=setdiff(ls(), "DF_Classrooms")) # Clean the work space!
Thanks!

randomly assign teachers to a school with dplyr or similar?

Suppose I have a data frame with 8 schools and its characteristics, and another with 48 teachers and its characteristics. I can generate some fake data with the following code:
library(dplyr)
library(geosphere)
set.seed(6232015)
n.schools <-8
n.teachers <- 48
makeRandomString <- function(pre, n=1, length=12) {
randomString <- c(1:n) # initialize vector
for (i in 1:n) {
randomString[i] <- paste0(pre,'.', paste(sample(c(0:9, letters, LETTERS),
length, replace=TRUE),
collapse=""))
}
return(randomString)
}
gen.teachers <- function(n.teachers){
Teacher.ID <- makeRandomString(pre= 'T', n = n.teachers, length = 20)
Teacher.exp <- runif(n = n.teachers, min = 1, max = 30)
Teacher.Other <- sample(c(0,1), replace = T, prob = c(0.5, 0.5), size = n.teachers)
Teachers <- data.frame(Teacher.ID, Teacher.exp, Teacher.Other)
return(Teachers)
}
gen.schools <- function(n.schools){
School.ID <- makeRandomString(pre= 'S', n = n.schools, length = 20)
School.lat <- runif(n = n.schools, min = -2, max = 2)
School.long <- runif(n = n.schools, min = -2, max = 2)
Schools <- data.frame(School.ID, School.lat, School.long) %>%
rowwise() %>% mutate (School.distance = distHaversine(p1 = c(School.long, School.lat),
p2 = c(0, 0), r = 3961))
return(Schools)
}
Teachers <- gen.teachers(n.teachers = n.teachers)
Schools <- gen.schools(n.schools = n.schools)
To each shool, I want to assign 6 teachers (every teacher get 1 and only 1 school). I could use:
Teachers %>% sample_n(6)
To get a list of 6 teachers assign those to a school, remove them from my original pool and keep going with a loop. My guess/hope is that there is a much easier way of doing this.
Thanks for the help!
In the context of your code
sample(rep(Schools$School.ID, each = 6))
gives a random sequence of schools where each school.id appears 6 times. Set Teachers$AssignedSchool to this sample and each teacher has an assigned school

Resources