Vectorize Functions in R

Vectorize Functions in R - r

I have this dataset in R that looks something like this:
id = sample.int(10000, 100000, replace = TRUE)
res = c(1,0)
results = sample(res, 100000, replace = TRUE)
date_exam_taken = sample(seq(as.Date('1999/01/01'), as.Date('2020/01/01'), by="day"), 100000, replace = TRUE)
my_data = data.frame(id, results, date_exam_taken)
my_data <- my_data[order(my_data$id, my_data$date_exam_taken),]
my_data$general_id = 1:nrow(my_data)
my_data$exam_number = ave(my_data$general_id, my_data$id, FUN = seq_along)
my_data$general_id = NULL
Using the R programming language, I wrote this loop that calculates conditional probabilities for the next exam results of a student conditional on the previous exam:
library(data.table)
setDT(my_data)
my_list = vector("list", length(unique(my_data$id)))
# Create an empty vector with pre-specified dimensions
my_vector = vector("list", 100)
for (i in 1:length(unique(my_data$id)))
{
tryCatch({
start_i = my_data[my_data$id == i,]
pairs_i = data.frame(first = head(start_i$results, -1), second = tail(start_i$results, -1))
frame_i = as.data.table(table(pairs_i))
frame_i[, id := i]
print(frame_i)
my_vector[[i]] = frame_i
}, error = function(e){})
}
final = rbindlist(my_vector)
I am now trying to "vectorize" this code for improved efficiency. Here is my attempt:
# I don't think I need to create a "list or vector" to store the results in advance?
my_vector = sapply(unique(my_data$id), function(i) {
{tryCatch({
setDT(my_data)
start_i = my_data[my_data$id == i,]
pairs_i = data.frame(first = head(start_i$results, -1), second = tail(start_i$results, -1))
frame_i = as.data.frame(table(pairs_i))
frame_i$i = i
print(frame_i)
return(frame_i)
}, error = function(e){print(paste("An error occurred:", e))})
}
})
# produced an error, so I tried a different code
final = rbindlist(my_vector, fill = TRUE)
# not sure if this fully worked either?
final = do.call(rbind.data.frame, my_vector)
Have I correctly "vectorized" this code?
UPDATE: Probability Calculation Template:
library(dplyr)
total_1 = final %>% group_by(first, second) %>% summarise(totals = n())
total_2 = total_1 %>% group_by(first) %>% summarise(sum = sum(totals))
join = merge(x = total_1, y = total_2, by = c("first"), all = TRUE)
join$probs = join$totals/join$sum
na.omit(join)

library(magrittr)
library(dplyr)
group_by(my_data, id)%>%
summarise(first = head(results, -1), second = tail(results, -1))%>%
ungroup

Related

Double Loops in R: Use .name_repair to specify repair?

I have this dataset in R:
set.seed(123)
myFun <- function(n = 5000) {
a <- do.call(paste0, replicate(5, sample(LETTERS, n, TRUE), FALSE))
paste0(a, sprintf("%04d", sample(9999, n, TRUE)), sample(LETTERS, n, TRUE))
}
col1 = myFun(100)
col2 = myFun(100)
col3 = myFun(100)
col4 = myFun(100)
group <- c("A","B","C","D")
group = sample(group, 100, replace=TRUE)
example = data.frame(col1, col2, col3, col4, group)
col1 col2 col3 col4 group
1 SKZDZ9876D BTAMF8110T LIBFV6882H ZFIPL4295E A
2 NXJRX7189Y AIZGY5809C HSMIH4556D YJGJP8022H C
3 XPTZB2035P EEKXK0873A PCPNW1021S NMROS4134O A
4 LJMCM3436S KGADK2847O SRMUI5723N RDIXI7301N B
5 ADITC6567L HUOCT5660P AQCNE3753K FUMGY1428B D
6 BAEDP8491P IAGQG4816B TXXQH6337M SDACH5752D C
I wrote this loop that compares different string distance metrics between all combinations of (col1,col2) and (col3,col4):
method = c("osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw","soundex")
library(stringdist)
results = list()
for (i in 1:length(method))
{
method_i = method[i]
name_1_i = paste0("col1_col_2", method_i)
name_2_i = paste0("col3_col_4", method_i)
p1_i = stringdistmatrix(col1, col2, method = method_i, useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_1_i)
p2_i = stringdistmatrix(col3, col4, method = method_i, useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_2_i)
p1_i = p1_i[,3]
p2_i = p2_i[,3]
final_i = cbind(p1_i, p2_i)
results[[i]] = final_i
}
final = do.call(cbind.data.frame, results)
final = cbind(col1,col2, col3,col4, final)
average_col1_col2_dist = (final$col1_col_2osa + final$col1_col_2lv + final$col1_col_2dl + final$col1_col_2hamming + final$col1_col_2lcs + final$col1_col_2qgram + final$col1_col_2cosine + final$col1_col_2jaccard + final$col1_col_2jw + final$col1_col_2soundex)/10
average_col3_col4_dist = ( final$col3_col_4osa + final$col3_col_4lv + final$col3_col_4dl + final$col3_col_4hamming + final$col3_col_4lcs + final$col3_col_4qgram + final$col3_col_4cosine + final$col3_col_4jaccard + final$col3_col_4jw + final$col3_col_4soundex)/10
final = data.frame( col1, col2, col3, col4, average_col1_col2_dist, average_col3_col4_dist)
final = scale(final)
Now, I would like to make this a "double loop" and have the same comparisons being done, but the comparisons should be made only within each "group" :
results = list()
for (i in 1:length(method))
for (j in 1:length(unique(example$group))
{
{
groups_j = unique(example$group[j])
my_data_i = file[which(file$fsa == groups_j ), ]
method_i = method[i]
name_1_i = paste0("col1_col_2", method_i)
name_2_i = paste0("col3_col_4", method_i)
p1_i = stringdistmatrix(my_data_i$col1, my_data_i$col2, method = method_i, useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_1_i)
p2_i = stringdistmatrix(my_data_i$col3, my_data_i$col4, method = method_i, useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_2_i)
p1_i = p1_i[,3]
p2_i = p2_i[,3]
final_i = cbind(p1_i, p2_i)
results[[i]] = final_i
}
}
final = do.call(cbind.data.frame, results)
final = cbind(col1,col2, col3,col4, final)
average_col1_col2_dist = (final$col1_col_2osa + final$col1_col_2lv + final$col1_col_2dl + final$col1_col_2hamming + final$col1_col_2lcs + final$col1_col_2qgram + final$col1_col_2cosine + final$col1_col_2jaccard + final$col1_col_2jw + final$col1_col_2soundex)/10
average_col3_col4_dist = ( final$col3_col_4osa + final$col3_col_4lv + final$col3_col_4dl + final$col3_col_4hamming + final$col3_col_4lcs + final$col3_col_4qgram + final$col3_col_4cosine + final$col3_col_4jaccard + final$col3_col_4jw + final$col3_col_4soundex)/10
final = data.frame( col1, col2, col3, col4, average_col1_col2_dist, average_col3_col4_dist)
final = scale(final)
But I keep getting this error:
Error:
! Column 1 must be named.
Use .name_repair to specify repair.
Caused by error in `repaired_names()`:
! Names can't be empty.
x Empty name found at location 1.
Does anyone know how I can fix this?
Thank you!

In the process of trying to understand what you're doing, I drifted pretty far from your original code. There isn't anything necessarily wrong with the majority of it!
Your code
As far as your grouping code...
You started with
for(this in that)
for(this in that)
{
{
The brackets nest what's inside the for statement. You need to
for(this in that) {
for(this in that) {
# or this works
for(this in that)
{
for(this in that)
{
When you specified your for criteria, you went with integers. However, you could just go with the string, as in
for(i in method) { # i is a string
# versus
for(i in 1:length(method)) { # i is an integer
When you wrote the nested for statement you missed a closing parenthesis.
for(j in 1:length(unique(example$group)) # end parentheses missing!
# should have been
for(j in 1:length(unique(example$group)))
# easier to see like this:
for(j in 1:length(
unique(
example$group
)
)
)
Did you know? You can set RStudio to use 'Rainbow parentheses' which is great for ensuring you don't miss closing parentheses or brackets. Go to Tools -> Global Options -> Code (left menu in popup) -> Display (top menu in menu popup) & 'Rainbow parentheses' is the last item in the list. This is what it looks like with my current appearance settings:
When extracting the group, you selected a dataset row, not a unique value.
# this selects jth row, then looks for unique values
groups_j = unique(example$group[j])
# you need to get the unique values, then iterate
group_j = unique(example$group)[j]
# the j goes outside the call for unique()
In this code, you wrote file and file$fsa. I'm assuming this is equivalent to example and example$group, because I don't have whatever is in file.
All of these lines of code do the same thing. Keep in mind that group is in the data frame, but it's an object in the environment on its own, as well.
my_data_i = example[which(example$group == group_j), ] # this would work
my_data_i <- filter(example, groups == group_j) # this would work
my_data_i <- example[group == group_j, ] # this would work
my_data_i <- example[example$group == group_j, ] # this would work
If you iterated over groups instead of indices, you can skip the creation of group_j which is the only time you used j.
for(j in unique(example$group)) {
my_data_i <- example[example$group == j, ]
}
When you sent the results using i and i alone, you would overwrite the data with each iteration over j.
The first group iteration can go in results[[i]], but the next group, can either be bound to that data or placed in a list within a list.
For example:
results[[1]] <- group 1, method 1
results[[1]] <- rbind(results[[1]], [group 2, method 1])
# or
results[[1]][[1]] <- group 1, method 1
results[[1]][[2]] <- group 2, method 1
Considering those two options for the lists (above), the first will allow your remaining code (creation of final, averages, etc.) to work without any changes. However, if you use the second option (above), that code will require modification.
If you leave for(j with 1:length this will work:
if(j < 2) {
results[[i]] <- final_i
} else {
results[[i]] <- rbind(results[[i]], final_i)
}
If you use for(j in unique(example$group)), you could use this:
if(isTRUE(j == unique(example$group)[1])) { # isTRUE() to avoid null errors
results[[i]] <- final_i
} else {
results[[i]] <- rbind(results[[i]], final_i)
}
Your nested for statements all in one chunk.
results = list()
for (i in 1:length(method)) { # bracket missing here; it was in the wrong place
for (j in 1:length(unique(example$group))) { # missing a parentheses here
# { # this needs to be after each for statement
# groups_j = unique(example$group[j]) # you have selected the jth row, not the jth unique
group_j = unique(example$group)[j] # the selection goes outside the call for unique()
# use things like print or message to check what your function does
# print(group_j)
# message('this is a message ', group_j) # notice the different color in the console?
my_data_i <- example[group == group_j, ] # this would work
method_i = method[i]
name_1_i = paste0("col1_col_2", method_i)
name_2_i = paste0("col3_col_4", method_i)
p1_i = stringdistmatrix(my_data_i$col1, my_data_i$col2, method = method_i, useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_1_i)
p2_i = stringdistmatrix(my_data_i$col3, my_data_i$col4, method = method_i, useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_2_i)
p1_i = p1_i[,3]
p2_i = p2_i[,3]
final_i = cbind(p1_i, p2_i)
# results[[i]] = final_i # you replace this content everytime you change groups
# you need to append the values between groups (assuming you want one column per test type)
# first append, then combine
if(j < 2) { # use < instead of == to avoid null error
results[[i]] <- final_i
} else {
results[[i]] <- rbind(results[[i]], final_i)
}
}
}
My code to accomplish the same task
I've added a few checks and balances to make it more dynamic. You can send any number of columns, methods, or groups to grpComp.
This uses tidyverse, glue, and stringdist.
This first function is called by the other function.
library(tidyverse)
library(stringdist)
library(glue)
strD <- function(c1, c2, mm) { # input column 1; column 2; measurement method
res <- stringdistmatrix(c1, c2, method = mm, useNames = 'string')
f_res <- matrix(res) # extract values and flatten
}
This is the distances by group function.
grpComp <- function(fr, methods, grp) { # data frame of columns to compare,
# methods to use, groups (vector same length as df rows)
cnames <- names(fr)
if(length(cnames) %% 2 != 0) {
message('there are an uneven number of columns to compare')
break # something's wrong
}
if(length(grp) != nrow(fr)) {
message('there groups vector length must match number of rows in the data')
break # something's wrong
}
# extract distances
dists <- map(
method,
function(j) {
str_ds <- map_dfc( # by column sets
seq(from = 1, to = length(cnames), by = 2),
function(i) {
str_gr <- map_dfr( # by group
unique(grp),
function(k) {
as.data.frame(list( # has to be list for `col.names` to work
strD(fr[grp == k, cnames[i]],
fr[grp == k, cnames[i + 1]], j)),
optional = F, row.names = NULL,
col.names = paste0("c", i, i+1, '_', j))
}) # combine groups by rows
str_gr
}) # combine methods by columns
str_ds
}) %>% do.call(cbind, .)
ncnames <- names(dists) %>% substr(1, 3) %>% unique() # determine unique col groups
for(m in ncnames) { # get averages for each comparison set
dists <- mutate(dists,
"ave_{m}" := rowMeans(across(contains(m))) %>% scale())
}
dists <- select(dists, contains('ave'))
}
This is how you would use this code.
test5 <- grpComp(example[, 1:4], methods, example$group)
Even though your function for non-grouped data is working, I thought I would include that code as well.
strComp <- function(fr, methods) { # data frame of columns to compare, methods to use
cnames <- names(fr)
if(length(cnames) %% 2 != 0) {
message('there are an uneven number of columns to compare')
break # something's wrong
}
# extract distances
dists <- map(
method,
function(j) {
str_ds <- map_dfc(
seq(from = 1, to = length(cnames), by = 2),
function(i) {
as.data.frame(list( # has to be list for `col.names` to work
strD(fr[, cnames[i]], fr[, cnames[i + 1]], j)), optional = F,
col.names = paste0("c", i, i+1, '_', j))
})
str_ds
}) %>% do.call(cbind, .)
ncnames <- names(dists) %>% substr(1, 3) %>% unique() # determine unique col groups
for(k in ncnames) { # get averages for each comparison set
dists <- mutate(dists,
"ave_{k}" := rowMeans(across(contains(k))) %>% scale())
}
dists <- select(dists, contains('ave'))
}
To use this function:
test4 <- strComp(example[, 1:4], methods)

Make a function in R that takes a list as an argument

I have the following function in R:
get_rad_wait_time_data <- function(data) {
# get file
file.to.load <- tryCatch(file.choose(new = T), error = function(e) "")
# Want to add months parameter
# months = as.list(months)
# read the file in and clean col names
df <- read.csv(file.to.load) %>%
clean_names()
# clean file and mutate columns
df_clean <- df %>%
filter(!is.na(acc)) %>%
select(
mrn
, step_start_time
, step_end_time
, step_from_to
, wait_time
) %>%
mutate(
step_start_time_clean = mdy_hms(step_start_time)
, step_end_time_clean = mdy_hms(step_end_time)
, elapsed_time = difftime(step_end_time_clean, step_start_time_clean, units = "mins")
, elapsed_time_int = as.integer(elapsed_time)
, procedure_start_year = year(step_start_time_clean)
, procedure_start_month = month(step_start_time_clean)
, procedure_start_month_name = month(step_start_time_clean, label = T, abbr = T)
, procedure_start_day = day(step_start_time_clean)
, procedure_start_dow = wday(step_start_time_clean, label = T, abbr = T)
, procedure_start_hour = hour(step_start_time_clean)
, procedure_end_year = year(step_end_time_clean)
, procedure_end_month = month(step_end_time_clean)
, procedure_end_month_name = month(step_end_time_clean, label = T, abbr = T)
, procedure_end_day = day(step_end_time_clean)
, procedure_end_dow = wday(step_end_time_clean, label = T, abbr = T)
, procedure_end_hour = hour(step_end_time_clean)
) %>%
filter(procedure_start_month_name %in% c("Apr","May","Jun")) %>%
filter(elapsed_time_int >= 0)
dt <- data.table(df_clean)
dt[, mrn := na.locf(mrn, fromLast = T, na.rm = F)]
df_clean <- setDF(dt)
df_clean <- df_clean %>%
group_by(
mrn
, step_start_time_clean
, step_end_time_clean
) %>%
mutate(
proc_count = n()
, avg_time_per_proc = round(elapsed_time_int / proc_count, 2)
)
df_clean <- as.data.frame(df_clean)
}
I want my function to be something like get_rad_wait_time_data(data, months) where I can input something like `months = c("Jan","Feb","Mar")
No clue how to go from here

I changed the function to the following which seems to work:
get_rad_wait_time_data <- function(data, months) {
# get file
file.to.load <- tryCatch(file.choose(new = T), error = function(e) "")
# Months
months = as.list(months)
# read the file in and clean col names
df <- read.csv(file.to.load) %>%
clean_names()
# clean file and mutate columns
df_clean <- df %>%
filter(!is.na(acc)) %>%
select(
mrn
, step_start_time
, step_end_time
, step_from_to
, wait_time
) %>%
mutate(
step_start_time_clean = mdy_hms(step_start_time)
, step_end_time_clean = mdy_hms(step_end_time)
, elapsed_time = difftime(step_end_time_clean, step_start_time_clean, units = "mins")
, elapsed_time_int = as.integer(elapsed_time)
, procedure_start_year = year(step_start_time_clean)
, procedure_start_month = month(step_start_time_clean)
, procedure_start_month_name = month(step_start_time_clean, label = T, abbr = T)
, procedure_start_day = day(step_start_time_clean)
, procedure_start_dow = wday(step_start_time_clean, label = T, abbr = T)
, procedure_start_hour = hour(step_start_time_clean)
, procedure_end_year = year(step_end_time_clean)
, procedure_end_month = month(step_end_time_clean)
, procedure_end_month_name = month(step_end_time_clean, label = T, abbr = T)
, procedure_end_day = day(step_end_time_clean)
, procedure_end_dow = wday(step_end_time_clean, label = T, abbr = T)
, procedure_end_hour = hour(step_end_time_clean)
) %>%
#filter(procedure_start_month_name %in% c("Apr","May","Jun")) %>%
filter(procedure_start_month_name %in% months) %>%
filter(elapsed_time_int >= 0)
dt <- data.table(df_clean)
dt[, mrn := na.locf(mrn, fromLast = T, na.rm = F)]
df_clean <- setDF(dt)
df_clean <- df_clean %>%
group_by(
mrn
, step_start_time_clean
, step_end_time_clean
) %>%
mutate(
proc_count = n()
, avg_time_per_proc = round(elapsed_time_int / proc_count, 2)
)
df_clean <- as.data.frame(df_clean)
}

Append to dataframe through for loop and function

When I run a code with save as csv at the end everything runs correctly. It means I filter my dataset by country do some continuations and save it as a country file.
When I try to rbind my datasets I receive nothing. I try different solutions which I found but nothing is working. Clearly I do not understand why I can not rbind.
The code:
library(dplyr)
library(readxl)
setwd("Z:/Reporting_Private/Tableau")
dataupl <- read_excel("Analysis Map_Tableau - Ready.xlsm", sheet = "Data")
df = dataupl
#select right columns
df = df[,1:6]
colnames(df)=c("Office", "Employee","Territiry","Sales","Leads","Act")
#change n/a to zero
df[is.na(df)]=0
countries = df %>% select(Office) %>% distinct()
countries = as.data.frame(countries)
engine <- function(input){
df = df %>% filter(Office==input)
SCALESALES = scale(df$Sales)
SCALELEADS= scale(df$Leads)
SCALEACT= scale(df$Act)
df = df %>% mutate(SCALESALES = SCALESALES, SCALELEADS = SCALELEADS, SCALEACT = SCALEACT)
df$SLegend = ave(df$Sales, df$SalesLegend, FUN = min)
df$SLegend = ifelse(df$SLegend>0, df$SLegend,0)
df$LLegend = ave(df$Leads, df$LeadsLegend, FUN = min)
df$ALegend = ave(df$Act, df$ActLegend, FUN = min)
#write.csv(final, file = paste0(input,".csv"))
dftotal = data.frame()
dftotal = rbind(dftotal,df)
}
for (i in 1:nrow(countries)){
input = countries[i,]
engine(input)
}

It seems that you can split your dataframe L <- split(df, df$Office) and then lapply(L, ...) Instead of write.csv(...) you have to return the dataframe: return(df) in your function.
So, something like this:
engine <- function(dfi) {
SCALESALES = scale(dfi$Sales)
SCALELEADS = scale(dfi$Leads)
SCALEACT = scale(dfi$Act)
dfi = dfi %>% mutate(SCALESALES = SCALESALES, SCALELEADS = SCALELEADS, SCALEACT = SCALEACT)
dfi$SLegend = ave(dfi$Sales, dfi$SalesLegend, FUN = min)
dfi$SLegend = ifelse(dfi$SLegend>0, dfi$SLegend, 0)
dfi$LLegend = ave(dfi$Leads, dfi$LeadsLegend, FUN = min)
dfi$ALegend = ave(dfi$Act, dfi$ActLegend, FUN = min)
return(dfi)
}
L <- split(df, df$Office)
Lnew <- lapply(L, engine)
dftotal <- Lnew[[1]]
for (i in 2:length(Lnew)) dftotal <- rbind(dftotal, Lnew[[i]])

How can I extend tidyr::spread() while maintaining the order of column names?

How to expand when maintaining the rank of numbers when using the spread function?
library(tidyverse)
data.frame(time = paste0("t_", 1:100)) %>%
rowwise() %>%
mutate(rnd = sample(1:100, size=1)) %>%
spread(time, rnd)
The column names of the execution result of the code shown above are t_1, t_11, t_100, .....
I want to get column names in order of numbers(t_1, t_2, t_3, ...).

You can try two things:
(1) Make "time" a factor with levels matching the order you want:
data.frame(time = factor(paste0("t_", 1:100), levels = paste0("t_", 1:100))) %>%
rowwise() %>%
mutate(rnd = sample(1:100, size=1)) %>%
spread(time, rnd)
(2) Force the order using a select statement:
data.frame(time = paste0("t_", 1:100)) %>%
rowwise() %>%
mutate(rnd = sample(1:100, size=1)) %>%
spread(time, rnd) %>%
select(paste0("t_", 1:100))

Here is a new function that retains column order. Only one small change is needed (see annotation):
my_spread <- function (data, key, value, fill = NA, convert = FALSE, drop = TRUE,
sep = NULL) {
key_col <- tidyr:::col_name(substitute(key))
value_col <- tidyr:::col_name(substitute(value))
tbl_df(my_spread_(data, key_col, value_col, fill = fill, convert = convert,
drop = drop, sep = sep))
}
my_spread_ <- function (data, key_col, value_col, fill = NA, convert = FALSE,
drop = TRUE, sep = NULL) {
col <- data[key_col]
#col_id <- tidyr:::id(col, drop = drop) # Old line
col_id <- seq_len(nrow(data)) # New line 1
attr(col_id, 'n') <- nrow(data) # New line 2
col_labels <- tidyr:::split_labels(col, col_id, drop = drop)
rows <- data[setdiff(names(data), c(key_col, value_col))]
if (length(rows) == 0) {
row_id <- structure(1L, n = 1L)
row_labels <- as.data.frame(matrix(nrow = 1, ncol = 0))
}
else {
row_id <- id(rows, drop = drop)
row_labels <- tidyr:::split_labels(rows, row_id, drop = drop)
rownames(row_labels) <- NULL
}
overall <- tidyr:::id(list(col_id, row_id), drop = FALSE)
n <- attr(overall, "n")
if (anyDuplicated(overall)) {
groups <- split(seq_along(overall), overall)
groups <- groups[vapply(groups, length, integer(1)) >
1]
str <- vapply(
groups,
function(x) paste0("(", paste0(x, collapse = ", "), ")"), character(1)
)
stop("Duplicate identifiers for rows ", paste(str, collapse = ", "),
call. = FALSE)
}
if (length(overall) < n) {
overall <- match(seq_len(n), overall, nomatch = NA)
}
else {
overall <- order(overall)
}
value <- data[[value_col]]
ordered <- value[overall]
if (!is.na(fill)) {
ordered[is.na(ordered)] <- fill
}
if (convert && !is.character(ordered)) {
ordered <- as.character(ordered)
}
dim(ordered) <- c(attr(row_id, "n"), attr(col_id, "n"))
colnames(ordered) <- enc2utf8( tidyr:::col_names(col_labels, sep = sep))
ordered <- tidyr:::as_data_frame_matrix(ordered)
if (convert) {
ordered[] <- lapply(ordered, type.convert, as.is = TRUE)
}
tidyr:::append_df(row_labels, ordered)
}

randomize observations by groups (blocks)

I have a data frame with I obsevations, and each observation belongs to one of g categories.
set.seed(9782)
I <- 500
g <- 10
library(dplyr)
anon_id <- function(n = 1, length = 12) {
randomString <- c(1:n)
for (i in 1:n)
{
randomString[i] <- paste(sample(c(0:9, letters, LETTERS),
length, replace = TRUE),
collapse = "")
}
return(randomString)
}
df <- data.frame(id = anon_id(n = I, length = 16),
group = sample(1:g, I, T))
I want to randomly assign each observation to one of J "urns", given some vector of probabilities p. That is the probability of being assign to urn J=1 is p[1]. The added complexity is that I want to do this block by block.
If I ignore the blocks, I can do this easily:
J <- 3
p <- c(0.25, 0.5, 0.25)
df1 <- df %>% mutate(urn = sample(x = c(1:J), size = I, replace = T, prob = p))
I thought about this method to do it by "block"
# Block randomization
randomize_block <- function(g) {
df1 <- df %>% filter(group==g)
size <- nrow(df1)
df1 <- df1 %>% mutate(urn = sample(x = c(1:J),
size = size,
replace = T,
prob = p))
return(df1)
}
df2 <- lapply(1:g, randomize_block)
df2 <- data.table::rbindlist(df2)
Is there a better way?

Not sure if this is better, but here is a base R technique with data.frame df, that has group name "group" as well as urn assignments 1:J with assignment probabilities in vector p of length J.
# get urn assignment
urnAssignment <- lapply(unique(df$group),
function(i) sample(1:J, nrow(df[group==i,]), replace =T, prob=p))
# get a list that collects position of observations
obsOrder <- lapply(unique(df$group),
function(i) which(df$group == i))
df$urnAssignment <- unlist(urnAssignment)[unlist(obsOrder)]

randomizr::block_ra does exactly what you want.
library(randomizr)
library(janitor) #just for the tabyl function
block_rand <- as.tibble(randomizr::block_ra(blocks = df$group, conditions = c("urn_1","urn_2","urn_3")))
df2 <- as.tibble(bind_cols(df, block_rand))
df2 %>% janitor::tabyl(group, value)

This does the trick using dplyr:
randomize <- function(data, groups=2, block_id = NULL, p=NULL, seed=9782) {
if(is.null(p)) p <- rep(1/groups, groups)
if(is.null(block_id)){
df1 <- data %>%
mutate(Treatment = sample(x = c(1:groups),
size = n(),
replace = T,
prob = p))
return(df1)
}else{
df1 <- data %>% group_by_(block_id) %>%
mutate(Treatment = sample(x = c(1:groups),
size = n(),
replace = T,
prob = p))
}
}
df1 <- randomize(data = df, groups = J, block_id = "group", p = p, seed = 9782)

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Vectorize Functions in R - r

library(magrittr) library(dplyr) group_by(my_data, id)%>% summarise(first = head(results, -1), second = tail(results, -1))%>% ungroup

Related

Double Loops in R: Use .name_repair to specify repair?

Make a function in R that takes a list as an argument

Append to dataframe through for loop and function

How can I extend tidyr::spread() while maintaining the order of column names?

randomize observations by groups (blocks)

Categories

Resources