Calculating Multiple Distance Metrics [duplicate] - r

I have this dataset in R:
set.seed(123)
myFun <- function(n = 5000) {
a <- do.call(paste0, replicate(5, sample(LETTERS, n, TRUE), FALSE))
paste0(a, sprintf("%04d", sample(9999, n, TRUE)), sample(LETTERS, n, TRUE))
}
col1 = myFun(100)
col2 = myFun(100)
col3 = myFun(100)
col4 = myFun(100)
group <- c("A","B","C","D")
group = sample(group, 100, replace=TRUE)
example = data.frame(col1, col2, col3, col4, group)
col1 col2 col3 col4 group
1 SKZDZ9876D BTAMF8110T LIBFV6882H ZFIPL4295E A
2 NXJRX7189Y AIZGY5809C HSMIH4556D YJGJP8022H C
3 XPTZB2035P EEKXK0873A PCPNW1021S NMROS4134O A
4 LJMCM3436S KGADK2847O SRMUI5723N RDIXI7301N B
5 ADITC6567L HUOCT5660P AQCNE3753K FUMGY1428B D
6 BAEDP8491P IAGQG4816B TXXQH6337M SDACH5752D C
I wrote this loop that compares different string distance metrics between all combinations of (col1,col2) and (col3,col4):
method = c("osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw","soundex")
library(stringdist)
results = list()
for (i in 1:length(method))
{
method_i = method[i]
name_1_i = paste0("col1_col_2", method_i)
name_2_i = paste0("col3_col_4", method_i)
p1_i = stringdistmatrix(col1, col2, method = method_i, useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_1_i)
p2_i = stringdistmatrix(col3, col4, method = method_i, useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_2_i)
p1_i = p1_i[,3]
p2_i = p2_i[,3]
final_i = cbind(p1_i, p2_i)
results[[i]] = final_i
}
final = do.call(cbind.data.frame, results)
final = cbind(col1,col2, col3,col4, final)
average_col1_col2_dist = (final$col1_col_2osa + final$col1_col_2lv + final$col1_col_2dl + final$col1_col_2hamming + final$col1_col_2lcs + final$col1_col_2qgram + final$col1_col_2cosine + final$col1_col_2jaccard + final$col1_col_2jw + final$col1_col_2soundex)/10
average_col3_col4_dist = ( final$col3_col_4osa + final$col3_col_4lv + final$col3_col_4dl + final$col3_col_4hamming + final$col3_col_4lcs + final$col3_col_4qgram + final$col3_col_4cosine + final$col3_col_4jaccard + final$col3_col_4jw + final$col3_col_4soundex)/10
final = data.frame( col1, col2, col3, col4, average_col1_col2_dist, average_col3_col4_dist)
final = scale(final)
Now, I would like to make this a "double loop" and have the same comparisons being done, but the comparisons should be made only within each "group" :
results = list()
for (i in 1:length(method))
for (j in 1:length(unique(example$group))
{
{
groups_j = unique(example$group[j])
my_data_i = file[which(file$fsa == groups_j ), ]
method_i = method[i]
name_1_i = paste0("col1_col_2", method_i)
name_2_i = paste0("col3_col_4", method_i)
p1_i = stringdistmatrix(my_data_i$col1, my_data_i$col2, method = method_i, useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_1_i)
p2_i = stringdistmatrix(my_data_i$col3, my_data_i$col4, method = method_i, useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_2_i)
p1_i = p1_i[,3]
p2_i = p2_i[,3]
final_i = cbind(p1_i, p2_i)
results[[i]] = final_i
}
}
final = do.call(cbind.data.frame, results)
final = cbind(col1,col2, col3,col4, final)
average_col1_col2_dist = (final$col1_col_2osa + final$col1_col_2lv + final$col1_col_2dl + final$col1_col_2hamming + final$col1_col_2lcs + final$col1_col_2qgram + final$col1_col_2cosine + final$col1_col_2jaccard + final$col1_col_2jw + final$col1_col_2soundex)/10
average_col3_col4_dist = ( final$col3_col_4osa + final$col3_col_4lv + final$col3_col_4dl + final$col3_col_4hamming + final$col3_col_4lcs + final$col3_col_4qgram + final$col3_col_4cosine + final$col3_col_4jaccard + final$col3_col_4jw + final$col3_col_4soundex)/10
final = data.frame( col1, col2, col3, col4, average_col1_col2_dist, average_col3_col4_dist)
final = scale(final)
But I keep getting this error:
Error:
! Column 1 must be named.
Use .name_repair to specify repair.
Caused by error in `repaired_names()`:
! Names can't be empty.
x Empty name found at location 1.
Does anyone know how I can fix this?
Thank you!

In the process of trying to understand what you're doing, I drifted pretty far from your original code. There isn't anything necessarily wrong with the majority of it!
Your code
As far as your grouping code...
You started with
for(this in that)
for(this in that)
{
{
The brackets nest what's inside the for statement. You need to
for(this in that) {
for(this in that) {
# or this works
for(this in that)
{
for(this in that)
{
When you specified your for criteria, you went with integers. However, you could just go with the string, as in
for(i in method) { # i is a string
# versus
for(i in 1:length(method)) { # i is an integer
When you wrote the nested for statement you missed a closing parenthesis.
for(j in 1:length(unique(example$group)) # end parentheses missing!
# should have been
for(j in 1:length(unique(example$group)))
# easier to see like this:
for(j in 1:length(
unique(
example$group
)
)
)
Did you know? You can set RStudio to use 'Rainbow parentheses' which is great for ensuring you don't miss closing parentheses or brackets. Go to Tools -> Global Options -> Code (left menu in popup) -> Display (top menu in menu popup) & 'Rainbow parentheses' is the last item in the list. This is what it looks like with my current appearance settings:
When extracting the group, you selected a dataset row, not a unique value.
# this selects jth row, then looks for unique values
groups_j = unique(example$group[j])
# you need to get the unique values, then iterate
group_j = unique(example$group)[j]
# the j goes outside the call for unique()
In this code, you wrote file and file$fsa. I'm assuming this is equivalent to example and example$group, because I don't have whatever is in file.
All of these lines of code do the same thing. Keep in mind that group is in the data frame, but it's an object in the environment on its own, as well.
my_data_i = example[which(example$group == group_j), ] # this would work
my_data_i <- filter(example, groups == group_j) # this would work
my_data_i <- example[group == group_j, ] # this would work
my_data_i <- example[example$group == group_j, ] # this would work
If you iterated over groups instead of indices, you can skip the creation of group_j which is the only time you used j.
for(j in unique(example$group)) {
my_data_i <- example[example$group == j, ]
}
When you sent the results using i and i alone, you would overwrite the data with each iteration over j.
The first group iteration can go in results[[i]], but the next group, can either be bound to that data or placed in a list within a list.
For example:
results[[1]] <- group 1, method 1
results[[1]] <- rbind(results[[1]], [group 2, method 1])
# or
results[[1]][[1]] <- group 1, method 1
results[[1]][[2]] <- group 2, method 1
Considering those two options for the lists (above), the first will allow your remaining code (creation of final, averages, etc.) to work without any changes. However, if you use the second option (above), that code will require modification.
If you leave for(j with 1:length this will work:
if(j < 2) {
results[[i]] <- final_i
} else {
results[[i]] <- rbind(results[[i]], final_i)
}
If you use for(j in unique(example$group)), you could use this:
if(isTRUE(j == unique(example$group)[1])) { # isTRUE() to avoid null errors
results[[i]] <- final_i
} else {
results[[i]] <- rbind(results[[i]], final_i)
}
Your nested for statements all in one chunk.
results = list()
for (i in 1:length(method)) { # bracket missing here; it was in the wrong place
for (j in 1:length(unique(example$group))) { # missing a parentheses here
# { # this needs to be after each for statement
# groups_j = unique(example$group[j]) # you have selected the jth row, not the jth unique
group_j = unique(example$group)[j] # the selection goes outside the call for unique()
# use things like print or message to check what your function does
# print(group_j)
# message('this is a message ', group_j) # notice the different color in the console?
my_data_i <- example[group == group_j, ] # this would work
method_i = method[i]
name_1_i = paste0("col1_col_2", method_i)
name_2_i = paste0("col3_col_4", method_i)
p1_i = stringdistmatrix(my_data_i$col1, my_data_i$col2, method = method_i, useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_1_i)
p2_i = stringdistmatrix(my_data_i$col3, my_data_i$col4, method = method_i, useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_2_i)
p1_i = p1_i[,3]
p2_i = p2_i[,3]
final_i = cbind(p1_i, p2_i)
# results[[i]] = final_i # you replace this content everytime you change groups
# you need to append the values between groups (assuming you want one column per test type)
# first append, then combine
if(j < 2) { # use < instead of == to avoid null error
results[[i]] <- final_i
} else {
results[[i]] <- rbind(results[[i]], final_i)
}
}
}
My code to accomplish the same task
I've added a few checks and balances to make it more dynamic. You can send any number of columns, methods, or groups to grpComp.
This uses tidyverse, glue, and stringdist.
This first function is called by the other function.
library(tidyverse)
library(stringdist)
library(glue)
strD <- function(c1, c2, mm) { # input column 1; column 2; measurement method
res <- stringdistmatrix(c1, c2, method = mm, useNames = 'string')
f_res <- matrix(res) # extract values and flatten
}
This is the distances by group function.
grpComp <- function(fr, methods, grp) { # data frame of columns to compare,
# methods to use, groups (vector same length as df rows)
cnames <- names(fr)
if(length(cnames) %% 2 != 0) {
message('there are an uneven number of columns to compare')
break # something's wrong
}
if(length(grp) != nrow(fr)) {
message('there groups vector length must match number of rows in the data')
break # something's wrong
}
# extract distances
dists <- map(
method,
function(j) {
str_ds <- map_dfc( # by column sets
seq(from = 1, to = length(cnames), by = 2),
function(i) {
str_gr <- map_dfr( # by group
unique(grp),
function(k) {
as.data.frame(list( # has to be list for `col.names` to work
strD(fr[grp == k, cnames[i]],
fr[grp == k, cnames[i + 1]], j)),
optional = F, row.names = NULL,
col.names = paste0("c", i, i+1, '_', j))
}) # combine groups by rows
str_gr
}) # combine methods by columns
str_ds
}) %>% do.call(cbind, .)
ncnames <- names(dists) %>% substr(1, 3) %>% unique() # determine unique col groups
for(m in ncnames) { # get averages for each comparison set
dists <- mutate(dists,
"ave_{m}" := rowMeans(across(contains(m))) %>% scale())
}
dists <- select(dists, contains('ave'))
}
This is how you would use this code.
test5 <- grpComp(example[, 1:4], methods, example$group)
Even though your function for non-grouped data is working, I thought I would include that code as well.
strComp <- function(fr, methods) { # data frame of columns to compare, methods to use
cnames <- names(fr)
if(length(cnames) %% 2 != 0) {
message('there are an uneven number of columns to compare')
break # something's wrong
}
# extract distances
dists <- map(
method,
function(j) {
str_ds <- map_dfc(
seq(from = 1, to = length(cnames), by = 2),
function(i) {
as.data.frame(list( # has to be list for `col.names` to work
strD(fr[, cnames[i]], fr[, cnames[i + 1]], j)), optional = F,
col.names = paste0("c", i, i+1, '_', j))
})
str_ds
}) %>% do.call(cbind, .)
ncnames <- names(dists) %>% substr(1, 3) %>% unique() # determine unique col groups
for(k in ncnames) { # get averages for each comparison set
dists <- mutate(dists,
"ave_{k}" := rowMeans(across(contains(k))) %>% scale())
}
dists <- select(dists, contains('ave'))
}
To use this function:
test4 <- strComp(example[, 1:4], methods)

Related

Double Loops in R: Use .name_repair to specify repair?

I have this dataset in R:
set.seed(123)
myFun <- function(n = 5000) {
a <- do.call(paste0, replicate(5, sample(LETTERS, n, TRUE), FALSE))
paste0(a, sprintf("%04d", sample(9999, n, TRUE)), sample(LETTERS, n, TRUE))
}
col1 = myFun(100)
col2 = myFun(100)
col3 = myFun(100)
col4 = myFun(100)
group <- c("A","B","C","D")
group = sample(group, 100, replace=TRUE)
example = data.frame(col1, col2, col3, col4, group)
col1 col2 col3 col4 group
1 SKZDZ9876D BTAMF8110T LIBFV6882H ZFIPL4295E A
2 NXJRX7189Y AIZGY5809C HSMIH4556D YJGJP8022H C
3 XPTZB2035P EEKXK0873A PCPNW1021S NMROS4134O A
4 LJMCM3436S KGADK2847O SRMUI5723N RDIXI7301N B
5 ADITC6567L HUOCT5660P AQCNE3753K FUMGY1428B D
6 BAEDP8491P IAGQG4816B TXXQH6337M SDACH5752D C
I wrote this loop that compares different string distance metrics between all combinations of (col1,col2) and (col3,col4):
method = c("osa", "lv", "dl", "hamming", "lcs", "qgram", "cosine", "jaccard", "jw","soundex")
library(stringdist)
results = list()
for (i in 1:length(method))
{
method_i = method[i]
name_1_i = paste0("col1_col_2", method_i)
name_2_i = paste0("col3_col_4", method_i)
p1_i = stringdistmatrix(col1, col2, method = method_i, useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_1_i)
p2_i = stringdistmatrix(col3, col4, method = method_i, useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_2_i)
p1_i = p1_i[,3]
p2_i = p2_i[,3]
final_i = cbind(p1_i, p2_i)
results[[i]] = final_i
}
final = do.call(cbind.data.frame, results)
final = cbind(col1,col2, col3,col4, final)
average_col1_col2_dist = (final$col1_col_2osa + final$col1_col_2lv + final$col1_col_2dl + final$col1_col_2hamming + final$col1_col_2lcs + final$col1_col_2qgram + final$col1_col_2cosine + final$col1_col_2jaccard + final$col1_col_2jw + final$col1_col_2soundex)/10
average_col3_col4_dist = ( final$col3_col_4osa + final$col3_col_4lv + final$col3_col_4dl + final$col3_col_4hamming + final$col3_col_4lcs + final$col3_col_4qgram + final$col3_col_4cosine + final$col3_col_4jaccard + final$col3_col_4jw + final$col3_col_4soundex)/10
final = data.frame( col1, col2, col3, col4, average_col1_col2_dist, average_col3_col4_dist)
final = scale(final)
Now, I would like to make this a "double loop" and have the same comparisons being done, but the comparisons should be made only within each "group" :
results = list()
for (i in 1:length(method))
for (j in 1:length(unique(example$group))
{
{
groups_j = unique(example$group[j])
my_data_i = file[which(file$fsa == groups_j ), ]
method_i = method[i]
name_1_i = paste0("col1_col_2", method_i)
name_2_i = paste0("col3_col_4", method_i)
p1_i = stringdistmatrix(my_data_i$col1, my_data_i$col2, method = method_i, useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_1_i)
p2_i = stringdistmatrix(my_data_i$col3, my_data_i$col4, method = method_i, useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_2_i)
p1_i = p1_i[,3]
p2_i = p2_i[,3]
final_i = cbind(p1_i, p2_i)
results[[i]] = final_i
}
}
final = do.call(cbind.data.frame, results)
final = cbind(col1,col2, col3,col4, final)
average_col1_col2_dist = (final$col1_col_2osa + final$col1_col_2lv + final$col1_col_2dl + final$col1_col_2hamming + final$col1_col_2lcs + final$col1_col_2qgram + final$col1_col_2cosine + final$col1_col_2jaccard + final$col1_col_2jw + final$col1_col_2soundex)/10
average_col3_col4_dist = ( final$col3_col_4osa + final$col3_col_4lv + final$col3_col_4dl + final$col3_col_4hamming + final$col3_col_4lcs + final$col3_col_4qgram + final$col3_col_4cosine + final$col3_col_4jaccard + final$col3_col_4jw + final$col3_col_4soundex)/10
final = data.frame( col1, col2, col3, col4, average_col1_col2_dist, average_col3_col4_dist)
final = scale(final)
But I keep getting this error:
Error:
! Column 1 must be named.
Use .name_repair to specify repair.
Caused by error in `repaired_names()`:
! Names can't be empty.
x Empty name found at location 1.
Does anyone know how I can fix this?
Thank you!
In the process of trying to understand what you're doing, I drifted pretty far from your original code. There isn't anything necessarily wrong with the majority of it!
Your code
As far as your grouping code...
You started with
for(this in that)
for(this in that)
{
{
The brackets nest what's inside the for statement. You need to
for(this in that) {
for(this in that) {
# or this works
for(this in that)
{
for(this in that)
{
When you specified your for criteria, you went with integers. However, you could just go with the string, as in
for(i in method) { # i is a string
# versus
for(i in 1:length(method)) { # i is an integer
When you wrote the nested for statement you missed a closing parenthesis.
for(j in 1:length(unique(example$group)) # end parentheses missing!
# should have been
for(j in 1:length(unique(example$group)))
# easier to see like this:
for(j in 1:length(
unique(
example$group
)
)
)
Did you know? You can set RStudio to use 'Rainbow parentheses' which is great for ensuring you don't miss closing parentheses or brackets. Go to Tools -> Global Options -> Code (left menu in popup) -> Display (top menu in menu popup) & 'Rainbow parentheses' is the last item in the list. This is what it looks like with my current appearance settings:
When extracting the group, you selected a dataset row, not a unique value.
# this selects jth row, then looks for unique values
groups_j = unique(example$group[j])
# you need to get the unique values, then iterate
group_j = unique(example$group)[j]
# the j goes outside the call for unique()
In this code, you wrote file and file$fsa. I'm assuming this is equivalent to example and example$group, because I don't have whatever is in file.
All of these lines of code do the same thing. Keep in mind that group is in the data frame, but it's an object in the environment on its own, as well.
my_data_i = example[which(example$group == group_j), ] # this would work
my_data_i <- filter(example, groups == group_j) # this would work
my_data_i <- example[group == group_j, ] # this would work
my_data_i <- example[example$group == group_j, ] # this would work
If you iterated over groups instead of indices, you can skip the creation of group_j which is the only time you used j.
for(j in unique(example$group)) {
my_data_i <- example[example$group == j, ]
}
When you sent the results using i and i alone, you would overwrite the data with each iteration over j.
The first group iteration can go in results[[i]], but the next group, can either be bound to that data or placed in a list within a list.
For example:
results[[1]] <- group 1, method 1
results[[1]] <- rbind(results[[1]], [group 2, method 1])
# or
results[[1]][[1]] <- group 1, method 1
results[[1]][[2]] <- group 2, method 1
Considering those two options for the lists (above), the first will allow your remaining code (creation of final, averages, etc.) to work without any changes. However, if you use the second option (above), that code will require modification.
If you leave for(j with 1:length this will work:
if(j < 2) {
results[[i]] <- final_i
} else {
results[[i]] <- rbind(results[[i]], final_i)
}
If you use for(j in unique(example$group)), you could use this:
if(isTRUE(j == unique(example$group)[1])) { # isTRUE() to avoid null errors
results[[i]] <- final_i
} else {
results[[i]] <- rbind(results[[i]], final_i)
}
Your nested for statements all in one chunk.
results = list()
for (i in 1:length(method)) { # bracket missing here; it was in the wrong place
for (j in 1:length(unique(example$group))) { # missing a parentheses here
# { # this needs to be after each for statement
# groups_j = unique(example$group[j]) # you have selected the jth row, not the jth unique
group_j = unique(example$group)[j] # the selection goes outside the call for unique()
# use things like print or message to check what your function does
# print(group_j)
# message('this is a message ', group_j) # notice the different color in the console?
my_data_i <- example[group == group_j, ] # this would work
method_i = method[i]
name_1_i = paste0("col1_col_2", method_i)
name_2_i = paste0("col3_col_4", method_i)
p1_i = stringdistmatrix(my_data_i$col1, my_data_i$col2, method = method_i, useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_1_i)
p2_i = stringdistmatrix(my_data_i$col3, my_data_i$col4, method = method_i, useNames = "string") %>%
as_tibble(rownames = "a") %>%
pivot_longer(-1, names_to = "b", values_to = name_2_i)
p1_i = p1_i[,3]
p2_i = p2_i[,3]
final_i = cbind(p1_i, p2_i)
# results[[i]] = final_i # you replace this content everytime you change groups
# you need to append the values between groups (assuming you want one column per test type)
# first append, then combine
if(j < 2) { # use < instead of == to avoid null error
results[[i]] <- final_i
} else {
results[[i]] <- rbind(results[[i]], final_i)
}
}
}
My code to accomplish the same task
I've added a few checks and balances to make it more dynamic. You can send any number of columns, methods, or groups to grpComp.
This uses tidyverse, glue, and stringdist.
This first function is called by the other function.
library(tidyverse)
library(stringdist)
library(glue)
strD <- function(c1, c2, mm) { # input column 1; column 2; measurement method
res <- stringdistmatrix(c1, c2, method = mm, useNames = 'string')
f_res <- matrix(res) # extract values and flatten
}
This is the distances by group function.
grpComp <- function(fr, methods, grp) { # data frame of columns to compare,
# methods to use, groups (vector same length as df rows)
cnames <- names(fr)
if(length(cnames) %% 2 != 0) {
message('there are an uneven number of columns to compare')
break # something's wrong
}
if(length(grp) != nrow(fr)) {
message('there groups vector length must match number of rows in the data')
break # something's wrong
}
# extract distances
dists <- map(
method,
function(j) {
str_ds <- map_dfc( # by column sets
seq(from = 1, to = length(cnames), by = 2),
function(i) {
str_gr <- map_dfr( # by group
unique(grp),
function(k) {
as.data.frame(list( # has to be list for `col.names` to work
strD(fr[grp == k, cnames[i]],
fr[grp == k, cnames[i + 1]], j)),
optional = F, row.names = NULL,
col.names = paste0("c", i, i+1, '_', j))
}) # combine groups by rows
str_gr
}) # combine methods by columns
str_ds
}) %>% do.call(cbind, .)
ncnames <- names(dists) %>% substr(1, 3) %>% unique() # determine unique col groups
for(m in ncnames) { # get averages for each comparison set
dists <- mutate(dists,
"ave_{m}" := rowMeans(across(contains(m))) %>% scale())
}
dists <- select(dists, contains('ave'))
}
This is how you would use this code.
test5 <- grpComp(example[, 1:4], methods, example$group)
Even though your function for non-grouped data is working, I thought I would include that code as well.
strComp <- function(fr, methods) { # data frame of columns to compare, methods to use
cnames <- names(fr)
if(length(cnames) %% 2 != 0) {
message('there are an uneven number of columns to compare')
break # something's wrong
}
# extract distances
dists <- map(
method,
function(j) {
str_ds <- map_dfc(
seq(from = 1, to = length(cnames), by = 2),
function(i) {
as.data.frame(list( # has to be list for `col.names` to work
strD(fr[, cnames[i]], fr[, cnames[i + 1]], j)), optional = F,
col.names = paste0("c", i, i+1, '_', j))
})
str_ds
}) %>% do.call(cbind, .)
ncnames <- names(dists) %>% substr(1, 3) %>% unique() # determine unique col groups
for(k in ncnames) { # get averages for each comparison set
dists <- mutate(dists,
"ave_{k}" := rowMeans(across(contains(k))) %>% scale())
}
dists <- select(dists, contains('ave'))
}
To use this function:
test4 <- strComp(example[, 1:4], methods)

Processing a data frame in r by subgroup: is it possible to get rid of the 'for' loop?

I frequently work with data frames and have to run some sophisticated data wrangling / manipulations by subgroup that is defined in one of the columns. I am aware of dplyr and group_by and know that many things could be solved using group_by. However, often I have to do some pretty intricate calculations and end up just using the 'for' loop.
I was wondering about the existence of some other general approach or paradigm that is faster/more elegant. Maybe map (that I am not very familiar with)?
Below is an example. Notice - it is fake and meaningless. So let's ignore why I need to do those things or the fact that there could be 2 consequtive NAs in a column, etc. That's not the focus of my question. The point is that often I have to operate "within the constraints of a subgroup" and then - inside that subgroup - I have to do operations columnwise, rowwise and sometimes even cellwise.
I also realize that I could probably put most of that code inside a function, split my data frame into a list based on 'group', apply this function to each element of that list and then do.call(rbind...) at the end. But is this the only way?
Thanks a lot for any hints!
library(dplyr)
library(forcats)
set.seed(123)
x <- tibble(group = c(rep('a', 10), rep('b', 10), rep('c', 10)),
attrib = c(sample(c("one", "two", "three", "four"), 10, replace = T),
sample(c("one", "two", "three"), 10, replace = T),
sample(c("one", "three", "four"), 10, replace = T)),
v1 = sample(c(1:5, NA), 30, replace = T),
v2 = sample(c(1:5, NA), 30, replace = T),
v3 = sample(c(1:5, NA), 30, replace = T),
n1 = abs(rnorm(30)), n2 = abs(rnorm(30)), n3 = abs(rnorm(30)))
v_vars = paste0("v", 1:3)
n_vars = paste0("n", 1:3)
results <- NULL # Placeholder for final results
for(i in seq(length(unique(x$group)))) { # loop through groups
mygroup <- unique(x$group)[i]
mysubtable <- x %>% filter(group == mygroup)
# IMPUTE NAs in v columns
# Replace every NA with a mean of values above and below it; and if it's the first or
# the last value, with the mean of 2 values below or above it.
for (v in v_vars){ # loop through v columns
which_nas <- which(is.na(mysubtable[[v]])) # create index of NAs for column v
if (length(which_nas) == 0) next else {
for (na in which_nas) { # loop through indexes of column values that are NAs
if (na == 1) {
mysubtable[[v]][na] <- mean(c(mysubtable[[v]][na + 1],
mysubtable[[v]][na + 2]), na.rm = TRUE)
} else if (na == nrow(mysubtable)) {
mysubtable[[v]][na] <- mean(c(mysubtable[[v]][na - 2],
mysubtable[[v]][na - 1]), na.rm = TRUE)
} else {
mysubtable[[v]][na] <- mean(c(mysubtable[[v]][na - 1],
mysubtable[[v]][na + 1]), na.rm = TRUE)
}
} # end of loop through NA indexes
} # end of else
} # end of loop through v vars
# Aggregate v columns (mean) for each value of column 'attrib'
result1 <- mysubtable %>% group_by(attrib) %>%
summarize_at(v_vars, mean)
# Aggregate n columns (sum) for each value of column 'attrib'
result2 <- mysubtable %>% group_by(attrib) %>%
summarize_at(n_vars, sum)
# final result should contain the name of the group
results[[i]] <- cbind(mygroup, result1, result2[-1])
}
results <- do.call(rbind, results)
Maybe this example is too simple, but in this case, the only thing you need to pull out is the imputation.
my_impute <- function(x) {
which_nas <- which(is.na(x))
for (na in which_nas) {
if (na == 1) {
x[na] <- mean(c(x[na + 1], x[na + 2]), na.rm = TRUE)
} else if (na == length(x)) {
x[na] <- mean(c(x[na - 2], x[na - 1]), na.rm = TRUE)
} else {
x[na] <- mean(c(x[na - 1], x[na + 1]), na.rm = TRUE)
}
}
x
}
Then you just need to group appropriately and impute and summarize.
x2 <- x %>% group_by(group) %>% mutate_at(v_vars, my_impute) %>%
group_by(group, attrib)
full_join(x2 %>% summarize_at(v_vars, mean),
x2 %>% summarize_at(n_vars, sum))
My usual method for things like this, where similar calculations need to be on a bunch of columns, is to put it in long format. Here it feels a little like the long way round, but perhaps this would be useful to see.
x %>% mutate(row=1:n()) %>% gather("variable", "value", c(v_vars, n_vars)) %>%
separate(variable, c("var", "x"), sep=1) %>% spread(var, value) %>%
arrange(group, x, row) %>% group_by(group, x) %>%
mutate(v=my_impute(v)) %>% group_by(group, attrib, x) %>%
summarize(v=mean(v), n=sum(n)) %>%
gather("var", "value", v, n) %>% mutate(X=paste0(var, x)) %>%
select(-x, -var) %>% spread(X, value)
More generally, split-apply-combine is probably the way to go, as you suggest in your question; here's a way using the tidyverse.
doX <- function(x) {
x2 <- x %>% mutate_at(v_vars, my_impute) %>% group_by(attrib)
full_join(x2 %>% summarize_at(v_vars, mean),
x2 %>% summarize_at(n_vars, sum))
}
x %>% group_by(group) %>% nest() %>%
mutate(result=map(data, doX)) %>% select(-data) %>% unnest()
The more traditional method is with do.call, split, and rbind; here I don't make the effort to keep the group information.
do.call(rbind, lapply(split(x, x$group), doX))
The first thing to do is to change your data imputing into a function. I made some simple modifications to have it accept a vector and simplified the call to mean.
fx_na_rm <- function(z) {
which_nas <- which(is.na(z))
if (length(which_nas) > 0) {
for (na in which_nas) { # loop through indexes of column values that are NAs
if (na == 1) {
z[na] <- mean(z[na + (1:2)], na.rm = TRUE)
} else if (na == nrow(mysubtable)) {
z[na] <- mean(z[na - (1:2)], na.rm = TRUE)
} else {
z[na] <- mean(z[c(na - 1, na + 1)], na.rm = TRUE)
}
} # end of loop through NA indexes
}
return(z)
}
I like data.table so here's a solution that uses it. Now since you use different functions for the n and v variable groups, most purrr or any other solutions will also be a little funny.
library(data.table)
dt <- copy(as.data.table(x))
v_vars = paste0("v", 1:3)
n_vars = paste0("n", 1:3)
dt[, (v_vars) := lapply(.SD, as.numeric), .SDcols = v_vars]
dt[, (v_vars) := lapply(.SD, fx_na_rm), by = group, .SDcols = v_vars]
# see https://stackoverflow.com/questions/50626316/r-data-table-apply-function-a-to-some-columns-and-function-b-to-some-others
scols <- list(v_vars, n_vars)
funs <- rep(c(mean, sum), lengths(scols))
dt[, setNames(Map(function(f, x) f(x), funs, .SD), unlist(scols))
, by = .(group,attrib)
, .SDcols = unlist(scols)]
The for loop itself is difficult to vectorize because the results can depend on itself. Here is my attempt which is not an identical output to yours:
# not identical
fx_na_rm2 <- function(z) {
which_nas <- which(is.na(z))
if (length(which_nas) > 0) {
ind <- c(rbind(which_nas - 1 + 2 * (which_nas == 1) + -1 * (which_nas == length(z)),
which_nas + 1 + 1 * (which_nas == 1) + -2 * (which_nas == length(z))))
z[which_nas] <- colMeans(matrix(z[ind], nrow = 2), na.rm = T)
}
return(z)
}

Append to dataframe through for loop and function

When I run a code with save as csv at the end everything runs correctly. It means I filter my dataset by country do some continuations and save it as a country file.
When I try to rbind my datasets I receive nothing. I try different solutions which I found but nothing is working. Clearly I do not understand why I can not rbind.
The code:
library(dplyr)
library(readxl)
setwd("Z:/Reporting_Private/Tableau")
dataupl <- read_excel("Analysis Map_Tableau - Ready.xlsm", sheet = "Data")
df = dataupl
#select right columns
df = df[,1:6]
colnames(df)=c("Office", "Employee","Territiry","Sales","Leads","Act")
#change n/a to zero
df[is.na(df)]=0
countries = df %>% select(Office) %>% distinct()
countries = as.data.frame(countries)
engine <- function(input){
df = df %>% filter(Office==input)
SCALESALES = scale(df$Sales)
SCALELEADS= scale(df$Leads)
SCALEACT= scale(df$Act)
df = df %>% mutate(SCALESALES = SCALESALES, SCALELEADS = SCALELEADS, SCALEACT = SCALEACT)
df$SLegend = ave(df$Sales, df$SalesLegend, FUN = min)
df$SLegend = ifelse(df$SLegend>0, df$SLegend,0)
df$LLegend = ave(df$Leads, df$LeadsLegend, FUN = min)
df$ALegend = ave(df$Act, df$ActLegend, FUN = min)
#write.csv(final, file = paste0(input,".csv"))
dftotal = data.frame()
dftotal = rbind(dftotal,df)
}
for (i in 1:nrow(countries)){
input = countries[i,]
engine(input)
}
It seems that you can split your dataframe L <- split(df, df$Office) and then lapply(L, ...) Instead of write.csv(...) you have to return the dataframe: return(df) in your function.
So, something like this:
engine <- function(dfi) {
SCALESALES = scale(dfi$Sales)
SCALELEADS = scale(dfi$Leads)
SCALEACT = scale(dfi$Act)
dfi = dfi %>% mutate(SCALESALES = SCALESALES, SCALELEADS = SCALELEADS, SCALEACT = SCALEACT)
dfi$SLegend = ave(dfi$Sales, dfi$SalesLegend, FUN = min)
dfi$SLegend = ifelse(dfi$SLegend>0, dfi$SLegend, 0)
dfi$LLegend = ave(dfi$Leads, dfi$LeadsLegend, FUN = min)
dfi$ALegend = ave(dfi$Act, dfi$ActLegend, FUN = min)
return(dfi)
}
L <- split(df, df$Office)
Lnew <- lapply(L, engine)
dftotal <- Lnew[[1]]
for (i in 2:length(Lnew)) dftotal <- rbind(dftotal, Lnew[[i]])

How can I extend tidyr::spread() while maintaining the order of column names?

How to expand when maintaining the rank of numbers when using the spread function?
library(tidyverse)
data.frame(time = paste0("t_", 1:100)) %>%
rowwise() %>%
mutate(rnd = sample(1:100, size=1)) %>%
spread(time, rnd)
The column names of the execution result of the code shown above are t_1, t_11, t_100, .....
I want to get column names in order of numbers(t_1, t_2, t_3, ...).
You can try two things:
(1) Make "time" a factor with levels matching the order you want:
data.frame(time = factor(paste0("t_", 1:100), levels = paste0("t_", 1:100))) %>%
rowwise() %>%
mutate(rnd = sample(1:100, size=1)) %>%
spread(time, rnd)
(2) Force the order using a select statement:
data.frame(time = paste0("t_", 1:100)) %>%
rowwise() %>%
mutate(rnd = sample(1:100, size=1)) %>%
spread(time, rnd) %>%
select(paste0("t_", 1:100))
Here is a new function that retains column order. Only one small change is needed (see annotation):
my_spread <- function (data, key, value, fill = NA, convert = FALSE, drop = TRUE,
sep = NULL) {
key_col <- tidyr:::col_name(substitute(key))
value_col <- tidyr:::col_name(substitute(value))
tbl_df(my_spread_(data, key_col, value_col, fill = fill, convert = convert,
drop = drop, sep = sep))
}
my_spread_ <- function (data, key_col, value_col, fill = NA, convert = FALSE,
drop = TRUE, sep = NULL) {
col <- data[key_col]
#col_id <- tidyr:::id(col, drop = drop) # Old line
col_id <- seq_len(nrow(data)) # New line 1
attr(col_id, 'n') <- nrow(data) # New line 2
col_labels <- tidyr:::split_labels(col, col_id, drop = drop)
rows <- data[setdiff(names(data), c(key_col, value_col))]
if (length(rows) == 0) {
row_id <- structure(1L, n = 1L)
row_labels <- as.data.frame(matrix(nrow = 1, ncol = 0))
}
else {
row_id <- id(rows, drop = drop)
row_labels <- tidyr:::split_labels(rows, row_id, drop = drop)
rownames(row_labels) <- NULL
}
overall <- tidyr:::id(list(col_id, row_id), drop = FALSE)
n <- attr(overall, "n")
if (anyDuplicated(overall)) {
groups <- split(seq_along(overall), overall)
groups <- groups[vapply(groups, length, integer(1)) >
1]
str <- vapply(
groups,
function(x) paste0("(", paste0(x, collapse = ", "), ")"), character(1)
)
stop("Duplicate identifiers for rows ", paste(str, collapse = ", "),
call. = FALSE)
}
if (length(overall) < n) {
overall <- match(seq_len(n), overall, nomatch = NA)
}
else {
overall <- order(overall)
}
value <- data[[value_col]]
ordered <- value[overall]
if (!is.na(fill)) {
ordered[is.na(ordered)] <- fill
}
if (convert && !is.character(ordered)) {
ordered <- as.character(ordered)
}
dim(ordered) <- c(attr(row_id, "n"), attr(col_id, "n"))
colnames(ordered) <- enc2utf8( tidyr:::col_names(col_labels, sep = sep))
ordered <- tidyr:::as_data_frame_matrix(ordered)
if (convert) {
ordered[] <- lapply(ordered, type.convert, as.is = TRUE)
}
tidyr:::append_df(row_labels, ordered)
}

randomize observations by groups (blocks)

I have a data frame with I obsevations, and each observation belongs to one of g categories.
set.seed(9782)
I <- 500
g <- 10
library(dplyr)
anon_id <- function(n = 1, length = 12) {
randomString <- c(1:n)
for (i in 1:n)
{
randomString[i] <- paste(sample(c(0:9, letters, LETTERS),
length, replace = TRUE),
collapse = "")
}
return(randomString)
}
df <- data.frame(id = anon_id(n = I, length = 16),
group = sample(1:g, I, T))
I want to randomly assign each observation to one of J "urns", given some vector of probabilities p. That is the probability of being assign to urn J=1 is p[1]. The added complexity is that I want to do this block by block.
If I ignore the blocks, I can do this easily:
J <- 3
p <- c(0.25, 0.5, 0.25)
df1 <- df %>% mutate(urn = sample(x = c(1:J), size = I, replace = T, prob = p))
I thought about this method to do it by "block"
# Block randomization
randomize_block <- function(g) {
df1 <- df %>% filter(group==g)
size <- nrow(df1)
df1 <- df1 %>% mutate(urn = sample(x = c(1:J),
size = size,
replace = T,
prob = p))
return(df1)
}
df2 <- lapply(1:g, randomize_block)
df2 <- data.table::rbindlist(df2)
Is there a better way?
Not sure if this is better, but here is a base R technique with data.frame df, that has group name "group" as well as urn assignments 1:J with assignment probabilities in vector p of length J.
# get urn assignment
urnAssignment <- lapply(unique(df$group),
function(i) sample(1:J, nrow(df[group==i,]), replace =T, prob=p))
# get a list that collects position of observations
obsOrder <- lapply(unique(df$group),
function(i) which(df$group == i))
df$urnAssignment <- unlist(urnAssignment)[unlist(obsOrder)]
randomizr::block_ra does exactly what you want.
library(randomizr)
library(janitor) #just for the tabyl function
block_rand <- as.tibble(randomizr::block_ra(blocks = df$group, conditions = c("urn_1","urn_2","urn_3")))
df2 <- as.tibble(bind_cols(df, block_rand))
df2 %>% janitor::tabyl(group, value)
This does the trick using dplyr:
randomize <- function(data, groups=2, block_id = NULL, p=NULL, seed=9782) {
if(is.null(p)) p <- rep(1/groups, groups)
if(is.null(block_id)){
df1 <- data %>%
mutate(Treatment = sample(x = c(1:groups),
size = n(),
replace = T,
prob = p))
return(df1)
}else{
df1 <- data %>% group_by_(block_id) %>%
mutate(Treatment = sample(x = c(1:groups),
size = n(),
replace = T,
prob = p))
}
}
df1 <- randomize(data = df, groups = J, block_id = "group", p = p, seed = 9782)

Resources