loop through 2 dataframes - r

I am new to R and trying to loop through each row of df1 and search for rows in df2 that are close in distance (5mi/8046.72m). I think df1 is looping as intended but I don't think it is going through all of df2.
{for (i in 1:1452) {
  p1 <- df1[i, 4:5]
  p2 <- df2[1:11, 2:3]
  d <- distCosine(p1, p2, r = 6378137)
  return(d < 8046.72)
  i <- i + 1
}}
I get the output:
[1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

I would just use an apply function. (As a side note, the return(d < 8046.72) in your loop exits after the first iteration, which is why you only ever see the 11 results for row 1 of df1.) First, let's make your problem reproducible by creating some "fake" data - I am making the lon/lat pairs artificially close so that we can get a few TRUE's back in the results:
library(geosphere)
df1 <- data.frame(x1 = sample(letters, 100, replace = T),
                  x2 = sample(letters, 100, replace = T),
                  x3 = sample(letters, 100, replace = T),
                  lon = sample(10:12 + rnorm(100, 0, 0.1), 100, replace = T),
                  lat = sample(10:12 + rnorm(100, 0, 0.1), 100, replace = T))
df2 <- data.frame(x1 = sample(letters, 100, replace = T),
                  lon = sample(10:12 + rnorm(100, 0, 0.1), 100, replace = T),
                  lat = sample(10:12 + rnorm(100, 0, 0.1), 100, replace = T))
We can then create two matrices containing the values of interest:
m1 <- as.matrix(df1[, c("lon", "lat")])
m2 <- as.matrix(df2[1:11, c("lon", "lat")])
Now we can use apply across the rows of m2, which returns a 100 x 11 matrix (rows index df1, columns index the 11 rows taken from df2):
results <- apply(m2, 1, FUN = function(x) distCosine(x, m1))
To get the results under 5 mi (~8046.72 m), we simply subset:
results[results < 8046.72]
# Showing the next two for alternative output
which(results < 8046.72)
which(results < 8046.72, arr.ind = T)
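If you also want to know which pairs matched, the arr.ind = TRUE form is the handy one. A short sketch (in this layout, "row" indexes rows of df1/m1 and "col" indexes the 11 rows taken from df2/m2):
hits <- which(results < 8046.72, arr.ind = TRUE)
# one row per matched pair within 5 mi
matched <- data.frame(df1_row = hits[, "row"], df2_row = hits[, "col"])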
Note: In your question, it looks like you are interested in the first 1,452 rows -- this would mean the results would be a 1,452 x 11 matrix.

Related

Remove columns with uniform values

I have a dataframe with 100+ columns. I want to filter those columns based on their non-uniformity.
Ex. if there are columns with more than 90% (or 95%, or 99%) NAs (or 0s, or -999, or whatever value), remove them from the dataframe.
I can remove columns full of NAs or 0s, but the problem is that I don't know in advance what the value will be.
Ex. of removing columns with more than 90% NAs: df[, which(colMeans(!is.na(df)) > 0.9)]
I would simply use table to count the number of occurrences of each value; when the maximum of these counts exceeds the needed threshold, you can discard the column.
In the following toy example, x, y and z are "constant": x is 96% NA, y is 99% zeros, and z is 97% -1 (but any value would work).
set.seed(26012023)
df <- data.frame(w = rnorm(100),
                 x = c(rep(NA, 96), rnorm(4)),
                 y = c(rep(0, 99), rnorm(1)),
                 z = c(rep(-1, 97), rnorm(3)))
apply(df, 2, function(x, cutoff = .95) {
  tab <- table(x, useNA = "ifany")
  max_val <- max(tab)
  max_val >= cutoff * length(x)
})
# w x y z
# FALSE TRUE TRUE TRUE
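To actually drop the flagged columns rather than just report them, keep the logical vector and subset with it:
is_uniform <- apply(df, 2, function(x, cutoff = .95) {
  max(table(x, useNA = "ifany")) >= cutoff * length(x)
})
df[, !is_uniform, drop = FALSE]  # keeps only w in this example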
We can create a toy example by defining the following data.frame, named df:
# Seed to make it reproducible
set.seed(12345)
df <- data.frame(cbind(Var1 = c(rep(10, 19), 1),
                       Var2 = sample(letters[1:5], 20, prob = c(0.8, 0.1, 0.5, 0.25, 0.25), replace = T),
                       Var3 = sample(c("Yes", "No"), 20, prob = c(.95, .05), replace = T),
                       Var4 = sample(1:3, 20, replace = T),
                       Var5 = c(rep(NA, 15), rep(1, 5))))
Then we compute the maximum relative frequency of a single value for each column, and finally we delete the columns that exceed the required threshold:
# Calculate the maximum frequency for a single value for each column
aux <- apply(df,2,function(x) max(prop.table(table(x, useNA = "ifany"))))
# Define new.df as df whithout the columns that have a value more than a 90% of times
new.df <- df[,-which(aux>.9)]
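One caveat with the negative indexing: if no column crosses the threshold, which(aux > .9) is empty, and df[, -which(aux > .9)] then silently drops every column. Subsetting with the logical condition directly avoids that edge case:
new.df <- df[, aux <= .9, drop = FALSE]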

Using app function from {terra} package on raster stacks? (in parallel)

I have four high resolution rasters for a country. I have split each raster into tiles and done some other processing to them. I now want to apply a function to each cell, of each 'stack' of the raster tiles, to produce one set of output tiles. The function is a little complex. I have tried to synthesise some data below to reproduce my current approach. It works (ish) but I'm convinced that there's a better way to do this. To use parallel processing on my unix box, I simply swap mapply for mcmapply, but I haven't done that in the example below as I presume many will be working on Windows machines. I'd welcome ideas on my approach and particularly optimisation.
library("terra")
library("glue")
## Make some toy data
dir.create("temp_folder")
dir.create("result_folder")
x <- rast(ncols = 10, nrows = 10)
## Each of the four layers is built the same way, so a helper avoids repeating the block four times
make_tiles <- function(tag) {
  r <- rast(ncol = 100, nrow = 100)
  some_values <- as.integer(runif(10000, min = 1, max = 100))
  ind <- which(some_values %in% sample(some_values, 15))
  some_values[ind] <- NA
  values(r) <- some_values
  makeTiles(r, x, glue("temp_folder/tile_{tag}_{1:100}.tif"), overwrite = TRUE)
}
a_tiles <- make_tiles("a")
b_tiles <- make_tiles("b")
c_tiles <- make_tiles("c")
d_tiles <- make_tiles("d")
## Outer function so that this can be used in parallel? But maybe this is a silly way to do it?
outer_function <- function(a_tiles, b_tiles, c_tiles, d_tiles, output_files) {
  one_a_tile <- rast(unlist(a_tiles))
  one_b_tile <- rast(unlist(b_tiles))
  one_c_tile <- rast(unlist(c_tiles))
  one_d_tile <- rast(unlist(d_tiles))
  output_file <- output_files
  # I replace any NAs with 0 as an NA will break my 'if' statement of the inner_function.
  # I get Error in if (z["a"] <= z["b"]) { : missing value where TRUE/FALSE needed
  one_a_tile[is.na(one_a_tile)] <- 0
  one_b_tile[is.na(one_b_tile)] <- 0
  one_c_tile[is.na(one_c_tile)] <- 0
  one_d_tile[is.na(one_d_tile)] <- 0
  z <- sds(one_a_tile, one_b_tile, one_c_tile, one_d_tile)
  ## Inner function that actually does the work I want doing.
  ## (Note the three conditions overlap: when a == b all three are TRUE,
  ## so y is computed three times and the last branch wins.)
  inner_function <- function(z) {
    names(z) <- c('a', 'b', 'c', 'd')
    if (z['a'] <= z['b']) {
      y <- rowSums(cbind((z['c'] + z['a'] * 10),
                         (z['c'] + z['a'] * 20)))
    }
    if (z['a'] >= z['b']) {
      y <- rowSums(cbind((z['c'] + z['a'] * 40),
                         (z['c'] + z['a'] * 10)))
    }
    if (z['a'] == z['b']) {
      y <- rowSums(cbind((z['c'] + z['a'] * 60),
                         (z['c'] + z['a'] * 10)))
    }
    y <- ifelse(y == 0, NA, y)
    return(y)
  }
  app(z,
      inner_function,
      filename = output_file,
      overwrite = TRUE,
      wopt = list(datatype = "INT4U"))
  return(output_file)
}
output_files <- glue("result_folder/result_tile_{seq_along(d_tiles)}.tif")
results <- mapply(outer_function,
                  a_tiles = a_tiles,
                  b_tiles = b_tiles,
                  c_tiles = c_tiles,
                  d_tiles = d_tiles,
                  output_files = output_files)
names(results) <- NULL
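As mentioned above, on a unix box the parallel version is a drop-in swap of mapply for mcmapply. A minimal sketch (the mc.cores choice is my assumption, not from the original post):
library(parallel)
results <- mcmapply(outer_function,
                    a_tiles = a_tiles,
                    b_tiles = b_tiles,
                    c_tiles = c_tiles,
                    d_tiles = d_tiles,
                    output_files = output_files,
                    mc.cores = max(1, detectCores() - 1))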
unlink("temp_folder", recursive = TRUE)
unlink("result_folder", recursive = TRUE)

R bootstrapping for the two dataframe individual column wise

I want to do bootstrapping while comparing two dataframes column-wise, where the dataframes have different numbers of rows.
I have two dataframes in which rows represent values from experiments and columns carry the dataset names (data1, data2, data3, data4):
emp.data1 <- data.frame(
  data1 = c(234, 0, 34, 0, 46, 0, 0, 0, 2.26, 0, 5, 8, 93, 56),
  data2 = c(1.40, 1.21, 0.83, 1.379, 2.60, 9.06, 0.88, 1.16, 0.64, 8.28, 5, 8, 93, 56),
  data3 = c(0, 34, 43, 0, 0, 56, 0, 0, 0, 45, 5, 8, 93, 56),
  data4 = c(45, 0, 545, 34, 0, 35, 0, 35, 0, 534, 5, 8, 93, 56),
  stringsAsFactors = FALSE
)
emp.data2 <- data.frame(
  data1 = c(45, 0, 0, 45, 45, 53),
  data2 = c(23, 0, 45, 12, 90, 78),
  data3 = c(72, 45, 756, 78, 763, 98),
  data4 = c(1, 3, 65, 78, 9, 45),
  stringsAsFactors = FALSE
)
I am trying to do bootstrapping (n = 1000). Values are selected at random with replacement from emp.data1 (14 x 4), while emp.data2 (6 x 4) stays unchanged. For example: from the first column of emp.data2 (data1), take the column sum of its 6 values; from emp.data1's data1, select 6 random non-zero values and take their column sum; divide the two sums, and store the ratio in temp. Repeat this 1000 times and take the median at the end. I want to do this for each column of the dataframes. The sample code I am providing works fine, but I am not able to restrict the random draws from emp.data1 to non-zero values.
nboot <- 1e3
boot_temp_emp <- c()
n_data1 <- nrow(emp.data1); n_data2 <- nrow(emp.data2)
for (j in seq_len(nboot)) {
  boot <- sample(x = seq_len(n_data1), size = n_data2, replace = TRUE)
  value <- colSums(emp.data2) / colSums(emp.data1[boot, ])
  boot_temp_emp <- rbind(boot_temp_emp, value)
}
boot_data <- apply(boot_temp_emp, 2, median)
With the above script I am able to get output, but each sampled column of emp.data1[boot, ] contains zero values that get included in the sum. I want each column sum to be over individually, randomly selected non-zero values, so I tried the script below, but I am not able to remove the zero values or get the desired output. Please can someone help me correct my script?
nboot <- 1e3
boot_temp_emp <- c()
for (i in colnames(emp.data2)) {
  for (j in seq_len(nboot)) {
    data1 <- emp.data1[i]
    data2 <- emp.data2[i]
    n_data1 <- nrow(data1); n_data2 <- nrow(data2)
    boot <- sample(x = seq_len(n_data1), size = n_data2, replace = TRUE)
    value <- colSums(data2[i]) / colSums(data1[boot, , drop = FALSE])
    boot_temp_emp <- rbind(boot_temp_emp, value)
  }
}
boot_data <- apply(boot_temp_emp, 2, median)
Thank you
Here is a solution.
Write a function to make the code clearer. This function takes the following arguments:
x, the input data.frame emp.data1;
s2, the column sums of emp.data2;
n, the number of vector elements to sample from emp.data1's columns, with a default value of 6.
Then create a results matrix, pre-compute the column sums of emp.data2, and call the function in a loop.
boot_fun <- function(x, s2, n = 6) {
  # the repeat loop makes sure there is no division by zero
  nrx <- nrow(x)
  repeat {
    i <- sample(nrx, n, replace = TRUE)
    s1 <- colSums(x[i, ])
    if (all(s1 != 0)) break
  }
  s2 / s1
}
set.seed(2022)
nboot <- 1e3
sums2 <- colSums(emp.data2)
results <- matrix(nrow = nboot, ncol = ncol(emp.data1))
for (i in seq_len(nboot)) {
  results[i, ] <- boot_fun(emp.data1, sums2)
}
ratios_medians <- apply(results, 2, median)
old_par <- par(mfrow = c(2, 2))
for (j in 1:4) {
  main <- paste0("data", j)
  hist(results[, j], main = main, xlab = "ratios", freq = FALSE)
  abline(v = ratios_medians[j], col = "blue", lty = "dashed")
}
par(old_par)
Edit
Following the comments, here is a revised version of the bootstrap function. It makes sure there are no zeros in the sampled vectors before computing their sums.
boot_fun2 <- function(x, s2, n = 6) {
  nrx <- nrow(x)
  ncx <- ncol(x)
  s1 <- numeric(ncx)
  for (j in seq.int(ncx)) {
    repeat {
      i <- sample(nrx, n, replace = TRUE)
      if (all(x[i, j] != 0)) {
        s1[j] <- sum(x[i, j])
        break
      }
    }
  }
  s2 / s1
}
set.seed(2022)
nboot <- 1e3
sums2 <- colSums(emp.data2)
results2 <- matrix(nrow = nboot, ncol = ncol(emp.data1))
for (i in seq_len(nboot)) {
  results2[i, ] <- boot_fun2(emp.data1, sums2)
}
ratios_medians2 <- apply(results2, 2, median)
old_par <- par(mfrow = c(2, 2))
for (j in 1:4) {
  main <- paste0("data", j)
  hist(results2[, j], main = main, xlab = "ratios", freq = FALSE)
  abline(v = ratios_medians2[j], col = "blue", lty = "dashed")
}
par(old_par)
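If it is acceptable to sample directly from each column's non-zero entries, the repeat loops can be dropped entirely; rejecting draws that contain zeros is equivalent to never drawing them, so this gives the same distribution. A sketch (boot_fun3 is a hypothetical name):
boot_fun3 <- function(x, s2, n = 6) {
  # sample n values per column from the non-zero entries only
  s1 <- vapply(x, function(col) sum(sample(col[col != 0], n, replace = TRUE)),
               numeric(1))
  s2 / s1
}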

Creating a t-test loop over a dataframe using an index

So, let's say I have a 1000-row, 6-column dataframe whose columns are a1, a2, b1, b2, c1, c2. I want to run paired t-tests on the a's, b's, and c's and get an output df with 3 columns for the t-values of a, b, c and another three for the significance information for those values, making it a total of 6 columns. The problem I have is with the rows: I want to loop over chunks of 20 rows, rendering the output a (1000/20 =) 50-row, 6-column df.
I have already tried creating an index column for my initial df, which repeats a 1 for the first 20 rows, a 2 for the next 20 rows, and so on:
library(magrittr)
convert_n <- function(df) {
  df <- df %T>% {.$n_for_t_tests = rep(c(1:(nrow(df)/20)), each = 20)}
}
df <- convert_n(df)
However, I can't seem to find a way to properly utilize the items in this column as indices for a "for" or any kind of loop.
Below you can see the relevant code that creates a 1-row, 6-column df; I need to modify the [0:20] parts and create a loop that does this for all 50 groups and binds the results.
t_test_a <- t.test(df$a1[0:20], df$a2[0:20], paired = T, conf.level = 0.95)
t_test_b <- t.test(df$b1[0:20], df$b2[0:20], paired = T, conf.level = 0.95)
t_test_c <- t.test(df$c1[0:20], df$c2[0:20], paired = T, conf.level = 0.95)
t_tests_df <- data.frame(t_a = t_test_a$statistic[["t"]],
                         t_b = t_test_b$statistic[["t"]],
                         t_c = t_test_c$statistic[["t"]])
t_tests_df <- t_tests_df %T>%
  {.$dif_significance_a = ifelse(.$t_a > 2, "YES", "NO")} %T>%
  {.$dif_significance_b = ifelse(.$t_b > 2, "YES", "NO")} %T>%
  {.$dif_significance_c = ifelse(.$t_c > 2, "YES", "NO")} %>%
  dplyr::select(t_a, dif_significance_a,
                t_b, dif_significance_b,
                t_c, dif_significance_c)
Thank you in advance for your help.
You can use split() and sapply():
set.seed(42)
df <- data.frame(a1 = sample(1000, 1000), a2 = sample(1000, 1000),
                 b1 = sample(1000, 1000), b2 = sample(1000, 1000),
                 c1 = sample(1000, 1000), c2 = sample(1000, 1000))
group <- gl(50, 20)
D <- split(df, group)
myt <- function(Di)
  with(Di, c(at = t.test(a1, a2)$statistic, ap = t.test(a1, a2)$p.value,
             bt = t.test(b1, b2)$statistic, bp = t.test(b1, b2)$p.value,
             ct = t.test(c1, c2)$statistic, cp = t.test(c1, c2)$p.value))
sapply(D, FUN = myt)  ### or
t(sapply(D, FUN = myt))
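To get from there to the 50 x 6 data.frame with YES/NO flags that the question describes, one sketch (reusing the question's t > 2 rule; note the question's own calls use paired = TRUE, which could be passed to each t.test inside myt):
res <- as.data.frame(t(sapply(D, FUN = myt)))
names(res) <- c("t_a", "p_a", "t_b", "p_b", "t_c", "p_c")
res$dif_significance_a <- ifelse(res$t_a > 2, "YES", "NO")
res$dif_significance_b <- ifelse(res$t_b > 2, "YES", "NO")
res$dif_significance_c <- ifelse(res$t_c > 2, "YES", "NO")
head(res)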
This is not the prettiest, but I did a for loop like this:
df <- data.frame(a1 = sample(1000, 1000),
                 a2 = sample(1000, 1000),
                 b1 = sample(1000, 1000),
                 b2 = sample(1000, 1000),
                 c1 = sample(1000, 1000),
                 c2 = sample(1000, 1000))
df_ttest <- data.frame(p_a = c(1:50),
                       t_a = c(1:50),
                       p_b = c(1:50),
                       t_b = c(1:50),
                       p_c = c(1:50),
                       t_c = c(1:50))
index <- 0:50 * 20
for (i in 1:50) {
  # chunk i covers rows (index[i] + 1) : index[i + 1], i.e. 1:20, 21:40, ...
  rows <- (index[i] + 1):index[i + 1]
  df_ttest$p_a[i] <- t.test(df$a1[rows])$p.value
  df_ttest$p_b[i] <- t.test(df$b1[rows])$p.value
  df_ttest$p_c[i] <- t.test(df$c1[rows])$p.value
  df_ttest$t_a[i] <- t.test(df$a1[rows])$statistic
  df_ttest$t_b[i] <- t.test(df$b1[rows])$statistic
  df_ttest$t_c[i] <- t.test(df$c1[rows])$statistic
}
This gives a 50 x 6 dataframe with separate columns of p and t values for every 20-row chunk of a, b, and c.
You could even go further and make a nested for loop to cycle through each row in df_ttest to make this a bit prettier.

specify every combination of the features then calculate, extract, and store values

I have a classification variable and a few dozen ordinal features. I'd like to find the smallest subset of features that, when summed, produces the most accurate classification. I'm trying to specify every combination of the features, calculate a sum score for each combination, and then identify the best cutoff point to maximize sensitivity and specificity. Here's what I've tried:
library(gtools)
library(OptimalCutpoints)
set.seed(2)
# create fake data for 1 classification variable and just 5 features
df <- data.frame(class = sample(0:1, 50, replace = T),
                 v01 = sample(0:3, 50, replace = T),
                 v02 = sample(0:3, 50, replace = T),
                 v03 = sample(0:3, 50, replace = T),
                 v04 = sample(0:3, 50, replace = T),
                 v05 = sample(0:3, 50, replace = T))
# combinations
vars <- list()
out <- list()
for (i in 2:(length(df)-1)) {
  p <- combinations(n = length(df)-1, r = i, v = names(df[2:(length(df))]))
  for (r in 1:nrow(p)) {
    keep <- c("class", p[r, ])
    df_ <- df[, keep]
    df_$T <- rowSums(df_[, 2:length(keep)])
    oc <- summary(optimal.cutpoints(X = "T",
                                    status = "class",
                                    tag.healthy = 0,
                                    methods = "SpEqualSe",
                                    data = df_,
                                    pop.prev = NULL,
                                    categorical.cov = NULL,
                                    control = control.cutpoints(),
                                    ci.fit = TRUE,
                                    conf.level = 0.95,
                                    trace = FALSE))
    name <- paste(i, r, sep = ".")
    # append(vars, ...) copies the whole accumulated list on every pass,
    # so vars and out grow quadratically
    vars[[name]] <- append(vars, p[r, ])
    out[[name]] <- append(out, oc)  # when I inspect out R stalls
  }
}
I don't think I'm going about this the right way.
This might (a) drive the anti-looping establishment crazy and (b) be super slow when the number of variables increases and the number of combinations goes through the roof, but I think it "works".
library(gtools)
library(OptimalCutpoints)
# create fake data
df <- data.frame(class = sample(0:1, 50, replace = T),
                 v01 = sample(0:3, 50, replace = T),
                 v02 = sample(0:3, 50, replace = T),
                 v03 = sample(0:3, 50, replace = T),
                 v04 = sample(0:3, 50, replace = T),
                 v05 = sample(0:3, 50, replace = T))
The basic idea is to loop through each combination of variables, from sets of 2 through 5 variables. For each combination, I calculate a scale score, then determine the optimal cutpoints. I extract the details of the optimal.cutpoints object and store them in a dataframe that grows with each pass.
# combinations
dfoc <- as.data.frame(NULL)
ri <- 1
for (i in 2:(length(df)-1)) {
  p <- combinations(n = length(df)-1, r = i, v = names(df[2:(length(df))]))
  for (r in 1:nrow(p)) {
    keep <- c("class", p[r, ])
    v <- keep[-1]
    df_ <- df[, keep]
    df_$T <- rowSums(df_[, 2:length(keep)])
    oc <- summary(optimal.cutpoints(X = "T",
                                    status = "class",
                                    tag.healthy = 0,
                                    methods = "SpEqualSe",
                                    data = df_,
                                    control = control.cutpoints(),
                                    ci.fit = TRUE,
                                    conf.level = 0.95,
                                    trace = FALSE))
    dfoc[ri, 1]  <- i                        # number of vars in set
    dfoc[ri, 2]  <- r                        # combination number
    dfoc[ri, 3]  <- paste(v, collapse = ",") # var names in set
    dfoc[ri, 4]  <- oc$p.table$Global$SpEqualSe[[1]][1]    # cutoff
    dfoc[ri, 5]  <- oc$p.table$Global$SpEqualSe[[1]][2]    # sen
    dfoc[ri, 6]  <- oc$p.table$Global$SpEqualSe[[1]][3]    # spe
    dfoc[ri, 7]  <- oc$p.table$Global$SpEqualSe[[1]][4]    # ppv
    dfoc[ri, 8]  <- oc$p.table$Global$SpEqualSe[[1]][5]    # npv
    dfoc[ri, 9]  <- oc$p.table$Global$SpEqualSe[[1]][2, 2] # sen l95
    dfoc[ri, 10] <- oc$p.table$Global$SpEqualSe[[1]][2, 3] # sen u95
    dfoc[ri, 11] <- oc$p.table$Global$SpEqualSe[[1]][3, 2] # spe l95
    dfoc[ri, 12] <- oc$p.table$Global$SpEqualSe[[1]][3, 3] # spe u95
    dfoc[ri, 13] <- oc$p.table$Global$SpEqualSe[[1]][4, 2] # ppv l95
    dfoc[ri, 14] <- oc$p.table$Global$SpEqualSe[[1]][4, 3] # ppv u95
    dfoc[ri, 15] <- oc$p.table$Global$SpEqualSe[[1]][5, 2] # npv l95
    dfoc[ri, 16] <- oc$p.table$Global$SpEqualSe[[1]][5, 3] # npv u95
    dfoc[ri, 17] <- oc$p.table$Global$AUC_CI               # auc
    ri <- ri + 1
    remove(df_); remove(keep); remove(v); remove(oc)
  }
}
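With every combination scored, picking the "best" subset is just a sort over the stored columns. A sketch using the default V1..V17 column names created above (V1 = set size, V5 = sensitivity, V6 = specificity; maximizing sen + spe and preferring smaller sets is one reasonable rule, not the only one):
# rank subsets: highest sensitivity + specificity first, ties go to smaller sets
dfoc_ranked <- dfoc[order(-(dfoc$V5 + dfoc$V6), dfoc$V1), ]
head(dfoc_ranked[, 1:6])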
