sample rows from dataframe using for loop in R - r

There is a df given with nrow=600 and ncol=18
Now I need to draw 10000 samples from each of these columns with replacement.
According to the specifications first I need to create an empty matrix:
df1 <- as.data.frame(matrix(NA,nrow = 10000,ncol=18))
now I want to use for loop to do all the samples(for each column) at once:
for (i in 1:18){
df1[1:10000, i) <- sample(df[,i], 10 000, replace=true)
when I run this code, my df1 is still empty.
Can anyone help?
Many thanks in advance

There are syntax issues in your code. Try the following :
# Number of draws to take from every column.
n_draws <- 10000
# Pre-allocate the result: one NA-filled column per column of df.
df1 <- as.data.frame(matrix(NA, nrow = n_draws, ncol = ncol(df)))
# seq_along(df) walks the columns of df without hard-coding how many there are.
for (i in seq_along(df)) {
  # Resample column i with replacement; each column is drawn independently.
  df1[, i] <- sample(df[, i], n_draws, replace = TRUE)
}
Without an explicit for loop you can also use sapply/lapply :
# With `sapply`: resample every column of df, 10000 draws each (same size as the loop above).
df1 <- as.data.frame(sapply(df, sample, 10000, replace = TRUE))
# Using `lapply`: same resampling, with the per-column results bound into a data frame.
df1 <- do.call(cbind.data.frame, lapply(df, sample, 10000, replace = TRUE))
It works for the data shared in comments.
# Build the data frame from V1, V2 and V3 (the data shared in the comments; not defined here).
df <- data.frame(V1, V2, V3)
# Pre-allocate the 10000 x 3 result, filled with NA.
df1 <- as.data.frame(matrix(NA,nrow = 10000,ncol=3))
# Fill each column with 10000 draws (with replacement) from the matching column of df.
for (i in 1:3) {
df1[, i] <- sample(df[, i], 10000, replace = TRUE)
}
dim(df1)
#[1] 10000 3
head(df1)
# V1 V2 V3
#1 0.02527926 0.039423826 0.097738594
#2 0.03391239 0.039423826 0.036153091
#3 0.03919354 -0.004922473 0.097738594
#4 -0.06703827 0.039423826 0.097738594
#5 0.02168909 0.048176052 0.036153091
#6 0.02527926 0.074435079 -0.009444024

Related

How to write a loop for the following code in R?

for my script, just the finish loop is missing. Would be great if someone could help. Find attached the example dataset.
library(dplyr)
set.seed(94756)
mat1 <- matrix(sample(seq(-1,100, 0.11),70, replace = TRUE),ncol = 5)
mat1 <- as_tibble(mat1)
mat2 <- matrix(sample(seq(-1,100, 0.11),70, replace = TRUE),ncol = 5)
mat2 <- as_tibble(mat2)
mat2[3,1] <- NA
mat2[6,1] <- NA
mat3 <- matrix(sample(seq(-1,100, 0.11), 70,replace = TRUE),ncol = 5)
mat3 <- as_tibble(mat3)
mat3[4,1] <- NA
data <- list(mat1, mat2, mat3)
library(purrr)
data1 <- map(data, ~add_column(., V1_logical = between(.$V1, 20, 80), .after = 'V1'))
r_pre <- lapply(data1, "[", 2)
data2 <- lapply(data1, function(x) {x$V1_logical[x$V1_logical== TRUE] <- 1; x})
data3 <- lapply(data2, function(x) {x$V1_logical[x$V1_logical== FALSE] <- 0; x})
data4 <- map(data3, ~add_column(., ind = rleid(.$V1_logical), .after = "V1_logical"))
rfun <- function(x) with(rle(x$V1_logical), tibble(lengths, values))
rfun1 <- purrr::map_dfr(data4, rfun)
And then I would like to write the following within a loop:
marker <- as.numeric(min(which(rfun1$values == 1 & rfun1$lengths >= 3)))
rfun1 <- add_column(rfun1, marker = rfun1$values == 1 & rfun1$lengths >= 3, .after = "values")
data_drop <- rfun1[c(1:marker),]
data_drop_c <- as.numeric(sum(data_drop$lengths))
Then create the final data frame by subtracting the dropped rows, something like this for every data frame within the list: final_df <- data4[-c(1:data_drop_c), 4] #for all dataframes within list Because rfun1 combines all data frames into one, I would like the loop to tell me the position of the marker, cut away the data before it, and count 4 rows after the marker. Then it should move on to the next data frame in the list (starting at the id of list element two and counting up to the marker position).
So therefore it would be helpful probably to add an ID for each dataframe for rfun1 as well, somehow like this (but it's for data4 here, wish the same for rfun1) ...
data5 <- bind_rows(data4, .id = "i") %>% group_by(i) %>% count(ind)
Here in data5 in "ind" I don't know if the number is standing for logical TRUE or FALSE. I am only searching for TRUE >=3. So if I could add an id col for rfun1 for every dataframe within a list and run the asked loop, it should work out.
Thanks in advance!

Extract a value from a dataframe iteratively (R)

I have a function to select a value from a dataframe. I want to select that value, save it, remove it from the dataset, and select a value using the same function from the remaining values in the dataframe. What is the best way to do this?
Here is a simple example:
V1 <- c(5,6,7,8,9,10)
df <- data.frame(V1)
V2 <- as.data.frame(matrix(nrow=3,ncol=1))
maximum <- function(x){
max(x)
}
V2[i,]<- maximum(df)
df <- anti_join(df,V2,by='V1')
How can I set this up such that I reapply the maximum function to the remaining values in df and save these values in V2?
I'm using a different and more complex set of functions and if/else statements than max - this is just an example. I do have to reapply the function to the remaining values, because I will be using the function on a new dataframe if df is empty.
Is this what you're looking for?
V1 <- data.frame(origin = c(5,6,7,8,9,10))
V2 <- as.data.frame(matrix(nrow=3,ncol=1))
df1 <- V1
df2 <- V2
# Move the largest value of df1's first column into df2, one value per call,
# recursing until `depth` values have been extracted.
#
# Arguments:
#   df1   data frame to extract from; only its first column is inspected.
#   df2   data frame that receives the extracted values in its first column,
#         one row per extraction (row `count`).
#   depth number of values to extract in total.
#   count index of the current extraction; callers normally leave the default.
#
# Returns an unnamed list: the remaining rows of df1, then the filled df2.
# If df1 runs out of usable values before `depth` is reached, the recursion
# stops early and the untouched rows of df2 keep their original content.
recursive_function <- function(df1, df2, depth = 3, count = 1) {
  # Find index of the current maximum; which.max() returns integer(0) when
  # there is nothing left to pick (no rows, or only NA values).
  indx <- which.max(df1[, 1])
  if (length(indx) == 0L) {
    return(list(df1, df2))
  }
  # Add value to new data frame
  df2[count, 1] <- df1[indx, 1]
  # Remove value from old data frame; drop = FALSE keeps it a data frame
  # even though it has a single column.
  df1 <- df1[-indx, , drop = FALSE]
  if (count == depth) {
    return(list(df1, df2))
  }
  recursive_function(df1, df2, depth, count + 1)
}
recursive_function(df1,df2)
Here is another solution that I stumbled across:
V1 <- c(5,6,7,8,9,10)
df <- data.frame(V1)
# Collect the smallest value of df `maxRun` times, removing each collected
# value from df before the next pass.
#
# Arguments:
#   df     data frame with a column named V1 (the join below is keyed on it).
#   maxRun number of values to extract.
#
# Returns a one-column data frame (column V1) with `maxRun` rows.
minFun <- function(df, maxRun) {
  V2 <- as.data.frame(matrix(nrow = maxRun, ncol = 1))
  # seq_len() gives an empty loop for maxRun = 0; 1:maxRun would iterate
  # over c(1, 0) and extract a value anyway.
  for (i in seq_len(maxRun)) {
    V2[i, ] <- min(df)
    # Keep only the rows of df whose V1 has not been collected in V2 yet.
    df <- dplyr::anti_join(df, V2, by = "V1")
  }
  V2
}
test <- minFun(df = df, maxRun = 3)
test

multiply all columns in data frame in R

I need to multiply all columns in a data frame with each other. As an example, I need to achieve the following:
mydata$C1_2<-mydata$sic1*mydata$sic2
but for all my columns with values going from 1 to 733 (sic1, sic2, sic3,..., sic733).
I've tried the following but it doesn't work:
for(i in 1:733){
for(j in 1:733){
mydata$C[i]_[j]<-mydata$sic[i]*mydata$sic[j]
}
}
Could you help me? Thanks for your help.
Despite the question if you really want what you think you want, I feel like this could help:
df <- data.frame(
a = 1:4
, b = 1:4
, c = 4:1
)
# Element-wise product of two columns of df, selected by name.
multiplyColumns <- function(name1, name2, df) {
  left <- df[, name1]
  right <- df[, name2]
  left * right
}
combinations <- expand.grid(names(df), names(df), stringsAsFactors = FALSE)
names4result <- paste(combinations[,1], combinations[,2], sep = "_")
result <- as.data.frame(mapply(multiplyColumns, combinations[,1], combinations[,2], MoreArgs = list(df = df)))
names(result) <- names4result
result

apply a function with two dataframes as input in r

I want to get the total number of NAs that mismatch between two dataframes.
I have found the way to get this for two vectors as follows:
# Count the positions at which v1 and v2 differ, treating NA in both
# vectors as a match and NA in only one of them as a mismatch.
#
# Arguments:
#   v1, v2 vectors compared element by element.
#
# Returns the number of mismatching positions (0 for empty input).
compareNA <- function(v1, v2) {
  same <- (v1 == v2) | (is.na(v1) & is.na(v2))
  # Comparing NA with a non-NA value leaves NA in `same`; count it as a mismatch.
  same[is.na(same)] <- FALSE
  # Summing the negated mask replaces the index loop, which failed on
  # zero-length input because 1:length(same) iterates over c(1, 0).
  sum(!same)
}
Let's say I have vectors a and b; when comparing them I get 2 as a result:
a <- c(1,2,NA, 4,5,6,NA,8)
b <- c(NA,2,NA, 4,NA,6,NA,8)
h <- compareNA(a,b)
h
[1] 2
My question is: how to apply this function for dataframes instead of vectors?
Take these dataframes as an example:
a2 <- c(1,2,NA,NA,NA,6,NA,8)
b2 <- c(1,NA,NA,4,NA,6,NA,NA)
df1 <- data.frame(a,b)
df2 <- data.frame(a2,b2)
What I expect as a result is 5, since this is the total number of NAs that appear in df2 but not in df1. Any suggestions on how to make this work?
Here's a second thought.
xy1 <- data.frame(a = c(NA, 2, 3), b = rnorm(3))
xy2 <- data.frame(a = c(NA, 2, 4), b = rnorm(3))
com <- intersect(colnames(xy1), colnames(xy2))
sum(xy1[, com] == xy2[, com], na.rm = TRUE)
If you don't want to worry about column names (but you should), you can make sure the columns align perfectly. In that case, intersect step is redundant.
sum(xy1 == xy2, na.rm = TRUE)
A third way (assuming dimensions of df1 & df2 are same):
sum(sapply(1:ncol(df1), function(x) compareNA(df1[,x], df2[,x])))
# 5
It would be easier to force both dataframes to have the same column names and compare column by column when those have the same name. You can then simply use a loop over columns and increment a running total by applying your function.
# Total number of mismatches between df1 and df2, summed over the columns
# whose names appear in both data frames (compared with compareNA()).
compareNA.df <- function(df1, df2) {
  shared <- intersect(colnames(df1), colnames(df2))
  per_column <- vapply(
    shared,
    function(col) compareNA(df1[[col]], df2[[col]]),
    numeric(1)
  )
  sum(per_column)
}
colnames(df2) <- c("a", "b")
compareNA.df(df1, df2)

Find k highest values in each column and compute mean in R

I am trying to calculate the average of each column in a dataframe using the top k values. I have a solution, however, it is slow and hamfisted. Here is what I came up with:
predictMat <- matrix(0,nrow = length(colnames(DT)),ncol = 1)
k <- 100
itemSummary <- for(i in colnames(DT)) {
u <- data.frame(DT[,i , drop = F])
sortU1 <- data.frame(u[order(u[,i], decreasing = T),, drop = F])
u1Neighbors <- data.matrix(sortU1[1:k,, drop = F])
predictMat[i] <- mean(u1Neighbors, na.rm = T)
}
You can do this in one line using the apply function:
# Sample data frame
set.seed(144)
DT <- matrix(rnorm(1000), nrow=100)
k <- 10
# Compute average of 10 largest values in each column
apply(DT, 2, function(x) mean(tail(sort(x), k)))
# [1] 1.721765 1.658917 1.630231 1.558280 1.606363 1.526322 1.810814 1.678135
# [9] 1.541305 1.621984
could do this with back-to-back apply functions
set.seed(100)
x <- as.data.frame(matrix(runif(5000, 0, 10), nrow = 1000, ncol = 5))
# Sort every column from largest to smallest. TRUE is spelled out because
# the shorthand T is an ordinary variable that can be reassigned.
x1 <- apply(x, 2, sort, decreasing = TRUE)
# Mean of the 100 largest values in each column.
apply(x1[1:100, ], 2, mean)
V1 V2 V3 V4 V5
9.548000 9.572912 9.422325 9.547370 9.462894
edit: looks like I was a few seconds behind in my answer!

Resources