My script is almost complete; only the final loop is missing. It would be great if someone could help. An example dataset is included below.
# Build three example tibbles (14 rows x 5 columns), flag which V1 values lie
# in [20, 80], and run-length encode that flag for every data frame.
library(dplyr)
library(purrr)
library(tibble) # add_column() is a tibble function; dplyr does not attach it

set.seed(94756)
mat1 <- matrix(sample(seq(-1, 100, 0.11), 70, replace = TRUE), ncol = 5)
mat1 <- as_tibble(mat1)
mat2 <- matrix(sample(seq(-1, 100, 0.11), 70, replace = TRUE), ncol = 5)
mat2 <- as_tibble(mat2)
mat2[3, 1] <- NA
mat2[6, 1] <- NA
mat3 <- matrix(sample(seq(-1, 100, 0.11), 70, replace = TRUE), ncol = 5)
mat3 <- as_tibble(mat3)
mat3[4, 1] <- NA
data <- list(mat1, mat2, mat3)

# Logical flag right after V1: TRUE when V1 is within [20, 80], NA when V1 is NA
data1 <- map(
  data,
  function(d) add_column(d, V1_logical = between(d$V1, 20, 80), .after = "V1")
)
# Second column (the flag) of every data frame, each kept as a one-column tibble
r_pre <- lapply(data1, "[", 2)

# Recode the flag as numeric 1/0 (NA stays NA). A single coercion is enough:
# the original first pass (TRUE -> 1) already turned FALSE into 0, so the
# second pass (FALSE -> 0) changed nothing.
data2 <- lapply(data1, function(x) {
  x$V1_logical <- as.numeric(x$V1_logical)
  x
})
data3 <- data2

# Run id for consecutive equal flag values. consecutive_id() comes from dplyr
# (>= 1.1.0) and replaces rleid(), which belongs to data.table and was never
# attached in this script.
data4 <- map(
  data3,
  function(d) {
    add_column(d, ind = consecutive_id(d$V1_logical), .after = "V1_logical")
  }
)

# Run lengths and run values of the flag for one data frame
rfun <- function(x) with(rle(x$V1_logical), tibble(lengths, values))
# Run tables of all data frames stacked into a single tibble
rfun1 <- purrr::map_dfr(data4, rfun)
I would then like to run the following inside a loop:
# Flag every run that is a "1" run of length three or more.
rfun1 <- add_column(
  rfun1,
  marker = rfun1$values == 1 & rfun1$lengths >= 3,
  .after = "values"
)
# Position of the first flagged run (which() skips NA flags).
marker <- as.numeric(min(which(rfun1$marker)))
# Runs up to and including the first flagged one ...
data_drop <- rfun1[seq(1, marker), ]
# ... and the number of data rows those runs cover.
data_drop_c <- as.numeric(sum(data_drop$lengths))
Then I want to create the final data frame by dropping those rows, something like this for every data frame in the list: final_df <- data4[-c(1:data_drop_c), 4] #for all dataframes within list. Because rfun1 combines all data frames into one, I would like the loop to tell me the position of the marker, cut away the data before it, and count 4 rows after the marker. It should then move on to the next data frame in the list (starting with the ID of list element two and counting up to its marker position).
So therefore it would be helpful probably to add an ID for each dataframe for rfun1 as well, somehow like this (but it's for data4 here, wish the same for rfun1) ...
# Stack the list into one table with a source id "i", then count the rows of
# every run id "ind" within each source data frame.
data5 <- data4 %>%
  bind_rows(.id = "i") %>%
  group_by(i) %>%
  count(ind)
In data5, I cannot tell whether a given number in "ind" stands for a logical TRUE or FALSE run. I am only looking for TRUE runs of length >= 3. So if I could add an ID column to rfun1 for every data frame in the list and then run the loop described above, it should work.
Thanks in advance!
Related
I would like to add a column in each dataframe within a list as logical after V1. Those columns should contain the information (TRUE/FALSE), if the value of V2 is between the range of 20 and 40.
# Three 10 x 5 example tibbles of random values, collected in a list.
# as_tibble() replaces the deprecated as.tibble(); it is namespace-qualified
# because this snippet attaches no package.
mat1 <- matrix(sample(seq(-1, 100, 0.11), 50, replace = TRUE), ncol = 5)
mat1 <- tibble::as_tibble(mat1)
mat2 <- matrix(sample(seq(-1, 100, 0.11), 50, replace = TRUE), ncol = 5)
mat2 <- tibble::as_tibble(mat2)
mat3 <- matrix(sample(seq(-1, 100, 0.11), 50, replace = TRUE), ncol = 5)
mat3 <- tibble::as_tibble(mat3)
data <- list(mat1, mat2, mat3)
Maybe this is a possibility for you. I am going to consider the data you provided, but changed from tibble to data.frame. So, you can do:
# Three 10 x 5 data frames of values drawn (with replacement) from the grid
# -1, -0.89, ..., bundled into a list named after the objects.
mat1 <- data.frame(
  matrix(sample(seq(-1, 100, 0.11), 50, replace = TRUE), ncol = 5)
)
mat2 <- data.frame(
  matrix(sample(seq(-1, 100, 0.11), 50, replace = TRUE), ncol = 5)
)
mat3 <- data.frame(
  matrix(sample(seq(-1, 100, 0.11), 50, replace = TRUE), ncol = 5)
)
data <- list(
  mat1 = mat1,
  mat2 = mat2,
  mat3 = mat3
)
To add a column with the logical operation you want, you can use an ifelse statement. Let's create a function that wraps this statement and the logical condition:
# Test whether values lie strictly between 20 and 40.
#
# data: numeric vector (or other object supporting `>` and `<`).
# Returns a logical object of the same shape: TRUE where 20 < value < 40,
# FALSE otherwise, NA where the value is NA.
if.logical <- function(data) {
  # The comparison already yields TRUE/FALSE/NA, so wrapping it in
  # ifelse(cond, TRUE, FALSE) is redundant.
  data > 20 & data < 40
}
Now, since we have a list, we can run this function in a for loop, which creates a new column with the logical output in each data frame. In your case, I believe you wanted this for the X2 column:
# Add a logical column "LogiX2" to every data frame in the list, flagging the
# X2 values that fall strictly between 20 and 40.
# seq_along() is safe for an empty list (1:length(data) would iterate over
# c(1, 0)), and positional indexing no longer requires the elements to be
# named "mat1", "mat2", ... in order.
for (i in seq_along(data)) {
  data[[i]][["LogiX2"]] <- if.logical(data[[i]][["X2"]])
}
I have a large data frame.
As you can see, there is a pattern in the code below:
# Repetitive per-object filtering: data_1 is left unchanged, and for
# k = 2..21, data_k is reduced to the rows whose sum over its first k - 1
# columns equals 0.
# NOTE(review): data_2[,1:1] selects a single column; on a base data.frame
# that drops to a vector, which rowSums() rejects. Presumably these objects
# are tibbles; confirm.
data_1<-data_1
data_2<-data_2 %>% filter(rowSums(data_2[,1:1])==0)
data_3<-data_3 %>% filter(rowSums(data_3[,1:2])==0)
data_4<-data_4 %>% filter(rowSums(data_4[,1:3])==0)
data_5<-data_5 %>% filter(rowSums(data_5[,1:4])==0)
data_6<-data_6 %>% filter(rowSums(data_6[,1:5])==0)
data_7<-data_7 %>% filter(rowSums(data_7[,1:6])==0)
data_8<-data_8 %>% filter(rowSums(data_8[,1:7])==0)
data_9<-data_9 %>% filter(rowSums(data_9[,1:8])==0)
data_10<-data_10 %>% filter(rowSums(data_10[,1:9])==0)
data_11<-data_11 %>% filter(rowSums(data_11[,1:10])==0)
data_12<-data_12 %>% filter(rowSums(data_12[,1:11])==0)
data_13<-data_13 %>% filter(rowSums(data_13[,1:12])==0)
data_14<-data_14 %>% filter(rowSums(data_14[,1:13])==0)
data_15<-data_15 %>% filter(rowSums(data_15[,1:14])==0)
data_16<-data_16 %>% filter(rowSums(data_16[,1:15])==0)
data_17<-data_17 %>% filter(rowSums(data_17[,1:16])==0)
data_18<-data_18 %>% filter(rowSums(data_18[,1:17])==0)
data_19<-data_19 %>% filter(rowSums(data_19[,1:18])==0)
data_20<-data_20 %>% filter(rowSums(data_20[,1:19])==0)
data_21<-data_21 %>% filter(rowSums(data_21[,1:20])==0)
I tried to make loop like this
# Asker's attempted loop, kept verbatim; it does not do what was intended
# (and the closing brace is missing):
#  - `data_i` is a literal object name; the loop value `i` is not substituted
#    into it.
#  - `1:i-1` parses as `(1:i) - 1`, i.e. 0:(i - 1), not 1:(i - 1).
for(i in 1:21){
data_i <- data_i %>% filter(rowSums(data_i[,1:i-1])==0)
But the resulting data_i is far from what I intended.
How do I solve this problem?
1) for loop. We use the test data in the Note at the end, which is based on the built-in anscombe data frame that comes with R. It is best to keep related data frames in a list, so we first create such a list L and then iterate over it, producing a new list L2 so that we don't overwrite the original list. Keeping the input and output separate makes it easier to debug.
We could alternately write seq_along(L)[-1] as seq(2, length(L)) and we could alternately write seq_len(i-1) as seq(1, i-1). Note that if DF is a data frame then DF[, 1] is the first column as a column vector but DF[, 1, drop = FALSE] is a one column data frame.
No packages are used.
# Collect the data_<k> objects into a list ordered by their numeric suffix.
# ls() returns names alphabetically ("data_10" sorts before "data_2"), which
# would misalign list position i with the suffix once there are 10 or more
# objects, so the list is re-ordered numerically.
L <- mget(ls(pattern = "^data_\\d+$"))
L <- L[order(as.integer(sub("^data_", "", names(L))))]

# L2 starts as a copy of L so the input list is never overwritten.
L2 <- L
for (i in seq_along(L)[-1]) {
  Li <- L[[i]]
  # Row sums over the first i - 1 columns; drop = FALSE keeps a data frame
  # even when only one column is selected.
  Sum <- rowSums(Li[, seq_len(i - 1), drop = FALSE])
  # which() discards rows whose sum is NA (as dplyr::filter() would) instead
  # of returning all-NA rows for them.
  L2[[i]] <- Li[which(Sum == 0), ]
}
2) lapply Alternately we could use lapply:
# Collect the data_<k> objects into a list ordered by their numeric suffix.
# ls() returns names alphabetically ("data_10" sorts before "data_2"), so the
# list is re-ordered numerically to keep position i aligned with the suffix.
L <- mget(ls(pattern = "^data_\\d+$"))
L <- L[order(as.integer(sub("^data_", "", names(L))))]

# The first element is kept as is; every later element i keeps only the rows
# whose sum over its first i - 1 columns is 0.
L2 <- L
L2[-1] <- lapply(seq_along(L)[-1], function(i) {
  Li <- L[[i]]
  Sum <- rowSums(Li[, seq_len(i - 1), drop = FALSE])
  # which() discards rows whose sum is NA instead of returning all-NA rows.
  Li[which(Sum == 0), ]
})
3) Map or use Map
L3 <- L

# Keep the rows of `d` whose sum over the first i - 1 columns equals 0.
#
# d: data frame; i: its position in the list (expected to be >= 2).
# Returns the filtered data frame.
f3 <- function(d, i) {
  Sum <- rowSums(d[, seq_len(i - 1), drop = FALSE])
  # which() discards rows whose sum is NA instead of returning all-NA rows.
  d[which(Sum == 0), ]
}

# Apply f3 to every element but the first, pairing each with its position.
L3[-1] <- Map(f3, L[-1], seq_along(L)[-1])
or special case the first element like this. Note that it will take the component names from the first argument to Map after the function so it is important that f4 be defined so that that argument is L.
# Keep the rows of `d` whose sum over the first i - 1 columns equals 0.
#
# d: data frame; i: its position in the list.
# Returns `d` unchanged when i == 1, otherwise the filtered data frame.
f4 <- function(d, i) {
  if (i == 1) {
    d
  } else {
    Sum <- rowSums(d[, seq_len(i - 1), drop = FALSE])
    # which() discards rows whose sum is NA instead of returning all-NA rows.
    d[which(Sum == 0), ]
  }
}

# L is the first list argument so that the result inherits its element names.
L4 <- Map(f4, L, seq_along(L))
Note
# Test data: scaled copies of the built-in anscombe data frame, where row k
# of data_k has its first k columns zeroed.
data_1 <- anscombe
data_1[1, "x1"] <- 0

data_2 <- anscombe * 10
data_2[2, c("x1", "x2")] <- 0

data_3 <- anscombe * 100
data_3[3, c("x1", "x2", "x3")] <- 0
# Example data: a 20 x 2 matrix of uniform random numbers plus a group label
# cycling through "A", "B", "C".
# Only 2 * 20 values are drawn: the original drew 2 * 300, of which matrix()
# kept just the first 40 (recent R versions warn about the length mismatch).
dat <- matrix(runif(2 * 20), ncol = 2, nrow = 20)
group <- rep_len(LETTERS[1:3], 20)
df <- cbind.data.frame(dat, Group = group)
# Create subset groups: every pair of distinct group labels
n <- levels(as.factor(group))
mylist <- combn(n, 2, simplify = FALSE)
I would like to subset my data according to pairwise combinations of the group attribute, and then save the result in mylist.
How can I do it?
Thanks a lot.
We can use subset with %in% after looping over the mylist
# For every pair of group labels, keep the rows of df belonging to either one.
mylist2 <- lapply(
  mylist,
  function(pair) df[df$Group %in% pair, , drop = FALSE]
)
It can also be done within combn by making use of the FUN argument
# Build the subsets directly while enumerating the label pairs.
combn(
  n,
  2,
  FUN = function(pair) df[df$Group %in% pair, , drop = FALSE],
  simplify = FALSE
)
There is a df given with nrow=600 and ncol=18
Now I need to draw 10000 samples from each of these columns, with replacement.
According to the specifications first I need to create an empty matrix:
# Empty (all-NA) 10000 x 18 data frame to receive the samples.
df1 <- as.data.frame(matrix(NA, 10000, 18))
now I want to use for loop to do all the samples(for each column) at once:
# Asker's original attempt, kept verbatim; it does not parse:
#  - `df1[1:10000, i)` closes the `[` with `)`
#  - `10 000` contains a space inside the number
#  - `true` is not R's logical constant (`TRUE`)
#  - the loop's closing `}` is missing
for (i in 1:18){
df1[1:10000, i) <- sample(df[,i], 10 000, replace=true)
when I run this code, my df1 is still empty.
Can anyone help?
Many thanks in advance
There are syntax issues in your code. Try the following :
# Pre-allocate the 10000 x 18 result, then fill each column with 10000 draws
# (with replacement) from the matching column of df.
df1 <- as.data.frame(matrix(NA, 10000, 18))
for (col_idx in seq_len(18)) {
  df1[[col_idx]] <- sample(df[, col_idx], 10000, replace = TRUE)
}
Without an explicit for loop you can also use sapply/lapply :
# Both variants draw 10000 values (with replacement) from every column of df,
# matching the question and the for-loop version; the original used 1000.
#With `sapply`
df1 <- as.data.frame(sapply(df, sample, 10000, replace = TRUE))
#Using `lapply`
df1 <- do.call(cbind.data.frame, lapply(df, sample, 10000, replace = TRUE))
It works for the data shared in comments.
# Worked example with the three vectors shared in the comments.
df <- data.frame(V1, V2, V3)
# Pre-allocate 10000 x 3, then resample each column with replacement.
df1 <- as.data.frame(matrix(NA, 10000, 3))
for (col_idx in seq_len(3)) {
  df1[[col_idx]] <- sample(df[, col_idx], 10000, replace = TRUE)
}
dim(df1)
#[1] 10000 3
head(df1)
# V1 V2 V3
#1 0.02527926 0.039423826 0.097738594
#2 0.03391239 0.039423826 0.036153091
#3 0.03919354 -0.004922473 0.097738594
#4 -0.06703827 0.039423826 0.097738594
#5 0.02168909 0.048176052 0.036153091
#6 0.02527926 0.074435079 -0.009444024
The data I have contain three variables. There are three unique IDs and each has multiple records.
# Three IDs with 2, 1 and 2 records; y0 and z0 are constant within an ID.
ID <- rep(c(1, 2, 3), times = c(2, 1, 2))
y0 <- rep(c(5, 3, 1), times = c(2, 1, 2))
z0 <- rep(c(1, 13, 4), times = c(2, 1, 2))
dat1 <- data.frame(ID, y0, z0)
What I am trying to do is repeat the whole data set N times (N needs to be a parameter) and add a new column holding the repetition number.
So if N = 2, the new data look like:
# Expected result for N = 2: every record of each ID appears once per
# repetition, and `rep` holds the repetition number.
rep <- rep(c(1, 2, 1, 2, 1, 2), times = c(2, 2, 1, 1, 2, 2))
ID <- rep(c(1, 2, 3), times = c(4, 2, 4))
y0 <- rep(c(5, 3, 1), times = c(4, 2, 4))
z0 <- rep(c(1, 13, 4), times = c(4, 2, 4))
dat2 <- data.frame(rep, ID, y0, z0)
We replicate the sequence of rows and order it later to get the expected output
# Number of times the whole data set is repeated. The question asks for this
# to be a parameter; it was hard-coded as 2 in two places.
N <- 2
# Stack N copies of dat1 and label each copy with its repetition number.
res <- cbind(
  rep = rep(seq_len(N), each = nrow(dat1)),
  dat1[rep(seq_len(nrow(dat1)), N), ]
)
# Group the copies by ID; ties keep their order, so the repetitions stay
# ordered within each ID.
resN <- res[order(res$ID), ]
row.names(resN) <- NULL
# dat2 is the expected result for N = 2
all.equal(dat2, resN, check.attributes = FALSE)
#[1] TRUE
Or another option is to replicate into a list and then with Map create the 'rep' column (it is not recommended to have function names as column names, object names etc.) and rbind the list elements
# Number of times the whole data set is repeated. The question asks for this
# to be a parameter; it was hard-coded as 2 in two places.
N <- 2
# Make N copies of dat1, prepend the repetition number to each copy via
# cbind, and stack the copies.
res1 <- do.call(
  rbind,
  Map(cbind, rep = seq_len(N), replicate(N, dat1, simplify = FALSE))
)
# Group the copies by ID; ties keep their order, so the repetitions stay
# ordered within each ID.
res2 <- res1[order(res1$ID), ]
row.names(res2) <- NULL
# dat2 is the expected result for N = 2
all.equal(dat2, res2, check.attributes = FALSE)
#[1] TRUE