extrapolate data within the foreach loop - r

Take the following example, I create a data.frame df1.
For each iteration, we mix up the order of df1 and rename it df2
We then apply conditions to df2, that are:
when df2[1,1] == 1, then we want to subset df2 so that we remove the case of df2$B==125, and if df2[1,1] != 1, then make no action
As a second step, when df2[1,1] == 3, we want to subset df2 so that we remove the case of df2$B==108, and if df2[1,1] != 3, then take no action.
But I don't know how to code this step. Can someone fill in the gaps below?
When we run the code, the output should successfully return values between 9 and 10.
require(doParallel)
# Fix the RNG seed so the shuffles below are reproducible.
set.seed(123)
A <- 1:10
B <- c(106,144,131,107,125,108,105,119,112,127)
df1 <- data.frame(A,B)
# Number of shuffled replicates to run.
m <- 100
# Run m iterations; foreach collects the value of each iteration's last expression into Sample.
Sample = foreach(i=c(1:m)) %do%{
# Shuffle the row order of df1.
df2 <- df1[sample(1:nrow(df1)), ]
df2
# Missing step (the part being asked about), decided by the A value in the first row:
# when df2[1,1] == 1 then remove the row with df2$B==125, otherwise leave df2 as is
# or
# when df2[1,1] == 3 then remove the row with df2$B==108, otherwise leave df2 as is
# Number of rows left; as written nothing is removed yet, so this is always 10.
length(df2$A)
}

Try this code:
# Load with library() so a missing package stops the script here instead of
# require() returning FALSE and the failure surfacing later at foreach().
library(doParallel)
set.seed(123)
A <- 1:10
B <- c(106, 144, 131, 107, 125, 108, 105, 119, 112, 127)
df1 <- data.frame(A, B)
# Number of shuffled replicates.
m <- 100
# For each replicate: shuffle df1, drop one row depending on which A value
# landed in the first row, and return the remaining row count (9 or 10).
Sample <- foreach(i = seq_len(m)) %do% {
  # Shuffle the row order of the data.
  df2 <- df1[sample(seq_len(nrow(df1))), ]
  first_a <- df2[1, 1]
  # Logical masks replace df2[-which(...), ]: when nothing matches, which()
  # returns integer(0) and the negative index would drop every row.
  if (first_a == 1) {
    df2 <- df2[df2$B != 125, ]
  } else if (first_a == 3) {
    df2 <- df2[df2$B != 108, ]
  }
  # The last expression is the value foreach collects for this iteration.
  nrow(df2)
}

Related

extracting observations from matrix where columns and rows match a "key"

Given a matrix m how can I create a TRUE/ FALSE or 1 / 0 matrix where the columns and rows match some "key" in a data frame?
My goal is to assign a 1 or 0 to the location in the matrix where the columns match the cols and the rows match the rows in the colsrows_df. Then essentially just extract the observations where this is true or paste them into the colsrows_df next to the correct columns.
The below forloop just creates diagonally 1's and 0's
# Row/column keys: ID1..ID5 for each of the years 2000..2003 (20 names).
key_names <- paste0("ID", 1:5, "_", rep(2000:2003, each = 5))
# Square matrix with one uniform random draw per cell. The earlier runif(30)
# supplied only 30 values for 400 cells, so they were recycled with a warning.
m <- matrix(
  runif(length(key_names)^2),
  nrow = length(key_names),
  ncol = length(key_names),
  dimnames = list(key_names, key_names)
)
cols <- colnames(m)
rows <- rownames(m)
library(tidyr)
library(dplyr)
# Two-column character matrix pairing each column name with the row name at the same position.
colsrows <- cbind(cols, rows)
# Build the key: split each name into id and year, add one year to the row side, and paste the row name back together.
colsrows_df <- colsrows %>%
data.frame %>%
# Split "IDn_yyyy" at "_" into id/year parts; remove = FALSE keeps the original column.
separate(cols, c("id_col", "year_col"), "_", remove = FALSE) %>%
separate(rows, c("id_row", "year_row"), "_", remove = FALSE) %>%
# rows is overwritten with the same id but the following year.
mutate(year_row_plus_1 = as.numeric(year_row) + 1,
rows = paste0(id_row,"_", year_row_plus_1)) %>%
# Keep only the two key columns, in the order cols, rows.
select(cols, rows)
# Print the key for inspection.
colsrows_df
# Asker's attempt. Each comparison below is element-by-element between two
# whole name vectors, so it is not a lookup of (row, column) key pairs.
for(i in 1:nrow(colsrows)){
# Overwrite row i with the comparison of the column names against the key's cols.
m[i, ] <- colnames(m) == colsrows_df$cols
# Overwrite column i with the comparison of the row names against the key's shifted rows.
m[, i] <- rownames(m) == colsrows_df$rows
}
# Print the resulting matrix.
m
EDIT:
This seems to "solve" the problem however I am not sure how robust it is.
# Keep only the key pairs whose column and row names both exist in m.
ids <- colsrows_df[colsrows_df$cols %in% colnames(m) &
                     colsrows_df$rows %in% rownames(m), ]
# A two-column character matrix indexes m by (row name, column name); the key
# stores (cols, rows), so the columns are picked in rows-then-cols order.
# The filtered key is reused instead of repeating the filter, and the values
# are wrapped with data.frame() because no loaded package provides melt().
res <- data.frame(value = m[as.matrix(ids[c("rows", "cols")])])
# Key pairs side by side with the matrix values they point at.
cbind(ids, res)
I think you can first filter colsrows_df to the row names and column names that are actually present in m, then swap the order of the columns, convert the result to a matrix, use it to subset m, and change those values to 1.
# Keep only the key pairs whose column and row names both exist in m, ordered
# as (rows, cols): a two-column character matrix indexes a matrix by
# (row name, column name).
in_m <- colsrows_df$cols %in% colnames(m) & colsrows_df$rows %in% rownames(m)
key_cells <- as.matrix(colsrows_df[in_m, c("rows", "cols")])
# Then convert remaining ones to 0 -- done up front here: every cell is zeroed
# first and the keyed cells are flagged afterwards, so a cell that already
# held the value 1 cannot be mistaken for a match (m[m != 1] <- 0 kept it).
m[] <- 0
m[key_cells] <- 1

reference x's column in R's apply function

I have a df like this:
# Three measurement vectors; the variable named `c` does not mask calls to c().
a <- c(4, 5, 3, 5, 1)
b <- c(8, 9, 7, 3, 5)
c <- c(6, 7, 5, 4, 3)
# Stack them as rows named a, b and c.
df <- data.frame(rbind(a = a, b = b, c = c))
I want a new df, df2, containing the difference between the values in each cell in rows a and b and the value in row c in their respective columns.
df2 would look like this:
# Expected result: rows a and b after subtracting row c column by column.
a <- c(-2, -2, -2, 1, -2)
b <- c(2, 2, 2, -1, 2)
df2 <- data.frame(rbind(a = a, b = b))
Here is where I'm getting stuck:
df2 <- data.frame(apply(df,c(1,2),function(x) x - df[nrow(df),the col index of x]))
How do I reference the column index of x? Is there something like JavaScript's this?
We can do this easily by replicating the 3rd row to make the lengths equal before subtracting with the first two rows
# Rows to adjust, and row "c" expanded to one value per cell of those rows:
# col() gives each cell's column index, which is used to pick from row "c".
top_rows <- df[c("a", "b"), ]
baseline <- df["c", ][col(top_rows)]
out <- top_rows - baseline
identical(df2, out)
#[1] TRUE
Or explicitly using rep
# Subtract row "c" from the selected rows. unlist() turns row "c" into a plain
# vector and rep(..., each = k) repeats every column value once per selected
# row, which lines up with the column-by-column order used when a data frame
# is combined with a vector. k is taken from the selection instead of being
# hard-coded to 2, so changing the selected rows keeps the lengths in step.
target_rows <- c("a", "b")
df[target_rows, ] - rep(unlist(df["c", ]), each = length(target_rows))

join two columns in a dataframe so they do not contain same values

Sooo
I’ve got two lists
# Two character vectors of 100 names each, with different counts per name.
list1 <- rep(c("john", "steve", "lisa", "sara", "anna"), c(50, 0, 15, 25, 10))
list2 <- rep(c("john", "steve", "lisa", "sara", "anna"), c(15, 25, 0, 10, 50))
# I need to put them into a dataframe.
# Build the data frame from list1 directly. Column names are case-sensitive, so
# assigning df$v1 on a frame made from a 100 x 2 matrix did not fill its
# placeholder column V1; it appended a third column v1 next to V1 and V2.
df <- data.frame(v1 = list1, stringsAsFactors = FALSE)
Now the problem.
I need to put list2 into df$v2
without any row in df containing the same value in both columns.
It does not matter what values are in each row.
I use this for testing it, if each rows contains the same value:
# TRUE only if every row of ballots holds exactly two distinct values.
all(apply(ballots, 1, function(ballot) length(unique(ballot)) == 2))
to clarify:
I need each value in the columns, which row doesn't matter.
I need a way to randomize or change the order of the second column (or the first) in such a way that no row has the same value in both column one and column two.
The output:
V1 V2
John Steve
John Lisa
Sara John
John Lisa
Steve Anna
Currently, when I join the columns in the dataframe, there are many rows in both column one and two containing the same value.
Alright... finally found the answer after many trials and errors.
If anyone has a cleaner method to do this I would love to see one.
The following code takes list A and puts it in column A
takes list B, randomizes and puts in column C, Column B is NA
If A and C is not the same, switch column B and C.
If it fails to finish all the rows, it starts over, randomizing column C
library(taRifx)
# Pair the values of list1 (column V1) with a shuffled list2 so that no row
# ends up holding the same value twice. V3 holds the list2 values that are
# still unplaced; V2 receives a value once it sits next to a different V1.
# failed.counter: 0 = attempt running, 1 = attempt failed (retry with a new
# shuffle), 2 = every row has been paired.
failed.counter <- 0
while (failed.counter <= 1) {
  # Reset the flag for this attempt. Without the reset, a first failure left
  # it at 1, the inner loop was skipped on every retry and the outer loop
  # never terminated instead of starting over.
  failed.counter <- 0
  list1 <- rep(c("A", "B", "C"), c(3, 1, 2))
  list2 <- sample(rep(c("A", "B", "C"), c(2, 3, 1)))
  df <- as.data.frame(matrix(NA, nrow = length(list1), ncol = 3))
  df[, 1] <- list1
  df[, 3] <- list2
  iteration.counter <- 0
  while (anyNA(df$V2) && failed.counter == 0) {
    iteration.counter <- iteration.counter + 1
    # Rows that are still unpaired and whose pending V3 value differs from V1.
    df.sub <- df[is.na(df[, 2]) & df[, 1] != df[, 3] & !is.na(df[, 3]), ]
    # Swap V2 and V3 for those rows: the value moves to V2, V3 becomes NA.
    df.sub <- df.sub[, c("V1", "V3", "V2")]
    colnames(df.sub) <- c("V1", "V2", "V3")
    r.names <- rownames(df.sub)
    df[r.names, ] <- df.sub
    # Move the pending values along by one position so they meet other rows
    # (shift() comes from taRifx; presumably it wraps around -- confirm).
    df[, 3] <- shift(df[, 3], 1, Wrap = TRUE)
    # Give up on this shuffle once the values have been moved nrow(df) + 1 times.
    if (iteration.counter >= nrow(df) + 1) {
      failed.counter <- 1
    }
  }
  # Success: no unpaired rows remain, so leave the retry loop.
  if (!anyNA(df$V2)) {
    failed.counter <- 2
  }
}

Filter rows of dataframes stored in a list and create new list

I have a list with 64 dataframes.
Dataframe 1 and Dataframe 5 have to have the same row names.
The same with 2 and 6, 3 and 7, and so on.
I'm able to run a for loop and create a new list, but something is not working: I end up with an incorrect number of rows.
Here a simplified example to reproduce it:
# Create dataframes and store in list
# Example data frames with 6, 6, 5 and 4 rows; v2 is the key compared between them.
dfA <- data.frame(v1=c(1:6), v2=c("x1","x2","x3","x4","x5","x6"))
dfB <- data.frame(v1=c(1:6), v2=c("x1","x2","x3","x4","x5","x6"))
dfC <- data.frame(v1=c(1:5), v2=c("x1","x2","x3","x4","x5"))
dfD <- data.frame(v1=c(1:4), v2=c("x1","x2","x3","x4"))
example_dataframes = list(dfA, dfB, dfC, dfD)
# These vectors give the order of the process
vectorA = c(1,2)
vectorB = c(3,4)
# Create new list and start for loop
filtered_dataframes = list()
# NOTE: the nested loops visit every (i, j) combination -- (1,3), (1,4), (2,3),
# (2,4) -- not only the intended pairs. Each pass filters the unfiltered
# originals again, so a result stored for one combination is overwritten by a
# later one.
for (i in vectorA) {
for (j in vectorB) {
df1 = example_dataframes[[i]]
df2 = example_dataframes[[j]]
# v2 values present in both data frames.
test = intersect(df1$v2, df2$v2)
# Keep only the rows whose v2 is shared, stored at the source positions i and j.
filtered_dataframes[[i]] <- df1[which(df1$v2 %in% test),]
filtered_dataframes[[j]] <- df2[which(df2$v2 %in% test),]
}
}
For this example, I expect to obtain:
sapply(filtered_dataframes, nrow)
> 5 4 5 4
This modified version should work to get the results you need.
# Example data frames with 6, 6, 5 and 4 rows; v2 is the key compared between them.
dfA <- data.frame(v1 = 1:6, v2 = paste0("x", 1:6))
dfB <- data.frame(v1 = 1:6, v2 = paste0("x", 1:6))
dfC <- data.frame(v1 = 1:5, v2 = paste0("x", 1:5))
dfD <- data.frame(v1 = 1:4, v2 = paste0("x", 1:4))
example_dataframes <- list(dfA, dfB, dfC, dfD)
# Each element names the two data frames to compare, e.g. c(1, 3) pairs
# data frame 1 with data frame 3.
vector.list <- list(c(1, 3), c(2, 4))
# Results are stored at the same positions as their source data frames.
filtered_dataframes <- list()
for (pair in vector.list) {
  first <- example_dataframes[[pair[1]]]
  second <- example_dataframes[[pair[2]]]
  # v2 values that occur in both members of the pair.
  shared <- intersect(first$v2, second$v2)
  # Keep only the rows carrying a shared v2 value.
  filtered_dataframes[[pair[1]]] <- first[first$v2 %in% shared, ]
  filtered_dataframes[[pair[2]]] <- second[second$v2 %in% shared, ]
}

Matching data from unequal length data frames in r

This seems like it should be really simple. I have 2 data frames of unequal length in R. One is simply a random subset of the larger data set. Therefore, they have exactly the same data and a UniqueID that is exactly the same. What I would like to do is put an indicator, say a 0 or 1, in the larger data set that says whether this row is in the smaller data set.
I can use which(long$UniqID %in% short$UniqID) but I can't seem to figure out how to match this indicator back to the long data set
Made some sample data.
# Sample data: long holds all 20 IDs in random order, short a random 10 of them.
id_pool <- letters[1:20]
long <- data.frame(UniqID = sample(id_pool, 20))
short <- data.frame(UniqID = sample(id_pool, 10))
# You can use %in% without which() to get values TRUE and FALSE and then with as.numeric() convert them to 0 and 1.
in_short <- long[["UniqID"]] %in% short[["UniqID"]]
long[["sh"]] <- as.numeric(in_short)
I'll use @AnandaMahto's data to illustrate another way, using duplicated(), which works whether or not you have a unique ID.
Case 1: Has unique id column
set.seed(1)
df1 <- data.frame(ID = 1:10, A = rnorm(10), B = rnorm(10))
df2 <- df1[sample(10, 4), ]
# Stack the subset on top of the full data and look at the ID column only:
# an ID in the lower part that is flagged as duplicated already occurred in
# df2. Dropping the first nrow(df2) flags leaves one flag per row of df1.
stacked_ids <- rbind(df2, df1)[, "ID", drop = FALSE]
seen_in_subset <- duplicated(stacked_ids)[-seq_len(nrow(df2))]
transform(df1, indicator = 1 * seen_in_subset)
Case 2: Has no unique id column
set.seed(1)
df1 <- data.frame(A = rnorm(10), B = rnorm(10))
df2 <- df1[sample(10, 4), ]
# Without an ID column whole rows are compared: a row of df1 that duplicates
# one stacked above it also occurs in df2. The first nrow(df2) flags belong to
# df2 itself and are dropped.
stacked_rows <- rbind(df2, df1)
seen_in_subset <- duplicated(stacked_rows)[-seq_len(nrow(df2))]
transform(df1, indicator = 1 * seen_in_subset)
The answers so far are good. However, a question was raised: "What if there wasn't a 'UniqID' column?"
At that point, perhaps merge can be of assistance:
Here's an example using merge and %in% where an ID is available:
set.seed(1)
df1 <- data.frame(ID = 1:10, A = rnorm(10), B = rnorm(10))
df2 <- df1[sample(10, 4), ]
# IDs that survive an inner join of the full data with the subset.
temp <- merge(df1, df2, by = "ID")[["ID"]]
# Integer flag: 1 where the row's ID is among the joined IDs, else 0.
df1[["matches"]] <- as.integer(df1[["ID"]] %in% temp)
And, a similar example where an ID isn't available.
set.seed(1)
df1_NoID <- data.frame(A = rnorm(10), B = rnorm(10))
df2_NoID <- df1_NoID[sample(10, 4), ]
# Join on the row names, which the subset inherited from the full data, and
# keep the row names that appear in both.
temp <- merge(df1_NoID, df2_NoID, by = "row.names")[["Row.names"]]
# Integer flag: 1 where the row name is among the joined row names, else 0.
df1_NoID[["matches"]] <- as.integer(rownames(df1_NoID) %in% temp)
You can directly use the logical vector as a new column:
# Numeric 0/1 flag: 1 where the ID of long also occurs in short.
long[["Indicator"]] <- as.numeric(long[["UniqID"]] %in% short[["UniqID"]])
See if this can get you started:
# Long data frame: the IDs 1..100 in random order.
long <- data.frame(UniqID = sample(seq_len(100)))
# Short data frame: 30 of those IDs, picked by random row position.
picked_rows <- sample(seq_len(100), 30)
short <- data.frame(UniqID = long[picked_rows, ])
# Logical indicator column in long: TRUE where the ID also appears in short.
long[["indicator"]] <- long[["UniqID"]] %in% short[["UniqID"]]
> head(long)
UniqID indicator
1 87 TRUE
2 15 TRUE
3 100 TRUE
4 40 FALSE
5 89 FALSE
6 21 FALSE

Resources