Loop function in r to compare values of different data frames - r

Introduction
Hi to everyone,
for a little project, I try to get a function to compare values of a Data Frame 1 with values from a Data Frame 2. Thereafter, data frames 3 and 4 are supposed to get printed with the information of the comparison.
Data Frame 1:
ID
x1i
x2i
x3i
a
1
2
4
b
1
4
1
Data Frame 2:
Data_Frame_2 <- c(1:4)
Read x1a and compare with Data Frame 2. The value 1 is in Data Frame 2. Print value 1 and the name of the variable (x1a) in Data Frame 3 and cross out the value 1 from Data Frame 2.
Read x1b and compare with Data Frame 2. The value 1 is (not anymore) in Data Frame 2. Read x2b. The value 4 is in Data Frame 2. Print value 4 and the name of the variable (x2b) in Data Frame 3 and cross out the value 4 from Data Frame 2.
The Data Frame 3 is supposed to be something like this:
Data Frame 3:
ID
Value
Variable
a
1
x1i
b
4
x2i
Data Frame 4 (the remaining numbers of Data Frame 2):
Remaining numbers
2
3
Example in R to solve this theoretical problem
Until now, I worked out this code which does the job:
b <- as.data.frame(c(1:4)) # data frame 2
colnames(b, do.NULL = FALSE)
colnames(b) <- c("b")
View(b)
a <- as.data.frame(cbind(c("a","b"), c(3,3), c(2,1), c(1,2))) # data frame 1
colnames(a, do.NULL = FALSE)
colnames(a) <- c("ID","x1i","x2i","x3i")
View(a)
`%notin%` <- Negate(`%in%`) #got this one from <https://www.marsja.se/how-to-use-in-in-r/>
Read_Info <- function(a,b)
{
if (a[1,2] %in% b[1:4,1]) {c_1<-c(a[1,1:2],names(a)[2]); b1<-subset(b,b %notin% a[1,2])}
if (a[2,2] %in% b1[1:3,1]) {c_2<-c(a[2,1:2],names(a)[2]); b2<-subset(b,b %notin% c(a[1,2],a[2,2]))}
else if (a[2,3] %in% b1[1:3,1]) {c_2<-c(a[2,1],a[2,3],names(a)[3]); b2<-subset(b,b %notin% c(a[1,2],a[2,3]))}
if (a[3,2] %in% b2[1:2,1]) {c_3<-c(a[3,1],a[3,2],names(a)[2]); b3<-subset(b,b %notin% c(a[1,2],a[2,3],a[3,2]))}
else if (a[3,2] %notin% b2[1:2,1]) {c_3<-c(NA,NA,NA); b3<-b2}
c<-rbind(c_1,c_2,c_3)
colnames(c, do.NULL = FALSE)
colnames(c) <- c("ID","Value","Variable")
bx<-b3
colnames(bx, do.NULL = FALSE)
colnames(bx) <- c("Remaining numbers")
print(c)
print(bx)
}
Read_Info(a,b)
# In this example, c is data frame 3 and bx is data frame 4
Actual Task at hand - If, else if Loop Function in R
I do face the following obstacle: the actual data which I have is a little bit larger than the above example. Nevertheless, it follows the same structure:
b <- as.data.frame(c(1:20)) # this would be Data Frame 2 in the theoretical considerations
colnames(l, do.NULL = FALSE)
colnames(l) <- c("b")
View(l)
# This would be data frame 1 in the theoretical considerations
# Note: between "ID" and "x1i", there are now two additional variables which were not in the example above
# Although these two variables are part of the data, they are not of interest right know
a2 <- cbind(c("a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t"),c(0),c(1))
a1 <- data.frame(replicate(16,sample(1:20,rep=T)))
a <- cbind(a2, a1)
colnames(a, do.NULL = FALSE)
colnames(a) <- c("ID","variable1","variable2","x1i","x2i","x3i","x4i","x5i","x6i","x7i","x8i","x9i","x10i","x11i","x12i","x13i","x14i")
View(a)
I try to create an “if”, “else if” loop function utilizing "for" which is supposed to do this reading task by itself. Until now, I wrote down the following code which does not work yet.
`%notin%` <- Negate(`%in%`) # got this one from <https://www.marsja.se/how-to-use-in-in-r/>
Read_Info_Loop <- function(a,b)
{for (i in 1:20)
{ if (a[i,4] %in% b[1:(21-i),1]) {x[i]<-c(a[i,1],a[i,4],names(a)[4]); b[i]<-subset(b,b %notin% a[i,4])}
if (a[i,5] %in% b[i-1][1:(21-i),1]) {x[i]<-c(a[i,1],a[i,5],names(a)[5]); b[i]<-subset(b,b %notin% c(a[1,4],a[i,5]))
} else if (a[i,6] %in% b[i-1][1:(21-i),1]) {x[i]<-c(a[i,1],a[i,6],names(a)[6]); b[i]<-subset(b,b %notin% c(a[1,4],a[i,6]))
} else if (a[i,7] %in% b[i-1][1:(21-i),1]) {x[i]<-c(a[i,1],a[i,7],names(a)[7]); b[i]<-subset(b,b %notin% c(a[1,4],a[i,7]))
} else if (a[i,8] %in% b[i-1][1:(21-i),1]) {x[i]<-c(a[i,1],a[i,8],names(a)[8]); b[i]<-subset(b,b %notin% c(a[1,4],a[i,8]))
} else if (a[i,9] %in% b[i-1][1:(21-i),1]) {x[i]<-c(a[i,1],a[i,9],names(a)[9]); b[i]<-subset(b,b %notin% c(a[1,4],a[i,9]))
} else if (a[i,10] %in% b[i-1][1:(21-i),1]) {x[i]<-c(a[i,1],a[i,10],names(a)[10]); b[i]<-subset(b,b %notin% c(a[1,4],a[i,10]))
} else if (a[i,11] %in% b[i-1][1:(21-i),1]) {x[i]<-c(a[i,1],a[i,11],names(a)[11]); b[i]<-subset(b,b %notin% c(a[1,4],a[i,11]))
} else if (a[i,12] %in% b[i-1][1:(21-i),1]) {x[i]<-c(a[i,1],a[i,12],names(a)[12]); b[i]<-subset(b,b %notin% c(a[1,4],a[i,12]))
} else if (a[i,13] %in% b[i-1][1:(21-i),1]) {x[i]<-c(a[i,1],a[i,13],names(a)[13]); b[i]<-subset(b,b %notin% c(a[1,4],a[i,13]))
} else if (a[i,14] %in% b[i-1][1:(21-i),1]) {x[i]<-c(a[i,1],a[i,14],names(a)[14]); b[i]<-subset(b,b %notin% c(a[1,4],a[i,14]))
} else if (a[i,15] %in% b[i-1][1:(21-i),1]) {x[i]<-c(a[i,1],a[i,15],names(a)[15]); b[i]<-subset(b,b %notin% c(a[1,4],a[i,15]))
} else if (a[i,16] %in% b[i-1][1:(21-i),1]) {x[i]<-c(a[i,1],a[i,16],names(a)[16]); b[i]<-subset(b,b %notin% c(a[1,4],a[i,16]))
} else if (a[i,17] %in% b[i-1][1:(21-i),1]) {x[i]<-c(a[i,1],a[i,17],names(a)[17]); b[i]<-subset(b,b %notin% c(a[1,4],a[i,17]))
} else if (a[i,17] %notin% b[1:(21-i),1]) {x[i]<-c(NA,NA,NA); b[i]<-c(b[i-1])}
y<-rbind(x[i[1:20]])
colnames(y, do.NULL = FALSE)
colnames(y) <- c("ID","Value","Variable")
u<-rbind(b[i=20])
colnames(u, do.NULL = FALSE)
colnames(u) <- c("Remaining numbers")
print(y)
print(u)
}
}
# y is supposed to be data frame 3 and u is supposed to be data frame 4
# in the above theoretical considerations
Errors
I now get the following errors:
Error in `[<-.data.frame`(`*tmp*`, i, value = c("a", "1", "x3i")) :
replacement has 3 rows, data has 4
Error in Read_Info_Loop(test, l) : object 'x' not found
...nevertheless, the first error, I got yesterday. Today, after restarting R, the second error occurred which seems to address internal structural problems of the function code. Additionally, I am pretty sure, that there might be further errors which are right now "hidden" behind the other errors and which will occur as soon as the two above mentioned errors are dealt with.
However, I do not want you to just solve any problems. I rather would like to ask, if you have ideas how I can solve these two specific errors, and maybe a hint to just get the function a little bit closer to work properly. So, for me the focus is clearly on learning a thing or two in general.
A few disclaimers: I have little experience in programming, so the code or my descriptions are probably rather messy. Therefore, if you have any questions for clarification, please feel free to ask. I try to respond as quickly as possible. English is not my first language, so please excuse me for any language mistakes.
I am looking forward to learning and hear your ideas about the code itself, ideas regarding the theoretical considerations or the approach to the loop function.
Kind Regards
Paul
Edits / Progression
Edit: I just realized, that the code can already be simplified with another "for". Nevertheless, I read that one should rather avoid nested "for" loops (for...for...)
`%notin%` <- Negate(`%in%`) #got this one from <https://www.marsja.se/how-to-use-in-in-r/>
Read_Info_Loop2 <- function(a,b)
{for (i in 1:20) for (k in 5:17) {
{ if (a[i,4] %in% b[1:(21-i),1]) {x[i]<-c(a[i,1],a[i,4],names(a)[4]); b[i]<-subset(b,b %notin% a[i,4])
} else if (a[i,k] %in% b[i-1][1:(21-i),1]) {x[i]<-c(a[i,1],a[i,k],names(a)[k]); b[i]<-subset(b,b %notin% c(a[1,4],a[i,k]))
} else if (a[i,k] %notin% b[1:(21-i),1]) {x[i]<-c(NA,NA,NA); b[i]<-c(b[i-1])}
}
y<-rbind(x[i[1:20]])
colnames(y, do.NULL = FALSE)
colnames(y) <- c("ID","Value","Variable")
u<-rbind(b[i=20])
colnames(u, do.NULL = FALSE)
colnames(u) <- c("Remaining numbers")
print(y)
print(u)
}
}
The same error was shown:
Error in Read_Info_Loop2(test, l) : object 'x' not found
I try to use this resource, going forward: https://cran.r-project.org/doc/manuals/r-release/R-intro.html#Repetitive-execution
I am going to give further updates.

This is a tricky one. I was able to find a solution for the underlying problem but unfortunately I wasn't able to fix OP's code as it was requested.
However, here is my solution:
library(data.table)
long <- melt(setDT(a), "ID", patterns("^x"))
df3 <- long[, {
if (any(.SD$value %in% b)) {
result <- first(.SD[value %in% b])
b <- setdiff(b, result$value)
} else {
result <- data.table(variable = NA_integer_, value = NA_integer_)
}
result
}, by = ID]
df3
ID variable value
1: a x1i 1
2: b x2i 4
# remaining values
df4 <- data.table(Remaining.numbers = setdiff(b, df3$value))
df4
Remaining.numbers
1: 2
2: 3
Explanation
In a first step, the dataset a is reshaped into long format
long
ID variable value
1: a x1i 1
2: b x1i 1
3: a x2i 2
4: b x2i 4
5: a x3i 4
6: b x3i 1
Now, variable contains the column names as data items which simplifies subsequent steps. Note that melt() has maintained the original order of rows and columns which is important for picking the correct values later on.
Now, we kind of loop through long by unique values of ID. This is achieved by grouping. As a speciality of data.table, we can use an arbitrary expression (enclosed in curly brackets) for aggregation.
For each ID, we check if there is at least one value still available in the vector of remaining values. If so, the first appearance is taken as resulting row. The corresponding value is removed from b which is then used in the next "iteration", i.e., the next group level.
Please note that b inside the expression (in curly brackets) is a local variable. The modified value of b is not available outside of the environment of the expression.
While testing with arbitrary datasets I have noticed that there might be situations where all numbers which belong to an ID already have been removed from remaining. To indicate this, a dummy result with NA values is returned.
So, for each ID group one row is returned which are then combined into one data.table object and assigned to df3.
df4 contains the Remaining.numbers and is created from building the set difference between b and the vector of picked values df3$value.
Note that I have tried to rewrite the code as a loop for demonstration purposes but I have given up because I found that the bookkeeping overhead wasn't worth it.
Data
For the first use case in OP's question:
a <- fread("ID x1i x2i x3i
a 1 2 4
b 1 4 1")
b <- 1:4
Other use cases with varying numbers of rows, columns, and lengths of b can be created using the code below. Please note that set.seed() is important because the created dataset a and the results df3 and df4 depend on it. For example, with set.seed(123) we can reproduce the situation where the list of remaining numbers for the last ID is exhausted.
# number of rows and columns to create
n_rows <- 18
n_cols <- 16
# create vector b
b <- 1:20
# create data.frame a
a2 <- data.frame(ID = letters[seq(n_rows)], variable1 = 0, variable2 = 1)
set.seed(123) # to ensure reproducible results
a1 <- as.data.frame(replicate(n_cols, sample(b, n_rows, replace = TRUE)))
colnames(a1) <- sprintf("x%ii", seq(n_cols))
a <- cbind(a2, a1)

Uwe’s Solution
Thank you very much, Uwe, for your solution and comprehensive explanation! It did not even occur to me, to combine the values into one list and to let the function run over that list. So, your solution opened a new perspective on the data. I am going to try out your solution in detail to learn as much as possible and report back here as soon as possible!
Solution regarding the original code
I was able to get to a solution for the original code which took quite some time.
test2 <- cbind(c("a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t"),c(0),c(1),c(1,1,1,sample(1:15),1,1),c(2,3,3,sample(1:15),2,3))
test1 <- data.frame(replicate(12,sample(1:20,rep=T)))
data.frame1 <- cbind(test2, test1)
colnames(data.frame1) <- c("ID","variable1","variable2","x1i","x2i","x3i","x4i","x5i","x6i","x7i","x8i","x9i","x10i","x11i","x12i","x13i","x14i")
data.frame2 <- as.data.frame(c(1:20))
x <- as.data.frame(matrix(NA,nrow = 3,ncol = 20))
rownames(x) <- c("ID","value","variable")
colnames(x) <- c()
View(x)
`%notin%` <- Negate(`%in%`) #got this one from <https://www.marsja.se/how-to-use-in-in-r/>
Read_Info_Loop2 <- function(a,b) {for (k in 1:20) {for (i in 4:17)
{if (a[k,i] %in% b[,1]) {x[k]<-c(a[k,1],(a[k,i]),names(a[i])); b<-subset(b,b %notin% a[k,i]);break}}
}
c<-rbind(x)
bx<-b
colnames(bx) <- c("numbers remaining")
print(c)
print(bx)
}
Read_Info_Loop2(data.frame1, data.frame2)
The only downfall with this solution is the output. It is rather in a weird form. But I don’t mind really. So now we already have two solutions which use different approaches. Very exciting. Regarding the output (see picture below of the output of some of the actual data) of data.frames 3 and 4: The last 7 columns are NAs because this data.frame1_original has just 13 rows (k=13). So for the last 7 iterations (k=14 to k=20), there is no output.
Here is the output of the random data.frame1 as described above. Here, the solution looks rather weird, since for "r" and "t" all entries are already deleted from data.frame2 which returns NAs for these rows. The two numbers, which remain are 18 and 20.

Related

Conditionally add named elements to a list

I have a function to perform actions on a variable list of dataframes depending on user selections. The function mostly performs generic actions but there are a few actions that are dataframe specific.
My code runs fine if all dataframes are selected but I am unable to get it to work if not all dataframes are selected.
The following provides a minimal reproducible example:
# User switches.
df1Switch <- TRUE
df2Switch <- TRUE
df3Switch <- TRUE
# DF creation.
set.seed(1)
df <- data.frame(X=sample(1:10), Y=sample(11:20))
if (df1Switch) df1 <- df
if (df2Switch) df2 <- df
if (df3Switch) df3 <- df
# Function to do something.
fn_something <- function(file_list, file_names) {
df <- file_list
# Do lots of generic things.
df$Z <- df$X + df$Y
# Do a few specific things.
if (file_names == "Name1") df$X <- df$X + 1
else if (file_names == "Name2") df$X <- df$Z - 1
else if (file_names == "Name3") df$Y <- df$X + df$Y
return(df)
}
# Call function to do something.
file_list <- list(Name1=df1, Name2=df2, Name3=df3)
file_names <- names(file_list)
all_df <- do.call(rbind,mapply(fn_something, file_list, file_names,
SIMPLIFY=FALSE))
In this case the code runs fine as the user has selected to create all three dataframes. I use a named list so that the specific actions can be performed against the correct dataframes.
The output looks something like this (the actual numbers aren't important):
X Y Z
Name1.1 4 13 16
Name1.2 5 12 16
Name1.3 6 16 21
: : : :
Name2.1 15 13 16
: : : :
The problem arises if the user selects not to create some dataframes, e.g.:
# User switches.
df1Switch <- TRUE
df2Switch <- FALSE
df3Switch <- TRUE
Not surprisingly, in this case an object not found error results:
> # Call function to do something.
> file_list <- list(Name1=df1, Name2=df2, Name3=df3)
Error: object 'df2' not found
What I would like to do is conditionally specify the contents of file_list along the lines of this pseudo code:
file_list <- list(if (df1Switch) {Name1=df1}, if (df2Switch) {Name2=df2}, if (df3Switch) {Name3=df3})
I have come across list.foldLeft
Conditionally merge list elements but I don't know if this is suitable.
(I'll re-hash my comment:)
In general, I would encourage you to consider use of a list-of-dataframes instead of individual frames. My rationale for this:
assuming that each frame is structured (nearly) identically; and
assuming that what you do to one frame you will (or at least can) do to all frames; then
it is easier to list_of_frames <- lapply(list_of_frames, some_func) than it is to do something like:
for (nm in c("df1", "df2", "df3")) {
d <- get(nm)
d <- some_func(d)
assign(nm, d)
}
especially when dealing with non-global environments (i.e., doing this within a function).
To be clear, "easier" is subjective: though it does win code-golf, I find it much easier to read and understand that "I am running some_func on each element of list_of_frames and saving the result". (You can even save it to a new list-of-frames, thereby keeping the original frames untouched.)
You may also do things conditionally, as in
needs_work <- sapply(list_of_frames, some_checker_func) # returns logical
# or
needs_work <- c("df1", "df2") # names of elements of list_of_frames
list_of_frames[needs_work] <- lapply(list_of_frames[needs_work], some_func)
Having said that ... the direct answer to your one liner:
c(if (df1Switch) list(Name1=df1), if (df2Switch) list(Name2=df2), if (df3Switch) list(Name3=df3))
This capitalizes on the fact that unstated else results in a NULL, and the NULL-compressing (dropping) characteristic of c(). You can see it in action with:
c(if (T) list(a=1), if (T) list(b=2), if (T) list(d=4))
# $a
# [1] 1
# $b
# [1] 2
# $d
# [1] 4
c(if (T) list(a=1), if (FALSE) list(b=2), if (T) list(d=4))
# $a
# [1] 1
# $d
# [1] 4

Looping through rows in an R data frame?

I'm working with multiple big data frames in R and I'm trying to write functions that can modify each of them (given a set of common parameters). One function is giving me trouble (shown below).
RawData <- function(x)
{
for(i in 1:nrow(x))
{
if(grep(".DERIVED", x[i,]) >= 1)
{
x <- x[-i,]
}
}
for(i in 1:ncol(x))
{
if(is.numeric(x[,i]) != TRUE)
{
x <- x[,-i]
}
}
return(x)
}
The objective of this function is twofold: first, to remove any rows that contain a ".DERIVED" string in any one of their cells (using grep), and second, to remove any columns that are non-numeric (using is.numeric). I get an error on the following condition:
if(grep(".DERIVED", x[i,]) >= 1)
The error states the "argument is of zero length", which I believe is usually associated with NULL values in a vector. However, I've used is.null on the entire data frame that is giving me errors, and it confirmed that there are no null values in the DF. I'm sure I'm missing something relatively simple here. Any advice would be greatly appreciated.
If you can use non-base-R functions, this should address your issue. df is the data.frame in question here. It will also be faster than looping over rows (generally not advised if avoidable).
library(dplyr)
library(stringr)
df %>%
filter_all(!str_detect(., '\\.DERIVED')) %>%
select_if(is.numeric)
You can make it a function just as you would anything else:
mattsFunction <- function(dat){
dat %>%
filter_all(!str_detect(., '\\.DERIVED')) %>%
select_if(is.numeric)
}
you should probably give it a better name though
The error is from the line
if(grep(".DERIVED", x[i,]) >= 1)
When grep doesn't find the term ".DERIVED", it returns something of zero length, your inequality doesn't return TRUE or FALSE, but rather returns logical(0). The error is telling you that the if statement cannot evaluate whether logical(0) >= 1
A simple example:
if(grep(".DERIVED", "1234.DERIVEDabcdefg") >= 1) {print("it works")} # Works nicely, since the inequality can be evaluated
if(grep(".DERIVED", "1234abcdefg") > 1) {print("no dice")}
You can replace that line with if(length(grep(".DERIVED", x[i,])) != 0)
There's something else you haven't noticed yet, which is that you're removing rows/columns in a loop. Say you remove the 5th column, the next loop iteration (when i = 6) will be handling what was the 7th row! (this will end in an error along the lines of Error in[.data.frame(x, , i) : undefined columns selected)
I prefer using dplyr, but if you need to use base R functions there are ways to to this without if statements.
Notice that you should consider using the regex version of "\\.DERIVED" and not ".DERIVED" which would mean "any character followed by DERIVED".
I don't have example data or output, so here's my best go...
# Made up data
test <- data.frame(a = c("data","data.DERIVED","data","data","data.DERIVED"),
b = (c(1,2,3,4,5)),
c = c("A","B","C","D","E"),
d = c(2,5,6,8,9),
stringsAsFactors = FALSE)
# Note: The following code assumes that the column class is numeric because the
# example code provided assumed that the column class was numeric. This will not
# detects if the column is full of a string of character values of only numbers.
# Using the base subset command
test2 <- subset(test,
subset = !grepl("\\.DERIVED",test$a),
select = sapply(test,is.numeric))
# > test2
# b d
# 1 1 2
# 3 3 6
# 4 4 8
# Trying to use []. Note: If only 1 column is numeric this will return a vector
# instead of a data.frame
test2 <- test[!grepl("\\.DERIVED",test$a),]
test2 <- test2[,sapply(test,is.numeric)]
# > test2
# b d
# 1 1 2
# 3 3 6
# 4 4 8

Creating function to read data set and columns and displyaing nrow

I am struggling a bit with a probably fairly simple task. I wanted to create a function that has arguments of dataframe(df), column names of dataframe(T and R), value of the selected column of dataframe(a and b). I know that the function reads the dataframe. but , I don't know how the columns are selected. I'm getting an error.
fun <- function(df,T,a,R,b)
{
col <- ds[c("x","y")]
omit <- na.omit(col)
data1 <- omit[omit$x == 'a',]
data2 <- omit[omit$x == 'b',]
nrow(data2)/nrow(data1)
}
fun(jugs,Place,UK,Price,10)
I'm new to r language. So, please help me.
There are several errors you're making.
col <- ds[c("x","y")]
What are x and y? Presumably they're arguments that you're passing, but you specify T and R in your function, not x and y.
data1 <- omit[omit$x == 'a',]
data2 <- omit[omit$x == 'b',]
Again, presumably, you want a and b to be arguments you passed to the function, but you specified 'a' and 'b' which are specific, not general arguments. Also, I assume that second "omit$x" should be "omit$y" (or vice versa). And actually, since you just made this into a new data frame with two columns, you can just use the column index.
nrow(data2)/nrow(data1)
You should print this line, or return it. Either one should suffice.
fun(jugs,Place,UK,Price,10)
Finally, you should use quotes on Place, UK, and Price, at least the way I've done it.
fun <- function(df, col1, val1, col2, val2){
new_cols <- df[,c(col1, col2)]
omit <- na.omit(new_cols)
data1 <- omit[omit[,1] == val1,]
data2 <- omit[omit[,2] == val2,]
print(nrow(data2)/nrow(data1))
}
fun(jugs, "Place", "UK", "Price", 10)
And if I understand what you're trying to do, it may be easier to avoid creating multiple dataframes that you don't need and just use counts instead.
fun <- function(df, col1, val1, col2, val2){
new_cols <- df[,c(col1, col2)]
omit <- na.omit(new_cols)
n1 <- sum(omit[,1] == val1)
n2 <- sum(omit[,2] == val2)
print(n2/n1)
}
fun(jugs, "Place", "UK", "Price", 10)
I would write this function as follows:
fun <- function(df,T,a,R,b) {
data <- na.omit(df[c(T,R)]);
sum(data[[R]]==b)/sum(data[[T]]==a);
};
As you can see, you can combine the first two lines into one, because in your code col was not reused anywhere. Secondly, since you only care about the number of rows of the two subsets of the intermediate data.frame, you don't actually need to construct those two data.frames; instead, you can just compute the logical vectors that result from the two comparisons, and then call sum() on those logical vectors, which naturally treats FALSE as 0 and TRUE as 1.
Demo:
fun <- function(df,T,a,R,b) { data <- na.omit(df[c(T,R)]); sum(data[[R]]==b)/sum(data[[T]]==a); };
df <- data.frame(place=c(rep(c('p1','p2'),each=4),NA,NA), price=c(10,10,20,NA,20,20,20,NA,20,20), stringsAsFactors=F );
df;
## place price
## 1 p1 10
## 2 p1 10
## 3 p1 20
## 4 p1 NA
## 5 p2 20
## 6 p2 20
## 7 p2 20
## 8 p2 NA
## 9 <NA> 20
## 10 <NA> 20
fun(df,'place','p1','price',20);
## [1] 1.333333

Build Efficient R Filter

I have this dataframe called data. In the data frame I have a few columns, for simplicity I will explain the columns with a weather analogy, it is like "weather_st_louis", "weather_boston", "weather_ny"... I want to build a column "weather" and it should be done like this, "if weather in st louis exists, use this column, else if weather in boston exists, use this column, else if weather in ny exists, use this column, else NONE". I'm going to be using this logic many times, with many columns, so need a way to make this more efficient. What is the R way to do this.
Also, side question, is what I'm trying to build here called a "filter"?
if(exists("data['w_stlouis']")) {
data['w'] <- data['w_stlouis']
} else if(exists("data['w_boston]")){
data['w'] <- data['w_boston']
} else if(exists("data['w_ny']")){
data['w'] <- data['w_ny']
} else {data['w'] <- NA}
Try something like that :
example <- matrix(NA,ncol=5,nrow=5)
colnames(example) <- c("weather_1","weather_2","weather_3","weather_4","weather_5")
example[5,3] <- 1
example[3,2] <- 1
example[1,2] <- 1
example[4,4] <- 1
example[5,2] <- 1
w <- apply(example,1,function(x){
o <- which(!is.na(x))[1]
if (is.na(o)) r <- "NONE"
else r <- colnames(example)[o]
r
})
w
When you have repeating tasks to do, try to use apply/tapply/sapply functions
Here is another posibility. I'm not sure if this is what you need, but perhaps it gives you another way to handle it.
df <- data.frame(matrix(rnorm(100, 100, 20),ncol=5,nrow=5))
colnames(df) <- c("weather_1","weather_2","weather_3","weather_4","weather_5")
library(reshape2)
df <- melt(df)
df[1:10,2] <- NA
str(df)
weather_levels <- levels(df$variable)
df$case <- ifelse(is.na(df$value), 0, 1)
These two output the same result
subset(df, df$case == 1)
na.omit(df)

Is it possible to swap columns around in a data frame using R?

I have three variables in a data frame and would like to swap the 4 columns around from
"dam" "piglet" "fdate" "ssire"
to
"piglet" "ssire" "dam" "tdate"
Is there any way I can do the swapping using R?
Any help would be very much appreciated.
Baz
dfrm <- dfrm[c("piglet", "ssire", "dam", "tdate")]
OR:
dfrm <- dfrm[ , c("piglet", "ssire", "dam", "tdate")]
d <- data.frame(a=1:3, b=11:13, c=21:23)
d
# a b c
#1 1 11 21
#2 2 12 22
#3 3 13 23
d2 <- d[,c("b", "c", "a")]
d2
# b c a
#1 11 21 1
#2 12 22 2
#3 13 23 3
or you can do same thing using index:
d3 <- d[,c(2, 3, 1)]
d3
# b c a
#1 11 21 1
#2 12 22 2
#3 13 23 3
To summarise the other posts, there are three ways of changing the column order, and two ways of specifying the indexing in each method.
Given a sample data frame
dfr <- data.frame(
dam = 1:5,
piglet = runif(5),
fdate = letters[1:5],
ssire = rnorm(5)
)
Kohske's answer: You can use standard matrix-like indexing using column numbers
dfr[, c(2, 4, 1, 3)]
or using column names
dfr[, c("piglet", "ssire", "dam", "fdate")]
DWin & Gavin's answer: Data frames allow you to omit the row argument when specifying the index.
dfr[c(2, 4, 1, 3)]
dfr[c("piglet", "ssire", "dam", "fdate")]
PaulHurleyuk's answer: You can also use subset.
subset(dfr, select = c(2, 4, 1, 3))
subset(dfr, select = c(c("piglet", "ssire", "dam", "fdate")))
You can use subset's 'select' argument;
#Assume df contains "dam" "piglet" "fdate" "ssire"
newdf<-subset(df, select=c("piglet", "ssire", "dam", "tdate"))
I noticed that this is almost an 8-year old question. But for people who are starting to learn R and might stumble upon this question, like I did, you can now use a much flexible select() function from dplyr package to accomplish the swapping operation as follows.
# Install and load the dplyr package
install.packages("dplyr")
library("dplyr")
# Override the existing data frame with the desired column order
df <- select(df, piglet, ssire, dam, tdate)
This approach has following advantages:
You will have to type less as the select() does not require variable names to be enclosed within quotes.
In case your data frame has more than 4 variables, you can utilize select helper functions such as starts_with(), ends_with(), etc. to select multiple columns without having to name each column and rearrange them with much ease.
Relevance Note: In response to some users (myself included) that would like to swap columns without having to specify every column, I wrote this answer up.
TL;DR: A one-liner for numerical indices is provided herein and a function for swapping exactly 2 nominal and numerical indices at the end, neither using imports, that will correctly swap any two columns in a data frame of any size is provided. A function that allows the reassignment of an arbitrary number of columns that may cause unavoidable superfluous swaps if not used carefully is also made available (read more & get functions in Summary section)
Preliminary Solution
Suppose you have some huge (or not) data frame, DF, and you only know the indices of the two columns you want to swap, say 1 < n < m < length(DF). (Also important is that your columns are not adjacent, i.e. |n-m| > 1 which is very likely to be the case in our "huge" data frame but not necessarily for smaller ones; work-arounds for all degenerate cases are provided at the end).
Because it is huge, there are a ton of columns and you don't want to have to specify every other column by hand, or it isn't huge and you're just lazy someone with fine taste in coding, either way, this one-liner will do the trick:
DF <- DF[ c( 1:(n-1), m, (n+1):(m-1), n, (m+1):length(DF) ) ]
Each piece works like this:
1:(n-1) # This keeps every column before column `n` in place
m # This places column `m` where column `n` was
(n+1):(m-1) # This keeps every column between the two in place
n # This places column `n` where column `m` was
(m+1):length(DF) # This keeps every column after column `m` in place
Generalizing for Degenerates
Because of how the : operator works, i.e. allowing "backwards-ranges" like this,
> 10:0
[1] 10 9 8 7 6 5 4 3 2 1 0
we have to be careful about our choices and placements of n and m, hence our previous restrictions. For instance, n < m doesn't lose us any generality (one of the columns has to be before the other one if they are different), however, it means we do need to be careful about which goes where in our line of code. We can make it so that we don't have to check this condition with the following modification:
DF <- DF[ c( 1:(min(n,m)-1), max(n,m), (min(n,m)+1):(max(n,m)-1), min(n,m), (max(n,m)+1):length(DF) ) ]
We have replaced every instance of n and m with min(n,m) and max(n,m) respectively, meaning that the correct ordering for our code will be preserved even in the case that m > n.
In the cases where min(n,m) == 1, max(n,m) == length(DF), both of those at the same time, and |n-m| == 1, we we will make some unreadable less aesthetic modifications involving if\else to forget about having to check if these are the case. Versions for where you know that one of these are the case, (i.e. you are always swapping some interior column with the first column, swapping some interior column with the last column, swapping the first and last columns, or swapping two adjacent columns), you can actually express these actions more succinctly because they usually just require omitting parts from our restricted case:
# Swapping not the last column with the first column
# We just got rid of 1:(min(n,m)-1) because it would be invalid and not what we meant
# since min(n,m) == 1
# Now we just stick the other column right at the front
DF <- DF[ c( max(n,m), (min(n,m)+1):(max(n,m)-1), min(n,m), (max(n,m)+1):length(DF) ) ]
# Also equivalent since we know min(n,m) == 1, for the leftover index i
DF <- DF[ c( i, 2:(i-1), 1, (i+1):length(DF) ) ]
# Swapping not the first column with the last column
# Similarly, we just got rid of (max(n,m)+1):length(DF) because it would also be invalid
# and not what we meant since max(n,m) == length(DF)
# Now we just stick the other column right at the end
DF <- DF[ c( 1:(min(n,m)-1), max(n,m), (min(n,m)+1):(max(n,m)-1), min(n,m) ) ]
# Also equivalent since we know max(n,m) == length(DF), for the leftover index, say i
DF <- DF[ c( 1:(i-1), length(DF), (i+1):(length(DF)-1), i ) ]
# Swapping the first column with the last column
DF <- DF[ c( max(n,m), (min(n,m)+1):(max(n,m)-1), min(n,m) ) ]
# Also equivalent (for if you don't actually know the length beforehand, as assumed
# elsewhere)
DF <- DF[ c( length(DF), 2:(length(DF)-1), 1 ) ]
# Swapping two interior adjacent columns
# Here we drop the explicit swap on either side of our middle column segment
# This is actually enough because then the middle segment becomes a backwards range
# because we know that `min(n,m) + 1 = max(n,m)`
# The range is just an ordering of the two adjacent indices from largest to smallest
DF <- DF[ c( 1:(min(n,m)-1), (min(n,m)+1):(max(n,m)-1), (max(n,m)+1):length(DF) )]
"But!", I hear you saying, "What if more than one of these cases occur simultaneously, like did in the third version in the block above!?". Right, coding up versions for each case is an enormous waste of time if one wants to be able to "swap columns" in the most general sense.
Swapping any Two Columns
It will be easiest to generalize our code to cover all of the cases at the same time, because they all employ essentially the same strategy. We will use if\else to keep our code a one-liner:
DF <- DF[ if (n==m) 1:length(DF) else c( (if (min(n,m)==1) c() else 1:(min(n,m)-1) ), (if (min(n,m)+1 == max(n,m)) (min(n,m)+1):(max(n,m)-1) else c( max(n,m), (min(n,m)+1):(max(n,m)-1), min(n,m))), (if (max(n,m)==length(DF)) c() else (max(n,m)+1):length(DF) ) ) ]
That's totally unreadable and probably pretty unfriendly to anyone who might try to understand or recreate your code (including yourself), so better to box it up in a function.
# A function that swaps the `n` column and `m` column in the data frame DF
swap <- function(DF, n, m)
{
return (DF[ if (n==m) 1:length(DF) else c( (if (min(n,m)==1) c() else 1:(min(n,m)-1) ), (if (min(n,m)+1 == max(n,m)) (min(n,m)+1):(max(n,m)-1) else c( max(n,m), (min(n,m)+1):(max(n,m)-1), min(n,m))), (if (max(n,m)==length(DF)) c() else (max(n,m)+1):length(DF) ) ) ])
}
A more robust version that can also swap on column names and has semi-explanatory comments:
# Returns data frame object with columns `n` and `m` swapped
# `n` and `m` can be column names, numerical indices, or a heterogeneous pair of both
swap <- function(DF, n, m)
{
# Of course, first, we want to make sure that n != m,
# because if they do, we don't need to do anything
if (n==m) return(DF)
# Next, if either n or m is a column name, we want to get its index
# We assume that if they aren't column names, they are indices (integers)
n <- if (class(n)=="character" & is.na(suppressWarnings(as.integer(n)))) which(colnames(DF)==n) else as.integer(n)
m <- if (class(m)=="character" & is.na(supressWarnings(as.integer(m)))) which(colnames(DF)==m) else as.integer(m)
# Make sure each index is actually valid
if (!(1<=n & n<=length(DF))) stop( "`n` represents invalid index!" )
if (!(1<=m & m<=length(DF))) stop( "`m` represents invalid index!" )
# Also, for readability, lets go ahead and set which column is earlier, and which is later
earlier <- min(n,m)
later <- max(n,m)
# This constructs the first third of the indices
# These are the columns that, if any, come before the earlier column you are swapping
firstThird <- if ( earlier==1 ) c() else 1:(earlier-1)
# This constructs the last third of the the indices
# These are the columns, if any, that come after the later column you are swapping
lastThird <- if ( later==length(DF) ) c() else (later+1):length(DF)
# This checks if the columns to be swapped are adjacent and then constructs the
# secondThird accordingly
if ( earlier+1 == later )
{
# Here; the second third is a list of the two columns ordered from later to earlier
secondThird <- (earlier+1):(later-1)
}
else
{
# Here; the second third is a list of
# the later column you want to swap
# the columns in between
# and then the earlier column you want to swap
secondThird <- c( later, (earlier+1):(later-1), earlier)
}
# Now we assemble our indices and return our permutation of DF
return (DF[ c( firstThird, secondThird, lastThird ) ])
}
And, for ease of repatriation with less of the spatial cost, a comment-less version that checks index validity and can handle column names, i.e. does everything in pretty close to the smallest space it can (yes, you could vectorize, using ifelse(...), the two checks that get performed, but then you'd have to unpack the vector back into n,m or change how the final line is written):
swap <- function(DF, n, m)
{
n <- if (class(n)=="character" & is.na(suppressWarnings(as.integer(n)))) which(colnames(DF)==n) else as.integer(n)
m <- if (class(m)=="character" & is.na(suppressWarnings(as.integer(m)))) which(colnames(DF)==m) else as.integer(m)
if (!(1<=n & n<=length(DF))) stop( "`n` represents invalid index!" )
if (!(1<=m & m<=length(DF))) stop( "`m` represents invalid index!" )
return (DF[ if (n==m) 1:length(DF) else c( (if (min(n,m)==1) c() else 1:(min(n,m)-1) ), (if (min(n,m)+1 == max(n,m)) (min(n,m)+1):(max(n,m)-1) else c( max(n,m), (min(n,m)+1):(max(n,m)-1), min(n,m))), (if (max(n,m)==length(DF)) c() else (max(n,m)+1):length(DF) ) ) ])
}
Permutations (or How to Do Specifically What the Question Asked and More!)
With our swap function in tow, we can try to actually do what the original question asked. The easiest way to do this, is to build a function that utilizes the really cool power that comes with a choice of heterogeneous arguments. Create a mapping:
mapping <- data.frame( "piglet" = 1, "ssire" = 2, "dam" = 3, "tdate" = 4)
In the case of the original question, these are all of the columns in our original data frame, but we will build a function where this doesn't have to be the case:
# A function that takes two data frames, one with actual data: DF, and the other with a
# rearrangement of the columns: R
# R must be structured so that colnames(R) is a subset of colnames(DF)
# Alternatively, R can be structured so that 1 <= as.integer(colnames(R)) <= length(DF)
# Further, 1 <= R$column <= length(DF), and length(R$column) == 1
# These structural requirements on R are not checked
# This is for brevity and because most likely R has been created specifically for use with
# this function
rearrange <- function(DF, R)
{
for (col in colnames(R))
{
DF <- swap(DF, col, R[col])
}
return (DF)
}
Wait, that's it? Yup. This will swap every column name to the appropriate placement. The power for such simplicity comes from swap taking heterogeneous arguments meaning we can specify the moving column name that we want to put somewhere, and so long as we only ever try to put one column in each position (which we should), once we put that column where it belongs, it won't move again. This means that even though it seems like later swaps could undo previous placements, the heterogeneous arguments make certain that won't happen, and so additionally, the order of the columns in our mapping also doesn't matter. This is a really nice quality because it means that we aren't kicking this whole "organizing the data" issue down the road too much. You only have to be able to determine which placement you want to send each column you want to move to.
Ok, ok, there is a catch. If you don't reassign the entire data frame when you do this, then you have superfluous swaps that occur, meaning that if you re-arrange over a subset of columns that isn't "closed", i.e. not every column name has an index that is represented in the rearrangement, then other columns that you didn't explicitly say to move may get moved to other places they don't exactly belong. This can be handled by creating your mapping very carefully, or simply using numerical indices mapping to other numerical indices. In the latter case, this doesn't solve the issue, but it makes more explicit what swaps are taking place and in what order so planning the rearrangement is more explicit and thus less likely to lead to problematic superfluous swaps.
Summary
You can use the swap function that we built to successfully swap exactly two columns or the rearrange function with a "rearrangement" data frame specifying where to send each column name you want to move. In the case of the rearrange function, if any of the placements chosen for each column name are not already occupied by one of the specified columns (i.e. not in colnames(R)), then superfluous swaps can and are very likely to occur (The only instance they won't is when every superfluous swap has a partner superfluous swap that undoes it before the end. This is, as stated, very unlikely to happen by accident, but the mapping can be structured to accomplish this outcome in practice).
swap <- function(DF, n, m)
{
n <- if (class(n)=="character" & is.na(suppressWarnings(as.integer(n)))) which(colnames(DF)==n) else as.integer(n)
m <- if (class(m)=="character" & is.na(suppressWarnings(as.integer(m)))) which(colnames(DF)==m) else as.integer(m)
if (!(1<=n & n<=length(DF))) stop( "`n` represents invalid index!" )
if (!(1<=m & m<=length(DF))) stop( "`m` represents invalid index!" )
return (DF[ if (n==m) 1:length(DF) else c( (if (min(n,m)==1) c() else 1:(min(n,m)-1) ), (if (min(n,m)+1 == max(n,m)) (min(n,m)+1):(max(n,m)-1) else c( max(n,m), (min(n,m)+1):(max(n,m)-1), min(n,m))), (if (max(n,m)==length(DF)) c() else (max(n,m)+1):length(DF) ) ) ])
}
rearrange <- function(DF, R)
{
for (col in colnames(R))
{
DF <- swap(DF, col, R[col])
}
return (DF)
}
I quickly wrote a function that takes a vector v and column indexes a and b which you want to swap.
swappy = function(v,a,b){ # where v is a dataframe, a and b are the columns indexes to swap
name = deparse(substitute(v))
helpy = v[,a]
v[,a] = v[,b]
v[,b] = helpy
name1 = colnames(v)[a]
name2 = colnames(v)[b]
colnames(v)[a] = name2
colnames(v)[b] = name1
assign(name,value = v , envir =.GlobalEnv)
}
I was using the function by Khôra Willis, which is helpful. But I encountered an error. I tried to make corrections. Here is R code that finally works. The arguments n and m could either be column names or column numbers in data frame DF.
require(tidyverse)
swap <- function(DF, n, m)
{
if (class(n)=="character") n <- which(colnames(DF)==n)
if (class(m)=="character") m <- which(colnames(DF)==m)
p <- NCOL(DF)
if (!(1<=n & n<=p)) stop("`n` represents invalid index!")
if (!(1<=m & m<=p)) stop("`m` represents invalid index!")
index <- 1:p
index[n] <- m; index[m] <- n
DF0 <- DF %>% select(all_of(index))
return(DF0)
}

Resources