Remove columns with certain column name patterns in multiple dataframes in R - r

I have >100 dataframes loaded into R. I want to remove all the columns from all data frames containing a certain pattern, in the example case below "abc".
# Three small example data frames; each one has exactly one column whose
# name contains "abc" (abc_1, e_abc and i_a_abc respectively).
df1 <- data.frame(
  abc_1 = rep(3, times = 5),
  b = seq(from = 1, to = 5, by = 1),
  c = letters[1:5]
)
df2 <- data.frame(
  d = rep(5, times = 5),
  e_abc = seq(from = 2, to = 6, by = 1),
  f = letters[6:10]
)
df3 <- data.frame(
  g = rep(5, times = 5),
  h = seq(from = 2, to = 6, by = 1),
  i_a_abc = letters[6:10]
)
I would thus like to remove the column abc_1 in df1, e_abc in df2 and i_a_abc in df3. How could this be done?

Do all of your dataframes start with or contain a shared string (e.g., df)? If yes, then it might be easier to put all your dataframes in a list by using that shared string and then apply the function to remove the abc columns in every dataframe in that list.
You can then read your dataframes back into your environment with list2env(), but it probably is in your interest to keep everything in a list for convenience.
library(dplyr)
df1 <- data.frame(`abc_1` = rep(3, 5), `b` = seq(1, 5, 1), `c` = letters[1:5])
df2 <- data.frame(`d` = rep(5, 5), `e_abc` = seq(2, 6, 1), `f` = letters[6:10])
df3 <- data.frame(`g` = rep(5, 5), `h` = seq(2, 6, 1), `i_a_abc` = letters[6:10])
# Names of all global objects that contain the shared string "df".
dfpattern <- grep("df", names(.GlobalEnv), value = TRUE)
# mget() already returns a named list, so no do.call("list", ...) is needed.
# Keep only real data frames: helper objects such as `dfpattern` and `dflist`
# also match "df" once they exist (e.g. when this script is run a second
# time), and select() would fail on them.
dflist <- Filter(is.data.frame, mget(dfpattern, envir = .GlobalEnv))
# Drop every column whose name contains "abc" from each data frame.
dflist <- lapply(dflist, function(x) select(x, !contains("abc")))
# Write the cleaned data frames back over the originals; invisible() stops the
# returned environment from being printed.
invisible(list2env(dflist, envir = .GlobalEnv))

Related

How to efficiently find the overlap between two data tables of sequence coordinates in R?

I have two large data tables with the coordinates of different sequences. For example:
library(data.table)
# Two example tables of closed integer intervals [start, end], two intervals
# per category ("A" and "B") in each table.
dt1 <- data.table(
  cat = rep(c("A", "B"), each = 2),
  start = c(1, 4, 2, 15),
  end = c(6, 9, 5, 20)
)
dt2 <- data.table(
  cat = rep(c("A", "B"), each = 2),
  start = c(2, 1, 10, 17),
  end = c(7, 3, 12, 20)
)
I need to create a data table of the coordinates for the overlapping sequences (ie the integers that occur in the sequences given in both data tables, for each category). I can currently do this using a for loop. For example:
# Vectorised seq(): builds one integer run per (from, to) pair.
seq2 <- Vectorize(seq.default, vectorize.args = c("from", "to"))
# The set of categories does not change inside the loop, so compute it once.
cats <- unique(dt1$cat)
# Preallocate one slot per category instead of growing the list.
out_list <- vector("list", length(cats))
# seq_along() yields an empty loop for zero categories (1:0 would not).
for (i in seq_along(cats)) {
  sub1 <- dt1[cat == cats[i]]
  sub2 <- dt2[cat == cats[i]]
  # All integer positions covered by this category in each table.
  vec1 <- unique(unlist(c(seq2(from = sub1$start, to = sub1$end))))
  vec2 <- unique(unlist(c(seq2(from = sub2$start, to = sub2$end))))
  # Positions present in both tables.
  vec <- intersect(vec1, vec2)
  # Skip categories without any overlap: the run-grouping step below builds a
  # length-1 grouping vector even for zero rows, which does not match the data.
  if (length(vec) > 0) {
    vec_dt <- data.table(V1 = vec)
    # Collapse consecutive positions back into start/end ranges: a new group
    # starts wherever the gap to the previous position is larger than 1.
    output <- vec_dt[order(V1),
      .(start = min(V1), end = max(V1)),
      by = .(grp = rleid(c(0, cumsum(diff(V1) > 1))))
    ]
    output$grp <- NULL
    output$cat <- cats[i]
    out_list[[i]] <- output
  }
  print(i)
}
# rbindlist() ignores the NULL slots left by categories without overlap.
output_dt <- rbindlist(out_list)
However, the data sets I need to apply this to are very large (both in the number of rows and the size of the vectors). Is anyone able to suggest a way to improve performance?
Thanks
You could (a) convert your start/end variables to a sequence, (b) do an inner join, (c) convert back to start/end.
library(data.table)
dt1 <- data.table(cat = c(rep("A", 2), rep("B", 2)),
                  start = c(1, 4, 2, 15),
                  end = c(6, 9, 5, 20))
dt2 <- data.table(cat = c(rep("A", 2), rep("B", 2)),
                  start = c(2, 1, 10, 17),
                  end = c(7, 3, 12, 20))
# convert to sequence
# Each input row is expanded into one row per integer position. `row_id` puts
# every input row in its own group so that start:end only ever sees scalars;
# it is named explicitly (rather than relying on an auto-generated column
# name) and removed once the expansion is done.
dt1 <- dt1[, .(sequence = start:end), by = .(cat, row_id = seq_len(nrow(dt1)))][
  , row_id := NULL]
dt2 <- dt2[, .(sequence = start:end), by = .(cat, row_id = seq_len(nrow(dt2)))][
  , row_id := NULL]
# inner join + unique
# De-duplicate before joining: positions covered by several intervals of the
# same table would otherwise be multiplied by the join.
overlap <- merge(unique(dt1), unique(dt2), by = c("cat", "sequence"))
# convert to start/end
# Grouping by `cat` alone would merge separate overlap regions of a category
# into one min/max range. For sorted, unique positions the difference between
# a position and its row index is constant within a run of consecutive
# integers and changes at every gap, so it identifies each run.
overlap <- overlap[order(cat, sequence),
  .(start = min(sequence), end = max(sequence)),
  by = .(cat, run = sequence - seq_along(sequence))][
  , .(cat, start, end)]
# result
overlap
#> cat start end
#> 1: A 1 7
#> 2: B 17 20

Operations on data frames in the loop

I have three data frames and I would like to perform some operations on them in a loop (transpose and assign names to columns). The problem with my code is that the data frames are not updated and the result is a completely new data frame.
# Three identical two-column data frames used as the working example.
df1 = data.frame(A = c(1, 2), B = c(1, 2))
df2 = data.frame(A = c(1, 2), B = c(1, 2))
df3 = data.frame(A = c(1, 2), B = c(1, 2))
# Character vector holding the NAMES of the data frames, not the objects.
names = c("df1", "df2", "df3")
# The loop variable `df` takes each name in turn ("df1", "df2", "df3").
for(df in names) {
# NOTE(review): get() is given the whole `names` vector here instead of the
# current loop value `df`; presumably get(df) was intended -- confirm.
df = get(names)
# Transpose; t() on a data frame returns a matrix.
df = t(df)
# Use the first row of the transposed matrix as column names, then remove it.
colnames(df) = df[1, ]
df = df[-1, ]
# The result is only stored in the variable `df`; nothing is written back to
# df1/df2/df3, so the original objects are never updated.
}
My recommendation is to place all of the data frames in a list and then work with the list instead of the individual data frames.
df1 <- data.frame(A = c(1, 2), B = c(1, 2))
df2 <- data.frame(A = c(1, 2), B = c(1, 2))
df3 <- data.frame(A = c(1, 2), B = c(1, 2))
# Keep the data frames themselves (not their names) in a list.
names <- list(df1, df2, df3)
# Transform every element and store the results back in the list.
names <- lapply(names, function(df) {
  # t() on a data frame returns a matrix.
  df <- t(df)
  # The first row of the transposed matrix becomes the column names ...
  colnames(df) <- df[1, ]
  # ... and is then removed. drop = FALSE keeps the result a matrix even when
  # only one row is left; without it the column structure collapses into a
  # plain named vector.
  df[-1, , drop = FALSE]
})
Of course, the list "names" then holds the updated data frames, and the original data frames are untouched.
EDIT
To address your comment about reducing data redundancy, I tweaked your code and used the assign() function to update the data frames in the global environment.
df1 <- data.frame(A = c(1, 2), B = c(1, 2))
df2 <- data.frame(A = c(1, 2), B = c(1, 2))
df3 <- data.frame(A = c(1, 2), B = c(1, 2))
# Names of the objects to update in place.
names <- c("df1", "df2", "df3")
for (name in names) {
  # Look up the object that belongs to the current name.
  df <- get(name)
  # t() on a data frame returns a matrix.
  df <- t(df)
  # The first row of the transposed matrix becomes the column names ...
  colnames(df) <- df[1, ]
  # ... and is then removed. drop = FALSE keeps the result a matrix even when
  # only one row is left instead of collapsing it into a plain vector.
  df <- df[-1, , drop = FALSE]
  # Overwrite the original object with the transformed one.
  assign(name, df)
}

Collapsing every two columns into a final set of two columns

I have the below example_df, which has 4 "sets" of columns, each set has two columns in it. I essentially want a quick way to take every set of two columns and move the data into a resulting two columns (shown below in result_df, that is what I want to end up with). Any ideas on how to automate this?
# Fix the RNG seed so the random values below are reproducible.
set.seed(20)
# 24-row example with four "sets" of two columns each. Every column holds six
# random values and is NA everywhere else; both columns of a set use the same
# six rows (rows 1-6, 7-12, 13-18 and 19-24 for sets 1 to 4).
example_df <- data.frame("test1" = c(rnorm(6), rep(NA, 18)),
"test2" = c(rnorm(6), rep(NA, 18)),
"test3" = c(rep(NA, 6), rnorm(6), rep(NA, 12)), "test4" = c(rep(NA, 6), rnorm(6), rep(NA, 12)),
"test5" = c(rep(NA, 12), rnorm(6), rep(NA, 6)), "test6" = c(rep(NA, 12), rnorm(6), rep(NA, 6)),
"test7" = c(rep(NA, 18), rnorm(6)), "test8" = c(rep(NA, 18), rnorm(6)))
# Desired output, built by hand: total1 stacks the filled rows of columns
# 1, 3, 5 and 7; total2 stacks the filled rows of columns 2, 4, 6 and 8.
result_df <- data.frame("total1" = c(example_df[c(1:6),1], example_df[c(7:12),3], example_df[c(13:18),5], example_df[c(19:24),7]),
"total2" = c(example_df[c(1:6),2], example_df[c(7:12),4], example_df[c(13:18),6], example_df[c(19:24),8]))
# Logical mask that is TRUE for the odd-positioned columns (1, 3, 5, ...).
odd_cols <- as.logical(seq_len(ncol(example_df)) %% 2)
# Remove the NAs column by column and concatenate what is left. Working on
# the columns directly with lapply() avoids apply(), which first coerces the
# data frame to a matrix and only returns a matrix when every column keeps
# the same number of values.
result_df <- data.frame(
  total1 = unlist(
    lapply(example_df[odd_cols], function(col) col[!is.na(col)]),
    use.names = FALSE
  ),
  total2 = unlist(
    lapply(example_df[!odd_cols], function(col) col[!is.na(col)]),
    use.names = FALSE
  )
)
Here are two options to create the expected output.
1) We create a 2 column data.frame by subsetting the alternate columns of 'example_df' (using logical index), unlist and remove the NAs
# Flatten the odd-positioned columns into one vector and the even-positioned
# columns into another, dropping the NAs from each.
total1 <- na.omit(unlist(example_df[rep_len(c(TRUE, FALSE), ncol(example_df))]))
total2 <- na.omit(unlist(example_df[rep_len(c(FALSE, TRUE), ncol(example_df))]))
d1 <- data.frame(total1, total2)
# Discard the row names inherited from the flattened vectors.
rownames(d1) <- NULL
# Compare against the OP's expected output.
all.equal(d1, result_df, check.attributes = FALSE)
#[1] TRUE
Or in a single step
# Pair every odd-positioned column with the even-positioned column next to
# it, stack the resulting two-column pieces and drop the rows containing NA.
na.omit(
  do.call(
    what = rbind,
    args = Map(
      f = cbind,
      example_df[c(TRUE, FALSE)],
      example_df[c(FALSE, TRUE)]
    )
  )
)
2) Loop through the sequence of columns in a list, subset the 'example_df', rbind the list elements with rbindlist and remove the NAs
library(data.table)
# Cut the data frame into consecutive two-column pieces, stack the pieces
# (the result takes its column names, test1 and test2, from the first piece)
# and keep only the rows where both columns are filled.
rbindlist(
  lapply(
    seq(1, ncol(example_df), by = 2),
    function(first_col) example_df[c(first_col, first_col + 1)]
  )
)[(!is.na(test1)) & (!is.na(test2))]

pulling up matching rows from a matrix using dplyr

Suppose I have the following:
# Five ids with four random values each; the values of id k are drawn around
# a mean of (k - 1) * 10 with standard deviation 1.
myDF <- data.frame(
  Id = rep(1:5, each = 4),
  values = c(
    rnorm(4, mean = 0, sd = 1),
    rnorm(4, mean = 10, sd = 1),
    rnorm(4, mean = 20, sd = 1),
    rnorm(4, mean = 30, sd = 1),
    rnorm(4, mean = 40, sd = 1)
  )
)
# Five ids drawn with replacement, so an id may occur more than once.
idVector <- sample.int(5, size = 5, replace = TRUE)
If my `idVector` is `4, 4, 3, 2, 1`, I want to pull all the rows with Id 4, then Id 4 again, then 3, then 2, then 1.
I can do it using the following:
# For every id in idVector (in order, repeats included) take the matching
# rows of myDF, then stack the pieces into one data frame.
do.call(
  rbind,
  lapply(idVector, function(id) myDF[myDF$Id == id, ])
)
Is there a neater way to do it using dplyr or plyr?
With dplyr
library(dplyr)
# Join on Id explicitly rather than relying on the automatically detected
# common column, which also avoids the "Joining with ..." message. Each row of
# the left table pulls in all rows of myDF with that Id, so the result follows
# the order of idVector.
# NOTE(review): repeated ids on the left matching several rows on the right
# may trigger a many-to-many warning in recent dplyr versions -- confirm
# against the installed version.
left_join(data.frame(Id = idVector), myDF, by = "Id")

Apply subset function to a list of dataframes

I have a list of SpatialPolygonDataFrame that I can assimilate to dataframe like this:
# Three ten-row data frames that differ only in how often the value 2 occurs
# in column B (three, five and two times respectively).
df.1 <- data.frame(A = 1:10, B = c(1, rep(2, 3), 5:10))
df.2 <- data.frame(A = 1:10, B = c(1, rep(2, 5), 7:10))
df.3 <- data.frame(A = 1:10, B = c(1, rep(2, 2), 4:10))
list.df <- list(df.1, df.2, df.3)
I would like to get a list of a subset of each dataframe based on condition (list.df.sub is the result I am looking for):
# Keep only the rows whose B value differs from 2, one data frame at a time.
# which() keeps just the rows where the comparison is TRUE.
df.1.sub <- df.1[which(df.1$B != 2), ]
df.2.sub <- df.2[which(df.2$B != 2), ]
df.3.sub <- df.3[which(df.3$B != 2), ]
# Expected result: the three filtered data frames in one list.
list.df.sub <- list(df.1.sub, df.2.sub, df.3.sub)
I would like to apply directly my subset on list.df. I know that I have to use lapply function but don't know how?
This should work:
# Apply the same row filter (B different from 2) to every data frame in the
# list using logical row indexing.
lapply(list.df, function(one_df) {
  one_df[one_df$B != 2, ]
})
or with subset:
# Same filter, passing the condition on to subset() for every list element.
lapply(X = list.df, FUN = subset, subset = B != 2)
If you only want to subset one column, you can also use the "[[" function
# A list holding three copies of the built-in iris data set.
example_list <- rep(list(iris), 3)
# `[[` is an ordinary function, so it can be handed to lapply() to pull the
# "Species" column out of every element.
lapply(example_list, `[[`, "Species")

Resources