Collapsing every two columns into a final set of two columns - r

I have the below example_df, which has 4 "sets" of columns, each set has two columns in it. I essentially want a quick way to take every set of two columns and move the data into a resulting two columns (shown below in result_df, that is what I want to end up with). Any ideas on how to automate this?
# Reproducible toy data: four "sets" of two columns, each set filled on its
# own block of six rows and NA everywhere else.
set.seed(20)
example_df <- data.frame(
  test1 = c(rnorm(6), rep(NA, 18)),
  test2 = c(rnorm(6), rep(NA, 18)),
  test3 = c(rep(NA, 6), rnorm(6), rep(NA, 12)),
  test4 = c(rep(NA, 6), rnorm(6), rep(NA, 12)),
  test5 = c(rep(NA, 12), rnorm(6), rep(NA, 6)),
  test6 = c(rep(NA, 12), rnorm(6), rep(NA, 6)),
  test7 = c(rep(NA, 18), rnorm(6)),
  test8 = c(rep(NA, 18), rnorm(6))
)
# Desired result: the first column of every set stacked into total1 and the
# second column of every set stacked into total2.
result_df <- data.frame(
  total1 = c(example_df$test1[1:6], example_df$test3[7:12],
             example_df$test5[13:18], example_df$test7[19:24]),
  total2 = c(example_df$test2[1:6], example_df$test4[7:12],
             example_df$test6[13:18], example_df$test8[19:24])
)

# Logical mask that is TRUE for the odd-numbered columns (1st, 3rd, ...).
# seq_len() is used so that a zero-column data frame yields an empty mask.
odd_cols <- seq_len(ncol(example_df)) %% 2 == 1
# Stack the non-missing values of a group of columns into one plain vector,
# column by column. lapply() + unlist() is used instead of apply(): apply()
# only returns a matrix when every column keeps the same number of values and
# returns a list otherwise, which as.vector() would not flatten.
stack_non_na <- function(cols) {
  unlist(lapply(cols, function(x) x[!is.na(x)]), use.names = FALSE)
}
result_df <- data.frame(
  total1 = stack_non_na(example_df[, odd_cols, drop = FALSE]),
  total2 = stack_non_na(example_df[, !odd_cols, drop = FALSE])
)

Here are two options to create the expected output.
1) We create a 2 column data.frame by subsetting the alternate columns of 'example_df' (using logical index), unlist and remove the NAs
# Stack every other column into one long vector. The length-2 logical index
# is recycled across the columns, so c(TRUE, FALSE) picks the 1st, 3rd, ...
# columns and c(FALSE, TRUE) picks the 2nd, 4th, ...
total1 <- unlist(example_df[c(TRUE, FALSE)], use.names = FALSE)
total2 <- unlist(example_df[c(FALSE, TRUE)], use.names = FALSE)
# Drop a row when either member of the pair is missing. Filtering both
# vectors with the same mask keeps the pairs aligned and of equal length,
# which separate na.omit() calls on each vector would not guarantee.
keep <- complete.cases(total1, total2)
d1 <- data.frame(total1 = total1[keep], total2 = total2[keep])
#checking with the OP's output
all.equal(d1, result_df, check.attributes=FALSE)
#[1] TRUE
Or in a single step
# Bind each odd/even column pair side by side, stack the resulting two-column
# matrices on top of each other, then drop the rows that contain NA.
na.omit(do.call(rbind, mapply(cbind, example_df[c(TRUE, FALSE)], example_df[c(FALSE, TRUE)], SIMPLIFY = FALSE)))
2) Loop through the sequence of columns in a list, subset the 'example_df', rbind the list elements with rbindlist and remove the NAs
library(data.table)
# One two-column slice per pair (columns 1-2, 3-4, ...), stacked with
# rbindlist(); the stacked table carries the first slice's column names, so
# complete rows are selected via test1/test2.
rbindlist(lapply(seq(1, ncol(example_df), by = 2), function(first_col) {
  example_df[c(first_col, first_col + 1)]
}))[complete.cases(test1, test2)]

Related

Remove columns with certain column name patterns in multiple dataframes in R

I have >100 dataframes loaded into R. I want to remove all the columns from all data frames containing a certain pattern, in the example case below "abc".
# Three example data frames; each has exactly one column whose name
# contains "abc".
df1 <- data.frame(abc_1 = rep(3, times = 5), b = seq(from = 1, to = 5, by = 1), c = letters[1:5])
df2 <- data.frame(d = rep(5, times = 5), e_abc = seq(from = 2, to = 6, by = 1), f = letters[6:10])
df3 <- data.frame(g = rep(5, times = 5), h = seq(from = 2, to = 6, by = 1), i_a_abc = letters[6:10])
I would thus like to remove the column abc_1 in df1, e_abc in df2 and i_a_abc in df3. How could this be done?
Do all of your dataframes start with or contain a shared string (e.g., df)? If yes, then it might be easier to put all your dataframes in a list by using that shared string and then apply the function to remove the abc columns in every dataframe in that list.
You can then read your dataframes back into your environment with list2env(), but it probably is in your interest to keep everything in a list for convenience.
library(dplyr)
df1 <- data.frame(`abc_1` = rep(3, 5), `b` = seq(1, 5, 1), `c` = letters[1:5])
df2 <- data.frame(`d` = rep(5, 5), `e_abc` = seq(2, 6, 1), `f` = letters[6:10])
df3 <- data.frame(`g` = rep(5, 5), `h` = seq(2, 6, 1), `i_a_abc` = letters[6:10])
# Names of all global objects whose name contains the shared string "df".
dfpattern <- grep("df", names(.GlobalEnv), value = TRUE)
# Fetch those objects and keep only the ones that really are data frames.
# The name match alone also catches helpers such as `dfpattern` and `dflist`
# once they exist (e.g. when this snippet is run a second time), and select()
# would fail on them.
dflist <- Filter(is.data.frame, mget(dfpattern, envir = .GlobalEnv))
# Drop every column whose name contains "abc" from each data frame.
dflist <- lapply(dflist, function(x) select(x, !contains("abc")))
# Write the cleaned data frames back over the originals in the global
# environment.
list2env(dflist, envir = .GlobalEnv)

Populate R matrix from vector based on combined col- and row-name

I am trying to populate a matrix with values from a vector, where the name of each value is a combination of the matrix' col- and row names.
set.seed(1001)
# Empty (all-NA) 2 x 3 matrix with named rows and columns.
mat <- matrix(
  NA,
  nrow = 2,
  ncol = 3,
  dimnames = list(c("one", "two"), c("house", "tree", "flower"))
)
# Five random values, each named "<row name>_<column name>".
vec <- setNames(
  sample.int(10, 5),
  c("one_house", "one_flower", "two_tree", "two_house", "one_tree")
)
The output I'm looking for is this:
matrix(c(4, 7, 6, 2, 3, NA), nrow = 2, ncol = 3, dimnames = list(c("one", "two"), c("house", "tree", "flower")))
Any help with this would be greatly appreciated.
We can split the names of the vector by _ and do an assignment based on row/column name attributes in the mat
# Split each name into its (row name, column name) parts and bind them into
# a two-column character matrix; indexing `mat` with that matrix addresses
# cells by their dimnames.
name_parts <- strsplit(names(vec), "_")
mat[do.call(rbind, name_parts)] <- vec
-output
mat
# house tree flower
#one 7 9 3
#two 8 10 NA
Or, if we don't have a dataset to replace, we can construct a data.frame and use xtabs to reshape the data
# Build a two-column data frame (V1 = row label, V2 = column label) from the
# split names and cross-tabulate the values of `vec` over it.
xtabs(
  vec ~ V1 + V2,
  as.data.frame(do.call(rbind, strsplit(names(vec), "_")))
)

How to efficiently find the overlap between two data tables of sequence coordinates in R?

I have two large data tables with the coordinates of different sequences. For example:
library(data.table)
# Two tables of inclusive start/end coordinates, two intervals per category.
dt1 <- data.table(
  cat = rep(c("A", "B"), each = 2),
  start = c(1, 4, 2, 15),
  end = c(6, 9, 5, 20)
)
dt2 <- data.table(
  cat = rep(c("A", "B"), each = 2),
  start = c(2, 1, 10, 17),
  end = c(7, 3, 12, 20)
)
I need to create a data table of the coordinates for the overlapping sequences (ie the integers that occur in the sequences given in both data tables, for each category). I can currently do this using a for loop. For example:
# Vectorised seq(): one integer sequence per (from, to) pair.
seq2 <- Vectorize(seq.default, vectorize.args = c("from", "to"))
# Categories are computed once instead of on every iteration, and the result
# list is preallocated rather than grown.
cats <- unique(dt1$cat)
out_list <- vector("list", length(cats))
for (i in seq_along(cats)) {
  this_cat <- cats[i]
  sub1 <- dt1[cat == this_cat]
  sub2 <- dt2[cat == this_cat]
  # Every integer position covered by at least one interval of each table.
  vec1 <- unique(unlist(seq2(from = sub1$start, to = sub1$end)))
  vec2 <- unique(unlist(seq2(from = sub2$start, to = sub2$end)))
  # Positions covered in both tables, in increasing order.
  vec <- sort(intersect(vec1, vec2))
  # No shared positions for this category: leave the slot NULL (rbindlist()
  # skips it) instead of failing in the grouping step below.
  if (length(vec) == 0L) {
    next
  }
  # A new run starts wherever consecutive positions differ by more than 1;
  # each run collapses to its start/end coordinates.
  output <- data.table(V1 = vec)[
    ,
    .(start = min(V1), end = max(V1)),
    by = .(grp = cumsum(c(TRUE, diff(V1) > 1)))
  ]
  output$grp <- NULL
  output$cat <- this_cat
  out_list[[i]] <- output
}
# Stack the per-category results; NULL entries are ignored.
output_dt <- rbindlist(out_list)
However, the data sets I need to apply this to are very large (both in the number of rows and the size of the vectors). Is anyone able to suggest a way to improve performance?
Thanks
You could (a) convert your start/end variables to a sequence, (b) do an inner join, (c) convert back to start/end.
library(data.table)
dt1 <- data.table(cat = c(rep("A", 2), rep("B", 2)),
                  start = c(1, 4, 2, 15),
                  end = c(6, 9, 5, 20))
dt2 <- data.table(cat = c(rep("A", 2), rep("B", 2)),
                  start = c(2, 1, 10, 17),
                  end = c(7, 3, 12, 20))
# convert to sequence: one row per (cat, position) covered by an interval.
# unique() removes positions covered by several intervals of the same table,
# so the join below cannot multiply rows.
dt1 <- unique(dt1[, .(sequence = unlist(Map(seq, start, end))), by = cat])
dt2 <- unique(dt2[, .(sequence = unlist(Map(seq, start, end))), by = cat])
# inner join: keep the positions present in both tables
overlap <- merge(dt1, dt2, by = c("cat", "sequence"))
overlap <- overlap[order(cat, sequence)]
# convert to start/end
# Within a category, `sequence - row index` stays constant along a run of
# consecutive positions and increases at every gap, so grouping on it yields
# one start/end row per contiguous run rather than one per category.
overlap <- overlap[, .(start = min(sequence), end = max(sequence)),
                   by = .(cat, run = sequence - seq_along(sequence))]
overlap <- overlap[, .(cat, start, end)]
# result
overlap
#> cat start end
#> 1: A 1 7
#> 2: B 17 20

Uniquefy duplicate column names in R [duplicate]

This question already has an answer here:
How to make a unique set of names from a vector of strings?
(1 answer)
Closed 5 years ago.
So I have loaded an Excel file which contains duplicate column names. I would like to add a suffix each time a column name is repeated. So:
# Data frame built from repeated column names (as loaded from the Excel file).
problem_df <- data.frame(
  A = rep(1, times = 5),
  B = rep(2, times = 5),
  A = rep(3, times = 5),
  B = rep(4, times = 5),
  A = rep(5, times = 5)
)
# Desired naming: repeated names get a numeric "_<n>" suffix.
solution_df <- data.frame(
  A = rep(1, times = 5),
  B = rep(2, times = 5),
  A_1 = rep(3, times = 5),
  B_1 = rep(4, times = 5),
  A_2 = rep(5, times = 5)
)
Or the column name suffixes can be '_2' and '_3'.
We can do this with make.unique, which also has a sep argument
# Repeated names receive a running "_<n>" suffix; first occurrences are kept.
make.unique(
  c("A", "B", "A", "B", "A"),
  sep = "_"
)
#[1] "A" "B" "A_1" "B_1" "A_2"
In our 'problem_df', the data.frame call uses check.names = TRUE by default, which calls make.names, which in turn calls make.unique; the default separator there is "." (sep = ".").
On checking the data.frame, it is in the code block that starts from line 124
if (check.names) {
if (fix.empty.names)
vnames <- make.names(vnames, unique = TRUE) ###
else {
nz <- nzchar(vnames)
vnames[nz] <- make.names(vnames[nz], unique = TRUE) ###
}
}
names(value) <- vnames
One option is to use check.names = FALSE and then assign the column names with make.unique and sep="_"
# Keep the duplicated names exactly as given (check.names = FALSE), then
# de-duplicate them with an underscore separator.
problem_df <- data.frame(
  A = rep(1, 5),
  B = rep(2, 5),
  A = rep(3, 5),
  B = rep(4, 5),
  A = rep(5, 5),
  check.names = FALSE
)
colnames(problem_df) <- make.unique(colnames(problem_df), sep = "_")
Or using sub assuming that the dataset object is created with the .\\d+ as column names for duplicate names
# Rewrite only a trailing ".<digits>" suffix (the form added when duplicate
# names are made unique, e.g. "A.1" -> "A_1"). Anchoring the pattern at the
# end leaves dots elsewhere in a name untouched.
# NOTE(review): a genuine column name that itself ends in ".<digits>" would
# also be rewritten - confirm no such names exist.
sub("\\.(\\d+)$", "_\\1", names(problem_df))
#[1] "A" "B" "A_1" "B_1" "A_2"

pulling up matching rows from a matrix using dplyr

Suppose I have the following:
# 20 rows: ids 1..5 repeated four times each, with normally distributed
# values (sd 1) centred on 0, 10, 20, 30 and 40 for ids 1..5 respectively.
myDF <- data.frame(
  Id = rep(1:5, each = 4),
  values = rnorm(20, mean = rep(c(0, 10, 20, 30, 40), each = 4), sd = 1)
)
# Five ids drawn with replacement, so an id may occur more than once.
idVector <- sample.int(5, size = 5, replace = TRUE)
If my `idVector = c(4, 4, 3, 2, 1)`, I want to pull all the rows with Id 4, then Id 4 again, then 3, then 2, then 1.
I can do it using the following:
# For every id in idVector, in order and including repeats, pull the
# matching rows of myDF and stack the pieces.
do.call(rbind, lapply(idVector, function(id) {
  myDF[myDF$Id == id, ]
}))
Is there a neater way to do it using dplyr or plyr?
With dplyr
library(dplyr)
# Join a one-column lookup table of the requested ids against myDF. The
# result follows the order of idVector, and repeated ids repeat their rows.
# - inner_join() rather than left_join(): an id with no match in myDF yields
#   no rows, instead of a row of NA values.
# - by = "Id" names the key explicitly rather than relying on the join
#   guessing it from the shared column names.
# NOTE(review): recent dplyr versions may warn about a many-to-many match
# here, since ids repeat on both sides - confirm and, if supported, declare
# it via the `relationship` argument.
inner_join(data.frame(Id = idVector), myDF, by = "Id")

Resources