Rename colnames according to data.frame - r

I have several data.frames df1, df2, df3, ... .
df1 <- data.frame(var1 = c("a", "b", "c"),
var2 = c("a", "b", "c"),
var3 = c("a", "b", "c"),
var4 = c("a", "b", "c"))
df2 <- data.frame(var1 = c("a", "b", "c"),
var2 = c("a", "b", "c"),
var3 = c("a", "b", "c"),
var4 = c("a", "b", "c"))
df3 <- data.frame(var1 = c("a", "b", "c"),
var2 = c("a", "b", "c"),
var3 = c("a", "b", "c"),
var4 = c("a", "b", "c"))
df4 <- data.frame(var1 = c("a", "b", "c"),
var2 = c("a", "b", "c"),
var3 = c("a", "b", "c"),
var4 = c("a", "b", "c"))
And I have a data.frame rename_vars which tells mich which variables in which data.frame should be renamed.
rename_vars <- data.frame(df = c("df1", "df1", "df3"),
var = c("var1", "var3", "var1"),
rename_to = c("var1x", "var3y", "var1z"))
df var rename_to
1 df1 var1 var1x
2 df1 var3 var3y
3 df2 var1 var1z
For example, in df1 the variable var1 should be renamed into var1x and var3 should be called var3x. In data.frame df2 variable var1 should be called var1z and so forth.
But how can I automate this process of renaming variables in different data.frames according to rename_vars?
Thanks for help!

We split the 'rename_vars' data based on the 'df' column then loop over the list with map2 on the values from the names of the list (mget) and the 'lst1', and change the column names with rename_at. It is better to keep it in a list, but if we need to change the global individual objects, use list2env after naming the list ('out') with the object names
library(dplyr)
library(purrr)
lst1 <- split(rename_vars[-1], rename_vars$df)
out <- map2(mget(names(lst1)), lst1, ~ {
nm1 <- .y[[1]]
nm2 <- .y[[2]]
.x %>%
rename_at(vars(nm1), ~ nm2)})
list2env(out, .GlobalEnv)
-output
df1
# var1x var2 var3y var4
#1 a a a a
#2 b b b b
#3 c c c c
df3
# var1z var2 var3 var4
#1 a a a a
#2 b b b b
#3 c c c c
Or another option is !!! with rename
library(tibble)
lst1 <- split(as.list(deframe(rename_vars[3:2])), rename_vars$df)
list2env(map2(mget(names(lst1)), lst1, ~ .x %>%
rename(!!! .y)), .GlobalEnv)
Or using base R with a for loop and assign
for(i in seq_len(nrow(rename_vars))) {
tmp1 <- get(rename_vars$df[i])
i1 <- match(rename_vars$var[i], names(tmp1))
names(tmp1)[i1] <- rename_vars$rename_to[i]
assign(rename_vars$df[i], tmp1)
}

setnames from data.table
library('data.table')
for (dt in unique(rename_vars$df) ) {
df_rows <- (rename_vars$df == dt) # get row indices matching data frame name
old <- rename_vars$var[df_rows] # old names
new <- rename_vars$rename_to[df_rows] # new names
setDT(get(dt)) # convert to data table by reference
setnames(get(dt), old, new) # set names by reference
}
Output:
names(df1)
# [1] "var1x" "var2" "var3y" "var4"
names(df2)
# [1] "var1" "var2" "var3" "var4"
names(df3)
# [1] "var1z" "var2" "var3" "var4"

Related

How to merge two data frames and fill with different options based on coincidences

I have two data frame y and z
y <- data.frame(ID = c("A", "A", "A", "B", "B"), gene = c("a", "b", "c", "a", "c"))
z <- data.frame(A = c(2,6,3), B = c(8,4,9), C=c(1,6,2))
rownames(z) <- c("a", "b", "c")
So for y I have a table with patients ID and gene for each patients and in Z I have the same patients IDs in the first row and a list of genes with a specific value (which is not important here). The genes in y are in z, but in z there are genes that are not included in y.
What I want to do is to merge this frames and have something like this:
a b c
A 1 1 1
B 1 0 1
So for each patient, if the genes in z are also in y, fill with 1 and if not, fill with 0
I don't really know how to handle this, any ideas?
Thank you
I've made RE from your question (add this to your question next time):
y <- data.frame(ID = c("id_A", "id_A", "id_A", "id_B", "id_B"), gene = c("a", "b", "c", "a", "c"))
z <- data.frame(id_A = c(2,6,3), id_B = c(8,4,9), id_C=c(1,6,2))
rownames(z) <- c("a", "b", "c")
The idea here is to pivot_longer your tables so you can join than easily.
To do so, you first need to make your rownames into a field:
z <- tibble::rownames_to_column(z, "gene")
Then, you pivot longer you z table:
library(tidyr)
z_long <- pivot_longer(z, starts_with("id_"), names_to = "ID")
and join it with your y table:
library(dplyr)
table_join <- left_join(y, z_long)
Finally, you just have to calculate the frequencies:
table(table_join$ID, table_join$gene)
a b c
id_A 1 1 1
id_B 1 0 1

Exchange data.table columns with most prevalent value of columns

I have data
test = data.table(
a = c(1,1,3,4,5,6),
b = c("a", "be", "a", "c", "d", "c"),
c = rep(1, 6)
)
I wish to take the unique values of column a, store it in another data.table, and afterwards fill in the remaining columns with the most prevalent values of those remaining columns, such that my resulting data.table would be:
test2 = data.table(a = c(1,3,4,5,6), b = "a", c = 1)
Column be has equal amounts of "a" and "c", but it doesn't matter which is chosen in those cases.
Attempt so far:
test2 = unique(test, by = "a")
test2[, c("b", "c") := lapply(.SD, FUN = function(x){test2[, .N, by = x][order(-N)][1,1]}), .SDcols = c("b", "c")]
EDIT: I would preferrably like a generic solution that is compatible with a function where I specify the column to be "uniqued", and the rest of the columns are with the single most prevalent value. Hence my use of lapply and .SD =)
EDIT2: as #MichaelChirico points out, how do we keep the class. With the following data.table some of the solutions does not work, although solution of #chinsoon12 does work:
test = data.table(a = c(1,1,3,4,5,6),
b = c("a", "be", "a", "c", "d", "c"),
c = rep(1, 6),
d = as.Date("2019-01-01"))
Another option:
dtmode <- function(x) x[which.max(rowid(x))]
test[, .(A=unique(A), B=dtmode(B), C=dtmode(C))]
data:
test = data.table(
A = c(1,1,3,4,5,6),
B = c("a", "be", "a", "c", "d", "c"),
C = rep(1, 6)
)
Not a clean way to do this but it works.
test = data.frame(a = c(1,1,3,4,5,6), b = c("a", "be", "a", "c", "d", "c"), c = rep(1, 6))
a = unique(test$a)
b = tail(names(sort(table(test$b))), 1)
c = tail(names(sort(table(test$c))), 1)
test2 = cbind(a,b,c)
Output is like this:
> test2
a b c
[1,] "1" "c" "1"
[2,] "3" "c" "1"
[3,] "4" "c" "1"
[4,] "5" "c" "1"
[5,] "6" "c" "1"
>
#EmreKiratli is very close to what I would do:
test[ , c(
list(a = unique(a)),
lapply(.SD, function(x) as(tail(names(sort(table(x))), 1L), class(x)))
), .SDcols = !'a']
The as(., class(x)) part is because names in R are always character, so we have to convert back to the original class of x.
You might like this better in magrittr form since it's many nested functions:
library(magrittr)
test[ , c(
list(a = unique(a)),
lapply(.SD, function(x) {
table(x) %>% sort %>% names %>% tail(1L) %>% as(class(x))
})
), .SDcols = !'a']
I was able to make an OK solution, but if somebody can do it more elegantly, for example not going through the step of storting a list in refLevel below, please let me know! I'm very interested in learning data.table properly!
#solution:
test = data.table(a = c(1,1,3,4,5,6), b = c("a", "be", "a", "c", "d", "c"), c = rep(1, 6))
test2 = unique(test, by="a")
funPrev = function(x){unlist(as.data.table(x)[, .N, by=x][order(-N)][1,1], use.names = F)}
refLevel = lapply(test[, c("b", "c")], funPrev)
test2[, c("b", "c") := refLevel]
...and using a function (if anybody see any un-necessary step, please let me know):
genData = function(dt, var_unique, vars_prev){
data = copy(dt)
data = unique(data, by = var_unique)
funPrev = function(x){unlist(as.data.table(x)[, .N, by=x][order(-N)][1,1], use.names = F)}
refLevel = lapply(dt[, .SD, .SDcols = vars_prev], funPrev)
data[, (vars_prev) := refLevel]
return(data)
}
test2 = genData(test, "a", c("b", "c"))
Here's another variant which one might find less sophisticated, yet more readable. It's essentially chinsoon12's rowid approach generalized for any number of columns. Also the classes are kept.
test = data.table(a = c(1,1,3,4,5,6),
b = c("a", "be", "a", "c", "d", "c"),
c = rep(1, 6),
d = as.Date("2019-01-01"))
test2 = unique(test, by = "a")
for (col in setdiff(names(test2), "a")) test2[[col]] = test2[[col]][which.max(rowid(test2[[col]]))]

Replace values in vector where not %in% vector

Short question:
I can substitute certain variable values like this:
values <- c("a", "b", "a", "b", "c", "a", "b")
df <- data.frame(values)
What's the easiest way to replace all the values of df$values by "x" (where the value is neither "a" or "b")?
Output should be:
c("a", "b", "a", "b", "x", "a", "b")
Your example is a bit unclear and not reproducible.
However, based on guessing what you actually want, I could suggest trying this option using the data.table package:
df[values %in% c("a", "b"), values := "x"]
or the dplyr package:
df %>% mutate(values = ifelse(values %in% c("a","b"), x, values))
What about:
df[!df[, 1] %in% c("a", "b"), ] <- "x"
values
1 a
2 b
3 a
4 b
5 x
6 a
7 b

R convert tidy hierarchical data frame to hierarchical list

Converting this
g1 g2 desc val
A a 1 v1
A a 2 v2
A b 3 v3
To:
desc val
A
a
1 v1
2 v2
b
3 v3
I've converted a hierarchical data frame with two grouping levels into a structured list using a for loop. This displayed descriptions with an associated variable in a list interspersed with the group levels in order.
The purpose is to present the hierarchical data as a list so that it can be printed with formatting to distinguish the different grouping levels, using openxlsx.
Is there a more efficient base R, tidyverse or other approach to achieve this?
For loop code
tib <- tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
desc = 1:12,
val = paste0("v", 1:12))
# Number of rows in final table
n_rows <- length(unique(tib$g1)) + length(unique(paste0(tib$g1, tib$g2))) + nrow(tib)
# create empty output tibble
output <-
as_tibble(matrix(nrow = n_rows, ncol = 2)) %>%
rename(desc = V1, val = V2) %>%
mutate(desc = NA_character_,
val = NA_real_)
# loop counters
level_1 <- 0
level_2 <- 0
output_row <- 1
for(i in seq_len(nrow(tib))){
# level 1 headings
if(tib$g1[[i]] != level_1) {
output$desc[[output_row]] <- tib$g1[[i]]
output_row <- output_row + 1
}
# level 2 headings
if(paste0(tib$g1[[i]], tib$g2[[i]]) != paste0(level_1, level_2)) {
output$desc[[output_row]] <- tib$g2[[i]]
output_row <- output_row + 1
}
level_1 <- tib$g1[[i]]
level_2 <- tib$g2[[i]]
# Description and data
output$desc[[output_row]] <- tib$desc[[i]]
output$val[[output_row]] <- tib$val[[i]]
output_row <- output_row + 1
}
Using a few packages from the tidyverse, we could do:
library(tidyverse)
# or explicitly load what you need
library(purrr)
library(dplyr)
library(tidyr)
library(stringr)
transpose(df) %>%
unlist() %>%
stack() %>%
distinct(values, ind) %>%
mutate(detect_var = str_detect(values, "^v"),
ind = lead(case_when(detect_var == TRUE ~ values)),
values = case_when(detect_var == TRUE ~ NA_character_,
TRUE ~ values)) %>%
drop_na(values) %>%
select(values, ind) %>%
replace_na(list(ind = ""))
Returns:
values ind
1 A
2 a
3 1 v1
5 2 v2
7 b
8 3 v3
Using tib data set, my solution seems to be a little slower than Plamen's:
Unit: milliseconds
expr min lq mean median uq max neval
old 17.658398 18.492957 21.292965 19.396304 21.770249 133.215223 100
new_simple 6.742158 7.013732 7.638155 7.190095 7.759104 12.640237 100
new_fast 4.064907 4.266243 4.837131 4.507865 4.871533 9.442904 100
tidyverse 4.980664 5.326694 6.004602 5.552611 6.215129 9.923524 100
I believe you can simplify and slightly optimize your code like this :
library(dplyr)
library(tidyr)
library(microbenchmark)
microbenchmark(
old = {
tib <- tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
desc = 1:12,
val = paste0("v", 1:12))
# Number of rows in final table
n_rows <- length(unique(tib$g1)) + length(unique(paste0(tib$g1, tib$g2))) + nrow(tib)
# create empty output tibble
output <-
as_tibble(matrix(nrow = n_rows, ncol = 2)) %>%
rename(desc = V1, val = V2) %>%
mutate(desc = NA_character_,
val = NA_real_)
# loop counters
level_1 <- 0
level_2 <- 0
output_row <- 1
for(i in seq_len(nrow(tib))){
# level 1 headings
if(tib$g1[[i]] != level_1) {
output$desc[[output_row]] <- tib$g1[[i]]
output_row <- output_row + 1
}
# level 2 headings
if(paste0(tib$g1[[i]], tib$g2[[i]]) != paste0(level_1, level_2)) {
output$desc[[output_row]] <- tib$g2[[i]]
output_row <- output_row + 1
}
level_1 <- tib$g1[[i]]
level_2 <- tib$g2[[i]]
# Description and data
output$desc[[output_row]] <- tib$desc[[i]]
output$val[[output_row]] <- tib$val[[i]]
output_row <- output_row + 1
}
}
,
new_simple = {
tib <- tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
desc = 1:12,
val = paste0("v", 1:12)) %>%
unite('g1g2', g1, g2, remove = F)
tib_list <- split(tib, tib$g1g2)
convert_group <- function(sub_df){
tibble(
desc = c(sub_df$g1[1], sub_df$g2[2], sub_df$desc)
, val = c(NA, NA, sub_df$val)
)
}
res_df <- bind_rows(lapply(tib_list, convert_group))
}
,
new_fast = {
tib <- tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
desc = 1:12,
val = paste0("v", 1:12)) %>%
unite('g1g2', g1, g2, remove = F)
tib_list <- split(tib, tib$g1g2)
convert_desc <- function(sub_df){
c(sub_df$g1[1], sub_df$g2[2], sub_df$desc)
}
convert_val <- function(sub_df){ c(NA, NA, sub_df$val) }
res_df <- tibble(
desc = sapply(tib_list, convert_desc)
, val = sapply(tib_list, convert_val)
)
}
)
This gives me the following output:
Unit: milliseconds
expr min lq mean median uq max neval
old 41.06535 43.52606 49.42744 47.29305 52.74399 76.98021 100
new_simple 57.08038 60.65657 68.11021 63.38157 71.62398 112.24893 100
new_fast 24.16624 26.30785 31.07178 28.38764 31.91647 148.06442 100

How to combine two data frames by equal elements

I have two data frames containing the names of genetic elements. I want another data frame with the elements in common in both data frames.
Example:
data.a data.b
Column Column
1 a c
2 b e
3 c l
4 d a
I want this result:
data.c
Column
1 a
2 c
This is just an example. The data frame data.b has more elements than data.a.
The %in% operator lets you find which elements are in both.
data.c = data.frame(Column = data.a$Column[data.a$Column %in% data.b$Column])
data.c
Column
1 a
2 c
a <- data.frame(a = c("a","b","c","d"))
a
b <- data.frame(b = c("c","d","e","f"))
b
c <- data.frame(c = a[a$a %in% b$b,])
c
The merge function allows you control the type of join you want.
df1 <- data.frame(a = c("a", "b", "c", "d"))
df2 <- data.frame(a = c("c", "e", "l", "a"))
merge(x=df1, y=df2, by.x="a", by.y="a", all = FALSE)
library(dplyr)
data.a <- data_frame(a = c("a", "b", "c", "d"))
data.b <- data_frame(a = c("c", "e", "l", "a"))
data.c <- data.a %>% inner_join(data.b)

Resources