Coerce specific column to "double" within a dataframe list - r

Let's say I have a list of data frames:
myList <- list(df1 = data.frame(A = as.character(sample(10)), B = rep(1:2, 5)),
               df2 = data.frame(A = as.character(sample(10)), B = rep(1:2, 5)))
I want to coerce column A in each dataframe to double.
I'm trying:
myList = sapply(myList, simplify = FALSE, function(x){
  x$A <- as.double(x$A) })
But this returns just the coerced values, not the data frames with their column names.
I also tried with dplyr and mutate_if, but with no success.

We can use lapply with transform in base R
myList2 <- lapply(myList, transform, A = as.double(A))
Or use map with mutate from tidyverse
library(dplyr)
library(purrr)
myList2 <- map(myList, ~ .x %>%
  mutate(A = as.double(A)))
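If a newer dplyr (>= 1.0.0) is available, across() supersedes the scoped verbs such as the mutate_if the OP tried; a sketch of the same coercion with it:
myList2 <- map(myList, ~ .x %>%
  mutate(across(A, as.double)))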
The issue in the OP's code is that the function does not return the data, i.e. 'x'.
myList2 <- sapply(myList, simplify = FALSE,
                  function(x) {
                    x$A <- as.double(x$A)
                    x
                  })
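As a quick sanity check (my addition, not part of the original answer), the type of column A can be confirmed after any of these approaches:
sapply(myList2, function(x) typeof(x$A))
# expected: "double" for both df1 and df2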

Related

Altering the column names of dataframes in a list with lapply

I want to add a prefix to the colnames of dataframes in a list, based on the name of the dataframe:
list1 <- vector("list", 6)
list1 <- lapply(list1, function(x) data.frame(replicate(10,sample(0:1,10,rep=TRUE))))
list1 <- lapply(list1, function(x) {colnames(x)<- letters[1:10];x})
names(list1) <- LETTERS[1:6]
At this point, I want to change the colnames with something like:
colnames(list1[[1]]) <- paste(names(list1[1]), colnames(list1[[1]]), sep=".")
colnames(list1[[1]])
[1] "A.a" "A.b" "A.c" "A.d" "A.e" "A.f" "A.g" "A.h" "A.i" "A.j"
How can I do this sequentially on each of the six data frames? I tried
lapply(list1, function(x) {colnames(list1[x]) <- paste(names(list1[x]), colnames(list1[[x]]), sep=".");x })
but
Error in list1[x] : invalid subscript type 'list'
I guess it's about cycling through the names of list1? Or how is it done?
Thanks for your help.
Using base R, you can use Map.
Map(function(x, y) setNames(x, paste(y, names(x), sep = ".")), list1, names(list1))
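Assigning the result (to a new name, list1_new, used here just for illustration) and checking the first element against the expected output in the question:
list1_new <- Map(function(x, y) setNames(x, paste(y, names(x), sep = ".")), list1, names(list1))
names(list1_new[[1]])
# [1] "A.a" "A.b" "A.c" "A.d" "A.e" "A.f" "A.g" "A.h" "A.i" "A.j"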
Here is another way using purrr. The i-prefixed functions (e.g., imap, imodify) pass the name of each list element in as .y along with the element's contents as .x.
library(purrr)
imodify(list1, ~ set_names(.x, paste(.y, names(.x), sep = ".")))
Another way, if you have your data frames already prepared and want to prefix the columns with each data frame's name:
## example dataframes
df1 <- data.frame(col1 = runif(1))
df2 <- data.frame(col1 = runif(1))
library(purrr)
dataframes_to_change = c('df1','df2')
walk(dataframes_to_change, ~ {
  df <- get(.x)
  names(df) <- paste(.x, names(df), sep = '_')
  assign(.x, value = df, pos = 1)
})
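After the walk() call, the data frames in the global environment carry the prefix; a quick check (my addition):
names(df1)
# [1] "df1_col1"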
Modifying attributes might be faster.
Map(`attr<-`, list1, 'names', Map(paste, names(list1), lapply(list1, attr, 'names'), sep='.'))
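To back the speed claim up, a rough comparison could look like this (a sketch; it assumes the microbenchmark package is installed, and on six small data frames any difference will be negligible):
library(microbenchmark)
microbenchmark(
  attr_way     = Map(`attr<-`, list1, 'names',
                     Map(paste, names(list1), lapply(list1, attr, 'names'), sep = '.')),
  setnames_way = Map(function(x, y) setNames(x, paste(y, names(x), sep = ".")),
                     list1, names(list1))
)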

Change columns in several dfs depending on name of df in a loop (R)

I want to change the column names in several dfs in a loop, where the new column names depend on the name of the df:
library(dplyr)
# A simple example of my data:
dataframe_AA <- data.frame(
var1 = sample(1:3),
var2 = sample(1:3),
var3 = sample(1:3))
dataframe_AB <- data.frame(
var1 = sample(1:3),
var2 = sample(1:3),
var3 = sample(1:3))
dfs <- list(dataframe_AA, dataframe_AB)
combinations <- c("AA", "AB")
oldnames = c("var1", "var2", "var3")
for (df in seq_along(dfs)) {
  for (combi in combinations) {
    newnames = paste(oldnames, substr = substring(combi,1,2), sep = "_")
    df <- df %>% rename_at(vars(oldnames), ~newnames)
  }
}
#Expected outcome:
names(dataframe_AA)
[1] "var1_AA" "var2_AA" "var3_AA"
names(dataframe_AB)
[1] "var1_AB" "var2_AB" "var3_AB"
newnames is successfully created inside the loop, but then I receive the error message: Error in UseMethod("tbl_vars") :
no applicable method for 'tbl_vars' applied to an object of class "c('integer', 'numeric')"
Doing everything outside a loop works, however:
df = dataframe_AA
combi = "AA"
newnames = paste(oldnames, substr = substring(combi,1,2), sep = "_")
df <- df %>% rename_at(vars(oldnames), ~newnames)
print(names(df))
[1] "var1_AA" "var2_AA" "var3_AA"
I don't understand what I'm doing wrong here and how I can make it work in a loop.
We can use mget to collect the data frames in a list and then paste the new variable names based on the name of each data frame.
library(dplyr)
library(purrr)
temp <- imap(mget(ls(pattern = "dataframe_.*")), function(x, y)
  x %>% rename_at(vars(oldnames), ~ paste0(., sub(".*_", "_", y))))
temp is a list of data frames with the changed names; to get the individual data frames back into the global environment, we can use list2env.
list2env(temp, .GlobalEnv)
names(dataframe_AA)
#[1] "var1_AA" "var2_AA" "var3_AA"
names(dataframe_AB)
#[1] "var1_AB" "var2_AB" "var3_AB"

Replacing values of a dataframe column with a vector

I want to simplify my current method of replacing column values of a data frame with a vector. I've provided a reproducible example below with my base R solution. The simplified version contains just one data frame; with multiple data frames, I'm forced to use a for loop because of my clumsy approach.
How can I simplify my approach?
# Simplified version
Df <- data.frame(a = c(1,2,3),
                 b = c(4,5,6),
                 c = c(7,8,9))
l <- list(c(11,22,33),
          c(44,55,66))
letters <- c("a","b")
Df[letters] <- l
# Multiple data frames
Df1 <- list(data.frame(a = c(1,2,3),
                       b = c(4,5,6),
                       c = c(7,8,9)),
            data.frame(a = c(101,102,103),
                       b = c(104,105,106),
                       c = c(107,108,109)))
l <- list( list(c(11,22,33), c(44,55,66)),
           list(c(111,222,333), c(444,555,666)) )
letters <- c("a","b")
for(i in 1:length(Df1)){
  Df1[[i]][letters] <- l[[i]]
}
Here is an option with map2
library(purrr)
library(dplyr)
map2(Df1, l, ~ {.x[letters] <- .y; .x})
Or with inset from magrittr
library(magrittr)
map2(Df1, l, ~ inset(.x, letters, value = .y))
Or in a chain
map2(Df1, l, ~ .x %>%
  select(-one_of(letters)) %>%
  bind_cols(.y %>%
              set_names(letters)) %>%
  select(names(.x)))
Or in base R
Map(function(x, y) {x[letters] <- y;x}, Df1, l)
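For what it's worth, the purrr and base results should agree; a quick check I'm adding (it assumes the example data and the snippets above have been run):
identical(map2(Df1, l, ~ {.x[letters] <- .y; .x}),
          Map(function(x, y) {x[letters] <- y; x}, Df1, l))
# expected: TRUE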

Renaming Several Columns in Data Frames Stored in a List Simultaneously

I have the following list, which contains several dataframes that all have the same column names:
my_list <- list(df1 = data.frame(A = c(1:3), B = c(4:6), C = c(7:9)),
                df2 = data.frame(A = c(1:4), B = c(5:8), C = c(9:12)),
                df3 = data.frame(A = c(1:5), B = c(6:10), C = c(11:15)))
Is there an efficient way to rename all of the column As in each data frame in the list simultaneously using base R functions?
I was thinking that something like
names(lapply(my_list, `[[`, "A")) <- "new_name"
may work, but I think I'm off track - the lapply function returns an object that might not work for what I'm trying to do.
Thanks!
A few more base options:
# rename first column name
lapply(my_list, function(x) setNames(x, replace(names(x), 1, "new_name_for_A")))
# rename column named "A"
lapply(my_list, function(x) setNames(x, replace(names(x), names(x) == "A", "new_name_for_A")))
# lowly for loop
for (i in seq_along(my_list)) {
  names(my_list[[i]])[names(my_list[[i]]) == "A"] = "new_name_for_A"
}
We can use map to loop over the list and rename the column named 'A' to 'new_name' with rename_at
library(purrr)
library(dplyr)
map(my_list, ~ .x %>%
  rename_at(vars("A"), ~ "new_name"))
Or with base R, making use of an anonymous function call
lapply(my_list, function(x) {names(x)[names(x) == "A"] <- "new_name"; x})
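Whichever variant is used, a quick way to confirm the new names (a check I'm adding, not part of the original answers):
lapply(lapply(my_list, function(x) {names(x)[names(x) == "A"] <- "new_name"; x}), names)
# each element should now read: "new_name" "B" "C"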
How about
new.names = c('New', 'B', 'C')
lapply(my_list, `names<-`, new.names)
For the added example in your edit, you would simply change this to
new.names = sub('B', 'New', names(my_list[[1]]))

Joining data frames without returning all matching combinations

I have a list of data.frames (in this example only 2):
set.seed(1)
df1 <- data.frame(id = sample(LETTERS,50,replace=T), val = rnorm(50), val1 = rnorm(50), stringsAsFactors = F)
df2 <- data.frame(id = sample(LETTERS,30,replace=T), val = rnorm(30), val2 = rnorm(30), stringsAsFactors = F)
df.list <- list(df1,df2)
I want to join them into a single data.frame only by a subset of the shared column names, in this case by id.
If I use:
library(dplyr)
df <- df.list %>% purrr::reduce(dplyr::inner_join,by="id")
The shared column names, which I'm not joining by, come back with the .x and .y suffixes:
id val.x val1 val.y val2
1 G -0.05612874 0.2914462 2.087167 0.7876396
2 G -0.05612874 0.2914462 -0.255027 1.4411577
3 J -0.15579551 -0.4432919 -1.286301 1.0273924
In reality, for the shared column names that I'm not joining by, it's good enough to take them from a single data.frame in the list, whichever one they exist in with respect to the joined id.
I don't know these shared column names in advance, but they're not difficult to find out:
E.g.:
df.list.colnames <- unlist(lapply(df.list,function(l) colnames(l %>% dplyr::select(-id))))
df.list.colnames <- table(df.list.colnames)
repeating.colnames <- names(df.list.colnames)[which(df.list.colnames > 1)]
Which will then allow me to separate them from the data.frames in the list:
repeating.colnames.df <- do.call(rbind,lapply(df.list,function(r) r %>% dplyr::select_(.dots = c("id",repeating.colnames)))) %>%
unique()
I can then exclude these columns from the data.frames in the list and join them as above:
for(r in 1:length(df.list)) df.list[[r]] <- df.list[[r]] %>% dplyr::select_(.dots = paste0("-",repeating.colnames))
df <- df.list %>% purrr::reduce(dplyr::inner_join,by="id")
And now I'm left with adding the repeating.colnames.df to that. I don't know of any join in dplyr that won't return all combinations between df and repeating.colnames.df, so it seems that all I can do is apply over each df$id, pick the first match in repeating.colnames.df, and join the result with df.
Is there anything less cumbersome for this situation?
If I followed correctly, I think you can handle this by writing a custom function to pass into reduce that identifies the common column names (excluding your joining columns) and excludes those columns from the "second" table in the merge. As reduce works through the list, the function will "accumulate" the unique columns, defaulting to the columns in the "left-most" table.
Something like this:
library(dplyr)
library(purrr)
set.seed(1)
df1 <- data.frame(id = sample(LETTERS,50,replace=T), val = rnorm(50), val1 = rnorm(50), stringsAsFactors = F)
df2 <- data.frame(id = sample(LETTERS,30,replace=T), val = rnorm(30), val2 = rnorm(30), stringsAsFactors = F)
df.list <- list(df1,df2)
fun <- function(df1, df2, by_col = "id"){
  df1_names <- names(df1)
  df2_names <- names(df2)
  dup_cols <- intersect(df1_names[!df1_names %in% by_col],
                        df2_names[!df2_names %in% by_col])
  out <- dplyr::inner_join(df1, df2[, !(df2_names %in% dup_cols)], by = by_col)
  return(out)
}
df_chase <- df.list %>% reduce(fun,by_col="id")
Created on 2019-01-15 by the reprex package (v0.2.1)
If I compare df_chase to your final solution, I get the same answer:
> all.equal(df_chase, df_orig)
[1] TRUE
You can just get rid of the duplicate columns from one of the data frames if you say you don't really care about them and simply use base::merge:
set.seed(1)
df1 <- data.frame(id = sample(LETTERS,50,replace=T), val = rnorm(50), val1 = rnorm(50), stringsAsFactors = F)
df2 <- data.frame(id = sample(LETTERS,30,replace=T), val = rnorm(30), val2 = rnorm(30), stringsAsFactors = F)
duplicates = names(df1) == names(df2) & names(df1) != "id"
df2 = df2[, !duplicates]
df12 = base::merge.data.frame(df1, df2, by = "id")
head(df12)
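Since merge sorts the result by the by column while inner_join keeps the left table's row order, a rough way to confirm that this matches df_chase from the answer above is to sort both before comparing (a quick check I'm adding; it assumes both df12 and df_chase were computed from the same seeded data):
all.equal(df_chase[with(df_chase, order(id, val, val1, val2)), ],
          df12[with(df12, order(id, val, val1, val2)), ],
          check.attributes = FALSE)
# expected: TRUE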
