I need to be able to rename columns by name in a list of data frames that can all be expected to have the same column names.
For example:
[[1]]
col1 col2
1 1 2
2 2 3
[[2]]
col1 col2
1 1 2
2 2 3
Should become:
[[1]]
ID col2
1 1 2
2 2 3
[[2]]
ID col2
1 1 2
2 2 3
data:
col1 <- c(1,2)
col2 <- c(2,3)
myList <- list(data.frame(col1,col2),data.frame(col1,col2))
my attempt:
# NOTE: this is the non-working attempt that the question asks about.
# lapply() hands each data frame itself to the function as `x`, not its
# position in myList, so `myList[[x]]` indexes the list with a data frame.
lapply(myList, function(x){
# `x` is never renamed or returned; the last expression is an assignment,
# whose value is the right-hand side ("ID"), not a data frame.
names(myList[[x]])[names(myList[[x]]) =="col1"] <- "ID"
})
Where did I go wrong? I need base R.
You can use {dplyr} and {purrr} from the {tidyverse}
> library(purrr)
> library(dplyr)
>
> df1 <- data.frame(col1 = 1:2, col2 = 3:4)
> df2 <- data.frame(col1 = 3:2, col2 = 6:7)
>
> list(df1, df2) %>% map(~ rename(., ID = col1))
[[1]]
ID col2
1 1 3
2 2 4
[[2]]
ID col2
1 3 6
2 2 7
It also works with strings: list(df1, df2) %>% map(~ rename(., "ID" = "col1"))
Related
For example DF1 is:
Id1 Id2
1 10
2 10
3 7
4 7
5 10
And want DF2:
Id1 Id2
1 2
1 5
2 5
3 4
The data frame DF2 contains every pair of values from the Id1 column of DF1 that share a common value in the Id2 column of DF1.
My attempt:
temp <- do.call("cbind", split(DF1, rep(c(1,2), length.out = nrow(DF1))))
(DF2 <- temp %>% select("1.Id1", "2.Id2"))
But this does not generate a pairwise data frame:
Id1 Id2
1 2
3 4
Here is another tidyverse method using full_join.
library(dplyr)
library(purrr)
dat2 <- dat %>%
full_join(dat, by = "Id2") %>%
filter(Id1.x != Id1.y) %>%
mutate(Id_sort = map2_chr(Id1.x, Id1.y, ~paste(sort(c(.x, .y)), collapse = ", "))) %>%
distinct(Id_sort, .keep_all = TRUE) %>%
select(Id1 = Id1.x, Id2 = Id1.y)
dat2
# Id1 Id2
# 1 1 2
# 2 1 5
# 3 2 5
# 4 3 4
Data
dat <- read.table(text = "Id1 Id2
1 10
2 10
3 7
4 7
5 10",
header = TRUE)
You can split Id1 based on values in Id2 and create all possible combinations with combn and bind the results.
# Pair up the Id1 values that share an Id2 value, one pair per row.
# Groups holding a single Id1 are dropped first: they have no pairs, and
# combn(x, 2) would treat a lone positive number x as seq_len(x) and return
# combinations of 1..x (or fail) instead of returning nothing.
do.call(rbind, lapply(
  Filter(function(x) length(x) > 1, split(df$Id1, df$Id2)),
  function(x) t(combn(x, 2))
))
# [,1] [,2]
#[1,] 3 4
#[2,] 1 2
#[3,] 1 5
#[4,] 2 5
We can also use by(), which is shorter:
do.call(rbind, by(df$Id1, df$Id2, function(x) t(combn(x, 2))))
We can use tidyverse methods where we group by 'Id2', get the combn of 'Id1', unnest to wide format and rename the columns
library(dplyr)
library(tidyr)
library(stringr)
DF1 %>%
# // group by Id2
group_by(Id2) %>%
# // get the combinations in summarise
summarise(out = combn(Id1, 2, simplify = FALSE)) %>%
ungroup %>%
# // unnest to wide format
unnest_wider(c(out)) %>%
select(-Id2) %>%
rename_all(~ str_c("V", seq_along(.)))
# A tibble: 4 x 2
# V1 V2
# <int> <int>
#1 3 4
#2 1 2
#3 1 5
#4 2 5
data
DF1 <- structure(list(Id1 = 1:5, Id2 = c(10L, 10L, 7L, 7L, 10L)),
class = "data.frame", row.names = c(NA,
-5L))
It could be conceptualised as a network/graph issue too:
df1 <- data.frame(Id1 = 1:5, Id2 = c(10L, 10L, 7L, 7L, 10L))
library(igraph)
g <- graph.data.frame(df1)
g <- connect(g, 2)
g <- induced_subgraph(g, V(g) %in% df1$Id1)
as_edgelist(g)
# [,1] [,2]
#[1,] "1" "2"
#[2,] "1" "5"
#[3,] "2" "5"
#[4,] "3" "4"
I have a file named 'schema'. Based on this file, I need to rename the columns of other data frames. For example, 'Var1' of table A needs to be renamed to 'Col1'. Similarly, 'VarA' of table B needs to be renamed to 'ColA'. In short, every variable listed in the 'From' column of the schema needs to be renamed to the corresponding value in the 'To' column.
Schema <- read.table(header = TRUE, text =
'Tables From To
A Var1 Col1
A Var2 Col2
A Var3 Col3
B VarA ColA
B VarB ColB
B VarC ColC
')
A <- data.frame(Var1 = 1:3,
Var2 = 2:4,
Var3 = 3:5)
B <- data.frame(VarA = 1:3,
VarB = 2:4,
VarC = 3:5)
We could use match:
lapply(list(A = A, B = B), function(i){
setNames(i, Schema$To[ match(names(i), Schema$From) ])
})
# $A
# Col1 Col2 Col3
# 1 1 2 3
# 2 2 3 4
# 3 3 4 5
#
# $B
# ColA ColB ColC
# 1 1 2 3
# 2 2 3 4
# 3 3 4 5
Or:
Anew <- setNames(A, Schema$To[ match(names(A), Schema$From) ])
Bnew <- setNames(B, Schema$To[ match(names(B), Schema$From) ])
Or list2env:
list2env(lapply(list(A = A, B = B), function(i){
setNames(i, Schema$To[ match(names(i), Schema$From) ])
}), envir = globalenv())
Edit: When there is no match in Schema, keep the column name as is:
# Rename the columns of A and B according to Schema and write the renamed
# data frames into the global environment under the list names (A and B).
list2env(lapply(list(A = A, B = B), function(i){
# look up each column name in Schema$From; names without a match give NA
x <- as.character(Schema$To[ match(names(i), Schema$From) ])
ix <- which(is.na(x))
# no match in Schema: keep the original column name unchanged
x[ ix ] <- names(i)[ ix ]
# return with updated names
setNames(i, x)
}), envir = globalenv())
The following code retrieves the names of the tables (A and B) from Schema and does the name replacement:
# Rename the columns of every table named in Schema$Tables and collect the
# renamed data frames in a list named after the tables.
# The worker is passed to Map() directly; wrapping it in a second
# `function(v)` would make Map() return a list of functions instead.
r <- Map(function(v) {
  tbl <- get(v)
  # Use only this table's schema rows and look columns up by name, so the
  # result does not depend on the schema rows being in column order.
  rows <- Schema[Schema$Tables == v, ]
  idx <- match(names(tbl), rows$From)
  hit <- !is.na(idx)
  names(tbl)[hit] <- as.character(rows$To[idx[hit]])
  tbl
},
as.character(unique(Schema$Tables)))
which gives
> r
$A
Col1 Col2 Col3
1 1 2 3
2 2 3 4
3 3 4 5
$B
ColA ColB ColC
1 1 2 3
2 2 3 4
3 3 4 5
If you don't want result as list, you can do something like
# Rename the columns of every table named in Schema$Tables and write the
# renamed data frames back to the global environment under the same names.
list2env(Map(function(v) {
  tbl <- get(v)
  # Use only this table's schema rows and look columns up by name, so the
  # result does not depend on the schema rows being in column order.
  rows <- Schema[Schema$Tables == v, ]
  idx <- match(names(tbl), rows$From)
  hit <- !is.na(idx)
  names(tbl)[hit] <- as.character(rows$To[idx[hit]])
  tbl
},
as.character(unique(Schema$Tables))), envir = .GlobalEnv)
or
# Rename, in place, the columns of every table named in Schema$Tables.
for (v in as.character(unique(Schema$Tables))) {
  r <- get(v)
  # Use only this table's schema rows and look columns up by name, so the
  # result does not depend on the schema rows being in column order.
  in_table <- Schema$Tables == v
  idx <- match(names(r), Schema$From[in_table])
  hit <- !is.na(idx)
  names(r)[hit] <- as.character(Schema$To[in_table][idx[hit]])
  assign(v, r)
}
then you will keep your object A and B
> A
Col1 Col2 Col3
1 1 2 3
2 2 3 4
3 3 4 5
> B
ColA ColB ColC
1 1 2 3
2 2 3 4
3 3 4 5
lut <- setNames(as.character(Schema$To), Schema$From)
setNames(A, lut[names(A)])
Col1 Col2 Col3
1 1 2 3
2 2 3 4
3 3 4 5
setNames(B, lut[names(B)])
ColA ColB ColC
1 1 2 3
2 2 3 4
3 3 4 5
I have a user-defined function to replace text patterns. This seems to work on a data frame but not a tibble.
# Replace the first match of mypattern1 with mypattern2 in column mycol of
# mydf and return the modified copy.
# NOTE: `mydf[ ,mycol]` is a plain vector for a data.frame but a one-column
# tibble for a tibble, so sub() only receives a vector in the data.frame case.
fixcontents <- function(mydf, mypattern1, mypattern2, mycol) {
mydf[ ,mycol] <- sub(mypattern1, mypattern2, mydf[ ,mycol])
return(mydf)
}
mydf1 <- data.frame(col1 = c(1,2), col2 = c("aaa", "bbb"))
mytbl1 <- tibble(col1 = c(1,2), col2 = c("aaa", "bbb"))
fixcontents(mydf1, "(b{3})", "\\1X", 2) # works
mydf1
col1 col2
1 1 aaa
2 2 bbbX
fixcontents(mytbl1, "(b{3})", "\\1_", 2) # does not work (??)
mytbl1
# A tibble: 2 x 2
col1 col2
<dbl> <chr>
1 1 aaa
2 2 bbb
Why this behavior, and how do you manipulate data in a tibble?
Subsetting with [ works differently on tibbles: selecting a single column from a data frame returns a vector, whereas selecting it from a tibble returns a one-column tibble.
mydf1[, 2]
#[1] aaa bbb
#Levels: aaa bbb
mytbl1[, 2]
# A tibble: 2 x 1
# col2
# <chr>
#1 aaa
#2 bbb
Try using [[ to subset
#' Replace the first match of a pattern in one column.
#'
#' @param mydf A data frame or tibble.
#' @param mypattern1 Regular expression to search for.
#' @param mypattern2 Replacement text (may use back-references such as "\\1").
#' @param mycol Column name or position; extracted with `[[`, so the column
#'   is a plain vector for data frames and tibbles alike.
#' @return `mydf` with the column `mycol` replaced by the substituted values.
fixcontents <- function(mydf, mypattern1, mypattern2, mycol) {
  column_values <- mydf[[mycol]]
  mydf[[mycol]] <- sub(
    pattern = mypattern1,
    replacement = mypattern2,
    x = column_values
  )
  mydf
}
fixcontents(mydf1, "(b{3})", "\\1X", 2)
# col1 col2
#1 1 aaa
#2 2 bbbX
fixcontents(mytbl1, "(b{3})", "\\1_", 2)
# A tibble: 2 x 2
# col1 col2
# <dbl> <chr>
#1 1 aaa
#2 2 bbb_
I have below data frame
col1 <- c("A","B", "A")
col2 <- c("C","D","D")
col3 <- c("E","E","E")
col4 <- c("F","F","H")
x <- data.frame(col1,col2,col3,col4)
Output of above frame is:
  col1 col2 col3 col4
1    A    C    E    F
2    B    D    E    F
3    A    D    E    H
I want to replace characters to numbers, as below:
  col1 col2 col3 col4
1    1    3    5    6
2    2    4    5    6
3    1    4    5    7
Here's a one-liner in base R that works with any number of columns and any names - nothing is hard-coded, so it works with any x:
> setNames(data.frame(matrix(as.numeric(unlist(x)),ncol=ncol(x))),names(x))
col1 col2 col3 col4
1 1 3 5 6
2 2 4 5 6
3 1 4 5 7
x <- x %>%
unlist %>%
as.numeric %>%
matrix(ncol=4) %>%
data.frame
names(x) <- paste0("col", 1:4)
x
col1 col2 col3 col4
1 1 3 5 6
2 2 4 5 6
3 1 4 5 7
Here is a solution with base R:
x[] <- match(as.matrix(x), unique(c(as.matrix(x))))
# > x
# col1 col2 col3 col4
# 1 1 3 5 6
# 2 2 4 5 6
# 3 1 4 5 7
Here is a shorter solution:
x[] <- as.integer(unlist(x))
data:
# Example data. stringsAsFactors = TRUE is set explicitly because
# as.integer(unlist(x)) relies on factor codes; since R 4.0 the default is
# FALSE, and character columns would convert to NA instead.
x <- data.frame(
  col1 = c("A", "B", "A"),
  col2 = c("C", "D", "D"),
  col3 = c("E", "E", "E"),
  col4 = c("F", "F", "H"),
  stringsAsFactors = TRUE
)
We can use lapply from base R
x[] <- lapply(x, match, LETTERS)
x
# col1 col2 col3 col4
#1 1 3 5 6
#2 2 4 5 6
#3 1 4 5 8
I have three dataframes df1,df2,df3. I would like to identify the value(s) in col1 of df2 not present in col1 of df1 and/or col1 of df3.
df1 <- data.frame(col1=c('A','C','E'),col2=c(4,8,2))
df1
df2 <- data.frame(col1=c('A','B','C','E','G','I'),col2=c(4,8,2,6,1,9))
df2
df3 <- data.frame(col1=LETTERS[3:26],col2=sample(3:26))
df3
# Expected output
#2 B 8
What I have done?
table(df2$col1 %in% df1$col1)
# FALSE TRUE
# 3 3
df2[df2$col1 %in% df1$col1,]
# col1 col2
#1 A 4
#3 C 2
#4 E 6
df2[!df2$col1 %in% df1$col1,]
# col1 col2
#2 B 8
#5 G 1
#6 I 9
table(df2$col1 %in% df3$col1)
#FALSE TRUE
# 2 4
df2[df2$col1 %in% df3$col1,]
# col1 col2
#3 C 2
#4 E 6
#5 G 1
#6 I 9
df2[!df2$col1 %in% df3$col1,]
# col1 col2
#1 A 4
#2 B 8
In a wrong approach,
df2[!df2$col1[!df2$col1 %in% df1$col1] %in% df3$col1,]
# col1 col2
#1 A 4
#4 E 6
How to avoid the repetition of the indices?
Is there any better approach than the below?
df2[!df2$col1 %in% df1$col1,][!df2$col1[!df2$col1 %in% df1$col1] %in% df3$col1,]
# col1 col2
#2 B 8
While the correct approach,
df2[!(df2$col1 %in% df1$col1 | df2$col1 %in% df3$col1),]
# col1 col2
#2 B 8
We can use anti_join
library(dplyr)
bind_rows(df1, df3) %>%
anti_join(df2, ., by = "col1")
# col1 col2
#1 B 8