How can I create a data frame that lists the column names of every data frame (df) in the global environment?
For example, suppose these three data frames are the only objects in the global environment:
chocolate <- data.frame(a = 1, b = 2, c = 3)
banana <- data.frame(a = 2, d = 4, c = 3)
pear <- data.frame(d = 1, e = 4)
Desired output
output <- data.frame(id = c("chocolate","banana", "pear"),
v2 = c("a", "a", NA),
v3 = c("b", NA, NA),
v4 = c("c", "c", NA),
v5 = c(NA, "d", "d"),
v6 = c(NA, NA, "e"))
output
We can try
library(data.table)
# Fetch the example data frames by their actual names. Looking up df1..df3
# (as paste0("df", 1:3) would) fails here because the example objects are
# called chocolate, banana and pear.
lst <- mget(c("chocolate", "banana", "pear"))
# Replace the single row of every frame with that frame's own column names,
# and name the list elements 1, 2, 3 so rbindlist() can use them as the id.
name_rows <- lapply(setNames(lst, seq_along(lst)), function(x) {
  x[] <- names(x)
  x
})
# fill = TRUE aligns the rows on the union of column names (absent -> NA);
# the five name columns (positions 2 to 6) are then relabelled V1..V5.
setnames(rbindlist(name_rows, fill = TRUE, idcol = 'id'), 2:6, paste0("V", 1:5))[]
# id V1 V2 V3 V4 V5
#1: 1 a b c NA NA
#2: 2 a NA c d NA
#3: 3 NA NA NA d e
Related
I have a data frame that looks like this.
name = c("p1","p2","p3","p4")
place = c("f","g","h","i")
v1 = c("x", "NA", "NA", "NA")
v2 = c("NA", "y", "y", "NA")
v3 = c("NA", "NA", "z", "NA")
region = c("n","w","s","e")
grade = c("f1","f2","f3","f4")
df = data.frame(name, place, v1, v2, v3, region, grade)
name place v1 v2 v3 region grade
1 p1 f x NA NA n f1
2 p2 g NA y NA w f2
3 p3 h NA y z s f3
4 p4 i NA NA NA e f4
I would like to add a new character column v4 that concatenates the non-missing characters from columns v1, v2 and v3.
name place v1 v2 v3 v4 region grade
1 p1 f x NA NA x n f1
2 p2 g NA y NA y w f2
3 p3 h NA y z yz s f3
4 p4 i NA NA NA NA e f4
many thanks
We can use paste after converting the columns to character
# Build V4 by concatenating columns 3 to 5 (v1, v2, v3) row by row.
# Missing entries -- a real NA or the literal string "NA" -- are blanked
# cell by cell before pasting, so a genuine value that merely contains the
# letters "NA" is kept intact instead of being stripped by a blanket gsub().
parts <- lapply(df[3:5], function(x) {
  x <- as.character(x)
  x[is.na(x) | x == "NA"] <- ""
  x
})
df$V4 <- do.call(paste0, unname(parts))
# Rows in which every source column was missing become NA rather than "".
df$V4[df$V4 == ""] <- NA
df$V4
#[1] "x" "y" "yz" NA
data
df <- structure(list(name = c("p1", "p2", "p3", "p4"), place = c("f",
"g", "h", "i"), v1 = c("x", NA, NA, NA), v2 = c(NA, "y", "y",
NA), v3 = c(NA, NA, "z", NA), region = c("n", "w", "s", "e"),
grade = c("f1", "f2", "f3", "f4")), .Names = c("name", "place",
"v1", "v2", "v3", "region", "grade"), class = "data.frame",
row.names = c("1", "2", "3", "4"))
The dplyr alternative:
# install.packages("dplyr")  # run once, interactively, if dplyr is missing
library(dplyr)
# Blank out a missing entry (a real NA or the literal string "NA") cell by
# cell, so values that merely contain the letters "NA" are not corrupted.
blank_missing <- function(x) {
  x <- as.character(x)
  ifelse(is.na(x) | x == "NA", "", x)
}
df <- df %>%
  mutate( v4 = paste0( blank_missing(v1), blank_missing(v2), blank_missing(v3) ) ) %>%
  mutate( v4 = ifelse( v4 == "", NA, v4 ) )
This works whether the missing entries are real NA values (not-a-value) or the literal string "NA" (character). If you don't care whether v4 contains "" or NA, you can leave off the last line (and delete the last pipe).
Let's say we have two data frames:
df1 <- data.frame(A = letters[1:3], B = letters[4:6], C = letters[7:9], stringsAsFactors = FALSE)
A B C
1 a d g
2 b e h
3 c f i
df2 <- data.frame(V1 = 1:3, V2 = 4:6, V3 = 7:9)
V1 V2 V3
1 1 4 7
2 2 5 8
3 3 6 9
I need to build a function that takes as input a single value or a vector containing elements from one of the data frames and returns the elements from the other data frame according to their positional indexes.
The function should work like this:
> matchdf(values = c("a", "e", "i"), dfin = df1, dfout = df2)
[1] 1 5 9
> matchdf(values = c(1, 5, 9), dfin = df2, dfout = df1)
[1] "a" "e" "i"
> matchdf(values = c(1, 1, 1), dfin = df2, dfout = df1)
[1] "a" "a" "a"
This is what I have tried so far:
library(dplyr)  # provides %>%; `requiere(dplyr)` was a typo and is not a function
# Flatten a data frame into a plain vector, column by column.
toVec <- function(df) df %>% as.matrix %>% as.vector
# First attempt: keep the cells of dfout whose positions hold a cell of dfin
# that occurs in values. %in% only yields a membership mask, so the result
# follows the cell order of dfin and a repeated value produces a single hit.
matchdf <- function(values, dfin, dfout) toVec(dfout)[toVec(dfin) %in% values]
# But sometimes the output values aren't in correct order:
> matchdf(c("c", "i", "h"), dt1, dt2)
[1] 3 8 9
# should output 3 9 8
> matchdf(values = c("a", "a", "a"), dfin = dt1, dfout = dt2)
[1] 1
# Should output 1 1 1
Feel free to use data.table or/and dplyr if it eases the task. I would prefer a solution without for loops.
Assumptions:
elements from df1 are different from df2
dim(df1) = dim(df2)
#' Look up cells of one data frame by the position of values in another
#'
#' For each element of `values`, finds every cell of `dfin` equal to it and
#' returns the cells of `dfout` at the same positions. Results follow the
#' order of `values`, repeats included; values absent from `dfin` contribute
#' nothing.
#'
#' @param values Vector of values to look up in `dfin`.
#' @param dfin Data frame that is searched for `values`.
#' @param dfout Data frame with the same dimensions as `dfin` that supplies
#'   the results.
#' @return A plain vector of the matched `dfout` cells (`NULL` when `values`
#'   is empty).
matchdf <- function(values, dfin, dfout) {
  # Indexing a data frame with a logical matrix goes through as.matrix()
  # anyway, so coerce once up front instead of once per value.
  out_cells <- as.matrix(dfout)
  hits <- lapply(values, function(val) {
    # which() discards NA comparisons, so missing cells in dfin never match.
    out_cells[which(dfin == val)]
  })
  # lapply + unlist always flattens; sapply would hand back a matrix whenever
  # every value matched the same number (> 1) of cells.
  unlist(hits, use.names = FALSE)
}
matchdf(c("c", "i", "h"), df1, df2)
#should output 3 9 8
[1] 3 9 8
matchdf(values = c("a", "a", "a"), dfin = df1, dfout = df2)
#should output 1 1 1
[1] 1 1 1
matchdf(values = c("X", "Y", "a"), dfin = df1, dfout = df2)
#should output vector, not list
[1] 1
I have a data.frame:
mydata = data.frame(v1 = c("A", "A", "A", "B", "B", "C", "D"),
v2 = c("XY", "XY", "ZZ", "BB", "ZZ", NA, "ZZ"),
v3 = 5)
And I would like to encode each of the characters in the data frame to integers corresponding to each of the levels. I also want to "ignore" NA values. The expected output would be equal to:
output = data.frame(v1 = c(1, 1, 1, 2, 2, 3, 4),
v2 = c(1, 1, 2, 3, 2, NA, 2),
v3 = 5)
My hope is to write a function that accepts a data.frame object AND a list specifying the columns on which I want to perform the operation, something like:
my_function = function(df, vars){
...
}
EDIT: in the example above, "vars" would be = c("v1", "v2")
Any suggestions for how to approach this? I'm open to using packages such as dplyr to help.
Thanks,
D
We can convert to factor and then coerce to numeric
# Recode the first two columns as numeric codes numbered in order of first
# appearance. factor() leaves NA out of the levels, so NA stays NA.
mydata[1:2] <- lapply(mydata[1:2], function(column) {
  seen <- unique(column)
  as.numeric(factor(column, levels = seen))
})
This can be converted to a function
#' Encode selected columns as numeric level codes
#'
#' Each column named in `vars` is replaced by numeric codes numbered in order
#' of first appearance of its values; `NA` entries stay `NA`. Other columns
#' are returned unchanged.
#'
#' @param df A data frame.
#' @param vars Columns of `df` to encode (anything accepted by `df[vars]`).
#' @return `df` with the selected columns recoded.
myfunction <- function(df, vars) {
  to_codes <- function(column) {
    as.numeric(factor(column, levels = unique(column)))
  }
  encoded <- lapply(df[vars], to_codes)
  df[vars] <- encoded
  df
}
myfunction(mydata, c('v1', 'v2'))
# v1 v2 v3
#1 1 1 5
#2 1 1 5
#3 1 2 5
#4 2 3 5
#5 2 2 5
#6 3 NA 5
#7 4 2 5
If we need it to be further generalized, we may need to check the column classes i.e. whether it is a numeric column and if not, then change to factor with levels specified and coerce to numeric.
# Recode every non-numeric column as numeric codes in order of first
# appearance; numeric columns pass through untouched.
mydata[] <- lapply(mydata, function(column) {
  if (is.numeric(column)) {
    return(column)
  }
  as.numeric(factor(column, levels = unique(column)))
})
I want to left_join multiple data frames:
dfs <- list(
df1 = data.frame(a = 1:3, b = c("a", "b", "c")),
df2 = data.frame(c = 4:6, b = c("a", "c", "d")),
df3 = data.frame(d = 7:9, b = c("b", "c", "e"))
)
Reduce(left_join, dfs)
# a b c d
# 1 1 a 4 NA
# 2 2 b NA 7
# 3 3 c 5 8
This works because they all have the same b column, but Reduce doesn't let me specify additional arguments that I can pass to left_join. Is there a work around for something like this?
dfs <- list(
df1 = data.frame(a = 1:3, b = c("a", "b", "c")),
df2 = data.frame(c = 4:6, d = c("a", "c", "d")),
df3 = data.frame(d = 7:9, b = c("b", "c", "e"))
)
Update
This kind of works: Reduce(function(...) left_join(..., by = c("b" = "d")), dfs) but when by is more than one element it gives this error: Error: cannot join on columns 'b' x 'd': index out of bounds
I know this answer comes late; I only discovered the unanswered questions section today. Sorry to bother.
Using left_join()
dfs <- list(
df1 = data.frame(b = c("a", "b", "c"), a = 1:3),
df2 = data.frame(d = c("a", "c", "d"), c = 4:6),
df3 = data.frame(b = c("b", "c", "e"), d = 7:9)
)
# Left-join the two tables passed in, matching the first column of the first
# table against the first column of the second. Written with `...` so it can
# be handed straight to Reduce().
func <- function(...) {
  tables <- list(...)
  key_left <- colnames(tables[[1]])[1]
  key_right <- colnames(tables[[2]])[1]
  # Named vector c(<left key> = <right key>) tells left_join which columns pair up.
  left_join(..., by = setNames(key_right, key_left))
}
Reduce( func, dfs)
# b a c d
#1 a 1 4 NA
#2 b 2 NA 7
#3 c 3 5 8
Using merge() :
# Left-merge the two tables passed in, matching the first column of the first
# table against the first column of the second. Written with `...` so it can
# be handed straight to Reduce(). Returns the merged data frame, keeping every
# row of the first table (all.x = TRUE).
func <- function(...) {
  tables <- list(...)  # capture once instead of rebuilding the list per lookup
  key_x <- colnames(tables[[1]])[1]
  key_y <- colnames(tables[[2]])[1]
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  merge(..., by.x = key_x, by.y = key_y, all.x = TRUE)
}
Reduce( func, dfs)
# b a c d
#1 a 1 4 NA
#2 b 2 NA 7
#3 c 3 5 8
Would this work for you?
# Join on keys that exist on both sides of each step (assuming df1, df2, df3
# are the tables from the question: df1 has a, b; df2 has c, d; df3 has d, b).
# df1$b pairs with df2$d; the result (a, b, c) then pairs with df3 on b.
jnd.tbl <- df1 %>%
  left_join(df2, by = c("b" = "d")) %>%
  left_join(df3, by = "b")
Yet another solution:
library(purrr)
library(dplyr)
# Three example tables that all share the key column b.
dfs <- list(
  df1 = data.frame(a = 1:3, b = c("a", "b", "c")),
  df2 = data.frame(c = 4:6, b = c("a", "c", "d")),
  df3 = data.frame(d = 7:9, b = c("b", "c", "e"))
)
# Fold the list left to right; `by` is forwarded to every left_join() call.
purrr::reduce(.x = dfs, .f = dplyr::left_join, by = "b")
I have the following data.frames:
a <- data.frame(id = 1:3, v1 = c('a', NA, NA), v2 = c(NA, 'b', 'c'))
b <- data.frame(id = 1:3, v1 = c(NA, 'B', 'C'), v2 = c("A", NA, NA))
> a
id v1 v2
1 1 a <NA>
2 2 <NA> b
3 3 <NA> c
> b
id v1 v2
1 1 <NA> A
2 2 B <NA>
3 3 C <NA>
note: There are no ids for which v1 or v2 are defined in both tables; there is only a single unique non-NA value in each column for each id value
I would like to merge these data frames on matching values of "id":
ab <- merge(a, b, by = "id")
but I would also like to combine the two columns v1 and v2, so that the data.frame ab will look like this:
ab <- data.frame(id = 1:3, v1 = c("a", "B", "C"), v2 = c("A", "b", "c"))
> ab
id v1 v2
1 1 a A
2 2 B b
3 3 C c
instead, I get this:
> merge(a, b, by = "id")
id v1.x v2.x v1.y v2.y
1 1 a <NA> <NA> A
2 2 <NA> b B <NA>
3 3 <NA> c C <NA>
it would be helpful to have examples using both data.frame and data.table, so here are the data.table versions of above:
A <- data.table(a, key = 'id')
B <- data.table(b, key = 'id')
A[B]
The type of merge you specify probably won't be possible using merge (with data frames), although saying that usually invites being proved wrong.
You also omit some details: will there always be a single unique non-NA value in each column for each id value? If so, this will work:
ab <- rbind(a,b)
> colFun <- function(x){x[which(!is.na(x))]}
> ddply(ab,.(id),function(x){colwise(colFun)(x)})
id v1 v2
1 1 a A
2 2 B b
3 3 C c
A similar strategy should work with data.tables as well:
abDT <- data.table(ab,key = "id")
> abDT[,list(colFun(v1),colFun(v2)),by = id]
id V1 V2
[1,] 1 a A
[2,] 2 B b
[3,] 3 C c
If your data is as simple as it is above, joran's answer is likely the simplest way. Here's my approach in base R:
a <- data.frame(id = 1:3, v1 = c("a", NA, NA), v2 = c(NA, "b", "c"))
b <- data.frame(id = 1:3, v1 = c(NA, "B", "C"), v2 = c("A", NA, NA))
# Element-wise pick: take x where it is present, otherwise fall back to y.
# Both inputs are compared as character and the result is returned as a factor.
decider <- function(x, y) {
  from_x <- as.character(x)
  from_y <- as.character(y)
  factor(ifelse(is.na(x), from_y, from_x))
}
# Apply decider() to each pair of same-position columns of a and b.
data.frame(mapply(FUN = decider, a, b))
If your data has different ids (some overlap and some do not), then here's a different approach:
a <- data.frame(id = c(1,2,4,5), v1 = c('a', NA, "q", NA), v2 = c(NA, 'b', 'c', "e"))
b <- data.frame(id = 1:4, v1 = c(NA, "A", "C", 'B'), v2 = c("A", NA, "D", NA))
# Element-wise pick: take x where it is present, otherwise fall back to y.
decider <- function(x, y) factor(ifelse(is.na(x), as.character(y), as.character(x)))
# Align the two tables by id first (full outer join). Pairing the columns of
# a and b by row position would combine unrelated ids, because row 3 of a is
# id 4 while row 3 of b is id 3.
DF <- merge(a, b, by = "id", all = TRUE, suffixes = c(".a", ".b"))
# For every id prefer the value from a and fall back to b; ids present in
# only one table keep whatever that table holds.
DF2 <- data.frame(id = DF$id,
                  v1 = decider(DF$v1.a, DF$v1.b),
                  v2 = decider(DF$v2.a, DF$v2.b))
DF2 <- DF2[order(DF2$id), ]
rownames(DF2) <- NULL