Column names into data frame - r

How can I create a data frame that contains the column names of every data frame (df) in the global environment?
Example: suppose these three data frames are the only objects in the global environment.
chocolate <- data.frame(a = 1, b = 2, c = 3)
banana <- data.frame(a = 2, d = 4, c = 3)
pear <- data.frame(d = 1, e = 4)
Desired output
output <- data.frame(id = c("chocolate","banana", "pear"),
v2 = c("a", "a", NA),
v3 = c("b", NA, NA),
v4 = c("c", "c", NA),
v5 = c(NA, "d", "d"),
v6 = c(NA, NA, "e"))
output

We can try
library(data.table)
# Fetch the data frames by the names used in the question. Keeping the list
# names lets rbindlist() use them as the `id` values instead of 1, 2, 3.
lst <- mget(c("chocolate", "banana", "pear"))
# Overwrite every cell with its own column name (each input here has a single
# row), stack the results with fill = TRUE so columns a data frame lacks
# become NA, then rename the five value columns to V1..V5.
setnames(rbindlist(lapply(lst, function(x) {
  x[] <- names(x)
  x
}), fill = TRUE, idcol = "id"), 2:6, paste0("V", 1:5))[]
# id V1 V2 V3 V4 V5
#1: chocolate a b c NA NA
#2: banana a NA c d NA
#3: pear NA NA NA d e

Related

Collapse columns into a new variable

I have a data frame that looks like this.
name = c("p1","p2","p3","p4")
place = c("f","g","h","i")
v1 = c("x", "NA", "NA", "NA")
v2 = c("NA", "y", "y", "NA")
v3 = c("NA", "NA", "z", "NA")
region = c("n","w","s","e")
grade = c("f1","f2","f3","f4")
df = data.frame(name, place, v1, v2, v3, region, grade)
name place v1 v2 v3 region grade
1 p1 f x NA NA n f1
2 p2 g NA y NA w f2
3 p3 h NA y z s f3
4 p4 i NA NA NA e f4
I would like to add a new character column v4 that concatenates the non-NA characters found in columns v1, v2 and v3.
name place v1 v2 v3 v4 region grade
1 p1 f x NA NA x n f1
2 p2 g NA y NA y w f2
3 p3 h NA y z yz s f3
4 p4 i NA NA NA NA e f4
many thanks
We can use paste after converting the columns to character
# Blank out missing cells (a real NA or the literal string "NA") column by
# column *before* concatenating. Stripping "NA" from the pasted result instead
# would also eat those letters out of genuine values such as "NASA".
parts <- lapply(df[3:5], function(col) {
  col <- as.character(col)
  col[is.na(col) | col == "NA"] <- ""
  col
})
df$V4 <- do.call(paste0, unname(parts))
# Rows with nothing in any of the three columns end up as NA, not "".
df$V4[df$V4 == ""] <- NA
df$V4
#[1] "x" "y" "yz" NA
data
df <- structure(list(name = c("p1", "p2", "p3", "p4"), place = c("f",
"g", "h", "i"), v1 = c("x", NA, NA, NA), v2 = c(NA, "y", "y",
NA), v3 = c(NA, NA, "z", NA), region = c("n", "w", "s", "e"),
grade = c("f1", "f2", "f3", "f4")), .Names = c("name", "place",
"v1", "v2", "v3", "region", "grade"), class = "data.frame",
row.names = c("1", "2", "3", "4"))
The dplyr alternative:
install.packages("dplyr")
library(dplyr)
df <- df %>%
mutate( v4 = gsub( "NA", "", paste0(v1,v2,v3) ) ) %>%
mutate( v4 = ifelse( v4 == "", NA, v4 ) )
This works whether the missing entries are true NA values or the literal string "NA". And if you don't care whether v4 contains "" or NA, you can leave off the last line (and delete the last pipe).

Get elements by position from one data frame to another

Let's say we have two data frames:
df1 <- data.frame(A = letters[1:3], B = letters[4:6], C = letters[7:9], stringsAsFactors = FALSE)
A B C
1 a d g
2 b e h
3 c f i
df2 <- data.frame(V1 = 1:3, V2 = 4:6, V3 = 7:9)
V1 V2 V3
1 1 4 7
2 2 5 8
3 3 6 9
I need to build a function that takes as input a single value or a vector containing elements from one of the data frames and returns the elements from the other data frame according to their positional indexes.
The function should work like this:
> matchdf(values = c("a", "e", "i"), dfin = df1, dfout = df2)
[1] 1 5 9
> matchdf(values = c(1, 5, 9), dfin = df2, dfout = df1)
[1] "a" "e" "i"
> matchdf(values = c(1, 1, 1), dfin = df2, dfout = df1)
[1] "a" "a" "a"
This is what I have tried so far:
# Attach dplyr so that the %>% pipe used below is available.
library(dplyr)
# Flatten a data frame into one vector by going through as.matrix() first,
# so the elements come out column after column.
toVec <- function(df) df %>% as.matrix %>% as.vector
matchdf <- function(values, dfin, dfout) toVec(dfout)[toVec(dfin) %in% values]
# But sometimes the output values aren't in correct order:
> matchdf(c("c", "i", "h"), dt1, dt2)
[1] 3 8 9
# should output 3 9 8
> matchdf(values = c("a", "a", "a"), dfin = dt1, dfout = dt2)
[1] 1
# Should output 1 1 1
Feel free to use data.table or/and dplyr if it eases the task. I would prefer a solution without for loops.
Assumptions:
elements from df1 are different from df2
dim(df1) = dim(df2)
# Look up values by position across two data frames of the same shape.
#
# values: vector of elements to search for in `dfin`.
# dfin:   data frame that is searched for each value.
# dfout:  data frame the results are read from, at the cells where `dfin`
#         equals the value (the surrounding text assumes
#         dim(dfin) == dim(dfout)).
#
# Returns a flat vector ordered like `values`; a value that occurs several
# times in `dfin` contributes every matching cell, and a value that does not
# occur contributes nothing.
matchdf <- function(values, dfin, dfout) {
  # lapply() + unlist() always flattens to a plain vector. sapply() would
  # return a matrix whenever every value matched the same number (> 1) of
  # cells, and unlist() leaves a matrix untouched.
  hits <- lapply(values, function(val) dfout[dfin == val])
  unlist(hits)
}
matchdf(c("c", "i", "h"), df1, df2)
#should output 3 9 8
[1] 3 9 8
matchdf(values = c("a", "a", "a"), dfin = df1, dfout = df2)
#should output 1 1 1
[1] 1 1 1
matchdf(values = c("X", "Y", "a"), dfin = df1, dfout = df2)
#should output vector, not list
[1] 1

Advice on writing generic function to encode variables in R

I have a data.frame:
mydata = data.frame(v1 = c("A", "A", "A", "B", "B", "C", "D"),
v2 = c("XY", "XY", "ZZ", "BB", "ZZ", NA, "ZZ"),
v3 = 5)
And I would like to encode each of the characters in the data frame to integers corresponding to each of the levels. I also want to "ignore" NA values. The expected output would be equal to:
output = data.frame(v1 = c(1, 1, 1, 2, 2, 3, 4),
v2 = c(1, 1, 2, 3, 2, NA, 2),
v3 = 5)
My hope is to write a function that accepts a data.frame object AND a list specifying the columns on which I want to perform the operation, something like:
my_function = function(df, vars){
...
}
EDIT: in the example above, "vars" would be = c("v1", "v2")
Any suggestions for how to approach this? I'm open to using packages such as dplyr to help.
Thanks,
D
We can convert to factor and then coerce to numeric
mydata[1:2] <- lapply(mydata[1:2], function(x)
as.numeric(factor(x, levels=unique(x))))
This can be converted to a function
# Recode the selected columns of a data frame as numeric level codes.
#
# df:   data frame to recode.
# vars: columns to recode (anything usable as `df[vars]`).
#
# Within each column the codes follow the order in which values first appear;
# NA entries stay NA. Columns not listed in `vars` are returned unchanged.
myfunction <- function(df, vars) {
  to_codes <- function(column) {
    as_levels <- factor(column, levels = unique(column))
    as.numeric(as_levels)
  }
  recoded <- lapply(df[vars], to_codes)
  df[vars] <- recoded
  df
}
myfunction(mydata, c('v1', 'v2'))
# v1 v2 v3
#1 1 1 5
#2 1 1 5
#3 1 2 5
#4 2 3 5
#5 2 2 5
#6 3 NA 5
#7 4 2 5
If we need it to be further generalized, we may need to check the column classes i.e. whether it is a numeric column and if not, then change to factor with levels specified and coerce to numeric.
mydata[] <- lapply(mydata, function(x)
if(!is.numeric(x)) as.numeric(factor(x, levels=unique(x)))
else x)

How to join multiple data frames using dplyr?

I want to left_join multiple data frames:
dfs <- list(
df1 = data.frame(a = 1:3, b = c("a", "b", "c")),
df2 = data.frame(c = 4:6, b = c("a", "c", "d")),
df3 = data.frame(d = 7:9, b = c("b", "c", "e"))
)
Reduce(left_join, dfs)
# a b c d
# 1 1 a 4 NA
# 2 2 b NA 7
# 3 3 c 5 8
This works because they all have the same b column, but Reduce doesn't let me specify additional arguments that I can pass to left_join. Is there a work around for something like this?
dfs <- list(
df1 = data.frame(a = 1:3, b = c("a", "b", "c")),
df2 = data.frame(c = 4:6, d = c("a", "c", "d")),
df3 = data.frame(d = 7:9, b = c("b", "c", "e"))
)
Update
This kind of works: Reduce(function(...) left_join(..., by = c("b" = "d")), dfs) but when by is more than one element it gives this error: Error: cannot join on columns 'b' x 'd': index out of bounds
I know this is late... today I was introduced to the unanswered questions section. Sorry to bother.
Using left_join()
dfs <- list(
df1 = data.frame(b = c("a", "b", "c"), a = 1:3),
df2 = data.frame(d = c("a", "c", "d"), c = 4:6),
df3 = data.frame(b = c("b", "c", "e"), d = 7:9)
)
# Reducer for Reduce(): left-joins two data frames, keying each one on its
# own first column.
#
# ...: the two data frames to join (left first, then right); they are
#      forwarded unchanged to left_join().
func <- function(...) {
  tables <- list(...)
  left_key <- colnames(tables[[1]])[1]
  right_key <- colnames(tables[[2]])[1]
  # A named vector c(<left key> = "<right key>") is the `by` spec that maps
  # the left key column onto the right one.
  joined <- left_join(..., by = setNames(right_key, left_key))
  joined
}
Reduce( func, dfs)
# b a c d
#1 a 1 4 NA
#2 b 2 NA 7
#3 c 3 5 8
Using merge() :
# Reducer for Reduce(): left-merges two data frames, keying each one on its
# own first column.
#
# ...: the two data frames to combine (left first, then right); they are
#      forwarded unchanged to merge().
#
# Returns the merged data frame, keeping every row of the left input.
func <- function(...) {
  tables <- list(...)
  left_key <- colnames(tables[[1]])[1]
  right_key <- colnames(tables[[2]])[1]
  # Spell out TRUE: `T` is an ordinary variable that can be reassigned, which
  # would silently turn this left merge into an inner one.
  merge(..., by.x = left_key, by.y = right_key, all.x = TRUE)
}
Reduce( func, dfs)
# b a c d
#1 a 1 4 NA
#2 b 2 NA 7
#3 c 3 5 8
Would this work for you?
jnd.tbl <- df1 %>%
left_join(df2, by='b') %>%
left_join(df3, by='d')
Yet another solution:
library(purrr)
library(dplyr)
dfs = list(
df1 = data.frame(a = 1:3, b = c("a", "b", "c")),
df2 = data.frame(c = 4:6, b = c("a", "c", "d")),
df3 = data.frame(d = 7:9, b = c("b", "c", "e"))
)
purrr::reduce(dfs, dplyr::left_join, by = 'b')

join matching columns in a data.frame or data.table

I have the following data.frames:
a <- data.frame(id = 1:3, v1 = c('a', NA, NA), v2 = c(NA, 'b', 'c'))
b <- data.frame(id = 1:3, v1 = c(NA, 'B', 'C'), v2 = c("A", NA, NA))
> a
id v1 v2
1 1 a <NA>
2 2 <NA> b
3 3 <NA> c
> b
id v1 v2
1 1 <NA> A
2 2 B <NA>
3 3 C <NA>
note: There are no ids for which v1 or v2 are defined in both tables; there is only a single unique non-NA value in each column for each id value
I would like to merge these data frames on matching values of "id":
ab <- merge(a, b, by = "id")
but I would also like to combine the two columns v1 and v2, so that the data.frame ab will look like this:
ab <- data.frame(id = 1:3, v1 = c("a", "B", "C"), v2 = c("A", "b", "c"))
> ab
id v1 v2
1 1 a A
2 2 B b
3 3 C c
instead, I get this:
> merge(a, b, by = "id")
id v1.x v2.x v1.y v2.y
1 1 a <NA> <NA> A
2 2 <NA> b B <NA>
3 3 <NA> c C <NA>
it would be helpful to have examples using both data.frame and data.table, so here are the data.table versions of above:
A <- data.table(a, key = 'id')
B <- data.table(b, key = 'id')
A[B]
The type of merge you specify probably won't be possible using merge (with data frames), although saying that usually invites being proved wrong.
You also omit some details: will there always be a single unique non-NA value in each column for each id value? If so, this will work:
ab <- rbind(a,b)
> colFun <- function(x){x[which(!is.na(x))]}
> ddply(ab,.(id),function(x){colwise(colFun)(x)})
id v1 v2
1 1 a A
2 2 B b
3 3 C c
A similar strategy should work with data.tables as well:
abDT <- data.table(ab,key = "id")
> abDT[,list(colFun(v1),colFun(v2)),by = id]
id V1 V2
[1,] 1 a A
[2,] 2 B b
[3,] 3 C c
If your data is as simple as it is above, joran's answer is likely the simplest way. Here's my approach in base:
a <- data.frame(id = 1:3, v1 = c('a', NA, NA), v2 = c(NA, 'b', 'c'))
b <- data.frame(id = 1:3, v1 = c(NA, 'B', 'C'), v2 = c("A", NA, NA))
# Element-wise pick between two vectors: keep x where it is present and fall
# back to y where x is NA. Both sides are taken as character and the result
# is returned as a factor.
decider <- function(x, y) {
  missing_in_x <- is.na(x)
  picked <- ifelse(missing_in_x, as.character(y), as.character(x))
  factor(picked)
}
data.frame(mapply(a, b, FUN = decider))
If your data has different ids (some overlap and some do not), then here's a different approach:
a <- data.frame(id = c(1,2,4,5), v1 = c('a', NA, "q", NA), v2 = c(NA, 'b', 'c', "e"))
b <- data.frame(id = 1:4, v1 = c(NA, "A", "C", 'B'), v2 = c("A", NA, "D", NA))
decider <- function(x, y) factor(ifelse(is.na(x), as.character(y), as.character(x)))
DF <- data.frame(mapply(a, b, FUN = decider))
DF2 <- rbind(b[!b$id %in% DF$id , ], DF)
DF2 <- DF2[order(DF2$id), ]
rownames(DF2) <- 1:nrow(DF2)

Resources