How do I reorganize this data.frame in R - r

I have the following data.frame.
u = c("aa", "bb", "cc", "dd")
v = c(1, 6, 9, 10)
w = c(2, 7, "", 11)
x = c(3, 8, "", 12)
y = c(4, "", "", 13)
z = c(5, "", "", "")
df = data.frame(cbind(u, v, w, x, y, z))
df
u v w x y z
1 aa 1 2 3 4 5
2 bb 6 7 8
3 cc 9
4 dd 10 11 12 13
I want the final product to be reorganized as such
1 aa
2 aa
3 aa
4 aa
5 aa
6 bb
7 bb
8 bb
9 cc
10 dd
11 dd
12 dd
13 dd
14 dd
I have the following script worked up but I'm missing something. I would appreciate guidance on what I'm missing.
dat <- df[,-1]
dat <- dat[,!apply (is.na(dat), 2, all)]
dat[is.na(dat)]="|"
dat <- apply(dat, 1, paste, collapse="|")
dat <- gsub("\\|\\|","", dat)
dat <- trimws(gsub("\\|$","",dat))
all.dat <- unlist(strsplit(dat,"\\|"))
dat.tmp <- data.frame(matrix(ncol = 2, nrow = length(all.dat)))
col1 <- df[,1]
for(i in 1:length(dat)){
tmp <- dat[i]
tmp <- unlist(strsplit(tmp, "\\|"))
for(j in 1:length(tmp)){
dat.tmp[i,1] <- tmp[j]
dat.tmp[i,2] <- as.character(col1[i])
}
print(i)
}
dat.tmp

You can use the reshape() function in the stats package.
# PRE-PROCESS DATA: convert every column to character in place. Assigning
# into `df[]` keeps the data.frame structure; sapply() would collapse it to a
# character matrix, which reshape() cannot work with.
df[] <- lapply(df, as.character)
# PRE-PROCESS DATA: treat empty strings as missing values.
df[df == ""] <- NA
# Stack every column except the id column "u" into one "vars" column.
df.new <- reshape(df, idvar = "u", direction = "long",
                  varying = list(names(df)[-1]), v.names = "vars")
# Drop the padding cells; the test must look at the reshaped frame, because
# the wide `df` has no "vars" column.
df.new <- df.new[!is.na(df.new$vars), ]
# reshape() returns rows ordered by time first; regroup them by id (in the
# order the ids appear in `df`), keeping the column order within each id.
df.new <- df.new[order(match(df.new$u, df$u), df.new$time), ]
rownames(df.new) <- seq_len(nrow(df.new))
You can also use the melt() function in reshape2
# USING THE PRE-PROCESSED df (all columns character, "" replaced by NA)
library(reshape2)
# na.rm = TRUE drops the NA padding cells while melting to long format.
df.new <- melt(df, id.vars = "u", na.rm = TRUE)

This is a fairly strange data structure, since every variable is a factor variable. A second method is to explicitly construct the two vectors of the desired data.frame using t and as.integer and rep for the second variable.
# transpose numeric values and convert to integer vector. repeat categorical
dat <- data.frame(val=as.integer(t(df[-1])), cat=rep(df[,1], each=ncol(df)-1L))
Now, drop the NA values
dat <- dat[!is.na(dat$val),]
dat
val cat
1 1 aa
2 2 aa
3 3 aa
4 4 aa
5 5 aa
6 6 bb
7 7 bb
8 8 bb
11 9 cc
16 10 dd
17 11 dd
18 12 dd
19 13 dd

ind <- apply(df, 1, function(x) sum(!is.na(as.numeric(x[-1]))))
as.data.frame(rep(df$u, ind))
1 aa
2 aa
3 aa
4 aa
5 aa
6 bb
7 bb
8 bb
9 cc
10 dd
11 dd
12 dd
13 dd

Here is a dplyr/tidyr solution
library(dplyr)
library(tidyr)
# Turn empty or single-space cells into NA so they can be dropped on reshape.
df[] <- lapply(df, gsub, pattern = "^$|^ $", replacement = NA)
# pivot_longer() supersedes gather(); values_drop_na = TRUE removes the
# padding cells, leaving one row per real value.
df <- df %>%
  pivot_longer(v:z, names_to = "id", values_to = "value",
               values_drop_na = TRUE) %>%
  arrange(u) %>%
  select(u)

Related

R with dplyr rename, avoid error if column doesn't exist AND create new column with NAs

We are looking to rename columns in a dataframe in R, however the columns may be missing and this throws an error:
my_df <- data.frame(a = c(1,2,3), b = c(4,5,6))
my_df %>% dplyr::rename(aa = a, bb = b, cc = c)
Error: Can't rename columns that don't exist.
x Column `c` doesn't exist.
our desired output is this, which creates a new column with NA values if the original column does not exist:
> my_df
aa bb c
1 1 4 NA
2 2 5 NA
3 3 6 NA
A possible solution:
library(tidyverse)
my_df <- data.frame(a = c(1,2,3), b = c(4,5,6))
cols <- c(a = NA_real_, b = NA_real_, c = NA_real_)
my_df %>% add_column(!!!cols[!names(cols) %in% names(.)]) %>%
rename(aa = a, bb = b, cc = c)
#> aa bb cc
#> 1 1 4 NA
#> 2 2 5 NA
#> 3 3 6 NA
You can use a named vector with any_of() to rename that won't error on missing variables. I'm uncertain of a dplyr way to then create the missing vars but it's easy enough in base R.
library(dplyr)
cols <- c(aa = "a", bb = "b", cc = "c")
my_df %>%
rename(any_of(cols)) %>%
`[<-`(., , setdiff(names(cols), names(.)), NA)
aa bb cc
1 1 4 NA
2 2 5 NA
3 3 6 NA
Here is a solution using the data.table function setnames. I've added a second "missing" column "d" to demonstrate generality.
library(tidyverse)
library(data.table)
my_df <- data.frame(a = c(1,2,3), b = c(4,5,6))
curr <- names(my_df)
# Lookup table of new/old names; `exist` flags the old names present in my_df.
cols <- data.frame(new = c("aa", "bb", "cc", "dd"),
                   old = c("a", "b", "c", "d"),
                   stringsAsFactors = FALSE) %>%
  mutate(exist = old %in% curr)
foo <- filter(cols, exist)
bar <- filter(cols, !exist)
# Rename by an explicit old -> new mapping rather than by position, so the
# result does not depend on the column order or column count of my_df.
setnames(my_df, old = foo$old, new = foo$new)
# Columns that did not exist are created (under their old names) as NA.
my_df[, bar$old] <- NA
my_df
#> my_df
# aa bb c d
#1 1 4 NA NA
#2 2 5 NA NA
#3 3 6 NA NA

data frame from a list of vectors

I have 4 vectors (d1,d2,d3,d4) of different lengths from which I create a data frame like this
df <- data.frame(
x = c(
seq_along(d1),
seq_along(d2),
seq_along(d3),
seq_along(d4)
),
y = c(
d1,
d2,
d3,
d4
),
id = c(
rep("d1", times = length(d1)),
rep("d2", times = length(d2)),
rep("d3", times = length(d3)),
rep("d4", times = length(d4))
))
Adding a new vector means adding it in 3 different places, this is what I'd like to avoid.
Ideally I would like to pass d1,d2,d3,d4 into a function that then returns the data frame.
The first step seems to be to wrap the vectors into a list and name them.
l <- list(d1,d2,d3,d4)
names(l) <- c("d1","d2","d3","d4")
But I am struggling with the 2nd part that probably should be something along the lines of this (pseudo code)
df <- data.frame(
x = flatten(map(l, function(a) seq_along(a))),
y = flatten(l),
id = flatten(map(l, function(a) rep(a.name,times=length(a))))
)
What's the correct way to construct the data frame from the list?
Or is there a better way of doing this?
UPDATE: For demonstrative purposes d1..d4 could be imagined to be
d1 <- pnorm(seq(-2, 2, 0.05))-3
d2 <- pnorm(seq(-3, 3, 0.10))
d3 <- pnorm(seq(-1, 2, 0.05))-4
d4 <- pnorm(seq(-4, 3, 0.15))
You can define a function that takes any number of vectors:
# Build a long-format data frame from any number of named vectors.
#
# ...: vectors, each passed as a named argument (e.g. d1 = 1:3).
# Returns a data.frame with one row per element and columns
#   x    - position of the element within its vector,
#   y    - the element itself,
#   name - the argument name of the vector it came from.
build_df <- function(...) {
  vec_list <- list(...)
  # lapply() always returns a list. sapply() would simplify equal-length
  # inputs (or a single input) to a matrix, which do.call() rejects with
  # "second argument must be a list".
  df <- data.frame(
    x = unlist(lapply(vec_list, seq_along), use.names = FALSE),
    y = do.call("c", unname(vec_list)),
    name = rep(names(vec_list), times = lengths(vec_list))
  )
  rownames(df) <- seq_len(nrow(df))
  df
}
build_df(d1 = 1:3, d2 = 6:9, bananas = 4:6)
#> x y name
#> 1 1 1 d1
#> 2 2 2 d1
#> 3 3 3 d1
#> 4 1 6 d2
#> 5 2 7 d2
#> 6 3 8 d2
#> 7 4 9 d2
#> 8 1 4 bananas
#> 9 2 5 bananas
#> 10 3 6 bananas
Created on 2020-08-03 by the reprex package (v0.3.0)
Your y can be assembled easily with unlist. I needed a for loop to generate x and id. How about this function?
d1 <- pnorm(seq(-2, 2, 0.05))-3
d2 <- pnorm(seq(-3, 3, 0.10))
d3 <- pnorm(seq(-1, 2, 0.05))-4
d4 <- pnorm(seq(-4, 3, 0.15))
my_list <- list(d1 = d1, d2 = d2, d3 = d3, d4 = d4)
# Build a long-format data frame from a named list of vectors.
#
# list: a named list of atomic vectors (the parameter shadows base::list()
#       only as a variable; function calls are unaffected).
# Returns a data.frame with one row per element and columns
#   x  - position of the element within its vector,
#   y  - the element itself,
#   id - the name of the list entry it came from.
# An empty list yields a zero-row data.frame; an unnamed list is an error.
build_df <- function(list) {
  if (length(list) == 0L) {
    # Early exit: 1:length(list) would iterate over c(1, 0) and fail.
    return(data.frame(x = integer(), y = numeric(), id = character()))
  }
  if (is.null(names(list))) {
    stop("`list` must be a named list", call. = FALSE)
  }
  # Vectorised construction instead of growing x and id inside a loop.
  x <- unlist(lapply(list, seq_along), use.names = FALSE)
  id <- rep(names(list), times = lengths(list))
  y <- unname(unlist(list))
  data.frame(x = x, y = y, id = id)
}
df <- build_df(my_list)
head(df)
x y id
1 1 -2.977250 d1
2 2 -2.974412 d1
3 3 -2.971283 d1
4 4 -2.967843 d1
5 5 -2.964070 d1
6 6 -2.959941 d1
We could use mget
library(dplyr)
library(tibble)
library(tidyr)
library(data.table)
mget(paste0("d", 1:4)) %>%
enframe(name = 'id', value = 'y') %>%
unnest(c(y)) %>%
mutate(x = rowid(id))

Extracting and cbinding similarly named variables in a data.frame in R

I have a cbind of 2 data.frames called DATA. Using BASE R, I was wondering how I could extract and then, cbind similarly named variables in DATA and store them as a list?
For the example below, I want all variable AAs, and separately all variable BBs in DATA be separately cbinded and stored as a list?
Note: names could be anything, and the number of variables could be any number. A function(al) solution is highly appreciated.
Note: suppose we have NO ACCESS to r, the only input is DATA.
r <- list(
data.frame(Name = rep("Jacob", 6),
X = c(2,2,1,1,NA, NA),
Y = c(1,1,1,2,1,NA),
Z = rep(3, 6),
out = rep(1, 6)),
data.frame(Name = rep("Jon", 6),
X = c(1,NA,3,1,NA,NA),
Y = c(1,1,1,2,NA,NA),
Z = rep(2, 6),
out = rep(1, 6)),
data.frame(Name = rep("Jon", 6),
X = c(1,NA,3,1,NA,NA),
Y = c(1,1,1,2,2,NA),
Z = rep(2, 6),
out = rep(2, 6)),
data.frame(Name = rep("Jim", 6),
X = c(1,NA,3,1,NA,NA),
Y = c(1,1,1,2,2,NA),
Z = rep(2, 6),
out = rep(1, 6)))
DATA <- do.call(cbind, r) ## DATA: cbind of two data.frames
Here is an option with split. I wouldn't recommend having duplicate column names in a dataset. But if it is really needed, then after the split, change the column names by using sub to remove the . followed by one or more digits at the end of each name
nm1 <- Reduce(intersect, lapply(r, colnames)) # get the common names
lst1 <- split.default(DATA[names(DATA) %in% nm1], names(DATA)[names(DATA) %in% nm1])
lapply(lst1, function(x) setNames(x, sub("\\.\\d+$", "", names(x))))
Or, if we need to use only 'DATA' and not 'r' to find the common column names: this is harder, but we can tabulate how often each column name occurs and select the names that have the highest frequency
tbl <- table(names(DATA))
nm1 <- names(which(tbl==max(tbl)))
Use that in the split.default as before
lst1 <- split.default(DATA[names(DATA) %in% nm1], names(DATA)[names(DATA) %in% nm1])
lapply(lst1, function(x) setNames(x, sub("\\.\\d+$", "", names(x))))
Using OP's new example
r <- list( data.frame( AA = c(2,2,1,1,3,2), BB = c(1,1,1,2,2,NA), CC = 1:6), data.frame( AA = c(1,NA,3,1,3,2), BB = c(1,1,1,2,2,2)), data.frame( AA = c(1,NA,3,1,3,2), BB = c(1,1,1,2,2,2), DD = 0:5) )
DATA <- do.call(cbind, r)
tbl <- table(names(DATA))
nm1 <- names(which(tbl==max(tbl)))
lst1 <- split.default(DATA[names(DATA) %in% nm1], names(DATA)[names(DATA) %in% nm1])
lapply(lst1, function(x) setNames(x, sub("\\.\\d+$", "", names(x))))
#$AA
# AA AA AA
#1 2 1 1
#2 2 NA NA
#3 1 3 3
#4 1 1 1
#5 3 3 3
#6 2 2 2
#$BB
# BB BB BB
#1 1 1 1
#2 1 1 1
#3 1 1 1
#4 2 2 2
#5 2 2 2
#6 NA 2 2

Loop by variable names

I want to create a for loop by variable names.
Each time, I calculate the max between each pair of variables and define a new variable in the data frame df. The new variables look like this: var1_2, var1_3, ... Here is my code:
df=data.frame(matrix(c(1:6), nrow = 2))
colnames(df) = c("x", "y", "z")
for(i in length(names(df))-1){
df = df %>% mutate(paste0("var", i, "_", i+1) = max(names(df)[i], names(df)[i+1]))
}
But it gives an error.
Expected output:
>df
x y z var1_2 var1_3 var2_3
1 3 5 3 5 5
2 4 6 4 6 6
One way via base R,
# Row-wise maximum for every pair of columns, one result column per pair.
# lapply() + cbind always yields a matrix, even for a one-row data frame.
m1 <- do.call(cbind, lapply(combn(names(df), 2, simplify = FALSE),
                            function(i) do.call(pmax, unname(df[i]))))
# Name each result after the positions of the two source columns. The pairs
# must be enumerated over ncol(df), not ncol(m1): the two only coincide when
# df has exactly three columns.
nms <- combn(ncol(df), 2, function(i) paste0('Var', i[1], '_', i[2]))
cbind(df, setNames(data.frame(m1), nms))
# x y z Var1_2 Var1_3 Var2_3
#1 1 3 5 3 5 5
#2 2 4 6 4 6 6
If you really want to use a Loop you can try:
# Enumerate every pair of column positions once, before new columns are added.
ind <- combn(ncol(df), 2)
# One iteration per pair (the number of pairs is ncol(ind), not ncol(df)).
for (k in seq_len(ncol(ind))) {
  pair <- ind[, k]
  name <- paste0("var", pair[1], "_", pair[2])
  # Row-wise maximum of the two columns, stored under the new name.
  df[[name]] <- pmax(df[[pair[1]]], df[[pair[2]]])
}

Combine two or more columns in a dataframe into a new column with a new name

For example if I have this:
n = c(2, 3, 5)
s = c("aa", "bb", "cc")
b = c(TRUE, FALSE, TRUE)
df = data.frame(n, s, b)
n s b
1 2 aa TRUE
2 3 bb FALSE
3 5 cc TRUE
Then how do I combine the two columns n and s into a new column named x such that it looks like this:
n s b x
1 2 aa TRUE 2 aa
2 3 bb FALSE 3 bb
3 5 cc TRUE 5 cc
Use paste.
df$x <- paste(df$n,df$s)
df
# n s b x
# 1 2 aa TRUE 2 aa
# 2 3 bb FALSE 3 bb
# 3 5 cc TRUE 5 cc
For inserting a separator:
# Join n and s with " - " between them.
df$x <- paste(df$n, df$s, sep = " - ")
As already mentioned in comments by Uwe and UseR, a general solution in the tidyverse format would be to use the command unite:
library(tidyverse)
n = c(2, 3, 5)
s = c("aa", "bb", "cc")
b = c(TRUE, FALSE, TRUE)
df = data.frame(n, s, b) %>%
unite(x, c(n, s), sep = " ", remove = FALSE)
Using dplyr::mutate:
library(dplyr)
df <- mutate(df, x = paste(n, s))
df
> df
n s b x
1 2 aa TRUE 2 aa
2 3 bb FALSE 3 bb
3 5 cc TRUE 5 cc
Some examples with NAs and their removal using apply
n = c(2, NA, NA)
s = c("aa", "bb", NA)
b = c(TRUE, FALSE, NA)
c = c(2, 3, 5)
d = c("aa", NA, "cc")
e = c(TRUE, NA, TRUE)
df = data.frame(n, s, b, c, d, e)
# Collapse the usable entries of `x` into a single string joined by `sep`.
#
# x:   a vector (typically one row handed over by apply()).
# sep: separator placed between the kept entries.
# Entries that are NA, empty (""), or the literal text "NA" are skipped.
# Returns a length-one character vector ("" when nothing is kept).
paste_noNA <- function(x, sep = ", ") {
  keep <- !is.na(x) & x != "" & x != "NA"
  # Join directly with `sep`. Building a ", "-separated string and then
  # gsub()-ing the separator would also rewrite any ", " inside the values
  # and would interpret `sep` as a regex replacement string.
  paste(x[keep], collapse = sep)
}
sep=" "
df$x <- apply( df[ , c(1:6) ] , 1 , paste_noNA , sep=sep)
df
We can use paste0:
df$combField <- paste0(df$x, df$y)
If you do not want any padding space introduced in the concatenated field. This is more useful if you are planning to use the combined field as a unique id that represents combinations of two fields.
Instead of
paste (default spaces),
paste0 (force the inclusion of missing NA as character) or
unite (constrained to 2 columns and 1 separator),
I'd suggest an alternative as flexible as paste0 but more careful with NA: stringr::str_c
library(tidyverse)
# check the missing value!!
df <- tibble(
n = c(2, 2, 8),
s = c("aa", "aa", NA_character_),
b = c(TRUE, FALSE, TRUE)
)
df %>%
mutate(
paste = paste(n,"-",s,".",b),
paste0 = paste0(n,"-",s,".",b),
str_c = str_c(n,"-",s,".",b)
) %>%
# convert missing value to ""
mutate(
s_2=str_replace_na(s,replacement = "")
) %>%
mutate(
str_c_2 = str_c(n,"-",s_2,".",b)
)
#> # A tibble: 3 x 8
#> n s b paste paste0 str_c s_2 str_c_2
#> <dbl> <chr> <lgl> <chr> <chr> <chr> <chr> <chr>
#> 1 2 aa TRUE 2 - aa . TRUE 2-aa.TRUE 2-aa.TRUE "aa" 2-aa.TRUE
#> 2 2 aa FALSE 2 - aa . FALSE 2-aa.FALSE 2-aa.FALSE "aa" 2-aa.FALSE
#> 3 8 <NA> TRUE 8 - NA . TRUE 8-NA.TRUE <NA> "" 8-.TRUE
Created on 2020-04-10 by the reprex package (v0.3.0)
extra note from str_c documentation
Like most other R functions, missing values are "infectious": whenever a missing value is combined with another string the result will always be missing. Use str_replace_na() to convert NA to "NA"
There are other great answers, but in the case where you don't know the column names or the number of columns you want to concatenate beforehand, the following is useful.
df <- data.frame(x = letters[1:5], y = letters[6:10], z = letters[11:15])
colNames <- colnames(df) # could be any number of column names here
# drop = FALSE keeps a data.frame even when a single column is selected, so
# apply() always iterates over rows.
df$newColumn <- apply(df[, colNames, drop = FALSE], MARGIN = 1, FUN = function(i) paste(i, collapse = ""))
I'd like to also propose a method for concatenating a large/unknown number of columns. The solution proposed by Ben Ernest can be pretty slow on large datasets.
Below is my proposed solution:
# setup data.frame - Making it large for the time benchmarking
n = rep(c(2, 3, 5), 1000000)
s = rep(c("aa", "bb", "cc"), 1000000)
b = rep(c(TRUE, FALSE, TRUE), 1000000)
df = data.frame(n, s, b)
# The proposed solution:
colNames <- c("n", "s") # could be any number of column names here
# paste0() has no `sep` argument: passing sep = " " to it just appends " " to
# every result. paste() is the function that honours `sep`. df[colNames]
# stays a list of columns even when a single column is selected, and
# unname() keeps column names from being matched to paste()'s own arguments.
df$x <- do.call(paste, c(unname(df[colNames]), sep = " "))
# running system.time on this yields:
# user system elapsed
# 1.861 0.005 1.865
# compare with alternative method:
df$x <- apply(df[, colNames, drop = F], MARGIN = 1,
FUN = function(i) paste(i, collapse = ""))
# running system.time on this yields:
# user system elapsed
# 16.127 0.147 16.304

Resources