separate() in tidyr with NA - r

I have a question related to separate() in the tidyr package. When there is no NA in a data frame, separate() works. I have been using this function a lot. But, today I had a case in which there were NAs in a data frame. separate() returned an error message. I could be very silly. But, I wonder if tidyr may not be designed for this kind of data cleaning. Or is there any way separate() can work with NAs? Thank you very much for taking your time.
Here is an updated sample based on the comments. Say I want to separate characters in y and create new columns. If I remove the row with NA, separate() will work. But, I do not want to delete the row, what could I do?
x <- c("a-1","b-2","c-3")
y <- c("d-4","e-5", NA)
z <- c("f-6", "g-7", "h-8")
foo <- data.frame(x,y,z, stringsAsFactors = F)
ana <- foo %>%
separate(y, c("part1", "part2"))
# > foo
# x y z
# 1 a-1 d-4 f-6
# 2 b-2 e-5 g-7
# 3 c-3 <NA> h-8
# > ana <- foo %>%
# + separate(y, c("part1", "part2"))
# Error: Values not split into 2 pieces at 3

One way would be:
# Swap each missing y for a splittable "NA-NA" placeholder so separate()
# always finds two pieces, then split it into part1 / part2.
res <- separate(
  mutate(foo, y = ifelse(is.na(y), paste0(NA, "-", NA), y)),
  y,
  c("part1", "part2")
)
# The placeholder pieces are the literal string "NA"; turn them back into
# real missing values.
res[res == "NA"] <- NA
res
# x part1 part2 z
#1 a-1 d 4 f-6
#2 b-2 e 5 g-7
#3 c-3 <NA> <NA> h-8

You can use the extra option in separate().
Here's an example from hadley's github issue page
> df <- data.frame(x = c("a", "a b", "a b c", NA))
> df
x
1 a
2 a b
3 a b c
4 <NA>
> df %>% separate(x, c("a", "b"), extra = "merge")
a b
1 a <NA>
2 a b
3 a b c
4 <NA> <NA>
> df %>% separate(x, c("a", "b"), extra = "drop")
a b
1 a <NA>
2 a b
3 a b
4 <NA> <NA>

Related

R with dplyr rename, avoid error if column doesn't exist AND create new column with NAs

We are looking to rename columns in a dataframe in R, however the columns may be missing and this throws an error:
my_df <- data.frame(a = c(1,2,3), b = c(4,5,6))
my_df %>% dplyr::rename(aa = a, bb = b, cc = c)
Error: Can't rename columns that don't exist.
x Column `c` doesn't exist.
our desired output is this, which creates a new column with NA values if the original column does not exist:
> my_df
aa bb c
1 1 4 NA
2 2 5 NA
3 3 6 NA
A possible solution:
library(tidyverse)
my_df <- data.frame(a = c(1,2,3), b = c(4,5,6))
cols <- c(a = NA_real_, b = NA_real_, c = NA_real_)
my_df %>% add_column(!!!cols[!names(cols) %in% names(.)]) %>%
rename(aa = a, bb = b, cc = c)
#> aa bb cc
#> 1 1 4 NA
#> 2 2 5 NA
#> 3 3 6 NA
You can use a named vector with any_of() to rename that won't error on missing variables. I'm uncertain of a dplyr way to then create the missing vars but it's easy enough in base R.
library(dplyr)
cols <- c(aa = "a", bb = "b", cc = "c")
my_df %>%
rename(any_of(cols)) %>%
`[<-`(., , setdiff(names(cols), names(.)), NA)
aa bb cc
1 1 4 NA
2 2 5 NA
3 3 6 NA
Here is a solution using the data.table function setnames. I've added a second "missing" column "d" to demonstrate generality.
library(tidyverse)
library(data.table)
my_df <- data.frame(a = c(1, 2, 3), b = c(4, 5, 6))
curr <- names(my_df)
# Lookup table of the wanted renames, flagged by whether the old column exists.
# Keep the names as character so they can be used directly as column names.
cols <- data.frame(
  new = c("aa", "bb", "cc", "dd"),
  old = c("a", "b", "c", "d"),
  stringsAsFactors = FALSE
) %>%
  mutate(exist = old %in% curr)
foo <- filter(cols, exist)
bar <- filter(cols, !exist)
# Rename by name (old -> new) instead of by position, so the mapping stays
# correct even if my_df has extra columns or a different column order.
setnames(my_df, old = foo$old, new = foo$new)
# Columns that were missing are created under their original names, all NA.
my_df[, bar$old] <- NA
my_df
#> my_df
# aa bb c d
#1 1 4 NA NA
#2 2 5 NA NA
#3 3 6 NA NA

Error when duplicating a row conditionally - R

I have a data frame with columns A, B, C as follows:
A <- c("NX300", "BT400", "GD200")
B <- c("M0102", "N0703", "M0405")
C <- c(NA, "M0104", "N0404")
df <- data.frame (A,B,C)
Instead, I would like to duplicate a row whenever a value in C is not NA and replace the value of B with NA for the duplicated row. This is the desired output:
A1 <- c("NX300", "BT400", "BT400", "GD200", "GD200")
B1 <- c("M0102", "N0703", NA, "M0405", NA)
C1 <- c(NA, NA, "M0104", NA, "N0404")
df1 <- data.frame(A1,B1,C1)
To achieve this, I tried duplicating the row, without replacing B with NA just yet, but I get the following error code:
rbind(df, df[,is.na(C)==FALSE])
Error: object "C" not found
Can anyone help please?
Define a function newrows which accepts a row x and returns it or the duplicated rows and then apply it to each row. No packages are used.
# Expand a single-row data frame `x`:
#   - if C is missing, the row is returned untouched;
#   - otherwise two rows are returned, the first with C blanked and the
#     second with B blanked.
newrows <- function(x) {
  if (is.na(x$C)) {
    return(x)
  }
  without_c <- replace(x, "C", NA)
  without_b <- replace(x, "B", NA)
  rbind(without_c, without_b)
}
do.call("rbind", by(df, 1:nrow(df), newrows))
giving:
A B C
1 NX300 M0102 <NA>
2.2 BT400 N0703 <NA>
2.21 BT400 <NA> M0104
3.3 GD200 M0405 <NA>
3.31 GD200 <NA> N0404
An option would be
library(dplyr)
library(tidyr)  # uncount() is a tidyr function, so dplyr alone is not enough
df %>%
  # Rows with a non-missing C get weight 2, so uncount() emits them twice.
  mutate(i1 = 1 + !is.na(C)) %>%
  uncount(i1) %>%
  # Later occurrences of a repeated B value (the duplicated copies) lose B.
  mutate(B = replace(B, duplicated(B), NA)) %>%
  group_by(A) %>%
  # Within each A, every occurrence of a C value except the last loses C.
  mutate(C = replace(C, duplicated(C, fromLast = TRUE), NA)) %>%
  # Drop the grouping so later steps do not silently operate per A.
  ungroup()
If sorting does not matter, and continuing your first steps you can try:
x <- rbind(df, cbind(df[!is.na(df$C),1:2], C=NA))
x$B[!is.na(x$C)] <- NA
x
# A B C
#1 NX300 M0102 <NA>
#2 BT400 <NA> M0104
#3 GD200 <NA> N0404
#21 BT400 N0703 <NA>
#31 GD200 M0405 <NA>

Group data by factor level, then transform to data frame with colname being levels?

Here is a problem that I can't solve:
Data:
df <- data.frame(f1=c("a", "a", "b", "b", "c", "c", "c"),
v1=c(10, 11, 4, 5, 0, 1, 2))
data.frame:f1 is factor
f1 v1
a 10
a 11
b 4
b 5
c 0
c 1
c 2
# What I want is:(for example, fetch data with the number of element of some level == 2, then to data.frame)
a b
10 4
11 5
Thanks in advance!
I might be missing something simple here , but the below approach using dplyr works.
library(dplyr)
library(tidyr)  # spread() is a tidyr function, so dplyr alone is not enough
# Number of rows a level must have to be kept.
nlevels <- 2
df1 <- df %>%
  # n = number of rows sharing this f1 value
  add_count(f1) %>%
  filter(n == nlevels) %>%
  select(-n) %>%
  # Unique row key so spread() has exactly one value per output cell.
  mutate(rn = row_number()) %>%
  spread(f1, v1) %>%
  select(-rn)
This gives
# a b
# <int> <int>
#1 10 NA
#2 11 NA
#3 NA 4
#4 NA 5
Now, if you want to remove NA's we can do
do.call("cbind.data.frame", lapply(df1, function(x) x[!is.na(x)]))
# a b
#1 10 4
#2 11 5
As we have filtered the dataframe which has only nlevels observations, we would have same number of rows for each column in the final dataframe.
split might be useful here to split df$v1 into parts corresponding to df$f1. Since you are always extracting equal length chunks, it can then simply be combined back to a data.frame:
spl <- split(df$v1, df$f1)
data.frame(spl[lengths(spl)==2])
# a b
#1 10 4
#2 11 5
Or do it all in one call by combining this with Filter:
data.frame(Filter(function(x) length(x)==2, split(df$v1, df$f1)))
# a b
#1 10 4
#2 11 5
Here is a solution using unstack :
unstack(
droplevels(df[ave(df$v1, df$f1, FUN = function(x) length(x) == 2)==1,]),
v1 ~ f1)
# a b
# 1 10 4
# 2 11 5
A variant, similar to @thelatemail's solution :
data.frame(Filter(function(x) length(x) == 2, unstack(df,v1 ~ f1)))
My tidyverse solution would be:
library(tidyverse)
df %>%
group_by(f1) %>%
filter(n() == 2) %>%
mutate(i = row_number()) %>%
spread(f1, v1) %>%
select(-i)
# # A tibble: 2 x 2
# a b
# * <dbl> <dbl>
# 1 10 4
# 2 11 5
or mixing approaches :
as_tibble(keep(unstack(df,v1 ~ f1), ~length(.x) == 2))
Using all base functions (but you should use tidyverse)
# Work on a copy of the question's data frame (the snippet used `x` without
# ever defining it).
x <- df
# Add count of instances
x$len <- ave(x$v1, x$f1, FUN = length)
# Filter, drop the count
x <- x[x$len == 2, c("f1", "v1")]
# Hacky pivot: one column per remaining level, in order of first appearance.
# Levels are taken as character so they work as column names even when f1 is
# a factor.
lvls <- as.character(unique(x$f1))
result <- data.frame(
  lapply(lvls, FUN = function(y) x$v1[x$f1 == y])
)
colnames(result) <- lvls
> result
a b
1 10 4
2 11 5
I would code it like this; maybe it helps you.
library(reshape2)
library(dplyr)
aa <- data.frame(
  v1 = c("a", "a", "b", "b", "c", "c", "c"),
  f1 = c(10, 11, 4, 5, 0, 1, 2)
)
# Count the rows belonging to each level of v1.
cc <- aa %>%
  group_by(v1) %>%
  summarise(id = n())
# Attach each row's level count (merged on the shared column v1).
dd <- merge(aa, cc)
# Keep only levels that occur exactly twice. The count column is `id`;
# `dd$aa` does not exist and would select zero rows.
ee <- dd[dd$id == 2, ]
# Reuse `id` as the within-level position (1, 2, 1, 2, ...).
ee$id <- rep(c(1, 2), nrow(ee) / 2)
dcast(ee, id ~ v1, value.var = "f1")
all done!

Applying tidyr to separate only specific rows by specifying which rows to exclude

I would like to separate a column by a condition that excludes certain rows. This is a minor variation on this question: Applying tidyr separate only to specific rows But instead of specifying which rows to separate, I'd like to specify which rows to exclude from separating.
For example, lets say we want to split all rows of the 'text' column, except for the ones that have here_do in them:
#creating DF for the example
df <- data.frame(var_a = letters[1:5],
var_b = c(sample(1:100, 5)),
text = c("foo_bla",
"here_do",
"oh_yes",
"ba_a",
"lan_d"))
I guess there would be some way of using extract as we see in the related question, but I can't seem to figure out how to modify the "(here)_(do)" part to make it work:
library(tidyr)
extract(df, text, into = c("first", "sec"), "(here)_(do)", remove = FALSE)
If you don't mind using "data.table" instead, you can try:
library(data.table)
setDT(df)[!text %in% "here_do", c("first", "second") := tstrsplit(text, "_")][]
# var_a var_b text first second
# 1: a 40 foo_bla foo bla
# 2: b 4 here_do NA NA
# 3: c 12 oh_yes oh yes
# 4: d 35 ba_a ba a
# 5: e 11 lan_d lan d
One way is to separate everything then "unseparate" the rows you wanted to exclude.
library('tidyverse')
df <- data.frame(var_a = letters[1:5],
var_b = c(sample(1:100, 5)),
text = c("foo_bla",
"here_do",
"oh_yes",
"ba_a",
"lan_d"),
stringsAsFactors = FALSE)
# Split every row first, then restore the rows that should stay whole.
df %>%
separate(text, c('first_val', 'second_val'), remove = FALSE) %>%
mutate(
# Excluded rows keep the full text in first_val ...
first_val = ifelse(text == 'here_do', text, first_val),
# ... and get NA in second_val; all other rows keep their own second piece
# (the original copied first_val here by mistake).
second_val = ifelse(text == 'here_do', NA, second_val))
#> var_a var_b text first_val second_val
#> 1 a 45 foo_bla foo bla
#> 2 b 43 here_do here_do <NA>
#> 3 c 81 oh_yes oh yes
#> 4 d 33 ba_a ba a
#> 5 e 15 lan_d lan d
We can filter out the row that you do not want to separate, separate the rest of the rows, and then join the result back to the original data frame.
library(dplyr)
library(tidyr)
# Separate only the rows that are allowed to be split.
separated <- df %>%
filter(!(text %in% "here_do")) %>%
separate(text, into = c("First", "Second"), remove = FALSE)
# Join onto the original data frame with a left join so that every original
# row is kept in its original order; excluded rows get NA in First / Second.
# (A right_join from the filtered rows puts unmatched rows last in
# dplyr >= 1.0, which reorders the result.)
df2 <- df %>%
left_join(separated, by = c("var_a", "var_b", "text"))
df2
# var_a var_b text First Second
# 1 a 19 foo_bla foo bla
# 2 b 90 here_do <NA> <NA>
# 3 c 21 oh_yes oh yes
# 4 d 6 ba_a ba a
# 5 e 15 lan_d lan d
DATA
set.seed(244)
df <- data.frame(var_a = letters[1:5],
var_b = c(sample(1:100, 5)),
text = c("foo_bla",
"here_do",
"oh_yes",
"ba_a",
"lan_d"))

How to split my columns using a unique and tidyR

I'm working on a data.table with a column like this:
A <- c("a;b;c","a;a;b","d;a;b","f;f;f")
df <- data.frame(A)
I would like to separate this column into 3 columns like this:
seg1 seg2 seg3
1 a b c
2 a b <NA>
3 d a b
4 f <NA> <NA>
The catch is that when I split each row by ";", I need to keep only the unique values within that row.
Here's a tidyverse approach. We split the character in A, keep only the unique values, paste the result back together and separate into three columns:
library(tidyverse)
df %>%
# Collapse each row to its unique values, keeping first-seen order.
# map_chr() yields a plain character column instead of a list column,
# which is what separate() expects.
mutate(A = map_chr(strsplit(as.character(A), ";"),
.f = ~ paste(unique(.x), collapse = ";"))) %>%
# Split on the literal ";" and pad rows that have fewer than three unique
# values with NA on the right, without a "missing pieces" warning.
separate(A, into = c("seg1", "seg2", "seg3"), sep = ";", fill = "right")
Which gives:
# seg1 seg2 seg3
#1 a b c
#2 a b <NA>
#3 d a b
#4 f <NA> <NA>
library(stringr)
A <- c("a;b;c", "a;a;b", "d;a;b", "f;f;f")
df <- data.frame(A)
# One row per input string, one column per ";"-separated piece.
pieces <- str_split_fixed(df$A, ";", 3)
n_seg <- ncol(pieces)
# For every row keep the unique pieces in first-seen order; indexing up to
# n_seg pads the shorter results with NA.
deduped <- apply(pieces, 1, function(piece_row) {
  unique(piece_row)[seq_len(n_seg)]
})
# apply() returns one column per input row, so transpose back before
# converting to a data frame.
df <- as.data.frame(t(deduped))
names(df) <- c("seg1", "seg2", "seg3")
df
# seg1 seg2 seg3
# 1 a b c
# 2 a b <NA>
# 3 d a b
# 4 f <NA> <NA>

Resources