Separate rows by matching two columns in similar pattern - r

i have data like
df1 <- data.frame(A = c("P,Q","X,Y"), B = c("P1,Q1",""), C = c("P2,Q2","X2,Y2"))
i am looking for output like
output <- data.frame(A = c("P","Q","X","Y"), B = c("P1","Q1","",""), C = c("P2","Q2","X2","Y2"))
i tried using separate_rows like mentioned below but it is not matching the strings seperated by comma.
separate_rows(df1, A, sep=",") %>%
separate_rows(B) %>%
separate_rows(C)

I like splitstackshape package for such operations,
library(splitstackshape)
cSplit(df1, splitCols = names(df1), sep = ',', direction = 'long')
# A B C
#1: P P1 P2
#2: Q Q1 Q2

you simply have to do :
library(tidyr)
separate_rows(df1, A, B, C, convert = TRUE)
Output :
A B C
1 P P1 P2
2 Q Q1 Q2
Edit if you have NA and empty strings :
data:
df1 <- data.frame(A = c("P,Q","X,Y"), B = c("P1,Q1",""), C =
c("P2,Q2","X2,Y2"))
Code:
df1 <- data.frame(lapply(df1, as.character), stringsAsFactors=FALSE)
df1[df1 == ""] <- "0,0"
df1 <- separate_rows(df1, A, B, C, convert = TRUE)
df1[df1 == "0"] <- ""
Output :
A B C
1 P P1 P2
2 Q Q1 Q2
3 X X2
4 Y Y2

An option using base R with strsplit
data.frame(lapply(df1, function(x) strsplit(as.character(x), ",")[[1]]))
# A B C
#1 P P1 P2
#2 Q Q1 Q2
Or with scan
data.frame(lapply(df1, function(x)
scan(text = as.character(x), what = "", sep=",", quiet = TRUE)))

As suggested by Gainz's answer, separate_rows(df1, A, B, C, convert = T) works really well.
However, if you do have blank cells in the dataframe then it does become harder to use, since it will give you an error about all the columns not having the same number of rows.
I suggest using a column that you know will have no blank values. Let's assume it is column A.
I would first then convert the dataframe to a tibble, and all factor columns to character columns. Then I would replace the blank cells with a string with the correct number of commas. Then separate_rows() should be able to work correctly.
Then the code will look as follows:
df1_tibble <- df1 %>%
as_tibble() %>%
mutate_if(is.factor, as.character)
df1_clean <- df1_tibble %>%
mutate(count = str_count(A, ",") + 1) %>%
mutate(temp_str = map_chr(count, ~ rep("", .x) %>% paste0(collapse = ","))) %>%
mutate_at(vars(B, C), funs(ifelse(str_length(.) == 0, temp_str, .))) %>%
select(A, B, C)
df1_clean
#> # A tibble: 2 x 3
#> A B C
#> <chr> <chr> <chr>
#> 1 P,Q P1,Q1 P2,Q2
#> 2 X,Y , X2,Y2
df1_clean %>% separate_rows(A, B, C)
#> # A tibble: 4 x 3
#> A B C
#> <chr> <chr> <chr>
#> 1 P P1 P2
#> 2 Q Q1 Q2
#> 3 X "" X2
#> 4 Y "" Y2

Related

Collapsing Columns in R using tidyverse with mutate, replace, and unite. Writing a function to reuse?

Data:
ID
B
C
1
NA
x
2
x
NA
3
x
x
Results:
ID
Unified
1
C
2
B
3
B_C
I'm trying to combine colums B and C, using mutate and unify, but how would I scale up this function so that I can reuse this for multiple columns (think 100+), instead of having to write out the variables each time? Or is there a function that's already built in to do this?
My current solution is this:
library(tidyverse)
Data %>%
mutate(B = replace(B, B == 'x', 'B'), C = replace(C, C == 'x', 'C')) %>%
unite("Unified", B:C, na.rm = TRUE, remove= TRUE)
We may use across to loop over the column, replace the value that corresponds to 'x' with column name (cur_column())
library(dplyr)
library(tidyr)
Data %>%
mutate(across(B:C, ~ replace(., .== 'x', cur_column()))) %>%
unite(Unified, B:C, na.rm = TRUE, remove = TRUE)
-output
ID Unified
1 1 C
2 2 B
3 3 B_C
data
Data <- structure(list(ID = 1:3, B = c(NA, "x", "x"), C = c("x", NA,
"x")), class = "data.frame", row.names = c(NA, -3L))
Here are couple of options.
Using dplyr -
library(dplyr)
cols <- names(Data)[-1]
Data %>%
rowwise() %>%
mutate(Unified = paste0(cols[!is.na(c_across(B:C))], collapse = '_')) %>%
ungroup -> Data
Data
# ID B C Unified
# <int> <chr> <chr> <chr>
#1 1 NA x C
#2 2 x NA B
#3 3 x x B_C
Base R
Data$Unified <- apply(Data[cols], 1, function(x)
paste0(cols[!is.na(x)], collapse = '_'))

Extract character list values from data.frame rows and reshape data

I have a variable x with character lists in each row:
dat <- data.frame(id = c(rep('a',2),rep('b',2),'c'),
x = c('f,o','f,o,o','b,a,a,r','b,a,r','b,a'),
stringsAsFactors = F)
I would like to reshape the data so that each row is a unique (id, x) pair such as:
dat2 <- data.frame(id = c(rep('a',2),rep('b',3),rep('c',2)),
x = c('f','o','a','b','r','a','b'))
> dat2
id x
1 a f
2 a o
3 b a
4 b b
5 b r
6 c a
7 c b
I've attempted to do this by splitting the character lists and keeping only the unique list values in each row:
dat$x <- sapply(strsplit(dat$x, ','), sort)
dat$x <- sapply(dat$x, unique)
dat <- unique(dat)
> dat
id x
1 a f, o
3 b a, b, r
5 c a, b
However, I'm not sure how to proceed with converting the row lists into individual row entries.
How would I accomplish this? Or is there a more efficient way of converting a list of strings to reshape the data as described?
You can use tidytext::unnest_tokens:
library(tidytext)
library(dplyr)
dat %>%
unnest_tokens(x1, x) %>%
distinct()
id x1
1 a f
2 a o
3 b b
4 b a
5 b r
6 c b
7 c a
A base R method with two lines is
#get list of X potential vars
x <- strsplit(dat$x, ",")
# construct full data.frame, then use unique to return desired rows
unique(data.frame(id=rep(dat$id, lengths(x)), x=unlist(x)))
This returns
id x
1 a f
2 a o
6 b b
7 b a
9 b r
13 c b
14 c a
If you don't want to write out the variable names yourself, you can use setNames.
setNames(unique(data.frame(rep(dat$id, lengths(x)), unlist(x))), names(dat))
We could use separate_rows
library(tidyverse)
dat %>%
separate_rows(x) %>%
distinct()
# id x
#1 a f
#2 a o
#3 b b
#4 b a
#5 b r
#6 c b
#7 c a
A solution can be achieved using splitstackshape::cSplit to split x column into mulltiple columns. Then gather and filter will help to achieve desired output.
library(tidyverse)
library(splitstackshape)
dat %>% cSplit("x", sep=",") %>%
mutate_if(is.factor, as.character) %>%
gather(key, value, -id) %>%
filter(!is.na(value)) %>%
select(-key) %>% unique()
# id value
# 1 a f
# 3 b b
# 5 c b
# 6 a o
# 8 b a
# 10 c a
# 13 b r
Base solution:
temp <- do.call(rbind, apply( dat, 1,
function(z){ data.frame(
id=z[1],
x = scan(text=z['x'], what="",sep=","),
stringsAsFactors=FALSE)} ) )
Read 2 items
Read 3 items
Read 4 items
Read 3 items
Read 2 items
Warning messages:
1: In data.frame(id = z[1], x = scan(text = z["x"], what = "", sep = ",")) :
row names were found from a short variable and have been discarded
2: In data.frame(id = z[1], x = scan(text = z["x"], what = "", sep = ",")) :
row names were found from a short variable and have been discarded
3: In data.frame(id = z[1], x = scan(text = z["x"], what = "", sep = ",")) :
row names were found from a short variable and have been discarded
4: In data.frame(id = z[1], x = scan(text = z["x"], what = "", sep = ",")) :
row names were found from a short variable and have been discarded
5: In data.frame(id = z[1], x = scan(text = z["x"], what = "", sep = ",")) :
row names were found from a short variable and have been discarded
temp[!duplicated(temp),]
#------
id x
1 a f
2 a o
6 b b
7 b a
9 b r
13 c b
14 c a
To get rid of all the messages and warnings:
temp <- do.call(rbind, apply( dat, 1,
function(z){ suppressWarnings(data.frame(id=z[1],
x = scan(text=z['x'], what="",sep=",", quiet=TRUE), stringsAsFactors=FALSE)
)} ) )
temp[!duplicated(temp),]

Remove exact rows and frequency of rows of a data.frame that are in another data.frame in r

Consider the following two data.frames:
a1 <- data.frame(A = c(1:5, 2, 4, 2), B = letters[c(1:5, 2, 4, 2)])
a2 <- data.frame(A = c(1:3,2), B = letters[c(1:3,2)])
I would like to remove the exact rows of a1 that are in a2 so that the result should be:
A B
4 d
5 e
4 d
2 b
Note that one row with 2 b in a1 is retained in the final result. Currently, I use a looping statement, which becomes extremely slow as I have many variables and thousands of rows in my data.frames. Is there any built-in function to get this result?
The idea is, add a counter for duplicates to each file, so you can get a unique match for each occurrence of a row. Data table is nice because it is easy to count the duplicates (with .N), and it also gives the necessary function (fsetdiff) for set operations.
library(data.table)
a1 <- data.table(A = c(1:5, 2, 4, 2), B = letters[c(1:5, 2, 4, 2)])
a2 <- data.table(A = c(1:3,2), B = letters[c(1:3,2)])
# add counter for duplicates
a1[, i := 1:.N, .(A,B)]
a2[, i := 1:.N, .(A,B)]
# setdiff gets the exception
# "all = T" allows duplicate rows to be returned
fsetdiff(a1, a2, all = T)
# A B i
# 1: 4 d 1
# 2: 5 e 1
# 3: 4 d 2
# 4: 2 b 3
You could use dplyr to do this. I set stringsAsFactors = FALSE to get rid of warnings about factor mismatches.
library(dplyr)
a1 <- data.frame(A = c(1:5, 2, 4, 2), B = letters[c(1:5, 2, 4, 2)], stringsAsFactors = FALSE)
a2 <- data.frame(A = c(1:3,2), B = letters[c(1:3,2)], stringsAsFactors = FALSE)
## Make temp variables to join on then delete later.
# Create a row number
a1_tmp <-
a1 %>%
group_by(A, B) %>%
mutate(tmp_id = row_number()) %>%
ungroup()
# Create a count
a2_tmp <-
a2 %>%
group_by(A, B) %>%
summarise(count = n()) %>%
ungroup()
## Keep all that have no entry int a2 or the id > the count (i.e. used up a2 entries).
left_join(a1_tmp, a2_tmp, by = c('A', 'B')) %>%
ungroup() %>% filter(is.na(count) | tmp_id > count) %>%
select(-tmp_id, -count)
## # A tibble: 4 x 2
## A B
## <dbl> <chr>
## 1 4 d
## 2 5 e
## 3 4 d
## 4 2 b
EDIT
Here is a similar solution that is a little shorter. This does the following: (1) add a column for row number to join both data.frame items (2) a temporary column in a2 (2nd data.frame) that will show up as null in the join to a1 (i.e. indicates it's unique to a1).
library(dplyr)
left_join(a1 %>% group_by(A,B) %>% mutate(rn = row_number()) %>% ungroup(),
a2 %>% group_by(A,B) %>% mutate(rn = row_number(), tmpcol = 0) %>% ungroup(),
by = c('A', 'B', 'rn')) %>%
filter(is.na(tmpcol)) %>%
select(-tmpcol, -rn)
## # A tibble: 4 x 2
## A B
## <dbl> <chr>
## 1 4 d
## 2 5 e
## 3 4 d
## 4 2 b
I think this solution is a little simpler (perhaps very little) than the first.
I guess this is similar to DWal's solution but in base R
a1_temp = Reduce(paste, a1)
a1_temp = paste(a1_temp, ave(seq_along(a1_temp), a1_temp, FUN = seq_along))
a2_temp = Reduce(paste, a2)
a2_temp = paste(a2_temp, ave(seq_along(a2_temp), a2_temp, FUN = seq_along))
a1[!a1_temp %in% a2_temp,]
# A B
#4 4 d
#5 5 e
#7 4 d
#8 2 b
Here's another solution with dplyr:
library(dplyr)
a1 %>%
arrange(A) %>%
group_by(A) %>%
filter(!(paste0(1:n(), A, B) %in% with(arrange(a2, A), paste0(1:n(), A, B))))
Result:
# A tibble: 4 x 2
# Groups: A [3]
A B
<dbl> <fctr>
1 2 b
2 4 d
3 4 d
4 5 e
This way of filtering avoids creating extra unwanted columns that you have to later remove in the final output. This method also sorts the output. Not sure if it's what you want.

R: Extract columns from list of data.frames in a tibble

I am wondering how to manipulate a list containing data.frames stored in a tibble.
Specifically, I would like to extract two columns from a data.frame that are stored in a tibble list column.
I would like to go from this tibble c
random_data<-list(a=letters[1:10],b=LETTERS[1:10])
x<-as.data.frame(random_data, stringsAsFactors=FALSE)
y<-list()
y[[1]]<-x[1,,drop=FALSE]
y[[3]]<-x[2,,drop=FALSE]
c<-tibble(z=c(1,2,3),my_data=y)
to this tibble d
d<-tibble(z=c(1,2,3),a=c('a',NA,'b'),b=c('A',NA,'B'))
thanks
Iain
c2 is the final output.
library(tidyverse)
c2 <- c %>%
filter(!map_lgl(my_data, is.null)) %>%
unnest() %>%
right_join(c, by = "z") %>%
select(-my_data)
You could create a function f to change out the NULL values, then apply it to the my_data column and finish with unnest.
library(dplyr); library(tidyr)
unnest(mutate(c, my_data = lapply(my_data, f)))
# # A tibble: 3 x 3
# z a b
# <dbl> <chr> <chr>
# 1 1 a A
# 2 2 <NA> <NA>
# 3 3 b B
Where f is a helper function to change out the NULL values, and is defined as
f <- function(x) {
if(is.null(x)) data.frame(a = NA, b = NA) else x
}
I think this does the trick with d the requested tibble:
library(dplyr)
new.y <- lapply(y, function(x) if(is.null(x)) data.frame(a = NA, b = NA) else x)
d <- cbind(z = c(1, 2, 3), bind_rows(new.y)) %>% tbl_df()
# # A tibble: 3 x 3
# z a b
# <dbl> <fctr> <fctr>
# 1 1 a A
# 2 2 NA NA
# 3 3 b B
Do you know your column names ahead of time?
extract_column <- function( d, column_name ) {
if( is.null(d) ) {
NA_character_
} else {
as.character(d[[column_name]])
}
}
cc %>%
dplyr::mutate(
a = purrr::map_chr(.$my_data, extract_column, column_name="a"),
b = purrr::map_chr(.$my_data, extract_column, column_name="b")
) %>%
dplyr::select(-my_data)
(I renamed your c tibble to cc so it can't collide with c().)

Concatenating all rows within a group using dplyr

Suppose I have a dataframe like this:
hand_id card_id card_name card_class
A 1 p alpha
A 2 q beta
A 3 r theta
B 2 q beta
B 3 r theta
B 4 s gamma
C 1 p alpha
C 2 q beta
I would like to concatenate the card_id, card_name, and card_class into one single row per hand level A, B, C. So the result would look something like this:
hand_id combo_1 combo_2 combo_3
A 1-2-3 p-q-r alpha-beta-theta
B 2-3-4 q-r-s beta-theta-gamma
....
I attempted to do this using group_by and mutate, but I can't seem to get it to work
data <- read_csv('data.csv')
byHand <- group_by(data, hand_id) %>%
mutate(combo_1 = paste(card_id),
combo_2 = paste(card_name),
combo_3 = paste(card_class))
Thank you for your help.
You were kind of close!
library(tidyr)
library(dplyr)
data <- read_csv('data.csv')
byHand <- group_by(data, hand_id) %>%
summarise(combo_1 = paste(card_id, collapse = "-"),
combo_2 = paste(card_name, collapse = "-"),
combo_3 = paste(card_class, collapse = "-"))
or using summarise_each:
byHand <- group_by(data, hand_id) %>%
summarise_each(funs(paste(., collapse = "-")))
Here is another option using data.table
library(data.table)
setDT(data)[, lapply(.SD, paste, collapse="-") , by = hand_id]
# hand_id card_id card_name card_class
#1: A 1-2-3 p-q-r alpha-beta-theta
#2: B 2-3-4 q-r-s beta-theta-gamma
#3: C 1-2 p-q alpha-beta
Not very familiar with dplyr... so here's my attempt without dplyr
df <- read_csv('data.csv')
res <- lapply(split(df, df$hand_id),function(x){
sL <- apply(x[,-1], 2, function(y) paste(y, collapse = "-"))
d <- data.frame(x$hand_id[1], rbind(sL))
names(d) <- c("hand_id", "combo_1", "combo_2", "combo_3")
return(d)
})
res <- do.call("rbind",res)
rownames(res) <- NULL
Here's the output:
## hand_id combo_1 combo_2 combo_3
## 1 A 1-2-3 p-q-r alpha-beta-theta
## 2 B 2-3-4 q-r-s beta-theta-gamma
## 3 C 1-2 p-q alpha-beta
If you have NAs in your data, you can use na.omit() inline with str_c(). unique() will also work if you only want the distinct.
data:
hand_id card_id card_name card_class
<chr> <dbl> <chr> <chr>
1 A 1 p alpha
2 A 2 q beta
3 A 3 r theta
4 A NA NA NA
5 B 2 q beta
6 B 3 r theta
7 B 4 s gamma
8 C 1 p alpha
9 C 2 q beta
code:
data %>%
group_by(hand_id) %>%
summarize(card_id = str_c(na.omit(card_id), collapse = "-"),
card_name = str_c(na.omit(card_name), collapse = "-"),
card_class = str_c(na.omit(card_class), collapse = "-"))
output:
hand_id card_id card_name card_class
* <chr> <chr> <chr> <chr>
1 A 1-2-3 p-q-r alpha-beta-the…
2 B 2-3-4 q-r-s beta-theta-gam…
3 C 1-2 p-q alpha-beta

Resources