Tabulating list of values in third variable in R

I have following data:
ddf2 = structure(list(col1 = c(3, 3, 2, 1, 1, 1, 3, 2, 1, 1, 3, 1, 1,
2, 1, 1, 1, 2, 3, 1, 1, 3, 2, 3, 3), col2 = c("c", "c", "b",
"b", "b", "a", "b", "c", "b", "b", "c", "c", "b", "b", "a", "c",
"c", "b", "a", "b", "b", "c", "a", "c", "a"), col3 = c("C", "E",
"E", "B", "D", "E", "C", "C", "E", "E", "C", "A", "D", "D", "C",
"E", "A", "A", "A", "D", "A", "A", "B", "A", "E")), .Names = c("col1",
"col2", "col3"), row.names = c(NA, 25L), class = "data.frame")
head(ddf2)
col1 col2 col3
1 3 c C
2 3 c E
3 2 b E
4 1 b B
5 1 b D
6 1 a E
For every combination of col1 and col2, there may be many values of col3:
with(ddf2, ddf2[col1==1 & col2=='b',])
col1 col2 col3
4 1 b B
5 1 b D
9 1 b E
10 1 b E
13 1 b D
20 1 b D
21 1 b A
with(ddf2, table(col1, col2))
    col2
col1 a b c
   1 2 7 3
   2 1 3 1
   3 2 1 5
I want to create a table/matrix of col1 and col2 as above, but each cell should contain the list of unique col3 entries for that combination of col1 and col2. I expect the following output:
    col2
col1 a   b       c
   1 E,C A,B,D,E A,E
   2 B   A,D,E   C
   3 A,E C       A,C,E
I tried the following but it does not work:
with(ddf2, tapply(col3, list(col1,col2), c))
  a           b           c
1 Character,2 Character,7 Character,3
2 "B"         Character,3 "C"
3 Character,2 "C"         Character,5
How can this be done? Thanks for your help.

One option:
d <- with(ddf2, aggregate(col3 ~ col2 + col1, FUN = function(x) paste0(unique(x))))
library(reshape2)
dcast(d, col1 ~ col2, value.var = "col3")
# col1 a b c
#1 1 E, C B, D, E, A A, E
#2 2 B E, D, A C
#3 3 A, E C C, E, A
Most likely it's possible to do both steps in one, but I'll generously leave it to someone else to figure this out ;)
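As a side note, both steps can probably be collapsed into a single call by handing an aggregation function to dcast; a minimal sketch (not part of the original answer, assuming reshape2's fun.aggregate argument):
library(reshape2)
# Reshape and aggregate in one go: each col1/col2 cell collapses its
# unique col3 values into a comma-separated string
dcast(ddf2, col1 ~ col2, value.var = "col3",
      fun.aggregate = function(x) paste(unique(x), collapse = ", "))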
Or
library(dplyr)
library(tidyr)
ddf2 %>%
  group_by(col1, col2) %>%
  summarise(col3 = paste(unique(col3), collapse = ", ")) %>%
  spread(col2, col3)
#Source: local data frame [3 x 4]
#
# col1 a b c
#1 1 E, C B, D, E, A A, E
#2 2 B E, D, A C
#3 3 A, E C C, E, A
Edit after comment:
Just tested with tapply and this seems to work (the problem was apparently in calling c()):
with(ddf2, tapply(col3, list(col1,col2), FUN = function(x) paste(unique(x), collapse = ", ")))
#  a      b            c
#1 "E, C" "B, D, E, A" "A, E"
#2 "B"    "E, D, A"    "C"
#3 "A, E" "C"          "C, E, A"

Related

How to count swapped characters between two columns in R

I have a data frame that looks like this
df <- data.frame(col1 = c("A", "A", "A", "A", "A", "B", "B", "B", "B", "B",
"C", "C", "C", "C", "C"),
col2 = c("A", "B", "C", "D", "E", "A", "B", "C", "D", "E",
"A", "B", "C", "D", "E"))
What I want is something like this:
df <- data.frame(col1 = c("A", "A", "A", "A", "A", "B", "B", "B", "B", "B",
"C", "C", "C", "C", "C"),
col2 = c("A", "B", "C", "D", "E", "A", "B", "C", "D", "E",
"A", "B", "C", "D", "E"),
col3 = c("1","0","0","0","0","1","1","0","0","0","1","1","1","0","0"))
In col3, duplicated pairs are counted as 1 and unique ones as 0. Row 6 is considered a duplicate because the swapped pair ("B", "A") was already counted in row 2 as the unique pair ("A", "B"). I can easily do this in Excel using the IF and COUNTIF functions. Thanks in advance!
We can use pmin and pmax to sort the two values within each row and then apply duplicated to flag the repeated pairs:
transform(
  df,
  col3 = +(duplicated(paste(pmin(col1, col2), pmax(col1, col2))) | col1 == col2)
)
which gives
col1 col2 col3
1 A A 1
2 A B 0
3 A C 0
4 A D 0
5 A E 0
6 B A 1
7 B B 1
8 B C 0
9 B D 0
10 B E 0
11 C A 1
12 C B 1
13 C C 1
14 C D 0
15 C E 0
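To see why the pmin/pmax trick works, it can help to print the canonical key it builds before duplicated is applied; a small illustration on the example data (assuming character columns, as above):
with(df, paste(pmin(col1, col2), pmax(col1, col2)))
# "A A" "A B" "A C" "A D" "A E" "A B" "B B" "B C" "B D" "B E"
# "A C" "B C" "C C" "C D" "C E"
Because each pair is rewritten with its smaller value first, "B A" and "A B" map to the same key "A B".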
Does this work?
library(dplyr)
library(stringr)
df %>%
  mutate(col4 = str_c(col1, col2)) %>%
  mutate(col5 = lapply(col4, function(x) paste(sort(unlist(strsplit(x, ''))), collapse = ''))) %>%
  mutate(col3 = +(duplicated(col5) | (col1 == col2))) %>%
  select(col1, col2, col3)
col1 col2 col3
1 A A 1
2 A B 0
3 A C 0
4 A D 0
5 A E 0
6 B A 1
7 B B 1
8 B C 0
9 B D 0
10 B E 0
11 C A 1
12 C B 1
13 C C 1
14 C D 0
15 C E 0
Here is one option where we look for any duplicates or where col1 and col2 are the same. The + converts the logical result into a 0/1 binary.
df$col3 <- +(duplicated(t(apply(df, 1, sort))) | df$col1 == df$col2)
Output
col1 col2 col3
1 A A 1
2 A B 0
3 A C 0
4 A D 0
5 A E 0
6 B A 1
7 B B 1
8 B C 0
9 B D 0
10 B E 0
11 C A 1
12 C B 1
13 C C 1
14 C D 0
15 C E 0

How to concatenate multiple columns in one and remove duplicates?

I have a dataframe like this one:
A <- c("a", "a", "a", "a")
B <- c("b", "b", "b", "b")
C <- c("c", "a", "c", "c")
D <- c("d", "b", "a", "d")
E <- c("a", "a", "b", "e")
F <- c("b", "b", "c", "f")
G <- c("c", "a", "a", "g")
df <- data.frame(A, B, C, D, E, F, G)
I need to merge all values from columns A to G, remove duplicates, and store the resulting list in a new column. So the final result should look like this:
  A B C D E F G                 new
1 a b c d a b c          a, b, c, d
2 a b a b a b a                a, b
3 a b c a b c a             a, b, c
4 a b c d e f g a, b, c, d, e, f, g
Try this one
> df$new <- apply(df,1,unique)
> df
A B C D E F G new
1 a b c d a b c a, b, c, d
2 a b a b a b a a, b
3 a b c a b c a a, b, c
4 a b c d e f g a, b, c, d, e, f, g
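Note that new is a list column here; if a single comma-separated string per row is preferred instead, a small follow-up sketch (not part of the original answer):
# Flatten each per-row list of unique values into one string
df$new <- sapply(apply(df, 1, unique), paste, collapse = ", ")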
A possible solution:
library(tidyverse)
A <- c("a", "a", "a", "a")
B <- c("b", "b", "b", "b")
C <- c("c", "a", "c", "c")
D <- c("d", "b", "a", "d")
E <- c("a", "a", "b", "e")
F <- c("b", "b", "c", "f")
G <- c("c", "a", "a", "g")
df <- data.frame(A, B, C, D, E, F, G)
df %>%
  rowwise %>%
  mutate(new = c_across(everything()) %>% unique %>% str_c(collapse = ",")) %>%
  ungroup
#> # A tibble: 4 × 8
#> A B C D E F G new
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 a b c d a b c a,b,c,d
#> 2 a b a b a b a a,b
#> 3 a b c a b c a a,b,c
#> 4 a b c d e f g a,b,c,d,e,f,g
This is sort of a silly way of doing it, but does this address your issue?
list(unique(t(df)[,1]),
unique(t(df)[,2]),
unique(t(df)[,3]),
unique(t(df)[,4]))

Match columns with multiple entries in a row and mutate result

I have a data frame:
col_1 <- c("A", "A", "B", "B", "C", "C")
col_2 <- c("A", "B", "C", "D", "E", "F")
col_3 <- c("A", "B", "C", "C", "B", "A")
df <- data.frame(col_1, col_2, col_3)
I want to mutate a new column that contains TRUE or FALSE depending on whether a row has two or more identical entries.
e.g.:
t_f <- c(TRUE, TRUE, TRUE, FALSE, FALSE, FALSE)
Even better, if I could have a column that contains the repeated values, e.g.:
name <- c("A", "B", "C", NA, NA, NA)
For your first requirement:
df$t_f <- apply(df, 1, function(x) any(duplicated(x)))
And your second
df$name <- apply(df, 1, function(x) ifelse(any(duplicated(x)), x[which(duplicated(x))], NA))
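Running both lines on the example data should give roughly:
df
#   col_1 col_2 col_3   t_f name
# 1     A     A     A  TRUE    A
# 2     A     B     B  TRUE    B
# 3     B     C     C  TRUE    C
# 4     B     D     C FALSE <NA>
# 5     C     E     B FALSE <NA>
# 6     C     F     A FALSE <NA>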
For your second requirement:
col_1 <- c("A", "A", "B", "B", "C", "C")
col_2 <- c("A", "B", "C", "D", "E", "F")
col_3 <- c("A", "B", "C", "C", "B", "A")
df <- data.frame(col_1, col_2, col_3)
df$name <- apply(df, 1,
                 function(row) ifelse(max(table(row)) >= 2,
                                      names(table(row))[which.max(table(row))], NA))
df
#> col_1 col_2 col_3 name
#> 1 A A A A
#> 2 A B B B
#> 3 B C C C
#> 4 B D C <NA>
#> 5 C E B <NA>
#> 6 C F A <NA>
In base R you can try
ifelse(colSums(table(row(df), as.matrix(df)) >= 2) == 1, colnames(table(row(df), as.matrix(df))), NA)
A B C D E F
"A" "B" "C" NA NA NA
In tidyverse you can do
library(tidyverse)
df %>%
  mutate_if(is.factor, as.character) %>%
  rowwise() %>%
  mutate(dup = anyDuplicated(c(col_1, col_2, col_3)) != 0) %>%
  mutate(which.dup = c(col_1, col_2, col_3)[which(duplicated(c(col_1, col_2, col_3)))[1]])
Source: local data frame [6 x 5]
Groups: <by row>
# A tibble: 6 x 5
col_1 col_2 col_3 dup which.dup
<chr> <chr> <chr> <lgl> <chr>
1 A A A TRUE A
2 A B B TRUE B
3 B C C TRUE C
4 B D C FALSE NA
5 C E B FALSE NA
6 C F A FALSE NA

Remove/collapse consecutive duplicate values in sequence

I have the following dataframe:
a a a b c c d e a a b b b e e d d
The required result should be
a b c d e a b e d
It means no two consecutive rows should have the same value. How can this be done without using a loop?
As my data set is quite huge, looping takes a lot of time to execute.
The dataframe structure is like the following
a 1
a 2
a 3
b 2
c 4
c 1
d 3
e 9
a 4
a 8
b 10
b 199
e 2
e 5
d 4
d 10
Result:
a 1
b 2
c 4
d 3
e 9
a 4
b 10
e 2
d 4
It should delete the entire row.
One easy way is to use rle:
Here's your sample data:
x <- scan(what = character(), text = "a a a b c c d e a a b b b e e d d")
# Read 17 items
rle returns a list with two components: the run lengths ("lengths") and the value that is repeated in each run ("values").
rle(x)$values
# [1] "a" "b" "c" "d" "e" "a" "b" "e" "d"
Update: For a data.frame
If you are working with a data.frame, try something like the following:
## Sample data
mydf <- data.frame(
V1 = c("a", "a", "a", "b", "c", "c", "d", "e",
"a", "a", "b", "b", "e", "e", "d", "d"),
V2 = c(1, 2, 3, 2, 4, 1, 3, 9,
4, 8, 10, 199, 2, 5, 4, 10)
)
## Use rle, as before
X <- rle(mydf$V1)
## Identify the rows you want to keep: the start of each run, found by
## cumulatively summing the run lengths shifted forward by one
Y <- cumsum(c(1, X$lengths[-length(X$lengths)]))
Y
# [1] 1 4 5 7 8 9 11 13 15
mydf[Y, ]
# V1 V2
# 1 a 1
# 4 b 2
# 5 c 4
# 7 d 3
# 8 e 9
# 9 a 4
# 11 b 10
# 13 e 2
# 15 d 4
Update 2
The "data.table" package has a function rleid that lets you do this quite easily. Using mydf from above, try:
library(data.table)
as.data.table(mydf)[, .SD[1], by = rleid(V1)]
# rleid V2
# 1: 1 1
# 2: 2 2
# 3: 3 4
# 4: 4 3
# 5: 5 9
# 6: 6 4
# 7: 7 10
# 8: 8 2
# 9: 9 4
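Note that V1 is consumed by the grouping expression above, so it does not show up in the result. A variant that keeps every original column (a sketch, assuming a data.table version that provides rowid()):
library(data.table)
# rowid(rleid(V1)) numbers the rows within each run of identical V1 values,
# so keeping the rows numbered 1 retains the first row of every run
as.data.table(mydf)[rowid(rleid(V1)) == 1L]
#    V1 V2
# 1:  a  1
# 2:  b  2
# 3:  c  4
# 4:  d  3
# 5:  e  9
# 6:  a  4
# 7:  b 10
# 8:  e  2
# 9:  d  4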
library(dplyr)
x <- c("a", "a", "a", "b", "c", "c", "d", "e", "a", "a", "b", "b", "b", "e", "e", "d", "d")
x[x!=lag(x, default=1)]
#[1] "a" "b" "c" "d" "e" "a" "b" "e" "d"
EDIT: For data.frame
mydf <- data.frame(
V1 = c("a", "a", "a", "b", "c", "c", "d", "e",
"a", "a", "b", "b", "e", "e", "d", "d"),
V2 = c(1, 2, 3, 2, 4, 1, 3, 9,
4, 8, 10, 199, 2, 5, 4, 10),
stringsAsFactors=FALSE)
The dplyr solution is a one-liner:
mydf %>% filter(V1!= lag(V1, default="1"))
# V1 V2
#1 a 1
#2 b 2
#3 c 4
#4 d 3
#5 e 9
#6 a 4
#7 b 10
#8 e 2
#9 d 4
Post scriptum:
lead(x, 1), suggested by @Carl Witthoft, iterates in reverse order (it keeps the last row of each run rather than the first):
leadit<-function(x) x!=lead(x, default="what")
rows <- leadit(mydf[ ,1])
mydf[rows, ]
# V1 V2
#3 a 3
#4 b 2
#6 c 1
#7 d 3
#8 e 9
#10 a 8
#12 b 199
#14 e 5
#16 d 10
With base R, I like funny algorithmics:
x <- c("a", "a", "a", "b", "c", "c", "d", "e", "a", "a", "b", "b", "b", "e", "e", "d", "d")
x[x!=c(x[-1], FALSE)]
#[1] "a" "b" "c" "d" "e" "a" "b" "e" "d"
Much as I like... errr, love rle, here's a shootoff:
EDIT: Can't figure out exactly what's up with dplyr so I used dplyr::lead. I'm on OS X, R 3.1.2, and the latest dplyr from CRAN.
library(dplyr)            # for lead()
library(microbenchmark)
xlet <- sample(letters, 1e5, rep = TRUE)
rleit <- function(x) rle(x)$values
lagit <- function(x) x[x != lead(x, default = 1)]
tailit <- function(x) x[x != c(tail(x, -1), tail(x, 1))]
microbenchmark(rleit(xlet), lagit(xlet), tailit(xlet), times = 20)
Unit: milliseconds
expr min lq median uq max neval
rleit(xlet) 27.43996 30.02569 30.20385 30.92817 37.10657 20
lagit(xlet) 12.44794 15.00687 15.14051 15.80254 46.66940 20
tailit(xlet) 12.48968 14.66588 14.78383 15.32276 55.59840 20
Tidyverse solution:
library(dplyr)
x <- scan(what = character(), text = "a a a b c c d e a a b b b e e d d")
x <- tibble(x)
x |>
  mutate(id = consecutive_id(x)) |>
  distinct(x, id)
In addition, if there is another column y associated with the consecutive values column, this solution allows some flexibility:
x <- scan(what = character(), text = "a a a b c c d e a a b b b e e d d")
x <- tibble(x, y = runif(length(x)))
x |>
  group_by(id = consecutive_id(x)) |>
  slice_min(y)
We can choose between the different slice functions, like slice_max, slice_min, slice_head, and slice_tail.
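If the goal is simply to keep the first row of each run, as in the original question, rather than the row with the smallest y, slice_head fits the same pattern; a sketch assuming dplyr >= 1.1.0:
# x is the tibble with columns x and y from the previous block
x |>
  group_by(id = consecutive_id(x)) |>
  slice_head(n = 1) |>
  ungroup()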
This Stack Overflow thread appeared in the second edition of R4DS, in the Numbers chapter of the book.

R: unique combination (avoid a-b and b-a and identical such as a-a, b-b)

I have the following variable columns -
var1 <- c("a", "b", "a", "a", "c", "a", "b", "b", "c", "b", "c", "c", "d")
var2 <- c("a", "a", "b", "c", "a", "d", "b", "c", "b", "d", "c", "d", "d")
mydf <- data.frame(var1, var2)
I want to find the unique variable combinations, such that:
(a) var1 a / var2 b and var1 b / var2 a are treated as the same combination (only one is kept), and
(b) no identical pairs are present,
for example var1 a with var2 a, or var1 b with var2 b.
I used the following code, but it does not provide what I am expecting:
unique(mydf)
var1 var2
1 a a
2 b a
3 a b
4 a c
5 c a
6 a d
7 b b
8 b c
9 c b
10 b d
11 c c
12 c d
13 d d
My expected output is:
var1 var2
1 a b
2 a c
3 a d
4 b c
5 b d
6 c d
thanks;
This should do it:
mydf = mydf[mydf[,1] != mydf[,2], ]
mydf = mydf[!duplicated(data.frame(t(apply(mydf, 1, sort)))), ]
> mydf
var1 var2
2 b a
4 a c
6 a d
8 b c
10 b d
12 c d
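If the rows should also be ordered within each pair as in the expected output (a before b, and so on), the canonical pairs can be built directly; a sketch starting from the original mydf and assuming character (not factor) columns:
# Put the alphabetically smaller value in var1, drop self-pairs such as a-a,
# then keep a single copy of each remaining pair
pairs <- data.frame(var1 = pmin(mydf$var1, mydf$var2),
                    var2 = pmax(mydf$var1, mydf$var2))
unique(pairs[pairs$var1 != pairs$var2, ])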
More of an exercise to teach myself some sets package behavior:
require(sets)
mydf <- data.frame(var1, var2, stringsAsFactors=FALSE) # unneeded factors are a plague on R/S
dlis <- list()
for (i in seq(nrow(mydf))) {
  if (length(set(mydf[i, 1], mydf[i, 2])) == 2) {
    dlis <- c(dlis, list(set(mydf[i, 1], mydf[i, 2])))
  }
}
unique(dlis)
[[1]]
{"a", "b"}
[[2]]
{"a", "c"}
[[3]]
{"a", "d"}
[[4]]
{"b", "c"}
[[5]]
{"b", "d"}
[[6]]
{"c", "d"}
