Match columns with multiple entries in a row and mutate result - r

I have a data frame:
col_1 <- c("A", "A", "B", "B", "C", "C")
col_2 <- c("A", "B", "C", "D", "E", "F")
col_3 <- c("A", "B", "C", "C", "B", "A")
df <- data.frame(col_1, col_2, col_3)
I want to mutate a new column that contains TRUE or FALSE depending on whether each row contains two or more identical entries.
e.g.:
t_f <- c(TRUE, TRUE, TRUE, FALSE, FALSE, FALSE)
Even better, if I could have a column that contains the repeated values, e.g.:
name <- c("A", "B", "C", NA, NA, NA)

For your first requirement
# TRUE for every row in which at least one value occurs more than once.
# anyDuplicated() returns 0 when a row has no repeated value.
df$t_f <- apply(df, 1, anyDuplicated) > 0
And your second
# The value that repeats within each row, or NA when nothing repeats.
# Only the original value columns are scanned, so helper columns added to df
# (such as t_f) never take part in the comparison.
value_cols <- c("col_1", "col_2", "col_3")
df$name <- apply(df[value_cols], 1, function(x) {
  dup <- duplicated(x)
  # The condition is a single TRUE/FALSE, so plain if/else is used, not ifelse().
  if (any(dup)) unname(x[dup][1]) else NA_character_
})

For your second requirement:
col_1 <- c("A", "A", "B", "B", "C", "C")
col_2 <- c("A", "B", "C", "D", "E", "F")
col_3 <- c("A", "B", "C", "C", "B", "A")
df <- data.frame(col_1, col_2, col_3)
# For each row, report the most frequent value when it occurs at least twice;
# otherwise NA.
df$name <- apply(df, 1, function(row_values) {
  # Tabulate once; table() ignores NA entries, so an all-NA row gives an
  # empty table, which is handled explicitly instead of calling max() on it.
  counts <- table(row_values)
  if (length(counts) > 0 && max(counts) >= 2) {
    names(counts)[which.max(counts)]
  } else {
    NA_character_
  }
})
df
#> col_1 col_2 col_3 name
#> 1 A A A A
#> 2 A B B B
#> 3 B C C C
#> 4 B D C <NA>
#> 5 C E B <NA>
#> 6 C F A <NA>

in base R you can try
# Count how often each value occurs in each row (rows = row index, columns = value).
counts <- unclass(table(row(df), as.matrix(df)))
# Column index of the most frequent value in every row.
top <- max.col(counts, ties.method = "first")
# One result per row of df: the value if it occurs at least twice, else NA.
# (Summarising with colSums() instead gives one result per distinct value.)
ifelse(counts[cbind(seq_len(nrow(counts)), top)] >= 2, colnames(counts)[top], NA)
[1] "A" "B" "C" NA NA NA
In tidyverse you can do
library(tidyverse)
df %>%
  # across() + where() replaces the superseded mutate_if()
  mutate(across(where(is.factor), as.character)) %>%
  rowwise() %>%
  mutate(
    # anyDuplicated() is 0 when the row has no repeated value
    dup = anyDuplicated(c(col_1, col_2, col_3)) != 0,
    # first repeated value in the row; [1] on an empty selection yields NA
    which.dup = c(col_1, col_2, col_3)[duplicated(c(col_1, col_2, col_3))][1]
  )
Source: local data frame [6 x 5]
Groups: <by row>
# A tibble: 6 x 5
col_1 col_2 col_3 dup which.dup
<chr> <chr> <chr> <lgl> <chr>
1 A A A TRUE A
2 A B B TRUE B
3 B C C TRUE C
4 B D C FALSE NA
5 C E B FALSE NA
6 C F A FALSE NA

Related

How to concatenate multiple columns in one and remove duplicates?

I have a dataframe like this one:
A <- c("a", "a", "a", "a")
B <- c("b", "b", "b", "b")
C <- c("c", "a", "c", "c")
D <- c("d", "b", "a", "d")
E <- c("a", "a", "b", "e")
F <- c("b", "b", "c", "f")
G <- c("c", "a", "a", "g")
df <- data.frame(A, B, C, D, E, F, G)
I need to merge all values from the columns A to G, remove duplicates, and store a resulting list in a new column. So, the final result should look like this:
Try this one
> # split() + lapply() always yields one list element per row; apply(df, 1, unique)
> # returns a matrix instead whenever every row has the same number of distinct values.
> df$new <- lapply(unname(split(as.matrix(df), row(df))), unique)
> df
A B C D E F G new
1 a b c d a b c a, b, c, d
2 a b a b a b a a, b
3 a b c a b c a a, b, c
4 a b c d e f g a, b, c, d, e, f, g
A possible solution:
library(tidyverse)
A <- c("a", "a", "a", "a")
B <- c("b", "b", "b", "b")
C <- c("c", "a", "c", "c")
D <- c("d", "b", "a", "d")
E <- c("a", "a", "b", "e")
F <- c("b", "b", "c", "f")
G <- c("c", "a", "a", "g")
df <- data.frame(A, B, C, D, E, F, G)
# Collapse the distinct values of each row into one comma-separated string.
df %>%
  rowwise() %>%
  mutate(new = str_c(unique(c_across(everything())), collapse = ",")) %>%
  ungroup()
#> # A tibble: 4 × 8
#> A B C D E F G new
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 a b c d a b c a,b,c,d
#> 2 a b a b a b a a,b
#> 3 a b c a b c a a,b,c
#> 4 a b c d e f g a,b,c,d,e,f,g
this is sort of a silly way of doing it, but does this address your issue?
# One vector of distinct values per row, for any number of rows.
row_values <- as.matrix(df)  # coerce once instead of transposing for every row
lapply(seq_len(nrow(row_values)), function(i) unique(row_values[i, ]))

how to add a column to identify specific combination of values in R?

I have a database with several columns (>20), and 2 of these columns contain the subject names. I would like to add another column containing a number that identifies the combination of the two subjects, regardless of their order.
Here is an example with only the 2 columns of names (I don't include the others for convenience):
ID1 ID2
A B
A C
A B
B C
A B
B A
C B
And here is what i would like to create:
ID1 ID2 CODE
A B 1
A C 2
A B 1
B C 3
A B 1
B A 1
C B 3
I am kind of new to R and I think it can be done with stringr, but I am not sure how.
Thanks for the help!
Simo
# Number each unordered pair of subjects.
# Only the two ID columns are used, so other columns of the data (or a CODE
# column from an earlier run) do not leak into the key.
id_cols <- c("ID1", "ID2")
# sort() makes A/B and B/A produce the same key; the tab separator keeps
# multi-character IDs apart (without it, "AB"+"C" and "A"+"BC" would collide).
pair_key <- apply(df[id_cols], 1, function(ids) paste(sort(ids), collapse = "\t"))
# factor() assigns one integer per distinct key (levels in sorted order).
df$CODE <- as.integer(factor(pair_key))
# ID1 ID2 CODE
# 1 A B 1
# 2 A C 2
# 3 A B 1
# 4 B C 3
# 5 A B 1
# 6 B A 1
# 7 C B 3
Data
df <- data.frame(
ID1 = c("A", "A", "A", "B", "A", "B", "C"),
ID2 = c("B", "C", "B", "C", "B", "A", "B")
)
Try this:
library(dplyr)
#Code
# Build an order-insensitive key per row, number the distinct keys, then drop the key.
new <- df %>% rowwise() %>%
# sort() puts the two IDs in a fixed order, so A/B and B/A give the same key
mutate(Var = paste0(sort(c(ID1, ID2)), collapse = '')) %>%
group_by(Var) %>%
# cur_group_id() is the integer index of the current Var group
mutate(CODE=cur_group_id()) %>%
ungroup() %>%
# the helper key is not part of the result
select(-Var)
Output:
# A tibble: 7 x 3
ID1 ID2 CODE
<chr> <chr> <int>
1 A B 1
2 A C 2
3 A B 1
4 B C 3
5 A B 1
6 B A 1
7 C B 3
Some data used:
#Data
df <- structure(list(ID1 = c("A", "A", "A", "B", "A", "B", "C"), ID2 = c("B",
"C", "B", "C", "B", "A", "B")), class = "data.frame", row.names = c(NA,
-7L))

Count number of element for each row in a matrix [duplicate]

This question already has answers here:
Count number of values in row using dplyr
(5 answers)
Counting number of instances of a condition per row R [duplicate]
(1 answer)
Closed 2 years ago.
Hello I have a matrix such as :
COL1 COL2 COL3
A "A" "B" NA
B "B" "B" "C"
C NA NA NA
D "B" "B" "B"
E NA NA "C"
F "A" "A" "C"
and I would like to get, for each row (A, B, C, D, etc.), the number of entries that are A or B.
Example:
Nb
A 2
B 2
C 0
D 3
E 0
F 2
Does anyone have an idea?
another way is to use sapply:
# Count, per row, how many cells are 'A' or 'B'. %in% never returns NA, so
# missing cells simply do not count.
# seq_len() instead of 1:nrow(df): the latter yields c(1, 0) when df has no rows.
df$n <- sapply(seq_len(nrow(df)), function(i) sum(df[i, ] %in% c('A', 'B')))
# COL1 COL2 COL3 n
# A A B <NA> 2
# B B B C 2
# C <NA> <NA> <NA> 0
# D B B B 3
# E <NA> <NA> C 0
# F A A C 2
You can achieve the same output by using purrr::map_dbl as well. Just replace sapply with map_dbl.
You can try a base R solution with apply():
#Base R
# Per-row count of cells equal to 'A' or 'B'; %in% is FALSE for NA cells,
# so they are never counted.
df$Var <- apply(df, 1, function(cells) sum(cells %in% c('A','B')))
Output:
COL1 COL2 COL3 Var
A A B <NA> 2
B B B C 2
C <NA> <NA> <NA> 0
D B B B 3
E <NA> <NA> C 0
F A A C 2
Some data used:
#Data
df <- structure(list(COL1 = c("A", "B", NA, "B", NA, "A"), COL2 = c("B",
"B", NA, "B", NA, "A"), COL3 = c(NA, "C", NA, "B", "C", "C")), row.names = c("A",
"B", "C", "D", "E", "F"), class = "data.frame")
Or if you feel curious about tidyverse:
library(tidyverse)
#Code
df %>% mutate(id = row_number()) %>%
  left_join(
    # long format: one row per cell, keep the A/B cells and count them per id
    df %>% mutate(id = row_number()) %>%
      pivot_longer(cols = -id) %>%
      filter(value %in% c('A','B')) %>%
      group_by(id) %>%
      summarise(Var = n()),
    by = "id"
  ) %>%
  # Rows without any A/B have no match in the join, so only Var needs filling.
  # replace(is.na(.), 0) would also overwrite the missing COL1..COL3 cells with 0.
  mutate(Var = coalesce(Var, 0L)) %>%
  select(-id)
Output:
COL1 COL2 COL3 Var
1 A B <NA> 2
2 B B C 2
3 <NA> <NA> <NA> 0
4 B B B 3
5 <NA> <NA> C 0
6 A A C 2
library(dplyr)
df <- structure(list(COL1 = c("A", "B", NA, "B", NA, "A"), COL2 = c("B",
"B", NA, "B", NA, "A"), COL3 = c(NA, "C", NA, "B", "C", "C")), row.names = c("A",
"B", "C", "D", "E", "F"), class = "data.frame")
# Row by row: flag each of COL1..COL3 as 1 when it is "A" or "B" (else 0) and add the flags up.
df %>%
  rowwise() %>%
  mutate(sumVar = sum(across(c(COL1:COL3), ~ifelse(. %in% c("A", "B"), 1, 0))))
# A tibble: 6 x 4
# Rowwise:
COL1 COL2 COL3 sumVar
<chr> <chr> <chr> <dbl>
1 A B NA 2
2 B B C 2
3 NA NA NA 0
4 B B B 3
5 NA NA C 0
6 A A C 2

Tabulating list of values in third variable in R

I have following data:
ddf2 = structure(list(col1 = c(3, 3, 2, 1, 1, 1, 3, 2, 1, 1, 3, 1, 1,
2, 1, 1, 1, 2, 3, 1, 1, 3, 2, 3, 3), col2 = c("c", "c", "b",
"b", "b", "a", "b", "c", "b", "b", "c", "c", "b", "b", "a", "c",
"c", "b", "a", "b", "b", "c", "a", "c", "a"), col3 = c("C", "E",
"E", "B", "D", "E", "C", "C", "E", "E", "C", "A", "D", "D", "C",
"E", "A", "A", "A", "D", "A", "A", "B", "A", "E")), .Names = c("col1",
"col2", "col3"), row.names = c(NA, 25L), class = "data.frame")
head(ddf2)
col1 col2 col3
1 3 c C
2 3 c E
3 2 b E
4 1 b B
5 1 b D
6 1 a E
For every combination of col1 and col2, there may be many values of col3:
with(ddf2, ddf2[col1==1 & col2=='b',])
col1 col2 col3
4 1 b B
5 1 b D
9 1 b E
10 1 b E
13 1 b D
20 1 b D
21 1 b A
with(ddf2, table(col1, col2))
col2
col1 a b c
1 2 7 3
2 1 3 1
3 2 1 5
I want to create a table/matrix of col1 and col2 as above but each cell should have a list of unique col3 entries for that set of col1 and col2. I expect following output:
col2
col1 a b c
1 E,C A,B,D,E A,E
2 B A,D,E C
3 A,E C A,C,E
I tried following but it does not work:
with(ddf2, tapply(col3, list(col1,col2), c))
a b c
1 Character,2 Character,7 Character,3
2 "B" Character,3 "C"
3 Character,2 "C" Character,5
How can this be done? Thanks for your help.
One option:
# One row per (col1, col2) pair holding its distinct col3 values as a single string.
# toString() joins with ", "; paste0() without `collapse` returns one string per
# value rather than one string per cell.
d <- aggregate(col3 ~ col2 + col1, data = ddf2, FUN = function(x) toString(unique(x)))
library(reshape2)
# Spread col2 into columns: one row per col1, one column per col2 value.
dcast(d, col1 ~ col2, value.var = "col3")
# col1 a b c
#1 1 E, C B, D, E, A A, E
#2 2 B E, D, A C
#3 3 A, E C C, E, A
Most likely it's possible to do both steps in one, but I'll generously leave it to someone else to figure this out ;)
Or
library(dplyr)
library(tidyr)
ddf2 %>%
  group_by(col1, col2) %>%
  # one string of distinct col3 values per (col1, col2); drop the grouping afterwards
  summarise(col3 = paste(unique(col3), collapse = ", "), .groups = "drop") %>%
  # pivot_wider() replaces the superseded spread(); names_sort keeps columns in a, b, c order
  pivot_wider(names_from = col2, values_from = col3, names_sort = TRUE)
#Source: local data frame [3 x 4]
#
# col1 a b c
#1 1 E, C B, D, E, A A, E
#2 2 B E, D, A C
#3 3 A, E C C, E, A
Edit after comment:
Just tested with tapply and this seems to work (the problem was apparently in calling c()):
# Cross-tabulate col1 by col2; each cell is the distinct col3 values joined by ", ".
with(ddf2, tapply(col3, list(col1, col2), function(vals) toString(unique(vals))))
# a b c
#1 "E, C" "B, D, E, A" "A, E"
#2 "B" "E, D, A" "C"
#3 "A, E" "C" "C, E, A"

R: unique combination (avoid a-b and b-a and identical such as a-a, b-b)

I have the following variable columns -
var1 <- c("a", "b", "a", "a", "c", "a", "b", "b", "c", "b", "c", "c", "d")
var2 <- c("a", "a", "b", "c", "a", "d", "b", "c", "b", "d", "c", "d", "d")
mydf <- data.frame(var1, var2)
I want to find unique variable combination, such that
(a) var1 a- var2 b and var1 b- var2 a are not considered unique.
(b) no identical combination are present -
for example var1 a and var2 a, var1 b and var2 b
I used the following code, but it does not give what I expect:
unique(mydf)
var1 var2
1 a a
2 b a
3 a b
4 a c
5 c a
6 a d
7 b b
8 b c
9 c b
10 b d
11 c c
12 c d
13 d d
My expected output is:
var1 var2
1 a b
2 a c
3 a d
4 b c
5 b d
6 c d
thanks;
This should do it:
# Drop self-pairs (var1 equal to var2).
mydf <- mydf[mydf[, 1] != mydf[, 2], ]
# Sort the two values within each row so that a-b and b-a look identical,
# then keep only the first occurrence of every pair.
sorted_pairs <- data.frame(t(apply(mydf, 1, sort)))
mydf <- mydf[!duplicated(sorted_pairs), ]
> mydf
var1 var2
2 b a
4 a c
6 a d
8 b c
10 b d
12 c d
More of an exercise to teach myself some sets package behavior:
library(sets)  # library() stops with an error if the package is missing; require() only returns FALSE
mydf <- data.frame(var1, var2, stringsAsFactors=FALSE) # unneeded factors are a plague on R/S
# One set per row, built in a single pass instead of growing a list inside a loop.
# unname(): Map() would otherwise label the elements with the var1 strings.
pair_sets <- unname(Map(function(a, b) set(a, b), mydf$var1, mydf$var2))
# A row whose two values are equal gives a one-element set; keep only real pairs.
dlis <- Filter(function(s) length(s) == 2, pair_sets)
unique(dlis)
[[1]]
{"a", "b"}
[[2]]
{"a", "c"}
[[3]]
{"a", "d"}
[[4]]
{"b", "c"}
[[5]]
{"b", "d"}
[[6]]
{"c", "d"}

Resources