Find Count of Elements from One List in Another List - r

So, if I have two lists, one being a "master list" without repeats, and the other being a subset with possible repeats, I would like to be able to check how many of each element are in the secondary subset list.
So if I have these lists:
a <- (a, b, c, d, e, f, g)
b <- (a, d, c, d, a, f, f, g, c, c)
I'd like to determine how many times each element of list a appears in list b, i.e. the frequency of each. My ideal output would be an R table that looks like:
c <- a b c d e f g
2 0 3 2 0 2 1
I've been trying to think through it with %in% and table()

You can use table and match - but first make the vectors factors so levels not present are included in the output:
# `a` is the master list; as a factor it carries every level, so values that
# never occur in `b` are still tabulated (with a count of 0).
a <- factor(letters[1:7])
b <- factor(c("a", "d", "c", "d", "a", "f", "f", "g", "c", "c"))
# Look up each element of `b` in `a`, then tabulate the matched entries of `a`.
positions <- match(b, a)
table(a[positions])
a b c d e f g
2 0 3 2 0 2 1

If for some reason you want a tidyverse solution, this method preserves the original data type in the lists.
library(tidyverse)
a <- c("a", "b", "c", "d", "e", "f", "g")
b <- c("a", "d", "c", "d", "a", "f", "f", "g", "c", "c")
# One row per master element; map_int() returns the integer count of how
# often each element of `a` occurs in `b`.
tibble(letters = a, count = map_int(a, function(x) sum(b %in% x)))
# A tibble: 7 x 2
letters count
<chr> <int>
1 a 2
2 b 0
3 c 3
4 d 2
5 e 0
6 f 2
7 g 1

Related

Create ID variable per chain of values

I have a dataset that looks like this:
data <- data.frame(Name1 = c("A", "B", "D", "E", "H"),
Name2 = c("B", "C", "E", "G", "I"))
I would like to add an ID column to help me trace groups of names, i.e. who references who? So with the example data, the groups would be:
Name1 Name2 GroupID
A B 1
B C 1
D E 2
E G 2
H I 3
Please note that my original data is not ordered as this example is. Thanks in advance for any help!
You can use the igraph package to make a network from your data set and determine clusters:
data <- data.frame(Name1 = c("A", "B", "D", "E", "H"),
Name2 = c("B", "C", "E", "G", "I"))
library(igraph)
# Build an undirected graph whose edges are the (Name1, Name2) pairs.
graph <- graph_from_data_frame(data, directed = FALSE)
# Find the connected components; clusters$membership holds one component id
# per vertex and is looked up by vertex name below.
clusters <- components(graph)
# Look the id up by vertex *name*. as.character() matters when Name1 is a
# factor (the data.frame() default before R 4.0): indexing with a factor uses
# its integer codes, which would pick vertices by position instead of by name.
data$GroupId <- clusters$membership[as.character(data$Name1)]
That gives:
> data
Name1 Name2 GroupId
1 A B 1
2 B C 1
3 D E 2
4 E G 2
5 H I 3

How to concatenate multiple columns in one and remove duplicates?

I have a dataframe like this one:
A <- c("a", "a", "a", "a")
B <- c("b", "b", "b", "b")
C <- c("c", "a", "c", "c")
D <- c("d", "b", "a", "d")
E <- c("a", "a", "b", "e")
F <- c("b", "b", "c", "f")
G <- c("c", "a", "a", "g")
df <- data.frame(A, B, C, D, E, F, G)
I need to merge all values from the columns A to G, remove duplicates, and store a resulting list in a new column. So, the final result should look like this:
Try this one
> # lapply() always returns a list (one element per row). apply() would
> # simplify to a matrix whenever every row has the same number of unique values.
> df$new <- lapply(seq_len(nrow(df)), function(i) unique(as.matrix(df)[i, ]))
> df
A B C D E F G new
1 a b c d a b c a, b, c, d
2 a b a b a b a a, b
3 a b c a b c a a, b, c
4 a b c d e f g a, b, c, d, e, f, g
A possible solution:
library(tidyverse)
A <- c("a", "a", "a", "a")
B <- c("b", "b", "b", "b")
C <- c("c", "a", "c", "c")
D <- c("d", "b", "a", "d")
E <- c("a", "a", "b", "e")
F <- c("b", "b", "c", "f")
G <- c("c", "a", "a", "g")
df <- data.frame(A, B, C, D, E, F, G)
# For each row, gather the values of every column, keep the distinct ones and
# glue them into a single comma-separated string.
df %>%
  rowwise() %>%
  mutate(new = str_c(unique(c_across(everything())), collapse = ",")) %>%
  ungroup()
#> # A tibble: 4 × 8
#> A B C D E F G new
#> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
#> 1 a b c d a b c a,b,c,d
#> 2 a b a b a b a a,b
#> 3 a b c a b c a a,b,c
#> 4 a b c d e f g a,b,c,d,e,f,g
this is sort of a silly way of doing it, but does this address your issue?
# Transpose once so that every original row becomes a column, then collect
# the distinct values of each row. Works for any number of rows instead of
# spelling out rows 1 to 4 by hand.
rows <- t(df)
lapply(seq_len(ncol(rows)), function(i) unique(rows[, i]))

Remove df rows using information about unrepeated levels between two vectors

df <- data.frame(X = c("a", "b", "c", "a", "b", "c", "a", "b", "c", "d" , "a", "b", "c", "d", "e"),
Y = c("w", "w", "w", "K", "K", "K", "L", "L", "L", "L", "Z", "Z", "Z", "Z", "Z"))
Note that the first vector has 5 levels and the second has 4 levels. My goal is to select the df rows whose level in vector 1 occurs together with every level of vector 2. That is, I want to select the rows that have levels "a", "b" and "c", since "d" appears only twice and "e" appears only once.
I tried to make a list with the common levels and leave only the lines with the common levels by subset. However, it doesn't work because this level list doesn't generate the address of the lines I want to remove. Ex:
common <- c ("a", "b", "c")
df2 <- df [c(common),]
In my real df, there are 64 levels in common, so it doesn't happen "to do by hand". Can someone help me?
I think this is what you want. Essentially splitting X by Y, then looking for all intersecting values that are in every set.
# X values present in every Y group: split X by Y, then intersect the groups.
shared_levels <- Reduce(intersect, split(df$X, df$Y))
df[df$X %in% shared_levels, ]
# X Y
#1 a w
#2 b w
#3 c w
#4 a K
#5 b K
#6 c K
#7 a L
#8 b L
#9 c L
#11 a Z
#12 b Z
#13 c Z
Another way could be to group_by X and select the groups which have all distinct values of Y.
library(dplyr)
df %>%
  group_by(X) %>%
  # Keep an X group only when it covers every distinct Y value in the data;
  # `.` is the whole data frame piped into filter(), so .$Y is the full column.
  filter(n_distinct(Y) == n_distinct(.$Y)) %>%
  # Drop the grouping so later verbs do not silently operate per X group.
  ungroup()
# X Y
# <fct> <fct>
# 1 a w
# 2 b w
# 3 c w
# 4 a K
# 5 b K
# 6 c K
# 7 a L
# 8 b L
# 9 c L
#10 a Z
#11 b Z
#12 c Z
In base R, that would be using ave
# Keep the rows whose X group contains as many distinct Y values as the whole
# Y column. ave() applies FUN to the Y values of each X group and returns one
# value per row; Y is passed in as character, so the result is converted back
# with as.logical() before subset() uses it as the row filter.
subset(df, as.logical(ave(as.character(Y), X,
FUN = function(x) length(unique(x)) == length(unique(Y)))))
Using data.table
library(data.table)
# setDT() converts df to a data.table. For each X group, .SD (that group's
# rows) is kept only when the group has as many distinct Y values as df$Y.
setDT(df)[, .SD[uniqueN(Y) == uniqueN(df$Y)], by = X]

R-Software: Counting occurrence combination in a column based on second column

I have a simple problem (seemingly) but have not yet able to find an appropriately quick/time & resource efficient solution. This is a problem in R-Software.
My data is of format:
INPUT
col1 col2
A q
C w
B e
A r
A t
A y
C q
B w
C e
C r
B t
C y
DESIRED OUTPUT
unit1 unit2 same_col2_freq
A B 1
A C 3
B A 1
B C 2
C A 3
C B 2
That is in input A has occurred in col1 with q, r, t, y occurring in col2. Now, q, r, t, y occurs for B with t so the A-B combination has count 1.
B has occurred in col1 with e, w, t occurring in col2. Now, e, w, t occurs for C with w, e so the B-C combination has count 2.
.... and so on for all combinations in col1.
I have done it using a for loop but it is slow. I am picking unique elements from col1 and then, all the data is iterated for each element of col1. Then I am combining the results using rbind. This is slow and resource costly.
I am looking for an efficient method. Maybe a library, function etc. exists that I am unaware of. I tried using co-occurrence matrix but the number of elements in col1 is of order of ~10,000 and it does not solve my purpose.
Any help is greatly appreciated.
Thanks!
Use merge to join the dataframe with itself and then use aggregate to count within groups. demo:
# Demo data: each row says that unit `col1` was observed with value `col2`.
d <- data.frame(
  col1 = c("A", "C", "B", "A", "A", "A", "C", "B", "C", "C", "B", "C"),
  col2 = c("q", "w", "e", "r", "t", "y", "q", "w", "e", "r", "t", "y")
)
# Self-join on col2: one row per ordered pair of units sharing a col2 value.
dm <- merge(d, d, by = "col2")
# Discard the pairs that match a unit with itself.
dm <- dm[with(dm, col1.x != col1.y), ]
# Count the shared col2 values for every (col1.x, col1.y) pair.
aggregate(col2 ~ col1.x + col1.y, data = dm, FUN = length)
# col1.x col1.y col2
# 1 B A 1
# 2 C A 3
# 3 A B 1
# 4 C B 2
# 5 A C 3
# 6 B C 2
Here is a similar approach (as shown by #cogitovita), but using data.table. Convert the "data.frame" to "data.table" using setDT, then Cross Join (CJ) the unique elements of "col1", grouped by "col2". Subset the rows of the output columns that are not equal (V1!=V2), get the count (.N), grouped by the new columns (.(V1, V2)) and finally order the columns (order(V1,V2))
library(data.table)
# Step 1: within each col2 value, cross join the unique col1 units (V1, V2).
# Step 2: drop the self-pairs (V1 != V2) and count rows (.N) per (V1, V2).
# Step 3: order the result by V1, then V2.
setDT(df)[,CJ(unique(col1), unique(col1)), col2][V1!=V2,
.N, .(V1,V2)][order(V1,V2)]
# V1 V2 N
#1: A B 1
#2: A C 3
#3: B A 1
#4: B C 2
#5: C A 3
#6: C B 2
data
# Example data: 12 (col1, col2) observations stored as character columns.
df <- data.frame(
  col1 = c("A", "C", "B", "A", "A", "A", "C", "B", "C", "C", "B", "C"),
  col2 = c("q", "w", "e", "r", "t", "y", "q", "w", "e", "r", "t", "y"),
  stringsAsFactors = FALSE
)

loop for working with individual values in r

Here is my small dataset.
Indvidual <- c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J")
Parent1 <- c(NA, NA, "A", "A", "C", "C", "C", "E", "A", NA)
Parent2 <- c(NA, NA, "B", "C", "D", "D", "D", NA, "D", NA)
mydf <- data.frame (Indvidual, Parent1, Parent2)
Indvidual Parent1 Parent2
1 A <NA> <NA>
2 B <NA> <NA>
3 C A B
4 D A C
5 E C D
6 F C D
7 G C D
8 H E <NA>
9 I A D
10 J <NA> <NA>
Just consider people who have one or two known parents. I need to derive a score for each individual from the scores that their parents have.
The rule is that if either parent (the names in the Parent1 or Parent2 column) is known (not NA), the individual gets 1 additional point plus the score their parents have. If both parents are known, the higher-scoring parent is taken into consideration.
Here is an example:
Individual "A", has both parents unknown so will get score 0
Individual "C", has both parents known (i.e. A, B)
will get 0 score (maximum of their parents)
plus 1 (as it has either one of parents known)
Thus expected output from above dataframe (with explanation) is:
Indvidual Parent1 Parent2 Scores Explanation
1 A <NA> <NA> 0 0 (Max of parent Scores NA) + 0 (neither parent known)
2 B <NA> <NA> 0 0 (Max of parent Scores NA) + 0 (neither parent known)
3 C A B 1 0 (Max of parent Scores) + 1 (either parent known)
4 D A C 2 1 (Max of parent scores) + 1 (either parent known)
5 E C D 3 2 (Max of parent scores) + 1 (either parent known)
6 F C D 3 2 (Max of parent scores) + 1 (either parent known)
7 G C D 3 2 (Max of parent scores) + 1 (either parent known)
8 H E <NA> 4 3 (Max of parent scores) + 1 (either parent known)
9 I A D 3 2 (Max of parent scores) + 1 (either parent known)
10 J <NA> <NA> 0 0 (Max of parent scores NA) + 0 (neither parent known)
Explanation: As loop goes on, it takes into account on the Scores already calculated.
Max of parent scores
Edits: based on chase's question
For example:
Individual C has two parents A and B, each of which has Scores calculated as 0 and 0
(in row 1 and 2 and column Scores), means that max (c(0,0)) will be 0
Individual E has parents C and D, whose scores in Scores column is (in row 3 and 4),
1 and 2, respectively. So maximum of max(c(1,2)) will be 2.
Example using plyr and a recursive argument
library(plyr)
Indvidual <- c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J")
Parent1 <- c(NA, NA, "A", "A", "C", "C", "C", "E", "A", NA)
Parent2 <- c(NA, NA, "B", "C", "D", "D", "D", NA, "D", NA)
mydf <- data.frame (Indvidual, Parent1, Parent2)
# Recursively score one individual of a pedigree.
#
# x:    a one-row data frame with columns Indvidual, Parent1 and Parent2.
# mydf: the full pedigree table, searched (by Indvidual) for the parents' rows.
#
# Returns c(score, Explanation): Explanation is the larger of the two parents'
# scores (an unknown parent counts as 0) and score is Explanation plus 1 when
# at least one parent is known.
scor.fun <- function(x, mydf) {
  # A parent that is named but has no row of its own is treated as having
  # no known parents, instead of failing on a zero-length condition.
  if (nrow(x) == 0) {
    return(c(0, 0))
  }
  # Score contributed by a single parent; 0 when that parent is unknown.
  parent_score <- function(p) {
    if (is.na(p)) {
      return(0)
    }
    scor.fun(subset(mydf, Indvidual == p), mydf)[1]
  }
  P1 <- as.character(x$Parent1)
  P2 <- as.character(x$Parent2)
  Explanation <- max(parent_score(P1), parent_score(P2))
  # +1 as soon as either parent is known. The previous check tested P1 twice,
  # so an individual with only Parent2 known never received the point.
  score <- as.numeric(!(is.na(P1) && is.na(P2))) + Explanation
  c(score, Explanation)
}
adply(mydf,1,scor.fun,mydf)
Probably not the best idea with the recursion on a big dataframe.
# Pedigree: one row per individual with up to two (possibly unknown) parents.
Individual <- c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J")
Parent1 <- c(NA, NA, "A", "A", "C", "C", "C", "E", "A", NA)
Parent2 <- c(NA, NA, "B", "C", "D", "D", "D", NA, "D", NA)
mydf <- data.frame(Individual, Parent1, Parent2, stringsAsFactors = FALSE)

# Individuals with both parents unknown score 0; everyone else starts unscored.
mydf$Scores <- NA
mydf$Scores[rowSums(is.na(mydf[, c("Parent1", "Parent2")])) == 2] <- 0

# Resolve the scores in passes: each pass scores every individual whose known
# parents have all been scored already.
while (anyNA(mydf$Scores)) {
  # Look up the parents' current scores by position with match(). This keeps
  # the values aligned with the rows of mydf; merge() reorders its result, so
  # scores taken from a merged table can end up on the wrong individual.
  score_p1 <- mydf$Scores[match(mydf$Parent1, mydf$Individual)]
  score_p2 <- mydf$Scores[match(mydf$Parent2, mydf$Individual)]
  ready <- is.na(mydf$Scores) &
    (is.na(mydf$Parent1) | !is.na(score_p1)) &
    (is.na(mydf$Parent2) | !is.na(score_p2))
  # Without progress the loop would never end (a parent that is not listed as
  # an individual, or a cycle in the pedigree), so fail loudly instead.
  if (!any(ready)) {
    stop(
      "Cannot compute scores for: ",
      paste(mydf$Individual[is.na(mydf$Scores)], collapse = ", "),
      call. = FALSE
    )
  }
  # Higher of the known parents' scores, plus 1 for having a known parent.
  mydf$Scores[ready] <- pmax(score_p1[ready], score_p2[ready], na.rm = TRUE) + 1
}

Resources