Overlap in row values from previous rows - r

I have a dataframe like this:
set.seed(123)
a <- c("A", "B", "C", "D", "E", "F", "G", "H", "I")
df <- data.frame(
V1 = sample(a,4, replace=TRUE),
V2 = sample(a,4, replace=TRUE),
V3 = sample(a,4, replace=TRUE),
V4 = sample(a,4, replace=TRUE)
)
which looks like
V1 V2 V3 V4
1 C I E G
2 H A E F
3 D E I A
4 H I E I
I'd like to count the number of unique values in a row in comparison to the previous rows, so the result would look like:
V1 V2 V3 V4 V5
1 C I E G 4
2 H A E F 3
3 D E I A 2
4 H I E I 1
V5 equals 4 for row 1 since it's the 1st row and all are unique
V5 equals 3 for row 2 since H, A, and F were not in row 1
V5 equals 2 for row 3 since 1) D and I were not in row 2, and 2) D and A were not in row 1.
V5 equals 1 for row 4 since 1) H was not in row 1, 2) I was not in row 2, and 3) H was not in row 3.
If row 4 were H I E A, then V5 for row 4 would still have been 1, since it has only 1 value not in row 3 (and only 1 not in row 2), even though it has 2 values not in row 1.
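For reference, the rule can also be written out literally as a small sketch (my illustration, not one of the answers below; direct but not efficient): for each row after the first, take the smallest count of values missing from any single earlier row.
rows <- split(as.matrix(df), row(df))   # list of the values in each row
v5 <- sapply(seq_along(rows), function(i) {
  if (i == 1) return(length(unique(rows[[i]])))
  # for every earlier row, count values of row i not present there, then take the minimum
  min(sapply(rows[seq_len(i - 1)], function(prev) length(setdiff(rows[[i]], prev))))
})
v5
# [1] 4 3 2 1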

Here is a multi-step method in base R.
# Create a list of the elements by row, using mike H's method
myList <- strsplit(Reduce(paste0, df), "")
# previous method; for a large df, consider creating t(df) as an object first
# myList <- split(t(df), col(t(df)))
# get pairwise combinations of rows
combos <- t(combn(nrow(df):1, 2))[choose(nrow(df), 2):1,]
# get desired values: sapply runs through the pairs of rows, tapply takes the min within each row
df$cnts <- c(length(unique(myList[[1]])),          # value for first row
             tapply(sapply(1:nrow(combos),         # sapply through pairs, taking set diffs
                           function(x) length(setdiff(myList[[combos[x, 1]]],
                                                      myList[[combos[x, 2]]]))),
                    combos[, 1], min))             # split set-diff lengths by row, get min length
This returns
df
V1 V2 V3 V4 cnts
1 C I E G 4
2 H A E F 3
3 D E I A 2
4 H I E I 1

For tasks like this, it can be helpful to store the rows of "df" as sets in a tabulation (incidence) format:
tab = table(as.matrix(df), row(df)) > 0
#> tab
#
# 1 2 3 4
# A FALSE TRUE TRUE FALSE
# C TRUE FALSE FALSE FALSE
# D FALSE FALSE TRUE FALSE
# E TRUE TRUE TRUE TRUE
# F FALSE TRUE FALSE FALSE
# G TRUE FALSE FALSE FALSE
# H FALSE TRUE FALSE TRUE
# I TRUE FALSE TRUE TRUE
crossprod can then be used to compute, very efficiently, for each pair of rows the number of items that belong to one row but not the other:
ct = crossprod(tab, !tab)
#> ct
#
# 1 2 3 4
# 1 0 3 2 2
# 2 3 0 2 2
# 3 2 2 0 2
# 4 1 1 1 0
Above we can see that, e.g., row 4 contains 1 element that row 1 does not contain, while row 1 contains 2 elements that are not in row 4, etc.
Since here we only care about each row's previous rows and, specifically, the minimum over those comparisons, one way to get the result is:
ct[upper.tri(ct, TRUE)] = Inf ## to ignore 'upper.tri' values in 'max.col'
j_min = max.col(-ct, "first") ## index of the previous row giving the minimum difference, for each row
c(sum(tab[, 1]),
ct[cbind(2:nrow(df), j_min[-1])])
#[1] 4 3 2 1

Here's an approach that uses Reduce and mapply:
df$cols_paste <- strsplit(Reduce(paste0, df), split = "")
df$V5 <- lapply(seq_along(df$cols_paste), function(x){
  if (x == 1) compare = NA
  else compare = df$cols_paste[seq_len(x - 1)]
  min(mapply(function(x, y) length(setdiff(x, y)), df$cols_paste[x], compare))
})
df[,setdiff(names(df), "cols_paste")]
V1 V2 V3 V4 V5
1 C I E G 4
2 H A E F 3
3 D E I A 2
4 H I E I 1
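One small note (my addition, not part of the answer above): because lapply is used, V5 is stored as a list column; if a plain atomic column is preferred, it can be flattened afterwards.
df$V5 <- unlist(df$V5)   # or use sapply() in place of lapply() above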

Related

Calculate probability and consecutive events in a dataframe

My dataset has 575 rows and 368 columns and it looks like this:
NUTS3_2016 URAU_CODE FUA_CODE X2018.01.01.x X2018.01.02.x X2018.01.03.x ...
1 AT130 AT001C1 AT001L3 0.46369280 0.3582241 0.2777274 ...
2 AT211 AT006C1 AT006L2 -0.04453125 -0.3092773 -0.3284180 ...
3 AT312 AT003C1 AT003L3 1.02993164 0.9640137 0.6413086 ...
4 AT323 AT004C1 AT004L3 1.21105239 1.4335363 1.2400620 ...
... ... .... ... ... ... .... ...
I want to calculate the probability that x>2.5 for each row.
I also want to calculate for how many consecutive days x remains >2.5 for each row.
What are your suggestions?
Many thanks
Attempt:
A <- c("a", "b", "c", "d", "e")
B <- c(1:5)
C <- c(1:5)
x <- data.frame(A,B,C)
x$prob <- rowMeans(x[-(1)]>2)
x
# A B C prob
# 1 a 1 1 0
# 2 b 2 2 0
# 3 c 3 3 1
# 4 d 4 4 1
# 5 e 5 5 1
We can use rle for finding the length of the maximum streak.
## Some sample data:
set.seed(47)
data = matrix(rnorm(24, mean = 2.5), nrow = 3)
data = cbind(ID = c("A", "B", "C"), as.data.frame(data))
data
# ID V1 V2 V3 V4 V5 V6 V7 V8
# 1 A 4.494696 2.218235 1.514518 1.034250 2.9938202 3.170779 1.7966118 2.749148
# 2 B 3.211143 2.608776 2.515131 1.577544 0.6717708 2.418922 2.4594218 2.159584
# 3 C 2.685405 1.414263 2.247954 2.539602 2.5914729 3.764241 0.9338379 2.917191
data$max_streak = apply(data[-1], 1, function(x) with(rle(x > 2.5), max(lengths[values])))
# ID V1 V2 V3 V4 V5 V6 V7 V8 max_streak
# 1 A 4.494696 2.218235 1.514518 1.034250 2.9938202 3.170779 1.7966118 2.749148 2
# 2 B 3.211143 2.608776 2.515131 1.577544 0.6717708 2.418922 2.4594218 2.159584 3
# 3 C 2.685405 1.414263 2.247954 2.539602 2.5914729 3.764241 0.9338379 2.917191 3
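Two caveats worth adding (my notes, not part of the original answer): the question's threshold is 2.5, so on the real data the probability column would compare against 2.5 rather than 2; and if a row never exceeds the threshold, lengths[values] is empty and max() returns -Inf with a warning. A sketch under those assumptions, applied to the value columns only (here V1:V8):
vals <- data[paste0("V", 1:8)]             # just the numeric value columns
data$prob <- rowMeans(vals > 2.5)          # share of values above 2.5 per row
data$max_streak <- apply(vals, 1, function(x) {
  r <- rle(x > 2.5)
  if (any(r$values)) max(r$lengths[r$values]) else 0L   # 0 if never above 2.5
})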

Identify if a value is repeated or not by groups in R

I have a data set with many columns and rows. I would like to identify whether a value is repeated (is the same) within a group or not, by creating a new logical variable.
So my data looks like this:
v0 <- c(1,2,3,4,5,6,7,8,9)
v1 <- c("a", "b", "a", "c","e", "c", "b", "b", "e")
v2 <- c("R", NA, "R", "R", "G","C", "R", "R", "G")
dftest <- data.frame(v0, v1, v2)
v0 v1 v2
1 1 a R
2 2 b <NA>
3 3 a R
4 4 c R
5 5 e G
6 6 c C
7 7 b R
8 8 b R
9 9 e G
I need to compare the values of v1 and v2 to determine whether rows with the same v1 value all take the same v2 value or not. So my output df would look like this:
v3 <- c(T, F, T, F, T, F, F, F, T )
dfresult <- data.frame(v0, v1, v2, v3)
v0 v1 v2 v3
1 1 a R TRUE
2 2 b <NA> FALSE
3 3 a R TRUE
4 4 c R FALSE
5 5 e G TRUE
6 6 c C FALSE
7 7 b R FALSE
8 8 b R FALSE
9 9 e G TRUE
Any suggestion would be appreciated. Thanks.
Here is a base R solution, where ave is used
dfresult <- within(dftest, v3 <- as.logical(ave(as.vector(v2),v1,FUN = function(x) length(unique(x))==1)))
such that
> dfresult
v0 v1 v2 v3
1 1 a R TRUE
2 2 b <NA> FALSE
3 3 a R TRUE
4 4 c R FALSE
5 5 e G TRUE
6 6 c C FALSE
7 7 b R FALSE
8 8 b R FALSE
9 9 e G TRUE
One way would be the following. As I read your question, you want to return TRUE when there is only one unique value of v2 within a group, and FALSE otherwise.
library(dplyr)
group_by(dftest, v1) %>%
mutate(v3 = n_distinct(v2) == 1)
# v0 v1 v2 v3
# <dbl> <fct> <fct> <lgl>
#1 1 a R TRUE
#2 2 b NA FALSE
#3 3 a R TRUE
#4 4 c R FALSE
#5 5 e G TRUE
#6 6 c C FALSE
#7 7 b R FALSE
#8 8 b R FALSE
#9 9 e G TRUE
If you use the data.table package, you can do the following.
setDT(dftest)[, v3 := uniqueN(v2) == 1, by = v1][]
An option using the data.table package:
library(data.table)
setDT(dftest)[, v3 := uniqueN(v2)==1L, v1]
output:
v0 v1 v2 v3
1: 1 a R TRUE
2: 2 b <NA> FALSE
3: 3 a R TRUE
4: 4 c R FALSE
5: 5 e G TRUE
6: 6 c C FALSE
7: 7 b R FALSE
8: 8 b R FALSE
9: 9 e G TRUE
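A side note (mine, not from either answer): both n_distinct() and uniqueN() count NA as a distinct value by default, which is exactly why group b (values NA, R, R) comes out FALSE above:
dplyr::n_distinct(c(NA, "R", "R"))                 # 2, so the == 1 test is FALSE
data.table::uniqueN(c(NA, "R", "R"))               # 2 as well
dplyr::n_distinct(c(NA, "R", "R"), na.rm = TRUE)   # 1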

Adding two vectors by names

I have two named vectors
v1 <- 1:4
v2 <- 3:5
names(v1) <- c("a", "b", "c", "d")
names(v2) <- c("c", "e", "d")
I want to add them up by the names, i.e. the expected result is
> v3
a b c d e
1 2 6 9 4
Is there a way to programmatically do this in R? Note the names may not necessarily be in a sorted order, like in v2 above.
Just combine the vectors (using c, for example) and use tapply:
v3 <- c(v1, v2)
tapply(v3, names(v3), sum)
# a b c d e
# 1 2 6 9 4
Or, for fun (since you're just doing sum), continuing with "v3":
xtabs(v3 ~ names(v3))
# names(v3)
# a b c d e
# 1 2 6 9 4
I suppose with "data.table" you could also do something like:
library(data.table)
as.data.table(Reduce(c, mget(ls(pattern = "v\\d"))),
keep.rownames = TRUE)[, list(V2 = sum(V2)), by = V1]
# V1 V2
# 1: a 1
# 2: b 2
# 3: c 6
# 4: d 9
# 5: e 4
(I shared the latter not so much for "data.table" but to show an automated way of capturing the vectors of interest.)
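In the same automated spirit, a rough base R analogue (my sketch, not from the answer; it assumes the vectors of interest are exactly the ones named v1 and v2):
vs <- unlist(mget(ls(pattern = "^v[12]$")))        # names become "v1.a", "v1.b", ...
tapply(vs, sub("^v[12]\\.", "", names(vs)), sum)
#  a b c d e
#  1 2 6 9 4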

In R, how can I access the first element of each level of a factor?

I have a data frame like this:
n = c(2, 2, 3, 3, 4, 4)
n <- as.factor(n)
s = c("a", "b", "c", "d", "e", "f")
df = data.frame(n, s)
df
n s
1 2 a
2 2 b
3 3 c
4 3 d
5 4 e
6 4 f
and I want to access the first element of each level of my factor (and have in this example a vector containing a, c, e).
It is possible to reach the first element of one level, with
df$s[df$n == 2][1]
but it does not work for all levels:
df$s[df$n == levels(n)]
[1] a f
How would you do that?
And to go further, I’d like to modify my data frame to see which is the first element for each level at every occurrence. In my example, a new column should be:
n s rep firstelement
1 2 a a a
2 2 b c a
3 3 c e c
4 3 d a c
5 4 e c e
6 4 f e e
Edit. The first part of my answer addresses the original question, i.e. before "And to go further" (which was added by OP in an edit).
Another possibility, using duplicated. From ?duplicated: "duplicated() determines which elements of a vector or data frame are duplicates of elements with smaller subscripts."
Here we use !, the logical negation (NOT), to select not duplicated elements of 'n', i.e. first elements of each level of 'n'.
df[!duplicated(df$n), ]
# n s
# 1 2 a
# 3 3 c
# 5 4 e
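A small caveat (my note, not from the original answer): duplicated() keeps the first occurrence in row order, whereas match(levels(n), df$n) follows level order; the two agree here only because the data happen to be sorted by n. With shuffled rows, the "first element" is simply the first one encountered, e.g.:
df2 <- df[c(5, 1, 3, 2, 4, 6), ]   # hypothetical reshuffling of the rows
df2[!duplicated(df2$n), ]
#   n s
# 5 4 e
# 1 2 a
# 3 3 c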
Update Didn't see your "And to go further" edit until now. My first suggestion would definitely be to use ave, as already proposed by #thelatemail and #sparrow. But just to dig around in the R toolbox and show you an alternative, here's a dplyr way:
Group the data by n, then use the mutate function to create a new variable 'first' whose value is the first element of s (s[1]):
library(dplyr)
df %>%
  group_by(n) %>%
  mutate(first = s[1])
# n s first
# 1 2 a a
# 2 2 b a
# 3 3 c c
# 4 3 d c
# 5 4 e e
# 6 4 f e
Or go all in with dplyr convenience functions and use first instead of [1]:
df %>%
  group_by(n) %>%
  mutate(first = first(s))
A dplyr solution for your original question would be to use summarise:
df %>%
  group_by(n) %>%
  summarise(first = first(s))
# n first
# 1 2 a
# 2 3 c
# 3 4 e
Here is an approach using match:
df$s[match(levels(n), df$n)]
EDIT: Maybe this looks a bit confusing ...
To get a column which lists the first elements you could use match twice (but with x and table arguments swapped):
df$firstelement <- df$s[match(levels(n), df$n)[match(df$n, levels(n))]]
df$firstelement
# [1] a a c c e e
# Levels: a b c d e f
Let's look at this in detail:
## this returns the first matching elements
match(levels(n), df$n)
# [1] 1 3 5
## when we swap the x and table argument in match we get the level index
## for each df$n (the duplicated indices are important)
match(df$n, levels(n))
# [1] 1 1 2 2 3 3
## results in
c(1, 3, 5)[c(1, 1, 2, 2, 3, 3)]
# [1] 1 1 3 3 5 5
df$s[c(1, 1, 3, 3, 5, 5)]
# [1] a a c c e e
# Levels: a b c d e f
The function ave is useful in these cases:
df$firstelement = ave(df$s, df$n, FUN = function(x) x[1])
df
n s firstelement
1 2 a a
2 2 b a
3 3 c c
4 3 d c
5 4 e e
6 4 f e
In this case I prefer the plyr package; it gives further freedom to manipulate the data.
library(plyr)
ddply(df,.(n),function(subdf){return(subdf[1,])})
n s
1 2 a
2 3 c
3 4 e
You could also use data.table
library(data.table)
dt = as.data.table(df)
dt[, list(firstelement = s[1]), by=n]
which would get you:
n firstelement
1: 2 a
2: 3 c
3: 4 e
The by=n bit groups everything by each value of n so s[1] is getting the first element of each of those groups.
To get this as an extra column you could do:
dt[, newcol := s[1], by=n]
dt
# n s newcol
#1: 2 a a
#2: 2 b a
#3: 3 c c
#4: 3 d c
#5: 4 e e
#6: 4 f e
So this just takes the value of s from the first row of each group and assigns it to a new column.
df$s[sapply(levels(n), function(particular.level) { which(df$n == particular.level)[1]})]
I believe your problem is that you are comparing two vectors: df$n is a vector and levels(n) is a vector. vector == vector only appears to work for you because the length of df$n is a multiple of the length of levels(n), so the shorter vector is silently recycled.
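To make the recycling explicit (my illustration): df$n has 6 elements and levels(n) has 3, so each row ends up compared against a rotating level rather than against all of them.
as.character(df$n)   # "2" "2" "3" "3" "4" "4"
levels(n)            # "2" "3" "4"
df$n == levels(n)    # levels recycled to "2" "3" "4" "2" "3" "4"
# [1]  TRUE FALSE FALSE FALSE FALSE  TRUE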
Surprised not to see this classic in the answer stream yet.
> do.call(rbind, lapply(split(df, df$n), function(x) x[1,]))
## n s
## 2 2 a
## 3 3 c
## 4 4 e

Count unique elements in data frame row and return one with maximum occurrence [duplicate]

This question already has answers here:
How to find mode across variables/vectors within a data row in R
(3 answers)
Closed 9 years ago.
Is it possible to count the unique elements in each data frame row, return the one with the maximum occurrence, and form a vector from the results?
example:
a a a b b b b -> b
c v f w w r t -> w
s s d f b b b -> b
You can use apply to run the table function on every row of the data frame.
df <- read.table(textConnection("a a a b b b b\nc v f w w r t\ns s d f b b b"), header = F)
df$result <- apply(df, 1, function(x) names(table(x))[which.max(table(x))])
df
## V1 V2 V3 V4 V5 V6 V7 result
## 1 a a a b b b b b
## 2 c v f w w r t w
## 3 s s d f b b b b
Yes, with table:
x=c("a", "a", "a", "b" ,"b" ,"b" ,"b")
table(x)
x
a b
3 4
EDIT with data.table
library(data.table)
DT = data.table(x=sample(letters[1:5],10,T), y=sample(letters[1:5],10,T))
#DT
# x y
# 1: d a
# 2: c d
# 3: d c
# 4: c a
# 5: a e
# 6: d c
# 7: c b
# 8: a b
# 9: b c
#10: c d
f = function(x) names(table(x))[which.max(table(x))]
DT[,lapply(.SD,f)]
# x y
#1: c c
Note that if you want to keep ALL maxes, you need to ask for them explicitly.
You can save them as a list inside the data.frame. If there is only one per row, the list will be simplified to a plain vector.
df$result <- apply(df, 1, function(x) {T <- table(x); list(T[which(T==max(T))])})
With Ties for max:
df2 <- df[, 1:6]
df2$result <- apply(df2, 1, function(x) {T <- table(x); list(T[which(T==max(T))])})
> df2
V1 V2 V3 V4 V5 V6 result
1 a a a b b b 3, 3
2 c v f w w r 2
3 s s d f b b 2, 2
With No Ties for max:
df$result <- apply(df, 1, function(x) {T <- table(x); list(T[which(T==max(T))])})
> df
V1 V2 V3 V4 V5 V6 V7 result
1 a a a b b b b 4
2 c v f w w r t 2
3 s s d f b b b 3
