Subset and compare on two columns - r

I have a table made up of three columns. A person identifier, a column of event type (A or B) and a column of dates when the event happened.
This is shown here:
Person Event EventDate
1 A 22/03/15
1 A 22/05/15
1 B 12/12/15
1 B 12/12/15
2 B 01/13/12
2 B 02/03/12
2 B 03/08/14
2 A 05/09/14
3 B 02/02/02
3 A 03/05/14
3 B 03/06/14
3 A 17/11/15
3 A 17/02/16
3 A 18/05/16
3 A 23/06/16
I want to subset the data. The subset should capture all event A rows within a Person that occur after that Person's first event B. The output would be:
Person Event EventDate
2 A 05/09/14
3 A 03/05/14
3 A 17/11/15
3 A 17/02/16
3 A 18/05/16
3 A 23/06/16
I think the problem I have is knowing how to compare rows for a Person based on two column comparison (Event and EventDate).
This is the dput of the original data above
# NOTE: this dput holds a single character column named
# "Person..Event...EventDate"; each element is one whole row of the table
# ("<Person> <Event> <EventDate>") and has to be split before use.
structure(list(Person..Event...EventDate = c("1 A 22/03/15",
"1 A 22/05/15", "1 B 12/12/15", "1 B 12/12/15", "2 B 01/13/12",
"2 B 02/03/12", "2 B 03/08/14", "2 A 05/09/14", "3 B 02/02/02",
"3 A 03/05/14", "3 B 03/06/14", "3 A 17/11/15", "3 A 17/02/16",
"3 A 18/05/16", "3 A 23/06/16")), .Names = "Person..Event...EventDate", class = "data.frame", row.names = c(NA,
-15L))

We can use data.table. Convert the 'data.frame' to 'data.table' (setDT(df1)). After ordering by 'Person' and 'EventDate' (converted to Date class), we group by 'Person' and use the cumulative sum of the 'B' events to pick out the 'A' events that follow a 'B', which gives the required subset.
library(data.table)
# Sort chronologically within each Person, then, per Person, keep the 'A' rows
# that come after at least one 'B' row: cumsum(Event == "B") > 0 is TRUE from
# the first 'B' onwards. Filtering .SD on the sorted table avoids mixing row
# indices of the sorted and unsorted data, and also handles Persons whose
# first event is an 'A' that is later followed by a 'B' and more 'A's.
setDT(df1)[order(Person, as.Date(EventDate, "%d/%m/%y"))][,
  .SD[Event == "A" & cumsum(Event == "B") > 0L], by = Person]
# Person Event EventDate
#1: 2 A 05/09/14
#2: 3 A 03/05/14
#3: 3 A 17/11/15
#4: 3 A 17/02/16
#5: 3 A 18/05/16
#6: 3 A 23/06/16
Or we can use dplyr
library(dplyr)
df1 %>%
  # chronological order within each Person (dates are written as dd/mm/yy)
  arrange(Person, as.Date(EventDate, "%d/%m/%y")) %>%
  group_by(Person) %>%
  # cumsum(Event == "B") > 0 is TRUE from the first 'B' onwards, so this keeps
  # only the 'A' rows that follow the Person's first 'B' -- including the case
  # where the Person's very first event is an 'A'.
  filter(Event == "A" & cumsum(Event == "B") > 0) %>%
  ungroup()
# Person Event EventDate
# <int> <chr> <chr>
#1 2 A 05/09/14
#2 3 A 03/05/14
#3 3 A 17/11/15
#4 3 A 17/02/16
#5 3 A 18/05/16
#6 3 A 23/06/16
data
# Example data from the question: Person (integer), Event ("A" or "B") and
# EventDate (character, written as dd/mm/yy).
# NOTE: "01/13/12" has month 13, so as.Date(EventDate, '%d/%m/%y') yields NA
# for that row.
df1 <- structure(list(Person = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L, 3L, 3L, 3L), Event = c("A", "A", "B", "B", "B", "B",
"B", "A", "B", "A", "B", "A", "A", "A", "A"), EventDate = c("22/03/15",
"22/05/15", "12/12/15", "12/12/15", "01/13/12", "02/03/12", "03/08/14",
"05/09/14", "02/02/02", "03/05/14", "03/06/14", "17/11/15", "17/02/16",
"18/05/16", "23/06/16")), .Names = c("Person", "Event", "EventDate"
), class = "data.frame", row.names = c(NA, -15L))

This can be done using sqldf. I'm assuming the data is sorted by date.
library(sqldf)

v1 <- structure(list(Person = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L, 3L, 3L, 3L), Event = c("A", "A", "B", "B", "B", "B", "B", "A", "B", "A",
"B", "A", "A", "A", "A"), EventDate = c("22/03/15", "22/05/15", "12/12/15",
"12/12/15", "01/10/12", "02/03/12", "03/08/14", "05/09/14", "02/02/02",
"03/05/14", "03/06/14", "17/11/15", "17/02/16", "18/05/16", "23/06/16")), .Names
= c("Person", "Event", "EventDate"), class = "data.frame", row.names = c(NA,
-15L))
v1$EventDate <- as.Date(v1$EventDate, '%d/%m/%y')
# v2: the earliest 'B' event of each Person. Sort by Person and date before
# de-duplicating so the result does not depend on the input row order
# (Person 2's 'B' rows are not in date order in this data).
v2 <- v1[v1$Event == 'B', ]
v2 <- v2[order(v2$Person, v2$EventDate), ]
v2 <- v2[!duplicated(v2$Person), ]
# v3: all 'A' events.
v3 <- v1[v1$Event == 'A', ]
# Keep the 'A' events dated strictly after the same Person's first 'B'.
sqldf("select a.* from v3 a , v2 b where a.EventDate > b.EventDate And a.Person = b.Person")
Person Event EventDate
1 2 A 2014-09-05
2 3 A 2014-05-03
3 3 A 2015-11-17
4 3 A 2016-02-17
5 3 A 2016-05-18
6 3 A 2016-06-23
> v1
Person Event EventDate
1 1 A 2015-03-22
2 1 A 2015-05-22
3 1 B 2015-12-12
4 1 B 2015-12-12
5 2 B 2012-10-01
6 2 B 2012-03-02
7 2 B 2014-08-03
8 2 A 2014-09-05
9 3 B 2002-02-02
10 3 A 2014-05-03
11 3 B 2014-06-03
12 3 A 2015-11-17
13 3 A 2016-02-17
14 3 A 2016-05-18
15 3 A 2016-06-23

Related

How To Remove NA Values Dependent On A Condition in R?

Let's say I have this data frame. How would I go about removing only the NA values associated with name a without physically removing them manually?
a 1 4
a 7 3
a NA 4
a 6 3
a NA 4
a NA 3
a 2 4
a NA 3
a 1 4
b NA 2
c 3 NA
I've tried using the function !is.na, but that removes all the NA values in the column ID1 for all the names. How would I specifically target the ones that are associated with name a?
You could subset your data frame as follows:
# Drop only the rows where name is "a" and ID1 is missing.
df_new <- df[with(df, !(name == "a" & is.na(ID1))), ]
This can also be written as:
# Keep a row when its name is not "a" or its ID1 is present.
df_new <- df[with(df, name != "a" | !is.na(ID1)), ]
With dplyr:
library(dplyr)
# Drop only the rows where name is "a" and ID1 is missing.
filter(df, !(name == "a" & is.na(ID1)))
Or with subset:
# Keep a row when its name is not "a" or its ID1 is present.
subset(df, subset = name != "a" | !is.na(ID1))
Output
name ID1 ID2
1 a 1 4
2 a 7 3
3 a 6 3
4 a 2 4
5 a 1 4
6 b NA 2
7 c 3 NA
Data
# Example data: `name` is character; `ID1` and `ID2` are integer and contain NA.
df <- structure(list(name = c("a", "a", "a", "a", "a", "a", "a", "a",
"a", "b", "c"), ID1 = c(1L, 7L, NA, 6L, NA, NA, 2L, NA, 1L, NA,
3L), ID2 = c(4L, 3L, 4L, 3L, 4L, 3L, 4L, 3L, 4L, 2L, NA)), class = "data.frame", row.names = c(NA,
-11L))

R increment by 1 for every change in value column and restart the counter

I would like to do something very similar to this question:
Increment by 1 for every change in column
But I want to restart the counter when var1 is "c".
using
# Number the runs of identical consecutive values in var1 (1, 2, 3, ...).
df$var2 <- with(rle(as.character(df$var1)), rep(seq_along(values), lengths))
results in column var 2
var1 var2 Should be
a 1 1
a 1 1
1 2 2
0 3 3
b 4 4
b 4 4
b 4 4
c 5 1
1 6 2
1 6 2
In data.table you can use rleid to get a run-length-id for var1 within each group.
library(data.table)
setDT(df)
# Every "c" opens a new block (cumsum(var1 == "c")); rleid() then numbers the
# runs of identical var1 values, starting again from 1 within each block.
df[, var2 := rleid(var1), by = .(cumsum(var1 == "c"))]
df
# var1 var2
# 1: a 1
# 2: a 1
# 3: 1 2
# 4: 0 3
# 5: b 4
# 6: b 4
# 7: b 4
# 8: c 1
# 9: 1 2
#10: 1 2
and using dplyr
library(dplyr)
df %>%
  # a new counter block starts at every row where var1 is "c"
  group_by(group = cumsum(var1 == "c")) %>%
  # within a block, add 1 each time var1 differs from the previous row
  mutate(var2 = cumsum(var1 != lag(var1, default = first(var1))) + 1) %>%
  # return an ungrouped result without the helper column
  ungroup() %>%
  select(-group)
data
# Example data: var1 is a factor with levels "0", "1", "a", "b", "c".
df <- structure(list(var1 = structure(c(3L, 3L, 2L, 1L, 4L, 4L, 4L,
5L, 2L, 2L), .Label = c("0", "1", "a", "b", "c"), class = "factor")),
class = "data.frame", row.names = c(NA, -10L))
We can use the OP's code with rle in base R with ave
# Split var1 into blocks that start at every 'c', number the runs of identical
# values within each block, and convert ave()'s character result to integer.
df$var2 <- as.integer(ave(
  as.character(df$var1),
  cumsum(df$var1 == 'c'),
  FUN = function(x) {
    runs <- rle(x)
    rep(seq_along(runs$values), runs$lengths)
  }
))
df$var2
#[1] 1 1 2 3 4 4 4 1 2 2
data
# Example data: var1 is a factor with levels "0", "1", "a", "b", "c".
df <- structure(list(var1 = structure(c(3L, 3L, 2L, 1L, 4L, 4L, 4L,
5L, 2L, 2L), .Label = c("0", "1", "a", "b", "c"), class = "factor")),
class = "data.frame", row.names = c(NA,
-10L))

How to group by a Variable A to get a minimum value from Variable B based on a condition on Variable C

I have data with 3 variables A, B, C. I need to group by A and get the minimum value of B among the rows where C is non-zero.
> data
A B C
1 a 3 0
2 a 6 1
3 a 9 2
4 a 12 2
5 b 3 0
6 b 6 0
7 b 9 0
8 b 12 4
Expected Output:
> output
1 2
1 a 6
2 b 12
I tried doing this which was running for more than 2 hours:
# For each level of A, the smallest B among the rows with C > 0.
rbind(by(data, data$A, function(grp) min(grp$B[grp$C > 0])))
We group by 'A', get the min of 'B' where 'C' is not 0
library(dplyr)
# Per level of A: the smallest B among the rows whose C is greater than 0.
summarise(group_by(df1, A), B = min(B[C > 0]))
# A tibble: 2 x 2
# A B
# <chr> <int>
#1 a 6
#2 b 12
Or a faster option would be to filter first, then do the group_by
# Keep only the rows with C > 0 first, then take the smallest B per level of A.
summarise(group_by(filter(df1, C > 0), A), B = min(B))
Or with data.table
library(data.table)
# Convert in place, then compute the smallest B with C > 0 for each A.
setDT(df1)
df1[, .(B = min(B[C > 0])), by = A]
data
# Example data: A is character; B and C are integer.
df1 <- structure(list(A = c("a", "a", "a", "a", "b", "b", "b", "b"),
B = c(3L, 6L, 9L, 12L, 3L, 6L, 9L, 12L), C = c(0L, 1L, 2L,
2L, 0L, 0L, 0L, 4L)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8"))

How to collapse session path data into from-to paths for visualizing network data?

What are some ways to transform session path data such as this:
df
# Session Link1 Link2 Link3 Link4 Link5
# 1 1 A B
# 2 2 C
# 3 3 D A B
# 4 4 C F G H J
# 5 5 A B C
Into a data set that looks like this:
desired
# Session From To
# 1 1 A B
# 2 2 C <NA>
# 3 3 D A
# 4 3 A B
# 5 4 C F
# 6 4 F G
# 7 4 G H
# 8 4 H J
# 9 5 A B
# 10 5 B C
Data for reproducibility:
# Input: one row per Session; Link1..Link5 are factors, and the empty string ""
# marks an unused link slot.
df <- structure(list(Session = 1:5, Link1 = structure(c(1L, 2L, 3L, 2L, 1L), .Label = c("A", "C", "D"), class = "factor"), Link2 = structure(c(3L, 1L, 2L, 4L, 3L), .Label = c("", "A", "B", "F"), class = "factor"), Link3 = structure(c(1L, 1L, 2L, 4L, 3L), .Label = c("", "B", "C", "G"), class = "factor"), Link4 = structure(c(1L, 1L, 1L, 2L, 1L), .Label = c("", "H"), class = "factor"), Link5 = structure(c(1L, 1L, 1L, 2L, 1L), .Label = c("", "J"), class = "factor")), .Names = c("Session", "Link1", "Link2", "Link3", "Link4", "Link5"), class = "data.frame", row.names = c(NA, -5L))
# Target: one row per consecutive From -> To pair; To is NA for a session with
# a single link.
desired <- structure(list(Session = c(1L, 2L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L), From = structure(c(1L, 3L, 4L, 1L, 3L, 5L, 6L, 7L, 1L, 2L), .Label = c("A", "B", "C", "D", "F", "G", "H"), class = "factor"), To = structure(c(2L, NA, 1L, 2L, 4L, 5L, 6L, 7L, 2L, 3L), .Label = c("A", "B", "C", "F", "G", "H", "J"), class = "factor")), .Names = c("Session", "From", "To"), class = "data.frame", row.names = c(NA, -10L))
We could use data.table. Convert the 'data.frame' to 'data.table' (setDT(df)). Reshape from 'wide' to 'long' format with melt, specifying 'Session' as the id variable. Remove the 'value' elements that are empty [value!='']. Grouped by 'Session', we append an 'NA' to the 'value' column for those 'Session' that have only a single row (if...else), then create two columns ('From' and 'To') by removing the last and the first element of 'V1', respectively, again grouped by 'Session'.
library(data.table)#v1.9.5+
# Long format (one row per Session and link slot), without the empty slots.
melt(setDT(df), id.vars = 'Session')[value != ''][,
  # pad single-link sessions with NA so they still yield one From/To row
  if (.N == 1L) c(value, NA) else value, by = Session][,
  # From = everything but the last element, To = everything but the first
  list(From = V1[-.N], To = V1[-1L]), by = Session]
# Session From To
#1: 1 A B
#2: 2 C NA
#3: 3 D A
#4: 3 A B
#5: 4 C F
#6: 4 F G
#7: 4 G H
#8: 4 H J
#9: 5 A B
#10: 5 B C
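The above could be simplified to a single block after the melt step. Note that tmp[-.N] does not work here: .N is the number of rows in the group, but tmp is one element longer than that when a lone value has been padded with NA, so -.N would drop the wrong element. Dropping the last element of tmp with tmp[1:(.N-1)] (or, equivalently, tmp[-length(tmp)]) avoids this.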
melt(setDT(df), id.vars = 'Session')[value != '', {
  # pad single-link sessions with NA so they still yield one From/To row
  tmp <- if (.N == 1L) c(value, NA) else value
  # Drop the last element using tmp's own length: after padding, tmp can be
  # one element longer than .N, so indexing by .N is not reliable.
  list(From = tmp[-length(tmp)], To = tmp[-1L]) }, by = Session]
# Session From To
#1: 1 A B
#2: 2 C NA
#3: 3 D A
#4: 3 A B
#5: 4 C F
#6: 4 F G
#7: 4 G H
#8: 4 H J
#9: 5 A B
#10: 5 B C
Inspired by @akrun, this is my personal stab at the problem. Granted, the results are tweaked to include the terminal from-to path for each pair:
library(dplyr)
library(tidyr)
df %>%
  # long format: one row per Session and link slot
  gather("Link_Num", "Value", -Session) %>%
  group_by(Session) %>%
  # `to` is the current link, `from` the previous link of the same session
  mutate(to = Value, from = lag(Value)) %>%
  # Link1 has no predecessor; an empty `from` means there was no link before
  filter(Link_Num != "Link1", from != "") %>%
  select(Session, from, to, Link_Num) %>%
  arrange(Session)
Which yields:
Session from to Link_Num
1 1 A B Link2
2 1 B Link3
3 2 C Link2
4 3 D A Link2
5 3 A B Link3
6 3 B Link4
7 4 C F Link2
8 4 F G Link3
9 4 G H Link4
10 4 H J Link5
11 5 A B Link2
12 5 B C Link3
13 5 C Link4
Another approach, using melt (from reshape2) together with dplyr's lead:
library(dplyr)
library(reshape2)  # provides melt(); dplyr itself has no melt()
# An empty trailing column separates consecutive sessions once the data is
# stacked, so lead() never pairs the last link of one session with the first
# link of the next one.
df$spacer <- ""
df %>%
  melt(id.vars = "Session") %>%
  arrange(Session) %>%
  mutate(To = lead(value)) %>%
  # keep real from/to pairs, plus single-link sessions (Link1 followed by "")
  filter(To != "" & value != "" | To == "" & variable == "Link1") %>%
  mutate(To = ifelse(To == "", NA, To)) %>%
  select(-variable)
# Session value To
# 1 1 A B
# 2 2 C <NA>
# 3 3 D A
# 4 3 A B
# 5 4 C F
# 6 4 F G
# 7 4 G H
# 8 4 H J
# 9 5 A B
# 10 5 B C

Multiply a table(file1) with individual cells of a column(file2) using R

File 1:
Ele A B C D
Es 1 2 3 4
Ep 2 4 3 4
Ek 1 9 3 8
File 2:
A 1
B 2
C 3
D 5
What I need is for each element under column A (file 1) to be multiplied by the value assigned to A in file 2 (and so on for the other columns). I know matrix multiplication in R, but I suppose this is not a case of matrix multiplication. Help would be greatly appreciated. Thanks
You could try
# Columns of df1 to scale, in the order listed in df2.
indx <- df2$Col1
# Repeat each multiplier once per row so it lines up column-wise with df1[indx].
df1[indx] * rep(df2$Col2, each = nrow(df1))
# A B C D
#1 1 4 9 20
#2 2 8 9 20
#3 1 18 9 40
Or you could use sweep
# Multiply every column (MARGIN = 2) of df1[indx] by its entry in df2$Col2.
sweep(df1[indx], MARGIN = 2, STATS = df2$Col2, FUN = '*')
# A B C D
#1 1 4 9 20
#2 2 8 9 20
#3 1 18 9 40
data
# File 1: a label column `Ele` plus the integer columns A-D to be scaled.
df1 <- structure(list(Ele = c("Es", "Ep", "Ek"), A = c(1L, 2L, 1L),
B = c(2L, 4L, 9L), C = c(3L, 3L, 3L), D = c(4L, 4L, 8L)),
.Names = c("Ele", "A", "B", "C", "D"), class = "data.frame",
row.names = c(NA, -3L))
# File 2: Col1 holds the name of a column of df1, Col2 its multiplier.
df2 <- structure(list(Col1 = c("A", "B", "C", "D"), Col2 = c(1L, 2L,
3L, 5L)), .Names = c("Col1", "Col2"), class = "data.frame",
row.names = c(NA, -4L))

Resources