print group in R by condition

Say I have the data
data <- structure(list(x1 = structure(c(1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L), .Label = c("q",
"r", "w"), class = "factor"), x2 = structure(c(2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("e", "w"), class = "factor"), x3 = structure(c(1L,
1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L,
2L, 2L, 2L), .Label = c("e", "q", "r"), class = "factor"), var = c(34L,
35L, 34L, 34L, 34L, 34L, 35L, 34L, 34L, 34L, 34L, 35L, 34L, 34L,
34L, 34L, 34L, 34L, 34L, 34L), act = c(1L, 1L, 1L, 1L, 1L, 0L,
0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L)), .Names = c("x1",
"x2", "x3", "var", "act"), class = "data.frame", row.names = c(NA,
-20L))
By columns x1, x2, x3 we have three groups:
q w e
w e r
r e q
I need to print only those groups whose act column contains only the value 1. In this case, the w e r group has both 0 and 1 in the act column, while the q w e and r e q groups have only 1, so those are the groups I need to print.
How can I do this?
Expected output
x1 x2 x3
q w e
r e q

library(dplyr)
data %>%
  distinct(x1, x2, x3, .keep_all = TRUE) %>%
  filter(act == 1) %>%
  select(-var, -act)
  x1 x2 x3
1  q  w  e
2  r  e  q
Note that this relies on distinct() keeping each group's first row, so it only drops a mixed group when that group's first row happens to have act == 0; the group-wise all() approach below is more robust to row order.
data %>%
  distinct(x1, x2, x3, .keep_all = TRUE) %>%
  filter(act == 1) %>%
  select(-var, -act) %>%
  filter(x1 == 'r', x2 == 'e', x3 == 'q')
  x1 x2 x3
1  r  e  q
# OR
data %>% filter(x1 == 'r', x2 == 'e', x3 == 'q')

If I understand correctly, the OP has requested to extract/print only those groups which contain only the value 1 in the act column.
This can be achieved using the all() function.
data.table
library(data.table)
setDT(data)[, which(all(act == 1L)), by = .(x1, x2, x3)][, -"V1"]
x1 x2 x3
1: q w e
2: r e q
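An equivalent form that may read more clearly keeps the qualifying groups with if () in j and then takes the unique key rows (a sketch of the same idea, not a separate method):
# keep whole groups where all act == 1, then reduce to the key columns
unique(setDT(data)[, if (all(act == 1L)) .SD, by = .(x1, x2, x3)][, .(x1, x2, x3)])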
dplyr
library(dplyr)
data %>%
  group_by(x1, x2, x3) %>%
  filter(all(act == 1L)) %>%
  distinct(x1, x2, x3)
# A tibble: 2 x 3
# Groups:   x1, x2, x3 [2]
  x1    x2    x3
  <fct> <fct> <fct>
1 q     w     e
2 r     e     q
Data
In addition to the dataset data provided by the OP, I have tested my answer with a second dataset which contains an additional group and in which the rows of the w e r group are ordered differently.
data2 <- data.table::fread(
" x1 x2 x3 var act
a b c 33 0
a b c 33 0
q w e 34 1
q w e 35 1
w e r 34 1
w e r 35 1
w e r 34 0
w e r 35 0
r e q 34 1
r e q 34 1
", data.table = FALSE)

Related

Get rows from a column per group based on a condition

I have a data.frame as shown below:
The basic requirement is to find the average of n values of Value after a certain date, per group.
For example, the user provides:
Certain Date = Failure Date
n = 4
Hence, for A, the average would be (60+70+80+100)/4, ignoring NAs,
and for B the average would be (80+90+100)/3. Note that for B, n = 4 is not reached, as there are only 3 values after the row satisfying FailureDate == ValueDate.
Here is the dput:
df <- structure(list(Name = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("A",
"B"), class = "factor"), FailureDate = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("1/5/2020", "1/7/2020"), class = "factor"), ValueDate = structure(c(1L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 2L, 1L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 2L), .Label = c("1/1/2020", "1/10/2020", "1/2/2020",
"1/3/2020", "1/4/2020", "1/5/2020", "1/6/2020", "1/7/2020", "1/8/2020",
"1/9/2020"), class = "factor"), Value = c(10L, 20L, 30L, 40L,
NA, 60L, 70L, 80L, NA, 100L, 10L, 20L, 30L, 40L, 50L, 60L, 70L,
80L, 90L, 100L)), class = "data.frame", row.names = c(NA, -20L
))
We could create an index with cumsum after grouping by 'Name', extract the 'Value' elements, and get the mean:
library(dplyr)
n <- 4
df %>%
  type.convert(as.is = TRUE) %>%
  group_by(Name) %>%
  summarise(Ave = mean(head(na.omit(Value[lag(cumsum(FailureDate == ValueDate),
                                             default = 0) > 0]), n), na.rm = TRUE))
# A tibble: 2 x 2
# Name Ave
# <chr> <dbl>
#1 A 77.5
#2 B 90
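To see what that index does, here is a small sketch of the intermediate flag: cumsum(FailureDate == ValueDate) turns positive at the failure-date row, and lag(..., default = 0) > 0 is TRUE only for the rows strictly after it.
df %>%
  type.convert(as.is = TRUE) %>%
  group_by(Name) %>%
  mutate(after_failure = lag(cumsum(FailureDate == ValueDate), default = 0) > 0) %>%
  filter(after_failure)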
You can convert the factor dates to Date objects and then compute the average of the n values after FailureDate per group. Note that the n values should exclude NA, so tidyr::drop_na() is used here.
library(dplyr)
df %>%
  mutate(across(contains("Date"), as.Date, "%m/%d/%Y")) %>%
  tidyr::drop_na(Value) %>%
  group_by(Name) %>%
  summarise(mean = mean(Value[ValueDate > FailureDate][1:4], na.rm = TRUE))
# # A tibble: 2 x 2
# Name mean
# <fct> <dbl>
# 1 A 77.5
# 2 B 90
You can try this:
library(dplyr)
n <- 4
df %>%
  mutate(condition = as.character(FailureDate) == as.character(ValueDate)) %>%
  group_by(Name) %>%
  mutate(condition = cumsum(condition)) %>%
  filter(condition == 1) %>%
  slice(-1) %>%
  filter(!is.na(Value)) %>%
  slice(1:n) %>%
  summarise(mean_col = mean(Value))
# A tibble: 2 x 2
  Name  mean_col
  <fct>    <dbl>
1 A         77.5
2 B         90

Compute proportion of outcome from repeated measures design

I have a table in the following format:
CowId Result IMI
1 S. aureus 1
1 No growth 0
2 No growth 0
2 No growth 0
3 E. coli 1
3 No growth 0
3 E. coli 0
4 Bacillus sp. 1
4 Contaminated 0
From this table, I would like to compute the proportion of CowIds that are negative for an IMI (0 = negative; 1 = positive) at all sampling time points.
In this example, 25% of cows [CowId = 2] tested negative for an IMI at all sampling time points.
To compute this proportion, my initial approach was to group each CowId, then compute the difference between the number of negative IMIs and the total number of IMI tests, where a resulting value of 0 would indicate that the cow was negative for an IMI at all time points.
As of now, my code computes this for each individual CowId. How can I augment this to compute the proportion described above?
fp %>%
  filter(Result != "Contaminated") %>%
  group_by(CowId) %>%
  summarise(negative = (sum(IMI == 0) - length(IMI)))
We can count how many CowIds have tested negative at all time points and calculate their ratio.
library(dplyr)
fp %>%
  filter(Result != "Contaminated") %>%
  group_by(CowId) %>%
  summarise(negative = all(IMI == 0)) %>%
  summarise(total_percent = mean(negative) * 100)
# total_percent
# <dbl>
#1 25
In base R, we can use aggregate
temp <- aggregate(IMI~CowId, subset(fp, Result != "Contaminated"),
function(x) all(x == 0))
mean(temp$IMI) * 100
data
fp <- structure(list(CowId = c(1L, 1L, 2L, 2L, 3L, 3L, 3L, 4L, 4L),
Result = structure(c(5L, 4L, 4L, 4L, 3L, 4L, 3L, 1L, 2L), .Label =
c("Bacillus_sp.","Contaminated", "E.coli", "No_growth", "S.aureus"),
class = "factor"),IMI = c(1L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L)),
class = "data.frame", row.names = c(NA, -9L))
With data.table
library(data.table)
setDT(fp)[Result != "Contaminated", .(negative = all(IMI == 0)),
  .(CowId)][, .(total_percent = mean(negative) * 100)]
# total_percent
#1: 25

Removing factor levels from variable X based on values in Y

I have a larger data frame with many factor levels. I would like to remove those levels for which all corresponding Y values are zero.
An example data set:
df <- structure(list(X = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L), .Label = c("A",
"B", "C", "D", "E"), class = "factor"), Y = c(1L, 2L, 0L, 2L,
0L, 0L, 0L, 0L, 2L, 5L, 1L, 1L, 0L, 0L, 1L, 8L, 0L, 0L, 0L, 0L
)), .Names = c("X", "Y"), class = "data.frame", row.names = c(NA,
-20L))
For this example, I would like to have the rows containing B and E removed.
We can group by 'X' and filter for rows where any value in 'Y' is not equal to 0:
library(dplyr)
df %>%
  group_by(X) %>%
  filter(any(Y != 0))
Or use all() negated with !:
df %>%
  group_by(X) %>%
  filter(!all(Y == 0))
You can do it in base R:
df[df$X %in% df$X[df$Y != 0], ]
X Y
1 A 1
2 A 2
3 A 0
4 A 2
9 C 2
10 C 5
11 C 1
12 C 1
13 D 0
14 D 0
15 D 1
16 D 8
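Note that filtering only removes the rows; B and E remain as unused levels of X afterwards. If the factor levels themselves should be dropped as well, droplevels() can be applied, e.g. (a small sketch):
df2 <- df[df$X %in% df$X[df$Y != 0], ]
df2$X <- droplevels(df2$X)
levels(df2$X)
# [1] "A" "C" "D"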

Time varying network in r

I have data on every interaction that could and did happen at a university club weekly social hour
A sample of my data is as follows
structure(list(from = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("A",
"B", "C"), class = "factor"), to = structure(c(2L, 3L, 2L, 3L,
2L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("A",
"B", "C"), class = "factor"), timestalked = c(0L, 1L, 0L, 4L,
1L, 2L, 0L, 1L, 0L, 2L, 1L, 0L, 1L, 2L, 1L, 0L, 0L, 0L), week = structure(c(1L,
1L, 3L, 3L, 2L, 2L, 1L, 1L, 3L, 3L, 2L, 2L, 1L, 1L, 3L, 3L, 2L,
2L), .Label = c("1/1/2010", "1/15/2010", "1/8/2010"), class = "factor")), .Names = c("from",
"to", "timestalked", "week"), class = "data.frame", row.names = c(NA,
-18L))
I am trying to calculate network statistics such as centrality for A, B, C for each individual week, the last two weeks, and year to date. The only way I have gotten this to work is by manually breaking up the file into the time units I want to analyze, but there must be a less laborious way, I hope.
When timestalked is 0, this should be treated as no edge.
The output would produce a .csv with the following:
actor cent_week1 cent_week2 cent_week3 cent_last2weeks cent_yeartodate
A
B
C
where cent_week1 is the centrality for 1/1/2010; cent_last2weeks considers only 1/8/2010 and 1/15/2010; and cent_yeartodate considers all of the data at once. This will be applied to a MUCH larger dataset with millions of observations.
You can do this by setting up your windows in another table, then doing by-group operations on each of the windows:
Data Preparation:
# Load Data
DT <- structure(list(from = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("A",
"B", "C"), class = "factor"), to = structure(c(2L, 3L, 2L, 3L,
2L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("A",
"B", "C"), class = "factor"), timestalked = c(0L, 1L, 0L, 4L,
1L, 2L, 0L, 1L, 0L, 2L, 1L, 0L, 1L, 2L, 1L, 0L, 0L, 0L), week = structure(c(1L,
1L, 3L, 3L, 2L, 2L, 1L, 1L, 3L, 3L, 2L, 2L, 1L, 1L, 3L, 3L, 2L,
2L), .Label = c("1/1/2010", "1/15/2010", "1/8/2010"), class = "factor")), .Names = c("from",
"to", "timestalked", "week"), class = "data.frame", row.names = c(NA,
-18L))
# Code
library(igraph)
library(data.table)
setDT(DT)
# setup events
DT <- DT[timestalked > 0]
DT[, week := as.Date(week, format = "%m/%d/%Y")]
# setup windows, edit as needed
date_ranges <- data.table(
  label     = c("cent_week_1", "cent_week_2", "cent_last2weeks", "cent_yeartodate"),
  week_from = as.Date(c("2010-01-01", "2010-01-08", "2010-01-08", "2010-01-01")),
  week_to   = as.Date(c("2010-01-01", "2010-01-08", "2010-01-15", "2010-01-15"))
)
# find all events within windows
DT[, JA := 1]
date_ranges[, JA := 1]
graph_base <- merge(DT, date_ranges, by = "JA", allow.cartesian = TRUE)[week >= week_from & week <= week_to]
Now here is the by-group code; the second line is a bit gross, and I am open to ideas about how to avoid the double call.
graph_base <- graph_base[, .(graphs = list(graph_from_data_frame(.SD))), by = label, .SDcols = c("from", "to", "timestalked")] # create graphs
graph_base <- graph_base[, .(vertex = names(eigen_centrality(graphs[[1]])$vector), ec = eigen_centrality(graphs[[1]])$vector), by = label] # calculate centrality
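One way to avoid the double eigen_centrality() call is to do both steps at once inside a braced j expression (a sketch replacing the two lines above):
graph_base <- graph_base[, {
  ec <- eigen_centrality(graph_from_data_frame(.SD))$vector
  .(vertex = names(ec), ec = ec)
}, by = label, .SDcols = c("from", "to", "timestalked")]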
dcast for final formatting:
dcast(graph_base, vertex ~ label, value.var = "ec")
vertex cent_last2weeks cent_week_1 cent_week_2 cent_yeartodate
1: A 1.0000000 0.7071068 0.8944272 0.9397362
2: B 0.7052723 0.7071068 0.4472136 0.7134685
3: C 0.9008487 1.0000000 1.0000000 1.0000000
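Since the OP ultimately wants a .csv, the reshaped table can be written out directly (a one-line sketch; the file name is illustrative):
fwrite(dcast(graph_base, vertex ~ label, value.var = "ec"), "centralities.csv")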
Can't comment, so I'm writing an "answer". If you want to perform some mathematical operation on timestalked and get values by from (I didn't find any variable called actor in your example), here's a data.table approach that can be helpful:
dat <- as.data.table(dat) # or add 'data.table' to the class parameter
dat$week <- as.Date(dat$week, format = "%m/%d/%Y")
dat[, .(cent = mean(timestalked)), by = list(from, weeknum = week(week))]
This gives the following output:
from weeknum cent
1: A 1 0.5
2: A 2 2.0
3: A 3 1.5
4: B 1 0.5
5: B 2 1.0
6: B 3 0.5
7: C 1 1.5
8: C 2 0.5
9: C 3 0.0
Assign this to new_dat. You can then subset by week simply with new_dat[weeknum %in% 2:3], or whatever other variation you want, or aggregate over the year; you can also sort/order as desired.
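For example (a sketch; the weeknum values are taken from the output above):
new_dat <- dat[, .(cent = mean(timestalked)), by = list(from, weeknum = week(week))]
new_dat[weeknum %in% 2:3, .(cent = mean(cent)), by = from]  # last two weeks (average of weekly means)
new_dat[, .(cent = mean(cent)), by = from]                  # year to date (average of weekly means)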
Hope this helps!
How about:
library(dplyr)
centralities <- tmp %>%
  group_by(week) %>%
  filter(timestalked > 0) %>%
  do(
    week_graph = igraph::graph_from_edgelist(as.matrix(cbind(.$from, .$to)))
  ) %>%
  do(
    ecs = igraph::eigen_centrality(.$week_graph)$vector
  ) %>%
  summarise(ecs_A = ecs[[1]], ecs_B = ecs[[2]], ecs_C = ecs[[3]])
You can use summarise_all if you have a lot of actors. Putting it in long format could look like the sketch below.
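A hedged sketch of the long-format step with tidyr (column names assumed from the summarise above):
library(tidyr)
centralities %>%
  pivot_longer(starts_with("ecs_"), names_prefix = "ecs_",
               names_to = "actor", values_to = "ec")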
This analysis follows the general split-apply-combine approach, where the data are split by week, graph functions are applied, and then the results are combined. There are several tools for this, but below I use base R and data.table.
Base R
First set the date class on your data, so that the term "last two weeks" has meaning.
# Set date class and order
d$week <- as.Date(d$week, format="%m/%d/%Y")
d <- d[order(d$week), ]
d <- d[d$timestalked > 0, ] # remove zero edges // not needed if using weights
Then split the data and apply the graph functions:
# split data and form a graph for each week
g1 <- lapply(split(seq(nrow(d)), d$week), function(i)
  graph_from_data_frame(d[i, ]))
# you can then run graph functions to extract specific measures
(grps <- sapply(g1, function(x) eigen_centrality(x,
weights = E(x)$timestalked)$vector))
# 2010-01-01 2010-01-08 2010-01-15
# A 0.5547002 0.9284767 1.0000000
# B 0.8320503 0.3713907 0.7071068
# C 1.0000000 1.0000000 0.7071068
# Aside: If you only have one function to run on the graphs,
# you could do this in one step
#
# sapply(split(seq(nrow(d)), d$week), function(i) {
# x = graph_from_data_frame(d[i,])
# eigen_centrality(x, weights = E(x)$timestalked)$vector
# })
You then need to add in the analysis on all the data; as you only have to build two further graphs, this is not the time-consuming part.
fun1 <- function(i, name) {
  x = graph_from_data_frame(i)
  d = data.frame(eigen_centrality(x, weights = E(x)$timestalked)$vector)
  setNames(d, name)
}
a = fun1(d, "alldata")
lt = fun1(d[d$week %in% tail(unique(d$week), 2), ], "lasttwo")
# Combine: could use `cbind` in this example, but perhaps `merge` is
# safer if there are different levels between dates
data.frame(grps, lt, a) # or
Reduce(merge, lapply(list(grps, a, lt), function(x) data.frame(x, nms = row.names(x))))
# nms X2010.01.01 X2010.01.08 X2010.01.15 alldata lasttwo
# 1 A 0.5547002 0.9284767 1.0000000 0.909899 1.0
# 2 B 0.8320503 0.3713907 0.7071068 0.607475 0.5
# 3 C 1.0000000 1.0000000 0.7071068 1.000000 1.0
data.table
It is likely that the time-consuming step will be explicitly split-applying the function over the data. data.table should offer some benefit here, especially when the data becomes large, and/or there are more groups.
# function to apply to graph
fun <- function(d) {
  x = graph_from_data_frame(d)
  e = eigen_centrality(x, weights = E(x)$timestalked)$vector
  list(e, names(e))
}
library(data.table)
dcast(
  setDT(d)[, fun(.SD), by = week], # apply function - returns data in long format
  V2 ~ week, value.var = "V1")     # convert to wide format
# V2 2010-01-01 2010-01-08 2010-01-15
# 1: A 0.5547002 0.9284767 1.0000000
# 2: B 0.8320503 0.3713907 0.7071068
# 3: C 1.0000000 1.0000000 0.7071068
Then just run the function over the full data / last two weeks as before.
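For instance (a sketch, reusing fun() from above on the full data and on the last two weeks):
rbindlist(list(
  yeartodate = d[, fun(.SD)],
  lasttwo    = d[week %in% tail(sort(unique(week)), 2), fun(.SD)]
), idcol = "window")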
There are differences between the answers, which comes down to the use of the weights argument when calculating the centralities here, whereas the other answers don't use the weights.
Data
d <- structure(list(from = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("A",
"B", "C"), class = "factor"), to = structure(c(2L, 3L, 2L, 3L,
2L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("A",
"B", "C"), class = "factor"), timestalked = c(0L, 1L, 0L, 4L,
1L, 2L, 0L, 1L, 0L, 2L, 1L, 0L, 1L, 2L, 1L, 0L, 0L, 0L), week = structure(c(1L,
1L, 3L, 3L, 2L, 2L, 1L, 1L, 3L, 3L, 2L, 2L, 1L, 1L, 3L, 3L, 2L,
2L), .Label = c("1/1/2010", "1/15/2010", "1/8/2010"), class = "factor")), .Names = c("from",
"to", "timestalked", "week"), class = "data.frame", row.names = c(NA,
-18L))

Find sum of one column based on consecutive values of another column in data.table

I have a data.table like the following:
dput(DT)
structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L),
  Job = structure(c(6L, 7L, 8L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L),
    .Label = c("f1", "f2", "f3", "f4", "f5", "h1", "h2", "h3"), class = "factor"),
  Duration = c(2L, 3L, 4L, 4L, 3L, 2L, 1L, 0L, 2L, 3L, 4L, 5L, 4L, 0L),
  Outsourced = structure(c(1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L),
    .Label = c("N", "Y"), class = "factor")),
  .Names = c("ID", "Job", "Duration", "Outsourced"), row.names = c(NA, -14L),
  class = c("data.table", "data.frame"))
which gives
ID Job Duration Outsourced
1: 1 h1 2 N
2: 1 h2 3 Y
3: 1 h3 4 Y
4: 1 f1 4 Y
5: 1 f2 3 N
6: 1 f3 2 N
7: 1 f4 1 N
8: 1 f5 0 N
9: 2 h1 2 N
10: 2 h2 3 Y
11: 2 f1 4 Y
12: 2 f2 5 N
13: 2 f3 4 N
14: 2 f4 0 N
I want to have the sum of Duration for all jobs that have consecutive "Y" in the Outsourced column. Moreover, if activities belong to different IDs, they shouldn't be counted as consecutive. One ID may have more than one set of consecutive "Y" jobs.
So for this example, the correct answer would be something like
ID V1
1: 1 11
2: 2 7
Currently, I use rle to find the run lengths of "Y" in the Outsourced column, and then I try to do the rest with ifs, but I think this can be done more elegantly...
Thank you
Following docendo discimus's suggestion from the comments above, I managed to get what I wanted by adding a unique statement:
DT[, NewCol := sum(Duration), by = list(ID, rleid(Outsourced))][Outsourced == "N", NewCol := NA]
DT[!is.na(NewCol), unique(NewCol), ID]
EDIT: To cover cases that include many sets of Outsourced activities with the same duration, the second statement should be changed to:
DT[!is.na(NewCol), sum(rle(NewCol)$values), ID]
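The whole thing can also be collapsed into one chain (a sketch): per-run sums over the "Y" runs, then totals per ID.
runs <- DT[, .(run_sum = sum(Duration)), by = .(ID, run = rleid(Outsourced), Outsourced)]
runs[Outsourced == "Y", .(V1 = sum(run_sum)), by = ID]
#    ID V1
# 1:  1 11
# 2:  2  7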
