How can I subtract a specific region from each block in R?

I have a data frame with several columns:
df <- structure(list(X1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), X2 = structure(c(1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L,
18L, 19L, 20L, 21L, 22L, 23L, 24L, 7L, 8L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L,
19L, 20L, 21L, 22L, 23L, 24L), .Label = c("B02", "B03", "B04",
"B05", "B06", "B07", "C02", "C03", "C04", "C05", "C06", "C07",
"D02", "D03", "D04", "D05", "D06", "D07", "G02", "G03", "G04",
"G05", "G06", "G07"), class = "factor"), X3 = c(0.005648642,
0.005876389, 0.00592532, 0.006244456, 0.005987075, 0.006075874,
0.006198667, 0.006003758, 0.006041885, 0.006186987, 0.006041323,
0.006071594, 0.005902391, 0.005976096, 0.00593805, 0.005866524,
0.0059831, 0.005902586, 0.005914309, 0.005887304, 0.006054509,
0.005931266, 0.005936195, 0.005895191, 0.005840959, 0.005849247,
0.005808851, 0.005833586, 0.005825153, 0.00584873, 0.005983976,
0.00598669, 0.006011548, 0.005997747, 0.005851022, 0.005919044,
0.005854566, 0.0058226, 0.00578052, 0.005784874, 0.005933198,
0.005996407, 0.005898848, 0.00595775, 0.005918857, 0.005882898,
0.005877808, 0.005803604, 0.006235161, 0.005808725), X4 = c(0.024054157,
0.025850824, 0.023122678, 0.042337945, 0.030468744, 0.026481616,
0.017430149, 0.024019931, 0.025572552, 0.024869532, 0.024148692,
0.025228634, 0.030078166, 0.025860944, 0.023530696, 0.029244585,
0.026599876, 0.023578245, 0.024014744, 0.023963795, 0.025466639,
0.02969377, 0.02307532, 0.022739164, 0.02860112, 0.022800416,
0.022287748, 0.022491258, 0.023340693, 0.024576665, 0.024378624,
0.030037462, 0.024904579, 0.025901291, 0.02912765, 0.024926085,
0.024044815, 0.023799791, 0.023147189, 0.021253484, 0.023979501,
0.029659496, 0.029393487, 0.02470603, 0.024562731, 0.023819856,
0.025065129, 0.023228642, 0.023919905, 0.024781896), X5 = c(0.00535592,
0.00555428, 0.00555428, 0.00572213, 0.00573739, 0.00575265, 0.00576791,
0.00572213, 0.00573739, 0.00572213, 0.00575265, 0.00576791, 0.0056611,
0.0056611, 0.00567636, 0.00563058, 0.0056611, 0.00564584, 0.00563058,
0.00561532, 0.00575265, 0.00569162, 0.00567636, 0.00564584, 0.00561532,
0.00560006, 0.00556954, 0.0055848, 0.00555428, 0.00556954, 0.00569162,
0.00573739, 0.00572213, 0.00567636, 0.00561532, 0.00561532, 0.0055848,
0.00553903, 0.00552377, 0.00549325, 0.0056611, 0.00572213, 0.0056611,
0.0056611, 0.00563058, 0.00561532, 0.0055848, 0.00553903, 0.00553903,
0.00550851)), .Names = c("X1", "X2", "X3", "X4", "X5"), class = "data.frame", row.names = c(NA,
-50L))
Basically, I am trying to correct each value based on the average of several reference values. I want to take the average of the G02, G03, G04 and G05 rows and subtract it from each value in that column, separately for each value of X1 (1, 2, or whatever values it has).
For example, let's look at X3 where X1 is 1. Take the average of
0.005914309
0.005887304
0.006054509
0.005931266
which is 0.005946847. Then I subtract it from the first value having X1 equal to 1: 0.005648642 - 0.005946847 = -0.000298205.
Then I subtract it from each of the values in that column where X1 is 1.
Likewise, take the average of G02, G03, G04, G05 where X1 is 2 and subtract it from each value of that column where X1 is 2, and so on.
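A quick check of that arithmetic (the four values are the X3 entries of the G02:G05 rows where X1 is 1):
g <- c(0.005914309, 0.005887304, 0.006054509, 0.005931266)
mean(g)               # 0.005946847
0.005648642 - mean(g) # -0.000298205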

Here is an option using a data.table join:
library(data.table)
nm1 <- paste0("X", 3:5)  # value columns to correct
nm2 <- paste0("G0", 2:5) # reference rows G02:G05
dfN <- copy(df)
setDT(dfN)[dfN[X2 %in% nm2, lapply(.SD, mean), by = .(X1), .SDcols = nm1],
           (nm1) := Map(`-`, mget(nm1), mget(paste0("i.", nm1))),
           on = .(X1)]
head(dfN, 10)
# X1 X2 X3 X4 X5
# 1: 1 B02 -0.000298205 -0.001730580 -0.0003166225
# 2: 1 B03 -0.000070458 0.000066087 -0.0001182625
# 3: 1 B04 -0.000021527 -0.002662059 -0.0001182625
# 4: 1 B05 0.000297609 0.016553208 0.0000495875
# 5: 1 B06 0.000040228 0.004684007 0.0000648475
# 6: 1 B07 0.000129027 0.000696879 0.0000801075
# 7: 1 C02 0.000251820 -0.008354588 0.0000953675
# 8: 1 C03 0.000056911 -0.001764806 0.0000495875
# 9: 1 C04 0.000095038 -0.000212185 0.0000648475
#10: 1 C05 0.000240140 -0.000915205 0.0000495875
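For comparison, a base R sketch of the same correction, assuming df as above:
nm1 <- c("X3", "X4", "X5")
g <- df[df$X2 %in% paste0("G0", 2:5), ]                    # reference rows
gm <- aggregate(g[nm1], by = list(X1 = g$X1), FUN = mean)  # per-X1 means
corrected <- df
corrected[nm1] <- corrected[nm1] - gm[match(corrected$X1, gm$X1), nm1]
head(corrected, 3)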

A solution using dplyr and tidyr. df2 holds the per-X1 means of the G02:G05 rows; df3 is the final output.
library(dplyr)
library(tidyr)
df2 <- df %>%
  filter(X2 %in% paste0("G0", 2:5)) %>%
  group_by(X1) %>%
  summarise_at(vars(-X2), mean) %>%
  gather(Col, Value, -X1)
df3 <- df %>%
  group_by(X1) %>%
  mutate(ID = 1:n()) %>%
  gather(Col, Value, -ID, -X1, -X2) %>%
  left_join(df2, by = c("X1", "Col")) %>%
  mutate(Value = Value.x - Value.y) %>%
  select(ID, X1, X2, Col, Value) %>%
  spread(Col, Value) %>%
  arrange(X1, ID) %>%
  select(-ID) %>%
  ungroup()
df3
# A tibble: 50 x 5
X1 X2 X3 X4 X5
<int> <fctr> <dbl> <dbl> <dbl>
1 1 B02 -0.000298205 -0.001730580 -0.0003166225
2 1 B03 -0.000070458 0.000066087 -0.0001182625
3 1 B04 -0.000021527 -0.002662059 -0.0001182625
4 1 B05 0.000297609 0.016553208 0.0000495875
5 1 B06 0.000040228 0.004684007 0.0000648475
6 1 B07 0.000129027 0.000696879 0.0000801075
7 1 C02 0.000251820 -0.008354588 0.0000953675
8 1 C03 0.000056911 -0.001764806 0.0000495875
9 1 C04 0.000095038 -0.000212185 0.0000648475
10 1 C05 0.000240140 -0.000915205 0.0000495875
# ... with 40 more rows
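With dplyr 1.0+ the whole correction can also be written as a single grouped mutate, with no reshaping; a sketch, assuming the same df:
library(dplyr)
df %>%
  group_by(X1) %>%
  mutate(across(X3:X5, ~ .x - mean(.x[X2 %in% paste0("G0", 2:5)]))) %>%
  ungroup()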


How to calculate Cohen's D across 50 points in R

I have the following DF:
DF <- structure(list(AgeGroup = structure(c(1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("Young",
"Old"), class = "factor"), variable = structure(c(1L, 1L, 2L,
2L, 3L, 3L, 4L, 4L, 5L, 5L, 6L, 6L, 7L, 7L, 8L, 8L, 9L, 9L, 10L,
10L, 11L, 11L, 12L, 12L, 13L, 13L, 14L, 14L, 15L, 15L, 16L, 16L,
17L, 17L, 18L, 18L, 19L, 19L, 20L, 20L, 21L, 21L, 22L, 22L, 23L,
23L, 24L, 24L, 25L, 25L, 26L, 26L, 27L, 27L, 28L, 28L, 29L, 29L,
30L, 30L, 31L, 31L, 32L, 32L, 33L, 33L, 34L, 34L, 35L, 35L, 36L,
36L, 37L, 37L, 38L, 38L, 39L, 39L, 40L, 40L, 41L, 41L, 42L, 42L,
43L, 43L, 44L, 44L, 45L, 45L, 46L, 46L, 47L, 47L, 48L, 48L, 49L,
49L, 50L, 50L), .Label = c("Point.1", "Point.2", "Point.3", "Point.4",
"Point.5", "Point.6", "Point.7", "Point.8", "Point.9", "Point.10",
"Point.11", "Point.12", "Point.13", "Point.14", "Point.15", "Point.16",
"Point.17", "Point.18", "Point.19", "Point.20", "Point.21", "Point.22",
"Point.23", "Point.24", "Point.25", "Point.26", "Point.27", "Point.28",
"Point.29", "Point.30", "Point.31", "Point.32", "Point.33", "Point.34",
"Point.35", "Point.36", "Point.37", "Point.38", "Point.39", "Point.40",
"Point.41", "Point.42", "Point.43", "Point.44", "Point.45", "Point.46",
"Point.47", "Point.48", "Point.49", "Point.50"), class = "factor"),
value = c(0.714518666666667, 0.723876630952381, 0.728961368421053,
0.735228897233202, 0.701283807017544, 0.71396457312253, 0.663229964912281,
0.68923661660079, 0.613014666666667, 0.652671079051383, 0.547104,
0.602951166007905, 0.504106245614035, 0.558832648221344,
0.487034052631579, 0.515752438735178, 0.451825245614035,
0.476300007905138, 0.442370175438596, 0.441173656126482,
0.438668315789474, 0.435859173913043, 0.450059526315789,
0.434047494071146, 0.478947649122807, 0.450561841897233,
0.481134438596491, 0.461228027667984, 0.446763543859649,
0.451031316205534, 0.396206754385965, 0.406836889328063,
0.357049368421053, 0.368716249011858, 0.343943631578947,
0.368048932806324, 0.376060403508772, 0.398834193675889,
0.413613877192982, 0.434683889328063, 0.434964894736842,
0.448746023715415, 0.451208631578947, 0.450663276679842,
0.470569192982456, 0.473143399209486, 0.515300736842105,
0.502499193675889, 0.543379719298246, 0.507495533596838,
0.550050701754386, 0.498506288537549, 0.541725807017544,
0.482379664031621, 0.517293315789474, 0.458068636363636,
0.485205245614035, 0.423109671936759, 0.438844403508772,
0.385925747035573, 0.39522349122807, 0.362403612648221, 0.374209192982456,
0.350889750988142, 0.354036315789474, 0.336213118577075,
0.340668122807018, 0.327800648221344, 0.326388666666667,
0.322577146245059, 0.328114842105263, 0.319440624505929,
0.342721666666667, 0.323974818181818, 0.357620473684211,
0.335501339920949, 0.372856842105263, 0.343831292490119,
0.377362315789474, 0.361571442687747, 0.393890736842105,
0.377489727272727, 0.419330684210526, 0.38274228458498, 0.419797666666667,
0.387899881422925, 0.423127684210526, 0.385955055335968,
0.42140750877193, 0.377730351778656, 0.403711631578947, 0.366319122529644,
0.390753140350877, 0.355189754940711, 0.373226596491228,
0.347452173913044, 0.348689877192982, 0.340376324110672,
0.329466947368421, 0.344867375494071)), row.names = c(NA,
-100L), class = c("tbl_df", "tbl", "data.frame"))
a subset of which looks like:
A tibble: 100 x 3
AgeGroup variable value
<fct> <fct> <dbl>
1 Young Point.1 0.715
2 Old Point.1 0.724
3 Young Point.2 0.729
4 Old Point.2 0.735
5 Young Point.3 0.701
6 Old Point.3 0.714
7 Young Point.4 0.663
8 Old Point.4 0.689
9 Young Point.5 0.613
10 Old Point.5 0.653
I have an output using:
Cho_D <- DF %>%
  rstatix::cohens_d(value ~ variable, var.equal = TRUE)
But this gives me a lot of unnecessary comparisons, like Point.1 vs Point.3, Point.1 vs Point.4, etc.
I would like to calculate Cohen's D only for each pair of successive points, e.g. Point.1:Point.2, Point.2:Point.3, etc. The end goal is to plot the D values on the Y-axis and Points 1 through 50 on the X-axis.
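One quick fix is to keep only the successive pairs from the output you already have; a sketch (the numeric-suffix parsing assumes the Point.<k> naming):
library(dplyr)
Cho_D %>%
  filter(as.numeric(sub("Point\\.", "", group2)) -
           as.numeric(sub("Point\\.", "", group1)) == 1)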
In base R you can accomplish the same using embed + apply:
L <- split(DF$value, DF$variable)  # one vector of values per point
mat <- embed(names(L), 2)[, 2:1]   # successive pairs, in chronological order
res <- apply(mat, 1, function(x) rstatix::cohens_d(stack(L[x]), values ~ ind))
do.call(rbind, res)
A tibble: 49 x 7
.y. group1 group2 effsize n1 n2 magnitude
<chr> <chr> <chr> <dbl> <int> <int> <ord>
1 values Point.1 Point.2 -2.29 2 2 large
2 values Point.2 Point.3 3.46 2 2 large
3 values Point.3 Point.4 2.17 2 2 large
4 values Point.4 Point.5 1.83 2 2 large
5 values Point.5 Point.6 1.69 2 2 large
6 values Point.6 Point.7 1.11 2 2 large
7 values Point.7 Point.8 0.973 2 2 large
8 values Point.8 Point.9 1.98 2 2 large
9 values Point.9 Point.10 1.82 2 2 large
10 values Point.10 Point.11 2.95 2 2 large
# ... with 39 more rows
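For reference, embed() is just building a sliding window over the level names; the [, 2:1] above flips each row into chronological order:
embed(paste0("Point.", 1:4), 2)
#      [,1]      [,2]
# [1,] "Point.2" "Point.1"
# [2,] "Point.3" "Point.2"
# [3,] "Point.4" "Point.3"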
If you can use effsize::cohen.d, this function will be faster than all the options given so far:
my_cohen <- function(data) {
  L <- split(data$value, data$variable)
  mat <- embed(names(L), 2)
  res <- apply(mat, 1, function(x)
    effsize::cohen.d(L[[x[2]]], L[[x[1]]])$estimate)
  data.frame(mat, res)
}
my_cohen(DF)
X1 X2 res
1 Point.2 Point.1 -2.29025540
2 Point.3 Point.2 3.45998958
3 Point.4 Point.3 2.16986489
4 Point.5 Point.4 1.82991671
5 Point.6 Point.5 1.68816593
6 Point.7 Point.6 1.11414226
We could nest the 'value' column, take the lead of the list column, and apply cohen.d by looping over the two list columns in parallel:
library(dplyr)
library(tidyr)
library(effsize)
library(purrr)
out <- DF %>%
  select(-AgeGroup) %>%
  nest(data = value) %>%
  mutate(across(everything(), lead, .names = "{.col}_lead")) %>%
  slice(-n()) %>%
  mutate(cohen_d = map2_dbl(data, data_lead,
                            ~ cohen.d(.x$value, .y$value)$estimate))
Output:
head(out, 2)
# A tibble: 2 x 5
variable data variable_lead data_lead cohen_d
<fct> <list> <fct> <list> <dbl>
1 Point.1 <tibble [2 × 1]> Point.2 <tibble [2 × 1]> -2.29
2 Point.2 <tibble [2 × 1]> Point.3 <tibble [2 × 1]> 3.46
Comparing with the OP's filtered output:
Cho_D %>%
  slice(seq(1, n(), by = 49)) %>%
  slice_head(n = 2)
# A tibble: 2 x 7
# .y. group1 group2 effsize n1 n2 magnitude
# <chr> <chr> <chr> <dbl> <int> <int> <ord>
#1 value Point.1 Point.2 -2.29 2 2 large
#2 value Point.2 Point.3 3.46 2 2 large
Benchmarks
The formula interface computes all choose(50, 2) = 1225 pairwise comparisons, while the nest/lead approach computes only the 49 successive ones; with the number of comparisons reduced, the timings are below.
system.time({
  Cho_D <- DF %>%
    rstatix::cohens_d(value ~ variable, var.equal = TRUE)
})
# user system elapsed
# 16.316 0.060 16.330
system.time({
  out <- DF %>%
    select(-AgeGroup) %>%
    nest(data = value) %>%
    mutate(across(everything(), lead, .names = "{.col}_lead")) %>%
    slice(-n()) %>%
    mutate(cohen_d = map2_dbl(data, data_lead,
                              ~ cohen.d(.x$value, .y$value)$estimate))
})
# user system elapsed
# 0.031 0.005 0.037

Get rows from a column per group based on a condition

I have a data.frame as shown below.
The basic requirement is to find the average of the first n values of Value after a certain date, per group.
For example, the user provides:
Certain Date = Failure Date
n = 4
Hence, for A the average would be (60 + 70 + 80 + 100) / 4, ignoring NAs,
and for B the average would be (80 + 90 + 100) / 3. Note that for B, n = 4 cannot be satisfied, as there are only 3 values after the row where FailureDate equals ValueDate.
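Checking the expected results by hand:
mean(c(60, 70, 80, 100)) # 77.5 for A
mean(c(80, 90, 100))     # 90 for B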
Here is the dput:
df1 <- structure(list(Name = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("A",
"B"), class = "factor"), FailureDate = structure(c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("1/5/2020", "1/7/2020"), class = "factor"), ValueDate = structure(c(1L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 2L, 1L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 2L), .Label = c("1/1/2020", "1/10/2020", "1/2/2020",
"1/3/2020", "1/4/2020", "1/5/2020", "1/6/2020", "1/7/2020", "1/8/2020",
"1/9/2020"), class = "factor"), Value = c(10L, 20L, 30L, 40L,
NA, 60L, 70L, 80L, NA, 100L, 10L, 20L, 30L, 40L, 50L, 60L, 70L,
80L, 90L, 100L)), class = "data.frame", row.names = c(NA, -20L
))
We could create an index with cumsum after grouping by 'Name', extract the 'Value' elements, and get the mean:
library(dplyr)
n <- 4
df1 %>%
  type.convert(as.is = TRUE) %>%
  group_by(Name) %>%
  summarise(Ave = mean(head(na.omit(
    Value[lag(cumsum(FailureDate == ValueDate), default = 0) > 0]), n),
    na.rm = TRUE))
# A tibble: 2 x 2
# Name Ave
# <chr> <dbl>
#1 A 77.5
#2 B 90
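To see what that index does, here it is on B's rows (FailureDate is 1/7/2020): cumsum flags the matching row and everything after it, and lag shifts the flag down so only the rows strictly after the match are kept:
fd <- "1/7/2020"
vd <- c("1/1/2020", "1/2/2020", "1/3/2020", "1/4/2020", "1/5/2020",
        "1/6/2020", "1/7/2020", "1/8/2020", "1/9/2020", "1/10/2020")
cumsum(fd == vd)                              # 0 0 0 0 0 0 1 1 1 1
dplyr::lag(cumsum(fd == vd), default = 0) > 0 # TRUE only for the last three rows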
You can convert the factor dates to Date objects and then compute the average of the first "n" values after FailureDate per group. Note that the "n" values should exclude NA, so tidyr::drop_na() is used here; also, Value[ValueDate > FailureDate][1:4] pads with NA when fewer than four values remain, and na.rm = TRUE drops them, which is how B ends up averaging only three values.
library(dplyr)
df1 %>%
  mutate(across(contains("Date"), as.Date, "%m/%d/%Y")) %>%
  tidyr::drop_na(Value) %>%
  group_by(Name) %>%
  summarise(mean = mean(Value[ValueDate > FailureDate][1:4], na.rm = TRUE))
# # A tibble: 2 x 2
# Name mean
# <fct> <dbl>
# 1 A 77.5
# 2 B 90
You can try this:
library(dplyr)
n <- 4
df1 %>%
  mutate(condition = as.character(FailureDate) == as.character(ValueDate)) %>%
  group_by(Name) %>%
  mutate(condition = cumsum(condition)) %>%
  filter(condition == 1) %>%
  slice(-1) %>%
  filter(!is.na(Value)) %>%
  slice(1:n) %>%
  summarise(mean_col = mean(Value))
Output:
# A tibble: 2 x 2
Name mean_col
<fct> <dbl>
1 A 77.5
2 B 90

Sum consecutive hours when condition is met

I have a dataframe that has a timestamp and a numeric variable; the data is recorded once every hour. Ultimately, I'd like to know the mean number of consecutive hours that the variable is at or below a certain value. For example, what is the average number of hours that var1 is at or below 4? There are missing timestamps in the dataframe, so if the time is not consecutive the running sum needs to restart.
In the example data frame the columns HoursBelow5 and RunningGroup were generated by hand; if I could create these columns programmatically, I could filter out the RunningGroups associated with var1 values greater than 4 and then use dplyr::slice to get the maximum HoursBelow5 per group. I could then find the mean of these values.
So, in this approach I would need to create the restarting cumulative sum HoursBelow5, which restarts when the condition var1 < 5 is not met, or when the timestamp is not a consecutive hour. I could then use ifelse statements to create the RunningGroup variable. Is this possible? I may be lacking the jargon to find the procedure. cumsum and lag seemed promising, but I have yet to construct a procedure that does the above.
Or, there may be a smarter way to do this using the timestamp.
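A minimal sketch of the restarting counter being described (ignoring timestamp gaps), using the first six var1 values from the data below:
x <- c(1, 3, 4, 5, 4, 3)
below <- x <= 4
ave(below, cumsum(!below), FUN = cumsum)
# 1 2 3 0 1 2 -- matching the first six values of HoursBelow5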
Edit: result incorporating code from the answer below:
df1 <- df %>%
  group_by(group = data.table::rleid(var1 > 4),
           group1 = cumsum(ts - lag(ts, default = first(ts)) > 3600)) %>%
  mutate(temp = row_number() * (var1 <= 4)) %>%
  ungroup() %>%
  filter(var1 <= 4) %>%
  select(ts, var1, temp)
df2 <- df1 %>%
  mutate(temp2 = ifelse(temp == 1, 1, 0),
         newgroup = cumsum(temp2))
df3 <- df2 %>%
  group_by(newgroup) %>%
  slice(which.max(temp))
mean(df3$temp)
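The df2/df3 steps can also be collapsed into a single chain; a sketch, assuming df1 from above:
library(dplyr)
df1 %>%
  group_by(newgroup = cumsum(temp == 1)) %>%
  summarise(hours = max(temp)) %>%
  summarise(mean_hours = mean(hours))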
# example dataframe with desired output columns to then get actual output
df <- structure(list(ts = structure(c(-2208967200, -2208963600, -2208960000,
-2208956400, -2208952800, -2208949200, -2208945600, -2208942000,
-2208938400, -2208934800, -2208931200, -2208927600, -2208924000,
-2208913200, -2208909600, -2208906000, -2208902400, -2208898800,
-2208895200, -2208891600, -2208888000, -2208884400, -2208880800,
-2208877200, -2208852000, -2208848400, -2208844800, -2208841200,
-2208837600, -2208834000, -2208830400, -2208826800, -2208823200,
-2208819600, -2208816000, -2208812400, -2208808800, -2208805200,
-2208801600), class = c("POSIXct", "POSIXt"), tzone = ""), var1 = c(1L,
3L, 4L, 5L, 4L, 3L, 5L, 6L, 7L, 8L, 3L, 2L, 2L, 2L, 3L, 3L, 2L,
2L, 1L, 1L, 1L, 1L, 4L, 4L, 3L, 9L, 3L, 3L, 3L, 2L, 2L, 3L, 4L,
5L, 3L, 2L, 1L, 2L, 3L), HoursBelow5 = c(1L, 2L, 3L, 0L, 1L,
2L, 0L, 0L, 0L, 0L, 1L, 2L, 3L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 1L, 0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 0L, 1L, 2L,
3L, 4L, 5L), RunningGroup = c(1L, 1L, 1L, 2L, 3L, 3L, 4L, 5L,
6L, 7L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 9L,
10L, 11L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 14L, 14L, 14L,
14L, 14L), NotContinuous = c("", "", "", "", "", "", "", "",
"", "", "", "", "", "NC", "", "", "", "", "", "", "", "", "",
"", "NC", "", "", "", "", "", "", "", "", "", "", "", "", "",
"")), row.names = c(NA, -39L), class = "data.frame")
One way, using dplyr and data.table::rleid, could be:
library(dplyr)
df %>%
  group_by(group = data.table::rleid(var1 > 4),
           group1 = cumsum(ts - lag(ts, default = first(ts)) > 3600)) %>%
  mutate(temp = row_number() * (var1 <= 4)) %>%
  ungroup() %>%
  select(ts, var1, HoursBelow5, temp)
# ts var1 HoursBelow5 temp
# <dttm> <int> <int> <int>
# 1 1900-01-01 12:46:46 1 1 1
# 2 1900-01-01 13:46:46 3 2 2
# 3 1900-01-01 14:46:46 4 3 3
# 4 1900-01-01 15:46:46 5 0 0
# 5 1900-01-01 16:46:46 4 1 1
# 6 1900-01-01 17:46:46 3 2 2
# 7 1900-01-01 18:46:46 5 0 0
# 8 1900-01-01 19:46:46 6 0 0
# 9 1900-01-01 20:46:46 7 0 0
#10 1900-01-01 21:46:46 8 0 0
# … with 29 more rows
The temp column is the one generated programmatically; HoursBelow5 is kept as-is for comparison purposes. If you also need RunningGroup, you could use group and group1 together, as sketched below.
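A sketch of that combination (the NewGroup name is illustrative; note that rleid treats each run of var1 > 4 as a single group, whereas the hand-built RunningGroup numbers those rows individually):
library(dplyr)
df %>%
  group_by(group = data.table::rleid(var1 > 4),
           group1 = cumsum(ts - lag(ts, default = first(ts)) > 3600)) %>%
  mutate(temp = row_number() * (var1 <= 4)) %>%
  ungroup() %>%
  mutate(NewGroup = data.table::rleid(group, group1)) %>%
  select(ts, var1, NewGroup, temp)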

Pick different values from a column and write values correspondingly in R

I have a dataframe that looks like this (reproducible data):
df <- structure(list(User = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Jibran", class = "factor"),
Event = structure(c(1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L,
1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("IN",
"OUT"), class = "factor"), Time = c("04/15/2015 00:31", "04/16/2015 20:10",
"04/21/2015 14:59", "04/22/2015 01:01", "04/22/2015 10:46",
"04/23/2015 00:58", "04/23/2015 14:50", "04/24/2015 01:37",
"04/25/2015 01:01", "04/27/2015 00:57", "04/17/2015 10:32",
"04/29/2015 15:03", "05/01/2015 00:44", "05/02/2015 01:19",
"05/02/2015 15:08", "05/03/2015 01:08", "05/03/2015 15:06",
"05/04/2015 01:01", "05/04/2015 15:11", "05/05/2015 01:08"
)), row.names = c(NA, -20L), class = "data.frame")
The values I'm looking for are the ones at which the event changes, e.g.:
User | Event | Time
Jibran IN 4/21/2015 14:59
Jibran OUT 4/22/2015 1:01
..
that is, rows that occur where the next event is different (keep values only when an IN event is immediately followed by an OUT).
The way I approached this problem is:
x <- read.csv("TimeLog2.csv", header = TRUE)
df <- data.frame(matrix(ncol = 3, nrow = 0))
names(df)[1] <- paste("UserName")
names(df)[2] <- paste("Login")
names(df)[3] <- paste("Logout")
for (i in 1:length(x$Event)) {
  if (x$Event[[i]] == 'IN' && x$Event[[i + 1]] == 'OUT') {
    df$Login[[i]] <- (x$Time[[i]])
    df$Logout[[i]] <- (x$Time[[i + 1]])
  }
}
Which returns:
Error in `$<-.data.frame`(`*tmp*`, "Login", value = c(NA, NA, 4L)) :
  replacement has 3 rows, data has 0
The desired output should look like the IN/OUT pairs above. One thing to make sure of is that a change in event should only be written to the result if it occurs on the same or the next day (as per the date), to get accurate Login/Logout values.
# df1 <- read.csv2("TimeLog2.csv", sep = ",")[, 1:3]
# (df1 holds the same User/Event/Time data as shown in the Data section below)
library(data.table)
library(magrittr) # for %>%
df1$Time2 <- df1$Time %>% as.Date(., format = "%m/%d/%Y", tz = 'GMT')
df1$grp <- shift(cumsum(df1$Event == "OUT"), 1, 0)
setDT(df1)[, dataDiff := c(.SD$Time2[-.N] - .SD$Time2[.N] > -2, FALSE), by = grp]
df1 <- df1[, .SD[as.logical(cumsum(.SD$dataDiff)), ], by = grp][
  , `:=`(dataDiff = NULL, Time2 = NULL)][]
dcast(df1, User + grp ~ Event)[, `:=`(grp = NULL)][]
Result:
# User IN OUT
#1: Jibran 4/21/2015 14:59 4/22/2015 1:01
#2: Jibran 4/22/2015 10:46 4/23/2015 0:58
#3: Jibran 4/23/2015 14:50 4/24/2015 1:37
#4: Jibran 5/2/2015 15:08 5/3/2015 1:08
#5: Jibran 5/3/2015 15:06 5/4/2015 1:01
#6: Jibran 5/4/2015 15:11 5/5/2015 1:08
Data
df <- structure(list(User = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "Jibran", class = "factor"),
Event = structure(c(1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L,
1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L), .Label = c("IN",
"OUT"), class = "factor"), Time = structure(c(9L, 10L, 12L,
13L, 14L, 15L, 16L, 17L, 18L, 19L, 11L, 20L, 1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L), .Label = c("05/01/2015 00:44", "05/02/2015 01:19",
"05/02/2015 15:08", "05/03/2015 01:08", "05/03/2015 15:06",
"05/04/2015 01:01", "05/04/2015 15:11", "05/05/2015 01:08",
"4/15/2015 0:31", "4/16/2015 20:10", "4/17/2015 10:32", "4/21/2015 14:59",
"4/22/2015 1:01", "4/22/2015 10:46", "4/23/2015 0:58", "4/23/2015 14:50",
"4/24/2015 1:37", "4/25/2015 1:01", "4/27/2015 0:57", "4/29/2015 15:03"
), class = "factor")), class = "data.frame", row.names = c(NA,
-20L))
Solution
library(dplyr)
library(tidyverse)
library(lubridate)  # for mdy_hm(); attached separately in case tidyverse < 2.0
library(data.table) # for rleid()
df %>%
  mutate(Time = mdy_hm(Time)) %>%                    # parse to date-times
  group_by(id = rleid(Event)) %>%                    # group runs of the same event
  filter((Event == "IN" & Time == max(Time)) |       # keep max time for IN
           (Event == "OUT" & Time == min(Time))) %>% # keep min time for OUT
  ungroup() %>%                                      # forget the grouping
  mutate(id = cumsum(Event == "IN")) %>%             # create a new grouping variable
  spread(Event, Time) %>%                            # reshape data
  filter(ceiling(difftime(OUT, IN, units = "days")) < 2) %>% # exclude 2+ day gaps
  select(-id)                                        # remove grouping variable
# # A tibble: 6 x 3
# User IN OUT
# <fct> <dttm> <dttm>
# 1 Jibran 2015-04-21 14:59:00 2015-04-22 01:01:00
# 2 Jibran 2015-04-22 10:46:00 2015-04-23 00:58:00
# 3 Jibran 2015-04-23 14:50:00 2015-04-24 01:37:00
# 4 Jibran 2015-05-02 15:08:00 2015-05-03 01:08:00
# 5 Jibran 2015-05-03 15:06:00 2015-05-04 01:01:00
# 6 Jibran 2015-05-04 15:11:00 2015-05-05 01:08:00
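To see how the day-gap filter behaves, a quick illustration with times from the data (the 4/29 IN has no OUT until 5/1, so that pair is excluded):
difftime(as.POSIXct("2015-04-22 01:01"), as.POSIXct("2015-04-21 14:59"), units = "days")
# ~0.42 days, ceiling() gives 1 -> pair kept
difftime(as.POSIXct("2015-05-01 00:44"), as.POSIXct("2015-04-29 15:03"), units = "days")
# ~1.40 days, ceiling() gives 2 -> pair dropped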

Print group in R by condition

Say I have the data:
data <- structure(list(x1 = structure(c(1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L), .Label = c("q",
"r", "w"), class = "factor"), x2 = structure(c(2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("e", "w"), class = "factor"), x3 = structure(c(1L,
1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L,
2L, 2L, 2L), .Label = c("e", "q", "r"), class = "factor"), var = c(34L,
35L, 34L, 34L, 34L, 34L, 35L, 34L, 34L, 34L, 34L, 35L, 34L, 34L,
34L, 34L, 34L, 34L, 34L, 34L), act = c(1L, 1L, 1L, 1L, 1L, 0L,
0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L)), .Names = c("x1",
"x2", "x3", "var", "act"), class = "data.frame", row.names = c(NA,
-20L))
By columns x1, x2, x3 we have three groups:
q w e
w e r
r e q
I must print only those groups whose act column contains only the value 1.
In this case, only the w e r group has both 0 and 1 in the act column, while the other groups, q w e and r e q, have only 1 in act, so those are the ones I need to print.
How can I do it?
Expected output
x1 x2 x3
q w e
r e q
library(dplyr)
data %>%
  distinct(x1, x2, x3, .keep_all = TRUE) %>%
  filter(act == 1) %>%
  select(-var, -act)
x1 x2 x3
1 q w e
2 r e q
data %>%
  distinct(x1, x2, x3, .keep_all = TRUE) %>%
  filter(act == 1) %>%
  select(-var, -act) %>%
  filter(x1 == 'r', x2 == 'e', x3 == 'q')
x1 x2 x3
1  r  e  q
# OR
data %>% filter(x1 == 'r', x2 == 'e', x3 == 'q')
If I understand correctly, the OP has requested to extract/print only those groups which contain only the value 1 in the act column.
This can be achieved using the all() function.
data.table
library(data.table)
setDT(data)[, which(all(act == 1L)), by = .(x1, x2, x3)][, -"V1"]
x1 x2 x3
1: q w e
2: r e q
dplyr
library(dplyr)
data %>%
  group_by(x1, x2, x3) %>%
  filter(all(act == 1L)) %>%
  distinct(x1, x2, x3)
# A tibble: 2 x 3
# Groups: x1, x2, x3 [2]
x1 x2 x3
<fct> <fct> <fct>
1 q w e
2 r e q
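For completeness, a base R sketch of the same all()-based filtering, assuming the data above:
ok <- ave(data$act, data$x1, data$x2, data$x3, FUN = function(a) all(a == 1))
unique(data[ok == 1, c("x1", "x2", "x3")])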
Data
In addition to the dataset data provided by the OP, I have tested my answer with a second dataset which contains an additional group and where the rows in the w e r group are ordered differently.
data2 <- data.table::fread(
" x1 x2 x3 var act
a b c 33 0
a b c 33 0
q w e 34 1
q w e 35 1
w e r 34 1
w e r 35 1
w e r 34 0
w e r 35 0
r e q 34 1
r e q 34 1
", data.table = FALSE)
