Conditional statement within group - r

I have a dataframe in which I want to make a new column with values based on condition within groups. So for the dataframe below, I want to make a new column n_actions which gives
Cond1. for the whole group GROUP the number 2 if a 6 appears in column STEP
Cond 2. for the whole group GROUP the number 3 if a 9 appears in column STEP
Cond 3. if neither a 6 nor a 9 appears in column STEP for the GROUP, then 1
# Example data: `group` labels each block of rows, `step` counts up within it.
dataframe <- data.frame(
  group = c("A", "A", "A", "B", "B", "B", "B", "B", "B", "C", "C", "C",
            "D", "D", "D", "D", "D", "D", "D", "D", "D"),
  step = c(1, 2, 3, 1, 2, 3, 4, 5, 6, 1, 2, 3, 1, 2, 3, 4, 5, 6, 7, 8, 9)
)
# Desired result, one value per row:
# A -> 1 (3 rows), B -> 2 (6 rows), C -> 1 (3 rows), D -> 3 (9 rows).
# (The stray trailing comma in `rep(2, 6,)` has been removed.)
dataframe$n_actions <- c(rep(1, 3), rep(2, 6), rep(1, 3), rep(3, 9))

Try out:
library(dplyr)
# `9 %in% step` and `6 %in% step` are single TRUE/FALSE values per group, so a
# plain if/else chain is enough (no vectorised ifelse() needed); mutate()
# recycles the resulting scalar over every row of the group.
# 9 is tested first, so it takes precedence over 6.
dataframe %>%
  group_by(group) %>%
  mutate(n_actions = if (9 %in% step) 3 else if (6 %in% step) 2 else 1)
# A tibble: 21 x 3
# Groups: group [4]
group step n_actions
<fctr> <dbl> <dbl>
1 A 1 1
2 A 2 1
3 A 3 1
4 B 1 2
5 B 2 2
6 B 3 2
7 B 4 2
8 B 5 2
9 B 6 2
10 C 1 1
# ... with 11 more rows

Another way with dplyr's case_when:
library(dplyr)
dataframe %>%
  group_by(group) %>%
  mutate(
    # Conditions are checked top to bottom, so a 9 in the group wins over a 6.
    # The column is named `n_actions` so the result matches the output shown
    # below (three columns: group, step, n_actions).
    n_actions = case_when(
      9 %in% step ~ 3,
      6 %in% step ~ 2,
      TRUE ~ 1
    )
  )
Output:
# A tibble: 21 x 3
# Groups: group [4]
group step n_actions
<fct> <dbl> <dbl>
1 A 1 1
2 A 2 1
3 A 3 1
4 B 1 2
5 B 2 2
6 B 3 2
7 B 4 2
8 B 5 2
9 B 6 2
10 C 1 1
11 C 2 1
12 C 3 1
13 D 1 3
14 D 2 3
15 D 3 3
16 D 4 3
17 D 5 3
18 D 6 3
19 D 7 3
20 D 8 3
21 D 9 3

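You could integer-divide (%/%) the maximum value per group by 3, it seems. Note that this reproduces the desired values here only because every group's steps run from 1 up to a maximum of 3, 6 or 9; it is not a general test for whether a 6 or a 9 occurs.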
# For every row, take the maximum `step` of the row's group and
# integer-divide it by 3; store the result as a new column `n_actions2`.
dataframe$n_actions2 <- ave(
  dataframe$step,
  dataframe$group,
  FUN = function(steps) max(steps) %/% 3
)
dataframe
# group step n_actions n_actions2
#1 A 1 1 1
#2 A 2 1 1
#3 A 3 1 1
#4 B 1 2 2
#5 B 2 2 2
#6 B 3 2 2
#7 B 4 2 2
#8 B 5 2 2
#9 B 6 2 2
#10 C 1 1 1
#11 C 2 1 1
#12 C 3 1 1
#13 D 1 3 3
#14 D 2 3 3
#15 D 3 3 3
#16 D 4 3 3
#17 D 5 3 3
#18 D 6 3 3
#19 D 7 3 3
#20 D 8 3 3
#21 D 9 3 3

Related

Replace value when value above and below are the same

I have the following dataframe df (dput below):
> df
group value
1 A 2
2 A 2
3 A 3
4 A 2
5 A 1
6 A 2
7 A 2
8 A 2
9 B 3
10 B 3
11 B 3
12 B 4
13 B 3
14 B 3
15 B 4
16 B 4
I would like to replace value when the value above and below are the same per group. For example row 3 has a value above of 2 and below of 2 which means the 3 should be 2. The desired output should look like this:
group value
1 A 2
2 A 2
3 A 2
4 A 2
5 A 2
6 A 2
7 A 2
8 A 2
9 B 3
10 B 3
11 B 3
12 B 3
13 B 3
14 B 3
15 B 4
16 B 4
So I was wondering if anyone knows how to replace values when the value above and below are the same like in the example above?
dput data:
# Example data as produced by dput(): 16 rows, groups "A" and "B" with
# eight numeric values each.
df<-structure(list(group = c("A", "A", "A", "A", "A", "A", "A", "A",
"B", "B", "B", "B", "B", "B", "B", "B"), value = c(2, 2, 3, 2,
1, 2, 2, 2, 3, 3, 3, 4, 3, 3, 4, 4)), class = "data.frame", row.names = c(NA,
-16L))
With ifelse, lead and lag:
library(dplyr)
df %>%
  # Compare neighbours within each group only, so the last row of one group
  # is never matched against the first row of the next group.
  group_by(group) %>%
  mutate(
    # lag()/lead() give NA at the group edges; coalesce(..., FALSE) turns the
    # resulting NA comparison into "no replacement" instead of relying on a
    # numeric sentinel that could collide with a real value.
    value = ifelse(coalesce(lag(value) == lead(value), FALSE),
                   lag(value), value)
  ) %>%
  ungroup() %>%
  # back to a plain data.frame so it prints like the input
  as.data.frame()
group value
1 A 2
2 A 2
3 A 2
4 A 2
5 A 2
6 A 2
7 A 2
8 A 2
9 B 3
10 B 3
11 B 3
12 B 3
13 B 3
14 B 3
15 B 4
16 B 4

filling NA with values from another table

I have the following datasets in RStudio:
df =
a b
1 A
1 NA
1 A
1 NA
2 C
2 NA
2 B
3 A
3 NA
3 C
3 D
and fill_with =
a b
1 A
2 B
3 C
How do I fill the NA values in df in the b column according to the a column?
Ex: a=1, b=NA, then I look at the table fill_with at a=1, and I see that I should fill it with b=A.
In the end it should look the following way:
df =
a b
1 A
1 A
1 A
1 A
2 C
2 B
2 B
3 A
3 C
3 C
3 D
We can use ifelse
# Keep the existing values of `b`; where `b` is NA, take the `b` of the
# `fill_with` row whose `a` matches this row's `a`.
df$b <- with(
  df,
  ifelse(is.na(b), fill_with$b[match(a, fill_with$a)], b)
)
Output
a b
1 1 A
2 1 A
3 1 A
4 1 A
5 2 C
6 2 B
7 2 B
8 3 A
9 3 C
10 3 C
11 3 D
library(tidyverse)
# Rebuild the example `df` from whitespace-separated text; the first line
# of the string is the header row.
df <- read_table("a b
1 A
1 NA
1 A
1 NA
2 C
2 NA
2 B
3 A
3 NA
3 C
3 D")
# Within each `a` group, fill the NA entries of `b` from neighbouring rows.
# NOTE(review): this never consults `fill_with`; it reproduces the expected
# output for this sample, but confirm that is acceptable for real data.
df %>%
group_by(a) %>%
fill(b, .direction = "updown")
# A tibble: 11 x 2
# Groups: a [3]
a b
<dbl> <chr>
1 1 A
2 1 A
3 1 A
4 1 A
5 2 C
6 2 B
7 2 B
8 3 A
9 3 C
10 3 C
11 3 D
Base R
# Positions of the missing entries in `b`.
tmp <- which(is.na(df$b))
# Look up only those rows' `a` in `fill_with` and write the matching `b` back.
df$b[tmp] <- fill_with$b[match(df$a[tmp], fill_with$a)]
a b
1 1 A
2 1 A
3 1 A
4 1 A
5 2 C
6 2 B
7 2 B
8 3 A
9 3 C
10 3 C
11 3 D
library(tidyverse)
df <- data.frame(
  stringsAsFactors = FALSE,
  a = c(1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3),
  b = c("A", NA, "A", NA, "C", NA, "B", "A", NA, "C", "D")
)
fill_with <- data.frame(
  stringsAsFactors = FALSE,
  a = c(1L, 2L, 3L),
  b = c("A", "B", "C")
)
# rows_patch() only overwrites NA values in `x`; rows_update() would also
# replace the existing non-missing `b` values (e.g. "C" in row 5), which is
# not what the question asks for.
rows_patch(x = df, y = fill_with, by = "a")
#> a b
#> 1 1 A
#> 2 1 A
#> 3 1 A
#> 4 1 A
#> 5 2 C
#> 6 2 B
#> 7 2 B
#> 8 3 A
#> 9 3 C
#> 10 3 C
#> 11 3 D
Created on 2022-08-22 with reprex v2.0.2

Group by cumulative sums with conditions

In this dataframe:
df <- data.frame(
ID = c("C", "B", "B", "B", NA, "C", "A", NA, "B", "B", "B")
)
I'd like to group the rows using cumsum with two conditions: (i) cumsum should not continue if is.na(ID) and (ii) it should not continue if the next ID value is the same as the prior. I do meet condition (i) with this:
# cumsum(!is.na(ID)) goes up by one at every non-NA ID, so an NA row stays in
# the group of the row before it (condition i only; repeated IDs still each
# start a new group).
df %>%
group_by(grp = cumsum(!is.na(ID)))
# A tibble: 11 x 2
# Groups: grp [9]
ID grp
<chr> <int>
1 C 1
2 B 2
3 B 3
4 B 4
5 NA 4
6 C 5
7 A 6
8 NA 6
9 B 7
10 B 8
11 B 9
but I don't know how to implement condition (ii) too, to obtain the desired result:
1 C 1
2 B 2
3 B 2
4 B 2
5 NA 2
6 C 3
7 A 4
8 NA 4
9 B 5
10 B 5
11 B 5
I tried it with this but it doesn't work:
df %>%
group_by(grp = cumsum(!is.na(ID) |!lag(ID,1) == ID))
Use na.locf0 from zoo to fill in the NAs and then apply rleid from data.table:
library(data.table)
library(zoo)
# na.locf0() carries the last non-NA ID forward over the NAs; rleid() then
# numbers each run of identical consecutive values, starting at 1.
rleid(na.locf0(df$ID))
## [1] 1 2 2 2 2 3 4 4 5 5 5
Using tidyr and dplyr, you could do:
df %>%
  mutate(grp = fill(., ID) %>% pull(),
         # TRUE wherever the (NA-filled) ID differs from the previous row;
         # cumsum() of these flags starts at 0, so add 1 to number the groups
         # from 1 as in the desired result.
         grp = cumsum(grp != lag(grp, default = first(grp))) + 1L)
ID grp
1 C 1
2 B 2
3 B 2
4 B 2
5 <NA> 2
6 C 3
7 A 4
8 <NA> 4
9 B 5
10 B 5
11 B 5
Using rle
library(zoo)
# Fill the NAs forward, run-length encode the result, then repeat each run's
# index (1, 2, ...) as many times as the run is long.
with(rle(na.locf0(df$ID)), rep(seq_along(values), lengths))
#[1] 1 2 2 2 2 3 4 4 5 5 5

Assign ID to consecutive groups column r

I would like to produce a column in a data.frame that counts the consecutive id of the groups (s column in dummy df)
# Example data: `s` is the grouping column, `desired_output` the wanted result.
dummy_df = data.frame(s = c("a", "a", "b","b", "b", "c","c", "a", "a", "c", "c","a","a"),
desired_output= c(1,1,1,1,1,1,1,2,2,2,2,3,3))
# NOTE(review): rleid() is not base R; presumably data.table is already
# loaded here (library(data.table)) - confirm before running.
dummy_df$rleid_output = rleid(dummy_df$s)
dummy_df
s desired_output rleid_output
1 a 1 1
2 a 1 1
3 b 1 2
4 b 1 2
5 b 1 2
6 c 1 3
7 c 1 3
8 a 2 4
9 a 2 4
10 c 2 5
11 c 2 5
12 a 3 6
13 a 3 6
I would say it's similar to what rleid() does but restarting the counting when a new group is seen. However, I can't find a way to do it in such straight way. Thanks.
You can do:
# rle() splits `s` into runs (`values`, `lengths`); ave(..., FUN = seq_along)
# numbers the runs 1, 2, ... separately for each distinct value; rep() then
# expands that run number back to one entry per original row.
dummy_df$out <- with(rle(dummy_df$s), rep(ave(lengths, values, FUN = seq_along), lengths))
Result:
s desired_output out
1 a 1 1
2 a 1 1
3 b 1 1
4 b 1 1
5 b 1 1
6 c 1 1
7 c 1 1
8 a 2 2
9 a 2 2
10 c 2 2
11 c 2 2
12 a 3 3
13 a 3 3
If you are willing to use data.table (rleid is part of the package), you can do it in two steps as follows:
library(data.table)
dummy_df <- data.frame(
  s = c("a", "a", "b", "b", "b", "c", "c", "a", "a", "c", "c", "a", "a")
)
# turn the data.frame into a data.table in place
setDT(dummy_df)
# helper column: run id over the whole `s` column
dummy_df[, rleid_output := rleid(s)]
# within each value of `s`, number its runs in order of appearance
dummy_df[, desired_output := rleid(rleid_output), by = s]
# show the result
dummy_df
#> s rleid_output desired_output
#> 1: a 1 1
#> 2: a 1 1
#> 3: b 2 1
#> 4: b 2 1
#> 5: b 2 1
#> 6: c 3 1
#> 7: c 3 1
#> 8: a 4 2
#> 9: a 4 2
#> 10: c 5 2
#> 11: c 5 2
#> 12: a 6 3
#> 13: a 6 3
Created on 2020-10-16 by the reprex package (v0.3.0)
You can also try the tidyverse in combination with the base R rle function:
library(tidyverse)
rle(dummy_df$s) %>%
  # One row per run. Use the full component names `lengths` / `values`:
  # `$length` / `$value` only worked through partial matching of `$`.
  with(., data.frame(a = .$lengths, b = .$values)) %>%
  group_by(b) %>%
  # number the runs of each value in order of appearance
  mutate(n = row_number()) %>%
  # expand each run number back to one entry per original row
  with(., rep(n, times = a)) %>%
  bind_cols(dummy_df, res = .)
s desired_output res
1 a 1 1
2 a 1 1
3 b 1 1
4 b 1 1
5 b 1 1
6 c 1 1
7 c 1 1
8 a 2 2
9 a 2 2
10 c 2 2
11 c 2 2
12 a 3 3
13 a 3 3

R: new column of row difference from max value of another column according to group

The title of the question may be unclear but I hope these codes will clearly demonstrate my problem.
I have a data frame with three columns. $sensor (A and B); $hour of the day (0-4); and the $value taken by the temperature (1-5).
# Example readings: one row per sensor ("A", "B") and hour of the day (0-4).
new.df <- data.frame(
  sensor = c("A", "A", "A", "A", "A", "B", "B", "B", "B", "B"),
  hour_day = c(0:4, 0:4),
  value = c(1, 1, 3, 1, 2, 1, 3, 4, 5, 2)
) # closing parenthesis was missing in the original snippet
new.df
sensor hour_day value
1 A 0 1
2 A 1 1
3 A 2 3
4 A 3 1
5 A 4 2
6 B 0 1
7 B 1 3
8 B 2 4
9 B 3 5
10 B 4 2
I want to make a new column that indicates the difference in hour from the hour with maximum value according to the sensor.
Desired result
sensor value hour_day hour_from_max_hour
1 A 1 0 -2
2 A 1 1 -1
3 A 3 2 0
4 A 1 3 1
5 A 2 4 2
6 B 1 0 -3
7 B 3 1 -2
8 B 4 2 -1
9 B 5 3 0
10 B 2 4 1
Note that the maximum is at hour 2 for sensor A and at hour 3 for sensor B. I just want a new column that tells me how many hours each row is away from the hour of the maximum value for its sensor.
Thank you in advance and please let me know if I can provide more information.
EDIT
The previous answers were very helpful, but I forgot that there is one more variable (day) in this problem. Also, sometimes there is more than one maximum in a column. When this is the case, I would like to base the difference on the first maximum.
# Extended example: same columns as before plus `day`. The vectors are laid
# out in blocks of five rows (sensor A day 1, sensor B day 1, sensor A day 2,
# sensor B day 2); in each block the maximum `value` occurs twice.
df_add <- data.frame(
sensor = c("A", "A", "A", "A", "A", "B", "B", "B", "B", "B",
"A", "A", "A", "A", "A", "B", "B", "B", "B", "B"),
hour_day = c(0:4, 0:4, 0:4, 0:4),
value = c(1, 1, 3, 3, 2,
3, 2, 4, 4, 1,
1, 5, 6, 6, 2,
2, 1, 3, 3, 1),
day = c(1, 1, 1, 1, 1,
1, 1, 1, 1, 1,
2, 2, 2, 2, 2,
2, 2, 2, 2, 2)
)
df_add
sensor hour_day value day
1 A 0 1 1
2 A 1 1 1
3 A 2 3 1
4 A 3 3 1
5 A 4 2 1
6 B 0 3 1
7 B 1 2 1
8 B 2 4 1
9 B 3 4 1
10 B 4 1 1
11 A 0 1 2
12 A 1 5 2
13 A 2 6 2
14 A 3 6 2
15 A 4 2 2
16 B 0 2 2
17 B 1 1 2
18 B 2 3 2
19 B 3 3 2
20 B 4 1 2
A simple pipe can do it. All you have to do is to get max(value) in the mutate instruction.
new.df %>%
  group_by(sensor) %>%
  mutate(
    # hour of the first row (within the sensor) whose value equals the
    # sensor's maximum, subtracted from every hour of that sensor
    hour_from_max_hour = hour_day - first(hour_day[value == max(value)])
  )
## A tibble: 10 x 4
## Groups: sensor [2]
# sensor hour_day value hour_from_max_hour
# <fct> <int> <dbl> <int>
# 1 A 0 1. -2
# 2 A 1 1. -1
# 3 A 2 3. 0
# 4 A 3 1. 1
# 5 A 4 2. 2
# 6 B 0 1. -3
# 7 B 1 3. -2
# 8 B 2 4. -1
# 9 B 3 5. 0
#10 B 4 2. 1
library(dplyr)
new.df.2 <-
  # First get, per sensor, the hour of the maximum value
  new.df %>%
  group_by(sensor) %>%
  # which.max() returns the first maximum, so a tie gives exactly one row per
  # sensor and cannot duplicate rows in the join below
  slice(which.max(value)) %>%
  ungroup() %>%
  select(sensor, max_hour = hour_day) %>% # This renames hour_day as max_hour
  # Now join that to the original table and make the calculation
  right_join(new.df, by = "sensor") %>%
  mutate(hour_from_max_hour = hour_day - max_hour)
Result:
new.df.2
# A tibble: 10 x 5
sensor max_hour hour_day value hour_from_max_hour
<fct> <int> <int> <dbl> <int>
1 A 2 0 1 -2
2 A 2 1 1 -1
3 A 2 2 3 0
4 A 2 3 1 1
5 A 2 4 2 2
6 B 3 0 1 -3
7 B 3 1 3 -2
8 B 3 2 4 -1
9 B 3 3 5 0
10 B 3 4 2 1
This is probably how I would do it:
library(plyr)
# One row per sensor: the maximum value and the hour at which it first occurs.
dd <- ddply(new.df, .(sensor), summarize,
            max.value = max(value),
            hour.of.max = hour_day[which.max(value)])
# Left join the per-sensor summary back onto every reading
# (TRUE spelled out: `T` is an ordinary variable and can be reassigned).
new.df <- merge(new.df, dd, all.x = TRUE, by = "sensor")
new.df$hour_from_max_hour <- new.df$hour_day - new.df$hour.of.max
Gave you a couple extra columns, but you can delete them:
sensor hour_day value max.value hour.of.max hour_from_max_hour
1 A 0 1 3 2 -2
2 A 1 1 3 2 -1
3 A 2 3 3 2 0
4 A 3 1 3 2 1
5 A 4 2 3 2 2
6 B 0 1 5 3 -3
7 B 1 3 5 3 -2
8 B 2 4 5 3 -1
9 B 3 5 5 3 0
10 B 4 2 5 3 1

Resources