Group repeated values with random breaks in R

I have the following data frame called df (dput below):
group value
1 A 0
2 A 24
3 A 0
4 A 24
5 A 0
6 A 0
7 B 0
8 B 24
9 B 0
10 B 0
11 B 24
12 B 0
I would like to number the repeated runs of values within each group, where a run follows the order 0 -> 24. Sometimes there is a stray 0 with no 24 after it. The desired output should look like this:
group value subgroup
1 A 0 1
2 A 24 1
3 A 0 2
4 A 24 2
5 A 0 3
6 A 0 4
7 B 0 1
8 B 24 1
9 B 0 2
10 B 0 3
11 B 24 3
12 B 0 4
As you can see, rows 5 and 9 have no 24 after them, which is why they are grouped alone. So I was wondering if anyone knows how to group repeated values with some random breaks in R?
dput df:
df <- structure(list(group = c("A", "A", "A", "A", "A", "A", "B", "B",
"B", "B", "B", "B"), value = c(0, 24, 0, 24, 0, 0, 0, 24, 0,
0, 24, 0)), class = "data.frame", row.names = c(NA, -12L))

It looks like the subgroup should increment whenever a 0 appears, so a grouped cumsum(value == 0) does it:
library(dplyr)

df %>%
  group_by(group) %>%
  mutate(subgroup = cumsum(value == 0)) %>%
  ungroup()
# A tibble: 12 × 3
# group value subgroup
# <chr> <dbl> <int>
# 1 A 0 1
# 2 A 24 1
# 3 A 0 2
# 4 A 24 2
# 5 A 0 3
# 6 A 0 4
# 7 B 0 1
# 8 B 24 1
# 9 B 0 2
# 10 B 0 3
# 11 B 24 3
# 12 B 0 4
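For comparison, the same idea works in base R: ave() applies the cumulative sum within each group. A minimal sketch against the df above:

df$subgroup <- ave(df$value, df$group, FUN = function(v) cumsum(v == 0))
df
# same subgroup column as the dplyr version (stored as numeric rather than integer)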

Related

Combine csv files with redundant columns R

I have a series of .csv files that look like this :
a.csv contains
id, a, b, c
1, 10, 0, 0
2, 3, 0 , 0
3, 20, 0, 0
b.csv contains
id, a, b, c
1, 0, 7, 0
2, 0, 9, 0
3, 0, 14, 0
c.csv contains
id, a, b, c
1, 0, 0, 12
2, 0, 0, 8
3, 0, 0, 22
I'm trying to figure out the most efficient way to read them in and create a dataframe that looks like this
id, a, b, c
1, 10, 7, 12
2, 3, 9, 8
3, 20, 14, 22
What would be the best way to do this if there are many more files with many more columns and rows? tidyverse is preferred.
How about this: if all the redundant columns hold zeros, you can go long, filter out the zeros, bind the rows, and then go wide again.
library(tidyverse)
df_a <- read_table("id a b c
1 10 0 0
2 3 0 0
3 20 0 0")
df_b <- read_table("id a b c
1 0 7 0
2 0 9 0
3 0 14 0")
df_c <- read_table("id a b c
1 0 0 12
2 0 0 8
3 0 0 22")
list(df_a, df_b, df_c) |>
  map(\(d) pivot_longer(d, cols = -id) |>
        filter(value > 0)) |>
  bind_rows() |>
  pivot_wider(names_from = name, values_from = value)
#> # A tibble: 3 x 4
#> id a b c
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 10 7 12
#> 2 2 3 9 8
#> 3 3 20 14 22
Or better yet, read in the data marking 0 as NA and then coalesce the data frames.
df_a <- read_table("id a b c
1 10 0 0
2 3 0 0
3 20 0 0", na = "0")
df_b <- read_table("id a b c
1 0 7 0
2 0 9 0
3 0 14 0", na = "0")
df_c <- read_table("id a b c
1 0 0 12
2 0 0 8
3 0 0 22", na = "0")
coalesce(df_a, df_b, df_c)
#> # A tibble: 3 x 4
#> id a b c
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 10 7 12
#> 2 2 3 9 8
#> 3 3 20 14 22
Or, if you can't mark the NAs at read time, you can convert 0 to NA after the fact and then coalesce:
list(df_a, df_b, df_c) |>
map(\(d) mutate(d, across(everything(), \(x) ifelse(x == 0, NA, x)))) |>
reduce(coalesce)
#> # A tibble: 3 x 4
#> id a b c
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1 10 7 12
#> 2 2 3 9 8
#> 3 3 20 14 22
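If there really are many files, the same pattern scales by reading them straight from disk. A sketch, assuming the files share a naming pattern and identical columns, and relying on the same coalesce()-over-data-frames behaviour as above:

library(tidyverse)
list.files(pattern = "\\.csv$") |>
  map(\(f) read_csv(f, na = "0", show_col_types = FALSE)) |>
  reduce(coalesce)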
A base R solution, given the symmetry of the files.
Read the files
file_names <- list.files(pattern = "^[abc]\\.csv")
lis <- sapply(file_names, function(x) list(read.csv(x, header = TRUE)))
lis
$a.csv
id a b c
1 1 10 0 0
2 2 3 0 0
3 3 20 0 0
$b.csv
id a b c
1 1 0 7 0
2 2 0 9 0
3 3 0 14 0
$c.csv
id a b c
1 1 0 0 12
2 2 0 0 8
3 3 0 0 22
Combine the columns
column_names <- c("a","b","c")
cbind( lis[["a.csv"]]["id"], sapply(lis, function(x) rowSums(x[column_names])) )
id a.csv b.csv c.csv
1 1 10 7 12
2 2 3 9 8
3 3 20 14 22
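One wrinkle: sapply() names the result columns after the files (a.csv, b.csv, c.csv). If you want the original names back, rename after the cbind(), e.g.:

res <- cbind(lis[["a.csv"]]["id"],
             sapply(lis, function(x) rowSums(x[column_names])))
names(res)[-1] <- column_names
res
#   id  a  b  c
# 1  1 10  7 12
# 2  2  3  9  8
# 3  3 20 14 22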

Conditional replacing of a numeric value in dplyr

Dear all, I have a data frame that looks like this:
df <- data.frame(time=c(1,2,3,4,1,2,3,4,5), type=c("A","A","A","A","B","B","B","B","B"), count=c(10,0,0,1,8,0,1,0,1))
df
time type count
1 1 A 10
2 2 A 0
3 3 A 0
4 4 A 1
5 1 B 8
6 2 B 0
7 3 B 1
8 4 B 0
9 5 B 1
I want to examine each group of types and, whenever I see a count of 0, replace every later count in time with 0. The count should not be resurrected after a zero.
I want my data to looks like this
time type count
1 1 A 10
2 2 A 0
3 3 A 0
4 4 A 0
5 1 B 8
6 2 B 0
7 3 B 0
8 4 B 0
9 5 B 0
If I understood correctly
library(tidyverse)
df <-
data.frame(
time = c(1, 2, 3, 4, 1, 2, 3, 4, 5),
type = c("A", "A", "A", "A", "B", "B", "B", "B", "B"),
count = c(10, 0, 0, 1, 8, 0, 1, 0, 1)
)
df %>%
group_by(type) %>%
mutate(count = if_else(lag(count, default = first(count)) == 0, 0, count))
#> # A tibble: 9 x 3
#> # Groups: type [2]
#> time type count
#> <dbl> <chr> <dbl>
#> 1 1 A 10
#> 2 2 A 0
#> 3 3 A 0
#> 4 4 A 0
#> 5 1 B 8
#> 6 2 B 0
#> 7 3 B 0
#> 8 4 B 0
#> 9 5 B 0
Created on 2021-09-10 by the reprex package (v2.0.1)
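One caveat: because lag() reads the original count column, this only zeroes the single value immediately after a 0; a 0 followed by two non-zero counts leaves the second one untouched. A quick illustration:

x <- c(8, 0, 1, 1)
ifelse(dplyr::lag(x, default = x[1]) == 0, 0, x)
#> [1] 8 0 0 1   # the trailing 1 survives
cummin(x)
#> [1] 8 0 0 0   # the running minimum keeps it at 0 (see the next answer)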
You may use the cummin() function.
library(dplyr)
df %>% group_by(type) %>% mutate(count = cummin(count))
# time type count
# <dbl> <chr> <dbl>
#1 1 A 10
#2 2 A 0
#3 3 A 0
#4 4 A 0
#5 1 B 8
#6 2 B 0
#7 3 B 0
#8 4 B 0
#9 5 B 0
Since cummin is a base R function, you may also do this entirely in base R:
transform(df, count = ave(count, type, FUN = cummin))
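To see why cummin() fits here: the running minimum can never rise again once it hits 0, which is exactly the no-resurrection rule:

cummin(c(10, 0, 0, 1))
#> [1] 10  0  0  0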

dplyr: add numbers based on matching rows

Let's say I have
> fig
hands imp_spe n
1 A 0 39
2 A 1 32
3 B 0 3
4 B 1 2
5 C 0 115
6 C 1 24
7 D 0 11
8 D 1 3
I want to add a new column fig$new that adds up the numbers in fig$n across rows where fig$hands matches.
I need to keep the data frame otherwise as it is.
Expected output
> fig
hands imp_spe n new
1 A 0 39 71
2 A 1 32 71
3 B 0 3 5
4 B 1 2 5
5 C 0 115 139
6 C 1 24 139
7 D 0 11 14
8 D 1 3 14
I am looking for a solution in dplyr
fig <- structure(list(hands = c("A", "A", "B", "B", "C", "C", "D", "D"
), imp_spe = c(0, 1, 0, 1, 0, 1, 0, 1), n = c(39L, 32L, 3L, 2L,
115L, 24L, 11L, 3L)), row.names = c(NA, -8L), class = "data.frame")
Here you go:
library(dplyr)
fig %>%
  group_by(hands) %>%
  mutate(new = sum(n)) %>%
  ungroup()
dplyr solution:
dplyr::add_count(fig, hands, wt = n, name = 'new')
# hands imp_spe n new
# 1 A 0 39 71
# 2 A 1 32 71
# 3 B 0 3 5
# 4 B 1 2 5
# 5 C 0 115 139
# 6 C 1 24 139
# 7 D 0 11 14
# 8 D 1 3 14
base solution:
transform(
fig,
new = ave(x = n, hands, FUN = sum)
)
# hands imp_spe n new
# 1 A 0 39 71
# 2 A 1 32 71
# 3 B 0 3 5
# 4 B 1 2 5
# 5 C 0 115 139
# 6 C 1 24 139
# 7 D 0 11 14
# 8 D 1 3 14
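For completeness, a data.table equivalent of the grouped sum (a sketch, in case you ever need it outside dplyr):

library(data.table)
setDT(fig)[, new := sum(n), by = hands][]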

grouped event chain ID in tidyverse

I'm attempting to create an ID column for my data frame that counts a sequence of events and can't figure out where I'm going wrong.
The data looks like this:
data
library(tidyverse)
df <- tribble(
~group, ~value,
"a", 4,
"a", 3,
"a", 10,
"b", 2,
"b", 4,
"a", 20,
"a", 14,
"a", 12,
"a", 9,
"b", 66,
"b", 23,
"b", 48)
Things I've tried...
I tried to use cur_group_id(), but that returns one constant ID per group (1 for "a", 2 for "b") rather than a new ID for each run:
df %>%
group_by(group) %>%
mutate(ID = cur_group_id()) %>%
as.data.frame()
   group value ID
1      a     4  1
2      a     3  1
3      a    10  1
4      b     2  2
5      b     4  2
6      a    20  1
7      a    14  1
8      a    12  1
9      a     9  1
10     b    66  2
11     b    23  2
12     b    48  2
I've also tried seq_along(), which gets me a bit closer, but it is just a running row count within each group, like row_number():
df %>%
group_by(group) %>%
mutate(ID = seq_along(group)) %>%
as.data.frame()
   group value ID
1      a     4  1
2      a     3  2
3      a    10  3
4      b     2  1
5      b     4  2
6      a    20  4
7      a    14  5
8      a    12  6
9      a     9  7
10     b    66  3
11     b    23  4
12     b    48  5
My desired output
What I'd really like it to look like is this:
df$expectedID <- c(1,1,1,1,1,2,2,2,2,2,2,2)
# A tibble: 12 x 3
group value expectedID
<chr> <dbl> <dbl>
1 a 4 1
2 a 3 1
3 a 10 1
4 b 2 1
5 b 4 1
6 a 20 2
7 a 14 2
8 a 12 2
9 a 9 2
10 b 66 2
11 b 23 2
12 b 48 2
Basically, if the lagged group is the same as the current group, retain the count. If the lagged group is different than the current group, begin a new count. Each time the group changes, increase the count by one.
Here is one option, (ab)using rle() with data.table::rowid():
df$id <- rle(df$group) %>%
  {rep(data.table::rowid(.$values), times = .$lengths)}
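If you would rather stay in the tidyverse, the same run numbering can be built by hand: cumsum() over a change indicator labels each run of consecutive group values, and dense_rank() then renumbers those runs within each group. A sketch:

df %>%
  mutate(run = cumsum(group != lag(group, default = first(group)))) %>%
  group_by(group) %>%
  mutate(expectedID = dense_rank(run)) %>%
  ungroup() %>%
  select(-run)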

R: new column of row difference from max value of another column according to group

The title of the question may be unclear, but I hope the code below demonstrates my problem.
I have a data frame with three columns: sensor (A and B), hour_day (the hour of the day, 0-4), and value (the temperature reading, 1-5).
new.df <- data.frame(
  sensor = c("A", "A", "A", "A", "A", "B", "B", "B", "B", "B"),
  hour_day = c(0:4, 0:4),
  value = c(1, 1, 3, 1, 2, 1, 3, 4, 5, 2)
)
new.df
sensor hour_day value
1 A 0 1
2 A 1 1
3 A 2 3
4 A 3 1
5 A 4 2
6 B 0 1
7 B 1 3
8 B 2 4
9 B 3 5
10 B 4 2
I want to make a new column that gives, for each row, the difference in hours from the hour with the maximum value for that sensor.
Desired result
sensor value hour_day hour_from_max_hour
1 A 1 0 -2
2 A 1 1 -1
3 A 3 2 0
4 A 1 3 1
5 A 2 4 2
6 B 1 0 -3
7 B 3 1 -2
8 B 4 2 -1
9 B 5 3 0
10 B 2 4 1
Note that the maximum is at hour 2 for sensor A and at hour 3 for sensor B. I just want a new column that tells me how many hours each row is from the hour of its sensor's maximum value.
Thank you in advance and please let me know if I can provide more information.
EDIT
The previous answers were very helpful; I forgot that there is one more variable (day) in this problem. Also, sometimes there is more than one maximum in a group. When that is the case, I would like to base the difference on the first maximum.
df_add <- data.frame(
sensor = c("A", "A", "A", "A", "A", "B", "B", "B", "B", "B",
"A", "A", "A", "A", "A", "B", "B", "B", "B", "B"),
hour_day = c(0:4, 0:4, 0:4, 0:4),
value = c(1, 1, 3, 3, 2,
3, 2, 4, 4, 1,
1, 5, 6, 6, 2,
2, 1, 3, 3, 1),
day = c(1, 1, 1, 1, 1,
1, 1, 1, 1, 1,
2, 2, 2, 2, 2,
2, 2, 2, 2, 2)
)
df_add
sensor hour_day value day
1 A 0 1 1
2 A 1 1 1
3 A 2 3 1
4 A 3 3 1
5 A 4 2 1
6 B 0 3 1
7 B 1 2 1
8 B 2 4 1
9 B 3 4 1
10 B 4 1 1
11 A 0 1 2
12 A 1 5 2
13 A 2 6 2
14 A 3 6 2
15 A 4 2 2
16 B 0 2 2
17 B 1 1 2
18 B 2 3 2
19 B 3 3 2
20 B 4 1 2
A simple pipe can do it. All you have to do is look up the hour of max(value) inside the mutate() call:
new.df %>%
group_by(sensor) %>%
mutate(hour_from_max_hour = hour_day - hour_day[which(value == max(value))[1]])
# A tibble: 10 x 4
# Groups: sensor [2]
# sensor hour_day value hour_from_max_hour
# <fct> <int> <dbl> <int>
# 1 A 0 1. -2
# 2 A 1 1. -1
# 3 A 2 3. 0
# 4 A 3 1. 1
# 5 A 4 2. 2
# 6 B 0 1. -3
# 7 B 1 3. -2
# 8 B 2 4. -1
# 9 B 3 5. 0
#10 B 4 2. 1
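For the edited data with the extra day column, the same pipe should only need day added to the grouping; the [1] already takes the first maximum when there are ties. A sketch against df_add:

df_add %>%
  group_by(sensor, day) %>%
  mutate(hour_from_max_hour = hour_day - hour_day[which(value == max(value))[1]]) %>%
  ungroup()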
library(dplyr)
new.df.2 <-
# First get the hours with the max values
new.df %>%
group_by(sensor) %>%
filter(value == max(value)) %>%
ungroup() %>%
select(sensor, max_hour = hour_day) %>% # This renames hour_day as max_hour
# Now join that to the original table and make the calculation
right_join(new.df) %>%
mutate(hour_from_max_hour = hour_day - max_hour)
Result:
new.df.2
# A tibble: 10 x 5
sensor max_hour hour_day value hour_from_max_hour
<fct> <int> <int> <dbl> <int>
1 A 2 0 1 -2
2 A 2 1 1 -1
3 A 2 2 3 0
4 A 2 3 1 1
5 A 2 4 2 2
6 B 3 0 1 -3
7 B 3 1 3 -2
8 B 3 2 4 -1
9 B 3 3 5 0
10 B 3 4 2 1
This is probably how I would do it:
library(plyr)
dd = ddply(new.df, .(sensor), summarize,
max.value = max(value),
hour.of.max = hour_day[which.max(value)])
new.df = merge(new.df, dd, all.x=T, by='sensor')
new.df$hour_from_max_hour = new.df$hour_day - new.df$hour.of.max
Gave you a couple extra columns, but you can delete them:
sensor hour_day value max.value hour.of.max hour_from_max_hour
1 A 0 1 3 2 -2
2 A 1 1 3 2 -1
3 A 2 3 3 2 0
4 A 3 1 3 2 1
5 A 4 2 3 2 2
6 B 0 1 5 3 -3
7 B 1 3 5 3 -2
8 B 2 4 5 3 -1
9 B 3 5 5 3 0
10 B 4 2 5 3 1
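For instance, to drop the helper columns afterwards:

new.df$max.value <- new.df$hour.of.max <- NULL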
