This is my dataset.
num col1
1 SENSOR_01
2 SENSOR_01
3 SENSOR_01
4 SENSOR_05
5 SENSOR_05
6 SENSOR_05
7 NA
8 SENSOR_01
9 SENSOR_01
10 SENSOR_05
11 SENSOR_05
# Example data: `col1` is a factor with one missing value (row 7); `count` is
# the expected length of each run of consecutive identical `col1` values.
# The NA in row 7 forms a run of its own, so its expected count is 1
# (matching the expected-outcome table).
df <- structure(
  list(
    num = 1:11,
    col1 = structure(
      c(1L, 1L, 1L, 2L, 2L, 2L, NA, 1L, 1L, 2L, 2L),
      levels = c("SENSOR_01", "SENSOR_05"),
      class = "factor"
    ),
    count = c(3L, 3L, 3L, 3L, 3L, 3L, 1L, 2L, 2L, 2L, 2L)
  ),
  class = "data.frame",
  row.names = c(NA, -11L)
)
I would like to count only consecutive duplicated rows. In rows 1-3, SENSOR_01 is repeated 3 times, so count = 3. Here is my expected outcome.
num col1 count
1 SENSOR_01 3
2 SENSOR_01 3
3 SENSOR_01 3
4 SENSOR_05 3
5 SENSOR_05 3
6 SENSOR_05 3
7 NA 1
8 SENSOR_01 2
9 SENSOR_01 2
10 SENSOR_05 2
11 SENSOR_05 2
Using dplyr, how can I produce this outcome?
We can use rleid to create groups and then count number of rows in each group.
library(dplyr)

# Give every run of consecutive identical `col1` values its own id, attach the
# size of that run to each of its rows, then discard the helper id.
df %>%
  mutate(group = data.table::rleid(col1)) %>%
  group_by(group) %>%
  mutate(n = n()) %>%
  ungroup() %>%
  dplyr::select(-group)
# A tibble: 11 x 4
# num col1 count n
# <int> <fct> <int> <int>
# 1 1 SENSOR_01 3 3
# 2 2 SENSOR_01 3 3
# 3 3 SENSOR_01 3 3
# 4 4 SENSOR_05 3 3
# 5 5 SENSOR_05 3 3
# 6 6 SENSOR_05 3 3
# 7 7 NA 1 1
# 8 8 SENSOR_01 2 2
# 9 9 SENSOR_01 2 2
#10 10 SENSOR_05 2 2
#11 11 SENSOR_05 2 2
Keeping both the columns for comparison purposes.
Or using data.table
library(data.table)
setDT(df)[, n := .N, by = rleid(col1)]
As another option, we can use the order of the rows (the row names of a traditional data.frame). The idea is simple:
Within each group of identical sensor names, flag a record with one if its global row position is more than 1 away from the previous record of that group, and with zero otherwise;
Still within the group of identical sensor names, take the cumulative sum of the flags, which identifies every subgroup of records that appear consecutively in the global data set;
Still within the group, count the number of elements in each individual subgroup;
Repeat for each group of records.
In tidyverse:
# Record each row's global position, then, within every sensor, start a new
# sub-run whenever the position jumps by more than 1 and count the rows of
# each sub-run. The helper column is dropped at the end.
dat %>%
  mutate(run_id = row_number()) %>%
  group_by(col1) %>%
  add_count(run_id = cumsum(c(0, diff(run_id)) > 1)) %>%
  ungroup() %>%
  select(-run_id)
# # A tibble: 11 x 3
# num col1 n
# <int> <fct> <int>
# 1 1 SENSOR_01 3
# 2 2 SENSOR_01 3
# 3 3 SENSOR_01 3
# 4 4 SENSOR_05 3
# 5 5 SENSOR_05 3
# 6 6 SENSOR_05 3
# 7 7 NA 1
# 8 8 SENSOR_01 2
# 9 9 SENSOR_01 2
# 10 10 SENSOR_05 2
# 11 11 SENSOR_05 2
Data:
dat <- structure(
list(
num = 1:11,
col1 = structure(
c(1L, 1L, 1L, 2L, 2L, 2L, NA, 1L, 1L, 2L, 2L),
.Label = c("SENSOR_01", "SENSOR_05" ),
class = "factor")
),
class = "data.frame",
row.names = c(NA, -11L)
)
We can use base R with rle to create the 'count' column
# Run-length encode `col1` and repeat each run's length once per row of that
# run. `rle()` rejects factors ("'x' must be a vector of an atomic type"), so
# the column is coerced to character first; for character input this changes
# nothing. Note that `rle()` treats every NA as a run of its own.
df$count <- with(rle(as.character(df$col1)), rep(lengths, lengths))
df$count
#[1] 3 3 3 3 3 3 1 2 2 2 2
Or the dplyr implementation of the above
library(dplyr)

# Same run-length approach inside mutate(). `rle()` does not accept factors,
# so `col1` is coerced to character before encoding; each run's length is
# then repeated once per row of the run.
df %>%
  mutate(count = with(rle(as.character(col1)), rep(lengths, lengths)))
Or an option with tidyverse without including any other packages
library(dplyr)

# Count the rows in each run of consecutive identical `col1` values using
# dplyr only.
#  1. NA cannot be compared with `!=`, so it is temporarily replaced by the
#     sentinel "VALUE" (coalesce() is dplyr's own NA replacement; the column
#     is coerced to character so factor input works too). The sentinel must
#     not occur as a real value of `col1`.
#  2. A new run starts wherever the value differs from the previous row; the
#     cumulative sum of those change points is the run id.
#  3. The run size is attached to every row and the helper column is removed.
df %>%
  mutate(
    grp = coalesce(as.character(col1), "VALUE"),
    grp = cumsum(grp != lag(grp, default = first(grp)))
  ) %>%
  group_by(grp) %>%
  mutate(count = n()) %>%
  ungroup() %>%
  select(-grp)
# A tibble: 11 x 3
# num col1 count
# <int> <chr> <int>
# 1 1 SENSOR_01 3
# 2 2 SENSOR_01 3
# 3 3 SENSOR_01 3
# 4 4 SENSOR_05 3
# 5 5 SENSOR_05 3
# 6 6 SENSOR_05 3
# 7 7 <NA> 1
# 8 8 SENSOR_01 2
# 9 9 SENSOR_01 2
#10 10 SENSOR_05 2
#11 11 SENSOR_05 2
data
df <- structure(list(num = 1:11, col1 = c("SENSOR_01", "SENSOR_01",
"SENSOR_01", "SENSOR_05", "SENSOR_05", "SENSOR_05", NA, "SENSOR_01",
"SENSOR_01", "SENSOR_05", "SENSOR_05")),
class = "data.frame", row.names = c(NA,
-11L))
Related
I'm trying to count the number of every observation for each variable in a dataset regarding a specific group.
The data looks like this:
grp v1 vn
1 2 5
2 4
3 3 4
1 3
1 2 12
4 5
5 3 6
5 6
The Result should be a table like this:
grp v1 vn
1 2 3
2 1 0
3 1 1
4 0 1
5 2 1
I tried to use
x %>% group_by(grp) %>% summarise(across(everything(),n = n()))
but it didn't really work.
Any help is appreciated. Thanks in advance!
You can also use the following solution:
library(dplyr)

# For every `grp`, count the non-missing entries of each column from v1
# through vn.
df %>%
  group_by(grp) %>%
  summarise(across(v1:vn, function(col) sum(!is.na(col))))
# A tibble: 5 x 3
grp v1 vn
<int> <int> <int>
1 1 2 3
2 2 1 0
3 3 1 1
4 4 0 1
5 5 2 1
Get the data in long format, count non-NA values for each column in each group and get the data in wide format.
library(dplyr)
library(tidyr)

# Reshape to long format (one row per grp/column/value), count the
# non-missing values per group and original column, then reshape back to one
# column per original variable. `.groups = "drop"` returns an ungrouped
# result directly, so no separate ungroup step (and no regrouping message).
df %>%
  pivot_longer(cols = -grp) %>%
  group_by(grp, name) %>%
  summarise(n = sum(!is.na(value)), .groups = "drop") %>%
  pivot_wider(names_from = name, values_from = n)
# grp v1 vn
# <int> <int> <int>
#1 1 2 3
#2 2 1 0
#3 3 1 1
#4 4 0 1
#5 5 2 1
data
df <- structure(list(grp = c(1L, 2L, 3L, 1L, 1L, 4L, 5L, 5L), v1 = c(2L,
4L, 3L, NA, 2L, NA, 3L, 6L), vn = c(5L, NA, 4L, 3L, 2L, 5L, 6L,
NA)), class = "data.frame", row.names = c(NA, -8L))
Using data.table
library(data.table)

# Convert `df` to a data.table by reference, then count the non-missing
# values of every non-grouping column within each `grp`.
setDT(df)
df[, lapply(.SD, function(col) sum(!is.na(col))), by = grp]
# grp v1 vn
#1: 1 2 3
#2: 2 1 0
#3: 3 1 1
#4: 4 0 1
#5: 5 2 1
Using aggregate.
# Count the non-missing values of v1 and vn per grp.
# `na.action = na.pass` keeps rows containing NA in the model frame (the
# formula interface drops them by default), so missingness is tested directly
# with is.na() instead of recoding NA to 0, which would also count genuine
# zeros as missing.
aggregate(
  cbind(v1, vn) ~ grp,
  data = dat,
  FUN = function(x) sum(!is.na(x)),
  na.action = na.pass
)
# grp v1 vn
# 1 1 2 3
# 2 2 1 0
# 3 3 1 1
# 4 4 0 1
# 5 5 2 1
Data:
# Example data: grouping column `grp` and two value columns with missing
# entries. `header = TRUE` is spelled out because `T` is an ordinary variable
# that can be reassigned.
dat <- read.table(header = TRUE, text = 'grp v1 vn
1 2 5
2 4 NA
3 3 4
1 NA 3
1 2 12
4 NA 5
5 3 6
5 6 NA
')
I would like to use dplyr in replacing NA value in the DV column of each ID with DV value at a specific time point within that individual:
I want to replace NA (DV column) at the time 2 of each ID with DV value at time 4 of that specific ID.
I want to replace NA (DV column) at the time 4 of each ID with DV value at time 0 of that specific ID.
I can not figure out how to do it with dplyr.
Here is my dataset:
ID TIME DV
1 0 5
1 2 NA
1 4 4
2 0 3
2 2 3
2 4 NA
3 0 7
3 2 NA
3 4 9
Expected output:
ID TIME DV
1 0 5
1 2 4
1 4 4
2 0 3
2 2 3
2 4 3
3 0 7
3 2 9
3 4 9
Any suggestions are appreciated.
Best,
I agree with @akrun that perhaps fill is a good fit in general, but your rules suggest handling things a little differently (since "updown" does not follow your rules).
library(dplyr)
# library(tidyr)

# Spread DV into one column per TIME value so the replacement rules can be
# written column-to-column, apply them, then return to long format.
# The rules run in order: `2` is filled from `4` first, then `4` from `0`.
# pivot_longer() returns the former column names as character, so
# `names_transform` converts TIME back to integer.
dat %>%
  tidyr::pivot_wider(id_cols = "ID", names_from = "TIME", values_from = "DV") %>%
  mutate(
    `2` = if_else(is.na(`2`), `4`, `2`),
    `4` = if_else(is.na(`4`), `0`, `4`)
  ) %>%
  tidyr::pivot_longer(
    -ID,
    names_to = "TIME",
    names_transform = list(TIME = as.integer),
    values_to = "DV"
  )
# # A tibble: 9 x 3
# ID TIME DV
# <int> <int> <int>
# 1 1 0 5
# 2 1 2 4
# 3 1 4 4
# 4 2 0 3
# 5 2 2 3
# 6 2 4 3
# 7 3 0 7
# 8 3 2 9
# 9 3 4 9
It might help to visualize what this is doing by looking mid-pipe:
dat %>%
tidyr::pivot_wider(id_cols = "ID", names_from = "TIME", values_from = "DV")
# # A tibble: 3 x 4
# ID `0` `2` `4`
# <int> <int> <int> <int>
# 1 1 5 NA 4
# 2 2 3 3 NA
# 3 3 7 NA 9
dat %>%
tidyr::pivot_wider(id_cols = "ID", names_from = "TIME", values_from = "DV") %>%
mutate(
`2` = if_else(is.na(`2`), `4`, `2`),
`4` = if_else(is.na(`4`), `0`, `4`)
)
# # A tibble: 3 x 4
# ID `0` `2` `4`
# <int> <int> <int> <int>
# 1 1 5 4 4
# 2 2 3 3 3
# 3 3 7 9 9
We could use fill after grouping by 'ID'
library(dplyr)
library(tidyr)
df1 %>%
arrange(ID, TIME) %>%
# or as #r2evans mentioned
#arrange(ID, factor(TIME, levels = c(0, 2, 4))) %>%
group_by(ID) %>%
fill(DV, .direction = 'downup')
# A tibble: 9 x 3
# Groups: ID [3]
# ID TIME DV
# <int> <int> <int>
#1 1 0 5
#2 1 2 4
#3 1 4 4
#4 2 0 3
#5 2 2 3
#6 2 4 3
#7 3 0 7
#8 3 2 9
#9 3 4 9
data
df1 <- structure(list(ID = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L), TIME = c(0L,
2L, 4L, 0L, 2L, 4L, 0L, 2L, 4L), DV = c(5L, NA, 4L, 3L, 3L, NA,
7L, NA, 9L)), class = "data.frame", row.names = c(NA, -9L))
Data looks like this:
Col1 Col2 Col3 Group
1 1 2 1
1 1 3 1
2 2 4 1
2 3 3 2
2 3 4 2
2 4 5 2
3 4 6 2
I want to set Col1 and Col3 to their LAST value, within Group
For instance, the last value of Col1 Group 2 is 3. So in Group 2, I want all values of Col1 to be set to 3.
Expected result:
Col1 Col2 Col3 Group
2 1 4 1
2 1 4 1
2 2 4 1
3 3 6 2
3 3 6 2
3 4 6 2
3 4 6 2
How can this be done with data.table?
We can use tidyverse. We group by 'Group', and use mutate_at to select the variable of interest, replace with the last value of each of the columns
library(dplyr)

# Within each Group, overwrite Col1 and Col3 with their last value.
# across() replaces the superseded mutate_at(); the result stays grouped by
# Group, as before.
df1 %>%
  group_by(Group) %>%
  mutate(across(c(Col1, Col3), last))
# A tibble: 7 x 4
# Groups: Group [2]
# Col1 Col2 Col3 Group
# <int> <int> <int> <int>
#1 2 1 4 1
#2 2 1 4 1
#3 2 2 4 1
#4 3 3 6 2
#5 3 3 6 2
#6 3 4 6 2
#7 3 4 6 2
Or with data.table, use the same logic (if it is not a data.table, convert it with setDT): specify the columns of interest in .SDcols, loop through the Subset of Data.table (.SD), get the last value and assign (:=) it to the columns
library(data.table)
nm1 <- c("Col1", "Col3")
setDT(df1)[, (nm1) := lapply(.SD, last), by = Group, .SDcols = nm1]
data
df1 <- structure(list(Col1 = c(1L, 1L, 2L, 2L, 2L, 2L, 3L), Col2 = c(1L,
1L, 2L, 3L, 3L, 4L, 4L), Col3 = c(2L, 3L, 4L, 3L, 4L, 5L, 6L),
Group = c(1L, 1L, 1L, 2L, 2L, 2L, 2L)), class = "data.frame",
row.names = c(NA,
-7L))
library(data.table)
cols <- c("Col1", "Col3")
DT[, (cols) := .SD[.N], by = Group, .SDcols = cols][]
# Col1 Col2 Col3 Group
# 1: 2 1 4 1
# 2: 2 1 4 1
# 3: 2 2 4 1
# 4: 3 3 6 2
# 5: 3 3 6 2
# 6: 3 4 6 2
# 7: 3 4 6 2
Data
DT <- fread("Col1 Col2 Col3 Group
1 1 2 1
1 1 3 1
2 2 4 1
2 3 3 2
2 3 4 2
2 4 5 2
3 4 6 2")
"w" "n"
"1" 2 1
"2" 3 1
"3" 4 1
"4" 2 1
"5" 5 1
"6" 6 1
"7" 3 2
"8" 7 2
I tried the following command, but it didn't produce the change I expected.
w2 <- w1 %>%
expand(w,n)
My output should look like this
w n
2 1
2 2
3 1
3 2
4 1
4 2
5 1
5 2
6 1
6 2
7 1
7 2
data
w1 <- structure(list(w = c(2L, 3L, 3L, 4L, 5L, 6L, 7L), n = c(1L, 1L,
2L, 1L, 1L, 1L, 2L)), .Names = c("w", "n"), row.names = c(NA,
-7L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), groups = structure(list(
w = c(2L, 3L, 3L, 4L, 5L, 6L, 7L), n = c(1L, 1L, 2L, 1L,
1L, 1L, 2L), .rows = list(1L, 2L, 3L, 4L, 5L, 6L, 7L)), .Names = c("w",
"n", ".rows"), row.names = c(NA, -7L), class = c("tbl_df", "tbl",
"data.frame"), .drop = TRUE))
The issue was in your data frame being grouped, consider:
w1 %>%
ungroup() %>%
expand(w, n)
Output:
# A tibble: 12 x 2
w n
<int> <int>
1 2 1
2 2 2
3 3 1
4 3 2
5 4 1
6 4 2
7 5 1
8 5 2
9 6 1
10 6 2
11 7 1
12 7 2
We can use complete from tidyr.
library(dplyr)
library(tidyr)

# Build every combination of the observed `w` and `n` values.
# Duplicates are removed on the (w, n) pair rather than on `w` alone:
# keeping only the first row per `w` could discard a value of `n` that occurs
# only on a repeated `w`, and complete() would then never generate it.
dat2 <- dat %>%
  distinct(w, n) %>%
  complete(w, n)
dat2
# # A tibble: 12 x 2
# w n
# <int> <int>
# 1 2 1
# 2 2 2
# 3 3 1
# 4 3 2
# 5 4 1
# 6 4 2
# 7 5 1
# 8 5 2
# 9 6 1
# 10 6 2
# 11 7 1
# 12 7 2
DATA
dat <- read.table(text = "w n
2 1
3 1
4 1
2 1
5 1
6 1
3 2
7 2",
header = TRUE)
Using the original data frame df you can create a new data frame that copies w for each unique value of n:
# Pair every distinct `w` with every distinct `n`: each `w` is repeated once
# per distinct `n`, and the distinct `n` values are cycled once per distinct
# `w`. length(unique()) is used so no package beyond base R is required
# (uniqueN() is a data.table function).
data.frame(
  w = rep(unique(df$w), each = length(unique(df$n))),
  n = rep(unique(df$n), times = length(unique(df$w)))
)
Output:
w n
1 2 1
2 2 2
3 3 1
4 3 2
5 4 1
6 4 2
7 5 1
8 5 2
9 6 1
10 6 2
11 7 1
12 7 2
So I have a sequence dataset that looks like this
id epnum clockst
1 1 1 0
2 1 2 1
3 1 3 2
4 2 1 4
5 2 2 5
6 2 3 6
7 3 1 4
8 3 2 5
9 3 3 6
What I want is to create a vector of clockst based on epnum == 1.
So, I want basically this
id epnum clockst ep_start
1 1 1 0 0
2 1 2 1 0
3 1 3 2 0
4 2 1 4 4
5 2 2 5 4
6 2 3 6 4
7 3 1 4 4
8 3 2 5 4
9 3 3 6 4
However, I struggle to do so.
I came up with this, but it doesn't fully work.
dt$ep_start = ifelse(dt$epnum == 1 & dt$clockst == 0, 0,
ifelse(dt$epnum == 1 & dt$clockst == 4, 4, -9))
Any idea?
Data
dt = structure(list(id = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L,
3L), .Label = c("1", "2", "3"), class = "factor"), epnum = structure(c(1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("1", "2", "3"), class = "factor"),
clockst = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 4L, 5L, 6L), .Label = c("0",
"1", "2", "4", "5", "6"), class = "factor")), .Names = c("id",
"epnum", "clockst"), row.names = c(NA, -9L), class = "data.frame")
Here is a solution using tidyverse:
First check the condition epnum == 1 and if TRUE, use clockst value if not NA. Then just fill NA with previous values.
Since clockst is a factor one needs to convert it to numeric while keeping the same values so as.numeric(as.character( needs to be used.
library(tidyverse)

# Keep the numeric value of `clockst` on rows where epnum == 1 (the factor is
# converted via character so the labels, not the level codes, are used) and
# leave NA elsewhere; the NAs are then filled from the closest row above.
dt %>%
  mutate(
    ep_start = if_else(
      epnum == 1,
      as.numeric(as.character(clockst)),
      NA_real_
    )
  ) %>%
  fill(ep_start, .direction = "down")
#output:
id epnum clockst ep_start
1 1 1 0 0
2 1 2 1 0
3 1 3 2 0
4 2 1 4 4
5 2 2 5 4
6 2 3 6 4
7 3 1 4 4
8 3 2 5 4
9 3 3 6 4
Here is a quick comparison of the available answers. I chose to use a 90 k row data set:
df <- df[rep(1:nrow(df), times = 10000),] #where df = dt
dt <- data.table(df)
library(microbenchmark)
bench <- microbenchmark(SunBee = dt[, ep_start := .SD[1]$clockst, by = "id"],
missuse = df %>%
mutate(ep_start = ifelse(epnum == 1, as.numeric(as.character(clockst)), NA)) %>%
fill(ep_start, .direction = "down"),
d.b. = df$clockst[rep(which(df$epnum == 1), rle(cumsum(df$epnum == 1))$lengths)],
www = df %>%
arrange(id, epnum) %>%
group_by(id) %>%
mutate(ep_start = first(clockst)) %>%
ungroup())
plot(bench)
with a 900 k row data set:
oh man I really need to learn DT.
Another tidyverse solution. arrange is not required if you are certain that the rows are in the right order.
library(dplyr)
dt2 <- dt %>%
arrange(id, epnum) %>%
group_by(id) %>%
mutate(ep_start = first(clockst)) %>%
ungroup()
dt2
# # A tibble: 9 x 4
# id epnum clockst ep_start
# <fctr> <fctr> <fctr> <fctr>
# 1 1 1 0 0
# 2 1 2 1 0
# 3 1 3 2 0
# 4 2 1 4 4
# 5 2 2 5 4
# 6 2 3 6 4
# 7 3 1 4 4
# 8 3 2 5 4
# 9 3 3 6 4
You can do this with library(data.table) as follows
library(data.table)

# The table is named `dt` (not `T`, which would mask the TRUE shorthand).
# For each id, ep_start is the `clockst` of the first row of that id; this is
# the epnum == 1 row only when rows are ordered by epnum within id.
dt <- data.table(dt)
dt[, ep_start := clockst[1L], by = id]
This gives:
id epnum clockst ep_start
1: 1 1 0 0
2: 1 2 1 0
3: 1 3 2 0
4: 2 1 4 4
5: 2 2 5 4
6: 2 3 6 4
7: 3 1 4 4
8: 3 2 5 4
9: 3 3 6 4
# Locate the rows where an episode sequence starts (epnum == 1), measure how
# many rows follow each start (the cumulative count of starts is constant
# within a sequence), and repeat each start row's `clockst` that many times.
dt$ep_start <- local({
  is_start <- dt$epnum == 1
  start_rows <- which(is_start)
  run_lengths <- rle(cumsum(is_start))$lengths
  dt$clockst[rep(start_rows, run_lengths)]
})
dt
# id epnum clockst ep_start
#1 1 1 0 0
#2 1 2 1 0
#3 1 3 2 0
#4 2 1 4 4
#5 2 2 5 4
#6 2 3 6 4
#7 3 1 4 4
#8 3 2 5 4
#9 3 3 6 4
Using match
# Keep only the epnum == 1 rows as a lookup table, then, for every row of
# `dt`, pull the `clockst` of the lookup row that has the same id.
clock <- dt[dt[["epnum"]] == 1, ]
dt[["ep_start"]] <- clock[["clockst"]][match(dt[["id"]], clock[["id"]])]