Count number of observations by group - R

I'm trying to count the number of non-missing observations for each variable in a dataset, broken down by group.
The data looks like this:
grp v1 vn
1    2  5
2    4 NA
3    3  4
1   NA  3
1    2 12
4   NA  5
5    3  6
5    6 NA
The result should be a table like this:
grp v1 vn
1 2 3
2 1 0
3 1 1
4 0 1
5 2 1
I tried to use
x %>% group_by(grp) %>% summarise(across(everything(),n = n()))
but it didn't really work.
Any help is appreciated. Thanks in advance!

You can also use the following solution:
library(dplyr)
df %>%
  group_by(grp) %>%
  summarise(across(v1:vn, ~ sum(!is.na(.x))))
# A tibble: 5 x 3
grp v1 vn
<int> <int> <int>
1 1 2 3
2 2 1 0
3 3 1 1
4 4 0 1
5 5 2 1
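If you are on dplyr >= 1.1.0, the same counts can also be written with per-operation grouping via the .by argument; a minimal sketch, assuming that version:
library(dplyr)  # >= 1.1.0 for .by
df %>%
  summarise(across(v1:vn, ~ sum(!is.na(.x))), .by = grp)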

Get the data in long format, count non-NA values for each column in each group and get the data in wide format.
library(dplyr)
library(tidyr)
df %>%
  pivot_longer(cols = -grp) %>%
  group_by(grp, name) %>%
  summarise(n = sum(!is.na(value))) %>%
  ungroup %>%
  pivot_wider(names_from = name, values_from = n)
# grp v1 vn
# <int> <int> <int>
#1 1 2 3
#2 2 1 0
#3 3 1 1
#4 4 0 1
#5 5 2 1
data
df <- structure(list(grp = c(1L, 2L, 3L, 1L, 1L, 4L, 5L, 5L), v1 = c(2L,
4L, 3L, NA, 2L, NA, 3L, 6L), vn = c(5L, NA, 4L, 3L, 2L, 5L, 6L,
NA)), class = "data.frame", row.names = c(NA, -8L))

Using data.table
library(data.table)
setDT(df)[, lapply(.SD, function(x) sum(!is.na(x))), grp]
# grp v1 vn
#1: 1 2 3
#2: 2 1 0
#3: 3 1 1
#4: 4 0 1
#5: 5 2 1

Using aggregate.
aggregate(cbind(v1, vn) ~ grp, replace(dat, is.na(dat), 0), function(x) sum(as.logical(x)))
# grp v1 vn
# 1 1 2 3
# 2 2 1 0
# 3 3 1 1
# 4 4 0 1
# 5 5 2 1
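One caveat on this approach: replacing NA with 0 and then counting with as.logical() would miscount if the data ever contained genuine zeros. A hedged alternative sketch that counts non-NA values directly, keeping the NAs via na.action:
aggregate(cbind(v1, vn) ~ grp, dat,
          FUN = function(x) sum(!is.na(x)), na.action = na.pass)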
Data:
dat <- read.table(header=T, text='grp v1 vn
1 2 5
2 4 NA
3 3 4
1 NA 3
1 2 12
4 NA 5
5 3 6
5 6 NA
')

Related

How to add new rows conditionally on R

I have a df with
v1 t1 c1 o1
1 1 9 1
1 1 12 2
1 2 2 1
1 2 7 2
2 1 3 1
2 1 6 2
2 2 3 1
2 2 12 2
And I would like to add 2 rows each time v1 changes its value, in order to get this:
v1 t1 c1 o1
1 1 1 1
1 1 1 2
1 2 9 1
1 2 12 2
1 3 2 1
1 3 7 2
2 1 1 1
2 1 1 2
2 2 3 1
2 2 6 2
2 3 3 1
2 3 12 2
So what I'm doing is that every time v1 changes its value, I'm adding 2 rows of ones and adding 1 to the existing values of t1. This is kind of tricky. I've been able to do it in Excel, but I would like to scale it to big files in R.
We may do the expansion in group_modify
library(dplyr)
df1 %>%
  group_by(v1) %>%
  group_modify(~ .x %>%
                 slice_head(n = 2) %>%
                 mutate(across(-o1, ~ 1)) %>%
                 bind_rows(.x) %>%
                 mutate(t1 = as.integer(gl(n(), 2, n())))) %>%
  ungroup
-output
# A tibble: 12 × 4
v1 t1 c1 o1
<int> <int> <dbl> <int>
1 1 1 1 1
2 1 1 1 2
3 1 2 9 1
4 1 2 12 2
5 1 3 2 1
6 1 3 7 2
7 2 1 1 1
8 2 1 1 2
9 2 2 3 1
10 2 2 6 2
11 2 3 3 1
12 2 3 12 2
Or do a group by summarise
df1 %>%
  group_by(v1) %>%
  summarise(t1 = as.integer(gl(n() + 2, 2, n() + 2)),
            c1 = c(1, 1, c1),
            o1 = rep(1:2, length.out = n() + 2),
            .groups = 'drop')
-output
# A tibble: 12 × 4
v1 t1 c1 o1
<int> <int> <dbl> <int>
1 1 1 1 1
2 1 1 1 2
3 1 2 9 1
4 1 2 12 2
5 1 3 2 1
6 1 3 7 2
7 2 1 1 1
8 2 1 1 2
9 2 2 3 1
10 2 2 6 2
11 2 3 3 1
12 2 3 12 2
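Note that returning more than one row per group from summarise() is deprecated as of dplyr 1.1.0; a sketch of the same logic with reframe(), assuming dplyr >= 1.1.0:
df1 %>%
  group_by(v1) %>%
  reframe(t1 = as.integer(gl(n() + 2, 2, n() + 2)),
          c1 = c(1, 1, c1),
          o1 = rep(1:2, length.out = n() + 2))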
data
df1 <- structure(list(v1 = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), t1 = c(1L,
1L, 2L, 2L, 1L, 1L, 2L, 2L), c1 = c(9L, 12L, 2L, 7L, 3L, 6L,
3L, 12L), o1 = c(1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L)),
class = "data.frame", row.names = c(NA,
-8L))
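For comparison, a data.table sketch of the same expansion (not from the original answers; it assumes the df1 above): per group, two rows of ones are prepended and the original t1 values are shifted up by one.
library(data.table)
setDT(df1)[, rbind(data.table(t1 = 1L, c1 = 1L, o1 = 1:2),
                   .SD[, .(t1 = t1 + 1L, c1, o1)]),
           by = v1]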

Sort, calculate and then mutate in a dataframe

I'm a beginner in R and I'm facing an issue.
Problem: I need to sort a dataframe by 2 columns (ID and the i-th column), then take the lagged difference of the i-th column and record it. Then I re-sort the data by ID and the (i+1)-th column, and so on.
What I have written up till now:
for (val in (4:length(colnames(df)))) {
  df <- df[with(df, order(ID, df[val])), ]
  d2_df <- df %>%
    mutate_at(c(df[val]), list(lagged = ~ . - lag(.)))
}
The above code is failing because the mutate_at function throws the error below:
Error: `.vars` must be a character/numeric vector or a `vars()` object, not a list.
Original dataset:
ID S1 S2
1 1 3 1
2 1 5 2
3 1 1 3
4 2 2 7
5 3 4 9
6 3 2 11
After Sort on ID and S1
ID S1 S2
1 1 1 3
2 1 3 1
3 1 5 2
4 2 2 7
5 3 2 11
6 3 4 9
Now, what I need is S1.1, which is the lagged difference of the sorted dataframe within each ID:
ID S1 S2 S1.1
1 1 1 3 NA
2 1 3 1 2
3 1 5 2 2
4 2 2 7 NA
5 3 2 11 NA
6 3 4 9 2
Similar logic applies for S2 where a new S2.2 will be generated.
Any help would be immensely appreciated.
Additionally, what is required is shown below, where sum.S1 is the sum of the lagged differences and count.S1 is the count of observations of S1 for the respective ID:
ID sum.S1 sum.S2 count.S1 count.S2
1 1 4 2 3 3
2 2 NA NA 1 1
3 3 2 2 2 2
Here's a way using non-standard evaluation (NSE):
library(dplyr)
library(purrr)
library(rlang)
cols <- c('S1', 'S2')
bind_cols(df, map_dfc(cols, ~ {
  col <- sym(.x)
  df %>%
    arrange(ID, !!col) %>%
    group_by(ID) %>%
    transmute(!!paste0(.x, '.1') := !!col - lag(!!col)) %>%
    ungroup %>%
    select(-ID)
}))
# ID S1 S2 S1.1 S2.1
#1 1 3 1 NA NA
#2 1 5 2 2 1
#3 1 1 3 2 1
#4 2 2 7 NA NA
#5 3 4 9 NA NA
#6 3 2 11 2 2
data
df <- structure(list(ID = c(1L, 1L, 1L, 2L, 3L, 3L), S1 = c(3L, 5L,
1L, 2L, 4L, 2L), S2 = c(1L, 2L, 3L, 7L, 9L, 11L)),
class = "data.frame", row.names = c(NA, -6L))

Ifelse with dplyr in R

I would like to use dplyr to replace the NA values in the DV column of each ID with the DV value at a specific time point within that individual:
I want to replace the NA (in the DV column) at time 2 of each ID with the DV value at time 4 of that specific ID.
I want to replace the NA (in the DV column) at time 4 of each ID with the DV value at time 0 of that specific ID.
I cannot figure out how to do it with dplyr.
Here is my dataset:
ID TIME DV
1 0 5
1 2 NA
1 4 4
2 0 3
2 2 3
2 4 NA
3 0 7
3 2 NA
3 4 9
Expected output:
ID TIME DV
1 0 5
1 2 4
1 4 4
2 0 3
2 2 3
2 4 3
3 0 7
3 2 9
3 4 9
Any suggestions are appreciated.
Best,
I agree with @akrun that perhaps fill is a good fit in general, but your rules suggest handling things a little differently (since "updown" does not follow your rules).
library(dplyr)
# library(tidyr)
dat %>%
  tidyr::pivot_wider(id_cols = "ID", names_from = "TIME", values_from = "DV") %>%
  mutate(
    `2` = if_else(is.na(`2`), `4`, `2`),
    `4` = if_else(is.na(`4`), `0`, `4`)
  ) %>%
  tidyr::pivot_longer(-ID, names_to = "TIME", values_to = "DV")
# # A tibble: 9 x 3
# ID TIME DV
# <int> <chr> <int>
# 1 1 0 5
# 2 1 2 4
# 3 1 4 4
# 4 2 0 3
# 5 2 2 3
# 6 2 4 3
# 7 3 0 7
# 8 3 2 9
# 9 3 4 9
It might help to visualize what this is doing by looking mid-pipe:
dat %>%
tidyr::pivot_wider(id_cols = "ID", names_from = "TIME", values_from = "DV")
# # A tibble: 3 x 4
# ID `0` `2` `4`
# <int> <int> <int> <int>
# 1 1 5 NA 4
# 2 2 3 3 NA
# 3 3 7 NA 9
dat %>%
  tidyr::pivot_wider(id_cols = "ID", names_from = "TIME", values_from = "DV") %>%
  mutate(
    `2` = if_else(is.na(`2`), `4`, `2`),
    `4` = if_else(is.na(`4`), `0`, `4`)
  )
# # A tibble: 3 x 4
# ID `0` `2` `4`
# <int> <int> <int> <int>
# 1 1 5 4 4
# 2 2 3 3 3
# 3 3 7 9 9
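If you prefer to stay in long format and encode the two rules directly, a grouped case_when() sketch is another option; it assumes each ID has exactly one row per TIME and that the looked-up value is itself not NA (dat is the same data as df1 shown further below):
library(dplyr)
dat %>%
  group_by(ID) %>%
  mutate(DV = case_when(
    is.na(DV) & TIME == 2 ~ DV[TIME == 4],   # time 2 takes the value from time 4
    is.na(DV) & TIME == 4 ~ DV[TIME == 0],   # time 4 takes the value from time 0
    TRUE ~ DV
  )) %>%
  ungroup()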
We could use fill after grouping by 'ID'
library(dplyr)
library(tidyr)
df1 %>%
  arrange(ID, TIME) %>%
  # or, as @r2evans mentioned:
  # arrange(ID, factor(TIME, levels = c(0, 2, 4))) %>%
  group_by(ID) %>%
  fill(DV, .direction = 'downup')
# A tibble: 9 x 3
# Groups: ID [3]
# ID TIME DV
# <int> <int> <int>
#1 1 0 5
#2 1 2 4
#3 1 4 4
#4 2 0 3
#5 2 2 3
#6 2 4 3
#7 3 0 7
#8 3 2 9
#9 3 4 9
data
df1 <- structure(list(ID = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L), TIME = c(0L,
2L, 4L, 0L, 2L, 4L, 0L, 2L, 4L), DV = c(5L, NA, 4L, 3L, 3L, NA,
7L, NA, 9L)), class = "data.frame", row.names = c(NA, -9L))

Order values within column according to values within different column by group in R

I have the following panel data set:
group i f r d
1 4 8 3 3
1 9 4 5 1
1 2 2 2 2
2 5 5 3 2
2 3 9 3 3
2 9 1 3 1
I want to reorder column i in this data frame according to values in column d for each group. So the highest value for group 1 in column i should correspond to the highest value in column d. In the end my data.frame should look like this:
group i f r d
1 9 8 3 3
1 2 4 5 1
1 4 2 2 2
2 5 5 3 2
2 9 9 3 3
2 3 1 3 1
Here is a dplyr solution.
First, group by group. Then get the permutation rearrangement of column d in a temporary new column, ord, and use it to reorder i.
library(dplyr)
df1 %>%
  group_by(group) %>%
  mutate(ord = order(d),
         i = i[ord]) %>%
  ungroup() %>%
  select(-ord)
## A tibble: 6 x 5
# group i f r d
# <int> <int> <int> <int> <int>
#1 1 9 8 3 3
#2 1 2 4 5 1
#3 1 4 2 2 2
#4 2 9 5 3 2
#5 2 5 9 3 3
#6 2 3 1 3 1
original (wrong)
You can achieve this using dplyr and rank:
library(dplyr)
df1 %>% group_by(group) %>%
mutate(i = i[rev(rank(d))])
Edit
This question is actually trickier than it first seems, and the original answer I posted is incorrect. The correct solution orders by i before subsetting by the rank of d. This gives the OP's desired output, which my previous answer did not (I wasn't paying attention!).
df1 %>% group_by(group) %>%
mutate(i = i[order(i)][rank(d)])
# A tibble: 6 x 5
# Groups: group [2]
# group i f r d
# <int> <int> <int> <int> <int>
#1 1 9 8 3 3
#2 1 2 4 5 1
#3 1 4 2 2 2
#4 2 5 5 3 2
#5 2 9 9 3 3
#6 2 3 1 3 1
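As a quick illustration (not from the original answers) of why the two readings differ, take group 2, where i = c(5, 3, 9) and d = c(2, 3, 1):
i <- c(5, 3, 9); d <- c(2, 3, 1)
i[order(d)]       # 9 5 3: the i values listed in ascending order of d
sort(i)[rank(d)]  # 5 9 3: the row with the k-th smallest d gets the k-th smallest i (OP's expected output)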
There is some confusion regarding the expected output. Here I am showing a way to get both the versions of the output.
A base R option using split and mapply:
df$i <- c(mapply(function(x, y) sort(y)[x],
split(df$d, df$group), split(df$i, df$group)))
df
# group i f r d
#1 1 9 8 3 3
#2 1 2 4 5 1
#3 1 4 2 2 2
#4 2 5 5 3 2
#5 2 9 9 3 3
#6 2 3 1 3 1
Or another version
df$i <- c(mapply(function(x, y) y[order(x)],
split(df$d, df$group), split(df$i, df$group)))
df
# group i f r d
#1 1 9 8 3 3
#2 1 2 4 5 1
#3 1 4 2 2 2
#4 2 9 5 3 2
#5 2 5 9 3 3
#6 2 3 1 3 1
We can also use dplyr for this.
For the 1st version:
library(dplyr)
df %>%
  group_by(group) %>%
  mutate(i = sort(i)[d])
The 2nd version is already shown by @Rui using order:
df %>%
  group_by(group) %>%
  mutate(i = i[order(d)])
An option with data.table
library(data.table)
setDT(df1)[, i := i[order(d)], group]
df1
# group i f r d
#1: 1 9 8 3 3
#2: 1 2 4 5 1
#3: 1 4 2 2 2
#4: 2 9 5 3 2
#5: 2 5 9 3 3
#6: 2 3 1 3 1
If we need the second version
setDT(df1)[, i := sort(i)[d], group]
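Note that sort(i)[d] only works here because, within each group, d happens to be a permutation of 1..n; for arbitrary (untied) d values, a more general sketch would use rank():
setDT(df1)[, i := sort(i)[rank(d)], group]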
data
df1 <- structure(list(group = c(1L, 1L, 1L, 2L, 2L, 2L), i = c(4L, 9L,
2L, 5L, 3L, 9L), f = c(8L, 4L, 2L, 5L, 9L, 1L), r = c(3L, 5L,
2L, 3L, 3L, 3L), d = c(3L, 1L, 2L, 2L, 3L, 1L)), class = "data.frame",
row.names = c(NA,
-6L))

R — Assign value to vector based on first episode

So I have a sequence dataset that looks like this
id epnum clockst
1 1 1 0
2 1 2 1
3 1 3 2
4 2 1 4
5 2 2 5
6 2 3 6
7 3 1 4
8 3 2 5
9 3 3 6
What I want is to create a vector ep_start holding, for each id, the clockst value from the row where epnum == 1.
So basically, I want this:
id epnum clockst ep_start
1 1 1 0 0
2 1 2 1 0
3 1 3 2 0
4 2 1 4 4
5 2 2 5 4
6 2 3 6 4
7 3 1 4 4
8 3 2 5 4
9 3 3 6 4
However, I struggle to do so.
I came up with this, but it doesn't fully work.
dt$ep_start = ifelse(dt$epnum == 1 & dt$clockst == 0, 0,
ifelse(dt$epnum == 1 & dt$clockst == 4, 4, -9))
Any idea?
Data
dt = structure(list(id = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L,
3L), .Label = c("1", "2", "3"), class = "factor"), epnum = structure(c(1L,
2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), .Label = c("1", "2", "3"), class = "factor"),
clockst = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 4L, 5L, 6L), .Label = c("0",
"1", "2", "4", "5", "6"), class = "factor")), .Names = c("id",
"epnum", "clockst"), row.names = c(NA, -9L), class = "data.frame")
Here is a solution using tidyverse:
First check the condition epnum == 1: if TRUE, use the clockst value, otherwise NA. Then just fill the NAs with the previous values.
Since clockst is a factor, one needs to convert it to numeric while keeping the same values, so as.numeric(as.character(...)) needs to be used.
library(tidyverse)
dt %>%
  mutate(ep_start = ifelse(epnum == 1, as.numeric(as.character(clockst)), NA)) %>%
  fill(ep_start, .direction = "down")
#output:
id epnum clockst ep_start
1 1 1 0 0
2 1 2 1 0
3 1 3 2 0
4 2 1 4 4
5 2 2 5 4
6 2 3 6 4
7 3 1 4 4
8 3 2 5 4
9 3 3 6 4
Here is a quick comparison of the available answers. I chose to use a 90 k row data set:
df <- df[rep(1:nrow(df), times = 10000),] #where df = dt
dt <- data.table(df)
library(microbenchmark)
bench <- microbenchmark(
  SunBee = dt[, ep_start := .SD[1]$clockst, by = "id"],
  missuse = df %>%
    mutate(ep_start = ifelse(epnum == 1, as.numeric(as.character(clockst)), NA)) %>%
    fill(ep_start, .direction = "down"),
  d.b. = df$clockst[rep(which(df$epnum == 1), rle(cumsum(df$epnum == 1))$lengths)],
  www = df %>%
    arrange(id, epnum) %>%
    group_by(id) %>%
    mutate(ep_start = first(clockst)) %>%
    ungroup()
)
plot(bench)
(The resulting plot, and a repeat with a 900 k row data set, are not reproduced here.)
Oh man, I really need to learn DT.
Another tidyverse solution. arrange is not required if you are certain that the rows are in the right order.
library(dplyr)
dt2 <- dt %>%
  arrange(id, epnum) %>%
  group_by(id) %>%
  mutate(ep_start = first(clockst)) %>%
  ungroup()
dt2
# # A tibble: 9 x 4
# id epnum clockst ep_start
# <fctr> <fctr> <fctr> <fctr>
# 1 1 1 0 0
# 2 1 2 1 0
# 3 1 3 2 0
# 4 2 1 4 4
# 5 2 2 5 4
# 6 2 3 6 4
# 7 3 1 4 4
# 8 3 2 5 4
# 9 3 3 6 4
You can do this with library(data.table) as follows (using dt, the question's data, rather than an object named T, which would mask TRUE):
dt <- data.table(dt)
dt[, ep_start := .SD[1]$clockst, by = "id"]
This gives:
id epnum clockst ep_start
1: 1 1 0 0
2: 1 2 1 0
3: 1 3 2 0
4: 2 1 4 4
5: 2 2 5 4
6: 2 3 6 4
7: 3 1 4 4
8: 3 2 5 4
9: 3 3 6 4
dt$ep_start = dt$clockst[rep(which(dt$epnum == 1), rle(cumsum(dt$epnum == 1))$lengths)]
dt
# id epnum clockst ep_start
#1 1 1 0 0
#2 1 2 1 0
#3 1 3 2 0
#4 2 1 4 4
#5 2 2 5 4
#6 2 3 6 4
#7 3 1 4 4
#8 3 2 5 4
#9 3 3 6 4
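To unpack that one-liner: which() finds the row index of each episode start, rle(cumsum(...))$lengths gives the number of rows covered by each start, and rep() stretches each start index over its episode. A step-by-step sketch of the same idea (assuming, as in the example data, that epnum equals 1 exactly at each episode start):
starts <- which(dt$epnum == 1)                # row index of each episode start
sizes  <- rle(cumsum(dt$epnum == 1))$lengths  # number of rows covered by each start
dt$ep_start <- dt$clockst[rep(starts, sizes)] # recycle the start value over its episode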
Using match
clock = dt[dt$epnum == 1, ]
dt$ep_start = clock$clockst[match(dt$id, clock$id)]
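A base R ave() sketch is also possible, assuming the rows of each id are ordered so that epnum == 1 comes first (as in the example data), and converting the factor as in the tidyverse answer:
dt$ep_start <- ave(as.numeric(as.character(dt$clockst)), dt$id,
                   FUN = function(x) x[1])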
