New variable that indicates the first occurrence of a specific value - r

I want to create a new variable that indicates the first specific observation of a value for a variable.
In the following example dataset I want to have a new variable "firstna" that is "1" for the first observation of "NA" for this player.
game_data <- data.frame(player = c(1,1,1,1,2,2,2,2), level = c(1,2,3,4,1,2,3,4), points = c(20,NA,NA,NA,20,40,NA,NA))
game_data
player level points
1 1 1 20
2 1 2 NA
3 1 3 NA
4 1 4 NA
5 2 1 20
6 2 2 40
7 2 3 NA
8 2 4 NA
The resulting dataframe should look like this:
game_data_new <- data.frame(player = c(1,1,1,1,2,2,2,2), level = c(1,2,3,4,1,2,3,4), points = c(20,NA,NA,NA,20,40,NA,NA), firstna = c(0,1,0,0,0,0,1,0))
game_data_new
player level points firstna
1 1 1 20 0
2 1 2 NA 1
3 1 3 NA 0
4 1 4 NA 0
5 2 1 20 0
6 2 2 40 0
7 2 3 NA 1
8 2 4 NA 0
To be honest i don't know how to do this. It would be perfect if there is a dplyr option to do so.

A base R solution:
ave(game_data$points, game_data$player,
FUN = function(x) seq_along(x) == match(NA, x, nomatch = 0))

Another ave option to find out first NA by group (player).
game_data$firstna <- ave(game_data$points, game_data$player,
FUN = function(x) cumsum(is.na(x)) == 1)
game_data
# player level points firstna
#1 1 1 20 0
#2 1 2 NA 1
#3 1 3 NA 0
#4 1 4 NA 0
#5 2 1 20 0
#6 2 2 40 0
#7 2 3 NA 1
#8 2 4 NA 0

Here is a solution with data.table:
library("data.table")
game_data <- data.table(player = c(1,1,1,1,2,2,2,2), level = c(1,2,3,4,1,2,3,4), points = c(20,NA,NA,NA,20,40,NA,NA))
game_data[, firstna:=is.na(points) & !is.na(shift(points)), player][]
# > game_data[, firstna:=is.na(points) & !is.na(shift(points)), player][]
# player level points firstna
# 1: 1 1 20 FALSE
# 2: 1 2 NA TRUE
# 3: 1 3 NA FALSE
# 4: 1 4 NA FALSE
# 5: 2 1 20 FALSE
# 6: 2 2 40 FALSE
# 7: 2 3 NA TRUE
# 8: 2 4 NA FALSE

You can do this by grouping by player and then mutating to check if a row has an NA value and the previous row doesn't
game_data %>%
group_by(player) %>%
mutate(firstna = ifelse(is.na(points) & lag(!is.na(points)),1,0)) %>%
ungroup()
Result:
# A tibble: 8 x 4
# Groups: player [2]
player level points firstna
<dbl> <dbl> <dbl> <dbl>
1 1 1 20 0
2 1 2 NA 1
3 1 3 NA 0
4 1 4 NA 0
5 2 1 20 0
6 2 2 40 0
7 2 3 NA 1
8 2 4 NA 0

library(tidyverse)
library(data.table)
data.frame(
player = c(1,1,1,1,2,2,2,2),
level = c(1,2,3,4,1,2,3,4),
points = c(20,NA,NA,NA,20,40,NA,NA)
) -> game_data
game_data_base1 <- game_data
game_data_dt <- data.table(game_data)
microbenchmark::microbenchmark(
better_base = game_data$first_na <- ave(
game_data$points,
game_data$player,
FUN=function(x) seq_along(x)==match(NA,x,nomatch=0)
),
brute_base = do.call(
rbind.data.frame,
lapply(
split(game_data, game_data$player),
function(x) {
x$firstna <- 0
na_loc <- which(is.na(x$points))
if (length(na_loc) > 0) x$firstna[na_loc[1]] <- 1
x
}
)
),
tidy = game_data %>%
group_by(player) %>%
mutate(firstna=as.numeric(is.na(points) & !duplicated(points))) %>%
ungroup(),
dt = game_data_dt[, firstna:=as.integer(is.na(points) & !is.na(shift(points))), player]
)
## Unit: microseconds
## expr min lq mean median uq max neval
## better_base 125.188 156.861 362.9829 191.6385 355.6675 3095.958 100
## brute_base 366.642 450.002 2782.6621 658.0380 1072.6475 174373.974 100
## tidy 998.924 1119.022 2528.3687 1509.0705 2516.9350 42406.778 100
## dt 330.428 421.211 1031.9978 535.8415 1042.1240 9671.991 100

game_data %>%
group_by(player) %>%
mutate(firstna=as.numeric(is.na(points) & !duplicated(points)))
Group by player, then create a boolean vector for cases that are both NA and not duplicates for previous rows.
# A tibble: 8 x 4
# Groups: player [2]
player level points firstna
<dbl> <dbl> <dbl> <dbl>
1 1 1 20 0
2 1 2 NA 1
3 1 3 NA 0
4 1 4 NA 0
5 2 1 20 0
6 2 2 40 0
7 2 3 NA 1
8 2 4 NA 0
If you want the 1s on the last non-NA line before an NA, replace the mutate line with this:
mutate(lastnonNA=as.numeric(!is.na(points) & is.na(lead(points))))
First row of a block of NAs that runs all the way to the end of the player's group:
game_data %>%
group_by(player) %>%
mutate(firstna=as.numeric(is.na(points) & !duplicated(cbind(points,cumsum(!is.na(points))))))

Another way using base:
game_data$firstna <-
unlist(
tapply(game_data$points, game_data$player, function(x) {i<-which(is.na(x))[1];x[]<-0;x[i]<-1;x})
)
or as another ?ave clone:
ave(game_data$points, game_data$player, FUN = function(x) {
i<-which(is.na(x))[1];x[]<-0;x[i]<-1;x
})

An option using diff
transform(game_data, firstna = ave(is.na(points), player, FUN = function(x) c(0,diff(x))))
# player level points firstna
# 1 1 1 20 0
# 2 1 2 NA 1
# 3 1 3 NA 0
# 4 1 4 NA 0
# 5 2 1 20 0
# 6 2 2 40 0
# 7 2 3 NA 1
# 8 2 4 NA 0
And its dplyr equivalent:
library(dplyr)
game_data %>% group_by(player) %>% mutate(firstna = c(0,diff(is.na(points))))
# # A tibble: 8 x 4
# # Groups: player [2]
# player level points firstna
# <dbl> <dbl> <dbl> <dbl>
# 1 1 1 20 0
# 2 1 2 NA 1
# 3 1 3 NA 0
# 4 1 4 NA 0
# 5 2 1 20 0
# 6 2 2 40 0
# 7 2 3 NA 1
# 8 2 4 NA 0

Related

Filling in NA values with a sequence by group

I have a data set that looks like the following:
ID Count
1 0
1 1
1 NA
1 2
1 NA
1 NA
1 NA
1 NA
1 NA
2 0
2 NA
2 NA
2 3
The first row of each ID starts with 0. I want to fill the NA values with sequential values by group. If there are values before and after the NA values, I need to fill the NA values with a sequence counting up to the first value after the NA values. If there are no values after the NA values, I need to fill the NA values with a sequence counting up from the last value before the NA value. The output should look like following:
ID Count
1 0
1 1
1 1
1 2
1 3
1 4
1 5
1 6
1 7
2 0
2 1
2 2
2 3
This is a little complicated, but I think this does what you want. I left all my helper columns in so you can see what's happening, but the non-needed columns can all be dropped at the end.
library(dplyr)
library(vctrs)
df %>%
group_by(ID, na_group = cumsum(!is.na(Count))) %>%
mutate(n_til_non_na = ifelse(is.na(Count), rev(row_number()), 0L)) %>%
group_by(ID) %>%
mutate(
fill_down = vec_fill_missing(Count, direction = "down"),
fill_up = vec_fill_missing(Count, direction = "up"),
result = case_when(
is.na(fill_up) ~ fill_down + cumsum(is.na(fill_up)),
is.na(Count) ~ fill_up - n_til_non_na,
TRUE ~ Count
)
) %>%
ungroup()
# # A tibble: 13 × 7
# ID Count na_group n_til_non_na fill_down fill_up result
# <int> <int> <int> <int> <int> <int> <int>
# 1 1 0 1 0 0 0 0
# 2 1 1 2 0 1 1 1
# 3 1 NA 2 1 1 2 1
# 4 1 2 3 0 2 2 2
# 5 1 NA 3 5 2 NA 3
# 6 1 NA 3 4 2 NA 4
# 7 1 NA 3 3 2 NA 5
# 8 1 NA 3 2 2 NA 6
# 9 1 NA 3 1 2 NA 7
# 10 2 0 4 0 0 0 0
# 11 2 NA 4 2 0 3 1
# 12 2 NA 4 1 0 3 2
# 13 2 3 5 0 3 3 3
Using this sample data:
df = read.table(text = 'ID Count
1 0
1 1
1 NA
1 2
1 NA
1 NA
1 NA
1 NA
1 NA
2 0
2 NA
2 NA
2 3', header = T)
You can use purrr::accumulate(), first backwards, then forward. While going backwards, replace each missing value with the previous value - 1 to count down; then while moving forwards, replace remaining missing values with the previous value + 1 to count up.
library(dplyr)
library(purrr)
dat %>%
group_by(ID) %>%
mutate(
Count = accumulate(
Count,
\(x, y) ifelse(is.na(x), y - 1, x),
.dir = "backward"
),
Count = accumulate(
Count,
\(x, y) ifelse(is.na(y), x + 1, y)
)
) %>%
ungroup()
# A tibble: 13 × 2
ID Count
<dbl> <dbl>
1 1 0
2 1 1
3 1 1
4 1 2
5 1 3
6 1 4
7 1 5
8 1 6
9 1 7
10 2 0
11 2 1
12 2 2
13 2 3

how to move up the values within each group in R

I need to shift valid values to the top the of dataframe withing each id. Here is an example dataset:
df <- data.frame(id = c(1,1,1,2,2,2,3,3,3,3),
itemid = c(1,2,3,1,2,3,1,2,3,4),
values = c(1,NA,0,NA,NA,0,1,NA,0,NA))
df
id itemid values
1 1 1 1
2 1 2 NA
3 1 3 0
4 2 1 NA
5 2 2 NA
6 2 3 0
7 3 1 1
8 3 2 NA
9 3 3 0
10 3 4 NA
excluding the id column, when there is a missing value in values column, I want to shift all values aligned to the top for each id.
How can I get this desired dataset below?
df1
id itemid values
1 1 1 1
2 1 2 0
3 1 3 NA
4 2 1 0
5 2 2 NA
6 2 3 NA
7 3 1 1
8 3 2 0
9 3 3 NA
10 3 4 NA
Using tidyverse you can arrange by whether values is missing or not (which will put those at the bottom).
library(tidyverse)
df %>%
arrange(id, is.na(values))
Output
id itemid values
<dbl> <dbl> <dbl>
1 1 1 1
2 1 3 0
3 1 2 NA
4 2 3 0
5 2 1 NA
6 2 2 NA
7 3 1 1
8 3 3 0
9 3 2 NA
10 3 4 NA
Or, if you wish to retain the same order for itemid and other columns, you can use mutate to specifically order columns of interest (like values). Other answers provide good solutions, such as #Santiago and #ThomasIsCoding. If you have multiple columns of interest to move NA to the bottom per group, you can also try:
df %>%
group_by(id) %>%
mutate(across(.cols = values, ~values[order(is.na(.))]))
where the .cols argument would contain the columns to transform and reorder independently.
Output
id itemid values
<dbl> <dbl> <dbl>
1 1 1 1
2 1 2 0
3 1 3 NA
4 2 1 0
5 2 2 NA
6 2 3 NA
7 3 1 1
8 3 2 0
9 3 3 NA
10 3 4 NA
We can try ave + order
> transform(df, values = ave(values, id, FUN = function(x) x[order(is.na(x))]))
id itemid values
1 1 1 1
2 1 2 0
3 1 3 NA
4 2 1 0
5 2 2 NA
6 2 3 NA
7 3 1 1
8 3 2 0
9 3 3 NA
10 3 4 NA
With data.table:
library(data.table)
setDT(df)[, values := values[order(is.na(values))], id][]
#> id itemid values
#> 1: 1 1 1
#> 2: 1 2 0
#> 3: 1 3 NA
#> 4: 2 1 0
#> 5: 2 2 NA
#> 6: 2 3 NA
#> 7: 3 1 1
#> 8: 3 2 0
#> 9: 3 3 NA
#> 10: 3 4 NA
I'd define a function that does what you want and then group by id:
completed_first <- function(x) {
completed <- x[!is.na(x)]
length(completed) <- length(x)
completed
}
library(dplyr)
df %>%
group_by(id) %>%
mutate(
values = completed_first(values)
) %>%
ungroup()
# # A tibble: 10 × 3
# id itemid values
# <dbl> <dbl> <dbl>
# 1 1 1 1
# 2 1 2 0
# 3 1 3 NA
# 4 2 1 0
# 5 2 2 NA
# 6 2 3 NA
# 7 3 1 1
# 8 3 2 0
# 9 3 3 NA
# 10 3 4 NA
(This method preserves the order of itemid.)
Or building upon ThomasIsCoding's answer:
library(dplyr)
df %>%
group_by(id) %>%
mutate(
values = values[order(is.na(values))]
) %>%
ungroup()
# # A tibble: 10 × 3
# id itemid values
# <dbl> <dbl> <dbl>
# 1 1 1 1
# 2 1 2 0
# 3 1 3 NA
# 4 2 1 0
# 5 2 2 NA
# 6 2 3 NA
# 7 3 1 1
# 8 3 2 0
# 9 3 3 NA
# 10 3 4 NA

Create a new dataframe with 1's and 0's from summarized data?

I have the below dataset that I am working with in R:
df <- data.frame(day=seq(1,3,1), tot.infected=c(1,2,4), tot.ind=5)
df
And I would like to transform the tot.infected column into a binomial variable with 1's and 0's, such as the following dataframe:
df2 <- data.frame(year = c(rep(1,5), rep(2,5), rep(3,5)), infected = c(rep(1,1), rep(0,4), rep(1,2), rep(0,3), rep(1,4), rep(0,1)))
Is there a more elegant way to do this in R?
Thank you for your help!
I tried hard-coding a dataframe using rep(), but this is extremely time-consuming for large datasets and I was looking for a more elegant way to achieve this.
base R
tmp <- do.call(Map, c(list(f = function(y, inf, ind) data.frame(year = y, infected = replace(integer(ind), seq(ind) <= inf, 1L))), unname(df)))
do.call(rbind, tmp)
# year infected
# 1 1 1
# 2 1 0
# 3 1 0
# 4 1 0
# 5 1 0
# 6 2 1
# 7 2 1
# 8 2 0
# 9 2 0
# 10 2 0
# 11 3 1
# 12 3 1
# 13 3 1
# 14 3 1
# 15 3 0
dplyr
library(dplyr)
df %>%
rowwise() %>%
summarize(tibble(year = day, infected = replace(integer(tot.ind), seq(tot.ind) <= tot.infected, 1L)))
# # A tibble: 15 x 2
# year infected
# <dbl> <int>
# 1 1 1
# 2 1 0
# 3 1 0
# 4 1 0
# 5 1 0
# 6 2 1
# 7 2 1
# 8 2 0
# 9 2 0
# 10 2 0
# 11 3 1
# 12 3 1
# 13 3 1
# 14 3 1
# 15 3 0
We can do it this way:
library(dplyr)
df %>%
group_by(day) %>%
summarise(cur_data()[seq(unique(tot.ind)),]) %>%
#mutate(x = row_number())
mutate(tot.infected = ifelse(row_number() <= first(tot.infected),
first(tot.infected)/first(tot.infected), 0), .keep="used")
day tot.infected
<dbl> <dbl>
1 1 1
2 1 0
3 1 0
4 1 0
5 1 0
6 2 1
7 2 1
8 2 0
9 2 0
10 2 0
11 3 1
12 3 1
13 3 1
14 3 1
15 3 0
Using rep.int and replace, basically.
with(df, data.frame(
year=do.call(rep.int, unname(df[c(1, 3)])),
infected=unlist(Map(replace, Map(rep.int, 0, tot.ind), lapply(tot.infected, seq), 1))
))
# year infected
# 1 1 1
# 2 1 0
# 3 1 0
# 4 1 0
# 5 1 0
# 6 2 1
# 7 2 1
# 8 2 0
# 9 2 0
# 10 2 0
# 11 3 1
# 12 3 1
# 13 3 1
# 14 3 1
# 15 3 0
Data:
df <- structure(list(day = c(1, 2, 3), tot.infected = c(1, 2, 4), tot.ind = c(5,
5, 5)), class = "data.frame", row.names = c(NA, -3L))

is there a way in R to subtract two rows within a group by specifying another grouping var?

Say I have something like this:
ID = c("a","a","a","a","a", "b","b","b","b","b")
Group = c("1","2","3","4","5", "1","2","3","4","5")
Value = c(3, 4,2,4,3, 6, 1, 8, 9, 10)
df<-data.frame(ID,Group,Value)
I want to subtract group=5 from group=3 within the ID, with an output column which has this difference for each ID like so:
ID Group Value Want
1 a 1 3 1
2 a 2 4 1
3 a 3 2 1
4 a 4 4 1
5 a 5 3 1
6 b 1 6 2
7 b 2 1 2
8 b 3 8 2
9 b 4 9 2
10 b 5 10 2
Also, if that calculation cannot be done (i.e. group 5 is missing), NA values for the 'want' column would be ideal.
As there is only one unique 'Group' per 'ID', we can do subsetting
library(dplyr)
df %>%
group_by(ID) %>%
mutate(want = Value[Group == 5] - Value[Group == 3])
# A tibble: 10 x 4
# Groups: ID [2]
# ID Group Value want
# <fct> <fct> <dbl> <dbl>
# 1 a 1 3 1
# 2 a 2 4 1
# 3 a 3 2 1
# 4 a 4 4 1
# 5 a 5 3 1
# 6 b 1 6 2
# 7 b 2 1 2
# 8 b 3 8 2
# 9 b 4 9 2
#10 b 5 10 2
The above can be made more error-proof if we convert to numeric index and get the first element. When there are no TRUE, by using [1], it returns NA
df %>%
slice(-10) %>%
group_by(ID) %>%
mutate(want = Value[which(Group == 5)[1]] - Value[which(Group == 3)[1]])
Or use match which returns an index of NA if there are no matches, and anything with NA index returns NA which will subsequently return NA in subtraction (NA -3)
df %>%
slice(-10) %>% # removing the last row where Group is 10
group_by(ID) %>%
mutate(want = Value[match(5, Group)] - Value[match(3, Group)])
Here is a base R solution
dfout <- Reduce(rbind,
lapply(split(df,df$ID),
function(x) within(x, Want <-diff(subset(Value, Group %in% c("3","5"))))))
such that
> dfout
ID Group Value Want
1 a 1 3 1
2 a 2 4 1
3 a 3 2 1
4 a 4 4 1
5 a 5 3 1
6 b 1 6 2
7 b 2 1 2
8 b 3 8 2
9 b 4 9 2
10 b 5 10 2
A data.table method:
library(data.table)
setDT(df)[, want := (Value[Group == 5] - Value[Group == 3]), by = .(ID)]
df
# ID Group Value want
# 1: a 1 3 1
# 2: a 2 4 1
# 3: a 3 2 1
# 4: a 4 4 1
# 5: a 5 3 1
# 6: b 1 6 2
# 7: b 2 1 2
# 8: b 3 8 2
# 9: b 4 9 2
# 10: b 5 10 2
Here is a solution using base R.
unsplit(
lapply(
split(df, df$ID),
function(d) {
x5 = d$Value[d$Group == "5"]
x5 = ifelse(length(x5) == 1, x5, NA)
x3 = d$Value[d$Group == "3"]
x3 = ifelse(length(x3) == 1, x3, NA)
d$Want = x5 - x3
d
}),
df$ID)

How to remove rows by condition

I'm trying to delete rows for which the condition is not satisfying
eg.Remove that Subject row which do not have all period's value
following is the dataframe
Subject Period
1 1
1 2
1 3
2 1
2 2
2 3
3 1
3 2
4 1
4 2
4 3
Subject Period
1 1
1 2
1 3
2 1
2 2
2 3
4 1
4 2
4 3
A dplyr solution.
library(dplyr)
dat %>%
group_by(Subject) %>%
filter(all(unique(dat$Period) %in% Period)) %>%
ungroup()
# # A tibble: 9 x 2
# Subject Period
# <int> <int>
# 1 1 1
# 2 1 2
# 3 1 3
# 4 2 1
# 5 2 2
# 6 2 3
# 7 4 1
# 8 4 2
# 9 4 3
A base R solution.
dat_list <- split(dat, f = dat$Subject)
keep_vec <- sapply(dat_list, function(x) all(unique(dat$Period) %in% x$Period))
dat_keep <- dat_list[keep_vec]
dat2 <- do.call(rbind, dat_keep)
dat2
# Subject Period
# 1.1 1 1
# 1.2 1 2
# 1.3 1 3
# 2.4 2 1
# 2.5 2 2
# 2.6 2 3
# 4.9 4 1
# 4.10 4 2
# 4.11 4 3
A solution using purrr and dplyr.
library(purrr)
library(dplyr)
dat2 <- dat %>%
split(f = .$Subject) %>%
keep(~all(unique(dat$Period) %in% .x$Period)) %>%
bind_rows()
dat2
# Subject Period
# 1 1 1
# 2 1 2
# 3 1 3
# 4 2 1
# 5 2 2
# 6 2 3
# 7 4 1
# 8 4 2
# 9 4 3
DATA
dat <- read.table(text = "Subject Period
1 1
1 2
1 3
2 1
2 2
2 3
3 1
3 2
4 1
4 2
4 3",
header = TRUE)
Consider ave for inline aggregation then subset accordingly:
sub_df <- subset(df, ave(Period, Subject, FUN=max) != 3)

Resources