Remove trailing NA by group in a data.frame - r

I have a data.frame with a grouping variable, and some NAs in the value column.
df = data.frame(group=c(1,1,2,2,2,2,2,3,3), value1=1:9, value2=c(NA,4,9,6,2,NA,NA,1,NA))
I can use zoo::na.trim to remove NA at the end of a column: this will remove the last line of the data.frame:
library(zoo)
library(dplyr)
df %>% na.trim(sides="right")
Now I want to remove the trailing NAs by group; how can I achieve this using dplyr?
Expected output for value2 column: c(NA, 4,9,6,2,1)

You could write a little helper function that checks for trailing NAs of a vector and then use group_by and filter.
# Logical mask over x: FALSE for every element that belongs to the run of NAs
# at the end of x, TRUE for everything before that run.
f <- function(x) {
  # Counting non-missing values from the end: the count stays 0 only while we
  # are still inside the trailing NA run.
  seen_from_end <- cumsum(rev(!is.na(x)))
  rev(seen_from_end) != 0
}
library(dplyr)
df %>%
group_by(group) %>%
filter(f(value2))
# A tibble: 6 x 3
# Groups: group [3]
group value1 value2
<dbl> <int> <dbl>
1 1 1 NA
2 1 2 4
3 2 3 9
4 2 4 6
5 2 5 2
6 3 8 1
edit
If we need to remove both leading and trailing NAs, we need to extend that function a bit.
# Logical mask over x: TRUE from the first non-NA element through the last
# non-NA element (NAs in between are kept), FALSE for leading and trailing NAs.
f1 <- function(x) {
  present <- !is.na(x)
  after_first_value <- cumsum(present) != 0
  before_last_value <- rev(cumsum(rev(present))) != 0
  after_first_value & before_last_value
}
Given df1
df1 = data.frame(group=c(1,1,2,2,2,2,2,3,3), value1=1:9, value2=c(NA,4,9,NA,2,NA,NA,1,NA))
df1
# group value1 value2
#1 1 1 NA
#2 1 2 4
#3 2 3 9
#4 2 4 NA
#5 2 5 2
#6 2 6 NA
#7 2 7 NA
#8 3 8 1
#9 3 9 NA
We get this result
df1 %>%
group_by(group) %>%
filter(f1(value2))
# A tibble: 5 x 3
# Groups: group [3]
group value1 value2
<dbl> <int> <dbl>
1 1 2 4
2 2 3 9
3 2 4 NA
4 2 5 2
5 3 8 1

Using lapply, loop through group:
do.call("rbind", lapply(split(df, df$group), na.trim, sides = "right"))
# group value1 value2
# 1.1 1 1 NA
# 1.2 1 2 4
# 2.3 2 3 9
# 2.4 2 4 6
# 2.5 2 5 2
# 3 3 8 1
Or using by, as mentioned by @Henrik:
do.call("rbind", by(df, df$group, na.trim, sides = "right"))

Related

Roll max in R. From first row to current row

I would like to calculate max value from first row to current row
df <- data.frame(id = c(1,1,1,1,2,2,2), value = c(2,5,3,2,4,5,4), result = c(NA,2,5,5,NA,4,5))
I have tried grouping by id with dplyr and using the rollmax function from zoo, but did not succeed.
1) rollmax is used with a fixed width but here we have a variable width so using rollapplyr, which seems close to the approach of the question, we have:
library(dplyr)
library(zoo)
df %>%
group_by(id) %>%
mutate(out = lag(rollapplyr(value, 1:n(), max))) %>%
ungroup
giving:
# A tibble: 7 x 4
# Groups: id [2]
id value result out
<dbl> <dbl> <dbl> <dbl>
1 1 2 NA NA
2 1 5 2 2
3 1 3 5 5
4 1 2 5 5
5 2 4 NA NA
6 2 5 4 4
7 2 4 5 5
2) It is also possible to perform the grouping via the width (second) argument of rollapplyr like this eliminating dplyr. In this case the widths are 1, 2, 3, 4, 1, 2, 3 and Max is like max except it does not use the last element of its argument x. (An alternate expression for the width would be seq_along(id) - match(id, id) + 1).
library(zoo)
# Maximum of x ignoring its last element; NA when x holds a single element
# (there is nothing before the current value to take a maximum of).
Max <- function(x) {
  if (length(x) == 1) {
    return(NA)
  }
  max(head(x, n = -1))
}
transform(df, out = rollapplyr(value, sequence(rle(id)$lengths), Max))
giving:
id value result out
1 1 2 NA NA
2 1 5 2 2
3 1 3 5 5
4 1 2 5 5
5 2 4 NA NA
6 2 5 4 4
7 2 4 5 5
A data.table option using shift + cummax
> setDT(df)[, result2 := shift(cummax(value)), id][]
id value result result2
1: 1 2 NA NA
2: 1 5 2 2
3: 1 3 5 5
4: 1 2 5 5
5: 2 4 NA NA
6: 2 5 4 4
7: 2 4 5 5
library(dplyr)
df |>
group_by(id) |>
mutate(result = lag(cummax(value)))
# # A tibble: 7 x 3
# # Groups: id [2]
# id value result
# <dbl> <dbl> <dbl>
# 1 1 2 NA
# 2 1 5 2
# 3 1 3 5
# 4 1 2 5
# 5 2 4 NA
# 6 2 5 4
# 7 2 4 5
Here is a base R solution. This would just get you the cumulative maximum:
# Cumulative maximum of `value` within each `id`.
# The grouping column is spelled out in full: `df$i` only reached `id` through
# partial matching of `$`, which yields NULL on a tibble or as soon as a second
# column starts with "i" -- and ave() with a NULL grouping silently computes
# the cumulative maximum over the whole column instead of per group.
df$result <- ave(df$value, df$id, FUN = cummax)
To get the cumulative maximum with the lag you wanted:
# Lagged cumulative maximum of `value` within each `id`: the first row of a
# group gets NA, every later row gets the maximum of the values before it.
# The grouping column is spelled out in full: `df$i` only reached `id` through
# partial matching of `$`, which yields NULL on a tibble or as soon as a second
# column starts with "i", making ave() silently ignore the grouping.
df$result <- ave(
  df$value,
  df$id,
  FUN = function(x) c(NA, cummax(x[-length(x)]))
)

Filter duplicated rows that have non-matching variable values in R

I am trying to filter rows that have duplicated ids: among the duplicates, I need to keep only the rows whose values do not match.
Here is the sample dataset.
df <- data.frame(
id = c(1,2,2,3,4,5,5,6),
cat = c(3,3,4,5,2,2,1,5),
actual.cat = c(3,4,4,5,2,1,1,7))
> df
id cat actual.cat
1 1 3 3
2 2 3 4
3 2 4 4
4 3 5 5
5 4 2 2
6 5 2 1
7 5 1 1
8 6 5 7
So, each id has cat and actual.cat. When there is a duplicated id, I need to filter the nonmatching row.
Here what I need.
> df
id cat actual.cat
1 3 3
2 3 4
3 5 5
4 2 2
5 2 1
6 5 7
Any ideas on this?
Thanks!
We can do a group by operation and filter
library(dplyr)
df %>%
group_by(id) %>%
filter(n() > 1 & cat != actual.cat|n() == 1)
-output
# A tibble: 6 x 3
# Groups: id [6]
# id cat actual.cat
# <dbl> <dbl> <dbl>
#1 1 3 3
#2 2 3 4
#3 3 5 5
#4 4 2 2
#5 5 2 1
#6 6 5 7
Or using base R
subset(df, id %in% names(which(table(id) > 1)) &
cat != actual.cat| id %in% names(which(table(id) == 1)))
In base R, you can use subset with ave to select rows in each id where number of rows in each group is 1 or cat is not equal to actual.cat.
subset(df, ave(cat != actual.cat, id, FUN = function(x) length(x) == 1 | x))
# id cat actual.cat
#1 1 3 3
#2 2 3 4
#4 3 5 5
#5 4 2 2
#6 5 2 1
#8 6 5 7
You can also write this logic in data.table :
library(data.table)
setDT(df)[, .SD[.N == 1 | cat != actual.cat], id]

is there a way in R to subtract two rows within a group by specifying another grouping var?

Say I have something like this:
ID = c("a","a","a","a","a", "b","b","b","b","b")
Group = c("1","2","3","4","5", "1","2","3","4","5")
Value = c(3, 4,2,4,3, 6, 1, 8, 9, 10)
df<-data.frame(ID,Group,Value)
I want to subtract the group=3 value from the group=5 value within each ID, with an output column which has this difference for each ID like so:
ID Group Value Want
1 a 1 3 1
2 a 2 4 1
3 a 3 2 1
4 a 4 4 1
5 a 5 3 1
6 b 1 6 2
7 b 2 1 2
8 b 3 8 2
9 b 4 9 2
10 b 5 10 2
Also, if that calculation cannot be done (i.e. group 5 is missing), NA values for the 'want' column would be ideal.
As there is only one unique 'Group' per 'ID', we can do subsetting
library(dplyr)
df %>%
group_by(ID) %>%
mutate(want = Value[Group == 5] - Value[Group == 3])
# A tibble: 10 x 4
# Groups: ID [2]
# ID Group Value want
# <fct> <fct> <dbl> <dbl>
# 1 a 1 3 1
# 2 a 2 4 1
# 3 a 3 2 1
# 4 a 4 4 1
# 5 a 5 3 1
# 6 b 1 6 2
# 7 b 2 1 2
# 8 b 3 8 2
# 9 b 4 9 2
#10 b 5 10 2
The above can be made more error-proof if we convert to numeric index and get the first element. When there are no TRUE, by using [1], it returns NA
df %>%
slice(-10) %>%
group_by(ID) %>%
mutate(want = Value[which(Group == 5)[1]] - Value[which(Group == 3)[1]])
Or use match which returns an index of NA if there are no matches, and anything with NA index returns NA which will subsequently return NA in subtraction (NA -3)
df %>%
slice(-10) %>% # drop row 10 (ID "b", Group "5") so that ID "b" has no Group 5 value
group_by(ID) %>%
mutate(want = Value[match(5, Group)] - Value[match(3, Group)])
Here is a base R solution
dfout <- Reduce(rbind,
lapply(split(df,df$ID),
function(x) within(x, Want <-diff(subset(Value, Group %in% c("3","5"))))))
such that
> dfout
ID Group Value Want
1 a 1 3 1
2 a 2 4 1
3 a 3 2 1
4 a 4 4 1
5 a 5 3 1
6 b 1 6 2
7 b 2 1 2
8 b 3 8 2
9 b 4 9 2
10 b 5 10 2
A data.table method:
library(data.table)
setDT(df)[, want := (Value[Group == 5] - Value[Group == 3]), by = .(ID)]
df
# ID Group Value want
# 1: a 1 3 1
# 2: a 2 4 1
# 3: a 3 2 1
# 4: a 4 4 1
# 5: a 5 3 1
# 6: b 1 6 2
# 7: b 2 1 2
# 8: b 3 8 2
# 9: b 4 9 2
# 10: b 5 10 2
Here is a solution using base R.
# For every ID, compute Value at Group "5" minus Value at Group "3" and store
# it in `Want` on all rows of that ID. The result is NA when either group does
# not occur exactly once within the ID.
unsplit(
  lapply(
    split(df, df$ID),
    function(d) {
      # Value recorded for `group` in this ID, or NA unless there is exactly
      # one such row.
      value_for <- function(group) {
        hit <- d$Value[d$Group == group]
        if (length(hit) == 1) hit else NA
      }
      d$Want <- value_for("5") - value_for("3")
      d
    }
  ),
  df$ID
)

Shift certain values in a column of dataframe to a next column in R

I have a dataframe that looks like the following:
x y z
1 2 3
1 2 3
1 2 3
2 3
1 2 3
1 3
I would like to ask if there is a command in R that will allow to obtain the following dataframe (by shifting and aligning similar values)
x y z
1 2 3
1 2 3
1 2 3
NA 2 3
1 2 3
1 NA 3
An alternative solution, where the main idea is to capture the pattern of your dataset based on rows that don't have NAs and then perform some reshaping using the pattern you captured.
df = read.table(text = "
x y z
1 2 3
1 2 3
1 2 3
2 3 NA
1 2 3
1 3 NA
", header= T)
library(tidyverse)
# get the column names of your dataset
names = names(df)
# get unique values after omitting rows with NAs
value = unlist(unique(na.omit(df)))
# create a dataset with names and values
# (this is the pattern you want to follow)
df3 = data.frame(names, value)
df %>%
mutate(id = row_number()) %>% # flag the row number
gather(v,value,-id) %>% # reshape
na.omit() %>% # remove rows with NAs
left_join(df3, by="value") %>% # join info about your pattern
select(-v) %>% # remove that column
spread(names, value) %>% # reshape
select(-id) # remove row number
# x y z
# 1 1 2 3
# 2 1 2 3
# 3 1 2 3
# 4 NA 2 3
# 5 1 2 3
# 6 1 NA 3
library(tidyverse)
df %>%
pmap_dfr(~{ x <- list(...)
if(any(is.na(x))) intersect(x, df[1,]) # match with first row's values to assign names
else x})
Output:
# # A tibble: 6 x 3
# x y z
# <int> <int> <int>
# 1 1 2 3
# 2 1 2 3
# 3 1 2 3
# 4 NA 2 3
# 5 1 2 3
# 6 1 NA 3
reading your data:
df<- fread("x y z
1 2 3
1 2 3
1 2 3
2 3 NA
1 2 3
1 3 NA") %>% setDF
code:
library(magrittr)
# Most frequent value of v. On ties, the value that appears first in v wins,
# because which.max() returns the first maximum.
getmode <- function(v) {
  candidates <- unique(v)
  # match() maps every element to the position of its candidate, so
  # tabulate() yields one occurrence count per candidate.
  counts <- tabulate(match(v, candidates))
  candidates[which.max(counts)]
}
pattern <- sapply(df,getmode)
df[!complete.cases(df),] %<>% apply(1,function(x){tmp<-pattern;tmp[!(tmp%in%x)] <- NA;return(tmp)}) %>% t %>% data.frame
result:
> df
x y z
1 1 2 3
2 1 2 3
3 1 2 3
4 NA 2 3
5 1 2 3
6 1 NA 3

Shifting rows up in columns and flush remaining ones

I have a problem with shifting the rows up by one row. When rows become completely NA, I would like to drop them (see the pic below). However, my current approach still keeps the second row of each group.
Here is my approach
data <- data.frame(gr=c(rep(1:3,each=2)),A=c(1,NA,2,NA,4,NA), B=c(NA,1,NA,3,NA,7),C=c(1,NA,4,NA,5,NA))
> data
gr A B C
1 1 1 NA 1
2 1 NA 1 NA
3 2 2 NA 4
4 2 NA 3 NA
5 3 4 NA 5
6 3 NA 7 NA
so using this approach
data.frame(apply(data,2,function(x){x[complete.cases(x)]}))
gr A B C
1 1 1 1 1
2 1 2 3 4
3 2 4 7 5
4 2 1 1 1
5 3 2 3 4
6 3 4 7 5
As we can see still I am having the second rows in each group!
The expected output
> data
gr A B C
1 1 1 1 1
2 2 2 3 4
3 3 4 7 5
thanks!
If there's at most one valid value per gr, you can use na.omit then take the first value from it:
data %>% group_by(gr) %>% summarise_all(~ na.omit(.)[1])
# [1] is optional depending on your actual data
# A tibble: 3 x 4
# gr A B C
# <int> <dbl> <dbl> <dbl>
#1 1 1 1 1
#2 2 2 3 4
#3 3 4 7 5
You can do it with dplyr and tidyr (fill() comes from tidyr) like this:
# Alternate 1, 2 down the rows so that the second row of each pair can be
# selected afterwards. `replace` is not an argument of rep(); the target
# length is given explicitly instead of relying on recycling during `$<-`,
# which fails when the number of rows is odd.
data$ind <- rep(c(1, 2), length.out = nrow(data))
data %>% fill(A,B,C) %>% filter(ind == 2) %>% mutate(ind=NULL)
gr A B C
1 1 1 1 1
2 2 2 3 4
3 3 4 7 5
Depending on how consistent your full data is, this may need to be adjusted.
One more solution using data.table:-
data <- data.frame(gr=c(rep(1:3,each=2)),A=c(1,NA,2,NA,4,NA), B=c(NA,1,NA,3,NA,7),C=c(1,NA,4,NA,5,NA))
library(data.table)
library(zoo)
setDT(data)
data[, A := na.locf(A), by = gr]
data[, B := na.locf(B), by = gr]
data[, C := na.locf(C), by = gr]
data <- unique(data)
data
gr A B C
1: 1 1 1 1
2: 2 2 3 4
3: 3 4 7 5

Resources