Iteratively shift variables in data - r

An example of my data:
library(tidyverse)
set.seed(1234)
df <- tibble(
v1 = c(1:6),
v2 = rnorm(6, 5, 2) %>% round,
v3 = rnorm(6, 4, 2) %>% round,
v4 = rnorm(6, 4, 1) %>% round %>% lag(1),
v5 = rnorm(6, 6, 2) %>% round %>% lag(2),
v6 = rnorm(6, 5, 3) %>% round %>% lag(3),
v7 = rnorm(6, 5, 3) %>% round %>% lag(4))
v1 v2 v3 v4 v5 v6 v7
1 1 3 3 NA NA NA NA
2 2 6 3 3 NA NA NA
3 3 7 3 4 4 NA NA
4 4 0 2 5 11 3 NA
5 5 6 3 4 6 1 8
6 6 6 2 3 5 7 4
I want to shift the columns along the diagonal that separates the NA cells from the filled cells.
So, desired output looks like this:
v1 v2 v3 v4 v5 v6 v7
1 NA NA 3 3 4 3 8
2 NA 3 3 4 11 1 4
3 1 6 3 5 6 7 NA
4 2 7 2 4 5 NA NA
5 3 0 3 4 NA NA NA
6 4 6 2 NA NA NA NA
7 5 6 NA NA NA NA NA
8 6 NA NA NA NA NA NA
Each column around v3 is just shifted by 1, 2, 3.. etc rows down and up.
Tried to achieve this inside dplyr::mutate_all() but I failed to iterate it with a lag() and lead() functions.
EDIT: following @wibeasley's advice, I came up with this:
# Shift every column vertically by reshaping to long form, computing a target
# row index for each value, and reshaping back to wide form.
df %>%
# dummy1: target row for the values of v1 (rows 3 to 8 of the desired output)
mutate(dummy1 = c(3:8)) %>%
# long form: one row per (dummy1, var, val)
gather("var", "val", -dummy1) %>%
mutate(
# dummy2: the numeric part of the column name ("v4" -> "4")
dummy2 = sub("v", "", var, fixed = T),
# dummy3: target row of the value; each successive column sits one row higher
dummy3 = dummy1 - as.numeric(dummy2) + 1) %>%
select(-dummy1, -dummy2) %>%
# back to wide form: one row per dummy3, one column per variable
spread(var, val) %>%
# for this data dummy3 runs from -3 to 8; the first four rows (-3 to 0) hold
# only the NA padding introduced by lag(), so they are dropped
slice(-c(1:4)) %>% select(-dummy3)
Looks ugly, but works.

We can use lapply to handle each column, putting NA to the back.
# Within every column, keep the non-missing values in their original order at
# the top and move the NAs to the bottom. Assigning into `df[]` replaces the
# column contents while keeping the object's class and column names.
df[] <- lapply(df, function(col) {
  missing <- is.na(col)
  c(col[!missing], col[missing])
})
df
# # A tibble: 6 x 7
# v1 v2 v3 v4 v5 v6 v7
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 3 3 3 4 3 8
# 2 2 6 3 4 11 1 4
# 3 3 7 3 5 6 7 NA
# 4 4 0 2 4 5 NA NA
# 5 5 6 3 3 NA NA NA
# 6 6 6 2 NA NA NA NA

Related

trying to calculate sum of row with dataframe having NA values

I am trying to compute the row sum whenever at least one column has a value, but my attempt below is not working:
df=data.frame(
x3=c(2,NA,3,5,4,6,NA,NA,3,3),
x4=c(0,NA,NA,6,5,6,NA,0,4,2))
df$summ <- ifelse(is.na(c(df[,"x3"] & df[,"x4"])),NA,rowSums(df[,c("x3","x4")], na.rm=TRUE))
the output should be like
An alternative solution:
library(data.table)
setDT(df)[!( is.na(x3) & is.na(x4)),summ:=rowSums(.SD, na.rm = T)]
You can do :
df <- transform(df, summ = ifelse(is.na(x3) & is.na(x4), NA,
rowSums(df, na.rm = TRUE)))
df
# x3 x4 summ
#1 2 0 2
#2 NA NA NA
#3 3 NA 3
#4 5 6 11
#5 4 5 9
#6 6 6 12
#7 NA NA NA
#8 NA 0 0
#9 3 4 7
#10 3 2 5
In general for any number of columns :
# Row sums over the columns named in `cols`; a row becomes NA only when every
# one of those columns is missing. The sum is restricted to `df[cols]` so that
# other columns (for example a `summ` column from an earlier run) are never
# added in.
cols <- c('x3', 'x4')
df <- transform(df, summ = ifelse(rowSums(is.na(df[cols])) == length(cols),
                                  NA, rowSums(df[cols], na.rm = TRUE)))
Try the code below with rowSums + replace
# Restrict both the sum and the all-NA check to the value columns, and compare
# against ncol(vals) instead of a hard-coded 2, so the statement stays correct
# when `df` already has a `summ` column or gains further columns.
vals <- df[c("x3", "x4")]
df$summ <- replace(rowSums(vals, na.rm = TRUE), rowSums(is.na(vals)) == ncol(vals), NA)
which gives
> df
x3 x4 summ
1 2 0 2
2 NA NA NA
3 3 NA 3
4 5 6 11
5 4 5 9
6 6 6 12
7 NA NA NA
8 NA 0 0
9 3 4 7
10 3 2 5
This is not much different from already posted answers, however, it contains some useful functions:
library(dplyr)
# Row-wise sum that is NA when every value in the row is missing.
# `cur_data()` is deprecated since dplyr 1.1.0; `c_across(everything())`
# already yields the current row's values, so it serves both the all-NA check
# and the sum. The condition is a scalar, hence `if`/`else` instead of
# `ifelse()`, and NA_real_ keeps the column type double.
df %>%
  rowwise() %>%
  mutate(Count = if (all(is.na(c_across(everything())))) {
    NA_real_
  } else {
    sum(c_across(everything()), na.rm = TRUE)
  })
# A tibble: 10 x 3
# Rowwise:
x3 x4 Count
<dbl> <dbl> <dbl>
1 2 0 2
2 NA NA NA
3 3 NA 3
4 5 6 11
5 4 5 9
6 6 6 12
7 NA NA NA
8 NA 0 0
9 3 4 7
10 3 2 5

Removing rows with all NA's, from data.frames within a list

Example Data:
df1 <- as.data.frame(rbind(c(1,2,3), c(1, NA, 4), c(NA, NA, NA), c(4,6,7), c(4, 8, NA)))
df2 <- as.data.frame(rbind(c(1,2,3), c(1, NA, 4), c(4,6,7), c(NA, NA, NA), c(4, 8, NA)))
dfList <- list(df1,df2)
colnames <- c("A","B","C")
dfList[[1]]
V1 V2 V3
1 1 2 3
2 1 NA 4
3 NA NA NA
4 4 6 7
5 4 8 NA
dfList[[2]]
V1 V2 V3
1 1 2 3
2 1 NA 4
3 4 6 7
4 NA NA NA
5 4 8 NA
How do I remove the rows that are empty/have ALL values NA, within each of the data.frames in the list?
Desired outcome:
V1 V2 V3
1 1 2 3
2 1 NA 4
3 4 6 7
4 4 8 NA
V1 V2 V3
1 1 2 3
2 1 NA 4
3 4 6 7
4 4 8 NA
You can use lapply to iterate over the list and rowSums to drop rows with all NA values.
lapply(dfList, function(x) x[rowSums(!is.na(x)) != 0, ])
#[[1]]
# V1 V2 V3
#1 1 2 3
#2 1 NA 4
#4 4 6 7
#5 4 8 NA
#[[2]]
# V1 V2 V3
#1 1 2 3
#2 1 NA 4
#3 4 6 7
#5 4 8 NA
use tidyverse
library(tidyverse)
library(janitor)
map(dfList, remove_empty, which = c("rows"))
[[1]]
V1 V2 V3
1 1 2 3
2 1 NA 4
4 4 6 7
5 4 8 NA
[[2]]
V1 V2 V3
1 1 2 3
2 1 NA 4
3 4 6 7
5 4 8 NA
Here is another solution with all()
lapply(dfList, function(d) d[!apply(is.na(d), 1, all),])

Get summaries of repeated consecutive values by row in R

I'm trying to get some statistics (min, max, mean) of repeated values by row in R.
My dataframe looks similar to this:
b <- as.data.frame(matrix(ncol=7, nrow=3,
c(3,NA,NA,4,5,NA,7,6,NA,7,NA,8,9,NA,NA,4,6,NA,NA,7,NA), byrow = TRUE))
For each row, I want to add columns with the min, max and mean of the number of columns containing consecutive NAs, and it should look something like this:
V1 V2 V3 V4 V5 V6 V7 max min mean
1 3 NA NA 4 5 NA 7 2 1 1.5
2 6 NA 7 NA 8 9 NA 1 1 1.0
3 NA 4 6 NA NA 7 NA 2 1 1.33
This is just a small example of my dataset with 2000 rows and 48 columns.
Does anyone have some code for this?
You can apply over the rows and use rle() to get the "runs" of NA and non-NA columns. Once you have that, you can simply take the summary stats of the runs you are interested in:
# For every row, compute the lengths of the runs of consecutive NAs and
# summarise them. rle(is.na(x)) yields runs of NA (TRUE) and non-NA (FALSE)
# cells; only the TRUE runs are kept, because the question asks about
# consecutive NAs.
b[, c("mean", "max", "min")] <- do.call(rbind, apply(b, 1, function(x) {
  runs <- rle(is.na(x))
  na_runs <- runs[["lengths"]][runs[["values"]]]
  if (length(na_runs) == 0) {
    # A row without any NA has no runs; report NA instead of -Inf/Inf warnings.
    na_runs <- NA_integer_
  }
  data.frame(mean = mean(na_runs), max = max(na_runs), min = min(na_runs))
}))
b
# V1 V2 V3 V4 V5 V6 V7 mean max min
#1 3 NA NA 4 5 NA 7 1.500000 2 1
#2 6 NA 7 NA 8 9 NA 1.000000 1 1
#3 NA 4 6 NA NA 7 NA 1.333333 2 1
A dplyr solution with rle, which computes the lengths of runs of equal values in a vector.
library(dplyr)
# Per row, collect the lengths of the NA runs in a list-column `rl`, summarise
# them, then drop the helper column. Every `%>%` ends its line: a line that
# starts with `%>%` is a syntax error in R.
b %>%
  cbind(
    b %>%
      rowwise() %>%
      do(rl = {
        # unlist() hands rle() a plain vector whether `.` is a list or a
        # one-row data frame.
        runs <- rle(is.na(unlist(.)))
        runs$lengths[runs$values]
      })
  ) %>%
  rowwise() %>%
  mutate(
    mean = mean(rl),
    max = max(rl),
    min = min(rl)
  ) %>%
  select(-rl)
# V1 V2 V3 V4 V5 V6 V7 max min mean
# <int> <int> <int> <int> <int> <int> <int> <int> <int> <dbl>
# 1 3 NA NA 4 5 NA 7 2 1 1.50
# 2 6 NA 7 NA 8 9 NA 1 1 1.00
# 3 NA 4 6 NA NA 7 NA 2 1 1.33

fill column with previous column if NA

I have a data frame like this
df <- data.frame(v1 = 10:14, v2 = c(NA, 1, NA, 3, 6), v3 = c(1, NA, NA, 9, 4))
v1 v2 v3
1 10 NA 1
2 11 1 NA
3 12 NA NA
4 13 3 9
5 14 6 4
I now want to fill the NAs with the value of the previous column, so it looks like this:
v1 v2 v3
1 10 10 1
2 11 1 1
3 12 12 12
4 13 3 9
5 14 6 4
I know how to do this manually, like this:
df$v2 <- ifelse(is.na(df$v2), df$v1, df$v2)
How can I automate this for a full data frame with many columns?
You can do this with fill from tidyr:
library(dplyr)
library(tidyr)
data.frame(t(df)) %>%
fill(., names(.)) %>%
t()
Result:
v1 v2 v3
X1 10 10 1
X2 11 1 1
X3 12 12 12
X4 13 3 9
X5 14 6 4
Note:
Basically, I transposed df, filled every column downward, then transposed it back to the original orientation
# Walk the columns left to right, filling each NA from the column before it.
# seq_len(ncol(df))[-1] is empty for a one-column data frame, whereas
# 2:ncol(df) would yield c(2, 1) and fail on the missing second column.
for (i in seq_len(ncol(df))[-1]) {
  df[[i]] <- ifelse(is.na(df[[i]]), df[[i - 1]], df[[i]])
}
This will propagate values across streaks of NA columns. If you don't want this, just reverse the order of the indexes in the for loop declaration.
Another option using Reduce with ifelse:
df[] <- Reduce(function(x, y) ifelse(is.na(y), x, y), df, accumulate = TRUE)
df
# v1 v2 v3
#1 10 10 1
#2 11 1 1
#3 12 12 12
#4 13 3 9
#5 14 6 4
You could use apply but note that the output will be a matrix
# Row-wise "last observation carried forward", from left to right.
t(apply(df, 1, function(x) {
  observed <- !is.na(x)
  # cumsum(observed) counts the non-NA values seen so far, so it must index
  # the vector of observed values, not x itself (indexing x gives wrong fills
  # for rows such as c(1, NA, 3, NA)). The leading NA slot covers cells that
  # have no observed value to their left.
  carried <- c(NA, x[observed])[cumsum(observed) + 1]
  replace(x, !observed, carried[!observed])
}))
# v1 v2 v3
#[1,] 10 10 1
#[2,] 11 1 1
#[3,] 12 12 12
#[4,] 13 3 9
#[5,] 14 6 4
By using zoo na.locf
# na.rm = FALSE keeps leading NAs in place, so every row keeps its length and
# apply() still returns a matrix; the default (na.rm = TRUE) would drop them
# and produce rows of different lengths. The zoo:: prefix makes the call work
# without a prior library(zoo).
data.frame(t(apply(df, 1, function(x) zoo::na.locf(x, na.rm = FALSE))))
v1 v2 v3
1 10 10 1
2 11 1 1
3 12 12 12
4 13 3 9
5 14 6 4

Sort dataframe in a function

I am trying to create a function which takes a dataframe and the columns by which I want to sort as arguments. This is what I have come up with:
# Intended to return `df` sorted by the columns named in `columns`.
# NOTE(review): order() is handed the data frame `df[, columns]` as a single
# argument rather than one vector per column. In the two-column use case shown
# below the result has ten rows for a five-row input, which is where the
# all-NA rows in the output come from.
sortDf <- function(df, columns){
df <- df[order(df[,columns]),]
return(df)
}
This is my usecase:
set.seed(24)
dataset <- matrix(sample(c(NA, 1:5), 25, replace = TRUE), 5)
df <- as.data.frame(dataset)
sortedDf <- sortDf(df, c('V1', 'V2'))
However, I get this as a result:
V1 V2 V3 V4 V5
3 1 1 5 3 4
5 1 5 2 5 2
NA NA NA NA NA NA
NA.1 NA NA NA NA NA
NA.2 NA NA NA NA NA
NA.3 NA NA NA NA NA
1 5 2 1 2 5
4 5 2 1 2 1
NA.4 NA NA NA NA NA
2 NA 4 NA 1 4
The dataframe is kinda sorted but where does the 'NA' come from and how can I remove them? What do I do wrong? I want to sort descending. Thanks in advance.
We can create a different function
#' Sort a data frame by one or more columns.
#'
#' @param dat A data frame.
#' @param cols Character vector of column names (or column indices) to sort
#'   by, in order of priority.
#' @param decreasing Passed on to `order()`; `FALSE` (the default) sorts
#'   ascending, `TRUE` sorts descending.
#' @return `dat` with its rows reordered; always a data frame, even when
#'   `dat` has a single column.
f1 <- function(dat, cols, decreasing = FALSE) {
  # Nothing to sort by: hand the data back unchanged.
  if (length(cols) == 0) {
    return(dat)
  }
  # Drop the column names so that a column called e.g. "decreasing" or
  # "na.last" is treated as a sort key, not as an argument of order().
  keys <- unname(as.list(dat[cols]))
  row_order <- do.call(order, c(keys, list(decreasing = decreasing)))
  dat[row_order, , drop = FALSE]
}
f1(df, c("V1", "V2"))
# V1 V2 V3 V4 V5
#2 1 1 2 1 3
#1 1 5 3 5 NA
#5 3 1 1 NA 1
#4 3 4 4 3 NA
#3 4 4 4 NA 4
In the OP's code, the order is applied on a data.frame instead of a vector. It can be used either separately or within do.call i.e.
df[order(df$V1, df$V2),]
# V1 V2 V3 V4 V5
#2 1 1 2 1 3
#1 1 5 3 5 NA
#5 3 1 1 NA 1
#4 3 4 4 3 NA
#3 4 4 4 NA 4
gives the same result as f1. So either the columns can be mentioned individually (which is impractical when there are many columns), or do.call can be used.
This can also be implemented using the devel version of dplyr (soon to be released 0.6.0) with quosures. After taking the input vector, it is converted to quosures (parse_quosures) and then evaluated by unquoting (!!!) it in arrange
library(dplyr)
# Sort `dat` by the columns named in the character vector `cols`, using dplyr.
# rlang::parse_quosures() is deprecated (replaced by parse_quos()), and pasting
# names into code to be parsed breaks for non-syntactic column names.
# rlang::syms() turns each name into a symbol, and `!!!` splices those symbols
# into arrange() as separate sort keys.
f2 <- function(dat, cols) {
  dat %>%
    arrange(!!!rlang::syms(cols))
}
f2(df, c("V1", "V2"))
# V1 V2 V3 V4 V5
#1 1 1 2 1 3
#2 1 5 3 5 NA
#3 3 1 1 NA 1
#4 3 4 4 3 NA
#5 4 4 4 NA 4
data
set.seed(24)
df <- as.data.frame(matrix(sample(c(NA, 1:5), 25, replace = TRUE), 5))

Resources