Calculate standard deviation across multiple rows grouped by ID

I want to calculate the standard deviation across multiple rows (not per row) and then save the results into a new data frame. Best to explain using an example.
Data:
ID <- c("a","a","a","a","b","b","b","b","c","c","c","c")
y1 <- c(8,9,3,6,6,4,5,8,7,5,8,1)
y2 <- c(3,6,6,1,7,3,8,7,5,8,1,7)
y3 <- c(9,3,1,8,4,6,3,8,4,6,5,7)
df <- data.frame(ID, y1, y2, y3)
ID y1 y2 y3
1 a 8 3 9
2 a 9 6 3
3 a 3 6 1
4 a 6 1 8
5 b 6 7 4
6 b 4 3 6
7 b 5 8 3
8 b 8 7 8
9 c 7 5 4
10 c 5 8 6
11 c 8 1 5
12 c 1 7 7
I want to calculate the standard deviation of all the y values for each ID ("a", "b" and "c") and store the results in a new data frame. I know I can do this:
sd_a <- sd(as.matrix(subset(df, ID == "a", select = -ID)), na.rm = TRUE)
sd_b <- sd(as.matrix(subset(df, ID == "b", select = -ID)), na.rm = TRUE)
sd_c <- sd(as.matrix(subset(df, ID == "c", select = -ID)), na.rm = TRUE)
ID <- c("a","b","c")
sd <- c(sd_a,sd_b,sd_c)
df2 <- data.frame(ID, sd)
ID sd
1 a 2.958040
2 b 1.912875
3 c 2.386833
But is there a more straightforward way of achieving this?

You can use pivot_longer() to stack y1 to y3 and then calculate the sd.
library(dplyr)
library(tidyr)
df %>%
  pivot_longer(y1:y3) %>%
  group_by(ID) %>%
  summarise(sd = sd(value))
# # A tibble: 3 x 2
# ID sd
# <chr> <dbl>
# 1 a 2.96
# 2 b 1.91
# 3 c 2.39

One dplyr solution could be:
df %>%
  group_by(ID) %>%
  summarise(sd = sd(unlist(cur_data())))
ID sd
<fct> <dbl>
1 a 2.96
2 b 1.91
3 c 2.39
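In newer dplyr (>= 1.1.0), cur_data() is deprecated in favour of pick(); a sketch of the same idea:
df %>%
  group_by(ID) %>%
  summarise(sd = sd(unlist(pick(everything()))))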

In base R you can do:
aggregate(values ~ ID, cbind(df[1], stack(df[-1])), sd)
ID values
1 a 2.958040
2 b 1.912875
3 c 2.386833
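For completeness, tapply() gives a base R one-liner as well (a sketch; it returns a named vector rather than a data frame, and relies on unlist() stacking the y columns one after another, so the IDs must be repeated to match):
tapply(unlist(df[-1]), rep(df$ID, ncol(df) - 1), sd)
#        a        b        c
# 2.958040 1.912875 2.386833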

Related

Efficient recoding of numeric variables into a factor in a data.frame

In recoding values of numeric variables like var1 below into character values, sometimes there is an easy pattern. For example, suppose numeric values 1:4 in var1 need to be recoded as LETTERS[27-(4:1)], respectively.
In such situations, is it possible to avoid writing var1 = recode(var1, `1`="W", `2`="X", `3`="Y", `4`="Z") and instead loop the recoding?
library(tidyverse)
(dat <- data.frame(var1 = rep(1:4,2), id = 1:8))
mutate(dat, var1 = recode(var1,`1`="W",`2`="X",`3`="Y",`4`="Z")) # This works but can we
# loop it as well?
We can use a vectorized approach, no loops necessary. tail and base subsetting with [ will do the trick here.
library(dplyr)
dat %>% mutate(var1=tail(LETTERS, max(var1))[var1] %>% as.factor)
var1 id
1 W 1
2 X 2
3 Y 3
4 Z 4
5 W 5
6 X 6
7 Y 7
8 Z 8
data
dat <- data.frame(var1 = rep(1:4,2), id = 1:8)
data2
dat2 <- data.frame(var1 = c(2,1,3,1,4:1), id = 1:8)
var1 id
1 2 1
2 1 2
3 3 3
4 1 4
5 4 5
6 3 6
7 2 7
8 1 8
output2
var1 id
1 X 1
2 W 2
3 Y 3
4 W 4
5 Z 5
6 Y 6
7 X 7
8 W 8
You can use -
library(dplyr)
dat %>% mutate(var1 = LETTERS[length(LETTERS)-max(var1) + var1])
# var1 id
#1 W 1
#2 X 2
#3 Y 3
#4 Z 4
#5 W 5
#6 X 6
#7 Y 7
#8 Z 8
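As a quick check, the same expression handles the shuffled dat2 shown above and reproduces output2:
dat2 <- data.frame(var1 = c(2,1,3,1,4:1), id = 1:8)
dat2 %>% mutate(var1 = LETTERS[length(LETTERS) - max(var1) + var1])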
You can also just use the labels argument of factor():
library(dplyr)
dat <- data.frame(var1 = rep(1:4,2), id = 1:8) %>%
  mutate(var1 = factor(var1, labels = tail(LETTERS, 4)))
dat
var1 id
1 W 1
2 X 2
3 Y 3
4 Z 4
5 W 5
6 X 6
7 Y 7
8 Z 8
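One caveat, illustrated with hypothetical data: factor() requires exactly as many labels as levels, so if some of the values 1:4 can be absent, pin the levels explicitly:
dat3 <- data.frame(var1 = c(1, 2, 2, 4), id = 1:4) # hypothetical data where value 3 is absent
dat3 %>% mutate(var1 = factor(var1, levels = 1:4, labels = tail(LETTERS, 4)))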

If a column is NA, calculate row mean on other columns using dplyr

In the example below, how can I calculate the row mean when column A is NA? The row mean would replace the NA in column A. Using base R, I can use this:
library(tibble)
foo <- tibble(A = c(3,5,NA,6,NA,7,NA),
              B = c(4,5,4,5,6,4,NA),
              C = c(6,5,2,8,8,5,NA))
foo
tmp <- rowMeans(foo[,-1],na.rm = TRUE)
foo$A[is.na(foo$A)] <- tmp[is.na(foo$A)]
foo$A[is.nan(foo$A)] <- NA
Curious how I might do this with dplyr?
You can use ifelse():
library(dplyr)
foo %>%
  mutate(A = ifelse(is.na(A), rowMeans(., na.rm = TRUE), A),
         A = replace(A, is.nan(A), NA))
# A B C
# <dbl> <dbl> <dbl>
#1 3 4 6
#2 5 5 5
#3 3 4 2
#4 6 5 8
#5 7 6 8
#6 7 4 5
#7 NA NA NA
Here is a solution that replaces NA not only in column A, but in all columns of the data frame.
library(dplyr)
foo2 <- foo %>%
  mutate(RowMean = rowMeans(., na.rm = TRUE)) %>%
  mutate(across(-RowMean,
                .fns = function(x) ifelse(is.na(x) & !is.nan(RowMean), RowMean, x))) %>%
  select(-RowMean)
Another option: compute the row mean in a helper column and use if_else():
foo %>%
  mutate(m = rowMeans(across(), na.rm = TRUE),
         A = if_else(is.na(A) & !is.na(m), m, A)) %>%
  select(-m)
# # A tibble: 7 x 3
# A B C
# <dbl> <dbl> <dbl>
# 1 3 4 6
# 2 5 5 5
# 3 3 4 2
# 4 6 5 8
# 5 7 6 8
# 6 7 4 5
# 7 NA NA NA
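For a version that actually uses coalesce(), here is a sketch on the same foo data: m is a temporary helper column, and replace() turns the NaN produced by all-NA rows back into NA before coalescing:
foo %>%
  mutate(m = rowMeans(across(everything()), na.rm = TRUE),
         A = coalesce(A, replace(m, is.nan(m), NA))) %>%
  select(-m)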

Join and sum columns together in R

I have a dataframe:
df <- data.frame(ca = c("a","b","a","c","b","b"),
                 f = c(3,4,0,NA,3,4),
                 f2 = c(NA,5,6,1,9,7),
                 f3 = c(3,0,6,3,0,8))
I want to join and sum my columns "f" and "f2" and rename the result "f_new".
Example:
df <- data.frame(ca = c("a","b","a","c","b","b"),
                 f_new = c(3,9,6,1,12,11),
                 f3 = c(3,0,6,3,0,8))
Do you have an idea of how to do this with summarise, spread, group_by?
Using dplyr you can do this:
df %>%
  rowwise() %>%
  mutate(f_new = sum(f, f2, na.rm = TRUE))
# A tibble: 6 x 5
# ca f f2 f3 f_new
# <fct> <dbl> <dbl> <dbl> <dbl>
#1 a 3 NA 3 3
#2 b 4 5 0 9
#3 a 0 6 6 6
#4 c NA 1 3 1
#5 b 3 9 0 12
#6 b 4 7 8 11
With na.rm = TRUE, NA values are ignored in the sum, so a row still gets a total when only one of f and f2 is NA.
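To match the desired f_new/f3 layout exactly, the original columns can be dropped afterwards (a sketch extending the code above):
df %>%
  rowwise() %>%
  mutate(f_new = sum(f, f2, na.rm = TRUE)) %>%
  ungroup() %>%
  select(ca, f_new, f3)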
Here is an answer using tidyverse methods from dplyr and tidyr
library(tidyverse)
df <- data.frame(ca = c("a","b","a","c","b","b"),
                 f = c(3,4,0,NA,3,4),
                 f2 = c(NA,5,6,1,9,7),
                 f3 = c(3,0,6,3,0,8))
df %>%
  replace_na(list(f = 0, f2 = 0)) %>%
  mutate(f_new = f + f2)
#> ca f f2 f3 f_new
#> 1 a 3 0 3 3
#> 2 b 4 5 0 9
#> 3 a 0 6 6 6
#> 4 c 0 1 3 1
#> 5 b 3 9 0 12
#> 6 b 4 7 8 11
dplyr can do this quite nicely with the following code. rowwise() lets you consider each row separately, and the mutate() call sums whichever columns you want. na.rm = TRUE handles the case where you have NAs and want to ignore them; as a comment mentioned, without it the sum is NA whenever any of the summed values is NA.
library(dplyr)
df %>%
  rowwise() %>%
  mutate(f_new = sum(f, f2, na.rm = TRUE))

Unique pairwise distances between any two points in a data frame

I have a list of ten points with X and Y coordinates. I would like to calculate the distance between every pair of points, keeping only one of each unordered pair: precisely, only one of the distances 1-2 and 2-1 should be present. I have managed to remove the distance of each point from itself, but couldn't achieve this deduplication.
# Data Generation
df <- data.frame(X = runif(10, 0, 1), Y = runif(10, 0, 1), ID = 1:10)
# Temporary key creation
library(dplyr)
df <- df %>% mutate(key = 1)
# Calculating pairwise distances
df %>% full_join(df, by = "key") %>%
  mutate(dist = sqrt((X.x - X.y)^2 + (Y.x - Y.y)^2)) %>%
  select(ID.x, ID.y, dist) %>%
  filter(dist != 0) %>%
  head(11)
# Output
# ID.x ID.y dist
# 1 1 2 0.90858911
# 2 1 3 0.71154587
# 3 1 4 0.05687495
# 4 1 5 1.03885510
# 5 1 6 0.93747717
# 6 1 7 0.62070415
# 7 1 8 0.88351690
# 8 1 9 0.89651911
# 9 1 10 0.05079906
# 10 2 1 0.90858911
# 11 2 3 0.27530175
How to achieve the expected output shown below?
# Expected Output
# ID.x ID.y dist
# 1 1 2 0.90858911
# 2 1 3 0.71154587
# 3 1 4 0.05687495
# 4 1 5 1.03885510
# 5 1 6 0.93747717
# 6 1 7 0.62070415
# 7 1 8 0.88351690
# 8 1 9 0.89651911
# 9 1 10 0.05079906
# 10 2 3 0.27530175
# 11 2 4 0.5415415
But this approach is computationally slow compared to dist(). I would be happy to hear about faster approaches.
I would use dist on the data and then process the output into the required format. You can replace dist with any other distance function. Here I've used letters rather than numbers as IDs to better show what is happening:
library(tidyverse)
set.seed(42)
df <- data.frame(X = runif(10, 0, 1), Y = runif(10, 0, 1), ID = letters[1:10])
df %>%
  column_to_rownames("ID") %>% # make the IDs the rownames; dist() will use them (NB: does not work on a tibble)
  dist() %>%
  as.matrix() %>%
  as.data.frame() %>%
  rownames_to_column(var = "ID.x") %>% # capture the row IDs
  gather(key = ID.y, value = dist, -ID.x) %>%
  filter(ID.x < ID.y) %>%
  as_tibble()
# A tibble: 45 x 3
ID.x ID.y dist
<chr> <chr> <dbl>
1 a b 0.2623175
2 a c 0.7891034
3 b c 0.6856994
4 a d 0.2191960
5 b d 0.4757855
6 c d 0.8704269
7 a e 0.2730984
8 b e 0.3913770
9 c e 0.5912681
10 d e 0.2800021
# ... with 35 more rows
dist is very fast compared with looping through the distance calculations.
The code could probably be made more efficient by working directly on the dist object rather than converting it into a matrix.
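For instance, a sketch that skips the matrix conversion entirely: the dist object stores the lower triangle of the distance matrix column-wise, which lines up with the pair order produced by combn():
d <- dist(df[, c("X", "Y")]) # distances on the coordinates only
pairs <- t(combn(nrow(df), 2)) # all unordered ID pairs, in matching order
data.frame(ID.x = df$ID[pairs[, 1]],
           ID.y = df$ID[pairs[, 2]],
           dist = as.vector(d))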
Perhaps this is a slightly simpler approach:
df <- data.frame(X = runif(10, 0, 1), Y = runif(10, 0, 1), ID = 1:10)
df2 <- data.frame(ID1 = rep(1:10, each = 10),
                  ID2 = 1:10,
                  distance = as.vector(as.matrix(dist(df[, c("X", "Y")])))) # use only the coordinate columns, or the ID column inflates the distances
Then get rid of diagonal:
df2 <- df2[df2$ID1 != df2$ID2,]
Get rid of upper triangle:
df2 <- df2[df2$ID1 < df2$ID2,]
df2
ID1 ID2 distance
2 1 2 1.000615
3 1 3 2.057813
4 1 4 3.010261
5 1 5 4.039502
6 1 6 5.029982
7 1 7 6.035427
8 1 8 7.012540
9 1 9 8.006249
10 1 10 9.015352
13 2 3 1.099245
14 2 4 2.011664
...

R, dplyr: cumulative version of n_distinct

I have a dataframe as follows. It is ordered by column time.
Input -
df = data.frame(time = 1:20,
                grp = sort(rep(1:5, 4)),
                var1 = rep(c('A','B'), 10))
head(df,10)
time grp var1
1 1 1 A
2 2 1 B
3 3 1 A
4 4 1 B
5 5 2 A
6 6 2 B
7 7 2 A
8 8 2 B
9 9 3 A
10 10 3 B
I want to create another variable var2 which computes the number of distinct var1 values so far, i.e. up to that point in time, within each group grp. This is a little different from what I'd get if I were to use n_distinct.
Expected output -
time grp var1 var2
1 1 1 A 1
2 2 1 B 2
3 3 1 A 2
4 4 1 B 2
5 5 2 A 1
6 6 2 B 2
7 7 2 A 2
8 8 2 B 2
9 9 3 A 1
10 10 3 B 2
I want to create a function say cum_n_distinct for this and use it as -
d_out = df %>%
  arrange(time) %>%
  group_by(grp) %>%
  mutate(var2 = cum_n_distinct(var1))
A dplyr solution inspired by @akrun's answer.
The logic is basically to set the first occurrence of each unique value of var1 to 1 and the rest to 0 within each group grp, and then apply cumsum:
df = df %>%
  arrange(time) %>%
  group_by(grp, var1) %>%
  mutate(var_temp = ifelse(row_number() == 1, 1, 0)) %>%
  group_by(grp) %>%
  mutate(var2 = cumsum(var_temp)) %>%
  select(-var_temp)
head(df,10)
Source: local data frame [10 x 4]
Groups: grp
time grp var1 var2
1 1 1 A 1
2 2 1 B 2
3 3 1 A 2
4 4 1 B 2
5 5 2 A 1
6 6 2 B 2
7 7 2 A 2
8 8 2 B 2
9 9 3 A 1
10 10 3 B 2
Assuming stuff is ordered by time already, first define a cumulative distinct function:
dist_cum <- function(var)
  sapply(seq_along(var), function(x) length(unique(head(var, x))))
Then a base solution that uses ave to create groups (note, assumes var1 is factor), and then applies our function to each group:
transform(df, var2=ave(as.integer(var1), grp, FUN=dist_cum))
A data.table solution, basically doing the same thing:
library(data.table)
(data.table(df)[, var2:=dist_cum(var1), by=grp])
And dplyr, again, same thing:
library(dplyr)
df %>% group_by(grp) %>% mutate(var2=dist_cum(var1))
With your new dataset, an approach in base R:
df$var2 <- unlist(lapply(split(df, df$grp), function(x) {
  x$var2 <- 0
  indx <- match(unique(x$var1), x$var1)
  x$var2[indx] <- 1
  cumsum(x$var2)
}))
head(df,7)
# time grp var1 var2
# 1 1 1 A 1
# 2 2 1 B 2
# 3 3 1 A 2
# 4 4 1 B 2
# 5 5 2 A 1
# 6 6 2 B 2
# 7 7 2 A 2
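An even more compact base R equivalent (a sketch): !duplicated() flags the first occurrence of each value, and cumsum() turns those flags into a running distinct count within each group:
df$var2 <- with(df, ave(as.integer(factor(var1)), grp,
                        FUN = function(x) cumsum(!duplicated(x))))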
Here's another solution using data.table that's pretty quick.
Generic Function
cum_n_distinct <- function(x, na.include = TRUE){
  # Given a vector x, returns a corresponding vector y
  # where the ith element of y gives the number of unique
  # elements observed up to and including index i.
  # If na.include = TRUE (default), NA is counted as an
  # additional unique element; otherwise it's essentially ignored.
  temp <- data.table(x, idx = seq_along(x))
  firsts <- temp[temp[, .I[1L], by = x]$V1]
  if(na.include == FALSE) firsts <- firsts[!is.na(x)]
  y <- rep(0, times = length(x))
  y[firsts$idx] <- 1
  y <- cumsum(y)
  return(y)
}
Example Use
cum_n_distinct(c(5,10,10,15,5)) # 1 2 2 3 3
cum_n_distinct(c(5,NA,10,15,5)) # 1 2 3 4 4
cum_n_distinct(c(5,NA,10,15,5), na.include = FALSE) # 1 1 2 3 3
Solution To Your Question
d_out = df %>%
  arrange(time) %>%
  group_by(grp) %>%
  mutate(var2 = cum_n_distinct(var1))
