Average neighbours inside a vector - r

My data:
data <- c(1,5,11,15,24,31,32,65)
There are two neighbours here: 31 and 32. I wish to remove them and keep only their mean (31.5), so that data becomes:
data <- c(1,5,11,15,24,31.5,65)
It seems simple, but I wish to do it automatically, sometimes with vectors containing more neighbours. For instance:
data_2 <- c(1,5,11,15,24,31,32,65,99,100,101,140)

Here is an idea that creates a group id via cumsum(c(TRUE, diff(a) > 1)), where 1 is the gap threshold, i.e.
#our group variable
grp <- cumsum(c(TRUE, diff(a) > 1))
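#for a this gives grp = 1 2 3 4 5 6 6 7, so the neighbours 31 and 32 share group id 6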
#keep only elements in groups of length 1 (i.e. with no neighbour);
#!!! is just triple negation, equivalent to !, turning the 0/1 result of ave() into TRUE for singletons
i1 <- a[!!!ave(a, grp, FUN = function(i) length(i) > 1)]
#find the mean of each group with more than one element (singleton groups give NaN)
i2 <- unname(tapply(a, grp, function(i) mean(i[length(i) > 1])))
#concatenate the two (dropping the NaN entries of i2); note the means end up at the end rather than in their original positions
c(i1, i2[!is.na(i2)])
#[1] 1.0 5.0 11.0 15.0 24.0 65.0 31.5
You can also wrap it in a function. I left the gap as a parameter so you can adjust it:
get_vec <- function(x, gap) {
  grp <- cumsum(c(TRUE, diff(x) > gap))
  i1 <- x[!!!ave(x, grp, FUN = function(i) length(i) > 1)]
  i2 <- unname(tapply(x, grp, function(i) mean(i[length(i) > 1])))
  return(c(i1, i2[!is.na(i2)]))
}
get_vec(a, 1)
#[1] 1.0 5.0 11.0 15.0 24.0 65.0 31.5
get_vec(a_2, 1)
#[1] 1.0 5.0 11.0 15.0 24.0 65.0 140.0 31.5 100.0
DATA:
a <- c(1,5,11,15,24,31,32,65)
a_2 <- c(1, 5, 11, 15, 24, 31, 32, 65, 99, 100, 101, 140)

Here is my solution, which uses run-length encoding to identify groups:
foo <- function(x) {
  y <- x - seq_along(x) #normalize so neighbours become equal; for data, y = 0 3 8 11 19 25 25 57
  ind <- rle(y) #run-length encoding
  ind$values <- ind$lengths != 1 #runs longer than 1 are groups
  ind$values[ind$values] <- cumsum(ind$values[ind$values]) #group ids (0 for singletons)
  ind <- inverse.rle(ind)
  xnew <- x
  xnew[ind != 0] <- ave(x, ind, FUN = mean)[ind != 0] #calculate group means
  xnew[!(duplicated(ind) & ind != 0)] #keep one value per group
}
foo(data)
#[1] 1.0 5.0 11.0 15.0 24.0 31.5 65.0
foo(data_2)
#[1] 1.0 5.0 11.0 15.0 24.0 31.5 65.0 100.0 140.0
data_3 <- c(1, 2, 4, 1, 2)
foo(data_3)
#[1] 1.5 4.0 1.5
I assume that you don't need an extremely efficient solution. If you do, I'd recommend a simple C++ for loop in Rcpp.
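For reference, here is a rough sketch of what such an Rcpp loop might look like (my own untested sketch, hard-coding a gap of 1; avg_neighbours is a made-up name):
library(Rcpp)
cppFunction('
NumericVector avg_neighbours(NumericVector x) {
  std::vector<double> out;
  int n = x.size();
  int i = 0;
  while (i < n) {
    int j = i;
    double sum = x[i];
    // extend the run while consecutive values differ by exactly 1
    while (j + 1 < n && x[j + 1] - x[j] == 1) {
      ++j;
      sum += x[j];
    }
    out.push_back(sum / (j - i + 1)); // mean of the run; the value itself for singletons
    i = j + 1;
  }
  return wrap(out);
}')
avg_neighbours(data)
#expected: 1.0 5.0 11.0 15.0 24.0 31.5 65.0 (order preserved)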

I have a data.table-based solution; the same could be translated into dplyr, I guess:
library(data.table)
df <- data.table(data2 = c(1,5,11,15,24,31,32,65,99,100,101,140))
df[, neighbours := ifelse(c(0, diff(data2)) == 1, 1, 0)]
df[, neighbours := pmax(neighbours, shift(neighbours, type = "lead", fill = 0))]
df[, neigh_seq := rleid(neighbours)]
unique(df[, ifelse(neighbours == 1, mean(data2), data2), by = neigh_seq])
neigh_seq V1
1: 1 1.0
2: 1 5.0
3: 1 11.0
4: 1 15.0
5: 1 24.0
6: 2 31.5
7: 3 65.0
8: 4 100.0
9: 5 140.0
What it does:
The first line sets neighbours to 1 if the difference with the previous number is 1:
data2 neighbours
1: 1 0
2: 5 0
3: 11 0
4: 15 0
5: 24 0
6: 31 0
7: 32 1
8: 65 0
9: 99 0
10: 100 1
11: 101 1
12: 140 0
I want the flag to be 1 for every member of a run of neighbours, so I also need to flag the element just before each run of 1s. Taking the maximum of each flag and the flag that follows it (shift with type = "lead") does exactly that:
df[, neighbours := pmax(neighbours, shift(neighbours, type = "lead", fill = 0))]
data2 neighbours
1: 1 0
2: 5 0
3: 11 0
4: 15 0
5: 24 0
6: 31 1
7: 32 1
8: 65 0
9: 99 1
10: 100 1
11: 101 1
12: 140 0
Then I group on the changing neighbours value and set the value to the group mean where they are neighbours:
df[, ifelse(neighbours == 1, mean(data2), data2), by = rleid(neighbours)]
rleid V1
1: 1 1.0
2: 1 5.0
3: 1 11.0
4: 1 15.0
5: 1 24.0
6: 2 31.5
7: 2 31.5
8: 3 65.0
9: 4 100.0
10: 4 100.0
11: 4 100.0
12: 5 140.0
Finally, take the unique rows. Et voilà.

This is a dplyr version, also using cumsum(c(1, diff(x) != 1)) as the grouping variable:
library(dplyr)
data_2 %>% data.frame(x = .) %>%
  group_by(id = cumsum(c(1, diff(x) != 1))) %>%
  summarise(res = mean(x)) %>%
  select(res)
# A tibble: 9 x 1
res
<dbl>
1 1.0
2 5.0
3 11.0
4 15.0
5 24.0
6 31.5
7 65.0
8 100.0
9 140.0

Related

Randomly sampling 1 ID from each pair in column

Say I have something like the following:
df <- data.frame(ID = c("2330", "2331", "2333", "2334", "2336", "2337", "4430", "4431", "4510", "4511"),
                 length = c(8.4, 6, 3, 9, 3, 4, 1, 7, 4, 2))
> df
ID length
1 2330 8.4
2 2331 6.0
3 2333 3.0
4 2334 9.0
5 2336 3.0
6 2337 4.0
7 4430 1.0
8 4431 7.0
9 4510 4.0
10 4511 2.0
IDs that are in a pair are +/- 1 of each other. (2330, 2331), (2333, 2334), (2336, 2337), (4430, 4431), & (4510, 4511) are the pairs in my example. I would like to randomly sample 1 ID from each pair to get a dataframe that looks like the following...
> df
ID length
1 2330 8.4
2 2334 9.0
3 2336 3.0
4 4430 1.0
5 4510 4.0
How would I accomplish this with base R? Thank you.
We may create a grouping column with gl for every 2 adjacent elements and then use slice_sample with n = 1:
library(dplyr)
df %>%
  group_by(grp = as.integer(gl(n(), 2, n()))) %>% # gl(n(), 2, n()) gives 1 1 2 2 3 3 4 4 5 5 here
  slice_sample(n = 1) %>%
  ungroup %>%
  select(-grp)
-output
# A tibble: 5 × 2
ID length
<chr> <dbl>
1 2330 8.4
2 2333 3
3 2337 4
4 4430 1
5 4510 4
Or using base R
do.call(rbind, lapply(split(df, gl(nrow(df), 2, nrow(df)), drop = TRUE),
                      function(x) x[sample(nrow(x), 1), ]))
-output
ID length
1 2330 8.4
2 2333 3.0
3 2337 4.0
4 4430 1.0
5 4510 4.0
Or with aggregate in base R. Note that aggregate applies FUN to each column separately here, so the sampled ID and length can come from different rows of a pair, as the output below shows:
aggregate(. ~ grp, transform(df, grp = cumsum(c(TRUE,
    diff(as.numeric(ID)) != 1))), FUN = sample, 1)[-1]
ID length
1 2331 8.4
2 2334 3
3 2337 3
4 4431 7
5 4510 2
Or with tapply
df[with(df, tapply(seq_along(ID), rep(seq_along(ID), each = 2,
    length.out = nrow(df)), FUN = sample, 1)), ]
ID length
1 2330 8.4
4 2334 9.0
5 2336 3.0
7 4430 1.0
10 4511 2.0

Path dependent vector transformation in R

I have a vector in a dataframe in R which is a time series that oscillates between 0 and 100.
I want to create a new column/vector that will be a series of 1s and 0s: it becomes 1 when the time series drops below 10 and stays 1 until the series reaches 80, after which it goes back to 0. So there is a path dependency in the problem I am trying to solve.
Something like:
DataFrame %>% mutate(BinaryIndicator = ....)
The original post linked a picture illustrating the desired output. Any help much appreciated.
Since the value of one row depends on the value of the previous row (whose value in turn was updated from its previous row, and so on), I think a rolling-window operation is appropriate. zoo does this well.
dat <- data.frame(x = rep(c(60, 50, 40, 35, 30, 25, 20, 15, 10.2, 9, 2, 3, 9, 40, 72, 81, 90), 2))
dat$binary <- cumsum(zoo::rollapply(dat$x, 2, function(a) {
  if (length(a) < 2) return(0)            # partial first window
  if (a[1] >= 10 && a[2] < 10) return(1)  # crossed below 10: switch on
  if (a[1] < 80 && a[2] >= 80) return(-1) # crossed to >= 80: switch off
  return(0)
}, partial = TRUE, align = "right"))
dat
# x binary
# 1 60.0 0
# 2 50.0 0
# 3 40.0 0
# 4 35.0 0
# 5 30.0 0
# 6 25.0 0
# 7 20.0 0
# 8 15.0 0
# 9 10.2 0
# 10 9.0 1
# 11 2.0 1
# 12 3.0 1
# 13 9.0 1
# 14 40.0 1
# 15 72.0 1
# 16 81.0 0
# 17 90.0 0
# 18 60.0 0
# 19 50.0 0
# 20 40.0 0
# 21 35.0 0
# 22 30.0 0
# 23 25.0 0
# 24 20.0 0
# 25 15.0 0
# 26 10.2 0
# 27 9.0 1
# 28 2.0 1
# 29 3.0 1
# 30 9.0 1
# 31 40.0 1
# 32 72.0 1
# 33 81.0 0
# 34 90.0 0
(I wonder if the internal logic can be simplified some.)
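One possible simplification, as an untested sketch of my own: flag the two kinds of crossings with vectorized comparisons and difference their cumulative sums. It makes the same implicit assumption as the rollapply version, namely that below-10 and above-80 crossings alternate.
x <- dat$x
below <- x < 10
high <- x >= 80
drop_in <- c(FALSE, !below[-length(x)] & below[-1]) #crossed below 10
rise_out <- c(FALSE, !high[-length(x)] & high[-1])  #crossed to >= 80
dat$binary2 <- cumsum(drop_in) - cumsum(rise_out)
all(dat$binary2 == dat$binary) #should be TRUE for this data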

R: How can I fill blanks in one column with the mean of two other columns?

I am trying to fill blanks in var1 with the mean of var2 and var3, but I can't get it to work. This is what I've tried so far:
df <- data.frame(var1=c(1,2,"",3,3,"","",2,2,6,7,3,"","","",3,3,11,12,2,"",3))
df$var2 <- c(1,8,9,1,1,5,8,8,3,2,0,9,4,4,7,3,5,5,2,4,6,6)
df$var3 <- c(4,1,1,4,4,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22)
for (i in 1:length(df$var1)) {
  ifelse(is.na(df$var1[i]), df$var1[i] <- mean(df$var2[i], df$var3[i]), df$var1[i] == df$var1[i])
}
I am not sure what I am doing wrong. After running the code, var1 still shows empty cells.
Thank you very much for your help
Try this. (There are two separate problems in your loop: the blanks are empty strings "" rather than NA, so is.na() never matches; and mean(df$var2[i], df$var3[i]) passes var3 as mean's trim argument instead of averaging the two values.)
df <- data.frame(var1 = c(1,2,"",3,3,"","",2,2,6,7,3,"","","",3,3,11,12,2,"",3),
                 var2 = c(1,8,9,1,1,5,8,8,3,2,0,9,4,4,7,3,5,5,2,4,6,6),
                 var3 = c(4,1,1,4,4,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22),
                 stringsAsFactors = FALSE)
df[df == ""] <- "NA"
for (i in 1:length(df$var1)) {
  if (df$var1[i] == "NA") {
    df$var1[i] <- rowMeans(df[i, 2:3])  # note: var1 remains character; wrap in as.numeric() if needed
  } else {
    df$var1[i] <- df$var1[i]
  }
}
Or:
for (i in 1:length(df[, 1])) {
  ifelse(df[i, 1] == "NA", df[i, 1] <- rowMeans(df[i, 2:3]), df[i, 1] <- df[i, 1])
}
Alternatively, instead of redefining blanks as "NA" (as text in the example above), you could leave them blank, skipping the df[df == ""] <- "NA" bit:
for (i in 1:length(df[, 1])) {
  ifelse(df[i, 1] == "", df[i, 1] <- rowMeans(df[i, 2:3]), df[i, 1] <- df[i, 1])
}
Or identify blanks as "real" NAs:
df[df == ""] <- NA
for (i in 1:length(df[, 1])) {
  ifelse(is.na(df[i, 1]), df[i, 1] <- rowMeans(df[i, 2:3]), df[i, 1] <- df[i, 1])
}
Another way without any loops:
library(dplyr)
df %>%
  mutate_at(vars(var1:var3), as.numeric) %>%
  mutate(var1 = case_when(is.na(var1) ~ (var2 + var3) / 2, TRUE ~ var1))
#> var1 var2 var3
#> 1 1.0 1 4
#> 2 2.0 8 1
#> 3 5.0 9 1
#> 4 3.0 1 4
#> 5 3.0 1 4
#> 6 5.5 5 6
#> 7 7.5 8 7
#> 8 2.0 8 8
#> 9 2.0 3 9
#> 10 6.0 2 10
#> 11 7.0 0 11
#> 12 3.0 9 12
#> 13 8.5 4 13
#> 14 9.0 4 14
#> 15 11.0 7 15
#> 16 3.0 3 16
#> 17 3.0 5 17
#> 18 11.0 5 18
#> 19 12.0 2 19
#> 20 2.0 4 20
#> 21 13.5 6 21
#> 22 3.0 6 22
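As an aside, mutate_at is superseded in current dplyr; an across()-based equivalent might look like this (untested sketch):
library(dplyr)
df %>%
  mutate(across(var1:var3, as.numeric)) %>%
  mutate(var1 = coalesce(var1, (var2 + var3) / 2))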
I would use a data.table approach here. It should work well with larger data, and it avoids looping over your data where you don't need to.
library(data.table)
dt <- data.table(var1 = c(1,2,"",3,3,"","",2,2,6,7,3,"","","",3,3,11,12,2,"",3),
                 var2 = c(1,8,9,1,1,5,8,8,3,2,0,9,4,4,7,3,5,5,2,4,6,6),
                 var3 = c(4,1,1,4,4,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22))
dt[, var1 := as.numeric(var1)]  # "" becomes NA, with a coercion warning
dt[is.na(var1), var1 := apply(.SD, 1, mean), .SDcols = c("var2", "var3")]
dt
var1 var2 var3
1: 1.0 1 4
2: 2.0 8 1
3: 5.0 9 1
4: 3.0 1 4
5: 3.0 1 4
6: 5.5 5 6
7: 7.5 8 7
8: 2.0 8 8
9: 2.0 3 9
10: 6.0 2 10
11: 7.0 0 11
12: 3.0 9 12
13: 8.5 4 13
14: 9.0 4 14
15: 11.0 7 15
16: 3.0 3 16
17: 3.0 5 17
18: 11.0 5 18
19: 12.0 2 19
20: 2.0 4 20
21: 13.5 6 21
22: 3.0 6 22

R: How to create a new column for 90th quantile based off previous rows in a data frame

data.frame(c = c(1,7,11,4,5,5))
c
1 1
2 7
3 11
4 4
5 5
6 5
desired dataframe
c c.90th
1 1 NA
2 7 1
3 11 6.4
4 4 10.2
5 5 9.8
6 5 9.4
For the first row, I want it to look at the previous rows, none and get the 90th quantile, NA.
For the second row, I want it to look at the previous rows, 1 and get the 90th quantile, 1.
For the third row, I want it to look at the previous rows, 1, 7 and get the 90th quantile, 6.4.
etc.
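These targets are quantile()'s default type-7 interpolation; for the third row, for instance, it works out to 1 + 0.9 * (7 - 1) = 6.4:
quantile(c(1, 7), 0.9)
# 90%
# 6.4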
A solution using data.table that also works by groups:
library(data.table)
dt <- data.table(c = c(1, 7, 11, 4, 5, 5),
                 group = c(1, 1, 1, 2, 2, 2))
cumquantile <- function(y, prob) {
  sapply(seq_along(y), function(x) quantile(y[0:(x - 1)], prob))
}
dt[, c90 := cumquantile(c, 0.9)]
dt[, c90_by_group := cumquantile(c, 0.9), by = group]
> dt
c group c90 c90_by_group
1: 1 1 NA NA
2: 7 1 1.0 1.0
3: 11 1 6.4 6.4
4: 4 2 10.2 NA
5: 5 2 9.8 4.0
6: 5 2 9.4 4.9
Try:
dff <- data.frame(c = c(1,7,11,4,5,5))
dff$c.90th <- sapply(1:nrow(dff), function(x) quantile(dff$c[0:(x - 1)], 0.9, names = FALSE))
Output:
c c.90th
1 NA
7 1.0
11 6.4
4 10.2
5 9.8
5 9.4

Replicate row value following a factor

Given the following data frame:
df <- data.frame(patientID = rep(c(1:4), 3),
                 condition = c(rep("A", 4), rep("B", 4), rep("C", 4)),
                 weight = round(rnorm(12, 70, 7), 1),
                 height = round(c(rnorm(4, 170, 10), rep(0, 8)), 1))
> head(df)
patientID condition weight height
1 1 A 71.43 168.5
2 2 A 59.89 177.3
3 3 A 72.15 163.4
4 4 A 70.14 166.1
5 1 B 66.21 0.0
6 2 B 66.62 0.0
How can I copy the height for each patient from condition A into the other two conditions? I tried using for loops, data.table, and dplyr without success.
How can I achieve this using any of these methods?
If your data is as it looks (sorted by condition and patientID, with the same patients in each condition), then you can just make use of recycling as follows:
require(data.table)
setDT(df)[, height := height[condition == "A"]]
But I understand that's a lot of ifs there.
So, without assuming anything about the data except that (condition, patientID) pairs are unique, you can do:
require(data.table)
setDT(df)[, height := height[condition == "A"], by=patientID]
Once again, this makes use of recycling, but within each group - as it doesn't assume the data is ordered.
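To see the recycling in isolation (toy heights of my own, not the random data above): in the first version a length-4 vector fills all 12 rows; within each patientID group it is a single value filling that patient's 3 rows.
h <- c(170, 175, 160, 165) #hypothetical condition-A heights for patients 1-4
rep(h, length.out = 12)    #the recycling the non-grouped version relies on
# [1] 170 175 160 165 170 175 160 165 170 175 160 165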
Both of the above methods on the sample data give:
# patientID condition weight height
# 1: 1 A 73.3 169.5
# 2: 2 A 76.3 173.4
# 3: 3 A 63.6 145.5
# 4: 4 A 56.2 164.7
# 5: 1 B 67.7 169.5
# 6: 2 B 77.3 173.4
# 7: 3 B 76.8 145.5
# 8: 4 B 70.9 164.7
# 9: 1 C 76.6 169.5
# 10: 2 C 73.0 173.4
# 11: 3 C 66.7 145.5
# 12: 4 C 71.6 164.7
The same idea can be translated to dplyr as well, which I'll leave to you to try. Hint: it just requires group_by and mutate.
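For reference, that translation might look like this (my sketch, untested):
library(dplyr)
df %>%
  group_by(patientID) %>%
  mutate(height = height[condition == "A"]) %>%
  ungroup()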
No need for the fancy stuff here. Just use the $ operator and [ subsetting (this relies on patientID running 1-4 and rows 1-4 holding the condition-A heights in that order).
> df$height <- df$height[df$patientID]
> df
patientID condition weight height
1 1 A 67.4 175.1
2 2 A 66.8 179.0
3 3 A 49.7 159.7
4 4 A 64.5 165.3
5 1 B 66.0 175.1
6 2 B 70.8 179.0
7 3 B 58.7 159.7
8 4 B 74.3 165.3
9 1 C 70.9 175.1
10 2 C 75.6 179.0
11 3 C 61.3 159.7
12 4 C 74.5 165.3
This should do the trick. It assumes that the first level of the condition factor is always the one with the true data.
idx <- tapply(rownames(df), list(df$patientID, df$condition), identity)
idx <- na.omit(cbind(as.vector(idx[, -1]), as.vector(idx[, 1])))
df[as.vector(idx[, 1]), "height"] <- df[as.vector(idx[, 2]), "height"]
And from #Arun's suggestion:
df$height <- with(df, ave(ifelse(condition == "A", height, -1),
                          factor(patientID), FUN = max))
where you can be explicit about the condition level to pull values from.
