Interpolate value only for short gap - r

My vector a <- c(2,0,0,1,0, 3, 0, 0,0,0,5,6)
The zeros represent the missing values
I want to interpolate the missing values only where the run of consecutive zeros is shorter than 4
I am looking for a function which allows me to perform this
The desired output is [1] 2.00 1.67 1.33 1.00 2.00 3.00 0.00 0.00 0.00 0.00 5.00 6.00

library(zoo)
# Interpolate zero-coded gaps, but only where the run of zeros is shorter than 4.
# temp1: every zero treated as missing and linearly interpolated. na.rm = FALSE
# keeps leading/trailing NAs, so temp1 stays the same length as `a` and can be
# indexed with a mask derived from `a`.
temp1 <- na.approx(replace(a, a == 0, NA), na.rm = FALSE)
# temp2: copy of `a` in which only the short zero runs (< 4 long) are set to NA.
zero_runs <- rle(a)
zero_runs$values[zero_runs$values == 0 & zero_runs$lengths < 4] <- NA
temp2 <- inverse.rle(zero_runs)
# Fill the short gaps from the interpolated series; long runs keep their zeros.
# A short run touching either end of `a` cannot be interpolated and stays NA.
replace(temp2, is.na(temp2), temp1[is.na(temp2)])
# [1] 2.000000 1.666667 1.333333 1.000000 2.000000 3.000000 0.000000 0.000000
# [9] 0.000000 0.000000 5.000000 6.000000
The following (as commented by G.Grothendieck) is better
# Treat zeros as missing and interpolate linearly, but only across gaps of at
# most 3 consecutive missing values (maxgap = 3). na.rm = FALSE stops na.approx
# from trimming leading/trailing NAs, so the result keeps the length of `a`.
temp <- na.approx(object = replace(a, a == 0, NA), maxgap = 3, na.rm = FALSE)
# Whatever is still NA (long gaps, or gaps at either end) goes back to zero.
replace(temp, is.na(temp), 0)
# [1] 2.000000 1.666667 1.333333 1.000000 2.000000 3.000000 0.000000 0.000000
# [9] 0.000000 0.000000 5.000000 6.000000

Related

How to get the element-wise mean across blocks of same dataframe in R

I would like to compute the element-wise means across multiple blocks of the same dataframe. My input table looks like this, and it consists of 3 (3x3) blocks, with each block having a diagonal of ones:
# Example input: three stacked 3x3 blocks (rows 1-3, 4-6, 7-9), one column per
# variable.
input <- data.frame(
  var1 = c(1, 7, 4, 1, 2, 9, 1, 8, 3),
  var2 = c(3, 1, 9, 4, 1, 8, 3, 1, 8),
  var3 = c(3, 9, 1, 6, 8, 1, 3, 5, 1)
)
The output table should be a 3x3 including the means of the elements which are located on similar positions in their blocks. E.g. the first row of the output table should be c(1, 3.3, 4). Any idea how to smartly code this? Thank you.
# Rows sharing the same position inside their block get the same key (1, 2, 3,
# recycled down the rows); average each group column-wise and stack the results.
by_position <- split(input, 1:3)
do.call(rbind, lapply(by_position, colMeans))
var1 var2 var3
1 1.000000 3.333333 4.000000
2 5.666667 1.000000 7.333333
3 5.333333 8.333333 1.000000
You could use tapply or even aggregate
# Group every cell by (row position within its 3-row block, column) and average.
block_pos <- (row(input) - 1) %% 3
tapply(unlist(input), list(block_pos, col(input)), mean)
1 2 3
0 1.000000 3.333333 4.000000
1 5.666667 1.000000 7.333333
2 5.333333 8.333333 1.000000
# Tag each row with its position inside its block, then average per tag.
tagged <- cbind(id = rep(1:3, times = 3), input)
aggregate(. ~ id, data = tagged, FUN = mean)
id var1 var2 var3
1 1 1.000000 3.333333 4.000000
2 2 5.666667 1.000000 7.333333
3 3 5.333333 8.333333 1.000000
If all of the blocks have the same dimensions, we can also take the array route
# Reshape to a (position-in-block, block, variable) array, then average over
# the blocks for each position; transpose so positions are rows.
blocks <- array(as.matrix(input), dim = c(3, 3, 3))
t(apply(blocks, MARGIN = 1, FUN = colMeans))
-output
[,1] [,2] [,3]
[1,] 1.000000 3.333333 4.000000
[2,] 5.666667 1.000000 7.333333
[3,] 5.333333 8.333333 1.000000

Divide numbers into equally-spaced intervals ranging between 0-1

Suppose I have this series of numbers in a vector:
vec <- c(1,2,3,4,5) # just an example, numbers could be far higher
How can I programmatically divide these numbers into equally-spaced intervals ranging between 0-1, such that I get:
for
1: 0
2: 0, 1
3: 0, 0.5, 1
4: 0, 0.33, 0.66, 1
5: 0, 0.25, 0.50, 0.75, 1
and so on.
Any idea?
We can use seq with length.out argument:
# For each count k in 1..5: k evenly spaced points from 0 to 1.
lapply(1:5, function(k) seq(from = 0, to = 1, length.out = k))
# [[1]]
# [1] 0
#
# [[2]]
# [1] 0 1
#
# [[3]]
# [1] 0.0 0.5 1.0
#
# [[4]]
# [1] 0.0000000 0.3333333 0.6666667 1.0000000
#
# [[5]]
# [1] 0.00 0.25 0.50 0.75 1.00
or mapply:
# Vary only length.out; the fixed endpoints are passed once via MoreArgs.
mapply(FUN = seq, length.out = 1:5, MoreArgs = list(from = 0, to = 1))
If I understand correctly, maybe something like this:
v <- 1:5
# Min-max rescale `x` onto [0, 1]; a single value maps to 0 (it has no range).
# NOTE: this name shadows base::norm() for the rest of the session.
norm <- function(x) {
  if (length(x) == 1) {
    return(0)
  }
  lowest <- min(x)
  (x - lowest) / (max(x) - lowest)
}
# For each k in v: rescale the sequence 1..k.
lapply(v, function(k) norm(seq(1, k, length.out = k)))
output
[[1]]
[1] 0
[[2]]
[1] 0 1
[[3]]
[1] 0.0 0.5 1.0
[[4]]
[1] 0.0000000 0.3333333 0.6666667 1.0000000
[[5]]
[1] 0.00 0.25 0.50 0.75 1.00
Using map
library(purrr)
# Same as the lapply() version, using purrr's map() with an explicit function.
map(1:5, function(k) seq(0, 1, length.out = k))
-output
[[1]]
[1] 0
[[2]]
[1] 0 1
[[3]]
[1] 0.0 0.5 1.0
[[4]]
[1] 0.0000000 0.3333333 0.6666667 1.0000000
[[5]]
[1] 0.00 0.25 0.50 0.75 1.00

Calculate distances between XY coordinates in two different datasets in R

Consider we have two different datasets:
# First point set: 7 (X1, Y1) coordinates.
X1 <- c(1, 2, 4, 5, 1, 3, 1)
Y1 <- c(3, 5, 6, 3, 1, 5, 1)
df1 <- data.frame(X1, Y1)
# Second point set: 7 (X2, Y2) coordinates.
X2 <- c(2, 3, 4, 3, 2, 3, 2)
Y2 <- c(3, 4, 2, 6, 4, 3, 4)
df2 <- data.frame(X2, Y2)
These data are represented in this scatterplot:
I would like to calculate the distances between the 7 XY coordinates in df1 (black open dots) and the 7 XY coordinates in df2 (red open triangles).
I know how to calculate the distances between the XY coordinates within a dataset using dist() and cbind(). But I don't know how to do the same but with XY coordinates in two different datasets.
Using two datasets, we would obtain a table composed by 7 columns and 7 rows, filled by the distances among all these coordinates. Column names would be the coordinates in df1 and row names would be coordinates in df2.
How can I get this data frame with all the distances?
Maybe this strategy may help
X1 <- c(1, 2, 4, 5, 1, 3, 1)
Y1 <- c(3, 5, 6, 3, 1, 5, 1)
df1 <- data.frame(X1, Y1)
X2 <- c(2, 3, 4, 3, 2, 3, 2)
Y2 <- c(3, 4, 2, 6, 4, 3, 4)
df2 <- data.frame(X2, Y2)
library(tidyverse)
# Give both data sets the same column names (X, Y) and keep a label recording
# which data set each row came from. The label is listed in select() so that it
# is not dropped again right after being created.
df1 <- df1 %>%
  mutate(df_type = "data1") %>%
  select(X = X1, Y = Y1, df_type)
df2 <- df2 %>%
  mutate(df_type = "data2") %>%
  select(X = X2, Y = Y2, df_type)
# link data frames by row: rows 1-7 come from df1, rows 8-14 from df2
df <- bind_rows(df1, df2)
# Pairwise Euclidean distances between all 14 points (coordinates only).
dist(cbind(df$X, df$Y))
1 2 3 4 5 6 7 8 9 10 11 12 13
2 2.236068
3 4.242641 2.236068
4 4.000000 3.605551 3.162278
5 2.000000 4.123106 5.830952 4.472136
6 2.828427 1.000000 1.414214 2.828427 4.472136
7 2.000000 4.123106 5.830952 4.472136 0.000000 4.472136
8 1.000000 2.000000 3.605551 3.000000 2.236068 2.236068 2.236068
9 2.236068 1.414214 2.236068 2.236068 3.605551 1.000000 3.605551 1.414214
10 3.162278 3.605551 4.000000 1.414214 3.162278 3.162278 3.162278 2.236068 2.236068
11 3.605551 1.414214 1.000000 3.605551 5.385165 1.000000 5.385165 3.162278 2.000000 4.123106
12 1.414214 1.000000 2.828427 3.162278 3.162278 1.414214 3.162278 1.000000 1.000000 2.828427 2.236068
13 2.000000 2.236068 3.162278 2.000000 2.828427 2.000000 2.828427 1.000000 1.000000 1.414214 3.000000 1.414214
14 1.414214 1.000000 2.828427 3.162278 3.162278 1.414214 3.162278 1.000000 1.000000 2.828427 2.236068 0.000000 1.414214
Then you can create a data.frame with the distances between X and Y. First we need to transform the dist object into a data frame
# Expand the compact dist object into a full square matrix, then into a data
# frame (data.frame() names the columns X1, X2, ...).
dist_matrix <- as.matrix(dist(cbind(df$X, df$Y)))
df_dist <- data.frame(dist_matrix)
Doing a bit of manipulation it is possible to have the distance between X and Y
# Keep the columns for the first 7 points and the rows for points 8 onward,
# label those rows Y1, Y2, ..., and stack the result into long (X, Y, distance)
# form.
df_dist_x <- df_dist %>%
  select(X1:X7) %>%
  mutate(row.1 = 1:nrow(df_dist)) %>%
  filter(row.1 >= 8) %>%
  mutate(Y = paste0("Y", row_number())) %>%
  gather(X, distance, X1:X7) %>%
  select(X, Y, distance)
head(df_dist_x)
X Y distance
1 X1 Y1 1.000000
2 X1 Y2 2.236068
3 X1 Y3 3.162278
4 X1 Y4 3.605551
5 X1 Y5 1.414214
6 X1 Y6 2.000000

Calculating division of unique pairs for each row in a matrix

Here I have a 5*4 matrix (much larger originally). I would like to calculate the ratios for each unique pair in each row of a matrix.
X1 X2 X3 X4
10 8 2 1
4 4 3 6
2 10 8 1
1 2 1 10
3 5 5 4
I would like to achieve a 5*6 matrix with non-repeating divisions, like the following:
x1/x2 x1/x3 x1/x4 x2/x3 x2/x4 x3/x4
1.25 5.00 10.00 4.00 8.00 2.00
1.00 1.33 0.67 1.33 0.67 0.50
0.20 0.25 2.00 1.25 10.00 8.00
0.50 1.00 0.10 2.00 0.20 0.10
0.60 0.60 0.75 1.00 1.25 1.25
For now I have created functions that I hoped would do the trick; however, the outcome is not as expected.
# Reproducible 5 x 4 test data: each column is 5 draws, with replacement, from
# 1..10. `replace` is spelled out rather than relying on partial matching of
# `rep`.
set.seed(7)
test <- data.frame(replicate(4, sample(1:10, 5, replace = TRUE)))
# All pairwise ratios v[i] / v[j] with i < j for a numeric vector, returned in
# the column-major order of the upper triangle: (1,2), (1,3), (2,3), (1,4), ...
func_calcRatio <- function(theMatrix) {
  all_ratios <- outer(theMatrix, theMatrix, FUN = "/")
  all_ratios[upper.tri(all_ratios)]
}
# Asker's attempt at building one row of pairwise ratios per input row; the
# question reports that its outcome is not as expected. The code is left as
# posted; the comments describe what it actually does.
# Returns the rbind() of `l` ratio vectors, one per loop iteration.
func_ratioMatrix <- function(theMatrix){
ratios_list <- list()
i = 1
# For a data frame, length() is the number of columns, not the number of rows.
l = length(theMatrix)
for (i in 1:l) {
# `vec` is filled with powers of i but never read afterwards.
vec <- numeric(l)
for (j in 1:l){
vec[j] <- i^j
}
# Always takes the first column, independent of the loop index i.
myrow <- theMatrix[,1]
onerow <- func_calcRatio(myrow)
ratios_list[[i]] <- onerow
# No effect on iteration: the for loop reassigns i at the next step.
i = i+1
}
ratios_df <- do.call("rbind", ratios_list)
return(ratios_df)
}
test.ratios <- func_ratioMatrix(test)
Let the matrix above be A. Then you can use the code below:
# For every unordered pair of column indices (i < j), divide column i by
# column j. ncol(A) replaces the hard-coded 4 so any number of columns works.
combn(ncol(A), 2, function(x) A[, x[1]] / A[, x[2]])
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] 1.25 5.000000 10.0000000 4.000000 8.0000000 2.00
[2,] 1.00 1.333333 0.6666667 1.333333 0.6666667 0.50
[3,] 0.20 0.250000 2.0000000 1.250000 10.0000000 8.00
[4,] 0.50 1.000000 0.1000000 2.000000 0.2000000 0.10
[5,] 0.60 0.600000 0.7500000 1.000000 1.2500000 1.25
If the data was in a dataframe and not a matrix, then you can use array manipulations:
E.g., assume the matrix above has been converted with A = as.data.frame(A). Then:
# combn() over a data frame hands FUN a two-column data frame for each pair of
# columns; divide its first column by its second.
combn(A, 2, function(pair) pair[[1]] / pair[[2]])
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] 1.25 5.000000 10.0000000 4.000000 8.0000000 2.00
[2,] 1.00 1.333333 0.6666667 1.333333 0.6666667 0.50
[3,] 0.20 0.250000 2.0000000 1.250000 10.0000000 8.00
[4,] 0.50 1.000000 0.1000000 2.000000 0.2000000 0.10
[5,] 0.60 0.600000 0.7500000 1.000000 1.2500000 1.25
You can still modify the code the way you want. This is just a rough idea. Hope it helps.
# The 5 x 4 example matrix from the question as an integer data frame.
dat <- data.frame(
  X1 = c(10L, 4L, 2L, 1L, 3L),
  X2 = c(8L, 4L, 10L, 2L, 5L),
  X3 = c(2L, 3L, 8L, 1L, 5L),
  X4 = c(1L, 6L, 1L, 10L, 4L)
)
When you use apply on a row-wise basis, you need to transpose the result to get values by row:
t( # apply() yields one column per input row, so flip back to one row per row
  apply(dat, 1, # visit the rows one at a time
    function(r) {
      # first/second ratio for every 2-element combination of this row
      combn(r, 2, FUN = function(pair) pair[[1]] / pair[[2]])
    }))
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] 1.25 5.000000 10.0000000 4.000000 8.0000000 2.00
[2,] 1.00 1.333333 0.6666667 1.333333 0.6666667 0.50
[3,] 0.20 0.250000 2.0000000 1.250000 10.0000000 8.00
[4,] 0.50 1.000000 0.1000000 2.000000 0.2000000 0.10
[5,] 0.60 0.600000 0.7500000 1.000000 1.2500000 1.25

R - Apply dist function to groups

I am trying to apply the dist() function row wise in R but the result I get is as if it isn't grouping at all, it is simply applying dist() to all of my dataframe.
df2 %>% dplyr::group_by(X1) %>% dist()
Where df2 is my dataframe and I am just applying to the head for now, for simplicity. Essentially, each group contains coordinates (A,B) and I am trying to get the distance between each point.
Here is my dataframe:
X1 A B
1 1 12 0.0
2 1 18 0.0
3 1 18 1.0
4 1 13 0.0
5 1 18 4.0
6 1 18 0.0
7 1 18 5.0
8 1 18 0.0
9 1 18 0.0
10 2 73 -2.0
11 2 73 -0.5
12 2 74 -0.5
13 2 73 0.0
14 2 71 -1.0
15 2 75 0.0
My desired output is the lower triangular matrix of each group, here is an example:
Here's an example of creating distance matrices of the iris data set by species
# One distance matrix per species, computed on the four measurement columns.
# split() + lapply() replaces the loop that grew `results` one element at a
# time; the list comes back already named by species. drop = TRUE skips factor
# levels that have no rows, matching the original iteration over unique().
results <- lapply(split(iris[, 1:4], iris$Species, drop = TRUE), dist)
You'll have to figure out what to do with it afterwards.
We can use purrr::map:
library(purrr)
# Split the rows by group id (X1), then compute one dist object per group.
groups <- split(df, df$X1)
distList <- map(groups, ~ dist(.x))
distList
#> $`1`
#> 1 2 3 4 5 6 7 8
#> 2 6.000000
#> 3 6.082763 1.000000
#> 4 1.000000 5.000000 5.099020
#> 5 7.211103 4.000000 3.000000 6.403124
#> 6 6.000000 0.000000 1.000000 5.000000 4.000000
#> 7 7.810250 5.000000 4.000000 7.071068 1.000000 5.000000
#> 8 6.000000 0.000000 1.000000 5.000000 4.000000 0.000000 5.000000
#> 9 6.000000 0.000000 1.000000 5.000000 4.000000 0.000000 5.000000 0.000000
#>
#> $`2`
#> 10 11 12 13 14
#> 11 1.500000
#> 12 1.802776 1.000000
#> 13 2.000000 0.500000 1.118034
#> 14 2.236068 2.061553 3.041381 2.236068
#> 15 2.828427 2.061553 1.118034 2.000000 4.123106
Data:
# Example data parsed from inline text. The header line has one field fewer
# than the data lines, so read.table() uses the first field of each data line
# as the row name. `header = TRUE` is spelled out instead of the partially
# matched `h = T`.
df <- read.table(text = 'X1 A B
1 1 12 0.0
2 1 18 0.0
3 1 18 1.0
4 1 13 0.0
5 1 18 4.0
6 1 18 0.0
7 1 18 5.0
8 1 18 0.0
9 1 18 0.0
10 2 73 -2.0
11 2 73 -0.5
12 2 74 -0.5
13 2 73 0.0
14 2 71 -1.0
15 2 75 0.0', header = TRUE)
Here's my code and the solution
# library() stops with an error if dplyr is missing; require() would only
# return FALSE and let the pipeline below fail with a less obvious message.
library(dplyr)
df2 <- structure(list(X1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L), A = c(12L, 18L, 18L, 13L, 18L, 18L, 18L,
18L, 18L, 73L, 73L, 74L, 73L, 71L, 75L), B = c(0, 0, 1, 0, 4,
0, 5, 0, 0, -2, -0.5, -0.5, 0, -1, 0)), .Names = c("X1", "A",
"B"), class = "data.frame", row.names = c("1", "2", "3", "4",
"5", "6", "7", "8", "9", "10", "11", "12", "13", "14", "15"))
# One row per group X1; the list-column holds that group's dist object computed
# from the (A, B) coordinates only.
mydf <- df2 %>%
  group_by(X1) %>%
  summarise(distmatrix = list(dist(cbind(A, B))))
mydf
# # A tibble: 2 × 2
# X1 distmatrix
# <int> <list>
# 1 1 <S3: dist>
# 2 2 <S3: dist>
mydf$distmatrix
# [[1]]
# 1 2 3 4 5 6 7 8
# 2 6.000000
# 3 6.082763 1.000000
# 4 1.000000 5.000000 5.099020
# 5 7.211103 4.000000 3.000000 6.403124
# 6 6.000000 0.000000 1.000000 5.000000 4.000000
# 7 7.810250 5.000000 4.000000 7.071068 1.000000 5.000000
# 8 6.000000 0.000000 1.000000 5.000000 4.000000 0.000000 5.000000
# 9 6.000000 0.000000 1.000000 5.000000 4.000000 0.000000 5.000000 0.000000
#
# [[2]]
# 1 2 3 4 5
# 2 1.500000
# 3 1.802776 1.000000
# 4 2.000000 0.500000 1.118034
# 5 2.236068 2.061553 3.041381 2.236068
# 6 2.828427 2.061553 1.118034 2.000000 4.123106

Resources