Interpolate missing values of a data frame - r

I have a dataset like this:
x y z
1 1 0.954
1 3 0.134
1 30 0.123
2 1 0.425
2 3 0.123
2 30 0.865
5 1 0.247
5 3 0.654
5 30 0.178
Let's think of this as the height of a surface sampled at 9 points over a 4x29 field. Suppose I want to fill in the missing values by interpolating (linear is fine), so that I end up with a z value for every (integer) x in [1,5] and every y in [1,30]. I want the result to still be a data frame with the same structure.
How can I do this in R?

I'll take the previous lack of answer as a gift :)
#akima_0.5-12
library(akima)
my_df <- data.frame(
x = c(rep(1, 3), rep(2, 3), rep(5, 3)),
y = rep(c(1, 3, 30), 3),
z = c(0.954, 0.134, 0.123, 0.425, 0.123, 0.865, 0.247, 0.654, 0.178)
)
my_op <- interp(
x = my_df$x,
y = my_df$y,
z = my_df$z,
xo = 1:5, # vector of x coordinates to use in interpolation
yo = 1:30, # vector of y coordinates to use in interpolation
linear = TRUE # default interpolation method
)
my_op$z # matrix of interpolated z coordinates, (row, col) correspond to (x, y)
ind <- which(!is.nan(my_op$z), arr.ind = TRUE)
desired_output <- data.frame(
x = ind[, 1],
y = ind[, 2],
z = as.vector(my_op$z) # data are organized column-by-column
)

Related

Create samples with different range and weights

I want to create a total sample of 3000 entries with some rules :
Category-1(low) 0.1 - 0.3
Category-2(Medium) 0.4 - 0.7
Category-3(High) 0.7 - 0.9
I want to create the sample in such a way that each category has weights for example :
Category-1(low) 20% of the dataset
Category-2(Medium) 30% of the dataset
Category-3(High) 50% of the dataset
I am unable to find pointers to do that. Can anyone help me out with the same. Thanks a lot in advance.
We can use Map to create a sequence of values between the ranges showed in the OP's post, while generating the sample on the ranges with the proportion also being passed in as argument to Map
lst1 <- Map(function(x, y, z) sample(seq(x, y, by = 0.1), z,
replace = TRUE), c(0.1, 0.4, 0.7), c(0.3, 0.7, 0.9), c(0.2, 0.3, 0.5) * 3000)
names(lst1) <- c("low", "medium", "high")
lengths(lst1)
# low medium high
# 600 900 1500
out <- unlist(lst1)
length(out)
#[1] 3000
If we need as a two column data.frame
dat <- stack(lst1)[2:1]
I like to use the simstudy package for data generation. In this case I back-filled your values that conform to category rules. Simstudy gives a data.table object, but I'm more familiar with Tidyverse syntax:
library(simstudy)
library(dplyr)
set.seed(1724)
# define data
def <- defData(varname = "category", formula = "0.2;0.3;0.5", dist = "categorical", id = "id")
def <- defData(def, varname = "value", dist = "nonrandom", formula = NA)
# generate data
df <- genData(3000, def) %>% as_tibble()
# add in values that conform to category rules
df[df$category == 1,]$value <- runif(nrow(df[df$category == 1,]), min = 0.1, max = 0.3)
df[df$category == 2,]$value <- runif(nrow(df[df$category == 2,]), min = 0.4, max = 0.7)
df[df$category == 3,]$value <- runif(nrow(df[df$category == 3,]), min = 0.7, max = 0.9)
# A tibble: 3,000 x 3
id category value
<int> <int> <dbl>
1 1 3 0.769
2 2 2 0.691
3 3 3 0.827
4 4 3 0.729
5 5 2 0.474
6 6 3 0.818
7 7 2 0.635
8 8 2 0.552
9 9 3 0.794
10 10 3 0.792
# ... with 2,990 more rows
A rather simple approach:
1. This is not that random, but depending on the application this may suffice
out <- c(runif(600, 0.1, 0.3), runif(900, 0.4, 0.7), runif(1500, 0.7, 0.9))
2. Here, you'd draw the numbers coming from each category as well: so more random...
sam <- sample(1:3, size = 3000, prob = c(0.2, 0.3, 0.5), replace = TRUE)
x1 <- sum(sam == 1)
x2 <- sum(sam == 2)
x3 <- sum(sam == 3)
out <- c(runif(x1, 0.1, 0.3), runif(x2, 0.4, 0.7), runif(x3, 0.7, 0.9))

Filter_all with differing condition for each column

I have the following vector
vec1 = c(0.001, 0.05, 0.003, 0.1)
and a data frame
df = data_frame( x = seq(0.001, 0.1, length.out = 10), y = seq(0.03, 0.07, length.out = 10), z = seq(0, 0.005, length.out = 10), w = seq(0.05, 0.25, length.out = 10))
I would like to filter df such that the output would contain the rows of df for which, in each column, the minimum value would be the corresponding value of vec1 - 0.05, and the maximum would be vec1 + 0.05.
So in this example, only the first 4 rows satisfy this condition (in x I allow -0.049 to 0.501 based on the first entry of vec1, in y I allow 0 to 0.1 based on the second entry, and so on).
I am sure this can be done with filter_all and (.), something along the lines of
filter_all(df, all_vars(. >= (vec1(.) - 0.05) & . <= (vec1(.) + 0.05))))
But this doesn't work.
What am I doing wrong?
We can use mapply on the dataframe and pass it along with vec1 and check which of the values satisfy the criteria and select only those rows where all of the columns have TRUE value in it.
df[rowSums(mapply(function(x, y) x > (y-0.05) & x < (y+0.05),
df, vec1)) == ncol(df), ]
# x y z w
# <dbl> <dbl> <dbl> <dbl>
#1 0.0120 0.0344 0.000556 0.0722
#2 0.0230 0.0389 0.00111 0.0944
#3 0.0340 0.0433 0.00167 0.117
#4 0.0450 0.0478 0.00222 0.139

using the uniroot function with dplyr pipes

I'm trying to utilize the uniroot function inside a piping scheme. I have root data by depth, and I fit a model for each crop-year set and put the fitted parameter (A in this example) into a tibble. A simplified dataset is below:
mydat <- tribble(
~crop, ~year, ~A,
"corn", 2011, 4,
"corn", 2012, 8.5,
"soy", 2011, 4.2
)
I want to add a column that tells me the x value of my function at y = 0.5. The following code works as a stand-alone.
myfunc <- function(x, y, A) {2 + A * x - y}
uniroot(myfunc, y = 0.5, A = 4, lower = 0, upper = 10, extendInt = "yes")
If I try to put it into a piping scheme using dplyr's mutate or do, it doesn't work.
mydat %>%
mutate(x50 = uniroot(myfunc, y = 0.5, A = .$A, lower = 0, upper = 10,
extendInt = "yes"))
mydat %>%
do(x50 = uniroot(myfunc, y = 0.5, A = .$A, lower = 0, upper = 10,
extendInt = "yes"))
The uniroot function is not vectorised over its arguments. Functions like sqrt are:
> sqrt(c(1,2,3))
[1] 1.000000 1.414214 1.732051
but uniroot isnt:
> uniroot(myfunc, y = 0.5, A = c(1,2,3), lower = 0, upper = 10, extendInt = "yes")
Error in uniroot(myfunc, y = 0.5, A = c(1, 2, 3), lower = 0, upper = 10, :
did not succeed extending the interval endpoints for f(lower) * f(upper) <= 0
In addition: Warning messages:
1: In if (is.na(f.lower)) stop("f.lower = f(lower) is NA") :
the condition has length > 1 and only the first element will be used
2: In if (is.na(f.upper)) stop("f.upper = f(upper) is NA") :
the condition has length > 1 and only the first element will be used
and mutate relies on having vectorised computation.
Use lapply to iterate over any vector and call a function like this:
> lapply(mydat$A, function(a){uniroot(myfunc, y = 0.5, A = a, lower = 0, upper = 10, extendInt = "yes")$root})
[[1]]
[1] -0.375
[[2]]
[1] -0.1764706
[[3]]
[1] -0.3571429
Then use standard R functions to put that data back in your data frame if that's where you want it.
You could use purrr::map to build a list column with the results (coercing it to a data.frame), then tidyr::unnest to spread it out into columns...
library(tibble)
library(dplyr)
library(purrr)
library(tidyr)
mydat <- tribble(
~crop, ~year, ~A,
"corn", 2011, 4,
"corn", 2012, 8.5,
"soy", 2011, 4.2
)
myfunc <- function(x, y, A) {2 + A * x - y}
mydat %>%
mutate(x50 = map(A, function(x) {
as.data.frame(uniroot(myfunc, y = 0.5, A = x, lower = 0, upper = 10,
extendInt = "yes"))
})) %>%
unnest()
# # A tibble: 3 x 8
# crop year A root f.root iter init.it estim.prec
# <chr> <dbl> <dbl> <dbl> <dbl> <int> <int> <dbl>
# 1 corn 2011. 4.00 -0.375 0. 20 19 52439.
# 2 corn 2012. 8.50 -0.176 2.22e-16 20 18 0.0000610
# 3 soy 2011. 4.20 -0.357 2.22e-16 21 19 0.0000610
The solution with dplyr is
data |>
rowwise() |>
mutate(var_name = uniroot(f, c(lower_limit, upper_limit), vars_from_data)$root)

how to apply functions on data frame in r

How can i apply the following function rt on each and every value l in df.
x and y have the following values.
x<-9
y<-1
rt<-function(x,y,l) min(x,max(0,l-y))
df
a b c
5 6 7
1 4 1
2 4 3
Probably simplest if you'd like to stick with dataframes is to use apply with the MARGIN parameter set to c(1,2), which makes it apply the function by both rows and columns (i.e., to every cell).
x <- 9
y <- 1
rt <- function(x, y, l) min(x, max(0, l-y))
df <- data.frame(a = c(5, 1, 2),
b = c(6, 4, 4),
c = c(7, 1, 3))
rt_df <- as.data.frame(apply(df, c(1,2), rt, x = x, y = y))

Creating a cumulative step graph in R

Say I have this example data frame
set.seed(12345)
n1 <- 3
n2 <- 10
n3 <- 60
times <- seq(0, 100, 0.5)
individual <- c(rep(1, n1),
rep(2, n2),
rep(3, n3))
events <- c(sort(sample(times, n1)),
sort(sample(times, n2)),
sort(sample(times, n3)))
df <- data.frame(individual = individual, events = events)
Which gives
> head(df, 10)
individual events
1 1 72.0
2 1 75.5
3 1 87.5
4 2 3.0
5 2 14.5
6 2 16.5
7 2 32.0
8 2 45.5
9 2 50.0
10 2 70.5
I would like to plot a cumulative step graph of the events so that I get one line per individual which goes up by 1 each time an event is "encountered".
So, for instance individual 1 will be 0 up to 72.0, then go up to 1, until 75.5 when it becomes 2 and up to 3 at 87.5 to the end of the graph.
What would be the easiest way to do that?
df$step <- 1
library(plyr)
df <- ddply(df,.(individual),transform,step=cumsum(step))
plot(step~events,data=df[df$individual==1,],type="s",xlim=c(0,max(df$events)),ylim=c(0,max(df$step)),xlab="time",ylab="step")
lines(step~events,data=df[df$individual==2,],type="s",col=2)
lines(step~events,data=df[df$individual==3,],type="s",col=3)
There is also the stepfun function in the stats package. Using that, you could use the plot method for that object class:
sdf <- split(df, individual)
plot(1, 1, type = "n", xlim = c(0, max(events)), ylim = c(0, max(table(individual))),
ylab = "step", xlab = "time")
sfun <- lapply(sdf, function(x){
sf <- stepfun(sort(x$events), seq_len(nrow(x) + 1) - 1)
plot(sf, add = TRUE, col = unique(x$individual), do.points = FALSE)
})
Use ggplot2:
library(ggplot2)
# Add step height information with sequence and rle
df$step <- sequence(rle(df$individual)$lengths)
# plot
df$individual <- factor(df$individual)
ggplot(df, aes(x=events, group=individual, colour=individual, y=step)) +
geom_step()

Resources