Related
I want to replace 0s in my dataset using sample to random select a value in the column to replace it with.
I have this example dataset:
Sepal.Length Sepal.Width Petal.Length Petal.Width species
1 0.0 3.5 0.0 0.2 setosa
2 4.9 3.0 0.0 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.0 setosa
5 5.0 0.0 0.0 0.0 setosa
6 0.0 0.0 0.0 0.4 setosa
I have tried:
ifelse(ir$Sepal.Width == 0, sample(ir$Sepal.Width != 0), ir$Sepal.Width)
[1] 3.5 3.0 3.2 3.1 0.0 0.0 3.4 1.0 2.9 1.0 3.7 1.0 3.0 3.0 4.0
The zero's still remain. I've tried to loop this for all the columns as doing the code above for each column is too time-consuming and I've tried:
lapply(ir[,-5], function(x)ifelse(ir[,1:4] == 0, sample(ir[,1:4]),ir[,1:4]))
However it creates unnecessary columns of data with the zeros still remaining.
Reproducible code:
structure(list(Sepal.Length = c(0, 4.9, 4.7, 4.6, 5, 0, 4.6,
5, 4.4, 0, 5.4, 4.8, 0, 0, 0), Sepal.Width = c(3.5, 3, 3.2, 3.1,
0, 0, 3.4, 0, 2.9, 0, 3.7, 0, 3, 3, 4), Petal.Length = c(0, 0,
1.3, 1.5, 0, 0, 1.4, 1.5, 1.4, 1.5, 0, 1.6, 1.4, 1.1, 1.2), Petal.Width = c(0.2,
0.2, 0.2, 0, 0, 0.4, 0.3, 0.2, 0.2, 0, 0.2, 0, 0, 0, 0.2), species = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("setosa",
"versicolor", "virginica"), class = "factor")), row.names = c(NA,
15L), class = "data.frame")
Here is a short dplyr solution:
ir %>%
mutate(across(.cols = where(is.numeric),
~ replace(., . == 0, sample(.[. != 0], length(.[. == 0]), replace=T))))
You may or may not need replace=T, which allow to repeat sampled elements.
Function that replaces zeros with random non zero value from vector:
f <- function(vec){
ind <- vec == 0
vec[ind] <- sample(vec[!ind], sum(ind), TRUE)
vec
}
apply function f to each numeric column:
library(data.table)
num_cols <- names(df)[as.vector(lapply(df, class)) == "numeric"]
setDT(df)[, (num_cols) := lapply(.SD, f), .SD = num_cols]
or using base R
num_cols <- names(df)[as.vector(lapply(df, class)) == "numeric"]
df[num_cols] <- lapply(df[num_cols], f)
note
it would be better to use this sample function from book Advanced R:
sample <- function(x, size = NULL, replace = FALSE, prob = NULL) {
size <- size %||% length(x)
x[sample.int(length(x), size, replace = replace, prob = prob)]
}
because of the behavior of base::sample in case when x is numeric of length 1.
Using data.table (library(data.table)):
setDT(ir)
ir[, Sepal.Width :=
ifelse(Sepal.Width==0,
sample(Sepal.Width[Sepal.Width!=0], .N, replace=TRUE),
Sepal.Width),
by=species]
You could also have it sample from within the same species by adding a by
setDT(ir)
ir[, Sepal.Width :=
ifelse(Sepal.Width==0,
sample(Sepal.Width[Sepal.Width!=0], .N, replace=TRUE),
Sepal.Width),
by=species]
Getting this for all coulmns:
ir[, c("Sepal.Length", "Sepal.Width", "Petal.Length", "Petal.Width") :=
lapply(.SD, function(x) {
ifelse(x==0, sample(x[x!=0], size=.N, replace=TRUE), x)}),
by=species]
Note that your code
ifelse(ir$Sepal.Width == 0, sample(ir$Sepal.Width != 0), ir$Sepal.Width)
Is sampling from values of TRUE and FALSE because you are not subsetting with this logical operation ir$Sepal.Width != 0 - you need
ifelse(ir$Sepal.Width == 0, sample(ir$Sepal.Width[ir$Sepal.Width != 0]), ir$Sepal.Width)
I am writing a function that uses a dataframe as filtering criteria for a big dataframe containing model outputs. These are the filtering criteria (as a df):
parameter value
1 alpha 0.1
2 beta 0.1
3 eta 0.1
4 zeta 0.1
5 lambda 0.5
6 phi 5.0
7 kappa 1.0
dput(values)
structure(list(parameter = structure(c(1L, 2L, 3L, 7L, 5L, 6L,
4L), .Label = c("alpha", "beta", "eta", "kappa", "lambda", "phi",
"zeta"), class = "factor"), value = c(0.1, 0.1, 0.1, 0.1, 0.5,
5, 1)), class = "data.frame", row.names = c(NA, -7L))
And this is how the 'outputs' df looks like:
time w x y z alpha beta eta zeta lambda phi kappa
1 0.0 10.00000 10.00000 10.000000 10.000000 0.1 0.1 0.1 0.1 0.95 5 1
1.1 0.1 10.00572 11.04680 9.896057 9.054394 0.1 0.1 0.1 0.1 0.95 5 1
1.2 0.2 10.01983 12.17827 9.592536 8.215338 0.1 0.1 0.1 0.1 0.95 5 1
1.3 0.3 10.04010 13.37290 9.112223 7.483799 0.1 0.1 0.1 0.1 0.95 5 1
1.4 0.4 10.06377 14.60353 8.489174 6.855626 0.1 0.1 0.1 0.1 0.95 5 1
1.5 0.5 10.08778 15.83982 7.764470 6.323152 0.1 0.1 0.1 0.1 0.95 5 1
dput(outputs)
structure(list(time = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 276.5, 276.6,
276.7, 276.8, 276.9, 276.961144437566), w = c(10, 10.0057192322758,
10.0198266325956, 10.040096099625, 10.0637654242843, 10.087779652849,
-1.71585943177118, -2.04004317987084, -2.56315700921588, -3.56775247519687,
-6.37643561014456, -13.828470036737), x = c(10, 11.0467963604334,
12.1782709261765, 13.3728962503142, 14.6035317074526, 15.8398164069251,
27.2774474452024, 26.3099862348669, 24.8705756934881, 22.3379071188018,
15.8960461541267, 3.62452931346518e-144), y = c(10, 9.89605687874935,
9.59253574727296, 9.11222320249057, 8.48917353431654, 7.76447036695841,
-0.604572230605542, -0.878231815857628, -1.46586965791714, -3.20623046085508,
-14.9365932475767, -3.30552834129368e+146), z = c(10, 9.05439359565339,
8.21533762023494, 7.48379901688836, 6.85562632179817, 6.3231517466183,
42.3149654949179, 43.8836626616462, 46.4372543252026, 51.7183454733949,
72.7027555440752, 3.30552834129368e+146), alpha = c(0.1, 0.1,
0.1, 0.1, 0.1, 0.1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5), beta = c(0.1,
0.1, 0.1, 0.1, 0.1, 0.1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5), eta = c(0.1,
0.1, 0.1, 0.1, 0.1, 0.1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5), zeta = c(0.1,
0.1, 0.1, 0.1, 0.1, 0.1, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9), lambda = c(0.9,
0.9, 0.5, 0.5, 0.9, 0.9, 0.5, 0.9, 0.5, 0.9, 0.5, 0.5
), phi = c(5, 5, 5, 5, 5, 5, 20, 20, 20, 20, 20, 20), kappa = c(1,
1, 1, 1, 1, 1, 10, 10, 10, 10, 10, 10), ode_outputs..iteration.. = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c("1",
"1.1", "1.2", "1.3", "1.4", "1.5", "2916.2765", "2916.2766",
"2916.2767", "2916.2768", "2916.2769", "2916.2770"), class = "data.frame")
So it should be something like:
filtered_outputs <- outputs %>% filter(all rows in column 1 == all values in column 2)
The names under the 'parameter' column correspond to column names in the 'outputs' df. I'd like this to be not hard-coded, so that I can feed in any filtering criteria as a df and the function will filter 'outputs'. I'd like to use dplyr or baseR preferably.
So you want to select all the rows in outputs dataframe which matches the values in values dataframe?
Here is a base R approach using sweep and rowSums.
result <- outputs[rowSums(sweep(outputs[as.character(values$parameter)], 2,
values$value, `!=`)) == 0, ]
result
# time w x y z alpha beta eta zeta lambda phi kappa
#1.2 0.2 10.01983 12.17827 9.592536 8.215338 0.1 0.1 0.1 0.1 0.5 5 1
#1.3 0.3 10.04010 13.37290 9.112223 7.483799 0.1 0.1 0.1 0.1 0.5 5 1
# ode_outputs..iteration..
#1.2 NA
#1.3 NA
A possible dplyr and tidyr solution:
Create a helper data frame by turning the values data frame into wide format, and apply a semi-join to filter by the required conditions.
You could easily wrap this up in one continuous workflow but I think it's easier to understand in separate steps.
library(dplyr)
library(tidyr)
conditions <-
values %>%
pivot_wider(names_from = parameter, values_from = value)
outputs %>%
semi_join(conditions)
#> Joining, by = c("alpha", "beta", "eta", "zeta", "lambda", "phi", "kappa")
#> time w x y z alpha beta eta zeta lambda phi
#> 1.2 0.2 10.01983 12.17827 9.592536 8.215338 0.1 0.1 0.1 0.1 0.5 5
#> 1.3 0.3 10.04010 13.37290 9.112223 7.483799 0.1 0.1 0.1 0.1 0.5 5
#> kappa ode_outputs..iteration..
#> 1.2 1 NA
#> 1.3 1 NA
Created on 2021-07-08 by the reprex package (v2.0.0)
I often find these kind of things are easier when the data is in long-form format - although this is just preference:
outputs %>%
tidyr::pivot_longer(
cols = -c(time, w, x, y, z, ode_outputs..iteration..),
names_to="parameter", values_to="value_truth"
) %>%
dplyr::left_join(filter_df) %>%
dplyr::group_by(time) %>%
dplyr::filter(all(value == value_truth)) %>%
dplyr::select(-value) %>%
tidyr::pivot_wider(
names_from="parameter",
values_from="value_truth"
)
Output:
# A tibble: 2 x 13
# Groups: time [2]
time w x y z ode_outputs..iteration.. alpha beta eta zeta lambda phi kappa
<dbl> <dbl> <dbl> <dbl> <dbl> <lgl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.2 10.0 12.2 9.59 8.22 NA 0.1 0.1 0.1 0.1 0.5 5 1
2 0.3 10.0 13.4 9.11 7.48 NA 0.1 0.1 0.1 0.1 0.5 5 1
Data:
outputs = structure(list(time = c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 276.5, 276.6,
276.7, 276.8, 276.9, 276.961144437566), w = c(10, 10.0057192322758,
10.0198266325956, 10.040096099625, 10.0637654242843, 10.087779652849,
-1.71585943177118, -2.04004317987084, -2.56315700921588, -3.56775247519687,
-6.37643561014456, -13.828470036737), x = c(10, 11.0467963604334,
12.1782709261765, 13.3728962503142, 14.6035317074526, 15.8398164069251,
27.2774474452024, 26.3099862348669, 24.8705756934881, 22.3379071188018,
15.8960461541267, 3.62452931346518e-144), y = c(10, 9.89605687874935,
9.59253574727296, 9.11222320249057, 8.48917353431654, 7.76447036695841,
-0.604572230605542, -0.878231815857628, -1.46586965791714, -3.20623046085508,
-14.9365932475767, -3.30552834129368e+146), z = c(10, 9.05439359565339,
8.21533762023494, 7.48379901688836, 6.85562632179817, 6.3231517466183,
42.3149654949179, 43.8836626616462, 46.4372543252026, 51.7183454733949,
72.7027555440752, 3.30552834129368e+146), alpha = c(0.1, 0.1,
0.1, 0.1, 0.1, 0.1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5), beta = c(0.1,
0.1, 0.1, 0.1, 0.1, 0.1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5), eta = c(0.1,
0.1, 0.1, 0.1, 0.1, 0.1, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5), zeta = c(0.1,
0.1, 0.1, 0.1, 0.1, 0.1, 0.9, 0.9, 0.9, 0.9, 0.9, 0.9), lambda = c(0.9,
0.9, 0.5, 0.5, 0.9, 0.9, 0.5, 0.9, 0.5, 0.9, 0.5, 0.5
), phi = c(5, 5, 5, 5, 5, 5, 20, 20, 20, 20, 20, 20), kappa = c(1,
1, 1, 1, 1, 1, 10, 10, 10, 10, 10, 10), ode_outputs..iteration.. = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c("1",
"1.1", "1.2", "1.3", "1.4", "1.5", "2916.2765", "2916.2766",
"2916.2767", "2916.2768", "2916.2769", "2916.2770"), class = "data.frame")
filter_df = fread(' parameter value
1 alpha 0.1
2 beta 0.1
3 eta 0.1
4 zeta 0.1
5 lambda 0.5
6 phi 5.0
7 kappa 1.0') %>% dplyr::select(-V1)
i got stuck in a problem:
i got this df:
df <- data.frame(station = c("A", "A", "A", "B", "B"),
Initial_height = c(20, 50, 100, 30, 60),
final_height = c(50, 100, 300, 60, 110),
initial_flow = c(0.5, 1.2, 1.9, 0.8, 0.7),
final_Flow = c(1.21, 1.92, 0.805, 0.7, 1))
context: each height has a flow value, but is calculated differently for each line of the data frame.
I would like to compare, for the same station, the flow value where the height is the same.
My perfect data frame:
df.answer <- data.frame(station = c("A", "A", "A", "B", "B"),
Initial_height = c(20, 50, 100, 30, 60),
final_height = c(50, 100, 300, 60, 110),
initial_flow = c(0.5, 1.2, 1.9, 0.8, 0.7),
final_Flow = c(1.21, 1.92, 0.805, 0.7, 1),
diff_flow = c(0.010, 0.020, NA, 0, NA))
NA can be replaced by any other character
EDIT: this can happen:
df <- data.frame(station = c("A", "A", "A", "B", "B"),
Initial_height = c(20, 51, 100, 30, 60),
final_height = c(50, 100, 300, 60, 110),
initial_flow = c(0.5, 1.2, 1.9, 0.8, 0.7),
final_Flow = c(1.21, 1.92, 0.805, 0.7, 1),
diff_flow = c(NA, 0.020, NA, 0, NA)))
at station A, the initial and final values ​​do not match. should return NA
We can subtract the lead i.e next value of 'initial_flow' from 'final_flow after grouping by 'station'
library(dplyr)
out <- df %>%
group_by(station) %>%
mutate(diff_flow = final_Flow - lead(initial_flow)) %>%
ungroup
-output
out
# A tibble: 5 x 6
# station Initial_height final_height initial_flow final_Flow diff_flow
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 A 20 50 0.5 1.21 0.01
#2 A 50 100 1.2 1.92 0.02
#3 A 100 30 1.9 0.805 NA
#4 B 30 60 0.8 0.7 0
#5 B 60 110 0.7 1 NA
In data.table you can use shift to get next row in each group.
library(data.table)
setDT(df)[,diff_flow := final_Flow - shift(initial_flow, type = 'lead'), station]
# station Initial_height final_height initial_flow final_Flow diff_flow
#1: A 20 50 0.5 1.210 0.01
#2: A 50 100 1.2 1.920 0.02
#3: A 100 300 1.9 0.805 NA
#4: B 30 60 0.8 0.700 0.00
#5: B 60 110 0.7 1.000 NA
Context: I am trying to plot crop rotations from transition matrix describing the changes from one crop to another. For those wondering, using transition matrix to represent crop rotations has been published but the reference is not about making the plots; see:
Castellazzi, M.S., Wood, G.A., Burgess, P.J., Morris, J., Conrad,
K.F., Perry, J.N., 2008. A systematic representation of crop
rotations. Agricultural Systems 97, 26–33.
https://doi.org/10.1016/j.agsy.2007.10.006
By stochastic matrix I mean a square matrix representing the transition probability from Ci to Cj in one step. The sum of each row is equal to one.
This comes down to use a stochastic transition matrix as an adjacency matrix to build a weighted and directed graph. This can be done using diagram::plotmat() and igraph::graph_from_data_frame() but I just can't find out how to order vertices and make the edges look "nice".
Expected plot:
The matrix I would like to use looks like this:
> transmat
C1 C2 C3 C4 C5
C1 0 1 0.0 0.0 0
C2 0 0 0.5 0.5 0
C3 0 0 0.0 0.0 1
C4 0 0 0.0 0.0 1
C5 1 0 0.0 0.0 0
> dput(transmat)
structure(list(C1 = c(0, 0, 0, 0, 1), C2 = c(1, 0, 0, 0, 0),
C3 = c(0, 0.5, 0, 0, 0), C4 = c(0, 0.5, 0, 0, 0), C5 = c(0,
0, 1, 1, 0)), class = "data.frame", row.names = c("C1", "C2",
"C3", "C4", "C5"))
Attempt with diagram::plotmat():
## With diagram::plotmat ----------------------------------------------------
plot.new()
plotmat(t(transmat),
pos = c(1,1,2,1), # non-0 count in each line of transmat
curve = 0.3,
absent = 0, # don't connect crops linked by 0
arr.type = "triangle",
arr.pos = 0.6,
box.type = "rect",
box.prop=0.3,
box.lwd=2,
shadow.size = 0,
cex.txt=0.8,
endhead = FALSE)
Output:
Another example with igraph::graph_from_data_frame() and layout = layout_as_tree():
Expected plot:
The associated matrix:
> complex_ex
C1 C2 C3 C4 C5
C1 0.00 1.00 0.0 0.0 0.00
C2 0.00 0.00 0.5 0.5 0.00
C3 0.25 0.25 0.0 0.0 0.50
C4 0.00 0.00 0.0 0.0 1.00
C5 0.75 0.00 0.0 0.0 0.25
> dput(complex_ex)
structure(list(C1 = c(0, 0, 0.25, 0, 0.75), C2 = c(1, 0, 0.25,
0, 0), C3 = c(0, 0.5, 0, 0, 0), C4 = c(0, 0.5, 0, 0, 0), C5 = c(0,
0, 0.5, 1, 0.25)), class = "data.frame", row.names = c("C1",
"C2", "C3", "C4", "C5"))
The code I used:
# Some data transformation to make it work with igraph::graph_from_data_frame()
df_transmat <- as.data.frame(complex_ex)
df_transmat$from <- rownames(df_transmat)
df_transmat <- reshape(df_transmat,
idvar = "from",
varying = colnames(df_transmat)[1:(ncol(df_transmat)-1)],
times = colnames(df_transmat)[1:(ncol(df_transmat)-1)],
timevar = "to",
v.names = "change",
direction = "long")
rownames(df_transmat) <- NULL
rotation <- subset(df_transmat, subset = df_transmat$change != 0)
# The plot
g1 <- graph_from_data_frame(rotation, directed = TRUE)
plot(g1,
layout = layout_as_tree(g1, root = "C1"),
edge.arrow.mode = 2,
edge.arrow.size = 0.5,
edge.curved = 0,
edge.width = 0.5,
vertex.label.cex = 1,
vertex.label.font = 2,
vertex.shape = "rectangle",
vertex.color = "white",
vertex.size = 50,
vertex.size2 = 30,
vertex.label.dist = 0,
vertex.label.color = "black")
The output:
I have data that looks like this:
time sucrose fructose glucose galactose molasses water
1 5 0.0 0.00 0.0 0.0 0.3 0
2 10 0.3 0.10 0.1 0.0 1.0 0
3 15 0.8 0.20 0.2 0.2 1.4 0
4 20 1.3 0.35 0.7 0.4 2.5 0
5 25 2.2 0.80 1.6 0.5 3.5 0
6 30 3.1 1.00 2.3 0.6 4.5 0
7 35 3.6 1.60 3.1 0.7 5.7 0
8 40 5.1 2.80 4.3 0.7 6.7 0
How can i make a time series plot that uses the time column? They are all increasing values.
I saw this post multiple-time-series-in-one-plot which uses ts.plot to achieve something similar to what i want to show, which is this:
Input data for the table above:
structure(list(time = c(5, 10, 15, 20, 25, 30, 35, 40), sucrose = c(0,
0.3, 0.8, 1.3, 2.2, 3.1, 3.6, 5.1), fructose = c(0, 0.1, 0.2,
0.35, 0.8, 1, 1.6, 2.8), glucose = c(0, 0.1, 0.2, 0.7, 1.6, 2.3,
3.1, 4.3), galactose = c(0, 0, 0.2, 0.4, 0.5, 0.6, 0.7, 0.7),
molasses = c(0.3, 1, 1.4, 2.5, 3.5, 4.5, 5.7, 6.7), water = c(0,
0, 0, 0, 0, 0, 0, 0)), .Names = c("time", "sucrose", "fructose",
"glucose", "galactose", "molasses", "water"), row.names = c(NA,
-8L), class = "data.frame")
It doesn't seem like a ts plot is necessary. Here's how you could do it in base-R:
with(df, plot(time, sucrose, type="n", ylab="contents"))
var <- names(df)[-1]
for(i in var) lines(df$time, df[,i])
The more elegant solution would however be using the 'dplyrandggplot2` package:
df <- df %>%
gather(content, val, -time)
ggplot(df, aes(time, val, col=content)) + geom_line()