How to overlap two different plots in R - r

I need to overlap two different plots. They use the same scale already.
My code for each separated scatterplot look like this.
ggscatter(chemicals, x = "columnB", y = "columnA",
color = "nombre",
palette = "jco",
ellipse = FALSE,
ellipse.type = "convex",
repel = TRUE,
max.overlaps = 10,
font.label = c(6, "plain", "red"))
ggscatter(rivers, x = "V3", y = "V2",
label = rivers$V1,
palette = "jco",
ellipse = FALSE,
ellipse.type = "convex",
repel = FALSE,
max.overlaps = 10,
font.label = c(6, "plain", "blue"))
The first data look like this...
chemicals <- structure(list(columnA = c(0.34526, -0.47491, 1.9717, -1.28922,
-1.3365, -1.06089, -1.35741, -1.03362, 1.33577, 0.26619, -1.33583,
0.56619, -0.84651, 0.52487, -0.44644, 0.33894, 1.33558, -1.36652,
-1.41608, 0.08864, -0.98665, -0.13102, 0.96633, -0.33869, -1.45537,
1.50434, -1.30283, -0.03662, -0.83985, -0.86605, 0.96659, -1.37216,
1.05501, 0.34936, -0.56608, -0.84148, 1.16633, 1.15391, -1.10533,
-0.04087, 1.36684, 0.39588, -0.4166, -0.7338, -1.33663, 1.24798,
0.26939, 0.57514, 0.21976, -0.62348, -1.3341, 0.6696, 1.71274,
0.0337, -1.33959, -0.33319, -0.21368, -0.25305, 0.56606, 0.56665
), columnB = c(0.46696, 0.15238, 0.28205, -1.01343, -0.45548, -0.58032,
-0.03174, -1.86618, 0.37332, 0.33668, 0.3668, 0.67415, -0.0393,
1.21716, 0.06624, 1.4333, 0.42663, 0.33143, 0.33529, -2.66816,
0.76601, 0.06666, 0.86633, 0.59532, -0.33115, -0.76641, 0.06633,
0.50038, -0.11718, 0.28718, -1.84348, -0.2598, -0.37834, 1.82102,
0.66669, 0.56604, -2.17667, -1.86617, 0.67087, -2.2598, -2.06249,
-0.25863, 1.26661, -1.76684, 0.06665, 0.80114, -1.33408, 0.23333,
0.21658, 0.39268, 0.50466, -0.09929, -0.09178, 1.07363, 1.15409,
-0.49409, 1.628, 0.26664, 0.62084, 0.50397)), row.names = c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L,
13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L,
24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L,
35L, 36L, 37L, 38L, 39L, 40L, 41L, 42L, 43L, 44L, 45L,
46L, 47L, 48L, 49L, 50L, 51L, 52L, 53L, 54L, 55L, 56L,
57L, 58L, 59L, 60L), class = "data.frame")
The second data looks like this...
rivers <- structure(list(V1 = structure(c(7L, 5L, 6L, 1L, 3L, 4L, 8L, 2L
), .Label = c("riverA", "riverB", "riverC", "riverD",
"riverE", "riverF", "riverG", "riverH"), class = "factor"),
V2 = structure(c(8L, 7L, 6L, 5L, 4L, 1L, 2L, 3L), .Label = c("-0.800",
"0.021", "0.220", "0.590", "0.999", "0.333", "0.700", "0.850"
), class = "factor"), V3 = structure(c(1L, 3L, 4L, 2L, 7L,
6L, 8L, 5L), .Label = c("-0.028", "-0.011", "-0.078", "-0.4",
"-0.952", "0.275", "0.630", "0.725"), class = "factor")), class = "data.frame", row.names = c(NA,
-8L))
I need to put both of these scatter plots together in one plot.

I don't have ggpubr, but here is a demonstration using ggplot2:
library(dplyr)
library(ggplot2)
rivers %>%
mutate(source = "rivers", across(c(V3,V2), ~ as.numeric(as.character(.)))) %>%
select(source, columnA = V3, columnB = V2) %>%
bind_rows(mutate(chemicals, source = "chemicals")) %>%
ggplot(aes(columnA, columnB)) +
geom_point(aes(color = source))
I'm guessing this should be straight-forward to translate into ggpubr::ggscatter.
The premise of row-binding (via base rbind or dplyr::bind_rows or data.table::rbindlist) is that the number of rows matters not, it's the columns that matter. In the base case, there must be the same number of columns with the same names:
dat1 <- data.frame(a = 1, b = 2)
dat2 <- data.frame(a = 1:2, d = 3:4)
rbind(dat1, dat2)
# Error in match.names(clabs, names(xi)) :
# names do not match previous names
dat2b <- data.frame(a = 1:2, b = 3:4)
rbind(dat1, dat2b)
# a b
# 1 1 2
# 2 1 3
# 3 2 4
Both dplyr::bind_rows and data.table::rbindlist provide wiggle room around this, either by default (former) or with options (latter):
dat2 <- data.frame(a = 1:2, d = 3:4)
dplyr::bind_rows(dat1, dat2)
# a b d
# 1 1 2 NA
# 2 1 NA 3
# 3 2 NA 4
data.table::rbindlist(list(dat1, dat2), use.names = TRUE, fill = TRUE)
# a b d
# <num> <num> <int>
# 1: 1 2 NA
# 2: 1 NA 3
# 3: 2 NA 4
In this case, though, you want to normalize the names, so for one of them you need to change in either or both of them so that they can be aligned/row-bound properly.
FYI, you don't actually have to rename or rbind them to do things the brute-force way in ggplot2, but doing it this way has consequences and limits several other options so it is generally discouraged:
ggplot() +
geom_point(aes(columnA, columnB), color = "red", data = chemicals) +
geom_point(aes(as.numeric(as.character(V3)), as.numeric(as.character(V2))), color = "blue", data = rivers)
... but this doesn't help you adapt the process to ggscatter, so it is doubly not useful. I'll keep it, but don't go down this last path.

Related

How to select specific element from nested dataframes

I have a list of nested data frames and I want to extract the observations of the earliest year, my problem is the first year change with the data frames. the year is either 1992 or 2005.
I want to create a list to stock them, I tried with which, but since there is the same year, observations are repeated, and I want them apart
new_df<- which(df[[i]]==1992 | df[[i]]==2005)
I've tried with ifelse() but I have to do an lm operation after, and it doesn't work. And I can't take only the first rows, because the year are repeated
my code looks like this:
df<- list(a<-data.frame(a_1<-(1992:2015),
a_2<-sample(1:24)),
b<-data.frame(b_1<-(1992:2015),
b_2<-sample(1:24)),
c<-data.frame(c_1<-(2005:2015),
c_2<-sample(1:11)),
d<-data.frame(d_1<-(2005:2015),
d_2<-sample(1:11)))
You can define a function to get the data on one data.frame and loop on the list to extract values.
Below I use map from the purrr package but you can also use lapply and for loops
Please do not use <- when assigning values in a function call (here data.frame() ) because it will mess colnames. = is used in function calls for arguments variables and it's okay to use it. You can read this ;)
df<- list(a<-data.frame(a_1 = (1992:2015),
a_2 = sample(1:24)),
b<-data.frame(b_1 = (1992:2015),
b_2 = sample(1:24)),
c<-data.frame(c_1 = (2005:2015),
c_2 = sample(1:11)),
d<-data.frame(d_1 = (2005:2015),
d_2 = sample(1:11)))
extract_miny <- function(df){
miny <- min(df[,1])
res <- df[df[,1] == miny, 2]
names(res) <- miny
return(res)
}
map(df, extract_miny)
If the data is sorted as the example, you can slice() the first row for the information. Notice the use of = rather than <- in creating a nested dataframe.
library(tidyverse)
df <- list(
a = data.frame(a_1 = (1992:2015),
a_2 = sample(1:24)),
b = data.frame(b_1 = (1992:2015),
b_2 = sample(1:24)),
c = data.frame(c_1 = (2005:2015),
c_2 = sample(1:11)),
d = data.frame(d_1 = (2005:2015),
d_2 = sample(1:11))
)
df %>%
imap_dfr( ~ slice(.x, 1) %>%
set_names(c("year", "value")) %>%
mutate(dataframe = .y) %>%
as_tibble())
# A tibble: 4 x 3
year value dataframe
<int> <int> <chr>
1 1992 19 a
2 1992 2 b
3 2005 1 c
4 2005 5 d
You may subset anonymeously.
lapply(df, \(x) setNames(x[x[[1]] == min(x[[1]]), ], c('year', 'value'))) |> do.call(what=rbind)
# year value
# 1 1992 6
# 2 1992 9
# 3 2005 11
# 4 2005 11
Or maybe better by creating a variable from which sample the value stems from.
Map(`[<-`, df, 'sample', value=letters[seq_along(df)]) |>
lapply(\(x) setNames(x[x[[1]] == min(x[[1]]), ], c('year', 'value', 'sample'))) |>
do.call(what=rbind)
# year value sample
# 1 1992 6 a
# 2 1992 9 b
# 3 2005 11 c
# 4 2005 11 d
Data:
df <- list(structure(list(a_1.....1992.2015. = 1992:2015, a_2....sample.1.24. = c(6L,
18L, 23L, 5L, 7L, 14L, 4L, 10L, 19L, 17L, 15L, 1L, 11L, 22L,
13L, 8L, 20L, 16L, 2L, 3L, 24L, 21L, 9L, 12L)), class = "data.frame", row.names = c(NA,
-24L)), structure(list(b_1.....1992.2015. = 1992:2015, b_2....sample.1.24. = c(9L,
24L, 18L, 8L, 16L, 11L, 13L, 23L, 15L, 20L, 19L, 21L, 12L, 22L,
7L, 3L, 6L, 17L, 2L, 5L, 4L, 10L, 1L, 14L)), class = "data.frame", row.names = c(NA,
-24L)), structure(list(c_1.....2005.2015. = 2005:2015, c_2....sample.1.11. = c(11L,
2L, 5L, 10L, 9L, 6L, 1L, 7L, 3L, 8L, 4L)), class = "data.frame", row.names = c(NA,
-11L)), structure(list(d_1.....2005.2015. = 2005:2015, d_2....sample.1.11. = c(11L,
2L, 5L, 1L, 6L, 9L, 3L, 7L, 10L, 4L, 8L)), class = "data.frame", row.names = c(NA,
-11L)))

R adding columns and data

I have a table with two columns A and B. I want to create a new table with two new columns added: X and Y. These two new columns are to contain data from column A, but every second row from column A. Correspondingly for column X, starting from the first value in column A and from the second value in column A for column Y.
So far, I have been doing it in Excel. But now I need it in R best function form so that I can easily reuse that code. I haven't done this in R yet, so I am asking for help.
Example data:
structure(list(A = c(2L, 7L, 5L, 11L, 54L, 12L, 34L, 14L, 10L,
6L), B = c(3L, 5L, 1L, 21L, 67L, 32L, 19L, 24L, 44L, 37L)), class = "data.frame", row.names = c(NA,
-10L))
Sample result:
structure(list(A = c(2L, 7L, 5L, 11L, 54L, 12L, 34L, 14L, 10L,
6L), B = c(3L, 5L, 1L, 21L, 67L, 32L, 19L, 24L, 44L, 37L), X = c(2L,
NA, 5L, NA, 54L, NA, 34L, NA, 10L, NA), Y = c(NA, 7L, NA, 11L,
NA, 12L, NA, 14L, NA, 6L)), class = "data.frame", row.names = c(NA,
-10L))
It is not a super elegant solution, but it works:
exampleDF <- structure(list(A = c(2L, 7L, 5L, 11L, 54L,
12L, 34L, 14L, 10L, 6L),
B = c(3L, 5L, 1L, 21L, 67L,
32L, 19L, 24L, 44L, 37L)),
class = "data.frame", row.names = c(NA, -10L))
index <- seq(from = 1, to = nrow(exampleDF), by = 2)
exampleDF$X <- NA
exampleDF$X[index] <- exampleDF$A[index]
exampleDF$Y <- exampleDF$A
exampleDF$Y[index] <- NA
You could also make use of the row numbers and the modulo operator:
A simple ifelse way:
library(dplyr)
df |>
mutate(X = ifelse(row_number() %% 2 == 1, A, NA),
Y = ifelse(row_number() %% 2 == 0, A, NA))
Or using pivoting:
library(dplyr)
library(tidyr)
df |>
mutate(name = ifelse(row_number() %% 2 == 1, "X", "Y"),
value = A) |>
pivot_wider()
A function using the first approach could look like:
See comment
xy_fun <- function(data, A = A, X = X, Y = Y) {
data |>
mutate({{X}} := ifelse(row_number() %% 2 == 1, {{A}}, NA),
{{Y}} := ifelse(row_number() %% 2 == 0, {{A}}, NA))
}
xy_fun(df, # Your data
A, # The col to take values from
X, # The column name of the first new column
Y # The column name of the second new column
)
Output:
A B X Y
1 2 3 2 NA
2 7 5 NA 7
3 5 1 5 NA
4 11 21 NA 11
5 54 67 54 NA
6 12 32 NA 12
7 34 19 34 NA
8 14 24 NA 14
9 10 44 10 NA
10 6 37 NA 6
Data stored as df:
df <- structure(list(A = c(2L, 7L, 5L, 11L, 54L, 12L, 34L, 14L, 10L, 6L),
B = c(3L, 5L, 1L, 21L, 67L, 32L, 19L, 24L, 44L, 37L)
),
class = "data.frame",
row.names = c(NA, -10L)
)
I like the #harre approach:
Another approach with base R we could ->
Use R's recycling ability (of a shorter-vector to a longer-vector):
df$X <- df$A
df$Y <- df$B
df$X[c(FALSE, TRUE)] <- NA
df$Y[c(TRUE, FALSE)] <- NA
df
A B X Y
1 2 3 2 NA
2 7 5 NA 5
3 5 1 5 NA
4 11 21 NA 21
5 54 67 54 NA
6 12 32 NA 32
7 34 19 34 NA
8 14 24 NA 24
9 10 44 10 NA
10 6 37 NA 37

Use a pre-existing value within a function in dplyr

The problem
I am having a lot of difficulty using a known value within a function within dplyr. The issue is with the following line. The rest of what follows it is just data that leads to the problematic component.
data <- data %>%
group_by(Group) %>%
bind_cols(as_tibble(rotate2(as.matrix(.)[,1:2], theta = min(.$theta))))
The min(.$theta) is my attempt to try to find the theta value within each group and use it. There is a column in the data created (as shown below) which contains this value. I want to take the value from each group (Group) and use it with rotate2. There are only two groups in the sample below, but the real data has hundreds of groups. What I want to know is: how can I use the existing value for each group (the theta column repeats the same value for each group).
Is there something I can replace min(.$theta) with that would do this? It seems to take data from the entire column, rather than taking the value from each Group individually.
Data to get to the problem
Packages - dplyr, plyr, lava
data <- structure(list(X = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 4.9046,
6.1424, 7.275, 8.5851, 10.0373, 11.9981, 13.7726, 15.0731, 16.0664,
18.1945, 21.2666, 24.2093, 26.7119, 28.8037, 30.7135, 32.1351,
33.1982, 34.2341, 35.7587, 37.2147, 38.4303, 39.625, 40.4596,
42.0938, 42.7428, 42.7593, 43.5085, 43.7419, 43.5989, 44.0841,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -14.845, -11.9052,
-8.7897, -5.8034, -2.6756, 0.3316, 3.4003, 6.5281, 9.6517, 12.804,
15.9861, 19.1769, 22.2929, 25.4089, 28.3392, 31.0054, 33.1847,
35.081, 36.7227, 38.1544, 39.1697, 40.049, 40.9647, 41.5014,
41.8874, 42.1778, 42.3435, 42.2681, 42.3745, 42.4619, NA, NA,
NA, NA), Y = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, -9.9938, -7.4596,
-4.8647, -2.2903, 0.3158, 2.9302, 5.7262, 8.7033, 11.8007, 14.9847,
16.7225, 16.7813, 15.6921, 14.2964, 11.5579, 8.2378, 5.183, 1.5938,
-2.0712, -5.195, -7.1447, -9.0446, -11.1269, -13.0979, -15.3295,
-17.1898, -19.4376, -21.4781, -23.8426, -25.6343, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, 8.0113, 9.1826, 9.838, 10.7908,
11.175, 12.0393, 12.6813, 12.8828, 13.2281, 13.5102, 13.6637,
13.5493, 12.8699, 12.2191, 10.9208, 9.0209, 6.2158, 3.2466, 0.2169,
-2.7807, -6.0439, -9.1262, -11.8684, -14.7779, -17.5825, -20.2452,
-22.807, -25.3519, -27.6105, -29.7536, NA, NA, NA, NA), fan_line = c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L,
16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L,
29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L,
42L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L,
14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L,
27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L,
40L, 41L, 42L)), class = "data.frame", row.names = c(NA, -84L
))
data <- data %>% mutate(Group = rep(1:(n()/42), each = 42)) %>% dplyr::group_by(Group) %>%
mutate(start = min(which(!is.na(X))), end = max(which(!is.na(X))), midpoint = round((start+end)/2, digits = 0)) %>% ungroup()
data$start_val_x <- 0
data$end_val_x <- 0
data$start_val_y <- 0
data$end_val_y <- 0
for (i in 1:nrow(data)){
if (data[i, "fan_line"] == data[i, "start"]){
data[i, "start_val_x"] = data[i, "X"]
data[i, "start_val_y"] = data[i, "Y"]
}
else{data[i, "start_val_y"] = NA
data[i, "start_val_x"] = NA}
}
for (i in 1:nrow(data)){
if (data[i, "fan_line"] == data[i, "end"]){
data[i, "end_val_x"] = data[i, "X"]
data[i, "end_val_y"] = data[i, "Y"]
}
else{data[i, "end_val_y"] = NA
data[i, "end_val_x"] = NA}
}
data <- data %>% group_by(Group) %>% fill(c(start_val_x, start_val_y), .direction = "down") %>% fill(c(start_val_x, start_val_y), .direction = "up")
data <- data %>% group_by(Group) %>% fill(c(end_val_x, end_val_y), .direction = "down") %>% fill(c(end_val_x, end_val_y), .direction = "up")
data <- data %>% group_by(Group) %>% mutate(theta = max(atan(diff(c(start_val_y, end_val_y))/diff(c(start_val_x, end_val_x))), na.rm = T))
data <- data %>% group_by(Group) %>% bind_cols(as_tibble(rotate2(as.matrix(.)[,1:2], theta = min(.$theta))))
We could use group_modify. However, I'm not sure if the outcome below is what you are looking for.
In a normal dplyr pipeline we could use cur_data() to access the data of each group. This is not possible here, because we are inside a non-dplyr function. For this case we have group_map (which returns a list) and group_modify (which returns a grouped tibble as long as each output is a data.frame). We can use a lambda function and here .x is our grouped data.
library(tidyverse)
library(lava)
data %>%
group_by(Group) %>%
group_modify(~ as_tibble(rotate2(as.matrix(.x)[,1:2], theta = min(.x$theta))))
#> Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
#> Using compatibility `.name_repair`.
#> # A tibble: 84 x 3
#> # Groups: Group [2]
#> Group V1 V2
#> <int> <dbl> <dbl>
#> 1 1 NA NA
#> 2 1 NA NA
#> 3 1 NA NA
#> 4 1 NA NA
#> 5 1 NA NA
#> 6 1 NA NA
#> 7 1 NA NA
#> 8 1 NA NA
#> 9 1 NA NA
#> 10 1 8.26 -7.46
#> # … with 74 more rows
Created on 2021-04-13 by the reprex package (v0.3.0)

Replace special characters in multiple columns and calculate yearly pct changes in R

I'm working at a dataset as follows:
structure(list(date = structure(1:24, .Label = c("2010Y1-01m",
"2010Y1-02m", "2010Y1-03m", "2010Y1-04m", "2010Y1-05m", "2010Y1-06m",
"2010Y1-07m", "2010Y1-08m", "2010Y1-09m", "2010Y1-10m", "2010Y1-11m",
"2010Y1-12m", "2011Y1-01m", "2011Y1-02m", "2011Y1-03m", "2011Y1-04m",
"2011Y1-05m", "2011Y1-06m", "2011Y1-07m", "2011Y1-08m", "2011Y1-09m",
"2011Y1-10m", "2011Y1-11m", "2011Y1-12m"), class = "factor"),
a = structure(c(1L, 18L, 19L, 20L, 22L, 23L, 2L, 4L, 5L,
7L, 8L, 10L, 1L, 21L, 3L, 6L, 9L, 11L, 12L, 13L, 14L, 15L,
16L, 17L), .Label = c("--", "10159.28", "10295.69", "10580.82",
"10995.65", "11245.84", "11327.23", "11621.99", "12046.63",
"12139.78", "12848.27", "13398.26", "13962.6", "14559.72",
"14982.58", "15518.64", "15949.87", "7363.45", "8237.71",
"8830.99", "9309.47", "9316.56", "9795.77"), class = "factor"),
b = structure(c(1L, 15L, 22L, 23L, 3L, 5L, 6L, 8L, 9L, 11L,
13L, 16L, 1L, 21L, 2L, 4L, 7L, 10L, 12L, 14L, 17L, 18L, 19L,
20L), .Label = c("--", "1058.18", "1455.6", "1539.01", "1867.07",
"2036.92", "2102.23", "2372.84", "2693.96", "2769.65", "2973.04",
"3146.88", "3227.23", "3604.71", "365.07", "3678.01", "4043.18",
"4438.55", "4860.76", "5360.94", "555.51", "653.19", "980.72"
), class = "factor")), class = "data.frame", row.names = c(NA,
-24L))
I'm trying to calculate yearly_pct_change for column a and b, so firstly, I replace -- in a and b with NA, then convert date column,the code I have used:
df[df == "--"] <- NA
df$date <- as.Date(paste0(df$date, '-01'), '%YY1-%mm-%d')
df %>%
# mutate(date = lubridate::ymd(paste0(date, '-01'))) %>%
mutate(ratio_a = round((a / lag(a, 12) - 1)*100, 2),
ratio_b = round((b / lag(b, 12) - 1)*100, 2))
In the final result, ratio_a and ratio_b are all NAs.
But with data as belows I manipulated in excel by replacing -- into space, it works:
structure(list(date = structure(1:24, .Label = c("2010Y1-01m",
"2010Y1-02m", "2010Y1-03m", "2010Y1-04m", "2010Y1-05m", "2010Y1-06m",
"2010Y1-07m", "2010Y1-08m", "2010Y1-09m", "2010Y1-10m", "2010Y1-11m",
"2010Y1-12m", "2011Y1-01m", "2011Y1-02m", "2011Y1-03m", "2011Y1-04m",
"2011Y1-05m", "2011Y1-06m", "2011Y1-07m", "2011Y1-08m", "2011Y1-09m",
"2011Y1-10m", "2011Y1-11m", "2011Y1-12m"), class = "factor"),
a = c(NA, 7363.45, 8237.71, 8830.99, 9316.56, 9795.77, 10159.28,
10580.82, 10995.65, 11327.23, 11621.99, 12139.78, NA, 9309.47,
10295.69, 11245.84, 12046.63, 12848.27, 13398.26, 13962.6,
14559.72, 14982.58, 15518.64, 15949.87), b = c(NA, 365.07,
653.19, 980.72, 1455.6, 1867.07, 2036.92, 2372.84, 2693.96,
2973.04, 3227.23, 3678.01, NA, 555.51, 1058.18, 1539.01,
2102.23, 2769.65, 3146.88, 3604.71, 4043.18, 4438.55, 4860.76,
5360.94)), class = "data.frame", row.names = c(NA, -24L))
Does someone could help me to figure out why my code above give NAs for ratio columns? Thanks.
Your data has factors, try to convert them to number.
library(dplyr)
df[df == "--"] <- NA
df$date <- as.Date(paste0(df$date, '-01'), '%YY1-%mm-%d')
df %>%
type.convert() %>%
mutate(ratio_a = round((a / lag(a, 12) - 1)*100, 2),
ratio_b = round((b / lag(b, 12) - 1)*100, 2))

how can I chop strings in a consecutive region with various moving window

I have a data like this
df<- structure(list(sname = structure(2:1, .Label = c("Carrot", "Melon"
), class = "factor"), sence = structure(1:2, .Label = c("RSNSNASSAVSTSCVSNRAMKGTTHYDTS",
"TGMRHGGMVSVCMCVVDDNRRRHYNGAYDDHHRGGVCTS"), class = "factor")), class = "data.frame", row.names = c(NA,
-2L))
Lets look at the first row
Melon RSNSNASSAVSTSCVSNRAMKGTTHYDTS
I want to be able to chop the strings into different windows as well as moving in different pattern. for example lets say moving 1 letter at the time and windows of 10. so The first output will be like this
RSNSNASSAV
So this one is letter 1 ,2,3,4,5,6,7,8,9,10
The second one will be moving 1 letter forward and then chop for 10 letters
SNSNASSAVS
so this is letter 2,3,4,5,6,7,8,9,10,11
it goes until the end.
a requested output is like the following
output<- structure(list(position = structure(c(33L, 1L, 12L, 23L, 26L,
27L, 28L, 29L, 30L, 31L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
11L, 13L, 32L, 1L, 12L, 23L, 26L, 27L, 28L, 29L, 30L, 31L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 13L, 14L, 15L, 16L, 17L,
18L, 19L, 20L, 21L, 22L, 24L, 25L), .Label = c("1,2,3,4,5,6,7,8,9,10",
"10,11,12,13,14,15,16,17,18,19", "11,12,13,14,15,16,17,18,20",
"12,13,14,15,16,17,18,19,20,21", "13,14,15,16,17,18,19,20,21,22",
"14,15,16,17,18,19,20,21,22,23", "15,16,17,18,19,20,21,22,23,24",
"16,17,18,19,20,21,22,23,24,25", "17,18,19,20,21,22,23,24,25,26",
"18,19,20,21,22,23,24,25,26,27", "19,20,21,22,23,24,25,26,27,28",
"2,3,4,5,6,7,8,9,10,11", "20,21,22,23,24,25,26,27,28,29", "21,22,23,24,25,26,27,28,29,30",
"22,23,24,25,26,27,28,29,30,31", "23,24,25,26,27,28,29,30,31,32",
"24,25,26,27,28,29,30,31,32,33", "25,26,27,28,29,30,31,32,33,34",
"26,27,28,29,30,31,32,33,34,35", "27,28,29,30,31,32,33,34,35,36",
"28,29,30,31,32,33,34,35,36,37", "29,30,31,32,33,34,35,36,37,38",
"3,4,5,6,7,8,9,10,11,12", "30,31,32,33,34,35,36,37,38,39", "31,32,33,34,35,36,37,38,39,40",
"4,5,6,7,8,9,10,11,12,13", "5,6,7,8,9,10,11,12,13,14", "6,7,8,9,10,11,12,14,15",
"7,8,9,10,11,12,13,14,15,16", "8,9,10,11,12,13,14,15,16,17",
"9,10,11,12,13,14,15,16,17,18", "Carrot", "Melon"), class = "factor"),
name = structure(c(20L, 32L, 37L, 26L, 35L, 35L, 2L, 38L,
33L, 3L, 46L, 39L, 42L, 34L, 7L, 45L, 36L, 24L, 27L, 1L,
21L, 5L, 41L, 15L, 22L, 28L, 17L, 14L, 16L, 23L, 47L, 40L,
43L, 6L, 19L, 8L, 19L, 8L, 48L, 44L, 10L, 12L, 25L, 31L,
30L, 29L, 18L, 50L, 13L, 4L, 49L, 9L, 11L), .Label = c("AMKGTTHYDT",
"ASSAVSTSCV", "AVSTSCVSNR", "AYDDHHRGGV", "Carrot", "CMCVVDDNRR",
"CVSNRAMKGT", "CVVDDNRRRH", "DDHHRGGVCT", "DDNRRRHYNG", "DHHRGGVCTS",
"DNRRRHYNGA", "GAYDDHHRGG", "GGMVSVCMCV", "GMRHGGMVSV", "GMVSVCMCVV",
"HGGMVSVCMC", "HYNGAYDDHH", "MCVVDDNRRR", "Melon", "MKGTTHYDTS",
"MRHGGMVSVC", "MVSVCMCVVD", "NRAMKGTTHY", "NRRRHYNGAY", "NSNASSAVST",
"RAMKGTTHYD", "RHGGMVSVCM", "RHYNGAYDDH", "RRHYNGAYDD", "RRRHYNGAYD",
"RSNSNASSAV", "SAVSTSCVSN", "SCVSNRAMKG", "SNASSAVSTS", "SNRAMKGTTH",
"SNSNASSAVS", "SSAVSTSCVS", "STSCVSNRAM", "SVCMCVVDDN", "TGMRHGGMVS",
"TSCVSNRAMK", "VCMCVVDDNR", "VDDNRRRHYN", "VSNRAMKGTT", "VSTSCVSNRA",
"VSVCMCVVDD", "VVDDNRRRHY", "YDDHHRGGVC", "YNGAYDDHHR"), class = "factor")), class = "data.frame", row.names = c(NA,
-53L))
Split with 2
RSNSNASSAV
NSNASSAVST
NASSAVSTSC
SSAVSTSCVS
AVSTSCVSNR
STSCVSNRAM
SCVSNRAMKG
VSNRAMKGTT
NRAMKGTTHY
AMKGTTHYDT
KGTTHYDTS
We convert the factor columns to character, then transmute to createa tibble of 'position', 'name' by looping over the rows with map, create substrings based on the the split width 'n' and the number of character (nchar) of 'sence', concatenate the 'sname' as the first element and unnest the list output to create a two column dataset
library(tidyverse)
f1 <- function(dat, n, mv = 1) {
dat %>%
mutate_all(as.character) %>%
transmute(out = map2(sence, sname, ~ {
i1 <- seq_len(nchar(.x) - (n -1))
i11 <- seq(i1[1], i1[length(i1)], by = mv)
i2 <- n:nchar(.x)
i22 <- seq(i2[1], i2[length(i2)], by = mv)
tibble(position = c(.y, map2_chr(i11, i22, ~
str_c(seq(.x, .y), collapse=","))),
name = c(.y, substring(.x, i11, i22)))
})) %>%
unnest
}
-testing
- moving window - 1
f1(df, n = 10, mv = 1)
# position name
#1 Melon Melon
#2 1,2,3,4,5,6,7,8,9,10 RSNSNASSAV
#3 2,3,4,5,6,7,8,9,10,11 SNSNASSAVS
#4 3,4,5,6,7,8,9,10,11,12 NSNASSAVST
#5 4,5,6,7,8,9,10,11,12,13 SNASSAVSTS
#6 5,6,7,8,9,10,11,12,13,14 NASSAVSTSC
#7 6,7,8,9,10,11,12,13,14,15 ASSAVSTSCV
#8 7,8,9,10,11,12,13,14,15,16 SSAVSTSCVS
#9 8,9,10,11,12,13,14,15,16,17 SAVSTSCVSN
#10 9,10,11,12,13,14,15,16,17,18 AVSTSCVSNR
#11 10,11,12,13,14,15,16,17,18,19 VSTSCVSNRA
#12 11,12,13,14,15,16,17,18,19,20 STSCVSNRAM
#13 12,13,14,15,16,17,18,19,20,21 TSCVSNRAMK
#14 13,14,15,16,17,18,19,20,21,22 SCVSNRAMKG
#15 14,15,16,17,18,19,20,21,22,23 CVSNRAMKGT
#16 15,16,17,18,19,20,21,22,23,24 VSNRAMKGTT
#17 16,17,18,19,20,21,22,23,24,25 SNRAMKGTTH
#18 17,18,19,20,21,22,23,24,25,26 NRAMKGTTHY
#19 18,19,20,21,22,23,24,25,26,27 RAMKGTTHYD
#20 19,20,21,22,23,24,25,26,27,28 AMKGTTHYDT
#21 20,21,22,23,24,25,26,27,28,29 MKGTTHYDTS
#22 Carrot Carrot
#23 1,2,3,4,5,6,7,8,9,10 TGMRHGGMVS
#24 2,3,4,5,6,7,8,9,10,11 GMRHGGMVSV
#25 3,4,5,6,7,8,9,10,11,12 MRHGGMVSVC
#26 4,5,6,7,8,9,10,11,12,13 RHGGMVSVCM
#27 5,6,7,8,9,10,11,12,13,14 HGGMVSVCMC
#28 6,7,8,9,10,11,12,13,14,15 GGMVSVCMCV
#29 7,8,9,10,11,12,13,14,15,16 GMVSVCMCVV
#30 8,9,10,11,12,13,14,15,16,17 MVSVCMCVVD
#31 9,10,11,12,13,14,15,16,17,18 VSVCMCVVDD
#32 10,11,12,13,14,15,16,17,18,19 SVCMCVVDDN
#33 11,12,13,14,15,16,17,18,19,20 VCMCVVDDNR
#34 12,13,14,15,16,17,18,19,20,21 CMCVVDDNRR
#35 13,14,15,16,17,18,19,20,21,22 MCVVDDNRRR
#36 14,15,16,17,18,19,20,21,22,23 CVVDDNRRRH
#37 15,16,17,18,19,20,21,22,23,24 VVDDNRRRHY
#38 16,17,18,19,20,21,22,23,24,25 VDDNRRRHYN
#39 17,18,19,20,21,22,23,24,25,26 DDNRRRHYNG
#40 18,19,20,21,22,23,24,25,26,27 DNRRRHYNGA
#41 19,20,21,22,23,24,25,26,27,28 NRRRHYNGAY
#42 20,21,22,23,24,25,26,27,28,29 RRRHYNGAYD
#43 21,22,23,24,25,26,27,28,29,30 RRHYNGAYDD
#44 22,23,24,25,26,27,28,29,30,31 RHYNGAYDDH
#45 23,24,25,26,27,28,29,30,31,32 HYNGAYDDHH
#46 24,25,26,27,28,29,30,31,32,33 YNGAYDDHHR
#47 25,26,27,28,29,30,31,32,33,34 NGAYDDHHRG
#48 26,27,28,29,30,31,32,33,34,35 GAYDDHHRGG
#49 27,28,29,30,31,32,33,34,35,36 AYDDHHRGGV
#50 28,29,30,31,32,33,34,35,36,37 YDDHHRGGVC
#51 29,30,31,32,33,34,35,36,37,38 DDHHRGGVCT
#52 30,31,32,33,34,35,36,37,38,39 DHHRGGVCTS
-moving window - 2
f1(df, n = 10, mv = 2) %>%
head
# position name
#1 Melon Melon
#2 1,2,3,4,5,6,7,8,9,10 RSNSNASSAV
#3 3,4,5,6,7,8,9,10,11,12 NSNASSAVST
#4 5,6,7,8,9,10,11,12,13,14 NASSAVSTSC
#5 7,8,9,10,11,12,13,14,15,16 SSAVSTSCVS
#6 9,10,11,12,13,14,15,16,17,18 AVSTSCVSNR
-moving window - 3
f1(df, n = 10, mv = 3) %>%
head
# position name
#1 Melon Melon
#2 1,2,3,4,5,6,7,8,9,10 RSNSNASSAV
#3 4,5,6,7,8,9,10,11,12,13 SNASSAVSTS
#4 7,8,9,10,11,12,13,14,15,16 SSAVSTSCVS
#5 10,11,12,13,14,15,16,17,18,19 VSTSCVSNRA
#6 13,14,15,16,17,18,19,20,21,22 SCVSNRAMKG
-moving window - 4
f1(df, n = 10, mv = 4) %>%
head
# position name
#1 Melon Melon
#2 1,2,3,4,5,6,7,8,9,10 RSNSNASSAV
#3 5,6,7,8,9,10,11,12,13,14 NASSAVSTSC
#4 9,10,11,12,13,14,15,16,17,18 AVSTSCVSNR
#5 13,14,15,16,17,18,19,20,21,22 SCVSNRAMKG
#6 17,18,19,20,21,22,23,24,25,26 NRAMKGTTHY
library('tidyverse')
# use this function to make the blocks:
make_substrings = function(string, len, label){
# set up the indices
str_len = nchar(string)
indices1 = 1:(str_len-len+1)
indices2 = (len:str_len)
# create the list of indices
position = map2_chr(indices1, indices2, .f = function(x, y){paste(x:y, collapse = ', ')})
# take substrings
name = map2_chr(indices1, indices2, .f = substr, x = string)
# add yoru food labels
header = tibble(position = label,
name = label)
header %>%
bind_rows(tibble(position,
name))
}
# your version had factors
df = df %>%
mutate_all(as.character)
# iterate over all the rows of df:
output = Map(f = make_substrings, string = df$sence, len = 10, label = df$sname) %>%
bind_rows

Resources