The problem
I am having a lot of difficulty using a known value within a function within dplyr. The issue is with the following line. The rest of what follows it is just data that leads to the problematic component.
data <- data %>%
group_by(Group) %>%
bind_cols(as_tibble(rotate2(as.matrix(.)[,1:2], theta = min(.$theta))))
The min(.$theta) is my attempt to try to find the theta value within each group and use it. There is a column in the data created (as shown below) which contains this value. I want to take the value from each group (Group) and use it with rotate2. There are only two groups in the sample below, but the real data has hundreds of groups. What I want to know is: how can I use the existing value for each group (the theta column repeats the same value for each group).
Is there something I can replace min(.$theta) with that would do this? It seems to take data from the entire column, rather than taking the value from each Group individually.
Data to get to the problem
Packages - dplyr, plyr, lava
data <- structure(list(X = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 4.9046,
6.1424, 7.275, 8.5851, 10.0373, 11.9981, 13.7726, 15.0731, 16.0664,
18.1945, 21.2666, 24.2093, 26.7119, 28.8037, 30.7135, 32.1351,
33.1982, 34.2341, 35.7587, 37.2147, 38.4303, 39.625, 40.4596,
42.0938, 42.7428, 42.7593, 43.5085, 43.7419, 43.5989, 44.0841,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -14.845, -11.9052,
-8.7897, -5.8034, -2.6756, 0.3316, 3.4003, 6.5281, 9.6517, 12.804,
15.9861, 19.1769, 22.2929, 25.4089, 28.3392, 31.0054, 33.1847,
35.081, 36.7227, 38.1544, 39.1697, 40.049, 40.9647, 41.5014,
41.8874, 42.1778, 42.3435, 42.2681, 42.3745, 42.4619, NA, NA,
NA, NA), Y = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, -9.9938, -7.4596,
-4.8647, -2.2903, 0.3158, 2.9302, 5.7262, 8.7033, 11.8007, 14.9847,
16.7225, 16.7813, 15.6921, 14.2964, 11.5579, 8.2378, 5.183, 1.5938,
-2.0712, -5.195, -7.1447, -9.0446, -11.1269, -13.0979, -15.3295,
-17.1898, -19.4376, -21.4781, -23.8426, -25.6343, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, 8.0113, 9.1826, 9.838, 10.7908,
11.175, 12.0393, 12.6813, 12.8828, 13.2281, 13.5102, 13.6637,
13.5493, 12.8699, 12.2191, 10.9208, 9.0209, 6.2158, 3.2466, 0.2169,
-2.7807, -6.0439, -9.1262, -11.8684, -14.7779, -17.5825, -20.2452,
-22.807, -25.3519, -27.6105, -29.7536, NA, NA, NA, NA), fan_line = c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L,
16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L, 27L, 28L,
29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L, 40L, 41L,
42L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L,
14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L,
27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L, 36L, 37L, 38L, 39L,
40L, 41L, 42L)), class = "data.frame", row.names = c(NA, -84L
))
data <- data %>% mutate(Group = rep(1:(n()/42), each = 42)) %>% dplyr::group_by(Group) %>%
mutate(start = min(which(!is.na(X))), end = max(which(!is.na(X))), midpoint = round((start+end)/2, digits = 0)) %>% ungroup()
data$start_val_x <- 0
data$end_val_x <- 0
data$start_val_y <- 0
data$end_val_y <- 0
for (i in 1:nrow(data)){
if (data[i, "fan_line"] == data[i, "start"]){
data[i, "start_val_x"] = data[i, "X"]
data[i, "start_val_y"] = data[i, "Y"]
}
else{data[i, "start_val_y"] = NA
data[i, "start_val_x"] = NA}
}
for (i in 1:nrow(data)){
if (data[i, "fan_line"] == data[i, "end"]){
data[i, "end_val_x"] = data[i, "X"]
data[i, "end_val_y"] = data[i, "Y"]
}
else{data[i, "end_val_y"] = NA
data[i, "end_val_x"] = NA}
}
data <- data %>% group_by(Group) %>% fill(c(start_val_x, start_val_y), .direction = "down") %>% fill(c(start_val_x, start_val_y), .direction = "up")
data <- data %>% group_by(Group) %>% fill(c(end_val_x, end_val_y), .direction = "down") %>% fill(c(end_val_x, end_val_y), .direction = "up")
data <- data %>% group_by(Group) %>% mutate(theta = max(atan(diff(c(start_val_y, end_val_y))/diff(c(start_val_x, end_val_x))), na.rm = T))
data <- data %>% group_by(Group) %>% bind_cols(as_tibble(rotate2(as.matrix(.)[,1:2], theta = min(.$theta))))
We could use group_modify. However, I'm not sure if the outcome below is what you are looking for.
In a normal dplyr pipeline we could use cur_data() to access the data of each group. This is not possible here, because we are inside a non-dplyr function. For this case we have group_map (which returns a list) and group_modify (which returns a grouped tibble as long as each output is a data.frame). We can use a lambda function and here .x is our grouped data.
library(tidyverse)
library(lava)
data %>%
group_by(Group) %>%
group_modify(~ as_tibble(rotate2(as.matrix(.x)[,1:2], theta = min(.x$theta))))
#> Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if `.name_repair` is omitted as of tibble 2.0.0.
#> Using compatibility `.name_repair`.
#> # A tibble: 84 x 3
#> # Groups: Group [2]
#> Group V1 V2
#> <int> <dbl> <dbl>
#> 1 1 NA NA
#> 2 1 NA NA
#> 3 1 NA NA
#> 4 1 NA NA
#> 5 1 NA NA
#> 6 1 NA NA
#> 7 1 NA NA
#> 8 1 NA NA
#> 9 1 NA NA
#> 10 1 8.26 -7.46
#> # … with 74 more rows
Created on 2021-04-13 by the reprex package (v0.3.0)
I'm working at a dataset as follows:
structure(list(date = structure(1:24, .Label = c("2010Y1-01m",
"2010Y1-02m", "2010Y1-03m", "2010Y1-04m", "2010Y1-05m", "2010Y1-06m",
"2010Y1-07m", "2010Y1-08m", "2010Y1-09m", "2010Y1-10m", "2010Y1-11m",
"2010Y1-12m", "2011Y1-01m", "2011Y1-02m", "2011Y1-03m", "2011Y1-04m",
"2011Y1-05m", "2011Y1-06m", "2011Y1-07m", "2011Y1-08m", "2011Y1-09m",
"2011Y1-10m", "2011Y1-11m", "2011Y1-12m"), class = "factor"),
a = structure(c(1L, 18L, 19L, 20L, 22L, 23L, 2L, 4L, 5L,
7L, 8L, 10L, 1L, 21L, 3L, 6L, 9L, 11L, 12L, 13L, 14L, 15L,
16L, 17L), .Label = c("--", "10159.28", "10295.69", "10580.82",
"10995.65", "11245.84", "11327.23", "11621.99", "12046.63",
"12139.78", "12848.27", "13398.26", "13962.6", "14559.72",
"14982.58", "15518.64", "15949.87", "7363.45", "8237.71",
"8830.99", "9309.47", "9316.56", "9795.77"), class = "factor"),
b = structure(c(1L, 15L, 22L, 23L, 3L, 5L, 6L, 8L, 9L, 11L,
13L, 16L, 1L, 21L, 2L, 4L, 7L, 10L, 12L, 14L, 17L, 18L, 19L,
20L), .Label = c("--", "1058.18", "1455.6", "1539.01", "1867.07",
"2036.92", "2102.23", "2372.84", "2693.96", "2769.65", "2973.04",
"3146.88", "3227.23", "3604.71", "365.07", "3678.01", "4043.18",
"4438.55", "4860.76", "5360.94", "555.51", "653.19", "980.72"
), class = "factor")), class = "data.frame", row.names = c(NA,
-24L))
I'm trying to calculate yearly_pct_change for column a and b, so firstly, I replace -- in a and b with NA, then convert date column,the code I have used:
df[df == "--"] <- NA
df$date <- as.Date(paste0(df$date, '-01'), '%YY1-%mm-%d')
df %>%
# mutate(date = lubridate::ymd(paste0(date, '-01'))) %>%
mutate(ratio_a = round((a / lag(a, 12) - 1)*100, 2),
ratio_b = round((b / lag(b, 12) - 1)*100, 2))
In the final result, ratio_a and ratio_b are all NAs.
But with data as belows I manipulated in excel by replacing -- into space, it works:
structure(list(date = structure(1:24, .Label = c("2010Y1-01m",
"2010Y1-02m", "2010Y1-03m", "2010Y1-04m", "2010Y1-05m", "2010Y1-06m",
"2010Y1-07m", "2010Y1-08m", "2010Y1-09m", "2010Y1-10m", "2010Y1-11m",
"2010Y1-12m", "2011Y1-01m", "2011Y1-02m", "2011Y1-03m", "2011Y1-04m",
"2011Y1-05m", "2011Y1-06m", "2011Y1-07m", "2011Y1-08m", "2011Y1-09m",
"2011Y1-10m", "2011Y1-11m", "2011Y1-12m"), class = "factor"),
a = c(NA, 7363.45, 8237.71, 8830.99, 9316.56, 9795.77, 10159.28,
10580.82, 10995.65, 11327.23, 11621.99, 12139.78, NA, 9309.47,
10295.69, 11245.84, 12046.63, 12848.27, 13398.26, 13962.6,
14559.72, 14982.58, 15518.64, 15949.87), b = c(NA, 365.07,
653.19, 980.72, 1455.6, 1867.07, 2036.92, 2372.84, 2693.96,
2973.04, 3227.23, 3678.01, NA, 555.51, 1058.18, 1539.01,
2102.23, 2769.65, 3146.88, 3604.71, 4043.18, 4438.55, 4860.76,
5360.94)), class = "data.frame", row.names = c(NA, -24L))
Does someone could help me to figure out why my code above give NAs for ratio columns? Thanks.
Your data has factors, try to convert them to number.
library(dplyr)
df[df == "--"] <- NA
df$date <- as.Date(paste0(df$date, '-01'), '%YY1-%mm-%d')
df %>%
type.convert() %>%
mutate(ratio_a = round((a / lag(a, 12) - 1)*100, 2),
ratio_b = round((b / lag(b, 12) - 1)*100, 2))
I have a data like this
df<- structure(list(sname = structure(2:1, .Label = c("Carrot", "Melon"
), class = "factor"), sence = structure(1:2, .Label = c("RSNSNASSAVSTSCVSNRAMKGTTHYDTS",
"TGMRHGGMVSVCMCVVDDNRRRHYNGAYDDHHRGGVCTS"), class = "factor")), class = "data.frame", row.names = c(NA,
-2L))
Lets look at the first row
Melon RSNSNASSAVSTSCVSNRAMKGTTHYDTS
I want to be able to chop the strings into different windows as well as moving in different pattern. for example lets say moving 1 letter at the time and windows of 10. so The first output will be like this
RSNSNASSAV
So this one is letter 1 ,2,3,4,5,6,7,8,9,10
The second one will be moving 1 letter forward and then chop for 10 letters
SNSNASSAVS
so this is letter 2,3,4,5,6,7,8,9,10,11
it goes until the end.
a requested output is like the following
output<- structure(list(position = structure(c(33L, 1L, 12L, 23L, 26L,
27L, 28L, 29L, 30L, 31L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
11L, 13L, 32L, 1L, 12L, 23L, 26L, 27L, 28L, 29L, 30L, 31L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 13L, 14L, 15L, 16L, 17L,
18L, 19L, 20L, 21L, 22L, 24L, 25L), .Label = c("1,2,3,4,5,6,7,8,9,10",
"10,11,12,13,14,15,16,17,18,19", "11,12,13,14,15,16,17,18,20",
"12,13,14,15,16,17,18,19,20,21", "13,14,15,16,17,18,19,20,21,22",
"14,15,16,17,18,19,20,21,22,23", "15,16,17,18,19,20,21,22,23,24",
"16,17,18,19,20,21,22,23,24,25", "17,18,19,20,21,22,23,24,25,26",
"18,19,20,21,22,23,24,25,26,27", "19,20,21,22,23,24,25,26,27,28",
"2,3,4,5,6,7,8,9,10,11", "20,21,22,23,24,25,26,27,28,29", "21,22,23,24,25,26,27,28,29,30",
"22,23,24,25,26,27,28,29,30,31", "23,24,25,26,27,28,29,30,31,32",
"24,25,26,27,28,29,30,31,32,33", "25,26,27,28,29,30,31,32,33,34",
"26,27,28,29,30,31,32,33,34,35", "27,28,29,30,31,32,33,34,35,36",
"28,29,30,31,32,33,34,35,36,37", "29,30,31,32,33,34,35,36,37,38",
"3,4,5,6,7,8,9,10,11,12", "30,31,32,33,34,35,36,37,38,39", "31,32,33,34,35,36,37,38,39,40",
"4,5,6,7,8,9,10,11,12,13", "5,6,7,8,9,10,11,12,13,14", "6,7,8,9,10,11,12,14,15",
"7,8,9,10,11,12,13,14,15,16", "8,9,10,11,12,13,14,15,16,17",
"9,10,11,12,13,14,15,16,17,18", "Carrot", "Melon"), class = "factor"),
name = structure(c(20L, 32L, 37L, 26L, 35L, 35L, 2L, 38L,
33L, 3L, 46L, 39L, 42L, 34L, 7L, 45L, 36L, 24L, 27L, 1L,
21L, 5L, 41L, 15L, 22L, 28L, 17L, 14L, 16L, 23L, 47L, 40L,
43L, 6L, 19L, 8L, 19L, 8L, 48L, 44L, 10L, 12L, 25L, 31L,
30L, 29L, 18L, 50L, 13L, 4L, 49L, 9L, 11L), .Label = c("AMKGTTHYDT",
"ASSAVSTSCV", "AVSTSCVSNR", "AYDDHHRGGV", "Carrot", "CMCVVDDNRR",
"CVSNRAMKGT", "CVVDDNRRRH", "DDHHRGGVCT", "DDNRRRHYNG", "DHHRGGVCTS",
"DNRRRHYNGA", "GAYDDHHRGG", "GGMVSVCMCV", "GMRHGGMVSV", "GMVSVCMCVV",
"HGGMVSVCMC", "HYNGAYDDHH", "MCVVDDNRRR", "Melon", "MKGTTHYDTS",
"MRHGGMVSVC", "MVSVCMCVVD", "NRAMKGTTHY", "NRRRHYNGAY", "NSNASSAVST",
"RAMKGTTHYD", "RHGGMVSVCM", "RHYNGAYDDH", "RRHYNGAYDD", "RRRHYNGAYD",
"RSNSNASSAV", "SAVSTSCVSN", "SCVSNRAMKG", "SNASSAVSTS", "SNRAMKGTTH",
"SNSNASSAVS", "SSAVSTSCVS", "STSCVSNRAM", "SVCMCVVDDN", "TGMRHGGMVS",
"TSCVSNRAMK", "VCMCVVDDNR", "VDDNRRRHYN", "VSNRAMKGTT", "VSTSCVSNRA",
"VSVCMCVVDD", "VVDDNRRRHY", "YDDHHRGGVC", "YNGAYDDHHR"), class = "factor")), class = "data.frame", row.names = c(NA,
-53L))
Split with 2
RSNSNASSAV
NSNASSAVST
NASSAVSTSC
SSAVSTSCVS
AVSTSCVSNR
STSCVSNRAM
SCVSNRAMKG
VSNRAMKGTT
NRAMKGTTHY
AMKGTTHYDT
KGTTHYDTS
We convert the factor columns to character, then transmute to createa tibble of 'position', 'name' by looping over the rows with map, create substrings based on the the split width 'n' and the number of character (nchar) of 'sence', concatenate the 'sname' as the first element and unnest the list output to create a two column dataset
library(tidyverse)
f1 <- function(dat, n, mv = 1) {
dat %>%
mutate_all(as.character) %>%
transmute(out = map2(sence, sname, ~ {
i1 <- seq_len(nchar(.x) - (n -1))
i11 <- seq(i1[1], i1[length(i1)], by = mv)
i2 <- n:nchar(.x)
i22 <- seq(i2[1], i2[length(i2)], by = mv)
tibble(position = c(.y, map2_chr(i11, i22, ~
str_c(seq(.x, .y), collapse=","))),
name = c(.y, substring(.x, i11, i22)))
})) %>%
unnest
}
-testing
- moving window - 1
f1(df, n = 10, mv = 1)
# position name
#1 Melon Melon
#2 1,2,3,4,5,6,7,8,9,10 RSNSNASSAV
#3 2,3,4,5,6,7,8,9,10,11 SNSNASSAVS
#4 3,4,5,6,7,8,9,10,11,12 NSNASSAVST
#5 4,5,6,7,8,9,10,11,12,13 SNASSAVSTS
#6 5,6,7,8,9,10,11,12,13,14 NASSAVSTSC
#7 6,7,8,9,10,11,12,13,14,15 ASSAVSTSCV
#8 7,8,9,10,11,12,13,14,15,16 SSAVSTSCVS
#9 8,9,10,11,12,13,14,15,16,17 SAVSTSCVSN
#10 9,10,11,12,13,14,15,16,17,18 AVSTSCVSNR
#11 10,11,12,13,14,15,16,17,18,19 VSTSCVSNRA
#12 11,12,13,14,15,16,17,18,19,20 STSCVSNRAM
#13 12,13,14,15,16,17,18,19,20,21 TSCVSNRAMK
#14 13,14,15,16,17,18,19,20,21,22 SCVSNRAMKG
#15 14,15,16,17,18,19,20,21,22,23 CVSNRAMKGT
#16 15,16,17,18,19,20,21,22,23,24 VSNRAMKGTT
#17 16,17,18,19,20,21,22,23,24,25 SNRAMKGTTH
#18 17,18,19,20,21,22,23,24,25,26 NRAMKGTTHY
#19 18,19,20,21,22,23,24,25,26,27 RAMKGTTHYD
#20 19,20,21,22,23,24,25,26,27,28 AMKGTTHYDT
#21 20,21,22,23,24,25,26,27,28,29 MKGTTHYDTS
#22 Carrot Carrot
#23 1,2,3,4,5,6,7,8,9,10 TGMRHGGMVS
#24 2,3,4,5,6,7,8,9,10,11 GMRHGGMVSV
#25 3,4,5,6,7,8,9,10,11,12 MRHGGMVSVC
#26 4,5,6,7,8,9,10,11,12,13 RHGGMVSVCM
#27 5,6,7,8,9,10,11,12,13,14 HGGMVSVCMC
#28 6,7,8,9,10,11,12,13,14,15 GGMVSVCMCV
#29 7,8,9,10,11,12,13,14,15,16 GMVSVCMCVV
#30 8,9,10,11,12,13,14,15,16,17 MVSVCMCVVD
#31 9,10,11,12,13,14,15,16,17,18 VSVCMCVVDD
#32 10,11,12,13,14,15,16,17,18,19 SVCMCVVDDN
#33 11,12,13,14,15,16,17,18,19,20 VCMCVVDDNR
#34 12,13,14,15,16,17,18,19,20,21 CMCVVDDNRR
#35 13,14,15,16,17,18,19,20,21,22 MCVVDDNRRR
#36 14,15,16,17,18,19,20,21,22,23 CVVDDNRRRH
#37 15,16,17,18,19,20,21,22,23,24 VVDDNRRRHY
#38 16,17,18,19,20,21,22,23,24,25 VDDNRRRHYN
#39 17,18,19,20,21,22,23,24,25,26 DDNRRRHYNG
#40 18,19,20,21,22,23,24,25,26,27 DNRRRHYNGA
#41 19,20,21,22,23,24,25,26,27,28 NRRRHYNGAY
#42 20,21,22,23,24,25,26,27,28,29 RRRHYNGAYD
#43 21,22,23,24,25,26,27,28,29,30 RRHYNGAYDD
#44 22,23,24,25,26,27,28,29,30,31 RHYNGAYDDH
#45 23,24,25,26,27,28,29,30,31,32 HYNGAYDDHH
#46 24,25,26,27,28,29,30,31,32,33 YNGAYDDHHR
#47 25,26,27,28,29,30,31,32,33,34 NGAYDDHHRG
#48 26,27,28,29,30,31,32,33,34,35 GAYDDHHRGG
#49 27,28,29,30,31,32,33,34,35,36 AYDDHHRGGV
#50 28,29,30,31,32,33,34,35,36,37 YDDHHRGGVC
#51 29,30,31,32,33,34,35,36,37,38 DDHHRGGVCT
#52 30,31,32,33,34,35,36,37,38,39 DHHRGGVCTS
-moving window - 2
f1(df, n = 10, mv = 2) %>%
head
# position name
#1 Melon Melon
#2 1,2,3,4,5,6,7,8,9,10 RSNSNASSAV
#3 3,4,5,6,7,8,9,10,11,12 NSNASSAVST
#4 5,6,7,8,9,10,11,12,13,14 NASSAVSTSC
#5 7,8,9,10,11,12,13,14,15,16 SSAVSTSCVS
#6 9,10,11,12,13,14,15,16,17,18 AVSTSCVSNR
-moving window - 3
f1(df, n = 10, mv = 3) %>%
head
# position name
#1 Melon Melon
#2 1,2,3,4,5,6,7,8,9,10 RSNSNASSAV
#3 4,5,6,7,8,9,10,11,12,13 SNASSAVSTS
#4 7,8,9,10,11,12,13,14,15,16 SSAVSTSCVS
#5 10,11,12,13,14,15,16,17,18,19 VSTSCVSNRA
#6 13,14,15,16,17,18,19,20,21,22 SCVSNRAMKG
-moving window - 4
f1(df, n = 10, mv = 4) %>%
head
# position name
#1 Melon Melon
#2 1,2,3,4,5,6,7,8,9,10 RSNSNASSAV
#3 5,6,7,8,9,10,11,12,13,14 NASSAVSTSC
#4 9,10,11,12,13,14,15,16,17,18 AVSTSCVSNR
#5 13,14,15,16,17,18,19,20,21,22 SCVSNRAMKG
#6 17,18,19,20,21,22,23,24,25,26 NRAMKGTTHY
library('tidyverse')
# use this function to make the blocks:
make_substrings = function(string, len, label){
# set up the indices
str_len = nchar(string)
indices1 = 1:(str_len-len+1)
indices2 = (len:str_len)
# create the list of indices
position = map2_chr(indices1, indices2, .f = function(x, y){paste(x:y, collapse = ', ')})
# take substrings
name = map2_chr(indices1, indices2, .f = substr, x = string)
# add yoru food labels
header = tibble(position = label,
name = label)
header %>%
bind_rows(tibble(position,
name))
}
# your version had factors
df = df %>%
mutate_all(as.character)
# iterate over all the rows of df:
output = Map(f = make_substrings, string = df$sence, len = 10, label = df$sname) %>%
bind_rows