Related
Sample data:
X_5 X_1 Y alpha_5 alpha_1 beta_5 beta_1
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.21 0.02 0.61 10 5 3 0.01
2 0.01 0.02 0.37 0.4 0.01 0.8 0.5
3 0.02 0.03 0.55 0.01 0.01 0.3 0.99
4 0.04 0.05 0.29 0.01 0.005 0.03 0.55
5 0.11 0.1 -0.08 0.22 0.015 0.01 0.01
6 0.22 0.21 -0.08 0.02 0.03 0.01 0.01
I have a dataset which has columns of some variable of interest, say alpha, beta, and so on. I also have this saved as a character vector. I want to be able to mutate new columns based on these variable names, suffixed with an identifier, using the existing columns in the dataset as part of some transformation, like this:
df %>% mutate(
alpha_new = ((alpha_5-alpha_1) / (X_5-X_1) * Y),
beta_new = ((beta_5-beta_1) / (X_5-X_1) * Y)
)
X_5 X_1 Y alpha_5 alpha_1 beta_5 beta_1 alpha_new beta_new
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.21 0.02 0.61 10 5 3 0.01 16.1 9.60
2 0.01 0.02 0.37 0.4 0.01 0.8 0.5 -14.4 -11.1
3 0.02 0.03 0.55 0.01 0.01 0.3 0.99 0 38.0
4 0.04 0.05 0.29 0.01 0.005 0.03 0.55 -0.145 15.1
5 0.11 0.1 -0.08 0.22 0.015 0.01 0.01 -1.64 0
6 0.22 0.21 -0.08 0.02 0.03 0.01 0.01 0.0800 0
In my real data I have many more columns like this and I'm struggling to implement this in a "tidy" way which isn't hardcoded, what's the best practice for my situation?
Sample code:
structure(
list(
X_5 = c(0.21, 0.01, 0.02, 0.04, 0.11, 0.22),
X_1 = c(0.02,
0.02, 0.03, 0.05, 0.10, 0.21),
Y = c(0.61, 0.37, 0.55, 0.29, -0.08, -0.08),
alpha_5 = c(10, 0.4, 0.01, 0.01, 0.22, 0.02),
alpha_1 = c(5, 0.01, 0.01, 0.005, 0.015, 0.03),
beta_5 = c(3, 0.8, 0.3, 0.03, 0.01, 0.01),
beta_1 = c(0.01, 0.5, 0.99, 0.55, 0.01, 0.01)
),
row.names = c(NA, -6L),
class = c("tbl_df", "tbl", "data.frame")
) -> df
variable_of_interest <- c("alpha", "beta")
Here's another way to approach this with dynamic creation of columns. With map_dfc from purrr you can column-bind new results, creating new column names with bang-bang on left hand side of := operator, and using .data to access column values on right hand side.
library(tidyverse)
bind_cols(
df,
map_dfc(
variable_of_interest,
~ transmute(df, !!paste0(.x, '_new') :=
(.data[[paste0(.x, '_5')]] - .data[[paste0(.x, '_1')]]) /
(X_5 - X_1) * Y)
)
)
Output
X_5 X_1 Y alpha_5 alpha_1 beta_5 beta_1 alpha_new beta_new
1 0.21 0.02 0.61 10.00 5.000 3.00 0.01 16.05263 9.599474
2 0.01 0.02 0.37 0.40 0.010 0.80 0.50 -14.43000 -11.100000
3 0.02 0.03 0.55 0.01 0.010 0.30 0.99 0.00000 37.950000
4 0.04 0.05 0.29 0.01 0.005 0.03 0.55 -0.14500 15.080000
5 0.11 0.10 -0.08 0.22 0.015 0.01 0.01 -1.64000 0.000000
6 0.22 0.21 -0.08 0.02 0.030 0.01 0.01 0.08000 0.000000
Better to pivot the data first
library(dplyr)
library(tidyr)
# your data
df <- structure(list(X_5 = c(0.21, 0.01, 0.02, 0.04, 0.11, 0.22), X_1 = c(0.02,
0.02, 0.03, 0.05, 0.1, 0.21), Y = c(0.61, 0.37, 0.55, 0.29, -0.08,
-0.08), alpha_5 = c(10, 0.4, 0.01, 0.01, 0.22, 0.02), alpha_1 = c(5,
0.01, 0.01, 0.005, 0.015, 0.03), beta_5 = c(3, 0.8, 0.3, 0.03,
0.01, 0.01), beta_1 = c(0.01, 0.5, 0.99, 0.55, 0.01, 0.01)), class = "data.frame", row.names = c(NA,
-6L))
df <- df |> mutate(id = 1:n()) |>
pivot_longer(cols = -c(id, Y, X_5, X_1),
names_to = c("name", ".value"), names_sep="_") |>
mutate(new= (`5` - `1`) / (X_5 - X_1) * Y) |>
pivot_wider(id_cols = id, names_from = "name", values_from = c(`5`,`1`, `new`),
names_glue = "{name}_{.value}", values_fn = sum)
df
#> # A tibble: 6 × 7
#> id alpha_5 beta_5 alpha_1 beta_1 alpha_new beta_new
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 10 3 5 0.01 16.1 9.60
#> 2 2 0.4 0.8 0.01 0.5 -14.4 -11.1
#> 3 3 0.01 0.3 0.01 0.99 0 38.0
#> 4 4 0.01 0.03 0.005 0.55 -0.145 15.1
#> 5 5 0.22 0.01 0.015 0.01 -1.64 0
#> 6 6 0.02 0.01 0.03 0.01 0.0800 0
Created on 2023-02-16 with reprex v2.0.2
Note: if you want to add X_5 and X_1 in the output use id_cols = c(id, X_5, X_1) instead.
I modified your data to create a bit more complicated situation. My hope is that this is close to your real situation. The condition in this idea is that two columns that you wanna pair up stay next to each other. The first job is to collect column names that begin with small letters. Next job is to create a data frame. Here I keep the column names in odd positions
in target in the first column, and ones in even positions in the second column. I was thinking in the same line of Ben; I used map2_dfc to create an output data frame. In this function, I replaced all small letters with X so that I could specify two column names in the original data (i.e., ones starting with X). Then, I did the calculation as you specified. Finally, I created a column name for outcome in the loop. If you want to add the result to the original data, you can run the final line with cbind.
grep(x = names(df), pattern = "[[:lower:]]+_[0-9]+", value = TRUE) -> target
tibble(first_element = target[c(TRUE, FALSE)],
second_element = target[c(FALSE, TRUE)]) -> mydf
map2_dfc(.x = mydf$first_element,
.y = mydf$second_element,
.f = function(x, y) {
sub(x = x, pattern = "[[:lower:]]+", replacement = "X") -> foo1
sub(x = y, pattern = "[[:lower:]]+", replacement = "X") -> foo2
outcome <- ((df[x] - df[y]) / (df[foo1] - df[foo2]) * df["Y"])
names(outcome) <- paste(x,
sub(x = y, pattern = "[[:lower:]]+", replacement = ""),
sep = "")
return(outcome)
}) -> result
cbind(df, result)
# alpha_5_1 alpha_2_6 beta_5_1 beta_3_4
#1 16.05263 0.10736 9.599474 0.27145
#2 -14.43000 0.10730 -11.100000 0.28564
#3 0.00000 0.28710 37.950000 0.50820
#4 -0.14500 0.21576 15.080000 0.64206
#5 -1.64000 -0.06416 0.000000 -0.61352
#6 0.08000 -0.08480 0.000000 -0.25400
DATA
structure(list(
X_5 = c(0.21, 0.01, 0.02, 0.04, 0.11, 0.22),
X_1 = c(0.02,0.02, 0.03, 0.05, 0.10, 0.21),
X_2 = 1:6,
X_6 = 6:11,
X_3 = 21:26,
X_4 = 31:36,
Y = c(0.61, 0.37, 0.55, 0.29, -0.08, -0.08),
alpha_5 = c(10, 0.4, 0.01, 0.01, 0.22, 0.02),
alpha_1 = c(5, 0.01, 0.01, 0.005, 0.015, 0.03),
alpha_2 = c(0.12, 0.55, 0.39, 0.28, 0.99, 0.7),
alpha_6 = 1:6,
beta_5 = c(3, 0.8, 0.3, 0.03, 0.01, 0.01),
beta_1 = c(0.01, 0.5, 0.99, 0.55, 0.01, 0.01),
beta_3 = c(0.55, 0.28, 0.76, 0.86, 0.31, 0.25),
beta_4 = c(5, 8, 10, 23, 77, 32)),
row.names = c(NA, -6L),
class = c("tbl_df", "tbl", "data.frame")) -> df
I am trying to use pivot_longer. However, I am not sure how to use names_sep or names_pattern to solve this.
dat <- tribble(
~group, ~BP, ~HS, ~BB, ~lowerBP, ~upperBP, ~lowerHS, ~upperHS, ~lowerBB, ~upperBB,
"1", 0.51, 0.15, 0.05, 0.16, 0.18, 0.5, 0.52, 0.14, 0.16,
"2.1", 0.67, 0.09, 0.06, 0.09, 0.11, 0.66, 0.68, 0.08, 0.1,
"2.2", 0.36, 0.13, 0.07, 0.12, 0.15, 0.34, 0.38, 0.12, 0.14,
"2.3", 0.09, 0.17, 0.09, 0.13, 0.16, 0.08, 0.11, 0.15, 0.18,
"2.4", 0.68, 0.12, 0.07, 0.12, 0.14, 0.66, 0.69, 0.11, 0.13,
"3", 0.53, 0.15, 0.06, 0.14, 0.16, 0.52, 0.53, 0.15, 0.16)
Desired output (First row from wide data)
group names values lower upper
1 BP 0.51 0.16 0.18
1 HS 0.15 0.5 0.52
1 BB 0.05 0.14 0.16
Here is solution following a similar method that #Fnguyen used but using the newer pivot_longer and pivot_wider construct:
library(dplyr)
library(tidyr)
longer<-pivot_longer(dat, cols=-1, names_pattern = "(.*)(..)$", names_to = c("limit", "name")) %>%
mutate(limit=ifelse(limit=="", "value", limit))
answer <-pivot_wider(longer, id_cols = c(group, name), names_from = limit, values_from = value, names_repair = "check_unique")
Most of the selecting, separating, mutating and renaming is taking place within the pivot function calls.
Update:
This regular expressions "(.*)(..)$" means:
( ) ( ) Look for two parts,
(.*) the first part should have zero or more characters
(..) the second part should have just 2 characters at the “$” end of the string
A data.table version (not sure yet how to retain the original names so that you dont need to post substitute them https://github.com/Rdatatable/data.table/issues/2551):
library(data.table)
df <- data.table(dat)
v <- c("BP","HS","BB")
setnames(df, v, paste0("x",v) )
g <- melt(df, id.vars = "group",
measure.vars = patterns(values = "x" ,
lower = "lower",
upper = "upper"),
variable.name = "names")
g[names==1, names := "BP" ]
g[names==2, names := "HS" ]
g[names==3, names := "BB" ]
group names values lower upper
1: 1 BP 0.51 0.16 0.18
2: 2.1 BP 0.67 0.09 0.11
3: 2.2 BP 0.36 0.12 0.15
4: 2.3 BP 0.09 0.13 0.16
5: 2.4 BP 0.68 0.12 0.14
6: 3 BP 0.53 0.14 0.16
7: 1 HS 0.15 0.50 0.52
8: 2.1 HS 0.09 0.66 0.68
9: 2.2 HS 0.13 0.34 0.38
10: 2.3 HS 0.17 0.08 0.11
11: 2.4 HS 0.12 0.66 0.69
12: 3 HS 0.15 0.52 0.53
13: 1 BB 0.05 0.14 0.16
14: 2.1 BB 0.06 0.08 0.10
15: 2.2 BB 0.07 0.12 0.14
16: 2.3 BB 0.09 0.15 0.18
17: 2.4 BB 0.07 0.11 0.13
18: 3 BB 0.06 0.15 0.16
Based on your example data this solution using dplyr works for me:
library(dplyr)
dat %>%
gather(key, values,-group) %>%
mutate(names = gsub("lower","",gsub("upper","",key))) %>%
separate(key, into = c("key1","key2") ,"[[:upper:]]", perl=T) %>%
mutate(key1 = case_when(key1 == "" ~ "values", TRUE ~ key1)) %>%
select(group,names,key1,values) %>%
rowid_to_column() %>%
spread(key1,values) %>%
select(-rowid) %>%
group_by(group,names) %>%
summarise_all(mean,na.rm = TRUE)
I'd like to add an alternative tidyverse solution drawing from the answer provided by #Dave2e.
Like Dave2e's solution it's a two-step procedure (first rename, then reshape). Instead of reshaping the data twice, I add the prefix "values" to the columns named "BP", "HS", and "BB" using rename_with. This was necessary for getting the column names right when using the .value sentinel in the names_to argument of pivot_longer.
library(dplyr)
library(tidyr)
dat %>%
rename_with(~sub("^(BP|HS|BB)$", "values\\1", .)) %>% # add prefix values
pivot_longer(dat , cols= -1,
names_pattern = "(.*)(BP|HS|BB)$",
names_to = c(".value", "names"))
I have 2 data sets - one is quarterly which I need to match to monthly data. So the values from the quarterly data will be repeated thrice in the final data set. I have created a one quarter sample below but this would need to be repeated for many quarters.
month <- c(1/20, 2/20, 3/20)
rating <- c(0.5,0.6,0.65)
df1 <- cbind(month,rating)
quarter <- c(“q1/20”)
amount <- c(100)
df2 <- cbind(quarter,amount)
My final data set should have the following structure
month <- c(1/20, 2/20, 3/20)
rating <- c(0.5,0.6,0.65)
quarter <- c(“q1/20”, “q1/20”, “q1/20”)
amount <- c(100,100,100)
df3 <- cbind(month, rating, quarter, amount)
In the full quarterly data set (df1), some observations are also monthly so it would maybe be a case of matching the monthly observations by month and quarterly observations by quarter?
Thanks in anticipation.
Assuming you have this data.
head(m.dat)
# month rating
# 1 1/18 0.91
# 2 2/18 0.94
# 3 3/18 0.29
# 4 4/18 0.83
# 5 5/18 0.64
# 6 6/18 0.52
head(q.dat)
# quarter amount
# 1 q1/18 1
# 2 q2/18 21
# 3 q3/18 91
# 4 q4/18 61
# 5 q1/19 38
# 6 q2/19 44
You could match month information to quarters using an assignment matrix qm.
qm <- matrix(c(1:12, paste0("q", rep(1:4, each=3))), 12, 2)
m.dat$quarter <- paste0(qm[match(qm[, 1], gsub("(^\\d*).*", "\\1", m.dat$month)), 2],
"/",
sapply(strsplit(m.dat$month, "/"), `[`, 2))
This enables you to use merge.
res <- merge(m.dat, q.dat, all=TRUE)
head(res)
# quarter month rating amount
# 1 q1/18 1/18 0.91 1
# 2 q1/18 2/18 0.94 1
# 3 q1/18 3/18 0.29 1
# 4 q1/19 1/19 0.93 38
# 5 q1/19 2/19 0.26 38
# 6 q1/19 3/19 0.46 38
Toy data
m.dat <- structure(list(month = c("1/18", "2/18", "3/18", "4/18", "5/18",
"6/18", "7/18", "8/18", "9/18", "10/18", "11/18", "12/18", "1/19",
"2/19", "3/19", "4/19", "5/19", "6/19", "7/19", "8/19", "9/19",
"10/19", "11/19", "12/19", "1/20", "2/20", "3/20", "4/20", "5/20",
"6/20", "7/20", "8/20", "9/20", "10/20", "11/20", "12/20"), rating = c(0.91,
0.94, 0.29, 0.83, 0.64, 0.52, 0.74, 0.13, 0.66, 0.71, 0.46, 0.72,
0.93, 0.26, 0.46, 0.94, 0.98, 0.12, 0.47, 0.56, 0.9, 0.14, 0.99,
0.95, 0.08, 0.51, 0.39, 0.91, 0.45, 0.84, 0.74, 0.81, 0.39, 0.69,
0, 0.83)), class = "data.frame", row.names = c(NA, -36L))
q.dat <- structure(list(quarter = c("q1/18", "q2/18", "q3/18", "q4/18",
"q1/19", "q2/19", "q3/19", "q4/19", "q1/20", "q2/20", "q3/20",
"q4/20"), amount = c(1, 21, 91, 61, 38, 44, 4, 97, 43, 96, 89,
64)), class = "data.frame", row.names = c(NA, -12L))
Assuming that df1 and df2 are the data frames shown in the Note at the end create a yq column of class yearqtr in each and merge on that:
library(zoo)
df1 <- transform(df1, yq = as.yearqtr(month, "%m/%y"))
df2 <- transform(df2, yq = as.yearqtr(quarter, "q%q/%y"))
merge(df1, df2, by = "yq", all = TRUE)
giving:
yq month rating quarter amount
1 2020 Q1 1/20 0.50 q1/20 100
2 2020 Q1 2/20 0.60 q1/20 100
3 2020 Q1 3/20 0.65 q1/20 100
We could also consider converting the month column into a yearmon class column using
as.yearmon .
Note
df1 <- data.frame(month = c("1/20", "2/20", "3/20"), rating = c(0.5,0.6,0.65))
df2 <- data.frame(quarter = "q1/20", amount = 100)
I am trying to use pivot_longer. However, I am not sure how to use names_sep or names_pattern to solve this.
dat <- tribble(
~group, ~BP, ~HS, ~BB, ~lowerBP, ~upperBP, ~lowerHS, ~upperHS, ~lowerBB, ~upperBB,
"1", 0.51, 0.15, 0.05, 0.16, 0.18, 0.5, 0.52, 0.14, 0.16,
"2.1", 0.67, 0.09, 0.06, 0.09, 0.11, 0.66, 0.68, 0.08, 0.1,
"2.2", 0.36, 0.13, 0.07, 0.12, 0.15, 0.34, 0.38, 0.12, 0.14,
"2.3", 0.09, 0.17, 0.09, 0.13, 0.16, 0.08, 0.11, 0.15, 0.18,
"2.4", 0.68, 0.12, 0.07, 0.12, 0.14, 0.66, 0.69, 0.11, 0.13,
"3", 0.53, 0.15, 0.06, 0.14, 0.16, 0.52, 0.53, 0.15, 0.16)
Desired output (First row from wide data)
group names values lower upper
1 BP 0.51 0.16 0.18
1 HS 0.15 0.5 0.52
1 BB 0.05 0.14 0.16
Here is solution following a similar method that #Fnguyen used but using the newer pivot_longer and pivot_wider construct:
library(dplyr)
library(tidyr)
longer<-pivot_longer(dat, cols=-1, names_pattern = "(.*)(..)$", names_to = c("limit", "name")) %>%
mutate(limit=ifelse(limit=="", "value", limit))
answer <-pivot_wider(longer, id_cols = c(group, name), names_from = limit, values_from = value, names_repair = "check_unique")
Most of the selecting, separating, mutating and renaming is taking place within the pivot function calls.
Update:
This regular expressions "(.*)(..)$" means:
( ) ( ) Look for two parts,
(.*) the first part should have zero or more characters
(..) the second part should have just 2 characters at the “$” end of the string
A data.table version (not sure yet how to retain the original names so that you dont need to post substitute them https://github.com/Rdatatable/data.table/issues/2551):
library(data.table)
df <- data.table(dat)
v <- c("BP","HS","BB")
setnames(df, v, paste0("x",v) )
g <- melt(df, id.vars = "group",
measure.vars = patterns(values = "x" ,
lower = "lower",
upper = "upper"),
variable.name = "names")
g[names==1, names := "BP" ]
g[names==2, names := "HS" ]
g[names==3, names := "BB" ]
group names values lower upper
1: 1 BP 0.51 0.16 0.18
2: 2.1 BP 0.67 0.09 0.11
3: 2.2 BP 0.36 0.12 0.15
4: 2.3 BP 0.09 0.13 0.16
5: 2.4 BP 0.68 0.12 0.14
6: 3 BP 0.53 0.14 0.16
7: 1 HS 0.15 0.50 0.52
8: 2.1 HS 0.09 0.66 0.68
9: 2.2 HS 0.13 0.34 0.38
10: 2.3 HS 0.17 0.08 0.11
11: 2.4 HS 0.12 0.66 0.69
12: 3 HS 0.15 0.52 0.53
13: 1 BB 0.05 0.14 0.16
14: 2.1 BB 0.06 0.08 0.10
15: 2.2 BB 0.07 0.12 0.14
16: 2.3 BB 0.09 0.15 0.18
17: 2.4 BB 0.07 0.11 0.13
18: 3 BB 0.06 0.15 0.16
Based on your example data this solution using dplyr works for me:
library(dplyr)
dat %>%
gather(key, values,-group) %>%
mutate(names = gsub("lower","",gsub("upper","",key))) %>%
separate(key, into = c("key1","key2") ,"[[:upper:]]", perl=T) %>%
mutate(key1 = case_when(key1 == "" ~ "values", TRUE ~ key1)) %>%
select(group,names,key1,values) %>%
rowid_to_column() %>%
spread(key1,values) %>%
select(-rowid) %>%
group_by(group,names) %>%
summarise_all(mean,na.rm = TRUE)
I'd like to add an alternative tidyverse solution drawing from the answer provided by #Dave2e.
Like Dave2e's solution it's a two-step procedure (first rename, then reshape). Instead of reshaping the data twice, I add the prefix "values" to the columns named "BP", "HS", and "BB" using rename_with. This was necessary for getting the column names right when using the .value sentinel in the names_to argument of pivot_longer.
library(dplyr)
library(tidyr)
dat %>%
rename_with(~sub("^(BP|HS|BB)$", "values\\1", .)) %>% # add prefix values
pivot_longer(dat , cols= -1,
names_pattern = "(.*)(BP|HS|BB)$",
names_to = c(".value", "names"))
I am looking to create a function that aggregates sale data by many different variables. I am running into a snag with aggregate(by =). Here is my function thus far:
func <- function(x, x2, statfunc) {
PT <- c(1,5,3,5,4,8,3,1,5,6,1,5,5,6,1,2,3,1,5,1)
SH <- c(7,7,3,1,1,1,1,4,4,6,6,7,7,1,1,1,3,2,1,3)
SaleRatio <- c(0.85, 0.92, 0.89, 0.88, 0.86, 1.08, 1.15, 1.03, 0.95, 1.01, 1.36, 0.96, 1.03, 0.95, 0.90, 1.01, 0.96, 0.95, 0.81, 1.29)
study <- data.frame(PT, SH, SaleRatio)
study <- select(study, x2, SaleRatio)
study <- aggregate(study,
by = list(x),
FUN = statfunc)
print(study)
}
When I attempt to run my formula with:
func(x = "study$PT", x2 = "PT", statfunc = median)
I get the error:
Error in aggregate.data.frame(study, by = list(x), FUN = statfunc) :
arguments must have same length
I am expecting this:
Group.1 PT SaleRatio
1 1 1 0.990
2 2 2 1.010
3 3 3 0.960
4 4 4 0.860
5 5 5 0.935
6 6 6 0.980
7 8 8 1.080
The results above are from the exact same formula, only by manually entering the arguments instead of letting the function pass them.
This user provided function will eventually be applied with many different variables and aggregate functions, and on a much larger data set.
Can someone assist?
We can try with tidyverse
library(dplyr)
func <- function(x, x2, statfunc) {
PT <- c(1,5,3,5,4,8,3,1,5,6,1,5,5,6,1,2,3,1,5,1)
SH <- c(7,7,3,1,1,1,1,4,4,6,6,7,7,1,1,1,3,2,1,3)
SaleRatio <- c(0.85, 0.92, 0.89, 0.88, 0.86, 1.08, 1.15, 1.03, 0.95,
1.01, 1.36, 0.96, 1.03, 0.95, 0.90, 1.01, 0.96, 0.95, 0.81, 1.29)
study <- data.frame(PT, SH, SaleRatio)
study %>%
select(x2, SaleRatio) %>%
group_by_at(x) %>%
summarise_all(statfunc)
}
func("PT", "PT", median)
# A tibble: 7 x 2
# PT SaleRatio
# <dbl> <dbl>
#1 1 0.99
#2 2 1.01
#3 3 0.96
#4 4 0.86
#5 5 0.935
#6 6 0.98
#7 8 1.08