Although this seems similar to an existing question, I'm looking for a "tidy" solution...
Let's look at the following data (they're rock compositions for some chemical elements, if you are curious):
# A tibble: 4 x 15
Rock La Ce Pr Nd Sm Eu Gd Tb Dy Ho Er Tm Yb Lu
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Upper CC 31 63 7.1 27 4.7 1 4 0.7 3.9 0.83 2.3 0.3 1.96 0.31
2 Middle CC 24 53 5.8 25 4.6 1.4 4 0.7 3.8 0.82 2.3 0.32 2.2 0.4
3 Lower CC 8 20 2.4 11 2.8 1.1 3.1 0.48 3.1 0.68 1.9 0.24 1.5 0.25
4 chondrite 0.235 0.603 0.0891 0.452 0.147 0.056 0.197 0.0363 0.243 0.0556 0.159 0.0242 0.162 0.0243
(see the dput output at the end)
This is made of three samples and a reference value (chondrite). I want to normalize the value of each element by the chondrite value, for each sample, i.e. get something like this:
# A tibble: 4 x 15
Rock La Ce Pr Nd Sm Eu Gd Tb Dy Ho Er Tm Yb Lu
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Upper CC 132. 104. 79.7 59.7 32.0 17.9 20.3 19.3 16.0 14.9 14.5 12.4 12.1 12.8
2 Middle CC 102. 87.9 65.1 55.3 31.3 25 20.3 19.3 15.6 14.8 14.5 13.2 13.6 16.5
3 Lower CC 34.0 33.2 26.9 24.3 19.0 19.6 15.7 13.2 12.8 12.2 12.0 9.92 9.26 10.3
4 chondrite 1 1 1 1 1 1 1 1 1 1 1 1 1 1
Here, of course, the first 132 for df["Upper CC", "La"] comes from 31 / 0.235, i.e. df["Upper CC", "La"] / df["chondrite", "La"].
This is trivial in Excel, and can be done in plain R with something along the lines of
apply(df[,-1],1,FUN=function(z){return(z/df[4,-1])})
Give or take some unlist() and other niceties.
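(For reference, a cleaner plain-R sketch of the same idea uses sweep(), picking the reference row out by name rather than by position:)
ref <- unlist(df[df$Rock == "chondrite", -1])  # named vector of chondrite values
df[-1] <- sweep(df[-1], 2, ref, "/")           # divide every numeric column by its reference value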
But how do I do this in tidyverse idiom? I started constructing
df %>% mutate(across( where(is.numeric), ... ? .... ) )
... but could not go further.
Generalized/related question: instead of normalizing by df[4, ], how would I normalize by an arbitrary named vector? (A sketch addressing this follows the first answer below.)
dput(df)
structure(list(Rock = c("Upper CC", "Middle CC", "Lower CC",
"chondrite"), La = c(31, 24, 8, 0.2347), Ce = c(63, 53, 20, 0.6032
), Pr = c(7.1, 5.8, 2.4, 0.0891), Nd = c(27, 25, 11, 0.4524),
Sm = c(4.7, 4.6, 2.8, 0.1471), Eu = c(1, 1.4, 1.1, 0.056),
Gd = c(4, 4, 3.1, 0.1966), Tb = c(0.7, 0.7, 0.48, 0.0363),
Dy = c(3.9, 3.8, 3.1, 0.2427), Ho = c(0.83, 0.82, 0.68, 0.0556
), Er = c(2.3, 2.3, 1.9, 0.1589), Tm = c(0.3, 0.32, 0.24,
0.0242), Yb = c(1.96, 2.2, 1.5, 0.1625), Lu = c(0.31, 0.4,
0.25, 0.0243)), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"))
You can use:
library(dplyr)
df %>% mutate(across(where(is.numeric), ~./.[Rock == "chondrite"]))
# Rock La Ce Pr Nd Sm Eu Gd Tb Dy
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 Upper … 132. 104. 79.7 59.7 32.0 17.9 20.3 19.3 16.1
#2 Middle… 102. 87.9 65.1 55.3 31.3 25.0 20.3 19.3 15.7
#3 Lower … 34.1 33.2 26.9 24.3 19.0 19.6 15.8 13.2 12.8
#4 chondr… 1 1 1 1 1 1 1 1 1
# … with 5 more variables: Ho <dbl>, Er <dbl>, Tm <dbl>,
# Yb <dbl>, Lu <dbl>
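For the generalized question (normalizing by an arbitrary named vector rather than by row 4), a minimal sketch building on the same across() idea, assuming dplyr >= 1.0 for cur_column() and that the vector's names match the element columns:
library(dplyr)
ref <- unlist(df[df$Rock == "chondrite", -1])  # any named numeric vector would work here
df %>% mutate(across(where(is.numeric), ~ .x / ref[[cur_column()]]))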
Using matrix calculations.
m <- t(dat[-1])           # transpose: elements as rows, rocks as columns
dat[-1] <- t(m / m[, 4])  # divide each column by the 4th (chondrite) column, then transpose back
# Rock La Ce Pr Nd Sm Eu Gd Tb Dy Ho Er Tm Yb Lu
# 1 Upper CC 131.91489 104.47761 79.68575 59.73451 31.97279 17.85714 20.30457 19.28375 16.04938 14.92806 14.46541 12.396694 12.098765 12.75720
# 2 Middle CC 102.12766 87.89386 65.09540 55.30973 31.29252 25.00000 20.30457 19.28375 15.63786 14.74820 14.46541 13.223140 13.580247 16.46091
# 3 Lower CC 34.04255 33.16750 26.93603 24.33628 19.04762 19.64286 15.73604 13.22314 12.75720 12.23022 11.94969 9.917355 9.259259 10.28807
# 4 chondrite 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.00000 1.000000 1.000000 1.00000
Data
dat <- structure(list(Rock = c("Upper CC", "Middle CC", "Lower CC",
"chondrite"), La = c(31, 24, 8, 0.235), Ce = c(63, 53, 20, 0.603
), Pr = c(7.1, 5.8, 2.4, 0.0891), Nd = c(27, 25, 11, 0.452),
Sm = c(4.7, 4.6, 2.8, 0.147), Eu = c(1, 1.4, 1.1, 0.056),
Gd = c(4, 4, 3.1, 0.197), Tb = c(0.7, 0.7, 0.48, 0.0363),
Dy = c(3.9, 3.8, 3.1, 0.243), Ho = c(0.83, 0.82, 0.68, 0.0556
), Er = c(2.3, 2.3, 1.9, 0.159), Tm = c(0.3, 0.32, 0.24,
0.0242), Yb = c(1.96, 2.2, 1.5, 0.162), Lu = c(0.31, 0.4,
0.25, 0.0243)), class = "data.frame", row.names = c("1",
"2", "3", "4"))
Using data.table
library(data.table)
# divide every numeric column by its value in the "chondrite" row
setDT(df1)[, (names(df1)[-1]) := lapply(.SD, function(x)
  x / x[match("chondrite", Rock)]), .SDcols = -1]
Related
I'm trying to split a data frame from long to wide format by converting selected rows to columns. Here is the current general long-format structure:
data_long <- data.frame(
id = c("kelp","kelp","fish","fish","beach","beach","kelp","kelp","fish","fish","beach","beach"),
desig = c("mpa","reference","mpa","reference","mpa","reference","mpa","reference","mpa","reference","mpa","reference"),
indicator = c("density","density","density","density","density","density","biomass","biomass","biomass","biomass","biomass","biomass"),
n = c(1118,1118,1118,1118,1118,1118,1118,1118,1118,1118,1118,1118),
m = c(0.35, 4.28, 1.16, 106.35, 13.44,0.63,0.35, 4.28, 1.16, 106.35, 13.44,0.63),
sd = c(1.19, 8.48, 4.25, 118, 31.77,2.79,1.19, 8.48, 4.25, 118, 31.77,2.79)
)
data_long
I want to keep id and indicator, split by "desig", and move "n", "m", and "sd" into new columns. The final data frame structure I'm trying to obtain is:
data_wide <- data.frame(
id = c("kelp","fish","beach","kelp","fish","beach"),
indicator = c("density","density","density","biomass","biomass","biomass"),
mpa.n = c(1118,1118,1118,1118,1118,1118),
mpa.m = c(0.35, 4.28, 1.16, 106.35, 13.44,0.63),
mpa.sd = c(1.19, 8.48, 4.25, 118, 31.77,2.79),
reference.n = c(1118,1118,1118,1118,1118,1118),
reference.m = c(0.35, 4.28, 1.16, 106.35, 13.44,0.63),
reference.sd = c(1.19, 8.48, 4.25, 118, 31.77,2.79)
)
data_wide
I can't seem to get this right using reshape2. Any suggestions?
We may use pivot_wider
library(tidyr)
library(dplyr)
pivot_wider(data_long, names_from = desig,
values_from = c(n, m, sd), names_glue = "{desig}.{.value}") %>%
select(id, indicator, starts_with("mpa"), starts_with('reference'))
-output
# A tibble: 6 × 8
id indicator mpa.n mpa.m mpa.sd reference.n reference.m reference.sd
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 kelp density 1118 0.35 1.19 1118 4.28 8.48
2 fish density 1118 1.16 4.25 1118 106. 118
3 beach density 1118 13.4 31.8 1118 0.63 2.79
4 kelp biomass 1118 0.35 1.19 1118 4.28 8.48
5 fish biomass 1118 1.16 4.25 1118 106. 118
6 beach biomass 1118 13.4 31.8 1118 0.63 2.79
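Since the question mentions reshape2: reshape2::dcast() cannot spread several value columns at once, but data.table::dcast() can. A hedged sketch (the resulting names come out as n_mpa, m_mpa, ... and would need renaming to match the mpa.n style):
library(data.table)
dcast(as.data.table(data_long), id + indicator ~ desig, value.var = c("n", "m", "sd"))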
I need to write a for loop to calculate the product of year variables (e.g. var1874) * price variables (e.g. num1874), creating a new variable for each year and its corresponding price value (e.g. newvar1874).
Here's my data in R
# A tibble: 4 x 7
cty var1874 var1875 var1876 num1874 num1875 num1876
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 0.78 0.83 0.99 2.64 2.8 3.1
2 2 0.69 0.69 0.89 2.3 2.3 2.58
3 3 0.42 0.48 0.59 2.28 2.44 2.64
4 4 0.82 0.94 1.09 2.28 2.36 3
I've been able to do this using the 'foreach' loop in Stata:
local vn 1874 1875 1876
foreach v of local vn {
    gen newvar`v' = var`v'*num`v'
}
Does anyone know how I would do this same type of command using the for loop in R? I know there may be simpler ways to do this without the for loop, but I need to know how to do this using the for loop.
Using a for loop you could do:
vn <- 1874:1876
for (v in vn) d[[paste0("newvar", v)]] <- d[[paste0("var", v)]] * d[[paste0("num", v)]]
d
#> cty var1874 var1875 var1876 num1874 num1875 num1876 newvar1874 newvar1875
#> 1 1 0.78 0.83 0.99 2.64 2.80 3.10 2.0592 2.3240
#> 2 2 0.69 0.69 0.89 2.30 2.30 2.58 1.5870 1.5870
#> 3 3 0.42 0.48 0.59 2.28 2.44 2.64 0.9576 1.1712
#> 4 4 0.82 0.94 1.09 2.28 2.36 3.00 1.8696 2.2184
#> newvar1876
#> 1 3.0690
#> 2 2.2962
#> 3 1.5576
#> 4 3.2700
Or using lapply you could do:
d[, paste0("newvar", vn)] <- lapply(vn, function(v) d[[paste0("var", v)]] * d[[paste0("num", v)]])
DATA
d <- structure(list(
cty = 1:4, var1874 = c(0.78, 0.69, 0.42, 0.82),
var1875 = c(0.83, 0.69, 0.48, 0.94), var1876 = c(
0.99, 0.89,
0.59, 1.09
), num1874 = c(2.64, 2.3, 2.28, 2.28), num1875 = c(
2.8,
2.3, 2.44, 2.36
), num1876 = c(3.1, 2.58, 2.64, 3)
), class = "data.frame", row.names = c(
"1",
"2", "3", "4"
))
cutoff KM KM_lo KM_hi rstm rstm_lo rstm_hi
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2017-01-01 2.1 1.4 4.9 7.2 3.9 10.2
2 2017-04-01 3.5 2.1 4.7 8.9 6.6 10.8
3 2017-07-01 3.7 2.8 4.2 7.2 6.2 8.4
How do I convert this to a long table? I am struggling to get it into the format I want. I tried the gather and melt functions. The output table would look something like this:
cutoff VAR Val Val-hi Val-lo
<chr> <chr> <dbl> <dbl> <dbl>
1 2017-01-01 KM 2.1 4.9 1.4
2 2017-01-01 rstm 7.2 10.2 3.9
3 2017-07-01 KM 3.7 4.2 2.8
Sample data
structure(list(cutoff = c("2017-01-01", "2017-04-01", "2017-07-01"
), KM = c(2.1, 3.5, 3.7), KM_lo = c(1.4, 2.1, 2.8), KM_hi = c(4.9,
4.7, 4.2), rstm = c(7.2, 8.9, 7.2), rstm_lo = c(3.9, 6.6, 6.2
), rstm_hi = c(10.2, 10.8, 8.4)), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame"))
We may do
library(dplyr)
library(tidyr)
library(stringr)
df1 %>%
rename_with(~ str_c(., "_none"), c("KM", "rstm")) %>%
pivot_longer(cols = -cutoff, names_to = c("VAR", ".value"),
names_sep = "_") %>%
rename_with(~ c("Val", "Val-lo", "Val-hi"), 3:5)
-output
# A tibble: 6 × 5
cutoff VAR Val `Val-lo` `Val-hi`
<chr> <chr> <dbl> <dbl> <dbl>
1 2017-01-01 KM 2.1 1.4 4.9
2 2017-01-01 rstm 7.2 3.9 10.2
3 2017-04-01 KM 3.5 2.1 4.7
4 2017-04-01 rstm 8.9 6.6 10.8
5 2017-07-01 KM 3.7 2.8 4.2
6 2017-07-01 rstm 7.2 6.2 8.4
Here is another pivot_longer approach:
library(dplyr)
library(tidyr)
df %>%
pivot_longer(
-cutoff,
names_to = c("VAR", ".value"),
names_pattern = "(.+)_(.+)"
) %>%
na.omit()
cutoff VAR lo hi
<chr> <chr> <dbl> <dbl>
1 2017-01-01 KM 1.4 4.9
2 2017-01-01 rstm 3.9 10.2
3 2017-04-01 KM 2.1 4.7
4 2017-04-01 rstm 6.6 10.8
5 2017-07-01 KM 2.8 4.2
6 2017-07-01 rstm 6.2 8.4
library(tidyverse)
df <-
structure(
list(
cutoff = c("2017-01-01", "2017-04-01", "2017-07-01"),
KM = c(2.1, 3.5, 3.7),
KM_lo = c(1.4, 2.1, 2.8),
KM_hi = c(4.9, 4.7, 4.2),
rstm = c(7.2, 8.9, 7.2),
rstm_lo = c(3.9, 6.6, 6.2),
rstm_hi = c(10.2, 10.8, 8.4)
),
row.names = c(NA,-3L),
class = c("tbl_df",
"tbl", "data.frame")
)
df %>%
pivot_longer(cols = -cutoff) %>%
separate(col = name, into = c("name", "suffix"), sep = "_", remove = TRUE) %>%
mutate(id = data.table::rleid(name)) %>%
pivot_wider(id_cols = c(id, cutoff, name), names_from = suffix, names_prefix = "VAL_", values_from = value) %>%
select(-id) %>%
rename(VAL = VAL_NA)
#> Warning: Expected 2 pieces. Missing pieces filled with `NA` in 6 rows [1, 4, 7,
#> 10, 13, 16].
#> # A tibble: 6 x 5
#> cutoff name VAL VAL_lo VAL_hi
#> <chr> <chr> <dbl> <dbl> <dbl>
#> 1 2017-01-01 KM 2.1 1.4 4.9
#> 2 2017-01-01 rstm 7.2 3.9 10.2
#> 3 2017-04-01 KM 3.5 2.1 4.7
#> 4 2017-04-01 rstm 8.9 6.6 10.8
#> 5 2017-07-01 KM 3.7 2.8 4.2
#> 6 2017-07-01 rstm 7.2 6.2 8.4
Created on 2021-09-28 by the reprex package (v2.0.1)
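Since the question mentions melt: a hedged data.table::melt() sketch (assuming the data is in df1 as in the first answer, and data.table >= 1.12 for patterns(); the setnames() step only gives the bare KM/rstm columns a suffix the patterns can match):
library(data.table)
dt <- as.data.table(df1)
setnames(dt, c("KM", "rstm"), c("KM_val", "rstm_val"))
long <- melt(dt, id.vars = "cutoff",
             measure.vars = patterns("_val$", "_lo$", "_hi$"),
             value.name = c("Val", "Val-lo", "Val-hi"),
             variable.name = "VAR")
long[, VAR := factor(VAR, labels = c("KM", "rstm"))]  # 1 = KM, 2 = rstm (column order)
long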
I have a data.table:
h1 <- c(rnorm(50, mean = 50, sd = 1),
rnorm(50, mean = 60, sd = 1),
rnorm(50, mean = 70, sd = 1),
rnorm(50, mean = 80, sd = 1))
w1 <- c(rnorm(150, mean = 150, sd = 1),
rnorm(150, mean = 160, sd = 1),
rnorm(150, mean = 170, sd = 1),
rnorm(150, mean = 180, sd = 1))
e1 <- c(rnorm(150, mean = 150, sd = 1),
rnorm(150, mean = 160, sd = 1),
rnorm(150, mean = 170, sd = 1),
rnorm(150, mean = 180, sd = 1))
h2 <- c(rnorm(50, mean = 50, sd = 1),
rnorm(50, mean = 60, sd = 1),
rnorm(50, mean = 70, sd = 1),
rnorm(50, mean = 80, sd = 1))
w2 <- c(rnorm(150, mean = 150, sd = 1),
rnorm(150, mean = 160, sd = 1),
rnorm(150, mean = 170, sd = 1),
rnorm(150, mean = 180, sd = 1))
e2 <- c(rnorm(150, mean = 150, sd = 1),
rnorm(150, mean = 160, sd = 1),
rnorm(150, mean = 170, sd = 1),
rnorm(150, mean = 180, sd = 1))
df <- data.frame(h1,w1,e1,h2,w2,e2)
> df
h1 w1 e1 h2 w2 e2
1 49.85148 148.6694 149.4619 49.05355 151.1857 147.7629
2 49.81708 149.7126 149.1840 50.75627 150.4471 149.2853
I would like to find the difference between the corresponding columns in the data.table.
What I want to get (first row shown):
  h1       w1       e1       h2       w2       e2       h2-h1    w2-w1  e2-e1
1 49.85148 148.6694 149.4619 49.05355 151.1857 147.7629 -0.79793 2.5163 -1.699
You can split the dataframe into two halves, subtract the first half from the second, and assign new column names.
n <- ncol(df)
col1 <- 1:(n/2)
col2 <- (n/2 + 1):n
new_col_name <- paste(names(df)[col2], names(df)[col1], sep = '-')
df[new_col_name] <- df[col2] - df[col1]
head(df)
# h1 w1 e1 h2 w2 e2 h2-h1 w2-w1 e2-e1
#1 49.43 149.6 150.2 49.39 149.4 150.1 -0.03665 -0.193458 -0.09741
#2 50.10 149.7 150.8 49.03 149.6 149.6 -1.07812 -0.053813 -1.25975
#3 50.05 149.8 150.7 48.42 149.8 151.0 -1.62448 -0.007319 0.32304
#4 49.77 149.7 148.8 49.92 148.7 149.1 0.15132 -1.005730 0.23139
#5 49.44 149.9 151.0 48.39 150.9 150.0 -1.04673 0.977863 -0.97748
#6 49.58 148.8 151.1 50.41 150.6 148.6 0.83088 1.800697 -2.52930
Perhaps this:
library(tidyverse)
str_sub(names(df), 1, 1) %>%
unique() %>%
map_dfc(~ select(df,starts_with(.x)) %>% transmute('{.x}1-{.x}2' := .[, 1] - .[, 2])) %>%
{bind_cols(df, .)} %>% as.tibble()
#> Warning: `as.tibble()` was deprecated in tibble 2.0.0.
#> Please use `as_tibble()` instead.
#> The signature and semantics have changed, see `?as_tibble`.
#> # A tibble: 600 x 9
#> h1 w1 e1 h2 w2 e2 `h1-h2` `w1-w2` `e1-e2`
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 49.6 152. 151. 48.4 151. 148. 1.22 1.13 2.90
#> 2 50.1 150. 150. 51.6 150. 150. -1.50 -0.301 0.552
#> 3 50.9 150. 150. 50.4 151. 148. 0.474 -1.28 1.64
#> 4 51.6 149. 152. 51.0 150. 149. 0.599 -0.613 2.78
#> 5 50.2 149. 150. 51.4 150. 152. -1.27 -1.24 -2.44
#> 6 50.9 149. 152. 50.2 151. 151. 0.682 -2.18 0.815
#> 7 49.4 149. 149. 49.0 151. 149. 0.315 -2.57 -0.858
#> 8 50.3 151. 152. 50.4 148. 149. -0.0959 2.38 2.70
#> 9 50.3 151. 151. 49.0 150. 150. 1.30 0.611 0.749
#> 10 51.5 149. 150. 50.0 150. 149. 1.47 -1.21 0.366
#> # … with 590 more rows
Created on 2021-06-25 by the reprex package (v2.0.0)
Variation on the idea of splitting names up and subtracting:
sel <- names(df)[endsWith(names(df), "1")]
df[sprintf("%1$s1-%1$s2", sub("[12]$", "", sel))] <- df[sel] - df[sub("1", "2", sel)]
head(df)
# h1 w1 e1 h2 w2 e2 h1-h2 w1-w2 e1-e2
#1 49.93223 150.9997 149.2362 50.34892 150.3622 148.6164 -0.4166869 0.6375487 0.6198452
#2 48.83462 149.7938 150.9722 49.24049 150.0979 149.2758 -0.4058752 -0.3040331 1.6964756
#3 49.55578 146.8567 149.4173 48.61832 150.7250 148.2298 0.9374638 -3.8682714 1.1875108
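Since the question mentions a data.table, a hedged data.table sketch along the same lines (assuming the h/w/e stems and 1/2 suffixes from the example):
library(data.table)
setDT(df)
stems <- c("h", "w", "e")                    # column stems, taken from the example
new_cols <- paste0(stems, "2-", stems, "1")  # "h2-h1", "w2-w1", "e2-e1"
df[, (new_cols) := Map(`-`, mget(paste0(stems, "2")), mget(paste0(stems, "1")))]
head(df)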
I have a problem I'm trying to solve, and I can't seem to find a succinct solution. There are a few similar questions on SO, but nothing that quite fits.
Take some sample data:
library(dplyr)
dat <- tibble(
group1 = factor(sample(c("one", "two"), 10, replace = T)),
group2 = factor(sample(c("alpha", "beta"), 10, replace = T)),
var1 = rnorm(10, 20, 2),
var2 = rnorm(10, 20, 2),
var3 = rnorm(10, 20, 2),
other1 = sample(c("a", "b", "c"), 10, replace = T),
other2 = sample(c("a", "b", "c"), 10, replace = T),
)
I would like to summarise just the numeric variables (i.e. ignoring other1 and other2), but have the output grouped by group1 and group2.
I have tried something like this, but it returns an error as it attempts to apply my summarise() functions to the grouping variables too.
dat %>%
group_by(group1, group2) %>%
select(where(is.numeric)) %>%
map(~ .x %>%
filter(!is.na(.x)) %>%
summarise(mean = mean(.x),
sd = sd(.x),
median = median(.x),
q1 = quantile(.x, p = .25),
q3 = quantile(.x, p = .75))
)
My expected output would be something like
group1 group2 mean sd median q1 q3
<fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
1 one alpha ? ? ? ? ?
2 one beta ? ? ? ? ?
3 two alpha ? ? ? ? ?
4 two beta ? ? ? ? ?
Any solutions would be greatly appreciated.
Thanks,
Sam
Try:
dat %>%
  group_by(group1, group2) %>%
  summarize(across(where(is.numeric),
                   list(sd = sd,
                        mean = mean,
                        median = median,
                        q1 = ~ quantile(.x, 0.25),
                        q3 = ~ quantile(.x, 0.75))))
group1 group2 var1_sd var1_mean var1_median var1_q1 var1_q3 var2_sd var2_mean var2_median var2_q1 var2_q3 var3_sd
<fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 one alpha 4.06 20.6 19.3 18.3 22.2 1.12 17.9 17.3 17.2 18.2 1.09
2 one beta 0.726 18.7 18.7 18.4 18.9 0.348 18.8 18.8 18.7 18.9 0.604
3 two alpha 1.31 19.9 20.0 19.3 20.6 1.10 17.8 18.3 17.4 18.5 0.624
4 two beta 0.777 21.2 21.2 21.0 21.5 1.13 19.6 19.6 19.2 20.0 0.0161
You can also pass the columns to the functions in summarise:
dat %>%
group_by(group1, group2) %>%
summarise(mean = mean(var1:var3),
sd = sd(var1:var3),
median = median(var1:var3),
q1 = quantile(var1:var3, p = .25),
q3 = quantile(var1:var3, p = .75))
# A tibble: 4 x 7
# Groups: group1 [2]
# group1 group2 mean sd median q1 q3
# <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 one alpha 19.1 0.707 19.1 18.8 19.3
# 2 one beta 17.5 1.29 17.5 16.8 18.3
# 3 two alpha 17.1 NA 17.1 17.1 17.1
# 4 two beta 19.9 NA 19.9 19.9 19.9
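If the compact layout in the expected output is the goal, one further hedged option (a sketch) is to stack the numeric columns first and summarise the long data, which yields one row per group per variable (drop variable from group_by() to pool all three):
library(dplyr)
library(tidyr)
dat %>%
  pivot_longer(where(is.numeric), names_to = "variable", values_to = "value") %>%
  group_by(group1, group2, variable) %>%
  summarise(mean = mean(value),
            sd = sd(value),
            median = median(value),
            q1 = quantile(value, 0.25),
            q3 = quantile(value, 0.75),
            .groups = "drop")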