Mutating new columns based on common string using existing columns - r

Sample data:
X_5 X_1 Y alpha_5 alpha_1 beta_5 beta_1
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.21 0.02 0.61 10 5 3 0.01
2 0.01 0.02 0.37 0.4 0.01 0.8 0.5
3 0.02 0.03 0.55 0.01 0.01 0.3 0.99
4 0.04 0.05 0.29 0.01 0.005 0.03 0.55
5 0.11 0.1 -0.08 0.22 0.015 0.01 0.01
6 0.22 0.21 -0.08 0.02 0.03 0.01 0.01
I have a dataset which has columns of some variable of interest, say alpha, beta, and so on. I also have this saved as a character vector. I want to be able to mutate new columns based on these variable names, suffixed with an identifier, using the existing columns in the dataset as part of some transformation, like this:
df %>% mutate(
alpha_new = ((alpha_5-alpha_1) / (X_5-X_1) * Y),
beta_new = ((beta_5-beta_1) / (X_5-X_1) * Y)
)
X_5 X_1 Y alpha_5 alpha_1 beta_5 beta_1 alpha_new beta_new
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0.21 0.02 0.61 10 5 3 0.01 16.1 9.60
2 0.01 0.02 0.37 0.4 0.01 0.8 0.5 -14.4 -11.1
3 0.02 0.03 0.55 0.01 0.01 0.3 0.99 0 38.0
4 0.04 0.05 0.29 0.01 0.005 0.03 0.55 -0.145 15.1
5 0.11 0.1 -0.08 0.22 0.015 0.01 0.01 -1.64 0
6 0.22 0.21 -0.08 0.02 0.03 0.01 0.01 0.0800 0
In my real data I have many more columns like this and I'm struggling to implement this in a "tidy" way which isn't hardcoded, what's the best practice for my situation?
Sample code:
structure(
list(
X_5 = c(0.21, 0.01, 0.02, 0.04, 0.11, 0.22),
X_1 = c(0.02,
0.02, 0.03, 0.05, 0.10, 0.21),
Y = c(0.61, 0.37, 0.55, 0.29, -0.08, -0.08),
alpha_5 = c(10, 0.4, 0.01, 0.01, 0.22, 0.02),
alpha_1 = c(5, 0.01, 0.01, 0.005, 0.015, 0.03),
beta_5 = c(3, 0.8, 0.3, 0.03, 0.01, 0.01),
beta_1 = c(0.01, 0.5, 0.99, 0.55, 0.01, 0.01)
),
row.names = c(NA, -6L),
class = c("tbl_df", "tbl", "data.frame")
) -> df
variable_of_interest <- c("alpha", "beta")

Here's another way to approach this with dynamic creation of columns. With map_dfc from purrr you can column-bind new results, creating new column names with bang-bang on left hand side of := operator, and using .data to access column values on right hand side.
library(tidyverse)
bind_cols(
df,
map_dfc(
variable_of_interest,
~ transmute(df, !!paste0(.x, '_new') :=
(.data[[paste0(.x, '_5')]] - .data[[paste0(.x, '_1')]]) /
(X_5 - X_1) * Y)
)
)
Output
X_5 X_1 Y alpha_5 alpha_1 beta_5 beta_1 alpha_new beta_new
1 0.21 0.02 0.61 10.00 5.000 3.00 0.01 16.05263 9.599474
2 0.01 0.02 0.37 0.40 0.010 0.80 0.50 -14.43000 -11.100000
3 0.02 0.03 0.55 0.01 0.010 0.30 0.99 0.00000 37.950000
4 0.04 0.05 0.29 0.01 0.005 0.03 0.55 -0.14500 15.080000
5 0.11 0.10 -0.08 0.22 0.015 0.01 0.01 -1.64000 0.000000
6 0.22 0.21 -0.08 0.02 0.030 0.01 0.01 0.08000 0.000000

Better to pivot the data first
library(dplyr)
library(tidyr)
# your data
df <- structure(list(X_5 = c(0.21, 0.01, 0.02, 0.04, 0.11, 0.22), X_1 = c(0.02,
0.02, 0.03, 0.05, 0.1, 0.21), Y = c(0.61, 0.37, 0.55, 0.29, -0.08,
-0.08), alpha_5 = c(10, 0.4, 0.01, 0.01, 0.22, 0.02), alpha_1 = c(5,
0.01, 0.01, 0.005, 0.015, 0.03), beta_5 = c(3, 0.8, 0.3, 0.03,
0.01, 0.01), beta_1 = c(0.01, 0.5, 0.99, 0.55, 0.01, 0.01)), class = "data.frame", row.names = c(NA,
-6L))
df <- df |> mutate(id = 1:n()) |>
pivot_longer(cols = -c(id, Y, X_5, X_1),
names_to = c("name", ".value"), names_sep="_") |>
mutate(new= (`5` - `1`) / (X_5 - X_1) * Y) |>
pivot_wider(id_cols = id, names_from = "name", values_from = c(`5`,`1`, `new`),
names_glue = "{name}_{.value}", values_fn = sum)
df
#> # A tibble: 6 × 7
#> id alpha_5 beta_5 alpha_1 beta_1 alpha_new beta_new
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 10 3 5 0.01 16.1 9.60
#> 2 2 0.4 0.8 0.01 0.5 -14.4 -11.1
#> 3 3 0.01 0.3 0.01 0.99 0 38.0
#> 4 4 0.01 0.03 0.005 0.55 -0.145 15.1
#> 5 5 0.22 0.01 0.015 0.01 -1.64 0
#> 6 6 0.02 0.01 0.03 0.01 0.0800 0
Created on 2023-02-16 with reprex v2.0.2
Note: if you want to add X_5 and X_1 in the output use id_cols = c(id, X_5, X_1) instead.

I modified your data to create a bit more complicated situation. My hope is that this is close to your real situation. The condition in this idea is that two columns that you wanna pair up stay next to each other. The first job is to collect column names that begin with small letters. Next job is to create a data frame. Here I keep the column names in odd positions
in target in the first column, and ones in even positions in the second column. I was thinking in the same line of Ben; I used map2_dfc to create an output data frame. In this function, I replaced all small letters with X so that I could specify two column names in the original data (i.e., ones starting with X). Then, I did the calculation as you specified. Finally, I created a column name for outcome in the loop. If you want to add the result to the original data, you can run the final line with cbind.
grep(x = names(df), pattern = "[[:lower:]]+_[0-9]+", value = TRUE) -> target
tibble(first_element = target[c(TRUE, FALSE)],
second_element = target[c(FALSE, TRUE)]) -> mydf
map2_dfc(.x = mydf$first_element,
.y = mydf$second_element,
.f = function(x, y) {
sub(x = x, pattern = "[[:lower:]]+", replacement = "X") -> foo1
sub(x = y, pattern = "[[:lower:]]+", replacement = "X") -> foo2
outcome <- ((df[x] - df[y]) / (df[foo1] - df[foo2]) * df["Y"])
names(outcome) <- paste(x,
sub(x = y, pattern = "[[:lower:]]+", replacement = ""),
sep = "")
return(outcome)
}) -> result
cbind(df, result)
# alpha_5_1 alpha_2_6 beta_5_1 beta_3_4
#1 16.05263 0.10736 9.599474 0.27145
#2 -14.43000 0.10730 -11.100000 0.28564
#3 0.00000 0.28710 37.950000 0.50820
#4 -0.14500 0.21576 15.080000 0.64206
#5 -1.64000 -0.06416 0.000000 -0.61352
#6 0.08000 -0.08480 0.000000 -0.25400
DATA
structure(list(
X_5 = c(0.21, 0.01, 0.02, 0.04, 0.11, 0.22),
X_1 = c(0.02,0.02, 0.03, 0.05, 0.10, 0.21),
X_2 = 1:6,
X_6 = 6:11,
X_3 = 21:26,
X_4 = 31:36,
Y = c(0.61, 0.37, 0.55, 0.29, -0.08, -0.08),
alpha_5 = c(10, 0.4, 0.01, 0.01, 0.22, 0.02),
alpha_1 = c(5, 0.01, 0.01, 0.005, 0.015, 0.03),
alpha_2 = c(0.12, 0.55, 0.39, 0.28, 0.99, 0.7),
alpha_6 = 1:6,
beta_5 = c(3, 0.8, 0.3, 0.03, 0.01, 0.01),
beta_1 = c(0.01, 0.5, 0.99, 0.55, 0.01, 0.01),
beta_3 = c(0.55, 0.28, 0.76, 0.86, 0.31, 0.25),
beta_4 = c(5, 8, 10, 23, 77, 32)),
row.names = c(NA, -6L),
class = c("tbl_df", "tbl", "data.frame")) -> df

Related

R - transpose dataframe with multiple id columns and multiple variables [duplicate]

I am trying to use pivot_longer. However, I am not sure how to use names_sep or names_pattern to solve this.
dat <- tribble(
~group, ~BP, ~HS, ~BB, ~lowerBP, ~upperBP, ~lowerHS, ~upperHS, ~lowerBB, ~upperBB,
"1", 0.51, 0.15, 0.05, 0.16, 0.18, 0.5, 0.52, 0.14, 0.16,
"2.1", 0.67, 0.09, 0.06, 0.09, 0.11, 0.66, 0.68, 0.08, 0.1,
"2.2", 0.36, 0.13, 0.07, 0.12, 0.15, 0.34, 0.38, 0.12, 0.14,
"2.3", 0.09, 0.17, 0.09, 0.13, 0.16, 0.08, 0.11, 0.15, 0.18,
"2.4", 0.68, 0.12, 0.07, 0.12, 0.14, 0.66, 0.69, 0.11, 0.13,
"3", 0.53, 0.15, 0.06, 0.14, 0.16, 0.52, 0.53, 0.15, 0.16)
Desired output (First row from wide data)
group names values lower upper
1 BP 0.51 0.16 0.18
1 HS 0.15 0.5 0.52
1 BB 0.05 0.14 0.16
Here is solution following a similar method that #Fnguyen used but using the newer pivot_longer and pivot_wider construct:
library(dplyr)
library(tidyr)
longer<-pivot_longer(dat, cols=-1, names_pattern = "(.*)(..)$", names_to = c("limit", "name")) %>%
mutate(limit=ifelse(limit=="", "value", limit))
answer <-pivot_wider(longer, id_cols = c(group, name), names_from = limit, values_from = value, names_repair = "check_unique")
Most of the selecting, separating, mutating and renaming is taking place within the pivot function calls.
Update:
This regular expressions "(.*)(..)$" means:
( ) ( ) Look for two parts,
(.*) the first part should have zero or more characters
(..) the second part should have just 2 characters at the “$” end of the string
A data.table version (not sure yet how to retain the original names so that you dont need to post substitute them https://github.com/Rdatatable/data.table/issues/2551):
library(data.table)
df <- data.table(dat)
v <- c("BP","HS","BB")
setnames(df, v, paste0("x",v) )
g <- melt(df, id.vars = "group",
measure.vars = patterns(values = "x" ,
lower = "lower",
upper = "upper"),
variable.name = "names")
g[names==1, names := "BP" ]
g[names==2, names := "HS" ]
g[names==3, names := "BB" ]
group names values lower upper
1: 1 BP 0.51 0.16 0.18
2: 2.1 BP 0.67 0.09 0.11
3: 2.2 BP 0.36 0.12 0.15
4: 2.3 BP 0.09 0.13 0.16
5: 2.4 BP 0.68 0.12 0.14
6: 3 BP 0.53 0.14 0.16
7: 1 HS 0.15 0.50 0.52
8: 2.1 HS 0.09 0.66 0.68
9: 2.2 HS 0.13 0.34 0.38
10: 2.3 HS 0.17 0.08 0.11
11: 2.4 HS 0.12 0.66 0.69
12: 3 HS 0.15 0.52 0.53
13: 1 BB 0.05 0.14 0.16
14: 2.1 BB 0.06 0.08 0.10
15: 2.2 BB 0.07 0.12 0.14
16: 2.3 BB 0.09 0.15 0.18
17: 2.4 BB 0.07 0.11 0.13
18: 3 BB 0.06 0.15 0.16
Based on your example data this solution using dplyr works for me:
library(dplyr)
dat %>%
gather(key, values,-group) %>%
mutate(names = gsub("lower","",gsub("upper","",key))) %>%
separate(key, into = c("key1","key2") ,"[[:upper:]]", perl=T) %>%
mutate(key1 = case_when(key1 == "" ~ "values", TRUE ~ key1)) %>%
select(group,names,key1,values) %>%
rowid_to_column() %>%
spread(key1,values) %>%
select(-rowid) %>%
group_by(group,names) %>%
summarise_all(mean,na.rm = TRUE)
I'd like to add an alternative tidyverse solution drawing from the answer provided by #Dave2e.
Like Dave2e's solution it's a two-step procedure (first rename, then reshape). Instead of reshaping the data twice, I add the prefix "values" to the columns named "BP", "HS", and "BB" using rename_with. This was necessary for getting the column names right when using the .value sentinel in the names_to argument of pivot_longer.
library(dplyr)
library(tidyr)
dat %>%
rename_with(~sub("^(BP|HS|BB)$", "values\\1", .)) %>% # add prefix values
pivot_longer(dat , cols= -1,
names_pattern = "(.*)(BP|HS|BB)$",
names_to = c(".value", "names"))

Clumpsy code for calculating mean and standard error

I have tried to make a piece of code that calculate the mean and standard error of my data and puts it into a new tibble.
It feels very clumpsy, however. Does anyone know of packages or other tricks that could make my code more elegant?
I need to calculate mean and se for a number of sub groups (days_incubated).
library(dplyr)
library(tibble)
library(tidyr)
library(data.table)
library(plotrix)
df2 <- df1%>%
group_by(days_incubated)%>%
summarise_each(funs(mean, se= std.error))%>% # Calculating mean and standard error
mutate_if(is.numeric, round, digits = 2) # Round off the data
df2_trans <- transpose(df2) # Transposing data table
colnames(df2_trans) <- rownames(df2) # Get row and colnames in order
rownames(df2_trans) <- colnames(df2) # Get row and colnames in order
df2_trans <- rownames_to_column(df2_trans, "mass") # Making row names into a column
df3_trans <- df2_trans%>% # Converting one column into two
separate(mass, c("mass","type"), sep = "([_])")
mean_target <- c("mean", "incubated")
mean <- df3_trans%>% # Mean table
filter(type %in% mean_target)%>%
rename("mean day 0"="1")%>%
rename("mean day 4"="2")%>%
rename("mean day 10"="3")%>%
rename("mean day 17"="4")%>%
rename("mean day 24"="5")%>%
rename("mean day 66"="6")%>%
rename("mean day 81"="7")%>%
rename("mean day 94"="8")%>%
rename("mean day 116"="9")%>%
select("mass", "mean day 0", "mean day 4", "mean day 10", "mean day 17", "mean day 24", "mean day 66", "mean day 81", "mean day 94", "mean day 116")%>%
slice(-c(1))
se_target <- c("se", "incubated")
se <- df3_trans%>% # SE table
filter(type %in% se_target)%>%
rename("se day 0"="1")%>%
rename("se day 4"="2")%>%
rename("se day 10"="3")%>%
rename("se day 17"="4")%>%
rename("se day 24"="5")%>%
rename("se day 66"="6")%>%
rename("se day 81"="7")%>%
rename("se day 94"="8")%>%
rename("se day 116"="9")%>%
select("mass", "se day 0", "se day 4", "se day 10", "se day 17", "se day 24", "se day 66", "se day 81", "se day 94", "se day 116")%>%
slice(-c(1))
# join mean and se tables
mean_se <- mean %>% #merging mean and se dataset
full_join(se, by=("mass"))%>%
select("mass","mean day 0","se day 0", "mean day 4", "se day 4", "mean day 10", "se day 10", "mean day 17", "se day 17", "mean day 24", "se day 24", "mean day 66", "se day 66", "mean day 81", "se day 81", "mean day 94", "se day 94", "mean day 116", "se day 116") # Putting columns in correct order
And here's the data:
df1 <- structure(list(days_incubated = c("0", "0", "0", "0", "0", "4",
"4", "4", "4", "4", "10", "10", "10", "10", "10", "17", "17",
"17", "17", "17", "24", "24", "24", "24", "24", "66", "66", "66",
"66", "66", "81", "81", "81", "81", "81", "94", "94", "94", "94",
"94", "116", "116", "116", "116", "116"), i.x33.031 = c(7.45,
0, 78.2, 16.49, 18.77, 104.5, 28.95, 26.05, 4.11, 62.09, 1.95,
6.75, 1.41, 3.34, 3.02, 0, 100.28, 0.2, 32.66, 0, 0, 370.57,
7.24, 133.63, 55.26, 0.16, 5.5, 25.17, 16.59, 3.3, 23.95, 30.61,
4.04, 0, 6.58, 0.08, 0.01, 0, 0.38, 0, 0, 0, 0, 0.18, 0), i.x35.034 = c(0,
0, 0.15, 0.02, 0.01, 0.04, 0.04, 0.05, 0.02, 0.09, 0.02, 0, 0.04,
0.01, 0, 0, 0.22, 0, 0.08, 0, 0, 0.66, 0.01, 0.2, 0.12, 0.01,
0.01, 0.04, 0.01, 0.01, 0.01, 0.04, 0, 0, 0, 0, 0, 0, 0.01, 0,
0, 0.02, 0, 0, 0.02), i.x36.017 = c(0.47, 0.09, 0.28, 0.02, 0.03,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.05,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.3, 0.06, 0.32, 0, 0, 0, 0, 0.12,
0, 0.02), i.x39.959 = c(0.02, 0, 0.08, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.04, 0, 0, 0, 0, 0, 0.01, 0, 0,
0, 0, 0, 0.01, 0.02, 0.06, 0.03, 0.03, 0, 0, 0.02, 0.01, 0, 0,
0), i.x40.023 = c(0.35, 0.02, 0.48, 0.06, 0, 1.25, 0.09, 0.1,
0.03, 0, 0.09, 0.07, 0.55, 0.09, 0.07, 0, 0.63, 0, 0.09, 0.07,
0.02, 1.11, 0.04, 0.59, 0.13, 0, 0.01, 0.02, 0, 0, 0, 0, 0.01,
0.02, 0.06, 0.01, 0.01, 0.01, 0.01, 0.04, 0, 0.08, 0, 0, 0.01
)), row.names = c(NA, -45L), class = "data.frame")
Does this give you what you are looking for? It seems to reproduce the output. The note there is that I think your result actually has the columns labeled wrong. One of your steps did a character sort on the columns, so they are not in the order you expected when your renamed later.
What we do is first use pivot_longer() so that we have tidy data. From there, we can group and summarize to calculate the mean and standard errors. Then we pivot_wider() to move it back to the wide format of your result.
library(dplyr)
library(tidyr)
library(plotrix)
col_order <- paste0(rep(paste0(c("mean", "se"), "_day_"), length(unique(df1$days_incubated))),
rep(unique(df1$days_incubated), each = 2))
df1 %>%
pivot_longer(-days_incubated, names_to = "mass") %>%
group_by(days_incubated, mass) %>%
summarize(mean = mean(value),
se = std.error(value), .groups = "drop") %>%
pivot_wider(names_from = days_incubated, values_from = c("mean", "se"),
names_glue = "{.value}_day_{days_incubated}") %>%
relocate(mass, all_of(col_order))
# A tibble: 5 x 19
mass mean_day_0 se_day_0 mean_day_4 se_day_4 mean_day_10 se_day_10 mean_day_17 se_day_17 mean_day_24 se_day_24 mean_day_66 se_day_66 mean_day_81
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 i.x3~ 24.2 13.9 45.1 17.5 3.29 0.932 26.6 19.5 113. 68.6 10.1 4.67 13.0
2 i.x3~ 0.036 0.0287 0.048 0.0116 0.014 0.00748 0.06 0.0429 0.198 0.121 0.016 0.006 0.01
3 i.x3~ 0.178 0.0867 0 0 0 0 0 0 0.01 0.01 0 0 0
4 i.x3~ 0.02 0.0155 0 0 0 0 0 0 0.008 0.008 0.002 0.002 0.006
5 i.x4~ 0.182 0.0978 0.294 0.240 0.174 0.0941 0.158 0.119 0.378 0.210 0.006 0.004 0.018
# ... with 5 more variables: se_day_81 <dbl>, mean_day_94 <dbl>, se_day_94 <dbl>, mean_day_116 <dbl>, se_day_116 <dbl>
Line-by-line
First, I have added in a short step to help column sorting.
col_order is a character vector we create of the correct final order we are looking for. Basically we are just concatenating strings to match the column names. You can skip this and simply type each name individually much like you did in your select() statement, but this saves typing.
Now onto the pipeline.
pivot_longer() will take the data from the columns and stack them. The column names will now be in a variable called mass. By default, the values will be in a new variable called value.
group_by() groups on the specified variables, which are the row and column combinations you want in the final table.
summarize() collapses the multiple rows for each mass and days_incubated combination into one new row, with two columns: the mean and se. The .groups = "drop" argument says to drop the groups (i.e., the tibble is no longer grouped, check the help file for more details).
pivot_wider() takes the long formatted summary tibble and pivots the data out into columns, much how you had your format. It says to take the column names from days_incubated, and to take the values from the mean and the se columns. The names_glue argument specifies how to name the new columns. It uses syntax from the glue packages, there the { brackets basically mean "substitute a value here". So it is the '[name of value column]_day_[days_incubated]' as each name.
relocate() simply reorders the columns. This is relatively new to dplyr, and is the preferred way now to rearrange columns instead of select(). It says to first take mass, and then all_of() the columns in the col_order vector that we created earlier.
I'd like to offer a solution mainly in base R.
library(dplyr)
# Define a function that computes the standard error of a vector.
f.SE <- function(x) sd(x)/sqrt(length(x))
# Define a function that compute the standard error of a column (MARGIN = 2 for column).
f.colSE <- function(mydf) apply(mydf, MARGIN = 2, FUN = f.SE)
# Define a function that combines the means and the standard errors of a column
f.colMeanSE <- function(mycol) cbind(means = colMeans(mycol), se = f.colSE(mycol))
# Apply the function to all numeric columns in your data frame by using the days_incubated subgroup as the indices. The result is a list.
my_mean_se1 <- by(data = df1[,-1], INDICES = df1$days_incubated, FUN= f.colMeanSE); my_mean_se1
#df1$days_incubated: 0
#means se
#i.x33.031 24.182 13.91245823
#i.x35.034 0.036 0.02874022
#i.x36.017 0.178 0.08668333
#i.x39.959 0.020 0.01549193
#i.x40.023 0.182 0.09779571
#------------------------------------------------------------
# df1$days_incubated: 10
#means se
#i.x33.031 3.294 0.932119091
#i.x35.034 0.014 0.007483315
#i.x36.017 0.000 0.000000000
#i.x39.959 0.000 0.000000000
#i.x40.023 0.174 0.094106323
#------------------------------------------------------------
# Bind the columns.
my_mean_se2 <- do.call(cbind, my_mean_se1); my_mean_se2
# means se means se means se means se means se means se means se means
#i.x33.031 24.182 13.91245823 3.294 0.932119091 0.036 0.036000000 26.628 19.46476571 113.340 68.5767486 45.140 17.49416646 10.144 4.665935 13.036
#i.x35.034 0.036 0.02874022 0.014 0.007483315 0.008 0.004898979 0.060 0.04289522 0.198 0.1212601 0.048 0.01157584 0.016 0.006000 0.010
#i.x36.017 0.178 0.08668333 0.000 0.000000000 0.028 0.023323808 0.000 0.00000000 0.010 0.0100000 0.000 0.00000000 0.000 0.000000 0.000
#i.x39.959 0.020 0.01549193 0.000 0.000000000 0.006 0.004000000 0.000 0.00000000 0.008 0.0080000 0.000 0.00000000 0.002 0.002000 0.006
#i.x40.023 0.182 0.09779571 0.174 0.094106323 0.018 0.015620499 0.158 0.11939012 0.378 0.2103188 0.294 0.23972067 0.006 0.004000 0.018
# se means se
#i.x33.031 6.002108463 0.094 0.07304793
#i.x35.034 0.007745967 0.002 0.00200000
#i.x36.017 0.000000000 0.136 0.07194442
#i.x39.959 0.004000000 0.024 0.01122497
#i.x40.023 0.011135529 0.016 0.00600000
# Bind the names of the columns to the names of days_incubated
names_days <- unique(df1$days_incubated)
names_meanse <- rbind(paste0("mean day ", names_days), paste0("se day ", names_days))
colnames(my_mean_se2) <- names_meanse
# Round the numbers to 2 decimal places and convert the numeric table to a data frame.
mean_se <- as.data.frame(round(my_mean_se2, digits = 2)); mean_se
# mean day 0 se day 0 mean day 4 se day 4 mean day 10 se day 10 mean day 17
#i.x33.031 24.18 13.91 3.29 0.93 0.04 0.04 26.63
#i.x35.034 0.04 0.03 0.01 0.01 0.01 0.00 0.06
#i.x36.017 0.18 0.09 0.00 0.00 0.03 0.02 0.00
#i.x39.959 0.02 0.02 0.00 0.00 0.01 0.00 0.00
#i.x40.023 0.18 0.10 0.17 0.09 0.02 0.02 0.16
# se day 17 mean day 24 se day 24 mean day 66 se day 66 mean day 81 se day 81
#i.x33.031 19.46 113.34 68.58 45.14 17.49 10.14 4.67
#i.x35.034 0.04 0.20 0.12 0.05 0.01 0.02 0.01
#i.x36.017 0.00 0.01 0.01 0.00 0.00 0.00 0.00
#i.x39.959 0.00 0.01 0.01 0.00 0.00 0.00 0.00
#i.x40.023 0.12 0.38 0.21 0.29 0.24 0.01 0.00
# mean day 94 se day 94 mean day 116 se day 116
#i.x33.031 13.04 6.00 0.09 0.07
#i.x35.034 0.01 0.01 0.00 0.00
#i.x36.017 0.00 0.00 0.14 0.07
#i.x39.959 0.01 0.00 0.02 0.01
#i.x40.023 0.02 0.01 0.02 0.01
# Convert the data frame to a tibble with rowname set as a new column
as_tibble(mean_se, rownames = "rowname")
# A tibble: 5 x 19
# rowname `mean day 0` `se day 0` `mean day 4` `se day 4` `mean day 10` `se day 10` `mean day 17` `se day 17` `mean day 24` `se day 24`
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 i.x33.~ 24.2 13.9 3.29 0.93 0.04 0.04 26.6 19.5 113. 68.6
#2 i.x35.~ 0.04 0.03 0.01 0.01 0.01 0 0.06 0.04 0.2 0.12
#3 i.x36.~ 0.18 0.09 0 0 0.03 0.02 0 0 0.01 0.01
#4 i.x39.~ 0.02 0.02 0 0 0.01 0 0 0 0.01 0.01
#5 i.x40.~ 0.18 0.1 0.17 0.09 0.02 0.02 0.16 0.12 0.38 0.21
# ... with 8 more variables: `mean day 66` <dbl>, `se day 66` <dbl>, `mean day 81` <dbl>, `se day 81` <dbl>, `mean day 94` <dbl>, `se day
# 94` <dbl>, `mean day 116` <dbl>, `se day 116` <dbl>
I think what you started off with was in right direction. You can next get data in long format, change the column names the way you want it and get the data back in wide format.
library(dplyr)
library(tidyr)
df1 %>%
group_by(days_incubated = as.numeric(days_incubated)) %>%
summarise(across(i.x33.031:i.x40.023, list(mean = mean, se = plotrix::std.error))) %>%
pivot_longer(cols = -days_incubated,
names_to = c('mass', 'col'),
names_sep = '_') %>%
#If you need column exactly as shown
mutate(col = paste(col, 'day')) %>%
pivot_wider(names_from = c(col, days_incubated), values_from = value, names_sep = ' ')
# mass `mean day 0` `se day 0` `mean day 4` `se day 4` `mean day 10` `se day 10`
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 i.x3… 24.2 13.9 45.1 17.5 3.29 0.932
#2 i.x3… 0.036 0.0287 0.048 0.0116 0.014 0.00748
#3 i.x3… 0.178 0.0867 0 0 0 0
#4 i.x3… 0.02 0.0155 0 0 0 0
#5 i.x4… 0.182 0.0978 0.294 0.240 0.174 0.0941
# … with 12 more variables: `mean day 17` <dbl>, `se day 17` <dbl>, `mean day
# 24` <dbl>, `se day 24` <dbl>, `mean day 66` <dbl>, `se day 66` <dbl>, `mean day
# 81` <dbl>, `se day 81` <dbl>, `mean day 94` <dbl>, `se day 94` <dbl>, `mean day
# 116` <dbl>, `se day 116` <dbl>

Creating multiple ggplots from select columns in one data frame using for loop or lapply

I am having all sorts of trouble trying to create a loop, or using lapply to generate multiple plots from one data frame.
df
target A.O2 A.H2O A.conc A.bias B.O2 B.H2O B.conc B.bias C.O2 C.H2O C.conc C.bias
1 85 20.90 0.06 254.96 0.01 20.90 0.06 255.02 0.03 20.90 0.06 254.98 0.01
2 50 20.90 0.09 150.09 0.09 20.90 0.09 150.06 0.08 20.90 0.09 150.00 0.03
3 25 20.94 0.09 75.24 0.31 20.94 0.09 75.47 0.62 20.94 0.09 74.98 -0.04
4 85 10.00 0.08 251.99 -1.22 10.00 0.08 252.02 -1.21 10.00 0.08 252.01 -1.21
5 50 10.00 0.09 148.51 -1.06 10.00 0.09 148.52 -1.05 10.00 0.09 148.50 -1.06
6 25 10.00 0.07 74.00 -1.27 10.00 0.07 74.03 -1.24 10.00 0.07 74.03 -1.24
7 85 0.10 0.06 246.99 -3.13 0.10 0.06 247.01 -3.13 0.10 0.06 247.00 -3.13
8 50 0.10 0.14 146.50 -2.39 0.10 0.14 146.50 -2.39 0.10 0.14 146.45 -2.42
9 25 0.10 0.10 72.97 -2.55 0.10 0.10 73.04 -2.45 0.10 0.10 73.04 -2.44
I want to create plots where X = O2 (A.O2, B.O2, C.O2) and Y = bias (A.bias, B.bias, C.bias) and the points are grouped based off the values in the target column.
library(ggrepel)
ggplot(df, aes(A.O2, A.bias)) +
theme_bw() +
theme(legend.position = 'bottom', plot.title = element_text(hjust=0.5)) +
geom_point(aes(colour = factor(target))) +
geom_line(aes(colour = factor(target))) +
geom_text_repel(aes(label=paste(A.bias),
hjust= 0.4,
vjust=-.8, colour = factor(target)),
size = 3) +
ggtitle('A') +
labs(
x = expression('O'[2]),
y = "bias",
colour = 'conc'
)
I want to repeat the same code where the only thing that changes is the X and Y values in aes() and ggtitle(). I have tried looking up similar posts for using for loops or lapply to do this but nothing seems to work.
Probably reshaping the data to long format and using facet_grid. This is easy using reshape when we switch suffix and prefix of column names.
names(df) <- sapply(lapply(strsplit(names(df), "\\."), rev), paste, collapse=".")
dfl <- reshape(df, varying=2:13, direction="long")
library(ggplot2)
library(ggrepel)
ggplot(dfl, aes(O2, bias)) +
theme_bw() +
theme(legend.position = 'bottom', plot.title = element_text(hjust=0.5)) +
geom_point(aes(colour = factor(target))) +
geom_line(aes(colour = factor(target)))+
geom_text_repel(aes(label=paste(bias),
hjust= 0.4,
vjust=-.8, colour = factor(target)),
size = 3) +
facet_grid("time") +
# ggtitle(z) + ## not needed
labs(
x = expression('O'[2]),
y = "bias",
colour = 'conc'
)
Or if you want three single plots you may put the code into a function together with ggsave to use in an lapply loop.
FUN <- function(x) {
ggplot(dfl[dfl$time == x, ], aes(O2, bias)) +
theme_bw() +
theme(legend.position = 'bottom', plot.title = element_text(hjust=0.5)) +
geom_point(aes(colour = factor(target))) +
geom_line(aes(colour = factor(target)))+
geom_text_repel(aes(label=paste(bias),
hjust= 0.4,
vjust=-.8, colour = factor(target)),
size = 3) +
# facet_grid("time") + ## not needed
ggtitle(x) +
labs(
x = expression('O'[2]),
y = "bias",
colour = 'conc'
)
ggsave(paste0("plot", x, ".png"))
}
times <- c("A", "B", "C")
lapply(times, FUN)
This saves the three plots in your working directory:
dir()
# [1] plotA.png
# [2] plotB.png
# [3] plotC.png
Example plot:
Data:
df <- structure(list(target = c(85L, 50L, 25L, 85L, 50L, 25L, 85L,
50L, 25L), A.O2 = c(20.9, 20.9, 20.94, 10, 10, 10, 0.1, 0.1,
0.1), A.H2O = c(0.06, 0.09, 0.09, 0.08, 0.09, 0.07, 0.06, 0.14,
0.1), A.conc = c(254.96, 150.09, 75.24, 251.99, 148.51, 74, 246.99,
146.5, 72.97), A.bias = c(0.01, 0.09, 0.31, -1.22, -1.06, -1.27,
-3.13, -2.39, -2.55), B.O2 = c(20.9, 20.9, 20.94, 10, 10, 10,
0.1, 0.1, 0.1), B.H2O = c(0.06, 0.09, 0.09, 0.08, 0.09, 0.07,
0.06, 0.14, 0.1), B.conc = c(255.02, 150.06, 75.47, 252.02, 148.52,
74.03, 247.01, 146.5, 73.04), B.bias = c(0.03, 0.08, 0.62, -1.21,
-1.05, -1.24, -3.13, -2.39, -2.45), C.O2 = c(20.9, 20.9, 20.94,
10, 10, 10, 0.1, 0.1, 0.1), C.H2O = c(0.06, 0.09, 0.09, 0.08,
0.09, 0.07, 0.06, 0.14, 0.1), C.conc = c(254.98, 150, 74.98,
252.01, 148.5, 74.03, 247, 146.45, 73.04), C.bias = c(0.01, 0.03,
-0.04, -1.21, -1.06, -1.24, -3.13, -2.42, -2.44)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9"))

pivot_longer into multiple columns

I am trying to use pivot_longer. However, I am not sure how to use names_sep or names_pattern to solve this.
dat <- tribble(
~group, ~BP, ~HS, ~BB, ~lowerBP, ~upperBP, ~lowerHS, ~upperHS, ~lowerBB, ~upperBB,
"1", 0.51, 0.15, 0.05, 0.16, 0.18, 0.5, 0.52, 0.14, 0.16,
"2.1", 0.67, 0.09, 0.06, 0.09, 0.11, 0.66, 0.68, 0.08, 0.1,
"2.2", 0.36, 0.13, 0.07, 0.12, 0.15, 0.34, 0.38, 0.12, 0.14,
"2.3", 0.09, 0.17, 0.09, 0.13, 0.16, 0.08, 0.11, 0.15, 0.18,
"2.4", 0.68, 0.12, 0.07, 0.12, 0.14, 0.66, 0.69, 0.11, 0.13,
"3", 0.53, 0.15, 0.06, 0.14, 0.16, 0.52, 0.53, 0.15, 0.16)
Desired output (First row from wide data)
group names values lower upper
1 BP 0.51 0.16 0.18
1 HS 0.15 0.5 0.52
1 BB 0.05 0.14 0.16
Here is solution following a similar method that #Fnguyen used but using the newer pivot_longer and pivot_wider construct:
library(dplyr)
library(tidyr)
longer<-pivot_longer(dat, cols=-1, names_pattern = "(.*)(..)$", names_to = c("limit", "name")) %>%
mutate(limit=ifelse(limit=="", "value", limit))
answer <-pivot_wider(longer, id_cols = c(group, name), names_from = limit, values_from = value, names_repair = "check_unique")
Most of the selecting, separating, mutating and renaming is taking place within the pivot function calls.
Update:
This regular expressions "(.*)(..)$" means:
( ) ( ) Look for two parts,
(.*) the first part should have zero or more characters
(..) the second part should have just 2 characters at the “$” end of the string
A data.table version (not sure yet how to retain the original names so that you dont need to post substitute them https://github.com/Rdatatable/data.table/issues/2551):
library(data.table)
df <- data.table(dat)
v <- c("BP","HS","BB")
setnames(df, v, paste0("x",v) )
g <- melt(df, id.vars = "group",
measure.vars = patterns(values = "x" ,
lower = "lower",
upper = "upper"),
variable.name = "names")
g[names==1, names := "BP" ]
g[names==2, names := "HS" ]
g[names==3, names := "BB" ]
group names values lower upper
1: 1 BP 0.51 0.16 0.18
2: 2.1 BP 0.67 0.09 0.11
3: 2.2 BP 0.36 0.12 0.15
4: 2.3 BP 0.09 0.13 0.16
5: 2.4 BP 0.68 0.12 0.14
6: 3 BP 0.53 0.14 0.16
7: 1 HS 0.15 0.50 0.52
8: 2.1 HS 0.09 0.66 0.68
9: 2.2 HS 0.13 0.34 0.38
10: 2.3 HS 0.17 0.08 0.11
11: 2.4 HS 0.12 0.66 0.69
12: 3 HS 0.15 0.52 0.53
13: 1 BB 0.05 0.14 0.16
14: 2.1 BB 0.06 0.08 0.10
15: 2.2 BB 0.07 0.12 0.14
16: 2.3 BB 0.09 0.15 0.18
17: 2.4 BB 0.07 0.11 0.13
18: 3 BB 0.06 0.15 0.16
Based on your example data this solution using dplyr works for me:
library(dplyr)
dat %>%
gather(key, values,-group) %>%
mutate(names = gsub("lower","",gsub("upper","",key))) %>%
separate(key, into = c("key1","key2") ,"[[:upper:]]", perl=T) %>%
mutate(key1 = case_when(key1 == "" ~ "values", TRUE ~ key1)) %>%
select(group,names,key1,values) %>%
rowid_to_column() %>%
spread(key1,values) %>%
select(-rowid) %>%
group_by(group,names) %>%
summarise_all(mean,na.rm = TRUE)
I'd like to add an alternative tidyverse solution drawing from the answer provided by #Dave2e.
Like Dave2e's solution it's a two-step procedure (first rename, then reshape). Instead of reshaping the data twice, I add the prefix "values" to the columns named "BP", "HS", and "BB" using rename_with. This was necessary for getting the column names right when using the .value sentinel in the names_to argument of pivot_longer.
library(dplyr)
library(tidyr)
dat %>%
rename_with(~sub("^(BP|HS|BB)$", "values\\1", .)) %>% # add prefix values
pivot_longer(dat , cols= -1,
names_pattern = "(.*)(BP|HS|BB)$",
names_to = c(".value", "names"))

length of 'dimnames' [2] not equal to array extent when using corrplot function from a matrix read from a csv file

I wanna read the data from a csv file, save it as a matrix and use it for visualization.
data<-read.table("Desktop/Decision_Tree/cor_test_.csv",header = F,sep = ",")
data
V1 V2 V3 V4 V5 V6
1 1.00 0.00 0.00 0.00 0.00 0
2 0.11 1.00 0.00 0.00 0.00 0
3 0.12 0.03 1.00 0.00 0.00 0
4 -0.04 0.54 0.32 1.00 0.00 0
5 -0.12 0.57 -0.09 0.26 1.00 0
6 0.21 -0.04 0.24 0.18 -0.21 1
It goes well. But then:
corrplot(data, method = 'color', addCoef.col="grey")
It is said that:
Error in matrix(unlist(value, recursive = FALSE, use.names = FALSE), nrow = nr, :
length of 'dimnames' [2] not equal to array extent
I don't know how to solve it.
corrplot requires a matrix, I assume your data is a data frame. Use as.matrix(data) instead.
Example:
## Your data as data frame:
data <- structure(list(V1 = c(1, 0.11, 0.12, -0.04, -0.12, 0.21), V2 = c(0,
1, 0.03, 0.54, 0.57, -0.04), V3 = c(0, 0, 1, 0.32, -0.09, 0.24
), V4 = c(0, 0, 0, 1, 0.26, 0.18), V5 = c(0, 0, 0, 0, 1, -0.21
), V6 = c(0, 0, 0, 0, 0, 1)), .Names = c("V1", "V2", "V3", "V4",
"V5", "V6"), row.names = c(NA, -6L), class = "data.frame")
## Using the data frame results in an error:
corrplot::corrplot(data, method = 'color', addCoef.col = "grey")
# Error in matrix(unlist(value, recursive = FALSE, use.names = FALSE), nrow = nr, :
# length of 'dimnames' [2] not equal to array extent
## Using the matrix works:
corrplot::corrplot(as.matrix(data), method = 'color', addCoef.col = "grey")

Resources