pivot to wide dataframe with repeating column names - r

I have a dataframe that I need to get into the right configuration for an external program (PRISM Graphpad). In this toy example I have a matrix of 5x3 conditions with duplicate measurements for each condition. Originally for plotting and analysis in R the data is in the proper long tidy format.
While I am comfortable pivoting the data, I run into problems that the values are not unique when pivoting the DF wide. I need the replicates to be in adjacent columns with the same name for PRISM to properly recognize things. However, when I pivot wide, the duplicate values get shoved into a list because they do not have a unique identified in the id_cols.
In the real-life example the matrix of conditions is of course much larger, there are more repeats (but an identical number for each condition), and on top of that every df is an entry in a list-column, so I will likely need to apply the solution using a purrr::map function.
library(tidyverse)
df <- data.frame("ID" = c(rep(LETTERS[seq(1,5)], 2)),
"cond1" = runif(10, 0, 1),
"cond2" = runif(10, 1, 10),
"cond3" = runif(10, 10, 100))
#// original long dataframe
long_df <- pivot_longer(data = df, cols = c("cond1", "cond2", "cond3"))
long_df
#> # A tibble: 30 x 3
#> ID name value
#> <chr> <chr> <dbl>
#> 1 A cond1 0.424
#> 2 A cond2 9.01
#> 3 A cond3 61.6
#> 4 B cond1 0.460
#> 5 B cond2 2.33
#> 6 B cond3 40.3
#> 7 C cond1 0.107
#> 8 C cond2 5.82
#> 9 C cond3 23.9
#> 10 D cond1 0.714
#> # ... with 20 more rows
#// desired output
desired_df <- cbind(df[c(1:5),], df[c(6:10),])
desired_df <- desired_df[,c(1,2,6,3,7,4,8)]
colnames(desired_df)[c(3,5,7)] <- c("cond1", "cond2", "cond3")
desired_df
#> ID cond1 cond1 cond2 cond2 cond3 cond3
#> 1 A 0.4244798 0.8078372 9.005544 5.349371 61.61488 73.80651
#> 2 B 0.4596927 0.3509671 2.325029 8.636263 40.33949 66.54288
#> 3 C 0.1069974 0.3903294 5.817079 7.100623 23.87013 99.98683
#> 4 D 0.7144698 0.1005499 9.886948 7.006333 19.40680 66.86696
#> 5 E 0.2903691 0.6177356 8.890734 9.863695 46.56568 66.42537
#// result from pivot_wider
wide_df <- pivot_wider(long_df, id_cols = ID, names_from = name, values_from = value)
#> Warning: Values are not uniquely identified; output will contain list-cols.
#> * Use `values_fn = list` to suppress this warning.
#> * Use `values_fn = length` to identify where the duplicates arise
#> * Use `values_fn = {summary_fun}` to summarise duplicates
wide_df
#> # A tibble: 5 x 4
#> ID cond1 cond2 cond3
#> <chr> <list> <list> <list>
#> 1 A <dbl [2]> <dbl [2]> <dbl [2]>
#> 2 B <dbl [2]> <dbl [2]> <dbl [2]>
#> 3 C <dbl [2]> <dbl [2]> <dbl [2]>
#> 4 D <dbl [2]> <dbl [2]> <dbl [2]>
#> 5 E <dbl [2]> <dbl [2]> <dbl [2]>
Created on 2021-01-15 by the reprex package (v0.3.0)
.0)

It is not recommended to have duplicate column names, therefore, we modify the 'name' column by appending an unique index created with rowid, and use that to reshape with pivot_wider
library(dplyr)
library(tidyr)
library(stringr)
library(data.table)
long_df %>%
mutate(name = str_c(name, "_", rowid(ID, name))) %>%
pivot_wider(names_from = name, values_from = value, names_sort = TRUE)
-output
# A tibble: 5 x 7
# ID cond1_1 cond1_2 cond2_1 cond2_2 cond3_1 cond3_2
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 A 0.293 0.920 6.44 9.14 18.5 71.9
#2 B 0.225 0.280 4.34 2.78 59.7 16.9
#3 C 0.704 0.764 7.05 1.40 75.3 64.0
#4 D 0.519 0.802 7.06 5.51 22.4 66.7
#5 E 0.663 0.255 3.88 2.25 30.1 14.2
If it needs to have repeating names, just strip off the _\\d+ at the end of the name with str_remove

Try this:
library(tidyverse)
#Code
new <- df %>%
pivot_longer(-1) %>%
group_by(ID,name) %>%
mutate(name=paste0(name,'.',row_number())) %>%
pivot_wider(names_sort = T,names_from=name,values_from=value)
Output:
# A tibble: 5 x 7
# Groups: ID [5]
ID cond1.1 cond1.2 cond2.1 cond2.2 cond3.1 cond3.2
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 0.619 0.851 4.49 9.17 70.9 84.2
2 B 0.989 0.542 9.64 3.57 55.3 28.3
3 C 0.594 0.602 5.16 8.97 26.2 19.0
4 D 0.349 0.244 5.29 8.52 44.8 17.7
5 E 0.683 0.848 7.27 8.07 97.3 73.9
Then you can process like this:
#Further process
names(new) <- gsub("\\..*","",names(new))
Output:
# A tibble: 5 x 7
# Groups: ID [5]
ID cond1 cond1 cond2 cond2 cond3 cond3
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 0.129 0.360 4.60 2.74 55.3 58.3
2 B 0.416 0.384 5.93 9.17 15.7 21.8
3 C 0.724 0.622 9.30 7.81 76.9 79.0
4 D 0.101 0.951 6.35 1.58 30.3 68.5
5 E 0.238 0.814 9.46 9.50 12.4 57.8
And export to .txt for the other software.

Related

Create a new column based on the value arrangement of another column in R dplyr

I have a data frame that looks like this:
I want to create a new column called labels, which indicates the genes that have the
top2 positive log2FoldChange and small pvalue as well
as the top2 negative log2FoldChange and small pvalue.
library(tidyverse)
df <- tibble(gene=c("AT1G23410","AT2G05530","AT2G26820","AT1G36240","AT5G05260","AT5G47260"),
log2FoldChange=c(14.468018,13.763999,13.708055, -13.480280,-12.166529,-11.468673),
pvalue=c(0.001,0.004,0.005,0.0041,0.0034,0.004))
df
#> # A tibble: 6 × 3
#> gene log2FoldChange pvalue
#> <chr> <dbl> <dbl>
#> 1 AT1G23410 14.5 0.001
#> 2 AT2G05530 13.8 0.004
#> 3 AT2G26820 13.7 0.005
#> 4 AT1G36240 -13.5 0.0041
#> 5 AT5G05260 -12.2 0.0034
#> 6 AT5G47260 -11.5 0.004
Created on 2022-10-19 with reprex v2.0.2
I want my data to look like this.
#> gene log2FoldChange pvalue labels
#> <chr> <dbl> <dbl>
#> 1 AT1G23410 14.5 0.001 AT1G23410
#> 2 AT2G05530 13.8 0.004 AT2G05530
#> 3 AT2G26820 13.7 0.005
#> 4 AT1G36240 -13.5 0.0041 AT1G36240
#> 5 AT5G05260 -12.2 0.0034 AT5G05260
#> 6 AT5G47260 -11.5 0.004
EDIT:
by small pvalue I mean the smallest value in the dataset. In terms of arrange it would be:
arrange(log2FoldChange, pvalue) for the negative ones, and for the positive arrange(desc(log2FoldChange), pvalue)
does it make sense
You can arrange and use ifelse:
df %>%
arrange(log2FoldChange) %>%
mutate(labels = ifelse(row_number() %in% c(1:2, (n()-1):n()), gene, ""))
output
# A tibble: 6 × 4
gene log2FoldChange pvalue labels
<chr> <dbl> <dbl> <chr>
1 AT1G36240 -13.5 0.0041 "AT1G36240"
2 AT5G05260 -12.2 0.0034 "AT5G05260"
3 AT5G47260 -11.5 0.004 ""
4 AT2G26820 13.7 0.005 ""
5 AT2G05530 13.8 0.004 "AT2G05530"
6 AT1G23410 14.5 0.001 "AT1G23410"
If there is a filtering to do with pvalue, you can do:
df %>%
mutate(labels = ifelse(log2FoldChange %in% c(head(log2FoldChange[order(log2FoldChange, pvalue)], 2),
head(log2FoldChange[order(-log2FoldChange, pvalue)], 2)), gene, ""))
Here's an option:
library(dplyr)
library(purrr)
df <- tibble(gene=c("AT1G23410","AT2G05530","AT2G26820","AT1G36240","AT5G05260","AT5G47260"),
log2FoldChange=c(14.468018,13.763999,13.708055, -13.480280,-12.166529,-11.468673),
pvalue=c(0.001,0.004,0.005,0.0041,0.0034,0.004))
df %>% left_join(
df %>%
mutate(positive = ifelse(log2FoldChange >= 0, "y", "n")) %>%
split(.$positive) %>%
map(~.x %>% arrange(desc(abs(log2FoldChange))) %>% slice(1:2) %>% select(gene) %>% mutate(labels = gene)) %>%
bind_rows())
#> Joining, by = "gene"
#> # A tibble: 6 × 4
#> gene log2FoldChange pvalue labels
#> <chr> <dbl> <dbl> <chr>
#> 1 AT1G23410 14.5 0.001 AT1G23410
#> 2 AT2G05530 13.8 0.004 AT2G05530
#> 3 AT2G26820 13.7 0.005 <NA>
#> 4 AT1G36240 -13.5 0.0041 AT1G36240
#> 5 AT5G05260 -12.2 0.0034 AT5G05260
#> 6 AT5G47260 -11.5 0.004 <NA>
You can arrange the data and label first and last 2 rows.
library(dplyr)
df %>%
arrange(desc(log2FoldChange)) %>%
mutate(label = case_when(row_number() %in% c(1, 2, n()-1, n()) ~ gene,
TRUE ~ ""))
# gene log2FoldChange pvalue label
# <chr> <dbl> <dbl> <chr>
#1 AT1G23410 14.5 0.001 "AT1G23410"
#2 AT2G05530 13.8 0.004 "AT2G05530"
#3 AT2G26820 13.7 0.005 ""
#4 AT5G47260 -11.5 0.004 ""
#5 AT5G05260 -12.2 0.0034 "AT5G05260"
#6 AT1G36240 -13.5 0.0041 "AT1G36240"
Note - It is not clear to me what you mean by small p-value. This answer only considers log2FoldChange variable which seems to match with your expected output.
A base R approach using order() only, and some indexing
df$labels <- NA
df[order(df$log2FoldChange),][1:2,]$labels <- df[order(df$log2FoldChange),][1:2,]$gene
df[order(df$log2FoldChange, decreasing = TRUE),][1:2,]$labels <- df[order(df$log2FoldChange, decreasing = TRUE),][1:2,]$gene
df
# A tibble: 6 × 4
gene log2FoldChange pvalue labels
<chr> <dbl> <dbl> <chr>
1 AT1G23410 14.5 0.001 AT1G23410
2 AT2G05530 13.8 0.004 AT2G05530
3 AT2G26820 13.7 0.005 NA
4 AT1G36240 -13.5 0.0041 AT1G36240
5 AT5G05260 -12.2 0.0034 AT5G05260
6 AT5G47260 -11.5 0.004 NA

map_dfr outputting a row rather than a column

This is similar to purrr::map_dfr binds by columns, not row as expected but the solutions there aren't working for me. I have a dataframe like
beta_df <- structure(list(intercept = c(-2.75747056032685, -2.90831892599742,
-2.92478082251453, -2.99701559041538, -2.88885796048347, -3.09564193631675
), B1 = c(0.0898235360814854, 0.0291839369781567, 0.0881023522236231,
0.231703026085554, 0.0441573699433149, 0.258219673780526), B2 = c(-0.222367437619057,
0.770536384299238, 0.199648657850609, 0.0529038155448773, 0.00310458335580774,
0.132604387458483), B3 = c(1.26339268033385, 1.29883641278223,
0.949504940387809, 1.26904511447941, 0.863882674439083, 0.823907268679309
), B4 = c(2.13662994525526, 1.02340744740827, 0.959079691725652,
1.60672779812489, 1.19095838867883, -0.0693120654049908)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))
#> # A tibble: 6 × 5
#> intercept B1 B2 B3 B4
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 -2.76 0.0898 -0.222 1.26 2.14
#> 2 -2.91 0.0292 0.771 1.30 1.02
#> 3 -2.92 0.0881 0.200 0.950 0.959
#> 4 -3.00 0.232 0.0529 1.27 1.61
#> 5 -2.89 0.0442 0.00310 0.864 1.19
#> 6 -3.10 0.258 0.133 0.824 -0.0693
I'd like to turn this into a tibble with columns for the mean, 0.025 and 0.975 quantiles. For the quantile function this works:
beta_df %>%
map_dfr(quantile,0.025)
#> # A tibble: 5 × 1
#> `2.5%`
#> <dbl>
#> 1 -3.08
#> 2 0.0311
#> 3 -0.194
#> 4 0.829
#> 5 0.0592
And this gets me both quantiles
bind_cols(beta_df %>%
map_dfr(quantile, 0.025),
beta_df %>%
map_dfr(quantile, 0.975))
#> # A tibble: 5 × 2
#> `2.5%` `97.5%`
#> <dbl> <dbl>
#> 1 -3.08 -2.77
#> 2 0.0311 0.255
#> 3 -0.194 0.699
#> 4 0.829 1.30
#> 5 0.0592 2.07
But for mean,
beta_df %>%
map_dfr(mean)
#> # A tibble: 1 × 5
#> intercept B1 B2 B3 B4
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 -2.93 0.124 0.156 1.08 1.14
Gives me a long row rather than a column. How can I turn the mean of each column of the original dataframe into a row of a single column dataframe labelled mean?
The reason is because the output of quantile() is a named vector whereas for the mean() is just a single value.
Lets create a custom function with the mean that outputs a named vector,
myMean <- function(x) {setNames(mean(x), nm = 'theMean')}
Applying that using map_dfr we get,
library(dplyr)
beta_df %>%
purrr::map_dfr(myMean)
# A tibble: 5 x 1
theMean
<dbl>
1 -2.93
2 0.124
3 0.156
4 1.08
5 1.14

dplyr unquoting does not work with filter function

maybe I am missing something, but I can't seem to make dplyr's unquoting operator to work with the filter function. It does with with select, but not with filter...
Example
set.seed(1234)
A = matrix(rnorm(100),nrow = 10, ncol = 10)
colnames(A) <- paste("var", seq(1:10), sep = "")
varname_test <- "var2"
A <- as_tibble(A)
select(A, !!varname_test) #this works as expected
# this does NOT give me only the rows where var2
# is positive
(result1 <- filter(A, !!varname_test > 0))
# This is how the result 1 should look like
(result2 <- filter(A, var2 > 0))
# result1 is not equal to result2
I would appreciate any help!
I would suggest the following:
library(dplyr)
filter_at(A, vars(starts_with(varname_test)), any_vars(. > 0))
A tibble: 3 x 10
var1 var2 var3 var4 var5 var6 var7 var8 var9 var10
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 -2.35 0.0645 0.460 -0.501 -0.281 -1.01 -0.670 0.648 -0.174 1.00
2 0.429 0.959 -0.694 -1.63 -0.994 -0.162 -0.00760 2.07 0.850 -0.496
3 -0.890 2.42 -0.936 -0.466 -0.497 -1.16 0.336 -0.317 -1.19 2.12
Here, my solution (1g) uses filter_ and conditions built up with paste.
Of course, 1a is a perfectly fine solution (as was provided by joran and aosmith in the comments).
I thought this might be a good place to use curly curly but I couldn't get it to work (maybe not applicable?)
I also thought: what if we wanted to filter by multiple variables? This is where you see 2g working below (while 2a does not work anymore).
Other issues: filter_ is now deprecated, and I'm not sure what the correct syntax would be here. Will be asking this in a question.
library(tidyverse)
set.seed(1234)
A <- matrix(rnorm(30),nrow = 10, ncol = 3) %>% as_tibble() %>% set_names(paste("var", seq(1:3), sep = ""))
varnames_1 <- c("var2")
(expected_result_1 <- filter(A, var2 > 0))
#> # A tibble: 3 x 3
#> var1 var2 var3
#> <dbl> <dbl> <dbl>
#> 1 -2.35 0.0645 0.460
#> 2 0.429 0.959 -0.694
#> 3 -0.890 2.42 -0.936
(answer_1a <- filter(A,!!ensym(varnames_1) > 0)) # works (thanks joran and aosmith)
#> # A tibble: 3 x 3
#> var1 var2 var3
#> <dbl> <dbl> <dbl>
#> 1 -2.35 0.0645 0.460
#> 2 0.429 0.959 -0.694
#> 3 -0.890 2.42 -0.936
(answer_1b <- filter_(A, varnames_1 > 0)) # filter_ not doing what I thought it might
#> Warning: filter_() is deprecated.
#> Please use filter() instead
#>
#> The 'programming' vignette or the tidyeval book can help you
#> to program with filter() : https://tidyeval.tidyverse.org
#> This warning is displayed once per session.
#> # A tibble: 10 x 3
#> var1 var2 var3
#> <dbl> <dbl> <dbl>
#> 1 -1.21 -0.477 0.134
#> 2 0.277 -0.998 -0.491
#> 3 1.08 -0.776 -0.441
#> 4 -2.35 0.0645 0.460
#> 5 0.429 0.959 -0.694
#> 6 0.506 -0.110 -1.45
#> 7 -0.575 -0.511 0.575
#> 8 -0.547 -0.911 -1.02
#> 9 -0.564 -0.837 -0.0151
#> 10 -0.890 2.42 -0.936
(answer_1c <- filter(A, {{varnames_1}} > 0)) # curly curly not doing what I thought it might
#> # A tibble: 10 x 3
#> var1 var2 var3
#> <dbl> <dbl> <dbl>
#> 1 -1.21 -0.477 0.134
#> 2 0.277 -0.998 -0.491
#> 3 1.08 -0.776 -0.441
#> 4 -2.35 0.0645 0.460
#> 5 0.429 0.959 -0.694
#> 6 0.506 -0.110 -1.45
#> 7 -0.575 -0.511 0.575
#> 8 -0.547 -0.911 -1.02
#> 9 -0.564 -0.837 -0.0151
#> 10 -0.890 2.42 -0.936
(answer_1d <- filter(A, {{varnames_1 > 0}})) # curly curly not doing what I thought it might
#> `arg` must be a symbol
conditions_1 <- paste(varnames_1, "> 0")
(answer_1e <- filter(A, conditions_1)) # does not work
#> Error: Argument 2 filter condition does not evaluate to a logical vector
(answer_1f <- filter(A, {{conditions_1}})) # curly curly not doing what I thought it might
#> Error: Argument 2 filter condition does not evaluate to a logical vector
(answer_1g <- filter_(A, conditions_1)) # works
#> # A tibble: 3 x 3
#> var1 var2 var3
#> <dbl> <dbl> <dbl>
#> 1 -2.35 0.0645 0.460
#> 2 0.429 0.959 -0.694
#> 3 -0.890 2.42 -0.936
# what if we wanted to filter multiple variables?
varnames_2 <- c("var2", "var3")
(expected_result_2 <- filter(A, var2 > 0 & var3 > 0))
#> # A tibble: 1 x 3
#> var1 var2 var3
#> <dbl> <dbl> <dbl>
#> 1 -2.35 0.0645 0.460
(answer_2a <- filter(A,!!ensym(varnames_2) > 0)) # does not work
#> Only strings can be converted to symbols
conditions_2 <- paste(paste(varnames_2, "> 0"), collapse = " & ")
(answer_2f <- filter(A, {{conditions_2}})) # curly curly not doing what I thought it might
#> Error: Argument 2 filter condition does not evaluate to a logical vector
(answer_2g <- filter_(A, conditions_2)) # works
#> # A tibble: 1 x 3
#> var1 var2 var3
#> <dbl> <dbl> <dbl>
#> 1 -2.35 0.0645 0.460
Created on 2019-08-28 by the reprex package (v0.3.0)

Trying to generate a data frame from values in another data frame using dplyr and purrr

I'm trying to create a data frame that is generated based on the content of another data frame. In the example below I use the n_seqs column of a tibble to specify the mean for the rnorm function and then I generate my_tibble. The first column of my_tibble should contain the value from the group column and the subsequent columns should contain the 10 random values from running rnorm. As the reproducible example below shows, I'm able to get this to work through a fairly hacky approach.
I don't understand...
Why I have to do pull and can't specify n_seqs in the map function. Also
Whether there's a way to name the individual entries in the list so that I can use map_dfr or bind_rows
What is the best dplyr/purrr approach do get the desired result?
library(tidyverse)
my_tibble <- tibble(group=c("A", "B", "C"), n_seqs=c(5,7,10)) %>%
pull(n_seqs) %>%
map(function(x){ z <- rnorm(x, n=10); names(z) <- letters[1:10]; return(z) })
my_tibble
#> [[1]]
#> a b c d e f g h
#> 6.518214 4.305639 6.106827 5.118304 4.255043 5.678025 4.345129 4.914239
#> i j
#> 6.727135 6.030590
#>
#> [[2]]
#> a b c d e f g h
#> 7.969410 7.558780 8.265322 8.004338 6.862732 5.517313 8.061683 4.062385
#> i j
#> 6.693430 7.858993
#>
#> [[3]]
#> a b c d e f g
#> 9.066362 9.921300 10.724671 8.643903 9.783747 9.102569 10.489579
#> h i j
#> 9.156070 9.863332 11.148255
#error
my_tibble %>% bind_rows(.)
#> Error in bind_rows_(x, .id): Argument 1 must have names
# deprecated warning, but desired output
my_tibble %>% rbind_list %>% mutate(sample=c("A", "B", "C")) %>% select(sample, everything())
#> Warning: 'rbind_list' is deprecated.
#> Use 'bind_rows()' instead.
#> See help("Deprecated")
#> # A tibble: 3 x 11
#> sample a b c d e f g h i j
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A 6.52 4.31 6.11 5.12 4.26 5.68 4.35 4.91 6.73 6.03
#> 2 B 7.97 7.56 8.27 8.00 6.86 5.52 8.06 4.06 6.69 7.86
#> 3 C 9.07 9.92 10.7 8.64 9.78 9.10 10.5 9.16 9.86 11.1
#desired output
my_tibble %>% do.call(rbind, .) %>% as.tibble() %>% mutate(sample=c("A", "B", "C")) %>% select(sample, everything())
#> # A tibble: 3 x 11
#> sample a b c d e f g h i j
#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 A 6.52 4.31 6.11 5.12 4.26 5.68 4.35 4.91 6.73 6.03
#> 2 B 7.97 7.56 8.27 8.00 6.86 5.52 8.06 4.06 6.69 7.86
#> 3 C 9.07 9.92 10.7 8.64 9.78 9.10 10.5 9.16 9.86 11.1
Created on 2018-06-12 by the reprex package (v0.2.0).
The list elements are named vectors.
We convert it to to a tibble and then do the bind_rows or use map_df
my_tibble %>%
map_df(~ as.list(.x) %>%
as_tibble)
# A tibble: 3 x 10
# a b c d e f g h i j
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 7.40 4.96 5.69 5.03 4.26 5.19 3.20 6.47 5.15 7.17
#2 7.48 6.29 7.61 6.07 5.75 7.29 6.56 7.00 7.07 6.41
#3 9.43 9.86 11.2 8.48 10.6 10.3 11.1 9.70 10.4 10.3
Or data.frame (with as.data.frame.list)
my_tibble %>%
map_df(as.data.frame.list)
# a b c d e f g h
#1 7.401618 4.960760 5.689739 5.028002 4.256727 5.188792 3.195041 6.465555
#2 7.475510 6.290054 7.610726 6.065902 5.746367 7.291446 6.556708 7.001105
#3 9.431331 9.864821 11.178087 8.476433 10.593946 10.332950 11.063100 9.695816
# i j
#1 5.153253 7.172612
#2 7.074341 6.410479
#3 10.370019 10.267099
Regarding the first question, we can use map within mutate and then pull the column
tibble(group=c("A", "B", "C"), n_seqs=c(5,7,10)) %>%
mutate(new_col = map(n_seqs, ~ as.list(rnorm(.x, n = 10)) %>%
set_names(letters[1:10]))) %>%
pull(new_col) %>%
bind_rows
# A tibble: 3 x 10
# a b c d e f g h i j
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 5.45 4.98 4.68 4.07 3.51 3.92 6.00 4.38 3.62 6.87
#2 7.43 6.76 8.06 7.89 6.38 9.21 6.74 5.58 6.86 7.21
#3 12.3 10.1 10.5 9.92 9.67 9.97 10.8 12.1 11.0 11.2
Based on the comments, if we need the 'group' column as well
tibble(group= c("A", "B", "C"), n_seqs = c(5, 7, 10)) %>%
nest(-group) %>%
mutate(new_col = map(data, ~
.x %>%
pull(n_seqs) %>%
rnorm(., n = 10 ) %>%
set_names(letters[1:10]) %>%
as.list %>%
as_tibble)) %>%
select(-data) %>%
unnest
# A tibble: 3 x 11
# group a b c d e f g h i j
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 A 6.77 5.34 4.38 4.56 4.49 5.19 5.18 5.92 5.32 4.63
#2 B 6.06 7.63 6.94 7.18 8.10 8.75 6.05 8.64 6.13 7.27
#3 C 10.2 9.72 11.4 9.34 10.7 9.99 9.07 11.2 7.91 9.47
NOTE: Values are different as we didn't set a seed
Why I have to do pull and can't specify n_seqs in the map function
Because unlike mutate or summarize, map is meant to work on lists and vectors, so it can't infer column from a data frame.
Whether there's a way to name the individual entries in the list so
that I can use map_dfr or bind_rows
See #akrun's answer, you need to convert each individual vector to list before using bind_rows or map_df.
What is the best dplyr/purrr approach do get the desired result?
Try start from sapply which simplifies the result as a matrix instead of map which you can conveniently convert to a data frame later. Here is one in baseR only:
df <- tibble(group=c("A", "B", "C"), n_seqs=c(5,7,10))
sapply(df$n_seqs, rnorm, n=10) %>%
t %>% as.data.frame %>%
setNames(letters[1:10])
# A tibble: 3 x 10
# a b c d e f g h i j
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 4.93 4.99 3.64 4.19 4.84 3.15 3.81 5.87 2.25 5.80
#2 6.34 5.30 7.56 5.73 6.84 7.30 6.84 7.91 6.60 6.36
#3 9.42 9.28 8.46 10.6 9.73 9.39 10.2 10.8 10.2 9.30

change several column names() in data.frame() with str_replace_all()

I read this this question and practiced matching patterns, but I am still not figuring it.
I have a panel with the same measure, several times per year. Now, I want to rename them in a logical way. My raw data looks a bit like this,
set.seed(667)
dta <- data.frame(id = 1:6,
R1213 = runif(6),
R1224 = runif(6, 1, 2),
R1255 = runif(6, 2, 3),
R1235 = runif(6, 3, 4))
# install.packages(c("tidyverse"), dependencies = TRUE)
require(tidyverse)
(tbl <- dta %>% as_tibble())
#> # A tibble: 6 x 5
#> id R1213 R1224 R1255 R1235
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0.488 1.60 2.07 3.07
#> 2 2 0.692 1.42 2.76 3.19
#> 3 3 0.262 1.34 2.33 3.82
#> 4 4 0.330 1.77 2.61 3.93
#> 5 5 0.582 1.92 2.15 3.86
#> 6 6 0.930 1.88 2.56 3.59
Now, I use str_replace_all() to rename them, here with only one variable in where I use pate, and everything is fine (it might also be possible to optimize this in other ways, if so please feel to let me know),
names(tbl) <- tbl %>% names() %>%
str_replace_all('^R1.[125].$', 'A') %>%
str_replace_all('^R1.[3].$', paste0('A.2018.', 1))
tbl
#> # A tibble: 6 x 5
#> id A A A A.2018.1
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0.488 1.60 2.07 3.07
#> 2 2 0.692 1.42 2.76 3.19
#> 3 3 0.262 1.34 2.33 3.82
#> 4 4 0.330 1.77 2.61 3.93
#> 5 5 0.582 1.92 2.15 3.86
#> 6 6 0.930 1.88 2.56 3.59
Eveything call A is actually from the same year, let's say 2017, but with the suffix .1, .2, etc. need to appended. I start over and again use paste0('A.2017.', 1:3), but this time with three suffices,
tbl <- dta %>% as_tibble()
names(tbl) <- tbl %>% names() %>%
str_replace_all('^R1.[125].$', paste0('A.2017.', 1:3)) %>%
str_replace_all('^R1.[7].$', paste0('A.2018.', 1))
tbl
#> Warning message:
#> In stri_replace_all_regex(string, pattern, fix_replacement(replacement), :
#> longer object length is not a multiple of shorter object length
#> > tbl
#> # A tibble: 6 x 5
#> id A.2017.2 A.2017.3 A.2017.1 R1235
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0.488 1.60 2.07 3.07
#> 2 2 0.692 1.42 2.76 3.19
#> 3 3 0.262 1.34 2.33 3.82
#> 4 4 0.330 1.77 2.61 3.93
#> 5 5 0.582 1.92 2.15 3.86
#> 6 6 0.930 1.88 2.56 3.59
this does come out, but the order is reversed and I am told longer object length is not a multiple of shorter object length, but isen't 3 the right length? I am looking to do this in a cleaner and simpler way. Also, I don't really like names(tbl) <-, if that can be done in a more elegant way.
Building on David's suggestion - how about something like the following using dplyr::rename_at?
library(dplyr)
## Get data
set.seed(667)
dta <- data.frame(id = 1:6,
R1213 = runif(6),
R1224 = runif(6, 1, 2),
R1255 = runif(6, 2, 3),
R1235 = runif(6, 3, 4)) %>%
as_tibble()
## Rename
dta <- dta %>%
rename_at(.vars = grep('^R1.[125].$', names(.)),
.funs = ~paste0("A.2017.", 1:length(.)))
dta
#> # A tibble: 6 x 5
#> id A.2017.1 A.2017.2 A.2017.3 R1235
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0.196 1.74 2.51 3.49
#> 2 2 0.478 1.85 2.06 3.69
#> 3 3 0.780 1.32 2.21 3.26
#> 4 4 0.705 1.49 2.49 3.33
#> 5 5 0.942 1.59 2.66 3.58
#> 6 6 0.906 1.90 2.87 3.93
Vectorised solution for multiple patterns
For a complete solution that can be used for multiple patterns and replacements, we can make use of purr::map2_dfc as follows.
library(dplyr)
library(purrr)
## Get data
set.seed(667)
dta <- data.frame(id = 1:6,
R1213 = runif(6),
R1224 = runif(6, 1, 2),
R1255 = runif(6, 2, 3),
R1235 = runif(6, 3, 4)) %>%
as_tibble()
## Define a function to keep a hold out data set, then rename iteratively for each pattern and replacement.
rename_multiple_years <- function(df, patterns,
replacements,
hold_out_var = "id") {
hold_out_df <- df %>%
select_at(.vars = hold_out_var)
rename_df <- map2_dfc(patterns, replacements, function(pattern, replacement) {
df %>%
rename_at(.vars = grep(pattern, names(.)),
.funs = ~paste0(replacement, 1:length(.))) %>%
select_at(.vars = grep(replacement, names(.)))
})
final_df <- bind_cols(hold_out_df, rename_df)
return(final_df)
}
## Call function on specified patterns and replacements
renamed_dta <- dta %>%
rename_multiple_years(patterns = c("^R1.[125].$", "^R1.[3].$"),
replacements = c("A.2017.", "A.2018."))
renamed_dta
#> # A tibble: 6 x 5
#> id A.2017.1 A.2017.2 A.2017.3 A.2018.1
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0.196 1.74 2.51 3.49
#> 2 2 0.478 1.85 2.06 3.69
#> 3 3 0.780 1.32 2.21 3.26
#> 4 4 0.705 1.49 2.49 3.33
#> 5 5 0.942 1.59 2.66 3.58
#> 6 6 0.906 1.90 2.87 3.93
Towards tidy data
Now that the variables have been renamed you might find it useful to have your data in a tidy format. The following using tidyr::gather might be useful.
library(tidyr)
library(dplyr)
#Use tidy dataframe gather all variables, split by "." and drop A column (or keep if a measurement id)
renamed_dta %>%
gather(key = "measure", value = "value", -id) %>%
separate(measure, c("A", "year", "measure"), "[[.]]") %>%
select(-A)
#> # A tibble: 24 x 4
#> id year measure value
#> <int> <chr> <chr> <dbl>
#> 1 1 2017 1 0.196
#> 2 2 2017 1 0.478
#> 3 3 2017 1 0.780
#> 4 4 2017 1 0.705
#> 5 5 2017 1 0.942
#> 6 6 2017 1 0.906
#> 7 1 2017 2 1.74
#> 8 2 2017 2 1.85
#> 9 3 2017 2 1.32
#> 10 4 2017 2 1.49
#> # ... with 14 more rows

Resources