User Defined Function not working in dplyr pipe - r

I have a dataset with proteins accession numbers (DataGranulomeTidy). I have written a function (extractInfo) in r to scrape some information of those proteins from the ncbi website. The function works as expected when I run it in a short "for" loop.
DataGranulomeTidy <- tibble(GIaccessionNumber = c("29436380", "4504165", "17318569"))
extractInfo <- function(GInumber){
tempPage <- readLines(paste("https://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?id=", GInumber, "&db=protein&report=genpept&conwithfeat=on&withparts=on&show-cdd=on&retmode=html&withmarkup=on&tool=portal&log$=seqview&maxdownloadsize=1000000", sep = ""), skipNul = TRUE)
tempPage <- base::paste(tempPage, collapse = "")
Accession <- str_extract(tempPage, "(?<=ACCESSION).{3,20}(?=VERSION)")
Symbol <- str_extract(tempPage, "(?<=gene=\").{1,20}(?=\")")
GeneID <- str_extract(tempPage, "(?<=gov/gene/).{1,20}(?=\">)")
out <- paste(Symbol, Accession, GeneID, sep = "---")
return(out)
}
for(n in 1:3){
print(extractInfo(GInumber = DataGranulomeTidy$GIaccessionNumber[n]))
}
[1] "MYH9--- AAH49849---4627"
[1] "GSN--- NP_000168---2934"
[1] "KRT1--- NP_006112---3848"
When I use the same function in a dplyr pipe I doesn't work and I can't figure our why.
> DataGranulomeTidy %>% mutate(NewVar = extractInfo(.$GIaccessionNumber))
Error in file(con, "r") : argumento 'description' inválido
At this point I could make things work without using the "pipe" operator by using the "for" operator but I would like so much to understand why the function does not work in the dplyr pipe.

It is the cause that your UDF can't treat vector.
vectorized_extractInfo <- Vectorize(extractInfo, "GInumber")
DataGranulomeTidy %>%
mutate(NewVar = vectorized_extractInfo(GIaccessionNumber))

As #cuttlefish44 already pointed out, the problem is that your fun is not a vectorized fun. My approach uses purrr::map_chr. Another option would be to use dplyr::rowwise:
library(tidyverse)
DataGranulomeTidy <- tibble(GIaccessionNumber = c("29436380", "4504165", "17318569"))
extractInfo <- function(GInumber){
tempPage <- readLines(paste("https://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?id=", GInumber, "&db=protein&report=genpept&conwithfeat=on&withparts=on&show-cdd=on&retmode=html&withmarkup=on&tool=portal&log$=seqview&maxdownloadsize=1000000", sep = ""), skipNul = TRUE)
tempPage <- base::paste(tempPage, collapse = "")
Accession <- str_extract(tempPage, "(?<=ACCESSION).{3,20}(?=VERSION)")
Symbol <- str_extract(tempPage, "(?<=gene=\").{1,20}(?=\")")
GeneID <- str_extract(tempPage, "(?<=gov/gene/).{1,20}(?=\">)")
out <- paste(Symbol, Accession, GeneID, sep = "---")
return(out)
}
DataGranulomeTidy %>% mutate(NewVar = map_chr(GIaccessionNumber, extractInfo))
#> # A tibble: 3 x 2
#> GIaccessionNumber NewVar
#> <chr> <chr>
#> 1 29436380 MYH9--- AAH49849---4627
#> 2 4504165 GSN--- NP_000168---2934
#> 3 17318569 KRT1--- NP_006112---3848
Created on 2020-04-17 by the reprex package (v0.3.0)

There is a rentrez package for NCBI queries, for example:
library(rentrez)
protein <- entrez_summary("protein", id = 29436380)
protein$caption
# [1] "AAH49849"
links <- entrez_link(dbfrom = "protein", id = 29436380, db = "gene")
links$links$protein_gene
# [1] "4627"
gene <- entrez_summary("gene", id = links$links$protein_gene)
gene$name
# [1] "MYH9"
Wrap this up into a function, then we don't need to mess about with regex.

Related

How grip files based on words from a file?

I have this text file:
l=c("ced","nad")
h=c("SAF","EYR")
res=cbind(l,h)
and this list of files:
dirf<- list.files ("path", "*.txt", full.names = TRUE)
example of files
ced_SAF_jkh_2020.txt
ced_EYR_jkh_2001.txt
nad_SAF_jkh_200.txt
nad_EYR_jkh_200.txt
I want to grip files that contain both words in the two columns, so the files i need
ced_SAF_jkh_2020.txt
nad_EYR_jkh_200.txt
You can construct the name from the matrix and use that, i.e.
do.call(paste, c(data.frame(res), sep = '_'))
#[1] "ced_SAF" "nad_EYR"
To grep them you can do,
ptrn <- do.call(paste, c(data.frame(res), sep = '_'))
grep(paste(ptrn, collapse = '|'), x, value = TRUE)
#[1] "ced_SAF_jkh_2020.txt" "nad_EYR_jkh_200.txt"
where x,
dput(x)
c("ced_SAF_jkh_2020.txt", "ced_EYR_jkh_2001.txt", "nad_SAF_jkh_200.txt",
"nad_EYR_jkh_200.txt")
Another possible solution, based on tidyverse:
library(tidyverse)
l=c("ced","nad")
h=c("SAF","EYR")
res=cbind(l,h)
df <- data.frame(
files = c("ced_SAF_jkh_2020.txt","ced_EYR_jkh_2001.txt","nad_SAF_jkh_200.txt",
"nad_EYR_jkh_200.txt")
)
df %>%
filter((str_detect(files, res[1,1]) & str_detect(files, res[1,2])) |
(str_detect(files, res[2,1]) & str_detect(files, res[2,2])))
#> files
#> 1 ced_SAF_jkh_2020.txt
#> 2 nad_EYR_jkh_200.txt
Or, more compactly, with purrr::map2_dfr:
library(tidyverse)
map2_dfr(res[,1], res[,2],
~ filter(df, (str_detect(files, .x) & str_detect(files, .y))))
#> files
#> 1 ced_SAF_jkh_2020.txt
#> 2 nad_EYR_jkh_200.txt
You can use sprintf() + paste(collapse = '|') to make the expected syntax of regular expression and pass it to list.files() directly:
regex <- paste(sprintf("%s_%s", l, h), collapse = '|')
# [1] "ced_SAF|nad_EYR"
list.files("path_to_file", regex, full.names = TRUE)
Then all the file names which match the regular expression will be returned.

PowerBI R Script Runtime Error,Prefixing UQ() with the rlang namespace is deprecated as of rlang 0.3.0

Below is the reproducible code which needs to be pasted in PowerBI R script visualization.
I'm making some customizations to the default process_map object.
The visual works on my current desktop but gives out error when published to PowerBI Web.
# The following code to create a dataframe and remove duplicated rows is always executed and acts as a preamble for your script:
# dataset <- data.frame(Column1)
# dataset <- unique(dataset)
# Paste or type your script code here:
library(bupaR)
library(DiagrammeR)
library(tidyverse)
library(lubridate)
# convert to proper date
processMap <- patients %>%
process_map(sec = performance(median, "hours")
,type = frequency("relative_case")
,type_edges = frequency("absolute_case")
,rankdir = "LR"
,layout = layout_pm(edge_weight = TRUE)
,fixed_edge_width = F
,render = F
)
# customisation(label, color, font)
processMap$nodes_df$label <- stringr::str_replace_all(processMap$nodes_df$label, c('ARTIFICIAL_START' = 'Start', 'ARTIFICIAL_END' = 'End'))
processMap$nodes_df$color <- stringr::str_replace_all(processMap$nodes_df$color, c('chartreuse4' = '#769e00', 'brown4' = '#7a0f2d'))
processMap$nodes_df$fontcolor <- stringr::str_replace_all(processMap$nodes_df$fontcolor, c('chartreuse4' = '#769e00', 'brown4' = '#7a0f2d'))
processMap$nodes_df$fontname <- stringr::str_replace_all(processMap$nodes_df$fontname, c('Arial' = 'Calibri'))
processMap$edges_df$fontname <- stringr::str_replace_all(processMap$edges_df$fontname, c('Arial' = 'Calibri'))
# change edge with
processMap$edges_df$penwidth <- scales::rescale(processMap$edges_df$penwidth, to = c(0.5, 4))
# custom duration edges
edges_label <- processMap$edges_df$label
tmp <- regmatches(edges_label, gregexpr("\\(.*?\\)", edges_label))
tmp <- gsub("[\\(\\)]", "", tmp)
tmp <- stringr::str_replace_all(tmp, c('character0' = '0', ' hours'=''))
tmp <- as.numeric(tmp)*60*60
tmp <- lubridate::as.duration(tmp)
tmp <- regmatches(tmp, gregexpr("\\(.*?\\)", tmp))
tmp <- gsub("[\\(\\)]", "", tmp)
edges_label_duration <- stringr::str_replace_all(tmp, c('character0' = '<1 hour'))
edges_label_count_absolute_case <- gsub("\n.*","",edges_label)
edges_label_clean <- paste0(edges_label_count_absolute_case, '\n',edges_label_duration)
# remove those which should have no hour()
correct_label <- gsub("\n.*","",edges_label_clean[!grepl("hour", edges_label)])
edges_label_clean[!grepl("hour", edges_label)] <- correct_label
# assign to process object
processMap$edges_df$label <- edges_label_clean
DiagrammeR::export_graph(graph = processMap, file_type = 'png', file_name = 'processMap.png')
Below is the visual output in PowerBI desktop
When published to the PowerBI portal, it gives the following error
Error in if (any(ind)) Encoding(x[ind]) <- "bytes" :
missing value where TRUE/FALSE needed
In addition: Warning message:
Prefixing `UQ()` with the rlang namespace is deprecated as of rlang 0.3.0.
Please use the non-prefixed form or `!!` instead.
# Bad:
rlang::expr(mean(rlang::UQ(var) * 100))
# Ok:
rlang::expr(mean(UQ(var) * 100))
# Good:
rlang::expr(mean(!!var * 100))
rlang's on desktop is rlang_0.4.10 while the PowerBI Web has a rlang_0.3.0
I ended up re-writing the customisation code so that there's no function within a function which is referenced by the error code below
# Bad:
rlang::expr(mean(rlang::UQ(var) * 100))
I suspect the related function is this one
regmatches(edges_label, gregexpr("\\(.*?\\)", edges_label))
Couldn't really definitively say it's the case but anyways the updated code below generates no more error when published to PowerBI service/web.
library(bupaR)
library(DiagrammeR)
library(tidyverse)
library(lubridate)
# convert to proper date
processMap <- patients %>%
process_map(sec = performance(median, "hours")
,type = frequency("relative_case")
,type_edges = frequency("absolute_case")
,rankdir = "LR"
,layout = layout_pm(edge_weight = TRUE)
,fixed_edge_width = F
,render = F
)
# customisation(label, color, font)
processMap$nodes_df$label <- stringr::str_replace_all(processMap$nodes_df$label, c('ARTIFICIAL_START' = 'Start', 'ARTIFICIAL_END' = 'End'))
processMap$nodes_df$color <- stringr::str_replace_all(processMap$nodes_df$color, c('chartreuse4' = '#769e00', 'brown4' = '#7a0f2d'))
processMap$nodes_df$fontcolor <- stringr::str_replace_all(processMap$nodes_df$fontcolor, c('chartreuse4' = '#769e00', 'brown4' = '#7a0f2d'))
processMap$nodes_df$fontname <- stringr::str_replace_all(processMap$nodes_df$fontname, c('Arial' = 'Calibri'))
processMap$edges_df$fontname <- stringr::str_replace_all(processMap$edges_df$fontname, c('Arial' = 'Calibri'))
# change edge with
processMap$edges_df$penwidth <- scales::rescale(processMap$edges_df$penwidth, to = c(0.5, 4))
# custom duration edges
edges_label <- processMap$edges_df$label
# extract within bracket
tmp <- stringr::str_extract(string = edges_label,
pattern = "(?<=\\().*(?=\\))")
tmp <- stringr::str_replace_all(tmp, c(' hours'=''))
tmp[is.na(tmp)] <- '0'
tmp <- as.numeric(tmp)*60*60
tmp <- lubridate::as.duration(tmp)
tmp <- stringr::str_extract(string = tmp,
pattern = "\\(.*?\\)")
tmp[is.na(tmp)] <- '(<1 hour)'
edges_label_duration <- tmp
edges_label_count_absolute_case <- gsub("\n.*","",edges_label)
edges_label_clean <- paste0(edges_label_count_absolute_case, '\n',edges_label_duration)
# remove those which should have no hour()
correct_label <- gsub("\n.*","",edges_label_clean[!grepl("hour", edges_label)])
edges_label_clean[!grepl("hour", edges_label)] <- correct_label
# assign to process object
processMap$edges_df$label <- edges_label_clean
DiagrammeR::export_graph(graph = processMap, file_type = 'png', file_name = 'processMap.png')

How do I make a function in R with a mass amount of code?

I think this is a fairly basic question, as I am a new R user, but I want to make it so that I can activate the entire code below with a single entry/word (I presumed it would be a function). If this has already been asked, I apologize, and please refer me to the link where it is answered. Thank you in advance for all help.
My code:
head(yelp, 10)
str(yelp)
yelp_flat<- flatten(yelp)
str(yelp_flat)
library(tibble)
yelp_tbl <- as_data_frame(yelp_flat)
yelp_tbl
yelp_tbl$newcolumn <- NULL
yelp_tbl$newcolumn1 <- NULL
yelp_tbl$shotClock <- NULL
yelp_tbl$period <- NULL
yelp_tbl$wallClock <- NULL
yelp_tbl$gameClock <- NULL
yelp_tbl$gameClockStopped <- NULL
yelp_tbl$ball <- NULL
head(yelp_tbl)
good <- unnest(yelp_tbl) #extracts xyz from original dataframe
library(tidyr)
player <- good %>% separate(xyz, c("player_x", "player_y", "player_z"), sep = ",")
finish <- player %>% separate(xyz1, c("player_x", "player_y", "player_z"), sep = ",")
k <- finish %>% separate(player_x, c("trash", "player_x"), sep = "c")
k$trash <- NULL
r <- k %>% separate(player_z, c("player_z", "tra"), sep = "\\)")
u <- r %>% separate(player_x, c("kol", "player_x"), sep = "\\(")
Away_Team <- u
Away_Team$garbage <- NULL
Away_Team$playerId1<- NULL
Away_Team$aplayer_x <- NULL
Away_Team$aplayer_y <- NULL
Away_Team$aplayer_z <- NULL
Away_Team$dispose <- NULL
Away_Team$brack <- NULL
Away_Team$kol <- NULL
Away_Team$tra <- NULL
View(Away_Team)
yelp_tbl
yelp_tbl$newcolumn <- NULL
yelp_tbl$newcolumn1 <- NULL
yelp_tbl$shotClock <- NULL
yelp_tbl$period <- NULL
yelp_tbl$wallClock <- NULL
yelp_tbl$gameClock <- NULL
yelp_tbl$gameClockStopped <- NULL
yelp_tbl$ball <- NULL
head(yelp_tbl)
good <- unnest(yelp_tbl) #extracts xyz from original dataframe
library(tidyr)
player <- good %>% separate(xyz, c("player_x", "player_y", "player_z"), sep = ",")
finish <- player %>% separate(xyz1, c("player_x", "player_y", "player_z"), sep = ",")
k <- finish %>% separate(player_x, c("trash", "player_x"), sep = "c")
k$trash <- NULL
r <- k %>% separate(player_z, c("player_z", "tra"), sep = "\\)")
u <- r %>% separate(player_x, c("kol", "player_x"), sep = "\\(")
Home_Team <- u
Home_Team$garbage <- NULL
Home_Team$playerId1<- NULL
Home_Team$hplayer_x <- NULL
Home_Team$hplayer_y <- NULL
Home_Team$hplayer_z <- NULL
Home_Team$dispose <- NULL
Home_Team$brack <- NULL
Home_Team$kol <- NULL
Home_Team$tra <- NULL
View(Home_Team)
View (Away_Team)
Table <- rbind(Home_Team, Away_Team)
View(Table) #order frameIdx to see correct order
So, indeed you should make a function. Here are some steps to follow:
1. Put all your code in your function
my_function <- function(){
# Your code
}
2. Identify what you have as an input (aka, what your are not building in your code), they will become the argument of your function
my_function <- function(arg1, arg2, ...){
# Your code
}
In your example, I identified yelp
3. Identify what you want to output (ideally only one object), they will be in the return of your function
my_function <- function(arg1, arg2, ...){
# Your code
return(output)
}
In your example I identified Table
4. Take all the import/library and put them outside your function
library(lib1)
my_function <- function(arg1, arg2, ...){
# Your code
return(output)
}
EDIT using #r2evans suggestion: Using libraryis generally used instead of require, here and here is some literature on it.
In your code I identified tidyr and tibble
5. Identify what you want to print/View and what was just for debugging. Add a print to print, suppres what you don't want
6. Add some comments/slice your code
For example I would add something like # Creating XXX table
7. Improve code quality
You should try to minimize the number of line of code (for example using loops and avoiding code to be in double). Make variables names explicit (instead of k, u, r...)
Regarding loop, in your code you drop some columns on at a time, you could do a loop to drop them in order. (It's what I have done bellow). It helps to make your code easier to read/debug. In this particular case, as Gregor said it is heaven faster to drop them all at once with using a list of column names (if you are interested check his comment).
Here you go:
There are still some improvement to do especially regarding point number 7 and 5.
library(tibble)
library(tidyr)
yelp_function <- function(yelp){
# Printing the input
print(head(yelp, 10))
print(str(yelp))
# Flatten table
yelp_flat<- flatten(yelp)
print(str(yelp_flat))
# Create yelp_tbl and drop some columns
yelp_tbl <- as_data_frame(yelp_flat)
# Drop some columns
for (col in c("newcolumn", "newcolumn1", "shotClock", "period", "wallClock", "gameClock", "gameClockStopped", "ball")){
yelp_tbl[, col] <- NULL
}
print(head(yelp_tbl))
# Build some table
good <- unnest(yelp_tbl) #extracts xyz from original dataframe
player <- good %>% separate(xyz, c("player_x", "player_y", "player_z"), sep = ",")
finish <- player %>% separate(xyz1, c("player_x", "player_y", "player_z"), sep = ",")
k <- finish %>% separate(player_x, c("trash", "player_x"), sep = "c")
k$trash <- NULL
r <- k %>% separate(player_z, c("player_z", "tra"), sep = "\\)")
u <- r %>% separate(player_x, c("kol", "player_x"), sep = "\\(")
# Build away team
Away_Team <- u
# Build yelp table: I'm not quite sure why you are rebdoing that... Is this code necessary?
yelp_tbl
# Drop some columns
for (col in c("newcolumn", "newcolumn1", "shotClock", "period", "wallClock", "gameClock", "gameClockStopped", "ball")){
yelp_tbl[, col] <- NULL
}
print(head(yelp_tbl))
good <- unnest(yelp_tbl) #extracts xyz from original dataframe
# Build some table
player <- good %>% separate(xyz, c("player_x", "player_y", "player_z"), sep = ",")
finish <- player %>% separate(xyz1, c("player_x", "player_y", "player_z"), sep = ",")
k <- finish %>% separate(player_x, c("trash", "player_x"), sep = "c")
k$trash <- NULL
r <- k %>% separate(player_z, c("player_z", "tra"), sep = "\\)")
u <- r %>% separate(player_x, c("kol", "player_x"), sep = "\\(")
## Build home_team
Home_Team <- u
# Drop some columns
for (col in c("garbage", "playerId1", "aplayer_x", "aplayer_y", "aplayer_z", "dispose", "brack", "kol", "tra")){
Away_Team[, col] <- NULL
Home_Team[, col] <- NULL
}
# Merge
Table <- rbind(Home_Team, Away_Team)
# Return
return(Table)
}
View(Table) #order frameIdx to see correct order
Run it:
To run your code you now just have to execute the function with the needed argument:
yelp_function(yelp)
NB 1: please note that I didn't tested the code since you didn't provide data to run it. To improve your question you should give some data using dputfunction.
NB 2: There is always room for improvement in the code so you might want to go further and llok into refactoring to avoid having code in double. Control your inputs with some sanity check...
It's rather simple.
You do this:
foo <- function{
#all your code goes here
}
Then you call your function by typing (in console for instance):
foo()

How can I use accumulate like reduce2 function in purrr?

I would like to use the accumulate function with two input vectors and the reduce2 function. The documentation for accumulate implies that two input vectors can be given and that accumulate can work with reduce2. However, I am having trouble.
Here is an example, inspired by the documentation from reduce2.
This is the example from reduce2
> paste2 <- function(x, y, sep = ".") paste(x, y, sep = sep)
> letters[1:4] %>% reduce2(.y=c("-", ".", "-"), paste2)
[1] "a-b.c-d"
Here are several attempts to use accumulate similarly to reduce2. None properly iterate through both letters[1:4] and c("-",".","-").
> letters[1:4] %>% accumulate(.y=c("-", ".", "-"),paste2)
Error in .f(x, y, ...) : unused argument (.y = c("-", ".", "-"))
> letters[1:4] %>% accumulate(c("-", ".", "-"),paste2)
[[1]]
[1] "a"
[[2]]
NULL
> letters[1:4] %>% accumulate(sep=c("-", ".", "-"),paste2)
[1] "a" "a-b" "a-b-c" "a-b-c-d"
How would I use accumulate to see the intermediate results given by the reduce2 example?
It is possible that this is an oversight where the documentation is simply not up to date/a bit misleading? I could not get accumulate to accept a three argument function either, and I'm surprised there's no error in your last example though I guess it would have to be paste that throws it. The fact that the text for .f is exactly the same for accumulate as for reduce makes me think that this just isn't functionality present in accumulate. Additionally, looking at the source seems to show (unless I misread) that reduce and reduce2 have their own implementation but accumulate relies on base::Reduce. Might be worth a GitHub issue.
Here's my best shot at producing the output you wanted. It basically involves calling reduce2 multiple times with the right subset of the input list and the secondary input vector to paste2, which doesn't feel very neat or tidy. This might just not be a particularly neat or tidy problem. Note the use of the {} to override the default %>% behaviour of placing the pipe LHS as the first argument, and the different indexing on .x and .y inside reduce2 (we want to keep .y one element shorter than .x).
paste2 <- function(x, y, sep = ".") paste(x, y, sep = sep)
library(purrr)
letters[1:4] %>%
{map_chr(
.x = 2:length(.),
.f = function(index) reduce2(
.x = .[1:index],
.y = c("-", ".", "-")[1:(index - 1)],
.f = paste2
)
)}
#> [1] "a-b" "a-b.c" "a-b.c-d"
Created on 2018-05-11 by the reprex package (v0.2.0).
A few months after this post accumulate2 was introduced that gives the results OP was after:
library(purrr)
paste2 <- function(x, y, sep = ".") paste(x, y, sep = sep)
accumulate2(letters[1:4], c("-", ".", "-"), paste2)
#> [[1]]
#> [1] "a"
#>
#> [[2]]
#> [1] "a-b"
#>
#> [[3]]
#> [1] "a-b.c"
#>
#> [[4]]
#> [1] "a-b.c-d"
With this trick you can use unlimited arguments in accumulate and you did not even need accumulate2
library(tidyverse)
x <- letters[1:4]
y <- c('-', '.', '-')
accumulate(seq_along(x[-1]), .init = x[1], ~paste(.x, x[.y+1], sep = y[.y]))
#> [1] "a" "a-b" "a-b.c" "a-b.c-d"
# OR
accumulate(seq_along(y), .init = x[1], ~paste(.x, x[.y+1], sep = y[.y]))
#> [1] "a" "a-b" "a-b.c" "a-b.c-d"
Created on 2022-02-21 by the reprex package (v2.0.1)

unexpected error when manipulating list of data.frame

I have list of data.frame as an output of custom function, so I intend to split each data.frame by its last column, where threshold is given. However, I manipulated the two list nicely, and combined them to get only one table. But I have an error when manipulating this new table. I can't figure out where is issue come from. How can I fix this error ? Can anyone point me out to possibly fix this error ? If this error can be fixed, I want to implement wrapper. How can I easily manipulate list of data.frame ? Any better idea to debug the error ?
mini example :
savedDF <- list(
bar = data.frame(.start=c(12,21,37), .stop=c(14,29,45), .score=c(5,9,4)),
cat = data.frame(.start=c(18,42,18,42,81), .stop=c(27,46,27,46,114), .score=c(10,5,10,5,34)),
foo = data.frame(.start=c(3,3,33,3,33,91), .stop=c(24,24,10,24,10,17), .score=c(22,22,6,22,6,7))
)
discardedDF <- list(
bar = data.frame(.start=c(16,29), .stop=c(20,37), .score=c(2,11)),
cat = data.frame(.start=c(21,31), .stop=c(23,43), .score=c(1,9)),
foo = data.frame(.start=c(54, 79), .stop=c(71,93), .score=c(3,8))
)
I can manipulate this way :
both <- do.call("rbind", c(savedDF, discardedDF))
cn <- c("letter", "seq")
# FIXME :
DF <- cbind(
read.table(text = chartr("_", ".", rownames(both)), header=T, sep = ".", col.names = cn),
both)
DF <- transform(DF, isPassed = ifelse(.score > 8, "Pass", "Fail"))
by(DF, DF[c("letter", "isPassed")],
function(x) write.csv(x[-(1:length(savedDF))],
sprintf("%s_%s_%s.csv", x$letter[1], x$isPassed[1])))
But I have an error
Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
line 15 did not have 2 elements
Why I have this error ? Can anyone point me out how to fix this ?
my desired output is list of CSV file as follows :
bar.saved.Pass.csv
bar.saved.Fail.csv
bar.discarded.Pass.csv
bar.discarded.Fail.csv
cat.saved.Pass.csv
cat.saved.Fail.csv
cat.discarded.Pass.csv
cat.discarded.Fail.csv
foo.saved.Pass.csv
foo.saved.Fail.csv
foo.discarded.Pass.csv
foo.discarded.Fail.csv
But I think controlling exported CSV files still not desired. How can I improve functionality of this wrapper ? I intend to let use choose output directory by custom, or more dynamic would be nice. Any idea ? Thanks a lot
Is this what you are looking for?
library(tidyverse)
library(magrittr)
both <- do.call("rbind", c(savedDF, discardedDF))
both %<>% rownames_to_column(var = "cn")
both %<>% separate(cn, c("letters", "seq"), sep = "\\.")
both %<>% mutate(isPassed = ifelse(.score > 8, "Passed", "Failed"),
isDiscard = ifelse(is.na(seq), "Saved", "Discarded"))
list_of_dfs <- both %>% split(list(.$letters, .$isPassed, .$isDiscard))
csv_names <- paste0("/Users/nathanday/Desktop/", names(list_of_dfs), ".csv") # change this path
mapply(write.csv, list_of_dfs, csv_names)
The %<>% operator is short hand so both %<>% rownames_to_columm(var = "cn") is identical to both <- rownames_to_column(both, var = "cn")
To make it more "dynamic" for allowing output path input, you could wrap this in the function structure you already have like this:
output_where <- function(output_path, list1, list2) {
if (!dir.exists(output_path)) {
dir.create(file.path(output_path))
}
both <- do.call(rbind, c(list1, list2))
both %<>% rownames_to_column(var = "cn")
both %<>% separate(cn, c("letters", "seq"), sep = "\\.")
both %<>% mutate(isPassed = ifelse(.score > 8, "Passed", "Failed"), isDiscard = ifelse(is.na(seq), "Saved", "Discarded"))
list_of_dfs <- both %>% split(list(.$letters, .$isPassed, .$isDiscard))
csv_names <- paste0(output_path, names(list_of_dfs), ".csv")
return(mapply(write.csv, list_of_dfs, csv_names))
}
output_where("~/Desktop/", savedDF, discardedDF)
for even more dynamics:
output_where <- function(output_path, list1, list2) {
if (!dir.exists(output_path)) {
dir.create(file.path(output_path))
}
names(list1) <- paste("list1", names(list1), sep = ".")
names(list2) <- paste("list2", names(list2), sep = ".")
both <- do.call(rbind, c(list1, list2))
both %<>% rownames_to_column(var = "cn")
both %<>% separate(cn, c("original_list", "letters", "seq"), sep = "\\.")
both %<>% mutate(isPassed = ifelse(.score > 8, "Passed", "Failed"))
list_of_dfs <- both %>% split(list(.$letters, .$isPassed, .$original_list))
csv_names <- paste0(output_path, names(list_of_dfs), ".csv")
return(mapply(write.csv, list_of_dfs, csv_names))
}

Resources