Related
I'm trying to iron out a formatting bug. My formatC() function applied against a data frame, using lapply(), is adding a decimal point to NULL values in the data frame output column header. I do want to keep the NULL values appearing in my output column header, I just don't want a "." added to the end of each NULL output in the column header. That combination of lapply() and formatC() is important for formatting the numeric values in the data frame (though in the below reproducible code they are not produced, for the sake of brevity). Please see image at the bottom where you can see the issue.
I've tried following the advice from lapply if some input values in a list are NULL but it hasn't worked for me yet.
So how can I prevent "lapplying" formatC() to NULL elements? Leave the NULLs alone.
Below is heavily stripped-down code that reproduces the problem. The offending line sits immediately below the comment `# commenting-out the below line removes the "." from the NULL` (the two commented-out lines after it are my attempts to resolve the issue). Running the code produces the output shown in the image, illustrating only the problem at hand:
library(data.table)
library(dplyr)
library(DT)
library(shiny)
library(tidyverse)
# Reproducible sample: five monthly observations for a single ID whose state
# label is the literal string "NULL" (a character value, not the NULL object).
transitDF <- as.data.frame(
  data.table(
    ID = rep(1930145, times = 5L),
    Period_1 = as.numeric(1:5),
    Period_2 = c("2012-10", "2012-11", "2012-12", "2013-01", "2013-02"),
    Values = c(8, 17.97, 97.85, 87.85, 273.85),
    State = rep("NULL", times = 5L)
  )
)
#' Count state transitions per ID between two reference periods.
#'
#' @param x A data.table with columns `ID`, `State` and the column named by
#'   `refvar`.
#' @param from,to Values of `refvar` that identify the two periods compared.
#' @param refvar Name of the column used to select the rows of interest.
#' @param return_matrix If `TRUE`, the counts are reshaped by
#'   `convert_transits_to_matrix()`; otherwise the long table is returned.
#' @return The output of `convert_transits_to_matrix()`, or a data.table with
#'   columns `from`, `to` and `ct`.
num_transit <- function(x, from, to, refvar = "Period_2",
                        return_matrix = TRUE) {
  # Keep only the IDs that have more than one row in the selected periods.
  res <- x[
    get(refvar) %in% c(to, from),
    if (.N > 1) .SD,
    by = ID,
    .SDcols = c(refvar, "State")
  ]
  # Number the retained rows within each ID; dcast() spreads them into the
  # columns `1` and `2` (row order presumably chronological -- confirm).
  res <- res[, id := seq_len(.N), by = ID]
  res <- dcast(res, ID ~ id, value.var = "State")[, .N, .(`1`, `2`)]
  setnames(res, c("from", "to", "ct"))
  if (return_matrix) {
    return(convert_transits_to_matrix(res, unique(x$State)))
  }
  res
}
#' Reshape a long table of transition counts into a square state table.
#'
#' @param transits data.table whose columns include `to` and `from` and whose
#'   third column holds the counts.
#' @param states Character vector of state labels used for rows and columns.
#' @return A data.table with a leading `to_state` column followed by one column
#'   per state; cells without a recorded transition are `NA`.
convert_transits_to_matrix <- function(transits, states) {
  n_states <- length(states)
  counts <- matrix(
    NA,
    nrow = n_states,
    ncol = n_states,
    dimnames = list(states, states)
  )
  # A two-column character matrix indexes the cells by (row, column) label.
  cell_labels <- as.matrix(transits[, .(to, from)])
  counts[cell_labels] <- transits[[3]]
  out <- data.table(counts)
  out[, to_state := rownames(counts)]
  setcolorder(out, "to_state")
  # Trailing [] so the table prints after the by-reference updates.
  out[]
}
ui <- fluidPage(DTOutput("resultstransitDF"))

# Shiny server: builds the transition table for periods 1 and 2 and renders it.
server <- function(input, output, session) {
  results <- reactive({
    # Work on a data.table copy; setDT() would convert the global transitDF
    # by reference as a side effect of evaluating this reactive.
    transit_dt <- as.data.table(transitDF)
    transitions <- num_transit(transit_dt, 1, 2, "Period_1")
    transitions <- cbind(transitions, Sum = rowSums(transitions[, -1]))
    # formatC() only formats the values. The "." that used to be appended to
    # the "NULL" header came from as.data.frame(), which by default runs the
    # column names through make.names(); check.names = FALSE keeps the names
    # exactly as they are. The formatted frame is returned explicitly instead
    # of relying on the value of a `results[] <- ...` assignment.
    as.data.frame(
      lapply(transitions, formatC, decimal.mark = "."),
      check.names = FALSE
    )
  })
  output$resultstransitDF <- renderDT(server = FALSE, {
    datatable(data = results())
  })
}

shinyApp(ui, server)
Output when running the above:
Sorry for my comment, I didn't understand the issue. In fact this has nothing to do with formatC, which anyway is not applied to the column names. The problem is with the usage of "NULL" as the name of a list component, and as.data.frame "corrects" this name by default.
> m <- matrix(1, nrow=1, ncol=1, dimnames=list("NULL", "NULL"))
> m # ok
NULL
NULL 1
> data.table(m) # ok
NULL
1: 1
> as.data.frame(data.table(m)) # ok
NULL
1 1
> as.data.frame(lapply(data.table(m), formatC)) # not ok
NULL.
1 1
> data.frame("NULL" = 4) # the problem is here: the "NULL" string is reserved
NULL.
1 4
> lapply(data.table(m), formatC) # the name is still the string "NULL"; it prints in backticks because NULL is a reserved word, i.e. not a syntactic name
$`NULL`
[1] "1"
> # you can solve the problem as follows:
> as.data.frame(lapply(data.table(m), formatC), check.names = FALSE)
NULL
1 1
I have a main string that looks like this:
my_main <- "ABCDEFGHIJ"
What I want to do is to sequentially mask at every position with another pattern string:
my_pattern <- "x*x" # the length could be varied from 1 up to length of my_main
Every character of my_main that lines up with a * is kept; every other character under the pattern is replaced with x.
The final result is a vector of strings that contain these:
xBxDEFGHIJ
AxCxEFGHIJ
ABxDxFGHIJ
ABCxExGHIJ
ABCDxFxHIJ
ABCDExGxIJ
ABCDEFxHxJ
ABCDEFGxIx
Next if the pattern is
my_pattern <- "xx**x"
The result would be:
xxCDxFGHIJ
AxxDExGHIJ
ABxxEFxHIJ
ABCxxFGxIJ
ABCDxxGHxJ
ABCDExxHIx
How can I achieve that?
This might be a little over-complicated, but it's a start:
I'm going to reuse Reduce_frame from https://stackoverflow.com/a/70945868/3358272.
#' Row-wise reduce over a data frame.
#'
#' Evaluates `expr` once per row of `data`. Inside `expr` the columns of the
#' current row are available by name, and `init` holds the result of the
#' previous row (or the supplied `init` for the first row).
#'
#' @param data A data frame; one evaluation is performed per row.
#' @param expr An unevaluated expression that may refer to `init` and to the
#'   column names of `data`.
#' @param init Starting value; its type determines the type of the result.
#' @return A vector with one element per row holding the running results.
Reduce_frame <- function(data, expr, init) {
  expr <- substitute(expr)
  # Typed NA matching `init`, one slot per row.
  out <- rep(init[1][NA], nrow(data))
  for (rn in seq_len(nrow(data))) {
    # drop = FALSE keeps a one-column frame a data frame. Without it the row
    # collapses to a bare vector, which eval() would interpret as a frame
    # number instead of a set of named columns.
    out[rn] <- init <- eval(expr, envir = data[rn, , drop = FALSE])
  }
  out
}
From here, let's split the pattern into a frame (for ease of access, if nothing else):
# One row per pattern character, then keep only the characters that overwrite
# the main string ("*" marks positions that are left untouched).
repl <- data.frame(
  p = strsplit(my_pattern, "")[[1]],
  i = seq_len(nchar(my_pattern))
)
repl <- repl[repl$p != "*", ]
repl
# p i
# 1 x 1
# 3 x 3
From here, we can do it once with:
tail(Reduce_frame(repl, `substring<-`(init, i, i, p), init = my_main), 1)
# [1] "xBxDEFGHIJ"
Which means we can iterate fairly easily:
# Slide the pattern across my_main one position at a time; for every offset
# only the last string produced by Reduce_frame() is kept.
sapply(
  c(0, seq_len(nchar(my_main) - nchar(my_pattern))),
  function(offset) {
    shifted <- transform(repl, i = i + offset)
    masked <- Reduce_frame(shifted, `substring<-`(init, i, i, p), init = my_main)
    tail(masked, 1)
  }
)
# [1] "xBxDEFGHIJ" "AxCxEFGHIJ" "ABxDxFGHIJ" "ABCxExGHIJ" "ABCDxFxHIJ" "ABCDExGxIJ" "ABCDEFxHxJ" "ABCDEFGxIx"
To use your second pattern,
my_pattern <- "xx**x"
# Rebuild the replacement table for the new pattern with the same subset()
# call that was used for the first pattern.
repl <- subset(
  data.frame(p = strsplit(my_pattern, "")[[1]], i = seq_len(nchar(my_pattern))),
  p != "*")
## the rest of this code is unchanged from above
sapply(c(0, seq_len(nchar(my_main) - nchar(my_pattern))), function(offset) {
  tail(Reduce_frame(transform(repl, i = i + offset),
                    `substring<-`(init, i, i, p), init = my_main), 1)
})
# [1] "xxCDxFGHIJ" "AxxDExGHIJ" "ABxxEFxHIJ" "ABCxxFGxIJ" "ABCDxxGHxJ" "ABCDExxHIx"
So this can be easily functionized:
Reduce_frame <- ... # defined above
#' Mask a string with a sliding pattern.
#'
#' @param S The string to mask (a single string).
#' @param pattern The mask: "*" keeps the character of `S`, any other
#'   character overwrites it. Must not be longer than `S`.
#' @return A character vector with one masked copy of `S` per pattern offset.
#'   Relies on `Reduce_frame()` being defined.
func <- function(S, pattern) {
  stopifnot(nchar(S) >= nchar(pattern))
  repl <- subset(
    data.frame(p = strsplit(pattern, "")[[1]], i = seq_len(nchar(pattern))),
    p != "*"
  )
  offsets <- c(0, seq_len(nchar(S) - nchar(pattern)))
  # A pattern without any overwriting character leaves S unchanged at every
  # offset; Reduce_frame() would yield an empty result for a zero-row table.
  if (nrow(repl) == 0L) {
    return(rep(S, length(offsets)))
  }
  # vapply() guarantees a character vector regardless of the input.
  vapply(offsets, function(offset) {
    tail(
      Reduce_frame(
        transform(repl, i = i + offset),
        `substring<-`(init, i, i, p),
        init = S
      ),
      1
    )
  }, character(1))
}
func("ABCDEFGHIJ", "x*x")
# [1] "xBxDEFGHIJ" "AxCxEFGHIJ" "ABxDxFGHIJ" "ABCxExGHIJ" "ABCDxFxHIJ" "ABCDExGxIJ" "ABCDEFxHxJ" "ABCDEFGxIx"
func("ABCDEFGHIJ", "xx**x")
# [1] "xxCDxFGHIJ" "AxxDExGHIJ" "ABxxEFxHIJ" "ABCxxFGxIJ" "ABCDxxGHxJ" "ABCDExxHIx"
Here's one way using strsplit, grepl, and paste.
# Mask `mm` with the sliding pattern `mp`.
#   mm: the main string (a single string).
#   mp: the pattern; "*" keeps the character of `mm`, any other character
#       overwrites it. Must not be longer than `mm`.
# Returns a character vector with one masked string per position at which the
# whole pattern fits inside `mm`.
f <- \(mm, mp) {
  m <- strsplit(mm, "")[[1L]]
  p <- strsplit(mp, "")[[1L]]
  stopifnot(length(p) <= length(m))
  # Pattern positions that overwrite the main string.
  i <- which(p != "*")
  # The number of shifts follows from the full pattern length (not from the
  # last non-"*" position), so a pattern ending in "*" never runs past the
  # end of `mm` and a pattern made only of "*" is handled as well.
  vapply(c(0L, seq_len(length(m) - length(p))), \(j) {
    m[i + j] <- p[i]
    paste(m, collapse = "")
  }, character(1L))
}
f('ABCDEFGHIJ', 'x*x')
# [1] "xBxDEFGHIJ" "AxCxEFGHIJ" "ABxDxFGHIJ" "ABCxExGHIJ" "ABCDxFxHIJ"
# [6] "ABCDExGxIJ" "ABCDEFxHxJ" "ABCDEFGxIx"
f('ABCDEFGHIJ', 'x**x')
# [1] "xBCxEFGHIJ" "AxCDxFGHIJ" "ABxDExGHIJ" "ABCxEFxHIJ" "ABCDxFGxIJ"
# [6] "ABCDExGHxJ" "ABCDEFxHIx"
f('ABCDEFGHIJ', 'xx**x')
# [1] "xxCDxFGHIJ" "AxxDExGHIJ" "ABxxEFxHIJ" "ABCxxFGxIJ" "ABCDxxGHxJ"
# [6] "ABCDExxHIx"
f('ABCDEFGHIJ', 'kk**krr')
# [1] "kkCDkrrHIJ" "AkkDEkrrIJ" "ABkkEFkrrJ" "ABCkkFGkrr"
f('ABCDEFGHIJ', 'kk**kr*r')
# [1] "kkCDkrGrIJ" "AkkDEkrHrJ" "ABkkEFkrIr"
Here is an approach along the same lines as r2evans' answer but relying on some stringr functions which should be more efficient than the base equivalents:
library(stringr)
# Mask `main` with the sliding pattern `r_pattern` using stringr.
#   main:      the string to mask (a single string).
#   r_pattern: the pattern; "*" keeps the character of `main`, every run of
#              other characters overwrites it. Must not be longer than `main`.
# Returns a character vector with one masked string per pattern offset.
f <- function(main, r_pattern) {
  shift <- nchar(main) - nchar(r_pattern) + 1
  stopifnot(shift >= 1)
  # Start/end of every run of non-"*" characters in the pattern.
  idx <- as.data.frame(str_locate_all(r_pattern, "[^*]+")[[1]])
  # A pattern made only of "*" changes nothing at any offset.
  if (nrow(idx) == 0L) {
    return(rep(main, shift))
  }
  # Take each replacement text from its located span so that it always lines
  # up with `idx`; splitting on "*" yields a leading "" when the pattern
  # starts with "*", which shifts every replacement by one run.
  x_pattern <- str_sub(r_pattern, idx$start, idx$end)
  Reduce(
    function(x, y) {
      `str_sub<-`(
        x,
        seq(idx$start[y], length.out = shift),
        seq(idx$end[y], length.out = shift),
        omit_na = FALSE,
        value = x_pattern[y]
      )
    },
    seq_len(nrow(idx)),
    init = main
  )
}
f("ABCDEFGHIJ", "x*x")
[1] "xBxDEFGHIJ" "AxCxEFGHIJ" "ABxDxFGHIJ" "ABCxExGHIJ" "ABCDxFxHIJ" "ABCDExGxIJ" "ABCDEFxHxJ" "ABCDEFGxIx"
f("ABCDEFGHIJ", "xx**x")
[1] "xxCDxFGHIJ" "AxxDExGHIJ" "ABxxEFxHIJ" "ABCxxFGxIJ" "ABCDxxGHxJ" "ABCDExxHIx"
# Edit after OP comment:
f(my_main, "KK**KRR")
[1] "KKCDKRRHIJ" "AKKDEKRRIJ" "ABKKEFKRRJ" "ABCKKFGKRR"
Here is Ruby code that produces the desired result. I am presenting it in the event that a reader wishes to convert it to R, possibly with modification, of course.
You should be able to read the code even if you don't know Ruby, as long as you understand that:
'abc'.size returns 3;
0..8 is a range of integers between 0 and 8, inclusive;
'abc' << 'd' returns 'abcd';
7.modulo(3) returns 1;
'abcd'[2] returns 'c', 2 being an index; and
s == 'x' ? 'x' : my_main[j] reads, "if the string s (which will be 'x' or '*') equals 'x', return 'x'; else return the character of my_main at index j".
The Ruby code (somewhat simplified from what would normally be written) is as follows.
# Slides my_pattern across my_main and returns one masked copy of my_main per
# offset at which the whole pattern fits. A '*' in the pattern keeps the
# character of my_main; any other pattern character (not only 'x') replaces it.
def doit(my_main, my_pattern)
  msz = my_main.size
  psz = my_pattern.size
  (0..msz-psz).map do |i|
    s = ''
    (0..msz-1).each do |j|
      # modulo keeps the index non-negative for j < i, so it lands at or past
      # the end of the pattern (nil) instead of counting from the back.
      c = my_pattern[(j-i).modulo(msz)]
      s << ((c.nil? || c == '*') ? my_main[j] : c)
    end
    s
  end
end
Given the following data:
list_A <- list(data_cars = mtcars,
data_air = AirPassengers,
data_list = list(A = 1,
B = 2))
I would like to print names of objects available across list_A.
Example:
Map(
  f = function(x) {
    # Deparse the expression supplied for `x`; the label is only needed to
    # name the flat file that may be written below.
    nm <- deparse(match.call()$x)
    print(nm)
    if ("list" %in% class(x)) {
      return(length(x) + 1)
    }
    # This value is computed but discarded: the function result for non-list
    # input is whatever saveRDS() returns.
    length(x) + 1e6
    saveRDS(
      object = x,
      file = tempfile(pattern = make.names(nm), fileext = ".RDS")
    )
  },
  list_A
)
returns:
[1] "dots[[1L]][[1L]]"
[1] "dots[[1L]][[2L]]"
[1] "dots[[1L]][[3L]]"
$data_cars
NULL
$data_air
NULL
$data_list
[1] 3
Desired results
I would like to get:
`data_cars`
`data_air`
`data_list`
Update
Following the comments, I have modified the example to make it more reflective of my actual needs which are:
While using Map to iterate over list_A I'm performing some operations on each element of the list
Periodically I want to create a flat file with name reflecting name of object that was processed
In addition to list_A, there are also list_B, list_C and so forth. Therefore, I would like to avoid calling names(list) inside the function f of the Map as I will have to modify it n number of times. The solution I'm looking to find should lend itself for:
Map(function(l){...}, list_A)
So I can later replace list_A. It does not have to rely on Map. Any of the apply functions would do; the same applies to purrr-based solutions.
Alternative example
#' Process one object and, for non-list input, save it to a temporary file.
#'
#' @param x The object to process.
#' @param nm Optional label used to name the flat file. When `NULL` (the
#'   default) the deparsed expression supplied for `x` is used, as before; pass
#'   the real name explicitly when iterating, e.g.
#'   `Map(do_stuff, list_A, names(list_A))`.
#' @return `length(x) + 1` for list input, otherwise `length(x) + 1e6`.
do_stuff <- function(x, nm = NULL) {
  if (is.null(nm)) nm <- deparse(match.call()$x)
  print(nm)
  if (inherits(x, "list")) {
    return(length(x) + 1)
  }
  saveRDS(
    object = x,
    file = tempfile(pattern = make.names(nm), fileext = ".RDS")
  )
  # Returned last so that the computed value is no longer discarded in favour
  # of saveRDS()'s NULL.
  length(x) + 1e6
}
Map(do_stuff, list_A)
As per the notes below, I want to avoid having to modify do_stuff function as I will be looking to do:
Map(do_stuff, list_A)
Map(do_stuff, list_B)
Map(do_stuff, list_...)
We could wrap it into a function, and do it in two steps:
# Two-step helper: process every element of `myList`, then write each result
# to "<element name>.txt" in the working directory.
myFun <- function(myList) {
  # Step 1: do stuff with each element (here: keep its head).
  res <- Map(f = function(x) head(x), myList)
  # Step 2: write to a file; this is where a check could be added to skip
  # output when a result is empty.
  for (nm in names(res)) {
    out_path <- paste0(nm, ".txt")
    write.table(res[[nm]], file = out_path)
  }
}
myFun(list_A)
Would something like this work ?
# Pair every element of list_A with its own name, so that each item handed to
# do_stuff() is a list with components `x` (the object) and `nm` (its name).
list_A2 <- Map(list, x = list_A,nm = names(list_A) )
# Inject code into do_stuff() with trace(): the injected block overwrites `nm`
# and `x` from the paired item, leaving the function definition untouched.
# NOTE(review): `at=3` presumably places the injected code just before the
# print(nm) step of do_stuff() -- confirm against the current function body.
trace(do_stuff, quote({ nm <- x$nm; x<- x$x}), at=3)
Map(do_stuff, list_A2)
Working on improving my package summarytools, I'm looking for a way to use the information on each of by()'s groups to integrate this info in some function's output. To give a little bit more of a context, the functions in this package print out the dataframe name and variable name(s) being summarized. Functions like by() make it difficult because they use generic names such as dd[x, ] when slicing the data and feeding it to functions. substitute() is thus not an option to get at the x parameter in that case, and the values of the IND variable(s) are also hidden (to a certain level).
To illustrate, in the following example, the group information (e.g. "gender: F" and "smoker: No") is simply printed out with cat() when print.by() is invoked, using attributes of the object of class "by":
dat <- data.frame(gender=rep(c("F","M"),each=15),
smoker=rep(c("Yes", "No")),
someQty=runif(n = 30,min = 0, max = 10))
by(dat$someQty, INDICES = list(gender=dat$gender, smoker=dat$smoker), FUN = mean)
## gender: F
## smoker: No
## [1] 5.560505
## -------------------------------------------------------------------------------
## gender: M
## smoker: No
## [1] 2.568055
## -------------------------------------------------------------------------------
## gender: F
## smoker: Yes
## [1] 4.057938
## -------------------------------------------------------------------------------
## gender: M
## smoker: Yes
## [1] 3.416027
Now what I need is to get the info for each group during the by-group processing (as opposed to recuperating them after the "by" object has been created).
I worked on a solution, but before I repeat a similar work for making functions comply with with(), %>%, and possibly others... and their combinations, I'm wondering if there might be a simpler approach to this.
Here's what I have so far to deal with by():
# Initialise variable in package-specific environment that
# will help keeping track of the by-processing
myenv <- new.env()
myenv$byInfo <- list()
# Declare some function that will return the `by` variables values
# at each iteration (it's a sort of dummy function that does just that)
# Returns a label such as "gender = F, smoker = No" describing the by() group
# currently being processed, or NULL when not called through exactly one
# by.default() call. State is kept between calls in `myenv$byInfo`.
#   x: the data slice passed in by by(); not used.
# NOTE(review): the counter advances once per call, so the labels stay aligned
# with the groups only if the function is invoked once for every row of the
# level grid, in grid order -- confirm for groups without data.
myfunc <- function(x) {
  sc <- sys.calls()
  sf <- sys.frames()
  # Deparse the head of every call on the stack once, then look up the
  # positions of by.default() and tapply().
  call_heads <- as.character(lapply(sc, head, 1))
  by_pos <- which(call_heads == "by.default()")
  tapply_pos <- which(call_heads == "tapply()")
  if (length(by_pos) != 1) {
    return(NULL)
  }
  # First "by" iteration: collect the grouping information.
  if (length(myenv$byInfo) == 0) {
    # Standardise the call (adds argument names)
    by_call <- as.list(pryr::standardise_call(sc[[by_pos]]))
    # Extract the data argument (not used further in this snippet)
    by_data <- deparse(by_call$data)
    # Extract the IND variable names (not used further in this snippet).
    # Logical subsetting is used because x[-which(cond)] returns an empty
    # vector when nothing matches.
    by_IND <- as.character(by_call$IND)
    by_IND <- by_IND[by_IND != "list"]
    # Get the levels of these IND variables
    by_levels <- sf[[tapply_pos]]$namelist
    levels_df <- expand.grid(by_levels, stringsAsFactors = FALSE)
    # Store the info in the package-specific environment
    myenv$byInfo$iter <- 1
    myenv$byInfo$levels_df <- levels_df
    myenv$byInfo$nb_iter <- nrow(levels_df)
  }
  info <- paste(
    colnames(myenv$byInfo$levels_df),
    as.character(myenv$byInfo$levels_df[myenv$byInfo$iter, ]),
    sep = " = ", collapse = ", "
  )
  # Reset after the last group so the next by() call starts from scratch.
  if (myenv$byInfo$iter == myenv$byInfo$nb_iter) {
    myenv$byInfo <- list()
  } else {
    myenv$byInfo$iter <- myenv$byInfo$iter + 1
  }
  info
}
b <- by(data = dat$someQty,
INDICES = list(gender = dat$gender, smoker = dat$smoker),
FUN = myfunc)
b[1:4]
## [1] "gender = F, smoker = No" "gender = M, smoker = No"
## [3] "gender = F, smoker = Yes" "gender = M, smoker = Yes"
So yes, it does give me what I want, but I'd like to know if I'm missing something more straightforward here.
Note: I thought adding a by= parameter to some functions and just ignore base R's by() altogether but I'd rather use the preexisting base functions people are accustomed to.
According to the documentation for data.frame(...), the ... argument has the form:
... these arguments are of either the form value or tag = value.
Component names are created based on the tag (if present) or
the deparsed argument itself.
Consider a data frame with three columns: a, b, c
DF <- data.frame(a=1:10, b=letters[1:10], c=rnorm(10))
Now consider these three possibilities for creating a new data frame
newDF <- data.frame(x=DF$a)
colnames(newDF) # as expected...
# [1] "x"
newDF <- data.frame(x=DF["a"])
colnames(newDF) # Huh??
# [1] "a"
newDF <- data.frame(x=DF[["a"]])
colnames(newDF) # Why is this necessary??
# [1] "x"
Looking at the class of each RHS:
class(DF$a)
# [1] "integer"
class(DF["a"])
# [1] "data.frame"
class(DF[["a"]])
# [1] "integer"
it appears that, if the RHS is a data.frame, then tag is overridden by the dimname of value.
Also, consider this slightly more complicated example, prompted by this question:
library(xts)
data(sample_matrix)
xtsObject=as.xts(sample_matrix)
head(xtsObject,1)
# Open High Low Close
# 2007-01-02 50.03978 50.11778 49.95041 50.11778
newDF <- data.frame(x=xtsObject$Open) # would have expected this to work
colnames(newDF) # alas, no...
# [1] "Open"
class(xtsObject$Open)
# [1] "xts" "zoo"
So my question is: what is the rule when using data.frame(tag=value,...)? That is, when can I expect the result to have a column named "tag"?
tl;dr: If the object supplied to data.frame is not named, the result will have the name of the tag.
Let's call the optional arguments to data.frame the data. data.frame first creates a list of the data supplied to it. The function then loops through each element of the list. If the element of the list has a name, data.frame keeps that name. Technically, it checks to see if length(names(data[[i]])) > 0 for each element, i, of list of the data supplied to the function. Only if that element has no names, does data.frame use tag as the name.
Getting back to your example, consider the names of arguments derived from DF supplied to data.frame:
names(DF$a)
# NULL
names(DF['a'])
# [1] "a"
names(DF[['a']])
# NULL
Notice that in the first and third case, names(...) is NULL. That is why data.frame(x = DF$a) and data.frame(x = DF[['a']]) had the expected name: x.
For the more complicated xts object, however, notice that the resulting object from the subset operation with $ has a name:
names(xtsObject$Open)
#"Open"
names(xtsObject[, 'Open'])
#"Open"
Therefore, in either case the data frame created with either data.frame(x=xtsObject[, 'Open']) or data.frame(x=xtsObject$Open) will have the name Open.
Here is the relevant code where the names are set in data.frame. Note that x is list(...) where the ... is the data.
# Excerpt from data.frame(): processes each supplied argument x[[i]] in turn,
# recording its row count, column count, column name(s) and data-frame form.
for (i in seq_len(n)) {
# Convert the argument to a data frame; stringsAsFactors is only passed on
# for character or list input.
xi <- if (is.character(x[[i]]) || is.list(x[[i]]))
as.data.frame(x[[i]], optional = TRUE, stringsAsFactors = stringsAsFactors)
else as.data.frame(x[[i]], optional = TRUE)
nrows[i] <- .row_names_info(xi)
ncols[i] <- length(xi)
namesi <- names(xi)
# More than one column: use the element's own names (or 1..ncol when it has
# none); unless no.vn[i] is set they are prefixed with vnames[[i]] and ".".
# NOTE(review): no.vn[i] presumably flags an argument supplied without a tag
# -- confirm against the full function.
if (ncols[i] > 1L) {
if (length(namesi) == 0L)
namesi <- seq_len(ncols[i])
if (no.vn[i])
vnames[[i]] <- namesi
else vnames[[i]] <- paste(vnames[[i]], namesi, sep = ".")
}
else {
# Single column: a name carried by the element itself replaces whatever is
# in vnames[[i]]; this is why data.frame(x = DF["a"]) yields column "a".
if (length(namesi))
vnames[[i]] <- namesi
else if (no.vn[[i]]) {
# No name on the element and no.vn set: fall back to the first line of the
# deparsed argument, with a surrounding I( ... ) stripped.
tmpname <- deparse(object[[i]])[1L]
if (substr(tmpname, 1L, 2L) == "I(") {
ntmpn <- nchar(tmpname, "c")
if (substr(tmpname, ntmpn, ntmpn) == ")")
tmpname <- substr(tmpname, 3L, ntmpn - 1L)
}
vnames[[i]] <- tmpname
}
}
# Row-name handling: when mirn is set and the element has rows, its row names
# are handed to data.row.names() if any of them is a non-empty string.
# NOTE(review): mirn presumably means "row.names argument missing" -- confirm.
if (mirn && nrows[i] > 0L) {
rowsi <- attr(xi, "row.names")
nc <- nchar(rowsi, allowNA = FALSE)
nc <- nc[!is.na(nc)]
if (length(nc) && any(nc))
row.names <- data.row.names(row.names, rowsi,
i)
}
# Store the row count as a non-negative number and keep the converted element.
nrows[i] <- abs(nrows[i])
vlist[[i]] <- xi
}