Related
I have a dataframe e.g.
df_reprex <- data.frame(id = rep(paste0("S",round(runif(100, 1000000, 9999999),0)), each=10),
date = rep(seq.Date(today(), by=-7, length.out = 10), 100),
var1 = runif(1000, 10, 20),
var2 = runif(1000, 20, 50),
var3 = runif(1000, 2, 5),
var250 = runif(1000, 100, 200),
var1_baseline = rep(runif(100, 5, 10), each=10),
var2_baseline = rep(runif(100, 50, 80), each=10),
var3_baseline = rep(runif(100, 1, 3), each=10),
var250_baseline = rep(runif(100, 20, 70), each=10))
I want to write a function containing a for loop that for each row in the dataframe will subtract every "_baseline" column from the non-baseline column with the same name.
I have created a script that automatically creates a character string containing the code I would like to run:
df <- df_reprex
# get only numeric columns
df_num <- df %>% dplyr::select_if(., is.numeric)
# create a version with no baselines
df_nobaselines <- df_num %>% select(-contains("baseline"))
#extract names of non-baseline columns
numeric_cols <- names(df_nobaselines)
#initialise empty string
mutatestring <- ""
#write loop to fill in string:
for (colname in numeric_cols) {
mutatestring <- paste(mutatestring, ",", paste0(colname, "_change"), "=", colname, "-", paste0(colname, "_baseline"))
# df_num <- df_num %>%
# mutate(paste0(col, "_change") = col - paste0(col, "_baseline"))
}
mutatestring <- substr(mutatestring, 4, 9999999) # remove stuff at start (I know it's inefficient)
mutatestring2 <- paste("df %>% mutate(", mutatestring, ")") # add mutate call
but when I try to call "mutatestring2" it just prints the character string e.g.:
[1] "df %>% mutate( var1_change = var1 - var1_baseline , var2_change = var2 - var2_baseline , var3_change = var3 - var3_baseline , var250_change = var250 - var250_baseline )"
I thought that this part would be relatively easy and I'm sure I've missed something obvious, but I just can't get the text inside that string to run!
I've tried various slightly ridiculous methods but none of them return the desired output (i.e. the result returned by the character string if it was entered into the console as a command):
call(mutatestring2)
eval(mutatestring2)
parse(mutatestring2)
str2lang(mutatestring2)
mget(mutatestring2)
diff_func <- function() {mutatestring2}
diff_func1 <- function() {
a <-mutatestring2
return(a)}
diff_func2 <- function() {str2lang(mutatestring2)}
diff_func3 <- function() {eval(mutatestring2)}
diff_func4 <- function() {parse(mutatestring2)}
diff_func5 <- function() {call(mutatestring2)}
diff_func()
diff_func1()
diff_func2()
diff_func3()
diff_func4()
diff_func5()
I'm sure there must be a very straightforward way of doing this, but I just can't work it out!
How do I convert a character string to something that I can run or pass to a magrittr pipe?
You need to use the text parameter in parse, then eval the result. For example, you can do:
eval(parse(text = "print(5)"))
#> [1] 5
However, using eval(parse()) is normally a very bad idea, and there is usually a more sensible alternative.
In your case you can do this without resorting to eval(parse()), for example in base R you could subtract all the appropriate variables from each other like this:
baseline <- grep("_baseline$", names(df_reprex), value = TRUE)
non_baseline <- gsub("_baseline", "", baseline)
df_new <- cbind(df_reprex, as.data.frame(setNames(mapply(
function(i, j) df_reprex[[i]] - df_reprex[[j]],
baseline, non_baseline, SIMPLIFY = FALSE),
paste0(non_baseline, "_corrected"))))
Or if you want to keep the whole thing in a single pipe without storing intermediate variables, you could do:
mapply(function(i, j) df_reprex[[i]] - df_reprex[[j]],
grep("_baseline$", names(df_reprex), value = TRUE),
gsub("_baseline", "", grep("_baseline$", names(df_reprex), value = TRUE)),
SIMPLIFY = FALSE) %>%
setNames(gsub("_baseline", "_corrected",
grep("_baseline$", names(df_reprex), value = TRUE))) %>%
as.data.frame() %>%
{cbind(df_reprex, .)}
I have a table showing sequences of patterns, represented with the id of the sequence, and the ordered value of the consecutive patterns over several rows. patterns have all the same length but sequences have different lengths)
patterns_seq_a <- c("ABC", "BCD", "ABC", "CBA")
patterns_seq_b <- c("BCD", "BCC", "BAC", "ABC", "BCD")
patterns_seq_c <- c("ABC", "ABC")
sequence_df <- data.frame(sequence_id = c(rep("a", length(patterns_seq_a)),
rep("b", length(patterns_seq_b)),
rep("c", length(patterns_seq_c))),
pattern = c(patterns_seq_a, patterns_seq_b, patterns_seq_c))
and I'm trying to count the frequency from one pattern to another over all the sequences.
(and ultimately will build a probability of transitions from the frequency table)
so for example looking at the transition from "CBA" to "BCD" its frequency over all sequences would be 0 (NB the row 4 and 5 don't belong to the same sequence).
The end result would have the form of:
unique_patterns <- unique(sequence_df$pattern)
result <- data.frame(matrix(0, ncol = length(unique_patterns), nrow = length(unique_patterns)))
colnames(result) <- unique_patterns
rownames(result) <- unique_patterns
(but could also be a hash pattern -> pattern -> frequency if it has to)
On the web I've found a solution based on concatenations of sequences then greps but I it's unusable as it's too slow (profvis blames the greps). Something on the lines of this:
freq_table <- c()
for (start_pattern in unique_patterns) {
for (end_pattern in unique_patterns) {
transition_pattern <- paste0(start_pattern, ',', end_pattern)
sequence_holding_transition <- concat_sequence_df[grep(transition_pattern, concat_sequence_df$patterns_sequence),]
if (nrow(sequence_holding_transition) < 1) {
transition_frequency <- c(transition_pattern, 0)
} else {
concat_sequence_holding_transition <- paste0(sequence_holding_transition$patterns_sequence, collapse = ",", sep="/")
transition_pattern_positions <- gregexpr(pattern = transition_pattern, text = concat_sequence_holding_transition)[[1]]
transition_frequency <- c(transition_pattern, length(transition_pattern_positions))
}
freq_table <- rbind(freq_table, transition_frequency)
}
}
frequency_table <- data.frame(pattern_transition = freq_table[, 1], counts = freq_table[, 2])
frequency_table$pattern.from <- sapply(strsplit(as.character(frequency_table$pattern_transition), ","), `[`, 1)
frequency_table$pattern.to <- sapply(strsplit(as.character(frequency_table$pattern_transition), ","), `[`, 2)
frequency_table <- t(matrix(as.numeric(as.character(frequency_table$counts)), ncol=length(unique_patterns), nrow=length(unique_patterns)))
colnames(frequency_table) <- unique_patterns
rownames(frequency_table) <- unique_patterns
I haven't been able to vectorize it and those greps are really slow in any case. I tried parallelizing it but it's still too slow.
Would someone have a lightning fast solution of building this frequency table?
I don't really care about the code sample, it's just here to show an example of a - unusable- solution.
Thanks!
UPDATE:
please find bellow the expected output if I know how to count.
To have lines with no transitions (such as "CBA" to "XXX") dropped is acceptable, and ofc it doesn't matter the actual type of the structure:
structure(
list(
ABC = c(1, 1, 0, 1, 0),
BAC = c(0, 0, 1, 0, 0),
BCC = c(0, 0, 0, 1, 0),
BCD = c(2, 0, 0, 0, 0),
CBA = c(1, 0, 0, 0, 0)
),
row.names = c("ABC", "BAC", "BCC", "BCD", "CBA"),
class = "data.frame"
)
I have not benchmarked it but it seems, that the following solutions uses basic R functions that tend to usually be fast. Given sequence_df from the question:
table(unlist(tapply(sequence_df$pattern, sequence_df$sequence_id, FUN =
function(p) paste0(p[-length(p)], p[-1]))))
I am using tapply to examine each sequence_id on it's own and paste0 for the transition patterns. unlist puts it all in one large vector which table can count. I am not shure, whether one would call that fully vectorized but at least it has no nested for loops and no regular expressions.
It certainly lacks code to transform the table type into a matrix right now. That can be written once we know how it compares runtime wise with other solutions and depending on whether the matrix is really the ideal format for the operations to come after that.
ok so I had a look at the solutions from #Berhard and #user20650 and while I haven't seriously validated the results yet, it looks that they both do the job.
one difference is that the tapply doesn't generate the 0 frequency transitions while data.table does.
Both solutions are faster than the nested for / rbind example as expected.
So I have tried to benchmark both, and assuming no mistake on my side adapting your code, the data.table is more than twice faster than the tapply one.
Thank you both for these elegant answers, appreciated!
library(data.table)
library(tidyr)
library(microbenchmark)
patterns_sequences <- lapply(seq(1:8), function(id) {
length_sequence <- sample(3:10, 1);
do.call(paste0, replicate(3, sample(c("A", "B", "C"), length_sequence, TRUE), FALSE))
})
sequence_df <- data.frame(sequence_id = c(rep("a", length(patterns_sequences[[1]])),
rep("b", length(patterns_sequences[[2]])),
rep("c", length(patterns_sequences[[3]])),
rep("d", length(patterns_sequences[[4]])),
rep("e", length(patterns_sequences[[5]])),
rep("f", length(patterns_sequences[[6]])),
rep("g", length(patterns_sequences[[7]])),
rep("h", length(patterns_sequences[[8]]))),
pattern = unlist(patterns_sequences))
build_frequency_table_base <- function(sequence_df) {
ft0 <- as.data.frame(table(unlist(tapply(sequence_df$pattern, sequence_df$sequence_id, FUN =
function(p) paste0(p[-length(p)], ",", p[-1])))), stringsAsFactors = FALSE)
ft1 <- ft0 %>%
tidyr::separate(Var1, c("from_pattern", "to_pattern"), ",")
ft5 <- tidyr::spread(ft1, to_pattern, Freq, fill= 0)
rownames(ft5) <- ft5$from_pattern
ft5$from_pattern <- NULL
ft5
}
build_frequency_table_dt <- function(sequence_df) {
dt = as.data.table(sequence_df);
dt[, pattern := factor(pattern)];
dt[, pl := shift(pattern), by=sequence_id][ ,pl := factor(pl, level=levels(pattern))];
res_dt <- with(dt, table(pl, pattern))
res_dt <- as.data.frame.matrix(res_dt)
}
tictoc::tic("base")
res_base <- build_frequency_table_base(sequence_df)
tictoc::toc()
tictoc::tic("DT")
res_dt <- build_frequency_table_dt(sequence_df)
tictoc::toc()
(bench = microbenchmark::microbenchmark(
build_frequency_table_base(sequence_df),
res_dt <- build_frequency_table_dt(sequence_df),
times=1000L
))
ggplot2::autoplot(bench)
Looking to create a conversion of a 15 digit salesforce ID to 18 digit in R. The formula is written out here: https://salesforce.stackexchange.com/questions/27686/how-can-i-convert-a-15-char-id-value-into-an-18-char-id-value
but that is in C#, and I'd like to do this in R.
I've made a clunky formula with as much as I know in R which does work for a 15 digit input and returns the 18 digit one successfully. I would like to know how to then apply this to a column in a data.frame via dplyr.
reproducible code:
SFID_Convert <- function(fifteen_digit) {
if (length(fifteen_digit == 15)) {
# binary map ----
binary <-
c(
"00000",
"00001",
"00010",
"00011",
"00100",
"00101",
"00110",
"00111",
"01000",
"01001",
"01010",
"01011",
"01100",
"01101",
"01110",
"01111",
"10000",
"10001",
"10010",
"10011",
"10100",
"10101",
"10110",
"10111",
"11000",
"11001",
"11010",
"11011",
"11100",
"11101",
"11110",
"11111"
)
letter <- c(LETTERS, 0:5)
binarymap <- data_frame(binary, letter)
# sfid ----
sfid <- substr(fifteen_digit, 1, 15)
s1 <- substr(sfid, 1, 5)
s2 <- substr(sfid, 6, 10)
s3 <- substr(sfid, 11, 15)
convertID <- function(str_frag) {
str_frag <- paste(rev(strsplit(str_frag, NULL)[[1]]), collapse = '')
str_frag <- strsplit(str_frag, NULL)[[1]]
str_frag[which(unlist(gregexpr("[0-9]", str_frag)) == 1)] <- 0
str_frag[which(unlist(gregexpr("[a-z]", str_frag)) == 1)] <- 0
str_frag[which(unlist(gregexpr("[A-Z]", str_frag)) == 1)] <- 1
str_frag <<- paste(str_frag, collapse = '')
}
convertID(s1)
n1 <- str_frag
convertID(s2)
n2 <- str_frag
convertID(s3)
n3 <- str_frag
binary <- data_frame(c(n1, n2, n3)) %>%
select(binary = 1) %>%
left_join(binarymap)
return(paste(sfid, paste(binary$letter[1:3], collapse = ''), sep = ''))}
}
example:
sfid <- "001a003920aSDuh"
SFID_Convert(sfid)
[1] "001a003920aSDuhAAG"
which is what I want, but when you apply it to a df...
col <- c("001a003920aSDuh", "001a08h010JNkJd")
name <- c("compA", "compB")
df <- data_frame(name, col)
It takes the "AAG" that it correctly computed for the first one, and applies it to every row. I could lapply it across, but if I have a df of 100,000 rows, it would be the wrong way to do it I think.
Any help is appreciated! still learning here. :)
There are various issues with your code. I've included a possible solution below, which should be more efficient:
1: Defining the map between binary string & letters. You can do this outside your function. Just define it once, with all the transformations necessary, & use it in the function.
binary <- c("00000","00001","00010","00011","00100","00101","00110","00111",
"01000","01001","01010","01011","01100","01101","01110","01111",
"10000","10001","10010","10011","10100","10101","10110","10111",
"11000","11001","11010","11011","11100","11101","11110","11111")
binary.reverse <- lapply(binary, function(x){paste0(rev(strsplit(x, split = "")[[1]]), collapse = "")})
binary2letter <- c(LETTERS, 0:5)
names(binary2letter) <- unlist(binary.reverse)
rm(binary, binary.reverse)
I reversed the binary strings in this step as well, so that I don't have to do it repeatedly for all the IDs. The results are saved in a named vector rather than a data frame.
2: Creating the function in a way that accepts vectors as input. Note that to check whether a string has X characters, you should use nchar() rather than length(). The latter returns the number of strings, not the number of characters in a string.
SFID_Convert <- function(sfid) {
sfid <- as.character(sfid) # in case the input column are factors
str_num <- gsub("[A-Z]", "1", gsub("[a-z0-9]", "0", sfid))
s1 <- substring(str_num, 1, 5)
s2 <- substring(str_num, 6, 10)
s3 <- substring(str_num, 11, 15)
sfid.addon <- paste0(sfid,
binary2letter[s1],
binary2letter[s2],
binary2letter[s3])
sfid[nchar(sfid)==15] <- sfid.addon[nchar(sfid)==15]
return(sfid)
}
Check the solution:
sfid <- "001a003920aSDuh"
col <- c("001a003920aSDuh", "001a08h010JNkJd")
name <- c("compA", "compB")
df <- data_frame(name, col)
> SFID_Convert(sfid)
[1] "001a003920aSDuhAAG"
> df %>% mutate(new.col = SFID_Convert(col))
# A tibble: 2 x 3
name col new.col
<chr> <chr> <chr>
1 compA 001a003920aSDuh 001a003920aSDuhAAG
2 compB 001a08h010JNkJd 001a08h010JNkJdAAL
I am working on an assignment for school. I need to transform the columns in a data frame using a for loop and the bcPower function from the cars package. My data frame named bb2.df consists of 13 columns of baseball statistics for 337 players. The data is from:
http://ww2.amstat.org/publications/jse/datasets/baseball.dat.txt
I read the data in using:
bb.df <- read.fwf("baseball.dat.txt",widths=c(4,6,6,4,4,3,3,3,4,4,4,3,3,2,2,2,2,19))
And then I created a second data frame just for the numeric stats using:
bb2.df <- bb.df[,1:13]
library(cars)
Then I unsuccessfully tried to build the for loop.
> bb2.df[[i]] <- bcPower(bb2.df[[i]],c)
> for (i in 1:ncol(bb2.df)) {
+ c <- coef(powerTransform(bb2.df[[i]]))
+ bb2.df[[i]] <- bcPower(bb2.df[[i]],c)
+ }
Error in bc1(out[, j], lambda[j]) :
First argument must be strictly positive.
The loop seems to transform the first three columns but stops.
What am I doing wrong?
This solution
tests whether a column appears to contain logical values and omits them from the transformation
replaces zero values in the vectors with a small number, outside the range of the actual values
stores the transformed values in a new data frame, retaining the column and row names
I have also tested all of the variables for normality before and after the transformation. I tried to find a variable that's interesting in that the transformed variable has a large p-value for the Shapiro test, but also there there was a large change in the p-value. Finally, the interesting variable is scaled in both the original and transformed version, and the two versions are overlaid on a density plot.
library(car); library(ggplot2); library(reshape2)
# see this link for column names and type hints
# http://ww2.amstat.org/publications/jse/datasets/baseball.txt
# add placeholder column for opening quotation mark
bb.df <-
read.fwf(
"http://ww2.amstat.org/publications/jse/datasets/baseball.dat.txt",
widths = c(4, 6, 6, 4, 4, 3, 3, 3, 4, 4, 4, 3, 3, 2, 2, 2, 2, 2, 17)
)
# remove placeholder column
bb.df <- bb.df[,-(ncol(bb.df) - 1)]
names(bb.df) <- make.names(
c(
'Salary', 'Batting average', 'OBP', 'runs', 'hits', 'doubles', 'triples',
'home runs', 'RBI', 'walks', 'strike-outs', 'stolen bases', 'errors',
"free agency eligibility", "free agent in 1991/2" ,
"arbitration eligibility", "arbitration in 1991/2", 'name'
)
)
# test for boolean/logical values... don't try to transform them
logicals.test <- apply(
bb.df,
MARGIN = 2,
FUN = function(one.col) {
asnumeric <- as.numeric(one.col)
aslogical <- as.logical(asnumeric)
renumeric <- as.numeric(aslogical)
matchflags <- renumeric == asnumeric
cant.be.logical <- any(!matchflags)
print(cant.be.logical)
}
)
logicals.test[is.na(logicals.test)] <- FALSE
probably.numeric <- bb.df[, logicals.test]
result <- apply(probably.numeric, MARGIN = 2, function(one.col)
{
# can't transform vectors containing non-positive values
# replace zeros with something small
non.zero <- one.col[one.col > 0]
small <- min(non.zero) / max(non.zero)
zeroless <- one.col
zeroless[zeroless == 0] <- small
c <- coef(powerTransform(zeroless))
transformation <- bcPower(zeroless, c)
return(transformation)
})
result <- as.data.frame(result)
row.names(result) <- bb.df$name
cols2test <- names(result)
normal.before <- sapply(cols2test, function(one.col) {
print(one.col)
temp <- shapiro.test(bb.df[, one.col])
return(temp$p.value)
})
normal.after <- sapply(cols2test, function(one.col) {
print(one.col)
temp <- shapiro.test(result[, one.col])
return(temp$p.value)
})
more.normal <- cbind.data.frame(normal.before, normal.after)
more.normal$more.normal <-
more.normal$normal.after / more.normal$normal.before
more.normal$interest <-
more.normal$normal.after * more.normal$more.normal
interesting <-
rownames(more.normal)[which.max(more.normal$interest)]
data2plot <-
cbind.data.frame(bb.df[, interesting], result[, interesting])
names(data2plot) <- c("original", "transformed")
data2plot <- scale(data2plot)
data2plot <- melt(data2plot)
names(data2plot) <- c("Var1", "dataset", interesting)
ggplot(data2plot, aes(x = data2plot[, 3], fill = dataset)) +
geom_density(alpha = 0.25) + xlab(interesting)
Original, incomplete answer:
I believe you're trying to do illegal power transformations (vectors including non-positive values, specifically zeros; vectors with no variance)
The fact that you are copying bb.df into bb2.df and then overwriting is a sure sign that you should really be using apply.
This doesn't create a useful dataframe, but it should get you started,
library(car)
bb.df <-
read.fwf(
"baseball.dat.txt",
widths = c(4, 6, 6, 4, 4, 3, 3, 3, 4, 4, 4, 3, 3, 2, 2, 2, 2, 19)
)
bb.df[bb.df == 0] <- NA
# skip last (text) col
for (i in 1:(ncol(bb.df) - 1)) {
print(i)
# use comma to indicate indexing by column
temp <- bb.df[, i]
temp[temp == 0] <- NA
temp <- temp[complete.cases(temp)]
if (length(unique(temp)) > 1) {
c <- coef(powerTransform(bb.df[, i]))
print(bcPower(bb.df[i], c))
} else {
print(paste0("column ", i, " is invariant"))
}
}
# apply solution
result <- apply(bb.df[,-ncol(bb.df)], MARGIN = 2, function(one.col)
{
temp <- one.col
temp[temp == 0] <- NA
temp <- temp[complete.cases(temp)]
if (length(unique(temp)) > 1) {
c <- coef(powerTransform(temp))
transformation <- bcPower(temp, c)
return(transformation)
} else
{
print("skipping invariant column")
return(NULL)
}
})
I just discovered the power of plyr frequency table with several variables in R
and I am still struggling to understand how it works and I hope some here can help me.
I would like to create a table (data frame) in which I can combine frequencies and summary stats but without hard-coding the values.
Here an example dataset
require(datasets)
d1 <- sleep
# I classify the variable extra to calculate the frequencies
extraClassified <- cut(d1$extra, breaks = 3, labels = c('low', 'medium', 'high') )
d1 <- data.frame(d1, extraClassified)
The results I am looking for should look like that :
require(plyr)
ddply(d1, "group", summarise,
All = length(ID),
nLow = sum(extraClassified == "low"),
nMedium = sum(extraClassified == "medium"),
nHigh = sum(extraClassified == "high"),
PctLow = round(sum(extraClassified == "low")/ length(ID), digits = 1),
PctMedium = round(sum(extraClassified == "medium")/ length(ID), digits = 1),
PctHigh = round(sum(extraClassified == "high")/ length(ID), digits = 1),
xmean = round(mean(extra), digits = 1),
xsd = round(sd(extra), digits = 1))
My question: how can I do this without hard-coding the values?
For the records:
I tried this code, but it does not work
ddply (d1, "group",
function(i) c(table(i$extraClassified),
prop.table(as.character(i$extraClassified))),
)
Thanks in advance
Here's an example to get you started:
foo <- function(x,colfac,colval){
tbl <- table(x[,colfac])
res <- cbind(n = nrow(x),t(tbl),t(prop.table(tbl)))
colnames(res)[5:7] <- paste(colnames(res)[5:7],"Pct",sep = "")
res <- as.data.frame(res)
res$mn <- mean(x[,colval])
res$sd <- sd(x[,colval])
res
}
ddply(d1,.(group),foo,colfac = "extraClassified",colval = "extra")
Don't take anything in that function foo as gospel. I just wrote that off the top of my head. Surely improvements/modifications are possible, but at least it's something to start with.
Thanks to Joran.
I slighlty modified your function to make it more generic (without reference to the position of the variables) .
require(plyr)
foo <- function(x,colfac,colval)
{
# table with frequencies
tbl <- table(x[,colfac])
# table with percentages
tblpct <- t(prop.table(tbl))
colnames( tblpct) <- paste(colnames(t(tbl)), 'Pct', sep = '')
# put the first part together
res <- cbind(n = nrow(x), t(tbl), tblpct)
res <- as.data.frame(res)
# add summary statistics
res$mn <- mean(x[,colval])
res$sd <- sd(x[,colval])
res
}
ddply(d1,.(group),foo,colfac = "extraClassified",colval = "extra")
and it works !!!
P.S : I still do not understand what (group) stands for but