I have some data, that looks like the following:
"Name","Length","Startpos","Endpos","ID","Start","End","Rev","Match"
"Name_1",140,0,138,"1729",11,112,0,1
"Name_2",132,0,103,"16383",23,232,0,1
"Name_3",102,0,100,"1729",22,226,1,1
"Name_4",112,0,130,"16383",99,992,1,1
"Name_5",132,0,79,"1729",81,820,1,1
"Name_6",112,0,163,"16383",81,820,0,1
"Name_7",123,0,164,"1729",54,542,1,1
"Name_8",123,0,65,"16383",28,289,0,1
I have used the order function to order according to first "ID then "Start".
"Name","Length","Startpos","Endpos","ID","Start","End","Rev","Match"
"Name_1",140,0,138,"1729",11,112,0,1
"Name_3",102,0,100,"1729",22,226,1,1
"Name_7",123,0,164,"1729",54,542,1,1
"Name_5",132,0,79,"1729",81,820,1,1
"Name_2",132,0,103,"16383",23,232,0,1
"Name_8",123,0,65,"16383",28,289,0,1
…
Now I need to do two things:
First I need to create a table that includes pairwise couples out of each ID group. For a group in one ID containing the names (1,2,3,4,5), I need to create the pairs (12,23,34,45). So for the above example, the pairs would be (Name_1+Name_3, Name_3+Name_7, Name_7+Name_5).
My output for the above example, would look like this:
"Start_Name_X","Start_Name_Y","Length_Name_X","Length_Name_Y","Name_Name_X","Name_Name_Y","ID","New column"
11, 22, 140, 102, "Name_1", Name_3", 1729,,
22, 54, 102, 123, "Name_3", Name_7, 1729,,
54, 81, 123, 132, "Name_7", Name_5, 1729,,
23, 28, 132, 123, "Name_2", "Name_8", 16383,,
…
So I need to create pairs through ascending "Start", but within each "ID".
I am thinking it should be done with a for loop, but I am a newbie, so pulling the data to a new table with the for loop confuses me in itself, and especially the constraint of doing it within each unique "ID", I have no idea how to do.
I have experimented with splitting the data into groups according to ID using split, but it doesn't really get me further with creating the new data table.
I have created the ned data-table with the following code:
column_names = data.frame(Start_Name_X ="Start_Name_x",
Start_Name_Y="Start_Name_Y", Length_Name_X ="Length_Name_X",
Length_Name_Y="Length_Name_Y", Name_X="Name_X", Name_Y="Name_Y", ID="ID",
New_Column="New_Column")
write.table(column_names, file = "datatabel.csv", row.names=FALSE, append =
FALSE, col.names = FALSE, sep=",", quote=TRUE)
And this is the table, I would like to write to.
Is a for loop the write way to handle this, and if so, can you give me a few clues on how to start?
It can be done with only one loop:
df <- read.table(sep = ",", header = TRUE, stringsAsFactors = FALSE,
text = "\"Name\",\"Length\",\"Startpos\",\"Endpos\",\"ID\",\"Start\",\"End\",\"Rev\",\"Match\"\n\"Name_1\",140,0,138,\"1729\",11,112,0,1\n\"Name_2\",132,0,103,\"16383\",23,232,0,1\n\"Name_3\",102,0,100,\"1729\",22,226,1,1\n\"Name_4\",112,0,130,\"16383\",99,992,1,1\n\"Name_5\",132,0,79,\"1729\",81,820,1,1\n\"Name_6\",112,0,163,\"16383\",81,820,0,1\n\"Name_7\",123,0,164,\"1729\",54,542,1,1\n\"Name_8\",123,0,65,\"16383\",28,289,0,1",
)
df <- df[order(df$ID, df$Start), ]
inds <- c("Name", "Start", "Length")
indsSorted <- c("Start_Name_X","Start_Name_Y","Length_Name_X","Length_Name_Y","Name_Name_X","Name_Name_Y","ID","New_Column")
out <- data.frame(matrix(nrow = 0, ncol = 8))
colnames(out) <- c("Start_Name_X","Start_Name_Y","Length_Name_X","Length_Name_Y","Name_Name_X","Name_Name_Y","ID","New_Column")
for (i in unique(df$ID)){
dfID <- subset(df, ID == i)
dfHead <- head(dfID, n = nrow(dfID) - 1)[, inds]
colnames(dfHead) <- paste0(colnames(dfHead), "_Name_X")
dfTail <- tail(dfID, n = nrow(dfID) - 1)[, inds]
colnames(dfTail) <- paste0(colnames(dfTail), "_Name_Y")
out <- rbind(out, cbind(dfHead, dfTail, ID = i, New_Column = '', stringsAsFactors = FALSE)[, indsSorted])
}
out
This will probably be horribly slow if the input is large. It can be optimized, but I didn't bother since using data.table is probably much quicker.
dt <- data.table(df, key = "ID,Start")
fn <- function(dtIn, id){
dtHead <- head(dtIn, n = nrow(dtIn) - 1)
setnames(dtHead, paste0(colnames(dtHead), "_Name_X"))
dtTail <- tail(dtIn, n = nrow(dtIn) - 1)
setnames(dtTail, paste0(colnames(dtTail), "_Name_Y"))
cbind(dtHead, dtTail, ID = id, New_Column = '')
}
out2 <- dt[, fn(.SD, ID), by = ID, .SDcols = c("Name", "Start", "Length")]
out2 <- as.data.frame(out2[, indsSorted, with = FALSE])
Rownames are different but otherwise the results are identical. The function used can probably be optimized as well.
rownames(out) <- NULL
rownames(out2) <- NULL
identical(out, out2)
Related
I have a dataframe e.g.
df_reprex <- data.frame(id = rep(paste0("S",round(runif(100, 1000000, 9999999),0)), each=10),
date = rep(seq.Date(today(), by=-7, length.out = 10), 100),
var1 = runif(1000, 10, 20),
var2 = runif(1000, 20, 50),
var3 = runif(1000, 2, 5),
var250 = runif(1000, 100, 200),
var1_baseline = rep(runif(100, 5, 10), each=10),
var2_baseline = rep(runif(100, 50, 80), each=10),
var3_baseline = rep(runif(100, 1, 3), each=10),
var250_baseline = rep(runif(100, 20, 70), each=10))
I want to write a function containing a for loop that for each row in the dataframe will subtract every "_baseline" column from the non-baseline column with the same name.
I have created a script that automatically creates a character string containing the code I would like to run:
df <- df_reprex
# get only numeric columns
df_num <- df %>% dplyr::select_if(., is.numeric)
# create a version with no baselines
df_nobaselines <- df_num %>% select(-contains("baseline"))
#extract names of non-baseline columns
numeric_cols <- names(df_nobaselines)
#initialise empty string
mutatestring <- ""
#write loop to fill in string:
for (colname in numeric_cols) {
mutatestring <- paste(mutatestring, ",", paste0(colname, "_change"), "=", colname, "-", paste0(colname, "_baseline"))
# df_num <- df_num %>%
# mutate(paste0(col, "_change") = col - paste0(col, "_baseline"))
}
mutatestring <- substr(mutatestring, 4, 9999999) # remove stuff at start (I know it's inefficient)
mutatestring2 <- paste("df %>% mutate(", mutatestring, ")") # add mutate call
but when I try to call "mutatestring2" it just prints the character string e.g.:
[1] "df %>% mutate( var1_change = var1 - var1_baseline , var2_change = var2 - var2_baseline , var3_change = var3 - var3_baseline , var250_change = var250 - var250_baseline )"
I thought that this part would be relatively easy and I'm sure I've missed something obvious, but I just can't get the text inside that string to run!
I've tried various slightly ridiculous methods but none of them return the desired output (i.e. the result returned by the character string if it was entered into the console as a command):
call(mutatestring2)
eval(mutatestring2)
parse(mutatestring2)
str2lang(mutatestring2)
mget(mutatestring2)
diff_func <- function() {mutatestring2}
diff_func1 <- function() {
a <-mutatestring2
return(a)}
diff_func2 <- function() {str2lang(mutatestring2)}
diff_func3 <- function() {eval(mutatestring2)}
diff_func4 <- function() {parse(mutatestring2)}
diff_func5 <- function() {call(mutatestring2)}
diff_func()
diff_func1()
diff_func2()
diff_func3()
diff_func4()
diff_func5()
I'm sure there must be a very straightforward way of doing this, but I just can't work it out!
How do I convert a character string to something that I can run or pass to a magrittr pipe?
You need to use the text parameter in parse, then eval the result. For example, you can do:
eval(parse(text = "print(5)"))
#> [1] 5
However, using eval(parse()) is normally a very bad idea, and there is usually a more sensible alternative.
In your case you can do this without resorting to eval(parse()), for example in base R you could subtract all the appropriate variables from each other like this:
baseline <- grep("_baseline$", names(df_reprex), value = TRUE)
non_baseline <- gsub("_baseline", "", baseline)
df_new <- cbind(df_reprex, as.data.frame(setNames(mapply(
function(i, j) df_reprex[[i]] - df_reprex[[j]],
baseline, non_baseline, SIMPLIFY = FALSE),
paste0(non_baseline, "_corrected"))))
Or if you want to keep the whole thing in a single pipe without storing intermediate variables, you could do:
mapply(function(i, j) df_reprex[[i]] - df_reprex[[j]],
grep("_baseline$", names(df_reprex), value = TRUE),
gsub("_baseline", "", grep("_baseline$", names(df_reprex), value = TRUE)),
SIMPLIFY = FALSE) %>%
setNames(gsub("_baseline", "_corrected",
grep("_baseline$", names(df_reprex), value = TRUE))) %>%
as.data.frame() %>%
{cbind(df_reprex, .)}
I have a table showing sequences of patterns, represented with the id of the sequence, and the ordered value of the consecutive patterns over several rows. patterns have all the same length but sequences have different lengths)
patterns_seq_a <- c("ABC", "BCD", "ABC", "CBA")
patterns_seq_b <- c("BCD", "BCC", "BAC", "ABC", "BCD")
patterns_seq_c <- c("ABC", "ABC")
sequence_df <- data.frame(sequence_id = c(rep("a", length(patterns_seq_a)),
rep("b", length(patterns_seq_b)),
rep("c", length(patterns_seq_c))),
pattern = c(patterns_seq_a, patterns_seq_b, patterns_seq_c))
and I'm trying to count the frequency from one pattern to another over all the sequences.
(and ultimately will build a probability of transitions from the frequency table)
so for example looking at the transition from "CBA" to "BCD" its frequency over all sequences would be 0 (NB the row 4 and 5 don't belong to the same sequence).
The end result would have the form of:
unique_patterns <- unique(sequence_df$pattern)
result <- data.frame(matrix(0, ncol = length(unique_patterns), nrow = length(unique_patterns)))
colnames(result) <- unique_patterns
rownames(result) <- unique_patterns
(but could also be a hash pattern -> pattern -> frequency if it has to)
On the web I've found a solution based on concatenations of sequences then greps but I it's unusable as it's too slow (profvis blames the greps). Something on the lines of this:
freq_table <- c()
for (start_pattern in unique_patterns) {
for (end_pattern in unique_patterns) {
transition_pattern <- paste0(start_pattern, ',', end_pattern)
sequence_holding_transition <- concat_sequence_df[grep(transition_pattern, concat_sequence_df$patterns_sequence),]
if (nrow(sequence_holding_transition) < 1) {
transition_frequency <- c(transition_pattern, 0)
} else {
concat_sequence_holding_transition <- paste0(sequence_holding_transition$patterns_sequence, collapse = ",", sep="/")
transition_pattern_positions <- gregexpr(pattern = transition_pattern, text = concat_sequence_holding_transition)[[1]]
transition_frequency <- c(transition_pattern, length(transition_pattern_positions))
}
freq_table <- rbind(freq_table, transition_frequency)
}
}
frequency_table <- data.frame(pattern_transition = freq_table[, 1], counts = freq_table[, 2])
frequency_table$pattern.from <- sapply(strsplit(as.character(frequency_table$pattern_transition), ","), `[`, 1)
frequency_table$pattern.to <- sapply(strsplit(as.character(frequency_table$pattern_transition), ","), `[`, 2)
frequency_table <- t(matrix(as.numeric(as.character(frequency_table$counts)), ncol=length(unique_patterns), nrow=length(unique_patterns)))
colnames(frequency_table) <- unique_patterns
rownames(frequency_table) <- unique_patterns
I haven't been able to vectorize it and those greps are really slow in any case. I tried parallelizing it but it's still too slow.
Would someone have a lightning fast solution of building this frequency table?
I don't really care about the code sample, it's just here to show an example of a - unusable- solution.
Thanks!
UPDATE:
please find bellow the expected output if I know how to count.
To have lines with no transitions (such as "CBA" to "XXX") dropped is acceptable, and ofc it doesn't matter the actual type of the structure:
structure(
list(
ABC = c(1, 1, 0, 1, 0),
BAC = c(0, 0, 1, 0, 0),
BCC = c(0, 0, 0, 1, 0),
BCD = c(2, 0, 0, 0, 0),
CBA = c(1, 0, 0, 0, 0)
),
row.names = c("ABC", "BAC", "BCC", "BCD", "CBA"),
class = "data.frame"
)
I have not benchmarked it but it seems, that the following solutions uses basic R functions that tend to usually be fast. Given sequence_df from the question:
table(unlist(tapply(sequence_df$pattern, sequence_df$sequence_id, FUN =
function(p) paste0(p[-length(p)], p[-1]))))
I am using tapply to examine each sequence_id on it's own and paste0 for the transition patterns. unlist puts it all in one large vector which table can count. I am not shure, whether one would call that fully vectorized but at least it has no nested for loops and no regular expressions.
It certainly lacks code to transform the table type into a matrix right now. That can be written once we know how it compares runtime wise with other solutions and depending on whether the matrix is really the ideal format for the operations to come after that.
ok so I had a look at the solutions from #Berhard and #user20650 and while I haven't seriously validated the results yet, it looks that they both do the job.
one difference is that the tapply doesn't generate the 0 frequency transitions while data.table does.
Both solutions are faster than the nested for / rbind example as expected.
So I have tried to benchmark both, and assuming no mistake on my side adapting your code, the data.table is more than twice faster than the tapply one.
Thank you both for these elegant answers, appreciated!
library(data.table)
library(tidyr)
library(microbenchmark)
patterns_sequences <- lapply(seq(1:8), function(id) {
length_sequence <- sample(3:10, 1);
do.call(paste0, replicate(3, sample(c("A", "B", "C"), length_sequence, TRUE), FALSE))
})
sequence_df <- data.frame(sequence_id = c(rep("a", length(patterns_sequences[[1]])),
rep("b", length(patterns_sequences[[2]])),
rep("c", length(patterns_sequences[[3]])),
rep("d", length(patterns_sequences[[4]])),
rep("e", length(patterns_sequences[[5]])),
rep("f", length(patterns_sequences[[6]])),
rep("g", length(patterns_sequences[[7]])),
rep("h", length(patterns_sequences[[8]]))),
pattern = unlist(patterns_sequences))
build_frequency_table_base <- function(sequence_df) {
ft0 <- as.data.frame(table(unlist(tapply(sequence_df$pattern, sequence_df$sequence_id, FUN =
function(p) paste0(p[-length(p)], ",", p[-1])))), stringsAsFactors = FALSE)
ft1 <- ft0 %>%
tidyr::separate(Var1, c("from_pattern", "to_pattern"), ",")
ft5 <- tidyr::spread(ft1, to_pattern, Freq, fill= 0)
rownames(ft5) <- ft5$from_pattern
ft5$from_pattern <- NULL
ft5
}
build_frequency_table_dt <- function(sequence_df) {
dt = as.data.table(sequence_df);
dt[, pattern := factor(pattern)];
dt[, pl := shift(pattern), by=sequence_id][ ,pl := factor(pl, level=levels(pattern))];
res_dt <- with(dt, table(pl, pattern))
res_dt <- as.data.frame.matrix(res_dt)
}
tictoc::tic("base")
res_base <- build_frequency_table_base(sequence_df)
tictoc::toc()
tictoc::tic("DT")
res_dt <- build_frequency_table_dt(sequence_df)
tictoc::toc()
(bench = microbenchmark::microbenchmark(
build_frequency_table_base(sequence_df),
res_dt <- build_frequency_table_dt(sequence_df),
times=1000L
))
ggplot2::autoplot(bench)
Intro: Working in R, I often need to reorganize information from lists of data.frames to create a summary table. In this example, I start with a single data.frame, and I show my function that converts key information from the data.frame into a single row. Bearing in mind that my desired output requires the sorting of a mixture of numeric and character data, I can’t help wondering if there is an easier technique to do this kind of thing.
My question: Can anyone provide advice, or better yet a solution, for a simpler technique to convert data.frames like these into rows, while respecting the specific sorting of the data?
#sample data
input_df <- data.frame(M1 = c("Age", "Weight", "Speed", "Range"),
dogs = c(100, 120, 85, 105),
cats = c(115, 89, 80, 111),
birds = c(100, 90, 100, 104))
# desired summary row
desired_row <- data.frame(Model = "M1",
dogs = "Weight (120)",
cats = "Age (115), Range (111)",
birds = "Range (104)",
stringsAsFactors = F)
desired_row$Model <- factor(desired_row$Model)
# my function
makeRow <- function(dat1) {
# get model name
mod <- data.frame(Model = names(dat1[1]))
# make list of variables with model varible
d1 <- setNames(lapply(names(dat1)[-1], function(x) cbind(dat1[1],
dat1[x])), names(dat1)[-1])
# create a sorted named vector, largest-to-smallest
sorted_named_vec <- function(x) {
sort(setNames(x[[2]], x[[1]]), decreasing = T)
}
d2 <- lapply(d1, sorted_named_vec)
# implement a criterion to report only top indexes
keep_tops <- function(x) {
ifelse(x == max(x) | x >= 110 | (x > 102) & ((x -
100)/(max(x) - 100) > 0.33), x, "")
}
d3 <- lapply(d2, keep_tops)
# remove blank character elements
remove_blank_elements <- function(x) {
x[nchar(x) > 0]
}
d4 <- lapply(d3, remove_blank_elements)
# collapse variable name with top values and add parenthesis
collapse_to_string <- function(x) {
paste0(names(x), " (", x, "),", collapse = " ")
}
d5 <- lapply(d4, collapse_to_string)
# remove the last comma
remove_last_comma <- function(x) {
gsub("\\,$", "", x)
}
d6 <- lapply(d5, remove_last_comma)
# consturct a row from the list
row <- cbind(mod, as.data.frame(d6, stringsAsFactors = F))
row
}
# call
row_output <- makeRow(dat1 = input_df)
row_output
# check output to desired
identical(desired_row, row_output)
not sure if more efficient, but slightly less code and more direct approach imo.
makeRow <- function(dat1) {
#make data frame for row with model name
d0 <- data.frame(mod = names(dat1)[1]) #col name changed later
# implement a criterion to report only top indexes -> now return if true or false
keep_tops <- function(x) {
x == max(x) | x >= 110 | (x > 102) & ((x - 100)/(max(x) - 100) > 0.33)
}
vals =c() #empty -> for values of each cols
# make list of variables with model variables(dat1 cols)
#use the columns of the df directly
for(col in 2:ncol(dat1)){
#make temp df with each and evaluate what row to keep in the same line
df = dat1[keep_tops(dat1[,col])==1,c(1,col)]
df[,2] = paste0("(",df[,2],")") #add the () around the numbers
val = apply(as.data.frame(apply(df, 1, paste0, collapse=" ")), 2, paste0, collapse=", ") #collapse rows, then cols
vals = c(vals, val) #add this variable values to the values' list
}
# bind the first col made earlier with these values
row <- cbind(d0, as.data.frame(t(vals), stringsAsFactors = F))
colnames(row) = colnames(dat1) #rename the columns to match
row
}
# call
row_output <- makeRow(dat1 = input_df)
# check output to desired
identical(desired_row$birds, row_output$birds)
with your 'input_df', identical() was TRUE.
I just found a quite weird behaviour when subsetting a data.frame with a condition.
The problem is, when I use a leveled variable to subset, the subset gets filtered for all levels.
Here is an example where the error does not occur:
data <- data.frame(names = c("Bob", "Alice", "Joe"), ages = c(18, 20, 43), sizes = c(180, 160, 176), group = c("0021", "9430", "0021"))
for(i in 1:length(data$names)){
print(paste("Name: ", data$names[i], sep=""))
# print out all group members
print("Group Members:")
group = subset.data.frame(data, data$group == data$group[i])
for(j in 1:length(group$names)){
print(paste("Name: ", group$names[j], sep=""))
}
print("---------------------------------")
}
Now I am saving the data$group[i] into a variable and the data.frame does not get filtered at all:
data <- data.frame(names = c("Bob", "Alice", "Joe"), ages = c(18, 20, 43), sizes = c(180, 160, 176), group = c("0021", "9430", "0021"))
for(i in 1:length(data$names)){
print(paste("Name: ", data$names[i], sep=""))
group <- data$group[i]
# print out all group members
print("Group Members:")
group = subset.data.frame(data, data$group == group)
for(j in 1:length(group$names)){
print(paste("Name: ", group$names[j], sep=""))
}
print("---------------------------------")
}
Can someone please explain to me, why this unexpected behaviour occurs? I do expect to get a String from the data$group[i] expression but get a leveled something.
Nothing is wrong as such in your code. subset has scoping issues when you have variable name same as your column name. If you change your group variable to any other name it would work fine.
for(i in 1:length(data$names)){
print(paste("Name: ", data$names[i], sep=""))
temp <- data$group[i] #Change here
# print out all group members
print("Group Members:")
group = subset(data, group == temp)
for(j in 1:length(group$names)){
print(paste("Name: ", group$names[j], sep=""))
}
print("---------------------------------")
}
and that is the same reason why you should read Why is `[` better than `subset`?
I need to recall the variable within the loop and the use that as my column name:
This is an example dataset:
mtcars.df <- mtcars
Expected output:
mtcars.df <- mtcars.df %>% add_column(a1 = sample (1:4, 32, replace = TRUE), b1 = sample (1:4, 32, replace = TRUE), c1 = sample (1:4, 32, replace = TRUE))
So a1, b1 and c1 need to be created within the loop and renamed to a1, b1 and c1. The names are longer (and different combinations) in the original dataset but this is defined in variable mpg.filename.
This is what I have tried so far: The 1st obviously doesn't give the desired result but it's just to show what I want to achieve.
mpg.filename <- c("a1.file", "b1.file", "c1.file")
for (i in mpg.filename) {
sample.name <- unlist(strsplit(as.character(i), '.', fixed = TRUE))[1]
mtcars.df$i <- sample (1:4, 32, replace = TRUE)
}
for (i in mpg.filename) {
sample.name <- unlist(strsplit(as.character(i), '.', fixed = TRUE))[1]
mtcars.df$temp.var <- sample (1:4, 32, replace = TRUE)
temp.name <- paste0 (sample.name) %>% rlang::parse_expr()
mtcars.df <- mtcars.df %>% rename (eval (sample.name) = temp.var)
}
for (i in mpg.filename) {
sample.name <- unlist(strsplit(as.character(i), '.', fixed = TRUE))[1]
mtcars.df$temp.var <- sample (1:4, 32, replace = TRUE)
temp.name <- paste0 (sample.name) %>% rlang::parse_expr()
mtcars.df <- mtcars.df %>% rename (syms(sample.name) = temp.var)
}
I have tried get, as.symbol, parse (text = "sample.name") as well but didn't work either.
Thanks for the help. I have tried looking at other answers on forums but they do not seem to apply or work.
using eval() on string to access object attributes in R
call columns from inside a for loop in R
Getting strings recognized as variable names in R
How to evaluate an expression with variables in R?
Your first attempt was close! When you want to access / create columns in a data.frame using a character, you need to use [[ instead of $. No need for symbols / parsing / other complicated nonsense if this is all you need to do.
for(i in mpg.filename) {
sample.name <- unlist(strsplit(as.character(i), '.', fixed = TRUE))[1]
mtcars.df[[sample.name]] <- sample(1:4, 32, replace=TRUE)
}
Let me know if this did the trick-
mpg.filename <- c("a1.file", "b1.file", "c1.file")
for (i in 1:length(mpg.filename)) {
sample.name <- unlist(strsplit(as.character(mpg.filename[i]), '.', fixed = TRUE))[1]
mtcars.df$i <- sample (1:4, 32, replace = TRUE)
colnames(mtcars.df)[length(mtcars.df)]<-paste(sample.name)
}