I scanned similar questions that have already been answered but couldn't find a thread specific to my problem.
I have a number of datasets that all have five flagging columns (binary) at the end.
The aim is to produce an output that summarises the specified column in each dataset by each flag.
Hence, each output is a list of five summary tables.
library(tidyverse)
library(janitor)
## mydataset1
mydataset1 <- tibble(id = 1:100,
                     column_000 = sample(1:16, 100, replace = TRUE),
                     flag1 = sample(0:1, 100, replace = TRUE),
                     flag2 = sample(0:1, 100, replace = TRUE),
                     flag3 = sample(0:1, 100, replace = TRUE),
                     flag4 = sample(0:1, 100, replace = TRUE),
                     flag5 = sample(0:1, 100, replace = TRUE))
## summary table function
get_table <- function(data, column) {
  data %>%
    # select the flag
    filter(data[[i]] == 1) %>%
    # summary table
    tabyl(column) %>%
    arrange(desc(n)) %>%
    top_n(5, n)
}
## list of tables function
output_list <- function(data, column) {
  # empty list
  output <- list()
  # for loop - go through each flagging column
  for (i in (length(data)-4):length(data)) {
    output[[i]] <- get_table(data, column)
  }
  # for some reason, there are NULL list items for all other columns
  output <- compact(output)
  # rename and print
  names(output) <- names(data)[(length(data)-4):length(data)]
  print(output)
}
### execute
output_list(mydataset1, "column_000")
# error
### manually executing the function works fine
# empty list
output <- list()
# for loop - go through each flagging column
for (i in (length(mydataset1)-4):length(mydataset1)) {
  output[[i]] <- get_table(mydataset1, "column_000")
}
# for some reason, there are NULL list items for all other columns
output <- compact(output)
# rename and print
names(output) <- names(mydataset1)[(length(mydataset1)-4):length(mydataset1)]
print(output)
This is what I have for now.
If I execute the contents of the output_list function manually, it works fine.
However, if I run it as a function, it gives me an error saying object 'i' is not found.
Where did I go wrong? Please help!
Pass i as an argument to the get_table function.
library(tidyverse)
library(janitor)
get_table <- function(data, column, i) {
  data %>%
    # select the flag
    filter(data[[i]] == 1) %>%
    # summary table
    tabyl(column) %>%
    arrange(desc(n)) %>%
    top_n(5, n)
}
Make the corresponding change in the output_list function.
output_list <- function(data, column) {
  # empty list
  output <- list()
  # for loop - go through each flagging column, passing i on to get_table
  for (i in (length(data)-4):length(data)) {
    output[[i]] <- get_table(data, column, i)
  }
  # slots before the first flag index stay NULL, so drop them
  output <- compact(output)
  # rename and print
  names(output) <- names(data)[(length(data)-4):length(data)]
  print(output)
}
Run the function:
output_list(mydataset1, "column_000")
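If you prefer to avoid the explicit loop (and the NULL slots it creates), here is a sketch of the same idea with purrr, assuming the get_table() above that takes i and that the flags are always the last five columns:
# assumption: the five flag columns are the last five columns of the dataset
flag_cols <- tail(names(mydataset1), 5)
output <- flag_cols %>%
  map(~ get_table(mydataset1, "column_000", .x)) %>%
  set_names(flag_cols)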
In your get_table function you are using i without declaring it as a function argument. Your code works fine when you run it separately because i gets assigned by the for loop in the global environment. If you intend to use the loop's i inside get_table, you can simply declare it as an argument. See the code below.
library(tidyverse)
library(janitor)
## mydataset1
mydataset1 <- tibble(id = 1:100,
                     column_000 = sample(1:16, 100, replace = TRUE),
                     flag1 = sample(0:1, 100, replace = TRUE),
                     flag2 = sample(0:1, 100, replace = TRUE),
                     flag3 = sample(0:1, 100, replace = TRUE),
                     flag4 = sample(0:1, 100, replace = TRUE),
                     flag5 = sample(0:1, 100, replace = TRUE))
## summary table function
get_table <- function(data, column, i) {
  data %>%
    # select the flag
    filter(data[[i]] == 1) %>%
    # summary table
    tabyl(column) %>%
    arrange(desc(n)) %>%
    top_n(5, n)
}
## list of tables function
output_list <- function(data, column) {
  # empty list
  output <- list()
  # for loop - go through each flagging column, passing i on to get_table
  for (i in (length(data)-4):length(data)) {
    output[[i]] <- get_table(data, column, i)
  }
  # slots before the first flag index stay NULL, so drop them
  output <- compact(output)
  # rename and print
  names(output) <- names(data)[(length(data)-4):length(data)]
  print(output)
}
### execute
output_list(mydataset1, "column_000")
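To see the scoping point in isolation, here is a tiny standalone illustration (a made-up example, not part of the original code):
i <- 2                      # assigned in the global environment, e.g. left over from a loop run by hand
pick <- function(x) x[[i]]  # 'i' is not an argument, so R looks it up in the global environment
pick(list("a", "b"))        # "b"
rm(i)
# pick(list("a", "b"))      # now fails with: object 'i' not found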
My goal is to apply wavelet analysis and image construction to a large set of time series data, to be used eventually in a pipeline for time series clustering. The function for the first step is from WaveletComp, and I am using purrr's map() from the tidyverse. Ideally the output is a list, labelled for each column, that I can then apply other functions to in the pipeline.
library(WaveletComp)
The data set has 3 columns and 6000 values
df <- data.frame(replicate(3,sample(-5:5,6000,rep=TRUE)))
wave_emg <- function(df) {
  analyze.wavelet(my.data = df, my.series = "X1", loess.span = 50,
                  dt = 1, dj = 1/250,
                  lowerPeriod = 32,
                  upperPeriod = 512,
                  make.pval = TRUE, n.sim = 100)
}
Solution <- mutate(model = map(df, wave_emg))
I get the following error: Error in my.data[, ind] : incorrect number of dimensions
It appears to me that the my.series argument of analyze.wavelet expects a single column to be specified. Is there a way to tell it to take each column in turn?
You could write a function which takes two inputs: the data frame and a column name/position.
library(WaveletComp)
library(purrr)
ave_emg <- function(df, col) {
  analyze.wavelet(my.data = df, my.series = col, loess.span = 50,
                  dt = 1, dj = 1/250,
                  lowerPeriod = 32,
                  upperPeriod = 512,
                  make.pval = TRUE, n.sim = 100)
}
The analyze.wavelet function takes a column name or a column index as input, so you could use either of these versions:
#column names
result <- map(names(df), ave_emg, df = df)
#column index
result <- map(seq_along(df), ave_emg, df = df)
You can also replace map with lapply to get the same output.
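For example, a minimal lapply equivalent of the column-name version above:
# each column name goes to the 'col' argument; the data frame is passed by name
result <- lapply(names(df), ave_emg, df = df)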
It looks like df needs to be split before entering the function to avoid the error from analyze.wavelet(). This code seems to work with this function, but your code, @Ronak, works with other functions.
library(tidyverse)
library(WaveletComp)
wave_emg <- function(df) {
  analyze.wavelet(my.data = df, my.series = "X1", loess.span = 50,
                  dt = 1, dj = 1/250,
                  lowerPeriod = 32,
                  upperPeriod = 512,
                  make.pval = TRUE, n.sim = 100)
}
Solution <- df %>% split.default(., seq_along(.)) %>% map(., ave_emg)
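For reference, split.default() turns the data frame into a list of one-column data frames, which map() then feeds to the wavelet function one at a time. A minimal look at that splitting step, using the df defined in the question:
pieces <- df %>% split.default(., seq_along(.))
length(pieces)       # 3, one element per column
names(pieces[[1]])   # "X1" - each element is a one-column data frame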
I have a dataframe e.g.
library(tidyverse)  # for the dplyr select helpers used below
library(lubridate)  # for today()
df_reprex <- data.frame(id = rep(paste0("S", round(runif(100, 1000000, 9999999), 0)), each = 10),
                        date = rep(seq.Date(today(), by = -7, length.out = 10), 100),
                        var1 = runif(1000, 10, 20),
                        var2 = runif(1000, 20, 50),
                        var3 = runif(1000, 2, 5),
                        var250 = runif(1000, 100, 200),
                        var1_baseline = rep(runif(100, 5, 10), each = 10),
                        var2_baseline = rep(runif(100, 50, 80), each = 10),
                        var3_baseline = rep(runif(100, 1, 3), each = 10),
                        var250_baseline = rep(runif(100, 20, 70), each = 10))
I want to write a function containing a for loop that, for each row in the dataframe, will subtract every "_baseline" column from the non-baseline column with the same name.
I have created a script that automatically creates a character string containing the code I would like to run:
df <- df_reprex
# get only numeric columns
df_num <- df %>% dplyr::select_if(., is.numeric)
# create a version with no baselines
df_nobaselines <- df_num %>% select(-contains("baseline"))
#extract names of non-baseline columns
numeric_cols <- names(df_nobaselines)
#initialise empty string
mutatestring <- ""
#write loop to fill in string:
for (colname in numeric_cols) {
  mutatestring <- paste(mutatestring, ",", paste0(colname, "_change"), "=", colname, "-", paste0(colname, "_baseline"))
  # df_num <- df_num %>%
  #   mutate(paste0(col, "_change") = col - paste0(col, "_baseline"))
}
mutatestring <- substr(mutatestring, 4, 9999999) # remove stuff at start (I know it's inefficient)
mutatestring2 <- paste("df %>% mutate(", mutatestring, ")") # add mutate call
but when I try to call mutatestring2, it just prints the character string, e.g.:
[1] "df %>% mutate( var1_change = var1 - var1_baseline , var2_change = var2 - var2_baseline , var3_change = var3 - var3_baseline , var250_change = var250 - var250_baseline )"
I thought that this part would be relatively easy and I'm sure I've missed something obvious, but I just can't get the text inside that string to run!
I've tried various slightly ridiculous methods but none of them return the desired output (i.e. the result returned by the character string if it was entered into the console as a command):
call(mutatestring2)
eval(mutatestring2)
parse(mutatestring2)
str2lang(mutatestring2)
mget(mutatestring2)
diff_func <- function() {mutatestring2}
diff_func1 <- function() {
a <-mutatestring2
return(a)}
diff_func2 <- function() {str2lang(mutatestring2)}
diff_func3 <- function() {eval(mutatestring2)}
diff_func4 <- function() {parse(mutatestring2)}
diff_func5 <- function() {call(mutatestring2)}
diff_func()
diff_func1()
diff_func2()
diff_func3()
diff_func4()
diff_func5()
I'm sure there must be a very straightforward way of doing this, but I just can't work it out!
How do I convert a character string to something that I can run or pass to a magrittr pipe?
You need to use the text parameter in parse, then eval the result. For example, you can do:
eval(parse(text = "print(5)"))
#> [1] 5
However, using eval(parse()) is normally a very bad idea, and there is usually a more sensible alternative.
In your case you can do this without resorting to eval(parse()). For example, in base R you could subtract all the appropriate variables from each other like this:
baseline <- grep("_baseline$", names(df_reprex), value = TRUE)
non_baseline <- gsub("_baseline", "", baseline)
df_new <- cbind(df_reprex, as.data.frame(setNames(mapply(
  function(i, j) df_reprex[[j]] - df_reprex[[i]],  # non-baseline minus its baseline
  baseline, non_baseline, SIMPLIFY = FALSE),
  paste0(non_baseline, "_corrected"))))
Or if you want to keep the whole thing in a single pipe without storing intermediate variables, you could do:
mapply(function(i, j) df_reprex[[j]] - df_reprex[[i]],  # non-baseline minus its baseline
       grep("_baseline$", names(df_reprex), value = TRUE),
       gsub("_baseline", "", grep("_baseline$", names(df_reprex), value = TRUE)),
       SIMPLIFY = FALSE) %>%
  setNames(gsub("_baseline", "_corrected",
                grep("_baseline$", names(df_reprex), value = TRUE))) %>%
  as.data.frame() %>%
  {cbind(df_reprex, .)}
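As a footnote, with the baseline and non_baseline vectors defined above, the same subtraction can also be written as a plain data-frame operation; a compact sketch, equivalent to the mapply versions:
df_new2 <- df_reprex
# data frames of equal shape subtract element-wise, column by column
df_new2[paste0(non_baseline, "_corrected")] <- df_reprex[non_baseline] - df_reprex[baseline]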
I have some data that looks like the following:
"Name","Length","Startpos","Endpos","ID","Start","End","Rev","Match"
"Name_1",140,0,138,"1729",11,112,0,1
"Name_2",132,0,103,"16383",23,232,0,1
"Name_3",102,0,100,"1729",22,226,1,1
"Name_4",112,0,130,"16383",99,992,1,1
"Name_5",132,0,79,"1729",81,820,1,1
"Name_6",112,0,163,"16383",81,820,0,1
"Name_7",123,0,164,"1729",54,542,1,1
"Name_8",123,0,65,"16383",28,289,0,1
I have used the order function to sort first by "ID" and then by "Start".
"Name","Length","Startpos","Endpos","ID","Start","End","Rev","Match"
"Name_1",140,0,138,"1729",11,112,0,1
"Name_3",102,0,100,"1729",22,226,1,1
"Name_7",123,0,164,"1729",54,542,1,1
"Name_5",132,0,79,"1729",81,820,1,1
"Name_2",132,0,103,"16383",23,232,0,1
"Name_8",123,0,65,"16383",28,289,0,1
…
Now I need to do two things:
First I need to create a table of consecutive pairs within each ID group. For a group in one ID containing the names (1,2,3,4,5), I need to create the pairs (12,23,34,45). So for the above example, the pairs would be (Name_1+Name_3, Name_3+Name_7, Name_7+Name_5).
My output for the above example, would look like this:
"Start_Name_X","Start_Name_Y","Length_Name_X","Length_Name_Y","Name_Name_X","Name_Name_Y","ID","New column"
11, 22, 140, 102, "Name_1", "Name_3", 1729,
22, 54, 102, 123, "Name_3", "Name_7", 1729,
54, 81, 123, 132, "Name_7", "Name_5", 1729,
23, 28, 132, 123, "Name_2", "Name_8", 16383,
…
So I need to create pairs through ascending "Start", but within each "ID".
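(For illustration only: the consecutive-pairs idea on a plain vector, using the same head()/tail() offset that the answer below applies per ID group.)
x <- c(1, 2, 3, 4, 5)
data.frame(first = head(x, -1), second = tail(x, -1))
#   first second
# 1     1      2
# 2     2      3
# 3     3      4
# 4     4      5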
I am thinking it should be done with a for loop, but I am a newbie: pulling the data into a new table with a for loop confuses me in itself, and I have no idea how to handle the constraint of doing it within each unique "ID".
I have experimented with splitting the data into groups according to ID using split, but it doesn't really get me further with creating the new data table.
I have created the new data table with the following code:
column_names = data.frame(Start_Name_X = "Start_Name_X",
                          Start_Name_Y = "Start_Name_Y", Length_Name_X = "Length_Name_X",
                          Length_Name_Y = "Length_Name_Y", Name_X = "Name_X", Name_Y = "Name_Y", ID = "ID",
                          New_Column = "New_Column")
write.table(column_names, file = "datatabel.csv", row.names = FALSE, append = FALSE,
            col.names = FALSE, sep = ",", quote = TRUE)
And this is the table I would like to write to.
Is a for loop the right way to handle this, and if so, can you give me a few clues on how to start?
It can be done with only one loop:
df <- read.table(sep = ",", header = TRUE, stringsAsFactors = FALSE,
text = "\"Name\",\"Length\",\"Startpos\",\"Endpos\",\"ID\",\"Start\",\"End\",\"Rev\",\"Match\"\n\"Name_1\",140,0,138,\"1729\",11,112,0,1\n\"Name_2\",132,0,103,\"16383\",23,232,0,1\n\"Name_3\",102,0,100,\"1729\",22,226,1,1\n\"Name_4\",112,0,130,\"16383\",99,992,1,1\n\"Name_5\",132,0,79,\"1729\",81,820,1,1\n\"Name_6\",112,0,163,\"16383\",81,820,0,1\n\"Name_7\",123,0,164,\"1729\",54,542,1,1\n\"Name_8\",123,0,65,\"16383\",28,289,0,1",
)
df <- df[order(df$ID, df$Start), ]
inds <- c("Name", "Start", "Length")
indsSorted <- c("Start_Name_X","Start_Name_Y","Length_Name_X","Length_Name_Y","Name_Name_X","Name_Name_Y","ID","New_Column")
out <- data.frame(matrix(nrow = 0, ncol = 8))
colnames(out) <- c("Start_Name_X","Start_Name_Y","Length_Name_X","Length_Name_Y","Name_Name_X","Name_Name_Y","ID","New_Column")
for (i in unique(df$ID)){
  dfID <- subset(df, ID == i)
  dfHead <- head(dfID, n = nrow(dfID) - 1)[, inds]
  colnames(dfHead) <- paste0(colnames(dfHead), "_Name_X")
  dfTail <- tail(dfID, n = nrow(dfID) - 1)[, inds]
  colnames(dfTail) <- paste0(colnames(dfTail), "_Name_Y")
  out <- rbind(out, cbind(dfHead, dfTail, ID = i, New_Column = '', stringsAsFactors = FALSE)[, indsSorted])
}
out
This will probably be horribly slow if the input is large. It can be optimized, but I didn't bother since using data.table is probably much quicker.
library(data.table)
dt <- data.table(df, key = "ID,Start")
fn <- function(dtIn, id){
  dtHead <- head(dtIn, n = nrow(dtIn) - 1)
  setnames(dtHead, paste0(colnames(dtHead), "_Name_X"))
  dtTail <- tail(dtIn, n = nrow(dtIn) - 1)
  setnames(dtTail, paste0(colnames(dtTail), "_Name_Y"))
  cbind(dtHead, dtTail, ID = id, New_Column = '')
}
out2 <- dt[, fn(.SD, ID), by = ID, .SDcols = c("Name", "Start", "Length")]
out2 <- as.data.frame(out2[, indsSorted, with = FALSE])
Rownames are different but otherwise the results are identical. The function used can probably be optimized as well.
rownames(out) <- NULL
rownames(out2) <- NULL
identical(out, out2)