Generate data frame length (and column data) from function - r

I want to generate a data frame with a random length.
> head(df)
"id" "age"
53 12 # randomly chosen data from fn1(){} and fn2(){}
146 31 #
343 22 #
...#randomly generated length from sample(50:5000,1)
The problem is that the approach I'm trying just repeats the same element over and over:
# This just repeats the same value instead of generating function over and over
a <- fn1(){}
rep(a,15)
[1] "S" "S" "S" "S" "S" "S" "S" ...
Ideally I want to specify the column names and fill each column with values from other functions:
# Generate length of data frame
df.length <- sample(50:500,1)
# Generate data for each row from function
df.column.id <- fn1(){}
df.column.age <- fn2(){}
...
df <- data.frame("id" = df.column.id, "age" = df.column.age, ...)
Unfortunately the rep function isn't doing what I want. So how can the data frame columns be generated from functions? I also tried matrix(data = c(df.column.id, df.column.age), nrow = df.length), but that didn't work as intended either.
Edit:
replicate(10, RandomStatusColor(), simplify = "vector") is working to generate a vector of the function outputs.
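Building on that edit, a complete sketch could look like this (fn1() and fn2() here are hypothetical stand-ins for the unspecified generator functions in the question):
fn1 <- function() sample(50:500, 1)  # hypothetical id generator
fn2 <- function() sample(1:99, 1)    # hypothetical age generator
df.length <- sample(50:500, 1)
df <- data.frame(id  = replicate(df.length, fn1()),
                 age = replicate(df.length, fn2()))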

Maybe something like this could help:
min_rownum <- 10
max_rownum <- 50
num_of_rows <- sample(seq(min_rownum, max_rownum), 1)
min_age <- 1
max_age <- 50
age <- sample(seq(min_age, max_age), num_of_rows, replace = TRUE)
min_ID <- 50
max_ID <- 500
id <- sample(seq(min_ID, max_ID), num_of_rows)
df1 <- data.frame(id, age)
I tried to use variable names that would make the code self-explanatory.
The parameter replace = TRUE in the sample() function means that an element can be selected more than once. In the case of ages this is plausible, whereas IDs should be unique. The second argument of sample() defines how many elements should be chosen from the vector that is passed as a first argument.
The title of the question suggests that the data.frame should be generated by a function. In that case the above code can be wrapped into a function like this:
make_random_df <- function(min_rownum = 10, max_rownum = 50, min_age = 1, max_age = 50,
                           min_ID = 50, max_ID = 500) {
  num_of_rows <- sample(seq(min_rownum, max_rownum), 1)
  age <- sample(seq(min_age, max_age), num_of_rows, replace = TRUE)
  id <- sample(seq(min_ID, max_ID), num_of_rows)
  data.frame(id, age)  # return the data frame explicitly
}
Using this function, the data.frame can be created with
my_random_df <- make_random_df()
#> head(my_random_df)
# id age
#1 461 7
#2 86 44
#3 319 8
#4 363 45
#5 59 3
#6 258 49

Here's a function that generates data samples of a given length (len) from a given vector (vec):
createData <- function(vec, len) {
  sample(vec, len, replace = TRUE)
}
nobs <- 20
df <- data.frame(id = createData(vec = c("a", "b", "c"), len = nobs),
                 age = createData(vec = seq(10, 50, 10), len = nobs))
df
Is this what you are after?
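To add the random length from the original question, nobs itself can be drawn with sample() (a sketch reusing createData from above; the value ranges are illustrative):
nobs <- sample(50:500, 1)  # random number of rows, as in the question
df <- data.frame(id = createData(vec = 50:500, len = nobs),
                 age = createData(vec = 1:99, len = nobs))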

Related

How to count missing values from two columns in R

I have a data frame which looks like this
Contig_A    Contig_B
Contig_0    Contig_1
Contig_3    Contig_5
Contig_4    Contig_1
Contig_9    Contig_0
I want to count how many contig ids (from Contig_0 to Contig_1193) are not present in either the Contig_A or the Contig_B column.
For example, if we assume there are 10 contigs in total for this data frame (Contig_0 to Contig_9), the answer would be 4 (Contig_2, Contig_6, Contig_7, Contig_8).
Create a vector of all the values that you want to check (all_contig) which is Contig_0 to Contig_10 here. Use setdiff to find the absent values and length to get the count of missing values.
cols <- c('Contig_A', 'Contig_B')
#If there are lot of 'Contig' columns that you want to consider
#cols <- grep('Contig', names(df), value = TRUE)
all_contig <- paste0('Contig_', 0:10)
missing_contig <- setdiff(all_contig, unlist(df[cols]))
#[1] "Contig_2" "Contig_6" "Contig_7" "Contig_8" "Contig_10"
count_missing <- length(missing_contig)
#[1] 5
Using match:
x <- 0:9
contigs <- sapply(x, function(t) paste0("Contig_", t))
df1 <- data.frame(
  Contig_A = c("Contig_0", "Contig_3", "Contig_4", "Contig_9"),
  Contig_B = c("Contig_1", "Contig_5", "Contig_1", "Contig_0")
)
xx <- c(df1$Contig_A, df1$Contig_B)
contigs[is.na(match(contigs, xx))]
[1] "Contig_2" "Contig_6" "Contig_7" "Contig_8"
In your case, just change x to x <- 0:1193.
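As a side note, paste0() is vectorized, so the sapply() call is not strictly needed, and %in% gives an equivalent to the match()/is.na() line:
contigs <- paste0("Contig_", 0:9)  # paste0 is vectorized, no sapply needed
contigs[!contigs %in% xx]          # same result as the match()-based line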

How to efficiently store and retrieve parameters/arguments used during data processing in lists

I work with a large count table and for my analyses it is usually required to split this table into subsets based on observations, variables, values or context information.
# generating toy data
count_df1 <- data.frame(
  column1 = 1:50,
  column2 = runif(50, 1, 10),
  column3 = runif(50, 1, 10)
)
count_df2 <- data.frame(
  column1 = 1:50,
  column2 = runif(50, 1.5, 9),
  column3 = runif(50, 1.5, 9)
)
list_count_df <- list(count_df1 = count_df1, count_df2 = count_df2)
I learned to use lists and for loops to process all resulting subsets in the same manner. I use for loops rather than apply because I rely on the object names (with the help of counters) to keep track of how I modified them, and I don't know how to do that with e.g. lapply.
# set values to iterate over
thresholds <- c(2, 4)
conditions <- c(TRUE, FALSE)
# perform some kind of subsetting and store the parameters used
output_list <- list()
counter <- 0
for (current_threshold in thresholds) {
  for (count_df in list_count_df) {
    counter <- counter + 1
    # modify the name to keep track of changes
    current_name <- paste(names(list_count_df)[counter], current_threshold, sep = "_")
    output_list[[current_name]] <- subset(count_df, column2 < current_threshold)
  }
  counter <- 0
}
Additionally, the time-consuming part is usually the main function in the loop body, so the reduced overhead of apply would probably not save much time (I'm still open to it, though).
After preparing the various subsets and subjecting them to the analysis, I need to store the analysis results and the accompanying parameters for the different subsets. That is probably a common task.
# allocate for df to store the results
result_length <- length(output_list) * length(conditions)
df_headers <- c("Names", "Threshold", "Input_table", "Standard_deviation", "Scaling")
df_results <- setNames(data.frame(matrix(ncol = length(df_headers),
nrow = result_length)), df_headers)
# perform some analyses (here: PCA) on the dfs while looping over
# analysis parameters and storing some results directly
iii <- 0
table_counter <- 0
for (item in output_list) {
  table_counter <- table_counter + 1
  for (condition in conditions) {
    iii <- iii + 1
    current_name <- paste(names(output_list)[table_counter], condition, sep = "_")
    tmp <- prcomp(item, scale = condition)
    # let's pretend we are only interested in the standard deviation per item
    df_results[iii, 1] <- current_name
    df_results[iii, 4] <- tmp$sdev[1]
    rm(tmp)
  }
}
However, I'm partly doing this by extracting parts of the object names, which is highly repetitive, very ad hoc, and has to be changed for each additional step included beforehand. As I want to start my own package soon, this is not something another user could easily follow.
# extract more values from the name of the former object
df_results$Threshold <- as.numeric(sapply(strsplit(as.character(df_results$Names), '_'), "[", 3))
df_results$Input_table <- as.factor(sapply(strsplit(as.character(df_results$Names), '_'), "[", 2))
df_results$Scaling <- as.factor(sapply(strsplit(as.character(df_results$Names), '_'), "[", 4))
df_results
# now I could reshape this into long format, do plotting, etc.
I provided a short example of what such a workflow could look like. My questions are:
1) What are general good practices for storing the parameters used during processing and for extracting them again afterwards?
2) If the solution is too case-specific for a general approach:
a) any ideas what to change here?
b) Are lists and/or for loops the way to go at all?
I do it this way because modifying names within lapply is unclear to me, and without the names I lose track of what is what. I also would not know how to efficiently handle all these different subsets in one big data.frame.
Please consider that my original data contains numerical, factor and character columns with hundreds of rows/observations and tens of thousands of columns/variables.
Honestly, there are many ways to do this, and it will come down to personal preference. One common way would be to define a class object that sets the standard for how you access information on it. Creating a class also means you can write S3 methods, which gives more flexibility in how you generate your class depending on whether you are working with a list, a data.frame, or just a vector.
generate_foo <- function(x, ...){
  UseMethod("generate_foo")
}
generate_foo.default <- function(x, current_threshold, conditions, name = NULL){
  if(is.null(name)){
    name <- as.character(substitute(x))
  }
  x <- x[x[["column2"]] < current_threshold, ]
  tmp <- tryCatch({prcomp(x, scale = conditions)},
                  error = function(er){return("Error")})
  retval <- list(list(subset = x,
                      pcaObj = tmp, # could store the entire object or just the parts you care about
                      subsetparam = current_threshold,
                      condition = conditions,
                      name = name))
  class(retval) <- "foo"
  return(retval)
}
generate_foo.list <- function(x,
                              current_threshold,
                              conditions, name = NULL){
  if(is.null(name) || length(name) != length(x)){
    name <- names(x)
  }
  # Generate all the possible combination indexes at once
  # (separate() comes from the tidyr package)
  combi <- tidyr::separate(
    data.frame(
      indx = levels(suppressWarnings(interaction(1:length(x),
                                                 1:length(current_threshold),
                                                 1:length(conditions))))),
    col = "indx", into = c("df", "thresh", "cond"), sep = "\\.")
  x <- x[as.numeric(combi$df)]
  name <- name[as.numeric(combi$df)]
  current_threshold <- current_threshold[as.numeric(combi$thresh)]
  conditions <- conditions[as.numeric(combi$cond)]
  foolist <- mapply(FUN = generate_foo.default,
                    x = x,
                    current_threshold = current_threshold,
                    conditions = conditions,
                    name = name)
  class(foolist) <- "foolist"
  return(foolist)
}
With this method when you call:
foo <- generate_foo(x = list_count_df,
                    current_threshold = thresholds,
                    conditions = conditions,
                    name = c("Custname1", "Custname2"))
You will end up with a list of objects of class "foo". In this case the resulting object has length 8, each element containing 5 components: subset, pcaObj, subsetparam, condition, and name. Since prcomp() can throw an error when the subset is too small, the tryCatch call keeps the code from failing.
Take it a step further by writing custom print and summary methods!
#summary
summary.foolist <- function(x){
  subsetdim <- unlist(lapply(x, function(y){prod(dim(y[["subset"]]))}))
  pcasdev <- unlist(lapply(x, function(y){y[["pcaObj"]]$sdev[1]}))
  subsetparam <- unlist(lapply(x, function(y){y[["subsetparam"]]}))
  condition <- unlist(lapply(x, function(y){y[["condition"]]}))
  name <- unlist(lapply(x, function(y){y[["name"]]}))
  df <- data.frame(SubsetDim = subsetdim, PCAsdev = pcasdev,
                   SubsetParam = subsetparam, condition = condition, name = name)
  return(df)
}
summary(foo)
SubsetDim PCAsdev SubsetParam condition name
1 24 1.207833 2 TRUE Custname1
2 6 1.732051 2 TRUE Custname2
3 54 1.324284 4 TRUE Custname1
4 33 1.372508 4 TRUE Custname2
5 24 16.258848 2 FALSE Custname1
6 6 12.024556 2 FALSE Custname2
7 54 15.592938 4 FALSE Custname1
8 33 14.057929 4 FALSE Custname2
Using a convention like this ensures your data is stored in a canonical way. Of course there are many ways you can choose to build your custom R class and object.
You could make one function that makes a list of subsetted dataframes and set that as one class. Then make another function that performs the analysis and generates a new class object. As long as you stick to building a named list, then accessing parts of the object become easier because they are organized.
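For symmetry, a minimal print method could look like this (a sketch assuming the same element names as in summary.foolist above):
print.foolist <- function(x, ...){
  cat("foolist with", length(x), "analyses\n")
  for (y in x) {
    cat(" -", y[["name"]], "| threshold:", y[["subsetparam"]],
        "| scaled:", y[["condition"]], "\n")
  }
  invisible(x)
}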
Functional solution
0. Generate source data frame
# for reproducibility of random tasks
set.seed(1)
df <- data.frame(
  col1 = 1:100,
  col2 = c(runif(50, 1, 10), runif(50, 11, 20)),
  col3 = c(runif(50, 1, 10), runif(50, 11, 20))
)
# so half of the rows have numbers 1 to 10 in col2 and col3
# and the other half have 11 to 20 in col2 and col3.
# let's randomize the order of the rows
df <- df[sample(1:100), ]
# and take this data frame `df` as the source data frame
# from which we will do the analysis.
1. Problem description
We want to subdivide the original df into sub data frames
by applying 2 different criteria.
Then we analyze each sub data frame
using all possible combinations of 2 different parameters,
and finally we collect all analysis values and aggregate them into a data frame.
The criteria:
criterium1: if the col2 value is <= 10, the row is assigned "df1", otherwise "df2".
Categories: c("df1", "df2").
criterium2: if the col3 value is below the first limit, the row is assigned 'class5';
if the col3 value is > the first limit but <= the second limit, assign 'class15';
other cases don't interest us, so let's assign 'other'.
Categories: c("class5", "class15", "other") # each row will fall into
exactly one of them
For each combination of the two criteria we want a sub data frame of its own
on which the analysis should be done.
The parameters for the analysis:
parameter1: 'scale.=' c(TRUE, FALSE)
parameter_categories c("scpos", "scneg")
parameter2: 'center=' c(TRUE, FALSE)
parameter_categories c("cpos", "cneg")
The analysis result values:
For each combination of the two parameters we want a report of its own with the values
for the 'Standard deviation':
3 stddev columns for PC1, PC2, PC3.
Additional information to be collected:
we want a distinguishable (unique) name for each combination.
2. What the entire analysis will look like:
# 0. categorize and split data frame
categories1 <- c("df1", "df2")[cut(df[, "col2"], c(1, 11, 20, Inf))]
categories2 <- c("class5", "class15", "other")[cut(df[, "col3"], c(-Inf, 5, 15, Inf))]
dfs <- split(df, gsub("class", "", paste(categories1, categories2, sep="_")))
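# Sanity check: the categories above yield four sub data frames,
# whose names reappear in the df/limits columns of df_final below:
names(dfs) # "df1_15", "df1_5", "df2_15", "df2_other"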
# 1. Declare parameters and prepare all parameter combinations
parameters1 <- list("scale." = TRUE, "scale."=FALSE)
np1 <- c("scpos", "scneg")
parameters2 <- list("center"=TRUE, "center"=FALSE)
np2 <- c("cpos", "cneg")
params_list <- named_cross_combine(parameters1, parameters2, np1, np2, sep="_")
# 2. Apply analysis over all sub dfs and parameter combinations
# and extract and aggregate the analysis results into a final data frame
df_final <- apply_extract_aggravate(
  dfs = dfs,
  params = params_list,
  analyzer_func = prcomp,
  extractor_func = function(x) x$sdev, # extractor must return a vector
  col_names = c("df", "limits", "scale", "center", "std_PC1", "std_PC2", "std_PC3"),
  sep = "_" # separator for names
)
# 3. rename parameter column contents
df_final$scale <- unlist(lookup(df_final$scale, np1, parameters1))
df_final$center <- unlist(lookup(df_final$center, np2, parameters2))
df_final:
df limits scale center std_PC1 std_PC2 std_PC3
df1_15_scpos_cpos df1 15 TRUE TRUE 1.205986 0.9554013 0.7954906
df1_15_scpos_cneg df1 15 TRUE FALSE 1.638142 0.5159250 0.2243043
df1_15_scneg_cpos df1 15 FALSE TRUE 15.618145 2.4501942 1.3687843
df1_15_scneg_cneg df1 15 FALSE FALSE 31.425246 5.9055013 1.7178626
df1_5_scpos_cpos df1 5 TRUE TRUE 1.128371 1.0732246 0.7582659
df1_5_scpos_cneg df1 5 TRUE FALSE 1.613217 0.4782639 0.4108470
df1_5_scneg_cpos df1 5 FALSE TRUE 13.525868 2.5524661 0.9894493
df1_5_scneg_cneg df1 5 FALSE FALSE 30.007511 3.9094993 1.6020638
df2_15_scpos_cpos df2 15 TRUE TRUE 1.129298 1.0069030 0.8431092
df2_15_scpos_cneg df2 15 TRUE FALSE 1.720909 0.1523516 0.1235295
df2_15_scneg_cpos df2 15 FALSE TRUE 14.061532 2.4172787 1.2348606
df2_15_scneg_cneg df2 15 FALSE FALSE 80.543382 3.8409639 1.8480111
df2_other_scpos_cpos df2 other TRUE TRUE 1.090057 0.9588241 0.9446865
df2_other_scpos_cneg df2 other TRUE FALSE 1.718190 0.1881516 0.1114570
df2_other_scneg_cpos df2 other FALSE TRUE 15.168160 2.5579403 1.3354016
df2_other_scneg_cneg df2 other FALSE FALSE 82.297724 5.0580949 1.9356444
3. Explanation step by step
3.1 Declare Helper functions
# for preparing parameter combinations as lists
named_cross_combine <- function(seq1, seq2, seq1_names, seq2_names, sep = "_") {
  res <- list()
  i <- 1
  namevec <- c()
  for (j1 in seq_along(seq1)) {
    for (j2 in seq_along(seq2)) {
      res[[i]] <- c(seq1[j1], seq2[j2])
      namevec[i] <- paste0(seq1_names[j1], sep, seq2_names[j2])
      i <- i + 1
    }
  }
  names(res) <- namevec
  res
}
# correctly named params list - `sep=` determines how the names are joined
# you can apply `gsub()` on the namevec before assignment to adjust the names further.
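# A quick sanity check with arbitrary toy values shows the naming scheme:
demo <- named_cross_combine(list(a = 1, a = 2), list(b = TRUE, b = FALSE),
                            c("a1", "a2"), c("bT", "bF"))
names(demo) # "a1_bT" "a1_bF" "a2_bT" "a2_bF"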
# useful for doing the analysis
do.call2 <- function(fun, x, rest) {
  do.call(fun, c(list(x), rest))
}
apply_parameters <- function(funcname,
                             dfs,
                             params) {
  lapply(dfs, function(df) lapply(params, function(pl) do.call2(funcname, df, pl)))
}
split_names_to_data_frame <- function(names_vec, sep) {
  res <- lapply(names_vec, function(s) strsplit(s, sep)[[1]])
  df <- Reduce(rbind, res)
  # colnames(df) <- col_names
  rownames(df) <- names_vec
  df
}
apply_to_subdf_and_combine <- function(
    res_list,
    accessor_func = function(x) x,             # subdf result
    subdf_level_combiner_func = as.data.frame, # within subdf result
    combine_prepare_func = function(x) x,      # applied on each subdf result
    final_combiner_func = rbind,               # combine the results
    col_names = NULL,                          # column names for final
    sep = "_") {                               # joiner for names
  res_accessed_combined <- lapply(res_list,
                                  function(x) do.call(what = subdf_level_combiner_func,
                                                      list(lapply(x, accessor_func))))
  res_prepared <- lapply(res_accessed_combined, combine_prepare_func)
  res_df <- Reduce(final_combiner_func, res_prepared)
  rownames(res_df) <- paste(unlist(sapply(names(res_prepared), rep, nrow(res_prepared[[1]]))),
                            unlist(sapply(res_prepared, rownames)),
                            sep = sep)
  names_df <- split_names_to_data_frame(rownames(res_df), sep = sep)
  final_df <- as.data.frame(cbind(names_df, res_df))
  if (!is.null(col_names)) {
    colnames(final_df) <- col_names
  }
  final_df
}
# for simplifying the function call
extract_and_combine <- function(res_list,
                                result_extractor_func,
                                col_names,
                                sep = "_") {
  apply_to_subdf_and_combine(
    res_list = res_list,
    accessor_func = result_extractor_func,
    subdf_level_combiner_func = as.data.frame,
    combine_prepare_func = function(x) as.data.frame(t(x)),
    final_combiner_func = rbind,
    col_names = col_names,
    sep = sep
  )
}
# for simplifying the function call even more
apply_extract_aggravate <- function(dfs,
                                    params,
                                    analyzer_func,
                                    extractor_func,
                                    col_names,
                                    sep = "_") {
  extract_and_combine(
    res_list = apply_parameters(funcname = analyzer_func, dfs = dfs, params = params),
    result_extractor_func = extractor_func,
    col_names = col_names,
    sep = sep
  )
}
# useful for renaming the data frame columns values
lookup <- function(x, seq1, seq2) {
  seq2[sapply(x, function(x) which(x == seq1))]
}
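For example, lookup() maps the short name labels back to the original parameter values:
lookup(c("scpos", "scneg"), np1, parameters1)
# returns list(TRUE, FALSE) - the parameter values behind the labels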
3.2 Categorize and split data frame
categories1 <- c("df1", "df2")[cut(df[, "col2"], c(1, 11, 20, Inf))]
categories2 <- c("5", "15", "other")[cut(df[, "col3"], c(-Inf, 5, 15, Inf))]
dfs <- split(df, gsub("class", "", paste(categories1, categories2, sep="_")))
But to have full control over categorization, you can
declare your own categorizer functions and categorize and
split the data frame:
# write the rules for criterium1 as a function of a single element
categorizer1 <- function(x) {
  if (1 <= x && x <= 10) {
    "df1"
  } else if (11 <= x && x <= 20) {
    "df2"
  }
}
# vectorize it to be able to apply it to entire columns
categorizer1 <- Vectorize(categorizer1)
# do the same for criterium2
categorizer2 <- function(x) {
  if (x <= 5) {
    "class5"
  } else if (5 < x && x <= 15) {
    "class15"
  } else {
    "other"
  }
}
categorizer2 <- Vectorize(categorizer2)
# apply on col2 and col3 the corresponding categorizers
categories1 <- categorizer1(df[,"col2"])
categories2 <- categorizer2(df[,"col3"])
# get the list of sub data frames according to categories
dfs <- split(df, gsub("class", "", paste(categories1, categories2, sep="_")))
# Let the categorizer functions return strings and
# use `paste()` with `sep=` for the second argument to determine
# how the names should be combined - here with "_".
# Use `gsub(pattern, replacement, x, ignore.case=F, perl=T)`
# to process the names with regex patterns until they are how you want them.
# Here, we remove the bulky "class".
3.3 Declare the parameters as lists, together with their corresponding name vectors (npN)
parameters1 <- list("scale." = TRUE, "scale."=FALSE)
np1 <- c("scpos", "scneg")
parameters2 <- list("center"=TRUE, "center"=FALSE)
np2 <- c("cpos", "cneg")
# prepare all combinations of them in a list of lists
params_list <- named_cross_combine(parameters1, parameters2, np1, np2, sep="_")
# this produces a list of all possible parameter combination lists.
# Each parameter combination has to be kept in a list itself, because
# `do.call()` later requires the parameters to be in a list.
# `named_cross_combine()` takes care of the correct naming,
# joining the names using the `sep` value.
# The first element of `parameters1` is paired with each element of
# `parameters2`, then the second element of `parameters1` with all of `parameters2`, etc.
3.4 Apply all parameters over dfs and collect the results into a data frame
df_final <- apply_extract_aggravate(
  dfs = dfs,
  params = params_list,
  analyzer_func = prcomp,
  extractor_func = function(x) x$sdev, # extractor must return a vector
  col_names = c("df", "limits", "scale", "center", "std_PC1", "std_PC2", "std_PC3"),
  sep = "_" # separator for names
)
# This function takes the dfs and the parameters list and runs
# analyzer_func, here `prcomp()`, over all combinations of both.
# The `extractor_func` must be chosen so that the returned result is a vector.
# If it is already a vector, use the identity function `function(x) x` here.
# The column names give new names to the resulting columns.
# The number of names needed is determined by:
# - the number of categoriesN,
# - the number of parametersN,
# - the number of elements of the result after extractor_func() is applied.
# `sep=` determines which joiner is used for the names.
3.5 Finally, rename parameter columns' contents by using lookup() + previously declared parameter lists (parametersN) with their corresponding name vectors (npN)
df_final$scale <- unlist(lookup(df_final$scale, np1, parameters1))
df_final$center <- unlist(lookup(df_final$center, np2, parameters2))
# Two parameter columns, so two commands.
This converts df_final from this:
# df limits scale center std_PC1 std_PC2 std_PC3
# df1_15_scpos_cpos df1 15 scpos cpos 1.205986 0.9554013 0.7954906
# df1_15_scpos_cneg df1 15 scpos cneg 1.638142 0.5159250 0.2243043
# df1_15_scneg_cpos df1 15 scneg cpos 15.618145 2.4501942 1.3687843
# df1_15_scneg_cneg df1 15 scneg cneg 31.425246 5.9055013 1.7178626
# df1_5_scpos_cpos df1 5 scpos cpos 1.128371 1.0732246 0.7582659
# df1_5_scpos_cneg df1 5 scpos cneg 1.613217 0.4782639 0.4108470
# df1_5_scneg_cpos df1 5 scneg cpos 13.525868 2.5524661 0.9894493
# df1_5_scneg_cneg df1 5 scneg cneg 30.007511 3.9094993 1.6020638
# df2_15_scpos_cpos df2 15 scpos cpos 1.129298 1.0069030 0.8431092
# df2_15_scpos_cneg df2 15 scpos cneg 1.720909 0.1523516 0.1235295
# df2_15_scneg_cpos df2 15 scneg cpos 14.061532 2.4172787 1.2348606
# df2_15_scneg_cneg df2 15 scneg cneg 80.543382 3.8409639 1.8480111
# df2_other_scpos_cpos df2 other scpos cpos 1.090057 0.9588241 0.9446865
# df2_other_scpos_cneg df2 other scpos cneg 1.718190 0.1881516 0.1114570
# df2_other_scneg_cpos df2 other scneg cpos 15.168160 2.5579403 1.3354016
# df2_other_scneg_cneg df2 other scneg cneg 82.297724 5.0580949 1.9356444
to this:
df limits scale center std_PC1 std_PC2 std_PC3
df1_15_scpos_cpos df1 15 TRUE TRUE 1.205986 0.9554013 0.7954906
df1_15_scpos_cneg df1 15 TRUE FALSE 1.638142 0.5159250 0.2243043
df1_15_scneg_cpos df1 15 FALSE TRUE 15.618145 2.4501942 1.3687843
df1_15_scneg_cneg df1 15 FALSE FALSE 31.425246 5.9055013 1.7178626
df1_5_scpos_cpos df1 5 TRUE TRUE 1.128371 1.0732246 0.7582659
df1_5_scpos_cneg df1 5 TRUE FALSE 1.613217 0.4782639 0.4108470
df1_5_scneg_cpos df1 5 FALSE TRUE 13.525868 2.5524661 0.9894493
df1_5_scneg_cneg df1 5 FALSE FALSE 30.007511 3.9094993 1.6020638
df2_15_scpos_cpos df2 15 TRUE TRUE 1.129298 1.0069030 0.8431092
df2_15_scpos_cneg df2 15 TRUE FALSE 1.720909 0.1523516 0.1235295
df2_15_scneg_cpos df2 15 FALSE TRUE 14.061532 2.4172787 1.2348606
df2_15_scneg_cneg df2 15 FALSE FALSE 80.543382 3.8409639 1.8480111
df2_other_scpos_cpos df2 other TRUE TRUE 1.090057 0.9588241 0.9446865
df2_other_scpos_cneg df2 other TRUE FALSE 1.718190 0.1881516 0.1114570
df2_other_scneg_cpos df2 other FALSE TRUE 15.168160 2.5579403 1.3354016
df2_other_scneg_cneg df2 other FALSE FALSE 82.297724 5.0580949 1.9356444
4. Final remarks
This is not very different from your approach. All information is collected in the names, and the names are used for generating the part of the data frame that explains the background of the analysis data.
The lookup() function is very useful for renaming the parameter columns' contents.
The categorization of a column can be greatly simplified by the cut() function, but with cut() you don't have full control over
whether the upper/lower limit is included (<=) or not (<).
That is why declaring your own categorizer functions can sometimes be an advantage (especially for more complex categorizations).
Extensibility
More categories: just define more category vectors: categories1, categories2, categories3, ...
# then do
dfs <- split(df, paste(categories1, categories2, categories3, ..., sep="_"))
# use `gsub()` around `paste()` or do
# names(dfs) <- gsub("search_term", "replace_term", names(dfs)) - over and over again
# until all names are as they should be.
More parameters: Just define more parametersN - npN pairs.
# then do
params_list <- named_cross_combine(parameters1, parameters2, np1, np2, sep="_")
params_list <- named_cross_combine(params_list, parameters3, names(params_list), np3, sep="_")
params_list <- named_cross_combine(params_list, parameters4, names(params_list), np4, sep="_")
... (and so on ...)
# then, at the end, use more lines for renaming the parameter columns' contents:
df_final[, prmcol_name1] <- unlist(lookup(df_final[, prmcol_name1], np1, parameters1))
df_final[, prmcol_name2] <- unlist(lookup(df_final[, prmcol_name2], np2, parameters2))
df_final[, prmcol_name3] <- unlist(lookup(df_final[, prmcol_name3], np3, parameters3))
... (and so on ...)
Thus, the number of categories and parameters is easily extensible.
The core helper functions stay the same and don't have to be modified.
(The use of higher-order functions (functions which take functions as arguments) as helpers is key to this flexibility - one of the strengths of functional programming.)

Iterate a data frame containing lists of column numbers, of different lengths, with a function in R

I have a data frame (df) of survey responses about human values with 57 columns/variables of numerical/scale responses. Each column belongs to one of ten categories, and they're not in contiguous groups.
I have a second dataframe (scoretable) that associates the categories with the column numbers for the variables; the lists of column numbers are all different lengths:
scoretable <- data.frame(
  valuename =
    c("Conformity", "Tradition", "Benevolence", "Universalism", "Self-Direction",
      "Stimulation", "Hedonism", "Achievement", "Power", "Security"),
  valuevars = I(list(c(11,20,40,47),            # Conformity
                     c(18,32,36,44,51),         # Tradition
                     c(33,45,49,52,54),         # Benevolence
                     c(1,17,24,26,29,30,35,38), # Universalism
                     c(5,16,31,41,53),          # Self-Direction
                     c(9,25,37),                # Stimulation
                     c(4,50,57),                # Hedonism
                     c(34,39,43,55),            # Achievement
                     c(3,12,27,46),             # Power
                     c(8,13,15,22,56))),        # Security
  stringsAsFactors = FALSE)
I would like to iterate through scoretable with a function, valuescore, that calculates the mean and sd of all responses in that group of columns in data frame df and write the result to a third table of results:
valuescore = function(df, scoretable, valueresults){
  valuename = scoretable[, 1]
  set <- df[, scoretable[, 2]]
  setmeans <- colMeans(set, na.rm = TRUE)
  valuemean <- mean(setmeans)
  setvars <- apply(set, 2, var)
  valuesd <- sqrt(mean(setvars))
  rbind(valueresults, c(valuename, valuemean, valuesd))
}
a <- nrow(scoretable)
for(i in 1:a){
  valuescore(df, scoretable[i, ], valueresults)
}
I am very new to R and programming in general (this is my first question here), and I'm struggling to determine how to pass list variables to functions and/or as address ranges for data frames.
Let's create an example data.frame:
library(magrittr) # provides the %>% pipe
df <- replicate(57, rnorm(10, 50, 20)) %>% as.data.frame()
Let's prepare the table result format:
valueresults <- data.frame(
  name = scoretable$valuename,
  mean = 0
)
Now, a loop over the values of scoretable, a mean calculation by column, and then the mean of the means. It is brute-force (the answer using Map is more elegant), but maybe it is easier to understand for an R beginner.
for(v in 1:nrow(scoretable)){
  # let's suppose v = 1, "Conformity"
  columns_id <- scoretable$valuevars[[v]]
  # isolate the columns that correspond to 'Conformity'
  temp_df <- df[, columns_id]
  # mean of the values of these columns
  temp_means <- apply(temp_df, 2, mean)
  mean <- mean(temp_means)
  # save the result in the prepared table
  valueresults$mean[v] <- mean
}
> (valueresults)
name mean
1 Conformity 45.75407
2 Tradition 52.76935
3 Benevolence 50.81724
4 Universalism 51.04970
5 Self-Direction 55.43723
6 Stimulation 52.15962
7 Hedonism 53.17395
8 Achievement 47.77570
9 Power 52.61731
10 Security 54.07066
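The question also asks for a standard deviation; following the pooling logic of the asker's valuescore (the square root of the mean per-column variance), the loop can be extended like this (a sketch):
valueresults$sd <- 0
for(v in 1:nrow(scoretable)){
  temp_df <- df[, scoretable$valuevars[[v]]]
  # pooled sd: square root of the mean of the per-column variances
  valueresults$sd[v] <- sqrt(mean(apply(temp_df, 2, var, na.rm = TRUE)))
}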
Here is a way using Map to apply a function to the list scoretable[, 2].
First I will create a test df.
set.seed(1234)
m <- 100
n <- 57
df <- matrix(sample(10, m*n, TRUE), nrow = m, ncol = n)
df <- as.data.frame(df)
And now the function valuescore.
valuescore <- function(DF, scores){
  f <- function(inx) mean(as.matrix(DF[, inx]), na.rm = TRUE)
  res <- Map(f, scores[, 2])
  names(res) <- scores[[1]]
  res
}
valuescore(df, scoretable)
#$Conformity
#[1] 5.5225
#
#$Tradition
#[1] 5.626
#
#$Benevolence
#[1] 5.548
#
#$Universalism
#[1] 5.36125
#
#$`Self-Direction`
#[1] 5.494
#
#$Stimulation
#[1] 5.643333
#
#$Hedonism
#[1] 5.546667
#
#$Achievement
#[1] 5.3175
#
#$Power
#[1] 5.41
#
#$Security
#[1] 5.54
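If the standard deviation is needed as well, f can return a named vector instead of a single mean (a sketch using the same pooling as the asker's valuescore):
valuescore2 <- function(DF, scores){
  f <- function(inx) {
    set <- DF[, inx]
    c(mean = mean(as.matrix(set), na.rm = TRUE),
      sd = sqrt(mean(apply(set, 2, var, na.rm = TRUE))))
  }
  res <- Map(f, scores[, 2])
  names(res) <- scores[[1]]
  res
}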

wrong number of observations - nobs

I have just written my "complete" function (R Programming, Coursera), but something is not working as expected.
complete <- function(directory, id = 1:332){
  # create a list of files
  list_files <- list.files(directory, full.names = TRUE)
  # create an empty data frame
  dat <- data.frame()
  for(i in id){
    # read in the file
    temp <- read.csv(list_files[i], header = TRUE)
    # delete rows that do not have complete cases
    temp <- na.omit(temp)
    # count all of the rows with complete cases
    tNobs <- nrow(temp)
    # enumerate the complete cases by index
    dat <- rbind(dat, data.frame(i, tNobs))
  }
  return(dat)
}
When I ask:
cc <- complete("specdata", c(6, 10, 20, 34, 100, 200, 310))
print(cc$nobs)
it returns NULL. Why? It should return:
228 148 124 165 104 460 232
The issue is that the data.frame was created without assigning column names, so it takes the object names as the column names, i.e. tNobs for the second column.
i <- 2
tNobs <- 10
data.frame(i, tNobs)
#  i tNobs
#1 2    10
So when we ask for nobs, a column that doesn't exist in the data.frame, we get NULL. The fix is to assign the column names explicitly:
dat <- rbind(dat, data.frame(col1 = i, nobs = tNobs))
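With that one change inside complete(), the original call behaves as expected:
cc <- complete("specdata", c(6, 10, 20, 34, 100, 200, 310))
print(cc$nobs)
# now a numeric vector instead of NULL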

selecting n consequent grouped variables and apply the function in r

Here is example data:
myd <- data.frame(matrix(sample(c("AB", "BB", "AA"), 100*100,
                                replace = TRUE), ncol = 100))
variablenames <- paste(rep(paste("MR.", 1:10, sep = ""),
                           each = 10), 1:100, sep = ".")
names(myd) <- variablenames
Each variable belongs to a group; here we have ten groups. The group index for each variable in this data frame is as follows:
group <- rep(1:10, each = 10)
Variable names and groups:
data.frame (group, variablenames)
group variablenames
1 1 MR.1.1
2 1 MR.1.2
3 1 MR.1.3
4 1 MR.1.4
5 1 MR.1.5
6 1 MR.1.6
7 1 MR.1.7
8 1 MR.1.8
9 1 MR.1.9
10 1 MR.1.10
11 2 MR.2.11
<<<<<<<<<<<<<<<<<<<<<<<<
100 10 MR.10.100
Each group means that the following steps should be applied to that group of variables separately.
I have a longer function in my real workflow; the following is a short example:
A function considering two variables at a time:
myfun <- function(x1, x2) {
  out <- paste(x1, x2, sep = ":")
  # other steps to be performed here
  return(out)
}
# group 1
myfun (myd[,1], myd[,2]); myfun (myd[,3], myd[,4]); myfun (myd[,5], myd[,6]);
myfun (myd[,7], myd[,8]); myfun (myd[,9], myd[,10]);
# group 2
myfun(myd[,11], myd[,12]); myfun(myd[,13], myd[,14]); # ... and so on to group 10
In this way I need to walk over variables 1:10 (i.e. perform the above action in the first group), then 11:20 (the second group), and so on. Grouping is no problem in this case, as the number of variables in each group (10) is divisible by the number of variables considered at a time (2).
However, in the following example, where 3 variables are taken at a time, 10/3 leaves one variable over at the end of each group.
A function considering three variables at a time:
myfun <- function(x1, x2, x3) {
  out <- paste(x1, x2, x3, sep = ":")
  # other steps to be performed here
  return(out)
}
# for group 1
myfun(myd[,1], myd[,2], myd[,3])
myfun(myd[,4], myd[,5], myd[,6])
# As there is one variable left over before proceeding to the second group,
# the final call gets 1 extra variable
myfun(myd[,7], myd[,8], myd[,9], myd[,10])
# for group 2
myfun(myd[,11], myd[,12], myd[,13])
# ... and so on through all groups, to the end of the file.
I want to loop this process with a user-defined number n of variables considered at a time, where n may range from 1 to the number of variables in each group.
Edit: just an illustration to show the process (only groups 1 and 2 demonstrated as an example).
Create a function that will split your data up into appropriate lists, and apply whatever functions you want to your list.
This function will create your second grouping variable. (The first grouping variable (group) is provided in your question; if you change that value, you should also change DIM in the function below.)
myfun = function(LENGTH, DIM = 10) {
  PATTERN = rep(1:(DIM %/% LENGTH), each = LENGTH)
  c(PATTERN, rep(max(PATTERN), DIM %% LENGTH))
}
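For example, with the default DIM = 10, myfun(3) folds the remainder column into the last group:
myfun(3)
# [1] 1 1 1 2 2 2 3 3 3 3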
Here are the groups on which we will split myd. In this example, we are splitting myd first into 10-column groups, and each group into 3-column groups, except for the last group, which will have 4 columns (3+3+4 = 10).
NOTE: To change the number of columns you're grouping by, for example, grouping by two variables at a time, change group2 = rep(myfun(3), length.out=100) to group2 = rep(myfun(2), length.out=100).
group <- rep(1:10, each = 10)
# CHANGE THE FOLLOWING LINE ACCORDING
# TO THE NUMBER OF GROUPS THAT YOU WANT
group2 = rep(myfun(3), length.out=100)
This is the splitting process. We first split up just by names, and match those names with myd to create a list of data.frames.
# Extract group names for matching purposes
temp = split(names(myd), list(group, group2))
# Match the names to myd
temp = lapply(1:length(temp),
function(x) myd[, which(names(myd) %in% temp[[x]])])
# Extract the names from the list for future reference
NAMES = lapply(temp, function(x) paste(names(x), collapse="_"))
Now that we have a list, we can do lots of fun things. You wanted to paste your columns together separated by a colon. Here's how you'd do that.
# Do what you want with the list
# For example, to paste the columns together:
FINAL = lapply(temp, function(x) apply(x, 1, paste, collapse=":"))
names(FINAL) = NAMES
Here's a sample of the output:
lapply(FINAL, function(x) head(x, 5))
# $MR.1.1_MR.1.2_MR.1.3
# [1] "AA:AB:AB" "AB:BB:AA" "BB:AB:AA" "BB:AA:AB" "AA:AA:AA"
#
# $MR.2.11_MR.2.12_MR.2.13
# [1] "BB:AA:AB" "BB:AB:BB" "BB:AA:AA" "AB:BB:AA" "BB:BB:AA"
#
# $MR.3.21_MR.3.22_MR.3.23
# [1] "AA:AB:BB" "BB:AA:AA" "AA:AB:BB" "AB:AA:AA" "AB:BB:BB"
#
# <<<<<<<------SNIP------>>>>>>>>
#
# $MR.1.4_MR.1.5_MR.1.6
# [1] "AB:BB:AA" "BB:BB:BB" "AA:AA:AA" "BB:BB:AB" "AB:AA:AA"
#
# $MR.2.14_MR.2.15_MR.2.16
# [1] "AA:BB:AB" "BB:BB:BB" "BB:BB:AB" "AA:BB:AB" "BB:BB:BB"
#
# $MR.3.24_MR.3.25_MR.3.26
# [1] "AA:AB:BB" "BB:AA:BB" "BB:AB:BB" "AA:AB:AA" "AB:AA:AA"
#
# <<<<<<<------SNIP------>>>>>>>>
#
# $MR.1.7_MR.1.8_MR.1.9_MR.1.10
# [1] "AB:AB:AA:AB" "AB:AA:BB:AA" "BB:BB:AA:AA" "AB:BB:AB:AA" "AB:BB:AB:BB"
#
# $MR.2.17_MR.2.18_MR.2.19_MR.2.20
# [1] "AB:AB:BB:BB" "AB:AB:BB:BB" "AB:AA:BB:BB" "AA:AA:AB:AA" "AB:AB:AB:AB"
#
# $MR.3.27_MR.3.28_MR.3.29_MR.3.30
# [1] "BB:BB:AB:BB" "BB:BB:AA:AA" "AA:BB:AB:AA" "AA:BB:AB:AA" "AA:AB:AA:BB"
#
# $MR.4.37_MR.4.38_MR.4.39_MR.4.40
# [1] "BB:BB:AB:AA" "AA:BB:AA:BB" "AA:AA:AA:AB" "AB:AA:BB:AB" "BB:BB:BB:BB"
#
# $MR.5.47_MR.5.48_MR.5.49_MR.5.50
# [1] "AB:AA:AA:AB" "AB:AA:BB:AA" "AB:BB:AA:AA" "AB:BB:BB:BB" "BB:AA:AB:AA"
#
# $MR.6.57_MR.6.58_MR.6.59_MR.6.60
# [1] "BB:BB:AB:AA" "BB:AB:BB:AA" "AA:AB:AB:BB" "BB:AB:AA:AB" "AB:AA:AB:BB"
#
# $MR.7.67_MR.7.68_MR.7.69_MR.7.70
# [1] "BB:AB:BB:AA" "BB:AB:BB:AA" "BB:AB:BB:AB" "AB:AA:AA:AA" "AA:AA:AA:AB"
#
# $MR.8.77_MR.8.78_MR.8.79_MR.8.80
# [1] "AA:AB:AA:AB" "AB:AA:AB:BB" "BB:BB:AA:AB" "AB:BB:BB:BB" "AB:AA:BB:AB"
#
# $MR.9.87_MR.9.88_MR.9.89_MR.9.90
# [1] "AA:BB:AB:AA" "AA:AB:BB:BB" "AA:BB:AA:BB" "AB:AB:AA:BB" "AB:AA:AB:BB"
#
# $MR.10.97_MR.10.98_MR.10.99_MR.10.100
# [1] "AB:AA:BB:AB" "AB:AA:AB:BB" "BB:AB:AA:AA" "BB:BB:AA:AA" "AB:AB:BB:AB"
I suggest recoding myfun to take a matrix and using pasteCols from the plotrix package.
library(plotrix)
myfun = function(x){
  out = pasteCols(t(x), sep = ":")
  # some code
  return(out)
}
Then it's very easy: for each group, compute the index of the first and last column you want to use in each call to myfun, using the modulus and integer division:
rubiques_solution = function(group, myd, num_to_group){
  # loop over groups
  for(g in unique(group)){
    var_index = which(group == g)
    num_var = length(var_index)
    # test to make sure num_to_group is not larger than the number of variables
    if(num_var < num_to_group){
      stop("num_to_group > number of variables in at least one group")
    }
    # number of calls to myfun
    num_calls = num_var %/% num_to_group
    # the idea here is that we create the first and last column
    # in which we are interested for each call
    first = seq(from = var_index[1], by = num_to_group, length = num_calls)
    last = first + num_to_group - 1
    # the last call will possibly contain more variables; we adjust here:
    last[length(last)] = last[length(last)] + (num_var %% num_to_group)
    for(i in seq_len(num_calls)){
      # maybe do something with the return value of myfun?
      myfun(myd[, first[i]:last[i]])
    }
  }
}
group = rep(1:10, each = 10) # same as in the question
myd = data.frame(matrix(sample(c("AB", "BB", "AA"), 100*100, replace = TRUE), ncol = 100)) # same as in the question
num_to_group = 2 # this is your first example
rubiques_solution(group, myd, num_to_group)
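To see the index arithmetic at work, here is the computation for group 1 with num_to_group = 3 (the case from the question, where the leftover column is folded into the final call):
var_index <- 1:10                    # columns of group 1
num_calls <- length(var_index) %/% 3 # 3 calls
first <- seq(from = var_index[1], by = 3, length = num_calls) # 1 4 7
last <- first + 3 - 1                                         # 3 6 9
last[length(last)] <- last[length(last)] + (length(var_index) %% 3) # 3 6 10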
Hope I understood the problem correctly.
