I need to create groups of numbers which summed up do not reach 34.
For example: I have an array x<-c(28,26,20,5,3,2,1) and I need to create the following groups: a=(28,5,1), b=(26,3,2), c=(20) because the sums of the groups a, b and c do not exceed 34.
Is it possible to implement this procedure in R?
If I understand correctly this is what you want to do:
create_groups <- function(input, threshold) {
input <- sort(input, decreasing = TRUE)
result <- vector("list", length(input))
sums <- rep(0, length(input))
for (k in input) {
i <- match(TRUE, sums + k <= threshold)
if (!is.na(i)) {
result[[i]] <- c(result[[i]], k)
sums[i] <- sums[i] + k
}
}
result[sapply(result, is.null)] <- NULL
result
}
create_groups(x, 34)
# [[1]]
# [1] 28 5 1
#
# [[2]]
# [1] 26 3 2
#
# [[3]]
# [1] 20
However it is not guaranteed that this greedy algorithm will output the optimal solution in terms of number of groups. For instance:
y <- c(18, 15, 11, 9, 8, 7)
create_groups(y, 34)
# [[1]]
# [1] 18 15
#
# [[2]]
# [1] 11 9 8
#
# [[3]]
# [1] 7
while the optimal solution in this case consists of only 2 groups: list(c(18, 9, 7), c(15, 11, 8)).
Assuming you want all possible combinations of subsets of x that meet this condition, you can use
x = c(28,26,20,5,3,2,1)
y = lapply(seq_along(x), function(y) combn(x, y)) # list all combinations of all subsets
le34 = sapply(y, function(z) colSums(z) <= 34) # which sums are less than 34
lapply(seq_along(y), function(i) y[[i]][,le34[[i]]] ) # list of combinations that meet condition
Here is my dummy dataset:
dataset<-data.frame(a=c(1,2,3,4),b=c('a','b','c','d'), c=c("HI","DD","gg","ff"))
g=list(c("a","b"),c(2,3,4), c(44,33,11,22),c("chr","ID","i","II"))
dataset$l<-g
dataset
a b c l
1 1 a HI a, b
2 2 b DD 2, 3, 4
3 3 c gg 44, 33, 11, 22
4 4 d ff chr, ID, i, II
> mode(dataset$l)
[1] "list"
when I try to write the dataset to a file:
> write.table(dataset, "dataset.txt", quote=F, sep="\t")
Error in write.table(x, file, nrow(x), p, rnames, sep, eol, na, dec, as.integer(quote), :
unimplemented type 'list' in 'EncodeElement'
How can i solve this problem?
I can think a few options, depending on what you're trying to achieve.
If it is for display only, then you might simply want capture.output() or sink(); neither of these would be very convenient to read back into R:
capture.output(dataset, file="myfile.txt")
### Result is a text file that looks like this:
# a b c l
# 1 1 a HI a, b
# 2 2 b DD 2, 3, 4
# 3 3 c gg 44, 33, 11, 22
# 4 4 d ff chr, ID, i, II
sink("myfile.txt")
dataset
sink()
## Same result as `capture.output()` approach
If you want to be able to read the resulting table back into R (albeit without preserving the fact that column "l" is a list), you can take an approach similar to what #DWin suggested.
In the code below, the dataset2[sapply... line identifies which variables are lists and concatenates them into a single string. Thus, they become simple character variables, allowing you to use write.table().
dataset2 <- dataset # make a copy just to be on the safe side
dataset2[sapply(dataset2, is.list)] <- apply(dataset2[sapply(dataset2, is.list)],
1, function(x)
paste(unlist(x),
sep=", ", collapse=", "))
str(dataset2)
# 'data.frame': 4 obs. of 4 variables:
# $ a: num 1 2 3 4
# $ b: Factor w/ 4 levels "a","b","c","d": 1 2 3 4
# $ c: Factor w/ 4 levels "DD","ff","gg",..: 4 1 3 2
# $ l: chr "a, b" "2, 3, 4" "44, 33, 11, 22" "chr, ID, i, II"
write.table(dataset2, "myfile.txt", quote=FALSE, sep="\t")
# can be read back in with: dataset3 <- read.delim("myfile.txt")
Output from save is unreadable. Output from dump or dput is ASCII and is readable to people who understand the structure of R objects, but I'm guessing you wanted it more conventionally arranged.
> apply(dataset, 1, function(x) paste(x, sep=",", collapse=","))
[1] "1,a,HI,c(\"a\", \"b\")"
[2] "2,b,DD,c(2, 3, 4)"
[3] "3,c,gg,c(44, 33, 11, 22)"
[4] "4,d,ff,c(\"chr\", \"ID\", \"i\", \"II\")"
The backslashes do not appear in the text-file output:
writeLines(con="test.txt", apply(dataset, 1, function(x) paste(x, sep=",", collapse=",")))
#-------output-----
1,a,HI,c("a", "b")
2,b,DD,c(2, 3, 4)
3,c,gg,c(44, 33, 11, 22)
4,d,ff,c("chr", "ID", "i", "II")
If one of the requirements is to preserve the formatting for excel, etc, this might help:
writableTable <- tableFlatten(dataset, filler="")
# a b c l.01 l.02 l.03 l.04
# 1 a HI a b
# 2 b DD 2 3 4
# 3 c gg 44 33 11 22
# 4 d ff chr ID i II
write.csv(writableTable, "myFile.csv")
tableFlatten uses a function listFlatten which, as the name implies, takes nested lists and flattens them.
However, if the elements within the lists are of different sizes, it adds filler (which can be NAs, blank spaces, or any other user defined option)
The code for it is below.
tableFlatten <- function(tableWithLists, filler="") {
# takes as input a table with lists and returns a flat table
# empty spots in lists are filled with value of `filler`
#
# depends on: listFlatten(.), findGroupRanges(.), fw0(.)
# index which columns are lists
listCols <- sapply(tableWithLists, is.list)
tableWithLists[listCols]
tableWithLists[!listCols]
# flatten lists into table
flattened <- sapply(tableWithLists[listCols], listFlatten, filler=filler, simplify=FALSE)
# fix names
for (i in 1:length(flattened)) colnames(flattened[[i]]) <- fw0(ncol(flattened[[i]]), 2)
# REASSEMBLE, IN ORDER
# find pivot point counts
pivots <- sapply(findGroupRanges(listCols), length)
#index markers
indNonList <- indList <- 1
# nonListGrp <- (0:(length(pivots)/2)) * 2 + 1
# ListGrp <- (1:(length(pivots)/2)) * 2
final <- data.frame(row.names=row.names(tableWithLists))
for (i in 1:length(pivots)) {
if(i %% 2 == 1) {
final <- cbind(final,
tableWithLists[!listCols][indNonList:((indNonList<-indNonList+pivots[[i]])-1)]
)
} else {
final <- cbind(final,
flattened[indList:((indList<-indList+pivots[[i]])-1)]
)
}
}
return(final)
}
#=====================================
listFlatten <- function(obj, filler=NA) {
## Flattens obj like rbind, but if elements are of different length, plugs in value filler
# Initialize Vars
bind <- FALSE
# IF ALL ELEMENTS ARE MATRIX-LIKE OR VECTORS, MAKE SURE SAME NUMBER OF COLUMNS
matLike <- sapply(obj, function(x) !is.null(dim(x)))
vecLike <- sapply(obj, is.vector)
# If all matrix-like.
if (all(matLike)) {
maxLng <- max(sapply(obj[matLike], ncol))
obj[matLike] <- lapply(obj[matLike], function(x) t(apply(x, 1, c, rep(filler, maxLng - ncol(x)))))
bind <- TRUE
# If all vector-like
} else if (all(vecLike)) {
maxLng <- max(sapply(obj[vecLike], length))
obj[vecLike] <- lapply(obj[vecLike], function(x) c(x, rep(filler, maxLng - length(x))))
bind <- TRUE
# If all are either matrix- or vector-like
} else if (all(matLike & vecLike)) {
maxLng <- max(sapply(obj[matLike], ncol), sapply(obj[vecLike], length))
# Add in filler's as needed
obj[matLike] <-
lapply(obj[matLike], function(x) t(apply(x, 1, c, rep(filler, maxLng - ncol(x)))))
obj[vecLike] <-
lapply(obj[vecLike], function(x) c(x, rep(filler, maxLng - length(x))))
bind <- TRUE
}
# If processed and ready to be returned, then just clean it up
if(bind) {
ret <- (do.call(rbind, obj))
colnames(ret) <- paste0("L", fw0(1:ncol(ret), digs=2))
return(ret)
}
# Otherwise, if obj is sitll a list, continue recursively
if (is.list(obj)) {
return(lapply(obj, listFlatten))
}
# If none of the above, return an error.
stop("Unknown object type")
}
#--------------------------------------------
findGroupRanges <- function(booleanVec) {
# returns list of indexes indicating a series of identical values
pivots <- which(sapply(2:length(booleanVec), function(i) booleanVec[[i]] != booleanVec[[i-1]]))
### THIS ISNT NEEDED...
# if (identical(pivots, numeric(0)))
# pivots <- length(booleanVec)
pivots <- c(0, pivots, length(booleanVec))
lapply(seq(2, length(pivots)), function(i)
seq(pivots[i-1]+1, pivots[i])
)
}
#--------------------------------------------
fw0 <- function(num, digs=NULL, mkSeq=TRUE) {
## formats digits with leading 0's.
## num should be an integer or range of integers.
## if mkSeq=T, then an num of length 1 will be expanded to seq(1, num).
# TODO 1: put more error check
if (is.list(num))
lapply(num, fw0)
if (!is.vector(num)) {
stop("num should be integer or vector")
}
# convert strings to numbers
num <- as.numeric(num)
# If num is a single number and mkSeq is T, expand to seq(1, num)
if(mkSeq && !length(num)>1)
num <- (1:num)
# number of digits is that of largest number or digs, whichever is max
digs <- max(nchar(max(abs(num))), digs)
# if there are a mix of neg & pos numbers, add a space for pos numbs
posSpace <- ifelse(sign(max(num)) != sign(min(num)), " ", "")
# return: paste appropriate 0's and preface neg/pos mark
sapply(num, function(x) ifelse(x<0,
paste0("-", paste0(rep(0, max(0, digs-nchar(abs(x)))), collapse=""), abs(x)),
paste0(posSpace, paste0(rep(0, max(0, digs-nchar(abs(x)))), collapse=""), x)
))
}
#-----------------------------------------------
You can use dput for this.
dput(dataset, "dataset.txt")
you can also use save()
save(dataset, file="dataset.RData")
The answer provided by #Ananda is excellent, however, I ran into an issue when I had a data frame with two columns that were lists.
dataset<-data.frame(a=c(1,2,3,4),b=c('a','b','c','d'), c=c("HI","DD","gg","ff"))
g=list(c("a","b"),c(2,3,4), c(44,33,11,22),c("chr","ID","i","II"))
dataset$l<-g
dataset$l2<-g
dataset
a b c l l2
1 1 a HI a, b a, b
2 2 b DD 2, 3, 4 2, 3, 4
3 3 c gg 44, 33, 11, 22 44, 33, 11, 22
4 4 d ff chr, ID, i, II chr, ID, i, II
Using the original answer, both list columns contain the concatenated contents of both columns.
a b c l l2
1 1 a HI a, b, a, b a, b, a, b
2 2 b DD 2, 3, 4, 2, 3, 4 2, 3, 4, 2, 3, 4
3 3 c gg 44, 33, 11, 22, 44, 33, 11, 22 44, 33, 11, 22, 44, 33, 11, 22
4 4 d ff chr, ID, i, II, chr, ID, i, II chr, ID, i, II, chr, ID, i, II
Instead, try this modified version:
dataset2 <- dataset # make a copy just to be on the safe side
dataset2[sapply(dataset2, is.list)] <-
sapply(dataset2[sapply(dataset2, is.list)],
function(x)sapply(x, function(y) paste(unlist(y),collapse=", ") ) )
dataset2
a b c l l2
1 1 a HI a, b a, b
2 2 b DD 2, 3, 4 2, 3, 4
3 3 c gg 44, 33, 11, 22 44, 33, 11, 22
4 4 d ff chr, ID, i, II chr, ID, i, II
I stumbled across this and while there are a lot of great answers, I ended up doing something else. Sharing for posterity.
library(dplyr)
flatten_list = function(x){
if (typeof(x) != "list") {
return(x)
}
sapply(x, function(y) paste(y, collapse = " | "))
}
data %>%
mutate_each(funs(flatten_list)) ->
write_csv("data.csv")
I have two set of samples that are time independent. I would like to merge them and calculate the missing values
for the times where I do not have values of both. Simplified example:
A <- cbind(time=c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100),
Avalue=c(1, 2, 3, 2, 1, 2, 3, 2, 1, 2))
B <- cbind(time=c(15, 30, 45, 60), Bvalue=c(100, 200, 300, 400))
C <- merge(A,B, all=TRUE)
time Avalue Bvalue
1 10 1 NA
2 15 NA 100
3 20 2 NA
4 30 3 200
5 40 2 NA
6 45 NA 300
7 50 1 NA
8 60 2 400
9 70 3 NA
10 80 2 NA
11 90 1 NA
12 100 2 NA
By assuming linear change between each sample, it is possible to calculate the missing NA values.
Intuitively it is easy to see that the A value at time 15 and 45 should be 1.5. But a proper calculation for B
for instance at time 20 would be
100 + (20 - 15) * (200 - 100) / (30 - 15)
which equals 133.33333.
The first parenthesis being the time between estimate time and the last sample available.
The second parenthesis being the difference between the nearest samples.
The third parenthesis being the time between the nearest samples.
How can I use R to calculate the NA values?
Using the zoo package:
library(zoo)
Cz <- zoo(C)
index(Cz) <- Cz[,1]
Cz_approx <- na.approx(Cz)
The proper way to do this statistically and still get valid confidence intervals is to use Multiple Imputation. See Rubin's classic book, and there's an excellent R package for this (mi).
An ugly and probably inefficient Base R solution:
# Data provided:
A <- cbind(time=c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100),
Avalue=c(1, 2, 3, 2, 1, 2, 3, 2, 1, 2))
B <- cbind(time=c(15, 30, 45, 60), Bvalue=c(100, 200, 300, 400))
C <- merge(A,B, all=TRUE)
# Scalar valued at the minimum time difference: -> min_time_diff
min_time_diff <- min(diff(C$time))
# Adjust frequency of the series to hold all steps in range: -> df
df <- merge(C,
data.frame(time = seq(min_time_diff,
max(C$time),
by = min_time_diff)),
by = "time",
all = TRUE)
# Linear interpolation function handling ties,
# returns interpolated vector the same length
# a the input vector: -> vector
l_interp_vec <- function(na_vec){
approx(x = na_vec,
method = "linear",
ties = "constant",
n = length(na_vec))$y
}
# Applied to a dataframe, replacing NA values
# in each of the numeric vectors,
# with interpolated values.
# input is dataframe: -> dataframe()
interped_df <- data.frame(lapply(df, function(x){
if(is.numeric(x)){
# Store a scalar of min row where x isn't NA: -> min_non_na
min_non_na <- min(which(!(is.na(x))))
# Store a scalar of max row where x isn't NA: -> max_non_na
max_non_na <- max(which(!(is.na(x))))
# Store scalar of the number of rows needed to impute prior
# to first NA value: -> ru_lower
ru_lower <- ifelse(min_non_na > 1, min_non_na - 1, min_non_na)
# Store scalar of the number of rows needed to impute after
# the last non-NA value: -> ru_lower
ru_upper <- ifelse(max_non_na == length(x),
length(x) - 1,
(length(x) - (max_non_na + 1)))
# Store a vector of the ramp to function: -> l_ramp_up:
ramp_up <- as.numeric(
cumsum(rep(x[min_non_na]/(min_non_na), ru_lower))
)
# Apply the interpolation function on vector "x": -> y
y <- as.numeric(l_interp_vec(as.numeric(x[min_non_na:max_non_na])))
# Create a vector that combines the ramp_up vector
# and y if the first NA is at row 1: -> z
if(length(ramp_up) > 1 & max_non_na != length(x)){
# Create a vector interpolations if there are
# multiple NA values after the last value: -> lower_l_int
lower_l_int <- as.numeric(cumsum(rep(mean(diff(c(ramp_up, y))),
ru_upper+1)) +
as.numeric(x[max_non_na]))
# Store the linear interpolations in a vector: -> z
z <- as.numeric(c(ramp_up, y, lower_l_int))
}else if(length(ramp_up) > 1 & max_non_na == length(x)){
# Store the linear interpolations in a vector: -> z
z <- as.numeric(c(ramp_up, y))
}else if(min_non_na == 1 & max_non_na != length(x)){
# Create a vector interpolations if there are
# multiple NA values after the last value: -> lower_l_int
lower_l_int <- as.numeric(cumsum(rep(mean(diff(c(ramp_up, y))),
ru_upper+1)) +
as.numeric(x[max_non_na]))
# Store the linear interpolations in a vector: -> z
z <- as.numeric(c(y, lower_l_int))
}else{
# Store the linear interpolations in a vector: -> z
z <- as.numeric(y)
}
# Interpolate between points in x, return new x:
return(as.numeric(ifelse(is.na(x), z, x)))
}else{
x
}
}
)
)
# Subset interped df to only contain
# the time values in C, store a data frame: -> int_df_subset
int_df_subset <- interped_df[interped_df$time %in% C$time,]