I have a list like L (comes from a vector splitting).
L <- strsplit(c("1 5 9", "", "3 7 11", ""), " ")
# [[1]]
# [1] "1" "5" "9"
#
# [[2]]
# character(0)
#
# [[3]]
# [1] "3" "7" "11"
#
# [[4]]
# character(0)
When I do an ordinary rbind as follows, I'm losing all the character(0) rows.
do.call(rbind, L)
# [,1] [,2] [,3]
# [1,] "1" "5" "9"
# [2,] "3" "7" "11"
Do I always have to do a lapply like the following or have I missed something?
do.call(rbind, lapply(L, function(x)
if (length(x) == 0) rep("", 3) else x))
# [,1] [,2] [,3]
# [1,] "1" "5" "9"
# [2,] "" "" ""
# [3,] "3" "7" "11"
# [4,] "" "" ""
Base R answers are preferred.
If you use lapply, you don't have to worry about the length, so you can skip the rep part: a single "" is automatically recycled across the columns by rbind.
do.call(rbind, lapply(L, function(x) if (length(x) == 0) "" else x))
# [,1] [,2] [,3]
#[1,] "1" "5" "9"
#[2,] "" "" ""
#[3,] "3" "7" "11"
#[4,] "" "" ""
Another option, using the same logic as @NelsonGon: we can replace the empty list elements with a blank string and then rbind.
L[lengths(L) == 0] <- ""
do.call(rbind, L)
# [,1] [,2] [,3]
#[1,] "1" "5" "9"
#[2,] "" "" ""
#[3,] "3" "7" "11"
#[4,] "" "" ""
Maybe this roundabout using data.table suits you:
L <- data.table::tstrsplit(c("1 5 9", "", "3 7 11", ""), " ", fill="")
t(do.call(rbind,L))
With plyr, bind the rows first and then replace the resulting NA values. Since the OP asked for base R, see further below.
plyr::ldply(L,rbind)
1 2 3
1 1 5 9
2 <NA> <NA> <NA>
3 3 7 11
4 <NA> <NA> <NA>
A less efficient base R way:
L <- strsplit(c("1 5 9", "", "3 7 11", ""), " ")
L[lapply(L,length)==0]<-"Miss"
res<-Reduce(rbind,L)
res[res=="Miss"]<-""
Result:
[,1] [,2] [,3]
init "1" "5" "9"
"" "" ""
"3" "7" "11"
"" "" ""
That is the defined behavior for scenarios like that. As written in ?rbind:
For cbind (rbind), vectors of zero length (including NULL) are ignored
unless the result would have zero rows (columns), for S compatibility.
(Zero-extent matrices do not occur in S3 and are not ignored in R.)
When you inspect your elements, you see that it is true:
length(L[[1]])
[1] 3
length(L[[2]])
[1] 0
However, as you see, multiple workarounds are possible.
We can use stri_list2matrix in a simple way
library(stringi)
stri_list2matrix(L, byrow = TRUE, fill = "")
# [,1] [,2] [,3]
#[1,] "1" "5" "9"
#[2,] "" "" ""
#[3,] "3" "7" "11"
#[4,] "" "" ""
Related
Once again I'm struggling with strsplit. I'm transforming some strings to data frames, but there's a forward slash (/) and some white space in my strings that keep bugging me. I could work around it, but I am eager to learn whether I can use some fancy either/or pattern in strsplit. My working example below should illustrate the issue.
The strsplit function I'm currently using:
# Split every element of `string` on runs of whitespace and arrange the
# resulting tokens as a character matrix with one row per input string.
str_to_df <- function(string){
  tokens <- strsplit(string, "\\s+")
  by_column <- sapply(1:length(string), function(idx) tokens[[idx]])
  t(by_column)
}
one type of string I got,
string1 <- c('One\t58/2', 'Two 22/3', 'Three\t15/5')
str_to_df(string1)
#> [,1] [,2]
#> [1,] "One" "58/2"
#> [2,] "Two" "22/3"
#> [3,] "Three" "15/5"
another type I got in the same spot,
string2 <- c('One 58 / 2', 'Two 22 / 3', 'Three 15 / 5')
str_to_df(string2)
#> [,1] [,2] [,3] [,4]
#> [1,] "One" "58" "/" "2"
#> [2,] "Two" "22" "/" "3"
#> [3,] "Three" "15" "/" "5"
They obviously create different outputs, and I can't figure out how to code a solution that works for both. Below is my desired outcome. Thank you in advance!
desired_outcome <- structure(c("One", "Two", "Three", "58", "22",
"15", "2", "3", "5"), .Dim = c(3L, 3L))
desired_outcome
#> [,1] [,2] [,3]
#> [1,] "One" "58" "2"
#> [2,] "Two" "22" "3"
#> [3,] "Three" "15" "5"
This works:
# Split each string on runs of "/" and/or whitespace and return the tokens
# as a character matrix with one row per input string.
#
# Args:
#   string: character vector; every element is expected to yield the same
#     number of tokens.
# Returns:
#   A character matrix with length(string) rows (a 0 x 0 character matrix
#   for empty input).
str_to_df <- function(string){
  # Split the whole vector once instead of re-splitting it for every element.
  tokens <- strsplit(string, "[/[:space:]]+")
  if (length(tokens) == 0) {
    return(matrix(character(0), nrow = 0, ncol = 0))
  }
  # rbind always yields one row per string, even when each string has only a
  # single token (where t(sapply(...)) would give a 1 x n matrix instead).
  do.call(rbind, tokens)
}
string1 <- c('One\t58/2', 'Two 22/3', 'Three\t15/5')
string2 <- c('One 58 / 2', 'Two 22 / 3', 'Three 15 / 5')
str_to_df(string1)
# [,1] [,2] [,3]
# [1,] "One" "58" "2"
# [2,] "Two" "22" "3"
# [3,] "Three" "15" "5"
str_to_df(string2)
# [,1] [,2] [,3]
# [1,] "One" "58" "2"
# [2,] "Two" "22" "3"
# [3,] "Three" "15" "5"
Another approach with tidyr could be:
string1 %>%
as_tibble() %>%
separate(value, into = c("Col1", "Col2", "Col3"), sep = "[/[:space:]]+")
# A tibble: 3 x 3
# Col1 Col2 Col3
# <chr> <chr> <chr>
# 1 One 58 2
# 2 Two 22 3
# 3 Three 15 5
We can create a function to split at one or more space or tab or forward slash
# Split each string on runs of "/", tab or space, then stack the token
# vectors row-wise into a character matrix (one row per input string).
f1 <- function(str1) {
  tokens <- strsplit(str1, "[/\t ]+")
  do.call(rbind, tokens)
}
f1(string1)
# [,1] [,2] [,3]
#[1,] "One" "58" "2"
#[2,] "Two" "22" "3"
#[3,] "Three" "15" "5"
f1(string2)
# [,1] [,2] [,3]
#[1,] "One" "58" "2"
#[2,] "Two" "22" "3"
#[3,] "Three" "15" "5"
Or we can do with read.csv after replacing the spaces with a common delimiter
read.csv(text=gsub("[\t/ ]+", ",", string1), header = FALSE)
# V1 V2 V3
#1 One 58 2
#2 Two 22 3
#3 Three 15 5
I am trying to build a large table in R. Yes I have heard of the table() function - in fact, I've used it several times in the code below - but I am building this because I do not want to type table() 20 times a day. I plan on just exporting this using xtable + knitr. The reason this is useful is that for those of us who have to repeatedly tabulate data, this would save a lot of time. Unfortunately, there is something wrong with the loop down here:
ESRD <- rep(c("Y", "N"), each=10)
DIABETES <- rep(c("Y", "N", "Y", "N"), c(5, 5, 5, 5))
BLAH <- rep(c("Y", "N"), each=10)
categoricalvariables <- data.frame(ESRD, DIABETES, BLAH)
# Build a character summary table for a set of categorical variables.
#
# For every column of VARIABLEMATRIX the result contains a header row with
# the column name, followed by one row per observed level holding the level,
# its count and its percentage (rounded to 2 decimals, e.g. "50 %").
#
# Args:
#   VARIABLEMATRIX: data frame of categorical variables with column names.
# Returns:
#   A character matrix with 4 columns (variable, level, count, percent) and
#   no dimnames; a 0 x 4 character matrix if there are no columns.
descriptives <- function(VARIABLEMATRIX){
  # Each column is summarised independently, so no loop index is shared
  # between the per-variable and per-level steps.
  blocks <- lapply(seq_len(ncol(VARIABLEMATRIX)), function(j) {
    counts <- table(VARIABLEMATRIX[ ,j])
    percents <- paste(round(prop.table(counts) * 100, 2), "%")
    header <- c(colnames(VARIABLEMATRIX)[j], "", "", "")
    # as.vector() drops the table attributes so only the bare counts remain.
    levels_part <- cbind("", names(counts), as.vector(counts), percents,
                         deparse.level = 0)
    rbind(header, levels_part, deparse.level = 0)
  })
  if (length(blocks) == 0) {
    return(matrix(character(0), nrow = 0, ncol = 4))
  }
  # Stacking the blocks sizes the result to the actual number of levels
  # rather than assuming exactly two levels per variable.
  unname(do.call(rbind, blocks))
}
descriptives(categoricalvariables)
The output I am getting is (clearly there is a bug but I am not sure what is wrong):
[,1] [,2] [,3] [,4]
[1,] "0" "0" "0" "0"
[2,] "0" "0" "0" "0"
[3,] "0" "0" "0" "0"
[4,] "DIABETES" "" "" ""
[5,] "" "N" "10" "50 %"
[6,] "" "Y" "10" "50 %"
[7,] "0" "0" "0" "0"
[8,] "0" "0" "0" "0"
[9,] "0" "0" "0" "0"
The expected output should be:
[,1] [,2] [,3] [,4]
[1,] "ESRD" "" "" ""
[2,] "" "N" "10" "50 %"
[3,] "" "Y" "10" "50 %"
[4,] "DIABETES" "" "" ""
[5,] "" "N" "10" "50 %"
[6,] "" "Y" "10" "50 %"
[7,] "BLAH" "" "" ""
[8,] "" "N" "10" "50 %"
[9,] "" "Y" "10" "50 %"
Here's one option:
# Tabulate `x` and return a two-column matrix with the absolute count and
# the relative frequency (in percent) of each distinct value. The rows are
# labelled "Values" and the columns "Frequency".
desc <- function(x) {
  counts <- table(x)
  result <- cbind(Absolute = counts, `Relative(%)` = prop.table(counts) * 100)
  names(dimnames(result)) <- c('Values', 'Frequency')
  result
}
lapply(categoricalvariables, desc)
#$ESRD
# Frequency
#Values Absolute Relative(%)
# N 10 50
# Y 10 50
#
#$DIABETES
# Frequency
#Values Absolute Relative(%)
# N 10 50
# Y 10 50
#
#$BLAH
# Frequency
#Values Absolute Relative(%)
# N 10 50
# Y 10 50
If you really want a character matrix
tmp <- lapply(categoricalvariables, desc)
out <- do.call(rbind, lapply(names(tmp), function(x) {
rbind(c(x, "", "", ""), cbind("", rownames(tmp[[x]]), tmp[[x]]))
}))
out <- unname(rbind(c("", "", "Abs.Freq", "Rel.Freq"), out))
out
# [,1] [,2] [,3] [,4]
# [1,] "" "" "Abs.Freq" "Rel.Freq"
# [2,] "ESRD" "" "" ""
# [3,] "" "N" "10" "50"
# [4,] "" "Y" "10" "50"
# [5,] "DIABETES" "" "" ""
# [6,] "" "N" "10" "50"
# [7,] "" "Y" "10" "50"
# [8,] "BLAH" "" "" ""
# [9,] "" "N" "10" "50"
#[10,] "" "Y" "10" "50"
a is a matrix:
a <- matrix(1:9,3)
> a
[,1] [,2] [,3]
[1,] 1 4 7
[2,] 2 5 8
[3,] 3 6 9
I want to replace all the 1 to good, all the 4 to medium, and all the 9 to bad.
I use the following code:
a[a==1] <- "good"
a[a==4] <- "medium"
a[a==9] <- "bad"
> a
[,1] [,2] [,3]
[1,] "good" "medium" "7"
[2,] "2" "5" "8"
[3,] "3" "6" "bad"
It works, but is this the simplest way to work it out? Can I combine these codes into one command?
Using cut():
matrix(cut(a, breaks = c(0:9),
labels = c("good", 2:3, "medium", 5:8, "bad")), 3)
But not really happy with manual labels bit.
Maybe using match(), more flexible:
res <- matrix(c("good", "medium", "bad")[match(a, c(1, 4, 9))], 3)
res <- ifelse(is.na(res), a, res)
car::recode() does nicely here, returning the same matrix structure as was given as input.
car::recode(a, "1='good';4='medium';9='bad'")
# [,1] [,2] [,3]
# [1,] "good" "medium" "7"
# [2,] "2" "5" "8"
# [3,] "3" "6" "bad"
I have a data.frame with character and integer columns. I want to transform them all into characters, but I get unwanted leading spaces for the numeric columns:
> example <- data.frame(a=1:10,b=1:10,c=rep("foo",10))
> apply(example,2,format,trim=T)
a b c
[1,] " 1" " 1" "foo"
[2,] " 2" " 2" "foo"
[3,] " 3" " 3" "foo"
[4,] " 4" " 4" "foo"
[5,] " 5" " 5" "foo"
[6,] " 6" " 6" "foo"
[7,] " 7" " 7" "foo"
[8,] " 8" " 8" "foo"
[9,] " 9" " 9" "foo"
[10,] "10" "10" "foo"
The trim=T parameter is ignored apparently. This only occurs in the presence of the character column 'c', i.e. it works fine if 'c' is not present (apply(example[,-3],...)).
If I remember correctly, it's because of as.matrix, but you can bypass this by using sapply:
> sapply(example, format, trim = TRUE)
a b c
[1,] "1" "1" "foo"
[2,] "2" "2" "foo"
[3,] "3" "3" "foo"
[4,] "4" "4" "foo"
[5,] "5" "5" "foo"
[6,] "6" "6" "foo"
[7,] "7" "7" "foo"
[8,] "8" "8" "foo"
[9,] "9" "9" "foo"
[10,] "10" "10" "foo"
If you're okay with a character matrix as output (you seem to be, based on your use of apply), try:
do.call(cbind, lapply(example, as.character))
This produces:
a b c
[1,] "1" "1" "foo"
[2,] "2" "2" "foo"
[3,] "3" "3" "foo"
[4,] "4" "4" "foo"
[5,] "5" "5" "foo"
[6,] "6" "6" "foo"
[7,] "7" "7" "foo"
[8,] "8" "8" "foo"
[9,] "9" "9" "foo"
[10,] "10" "10" "foo"
As it says in ?apply, the first argument is coerced to a matrix. In this case, it converts it to a character matrix because of column c. The call to as.matrix creates the leading spaces. The subsequent calls to format do nothing because the data are already character.
> as.matrix(example)
a b c
[1,] " 1" " 1" "foo"
[2,] " 2" " 2" "foo"
[3,] " 3" " 3" "foo"
[4,] " 4" " 4" "foo"
[5,] " 5" " 5" "foo"
[6,] " 6" " 6" "foo"
[7,] " 7" " 7" "foo"
[8,] " 8" " 8" "foo"
[9,] " 9" " 9" "foo"
[10,] "10" "10" "foo"
Without column c, it's converted to an integer matrix, and format converts the integers to character.
> as.matrix(example[,-3])
a b
[1,] 1 1
[2,] 2 2
[3,] 3 3
[4,] 4 4
[5,] 5 5
[6,] 6 6
[7,] 7 7
[8,] 8 8
[9,] 9 9
[10,] 10 10
Better to simply use lapply:
example <- data.frame(a=1:10,b=1:10,c=rep("foo",10))
example[] <- lapply(example, format, trim=TRUE)
# use sapply if you really want a matrix
example <- sapply(example, format, trim=TRUE)
I have a data frame df that needs to be split into chunks of 2 names each. In the example below there are 4 unique names: a, b, c, d. I need to subset it into 2 one-column matrices, one for a,b and one for c,d.
Output format:
name1
item_value
item_value
...
END
name2
item_value
item_value
...
END
Example:
#dummy data
df <- data.frame(name=sort(c(rep(letters[1:4],2),"a","a","c")),
item=round(runif(11,1,10)),
stringsAsFactors=FALSE)
#tried approach - split per name. I need to split per 2 names.
lapply(split(df,f=df$name),
function(x)
{name <- unique(x$name)
as.matrix(c(name,x[,2],"END"))
})
#expected output
[,1]
[1,] "a"
[2,] "8"
[3,] "9"
[4,] "6"
[5,] "4"
[6,] "END"
[1,] "b"
[2,] "2"
[3,] "10"
[4,] "END"
[,2]
[1,] "c"
[2,] "6"
[3,] "6"
[4,] "2"
[5,] "END"
[1,] "d"
[2,] "4"
[3,] "1"
[4,] "END"
Note: Actual df has ~300000 rows with ~35000 unique names.
You may try this.
# for each 'name', "pad" 'item' with 'name' and 'END'
l1 <- lapply(split(df, f = df$name), function(x){
  name <- unique(x$name)
  as.matrix(c(name, x$item, "END"))
})
# create the offsets 0, 2, 4, ... used to select list elements two by two.
# The upper bound is the number of groups minus one (not half the number of
# groups), otherwise pairs beyond the second are dropped once there are more
# than four names.
steps <- seq(from = 0, to = length(l1) - 1, by = 2)
# loop over 'steps' to bind together list elements, two by two.
l2 <- lapply(steps, function(x){
  idx <- x + 1:2
  # with an odd number of names the last chunk holds a single name
  do.call(rbind, l1[idx[idx <= length(l1)]])
})
l2
# [[1]]
# [,1]
# [1,] "a"
# [2,] "6"
# [3,] "4"
# [4,] "10"
# [5,] "3"
# [6,] "END"
# [7,] "b"
# [8,] "6"
# [9,] "7"
# [10,] "END"
#
# [[2]]
# [,1]
# [1,] "c"
# [2,] "2"
# [3,] "6"
# [4,] "10"
# [5,] "END"
# [6,] "d"
# [7,] "5"
# [8,] "4"
# [9,] "END"
Instead of making the lists from individual names make it from the column of subsets of the data.frame
res <- list("a_b" = c(df[df$name == "a",2],"END",df[df$name == "b", 2],"END"),
"c_d" = c(df[df$name == "c",2],"END", df[df$name == "d", 2],"END"))
# Build, for each consecutive pair of names, a one-column character matrix:
# name, its items, "END", next name, its items, "END".
# This preallocated list is overwritten by the sapply() result just below.
res2 <- vector(mode="list",length=2)
# Outer sapply: x runs over the pair numbers but is only used in the local
# assignment below, so every outer iteration computes the same thing.
res2 <- sapply(1:(length(unique(df$name))/2),function(x) {
# Inner sapply: y is the index of the first name of each pair (1, 3, ...).
sapply(seq(1,length(unique(df$name))-1,by=2), function(y) {
name <- unique(df$name)
# The assignment only changes a function-local copy of res2; what matters is
# that its value (the matrix) is the function's return value.
res2[x] <- as.matrix(c(name[y],df[df$name == name[y],2],"END",name[y+1],df[df$name == name[y+1],2],"END"))
})
})
# NOTE(review): this indexing relies on sapply() simplifying the nested
# results to a matrix of lists, which depends on the lengths of the pieces.
answer <- res2[,1]
This is giving me a matrix of lists since there are two sapplys happening, I think everything you want is in res2[,1]