Trying to build a large table in R - r

I am trying to build a large table in R. Yes I have heard of the table() function - in fact, I've used it several times in the code below - but I am building this because I do not want to type table() 20 times a day. I plan on just exporting this using xtable + knitr. The reason this is useful is that for those of us who have to repeatedly tabulate data, this would save a lot of time. Unfortunately, there is something wrong with the loop down here:
ESRD <- rep(c("Y", "N"), each=10)
DIABETES <- rep(c("Y", "N", "Y", "N"), c(5, 5, 5, 5))
BLAH <- rep(c("Y", "N"), each=10)
categoricalvariables <- data.frame(ESRD, DIABETES, BLAH)
descriptives <- function(VARIABLEMATRIX){
desc <- matrix(0, ncol=4, nrow=2*ncol(VARIABLEMATRIX) + ncol(VARIABLEMATRIX))
for (i in 1:ncol(VARIABLEMATRIX)){
matper <- matrix(0, nrow=dim(table(VARIABLEMATRIX[ ,i])), ncol=1)
for (i in 1:dim(table(VARIABLEMATRIX[ ,i]))){
matper[i, ] <- paste(round(prop.table(table(VARIABLEMATRIX[ ,i]))[i]*100, 2), "%")
}
matcount <- matrix(0, nrow=dim(table(VARIABLEMATRIX[ ,i])), ncol=1)
for (i in 1:dim(table(VARIABLEMATRIX[ ,i]))){
matcount[i, ] <- table(VARIABLEMATRIX[ ,i])[i]
}
desc[((3*i)-2), ] <- c(colnames(VARIABLEMATRIX)[i], "", "", "")
desc[((3*i)-1):(3*i), ] <- cbind("", names(table(VARIABLEMATRIX[ ,i])), matcount[ ,1], matper[ ,1])
return(desc)
}
}
descriptives(categoricalvariables)
The output I am getting is (clearly there is a bug but I am not sure what is wrong):
[,1] [,2] [,3] [,4]
[1,] "0" "0" "0" "0"
[2,] "0" "0" "0" "0"
[3,] "0" "0" "0" "0"
[4,] "DIABETES" "" "" ""
[5,] "" "N" "10" "50 %"
[6,] "" "Y" "10" "50 %"
[7,] "0" "0" "0" "0"
[8,] "0" "0" "0" "0"
[9,] "0" "0" "0" "0"
The expected output should be:
[,1] [,2] [,3] [,4]
[1,] "ESRD" "" "" ""
[2,] "" "N" "10" "50 %"
[3,] "" "Y" "10" "50 %"
[4,] "DIABETES" "" "" ""
[5,] "" "N" "10" "50 %"
[6,] "" "Y" "10" "50 %"
[7,] "BLAH" "" "" ""
[8,] "" "N" "10" "50 %"
[9,] "" "Y" "10" "50 %"

Here's one option:
desc <- function(x) {
af <- table(x)
rf <- prop.table(af) * 100
out <- cbind(Absolute=af, `Relative(%)`=rf)
dimnames(out) <- setNames(dimnames(out), c('Values', 'Frequency'))
out
}
lapply(categoricalvariables, desc)
#$ESRD
# Frequency
#Values Absolute Relative(%)
# N 10 50
# Y 10 50
#
#$DIABETES
# Frequency
#Values Absolute Relative(%)
# N 10 50
# Y 10 50
#
#$BLAH
# Frequency
#Values Absolute Relative(%)
# N 10 50
# Y 10 50
If you really want a character matrix
tmp <- lapply(categoricalvariables, desc)
out <- do.call(rbind, lapply(names(tmp), function(x) {
rbind(c(x, "", "", ""), cbind("", rownames(tmp[[x]]), tmp[[x]]))
}))
out <- unname(rbind(c("", "", "Abs.Freq", "Rel.Freq"), out))
out
# [,1] [,2] [,3] [,4]
# [1,] "" "" "Abs.Freq" "Rel.Freq"
# [2,] "ESRD" "" "" ""
# [3,] "" "N" "10" "50"
# [4,] "" "Y" "10" "50"
# [5,] "DIABETES" "" "" ""
# [6,] "" "N" "10" "50"
# [7,] "" "Y" "10" "50"
# [8,] "BLAH" "" "" ""
# [9,] "" "N" "10" "50"
#[10,] "" "Y" "10" "50"

Related

How to use tidyverse map to iterate filtering and writing to csv in R

I've got a data frame full of study metadata, with two key columns: citation information and questions of mine that they pertain to:
library(tidyverse)
citation <- c(letters)
study_question <- rep(1:3, len = length(citation))
df <- as.data.frame(cbind(citation, study_question))
#so that df looks like:
citation study_question
[1,] "a" "1"
[2,] "b" "2"
[3,] "c" "3"
[4,] "d" "1"
[5,] "e" "2"
[6,] "f" "3"
[7,] "g" "1"
[8,] "h" "2"
[9,] "i" "3"
[10,] "j" "1"
[11,] "k" "2"
[12,] "l" "3"
[13,] "m" "1"
[14,] "n" "2"
[15,] "o" "3"
[16,] "p" "1"
[17,] "q" "2"
[18,] "r" "3"
[19,] "s" "1"
[20,] "t" "2"
[21,] "u" "3"
[22,] "v" "1"
[23,] "w" "2"
[24,] "x" "3"
[25,] "y" "1"
[26,] "z" "2"
>
What I'd like to do is use an iterative function to filter for study question = 1, to get:
> df %>% filter(study_question == 1)
citation study_question
1 a 1
2 d 1
3 g 1
4 j 1
5 m 1
6 p 1
7 s 1
8 v 1
9 y 1
then write that list of citations to a csv named "sq1_papers.csv", then do the same for study question = 2, with the output "sq2_papers.csv", and then the same for question 3.
I have tried this with a for loop, which has not worked, and would prefer to try it with a map function, which I have gotten to work in the past. Here is the code I tried:
for(i in study_question) {
file <- df %>%
filter(study_question == study_question[[i]])
write_csv(file, "data/sq[i]_papers.csv")
}
With tidyverse, we can group split by 'study_question, loop over the list with iwalk and write to 'csv' with write_csv from readr
library(dplyr)
library(purrr)
library(readr)
library(stringr)
df %>%
group_split(study_question) %>%
iwalk(~ write_csv(.x, str_c('data/sq', .y, '_papers.csv'))
The below code should work for you:
## iterate through unique "study_questions"
for(i in unique(df$study_question)) {
## filter data frame by current "study question"
file <- df %>%
filter(study_question == i)
## create file name for export
fileName <- paste0("data/sq",i,"_papers.csv")
## export filtered data to path of fileName
write_csv(file, fileName)
}
I hope this helps!

Write list of lists to separate CSV files

I have a list (414 elements) which contains other lists of different lengths (ranging from 0 to 9). Each of those sublists has different numbers of rows and columns.
Some of the sublists are of length 1 like the one below:
tables_list[[1]]
[,1] [,2]
[1,] "ID Number" "ABCD"
[2,] "Code" "1239463"
[3,] "Version" "1"
[4,] "Name" "ABC"
[5,] "Status" "Open"
[6,] "Currency" "USD"
[7,] "Average" "No"
[8,] "FX Rate" "2.47"
Other sublists are of length 2 or higher like the one below:
tables_list[[17]]
[[1]]
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
[1,] "" "" "USD" "Balance" "Movement in" "Aggregate" "Overall" "" "Overall"
[2,] "" "" "" "brought forward" "year" "annual" "aggregate" "" "funded account"
[3,] "" "" "" "from previous" "" "information" "adjustments" "" ""
[4,] "" "" "" "year end" "" "" "" "" ""
[5,] "" "" "" "1" "2" "3" "4" "" "5"
[6,] "12" "Value 1" "" "0" "3,275,020" "3,275,020" "" "0" "3,275,020"
[7,] "13" "Value 2" "" "0" "0" "0" "" "0" "0"
[8,] "14" "Value 3" "" "0" "8,267,862" "8,267,862" "" "0" "8,267,862"
[9,] "15" "Value 4" "" "0" "(590,073,321)" "(590,073,321)" "" "0" "(590,073,321)"
[10,] "16" "Value 5" "" "0" "0" "0" "" "0" "0"
[11,] "17" "Value 6" "" "0" "0" "0" "" "0" "0"
[12,] "18" "Value 7" "" "0" "0" "0" "" "0" "0"
[13,] "19" "Value 8" "" "0" "0" "0" "" "0" "0"
[14,] "20" "Value 9" "" "0" "(459,222,782)" "(459,222,782)" "" "0" "(459,222,782)"
[[2]]
[,1] [,2] [,3] [,4]
[1,] "Theme" "Year" "Comment" "Created"
[2,] "Line 17 Column 2" "N/A" "Amounts are calculated according to recent standards" "XXXXXXXXXXXX"
[3,] "" "" "paid by XXXXXXXXXXXXX" ""
I am trying to export each of those lists to an individual csv file but I cannot figure out a way to do so. Does anyone have any ideas on how to approach this? I tried using mapply but I keep getting the following error:
Error in file(file, ifelse(append, "a", "w")) :
invalid 'description' argument
In addition: Warning message:
In if (file == "") file <- stdout() else if (is.character(file)) { :
Error in file(file, ifelse(append, "a", "w")) :
invalid 'description' argument
First you flatten the list appropriately, then you can loop over it in a the regular manner.
flattenlist <- function(x){
morelists <- sapply(x, function(xprime) class(xprime)[1]=="list")
out <- c(x[!morelists], unlist(x[morelists], recursive=FALSE))
if (sum(morelists)) {
Recall(out)
} else {
return(out)
}
}
l <- list(a=list(1:2, b=2:4),
b=c("A", "B", "C"),
z=1,
m=matrix(4:1, 2),
d=data.frame(x=1:4, y=c(1, 3, 2, 4))
)
l.f <- flattenlist(l)
n <- paste0("robj_", names(l.f), ".csv")
sapply(1:length(l.f), function(x) write.csv(l.f[[x]], file=n[x]))

How to bind rows without losing those with character(0)?

I have a list like L (comes from a vector splitting).
L <- strsplit(c("1 5 9", "", "3 7 11", ""), " ")
# [[1]]
# [1] "1" "5" "9"
#
# [[2]]
# character(0)
#
# [[3]]
# [1] "3" "7" "11"
#
# [[4]]
# character(0)
When I do an ordinary rbind as follows, I'm losing all the character(0) rows.
do.call(rbind, L)
# [,1] [,2] [,3]
# [1,] "1" "5" "9"
# [2,] "3" "7" "11"
Do I always have to do a lapply like the following or have I missed something?
do.call(rbind, lapply(L, function(x)
if (length(x) == 0) rep("", 3) else x))
# [,1] [,2] [,3]
# [1,] "1" "5" "9"
# [2,] "" "" ""
# [3,] "3" "7" "11"
# [4,] "" "" ""
Base R answers are preferred.
If you use lapply you don't have to worry about length so you can skip the rep part it will automatically be recycled across columns.
do.call(rbind, lapply(L, function(x) if (length(x) == 0) "" else x))
# [,1] [,2] [,3]
#[1,] "1" "5" "9"
#[2,] "" "" ""
#[3,] "3" "7" "11"
#[4,] "" "" ""
Another option using same logic as #NelsonGon we can replace the empty lists with blank and then rbind.
L[lengths(L) == 0] <- ""
do.call(rbind, L)
# [,1] [,2] [,3]
#[1,] "1" "5" "9"
#[2,] "" "" ""
#[3,] "3" "7" "11"
#[4,] "" "" ""
Maybe this roundabout using data.table suits you:
L <- data.table::tstrsplit(c("1 5 9", "", "3 7 11", ""), " ", fill="")
t(do.call(rbind,L))
With plyr then proceed with replacement. Since OP asked for base R, see below.
plyr::ldply(L,rbind)
1 2 3
1 1 5 9
2 <NA> <NA> <NA>
3 3 7 11
4 <NA> <NA> <NA>
A less efficient base R way:
L <- strsplit(c("1 5 9", "", "3 7 11", ""), " ")
L[lapply(L,length)==0]<-"Miss"
res<-Reduce(rbind,L)
res[res=="Miss"]<-""
Result:
[,1] [,2] [,3]
init "1" "5" "9"
"" "" ""
"3" "7" "11"
"" "" ""
That is the defined behavior for scenarios like that. As written in ?rbind:
For cbind (rbind), vectors of zero length (including NULL) are ignored
unless the result would have zero rows (columns), for S compatibility.
(Zero-extent matrices do not occur in S3 and are not ignored in R.)
When you inspect your elements, you see that it is true:
length(L[[1]])
[1] 3
length(L[[2]])
[1] 0
However, as you see, multiple workarounds are possible.
We can use stri_list2matrix in a simple way
library(stringi)
stri_list2matrix(L, byrow = TRUE, fill = "")
# [,1] [,2] [,3]
#[1,] "1" "5" "9"
#[2,] "" "" ""
#[3,] "3" "7" "11"
#[4,] "" "" ""

subsetting using a threshold in R and returning a neighboring row using for loops

So I have been stumped on this problem for being able to build a table based off a threshold using R. All I want to do is groupby(set_a,set_b) and select a value based on whether rank_of_values is greater than .80 (utilizing some paste a the same time) to build a table. My criteria is, select the max rank_of_values and return the value in a table if true in the table. If that part is false, append in the table the first and second rank_of_values =value in the table. I was able to figure this out previously when there was only one group in set_b but now that there are multiple I am having trouble. I also have tried to figure out the warning but have been stumped tried for length error but no luck so far. Any help and advice would be appreciated.
library(data.table)
#create sample data
set_a <- c("a","a","a","a","b","b","b","b","c","c","c","c","a","a","a","a","b","b","b","b","c","c","c","c","a","a","a","a","b","b","b","b","c","c","c","c")
set_b <- c("red","red","red","red","red","red","red","red","red","red","red","red","blue","blue","blue","blue","blue","blue","blue","blue","blue","blue","blue","blue","green","green","green","green","green","green","green","green","green","green","green","green")
#value <- c(sample(1:100,size = 36,replace = T))
value <- c(19,15,3,62,61,17,2,31,16,511,2,64,81,51,58,94,81,79,23,35,9,40,54,78,78,56,11,58,99,74,36,58,5,47,39,98)
a = data.frame(set_a,set_b,value)
a = data.table(a)
#This function calculates the average of the counts
a = a[,rank_of_values:= value/sum(value), by=list(set_a,set_b)]
head(a)
mn <- c() #create matrix to fill
mn0 <- c() #temp matrix
colu = unique(as.character(a$set_a))
colu2 = unique(as.character(a$set_b))
for (i in seq_along(colu))
{
#subset the data table for the set_a:
t = subset(a, set_a == colu[i])
for (i in seq_along(colu2)){
t2 = subset(t, set_b == colu2[i])
#subsetting the data by the set_b
if(t2$rank_of_values > .8){
mn <- cbind(as.character(t2$set_a[i]),paste0(t2$set_b[i],"_a"),t2$value[i])
mn0 <- rbind(mn,mn0)
mn2 <- cbind(as.character(t2$set_a[i]),paste0(t2$set_b[i],"_b"),t2$value[i])
mn0 <- rbind(mn0,mn2)
}
else
{
t2[order(-rank_of_values)][,.SD[1:2]] #create a second data table to select for
#order the data.table and
mn3 <- cbind(as.character(t2$set_a[1]),paste0(t2$set_b[1],"_a"),t2$value[1])
mn0 <- rbind(mn0,mn3)
mn4 <- cbind(as.character(t2$set_a[2]),paste0(t2$set_b[2],"_b"),t2$value[2])
mn0 <- rbind(mn0,mn4)
}
}
}
mn0
Current result:
[,1] [,2] [,3]
[1,] "a" "red_a" "19"
[2,] "a" "red_b" "15"
[3,] "a" "blue_a" "81"
[4,] "a" "blue_b" "51"
[5,] "a" "green_a" "78"
[6,] "a" "green_b" "56"
[7,] "b" "red_a" "61"
[8,] "b" "red_b" "17"
[9,] "b" "blue_a" "81"
[10,] "b" "blue_b" "79"
[11,] "b" "green_a" "99"
[12,] "b" "green_b" "74"
[13,] "c" "red_a" "16"
[14,] "c" "red_b" "511"
[15,] "c" "blue_a" "9"
[16,] "c" "blue_b" "40"
[17,] "c" "green_a" "5"
[18,] "c" "green_b" "47"
Desired Result:
[,1] [,2] [,3]
[1,] "a" "red_a" "62"
[2,] "a" "red_b" "19"
[3,] "a" "blue_a" "94"
[4,] "a" "blue_b" "81"
[5,] "a" "green_a" "78"
[6,] "a" "green_b" "58"
[7,] "b" "red_a" "61"
[8,] "b" "red_b" "31"
[9,] "b" "blue_a" "81"
[10,] "b" "blue_b" "79"
[11,] "b" "green_a" "99"
[12,] "b" "green_b" "74"
[13,] "c" "red_a" "511"
[14,] "c" "red_b" "64"
[15,] "c" "blue_a" "78"
[16,] "c" "blue_b" "54"
[17,] "c" "green_a" "98"
[18,] "c" "green_b" "47"
Warning Message Received:
Warning messages:
1: In if (t2$rank_of_values > 0.8) { :
the condition has length > 1 and only the first element will be used
2: In if (t2$rank_of_values > 0.8) { :
the condition has length > 1 and only the first element will be used
3: In if (t2$rank_of_values > 0.8) { :
the condition has length > 1 and only the first element will be used
4: In if (t2$rank_of_values > 0.8) { :
the condition has length > 1 and only the first element will be used
5: In if (t2$rank_of_values > 0.8) { :
the condition has length > 1 and only the first element will be used
6: In if (t2$rank_of_values > 0.8) { :
the condition has length > 1 and only the first element will be used
7: In if (t2$rank_of_values > 0.8) { :
the condition has length > 1 and only the first element will be used
8: In if (t2$rank_of_values > 0.8) { :
the condition has length > 1 and only the first element will be used
9: In if (t2$rank_of_values > 0.8) { :
the condition has length > 1 and only the first element will be used
I came up with a solution to my question after searching the forums.
mn <- c() #create matrix to fill
mn0 <- c() #temp matrix
colu = unique(as.character(a$set_a))
colu2 = unique(as.character(a$set_b))
for (i in colu){
for (j in colu2){
t = a[a$set_a %in% i & a$set_b %in% j,][order(-rank_of_values)]
for(z in t$rank_of_values[[nrow(t)]]){
if(t$rank_of_values[[z]] > .8){
print(z)
mn <- cbind(as.character(t$set_a[[z]]),paste0(t$set_b[[z]],"_a"),t$value[[z]])
mn0 <- rbind(mn,mn0)
mn2 <- cbind(as.character(t$set_a[[z]]),paste0(t$set_b[[z]],"_b"),t$value[[z]])
mn0 <- rbind(mn0,mn2)
}
else{
#create a second data table to select for
#order the data.table and
mn3 <- cbind(as.character(t$set_a[[1]]),paste0(t$set_b[[1]],"_a"),t$value[[1]])
mn0 <- rbind(mn0,mn3)
mn4 <- cbind(as.character(t$set_a[[2]]),paste0(t$set_b[[2]],"_b"),t$value[[2]])
mn0 <- rbind(mn0,mn4)
}
}
}
}
Result
[,1] [,2] [,3]
[1,] "a" "red_a" "62"
[2,] "a" "red_b" "19"
[3,] "a" "blue_a" "94"
[4,] "a" "blue_b" "81"
[5,] "a" "green_a" "78"
[6,] "a" "green_b" "58"
[7,] "b" "red_a" "61"
[8,] "b" "red_b" "31"
[9,] "b" "blue_a" "81"
[10,] "b" "blue_b" "79"
[11,] "b" "green_a" "99"
[12,] "b" "green_b" "74"
[13,] "c" "red_a" "511"
[14,] "c" "red_b" "64"
[15,] "c" "blue_a" "78"
[16,] "c" "blue_b" "54"
[17,] "c" "green_a" "98"
[18,] "c" "green_b" "47"

Subset dataframe into equal subgroup chunks

I have df dataframe that needs subsetting into chunks of 2 names. From example below, there are 4 unique names: a,b,c,d. I need to subset into 2 one column matrices a,b and c,d.
Output format:
name1
item_value
item_value
...
END
name2
item_value
item_value
...
END
Example:
#dummy data
df <- data.frame(name=sort(c(rep(letters[1:4],2),"a","a","c")),
item=round(runif(11,1,10)),
stringsAsFactors=FALSE)
#tried approach - split per name. I need to split per 2 names.
lapply(split(df,f=df$name),
function(x)
{name <- unique(x$name)
as.matrix(c(name,x[,2],"END"))
})
#expected output
[,1]
[1,] "a"
[2,] "8"
[3,] "9"
[4,] "6"
[5,] "4"
[6,] "END"
[1,] "b"
[2,] "2"
[3,] "10"
[4,] "END"
[,2]
[1,] "c"
[2,] "6"
[3,] "6"
[4,] "2"
[5,] "END"
[1,] "d"
[2,] "4"
[3,] "1"
[4,] "END"
Note: Actual df has ~300000 rows with ~35000 unique names.
You may try this.
# for each 'name', "pad" 'item' with 'name' and 'END'
l1 <- lapply(split(df, f = df$name), function(x){
name <- unique(x$name)
as.matrix(c(name, x$item, "END"))
})
# create a sequence of numbers, to select two by two elements from the list
steps <- seq(from = 0, to = length(unique(df$name))/2, by = 2)
# loop over 'steps' to bind together list elements, two by two.
l2 <- lapply(steps, function(x){
do.call(rbind, l1[1:2 + x])
})
l2
# [[1]]
# [,1]
# [1,] "a"
# [2,] "6"
# [3,] "4"
# [4,] "10"
# [5,] "3"
# [6,] "END"
# [7,] "b"
# [8,] "6"
# [9,] "7"
# [10,] "END"
#
# [[2]]
# [,1]
# [1,] "c"
# [2,] "2"
# [3,] "6"
# [4,] "10"
# [5,] "END"
# [6,] "d"
# [7,] "5"
# [8,] "4"
# [9,] "END"
Instead of making the lists from individual names make it from the column of subsets of the data.frame
res <- list("a_b" = c(df[df$name == "a",2],"END",df[df$name == "b", 2],"END"),
"c_d" = c(df[df$name == "c",2],"END", df[df$name == "d", 2],"END"))
res2 <- vector(mode="list",length=2)
res2 <- sapply(1:(length(unique(df$name))/2),function(x) {
sapply(seq(1,length(unique(df$name))-1,by=2), function(y) {
name <- unique(df$name)
res2[x] <- as.matrix(c(name[y],df[df$name == name[y],2],"END",name[y+1],df[df$name == name[y+1],2],"END"))
})
})
answer <- res2[,1]
This is giving me a matrix of lists since there are two sapplys happening, I think everything you want is in res2[,1]

Resources