combine tables into a data frame - r

How do I turn a list of tables into a data frame?
I have:
> (tabs <- list(table(c('a','a','b')),table(c('c','c','b')),table(c()),table(c('b','b'))))
[[1]]
a b
2 1
[[2]]
b c
1 2
[[3]]
< table of extent 0 >
[[4]]
b
2
I want:
> data.frame(a=c(2,0,0),b=c(1,1,2),c=c(0,2,0))
a b c
1 2 1 0
2 0 1 2
3 0 0 0
4 0 2 0
PS. Please do not assume that the tables were created by table calls! They were not!

c_names <- unique(unlist(sapply(tabs, names)))
df <- do.call(rbind, lapply(tabs, `[`, c_names))
colnames(df) <- c_names
df[is.na(df)] <- 0

This assumes the tables are one dimensional.
all.names <- unique(unlist(lapply(tabs, names)))
df <- as.data.frame(do.call(rbind,
lapply(
tabs, function(x) as.list(replace(c(x)[all.names], is.na(c(x)[all.names]), 0))
) ) )
names(df) <- all.names
df
There is probably a cleaner way to do this.
# a b c
# 1 2 1 0
# 2 0 1 2
# 3 0 0 0
# 4 0 2 0

tabs <- list(table(c('a','a','b')),table(c('c','c','b')),table(c()),table(c('b','b')))
dat.names <- unique(unlist(sapply(tabs, names)))
dat <- matrix(0, nrow = length(tabs), ncol = length(dat.names))
colnames(dat) <- dat.names
for (ii in 1:length(tabs)) {
dat[ii, ] <- tabs[[ii]][match(colnames(dat), names(tabs[[ii]]) )]
}
dat[is.na(dat)] <- 0
> dat
a b c
[1,] 2 1 0
[2,] 0 1 2
[3,] 0 0 0
[4,] 0 2 0

Here is a pretty clean approach:
library(reshape2)
newTabs <- melt(tabs)
newTabs
# Var1 value L1
# 1 a 2 1
# 2 b 1 1
# 3 b 1 2
# 4 c 2 2
# 5 b 2 4
newTabs$L1 <- factor(newTabs$L1, seq_along(tabs))
dcast(newTabs, L1 ~ Var1, fill = 0, drop = FALSE)
# L1 a b c
# 1 1 2 1 0
# 2 2 0 1 2
# 3 3 0 0 0
# 4 4 0 2 0
This makes use of the fact that there is a melt method for lists (see reshape2:::melt.list) which automatically adds in a variable (L1 for an unnested list) that identifies the index of the list element. Since your list has some items which are empty, they won't show up in your melted list, so you need to factor the "L1" column, specifying the levels you want. dcast takes care of restructuring your output and allows you to specify the desired fill value.

Related

Separate long numbers into individual components in a data frame

I have a data frame with several hundred vectors that look like this
To analyze them I must split the columns so that the number is separated into its individual components in the rows beneath.
V1
0
0
0
0
0
0
...
I've tried using this code and tweaking it but I can't get it to work. There are 2225 columns and the vectors are not all the same size.
text_data <- read_excel("./data/wordcount_vectors.xlsx")
text_vector_data <- text_data %>% select(wordcountvec)
wordvec_list <- c()
for (i in 1:nrow(text_vector_data)){
text_vector_data[i,] <- removePunctuation(as.character(text_vector_data[i,]))
x <- as.list(text_vector_data[i,])
wordvec_list <- c(wordvec_list, x)
}
wordvec_df <- as.data.frame(wordvec_list)
df <- data.frame(matrix(ncol = 2225, nrow = 1106))
for (i in 1:ncol(text_vector_data)){ #Change range depending on size of c
c <- as.numeric(strsplit(as.character(wordvec_df[i]), "")[[1]])
dd[i] <- c[[i]]
}
word_vec_df <- Filter(function(x)!all(is.na(x)), dd)
row.names(word_vec_df)<- NULL ; colnames(word_vec_df)<- NULL
word_vec_df <- t(word_vec_df)
here's some toy data to try
v1 <- (100011000)
v2 <- (10102100)
v3 <- (1120210011)
wordcount_df <- data_frame(v1,v2,v3)
First, you need to read your data as character not numeric because otherwise the leading zeros will be lost:
text_data <- read_excel("./data/wordcount_vectors.xlsx", col_types="text")
Your example does not include spaces between the 1's and 0's but your picture does. Using your provided data:
wordcount_dfc <- as.character(wordcount_df)
wordcount_dfc
# [1] "100011000" "10102100" "1120210011"
wordcount_lst <- strsplit(wordcount_dfc, "") # Use " " if the values are separated by spaces
wordcount_lst <-sapply(wordcount_lst, as.integer)
wordcount_lst
# [[1]]
# [1] 1 0 0 0 1 1 0 0 0
#
# [[2]]
# [1] 1 0 1 0 2 1 0 0
#
# [[3]]
# [1] 1 1 2 0 2 1 0 0 1 1
sapply(wordcount_lst, length)
# [1] 9 8 10
You cannot just bind the columns together because they are of different lengths. The simplest approach would be to use wordcount_lst directly, but if you need to make a data frame, you need to add NAs to pad out short vectors:
size <- max(sapply(wordcount_lst, length))
wordcount_df <- data.frame(sapply(wordcount_lst, function(x) c(x, rep(NA, size - length(x)))))
wordcount_df
# X1 X2 X3
# 1 1 1 1
# 2 0 0 1
# 3 0 1 2
# 4 0 0 0
# 5 1 2 2
# 6 1 1 1
# 7 0 0 0
# 8 0 0 0
# 9 0 NA 1
# 10 NA NA 1

Separate data in R [duplicate]

I have a character vector like this:
a <- c("a,b,c", "a,b", "a,b,c,d")
What I would like to do is create a data frame where the individual letters in each string are represented by dummy columns:
a b c d
1] 1 1 1 0
2] 1 1 0 0
3] 1 1 1 1
I have a feeling that I need to be using some combination of read.table and reshape but am really struggling. Any and help appreciated.
You can try cSplit_e from my "splitstackshape" package:
library(splitstackshape)
a <- c("a,b,c", "a,b", "a,b,c,d")
cSplit_e(as.data.table(a), "a", ",", type = "character", fill = 0)
# a a_a a_b a_c a_d
# 1: a,b,c 1 1 1 0
# 2: a,b 1 1 0 0
# 3: a,b,c,d 1 1 1 1
cSplit_e(as.data.table(a), "a", ",", type = "character", fill = 0, drop = TRUE)
# a_a a_b a_c a_d
# 1: 1 1 1 0
# 2: 1 1 0 0
# 3: 1 1 1 1
There's also mtabulate from "qdapTools":
library(qdapTools)
mtabulate(strsplit(a, ","))
# a b c d
# 1 1 1 1 0
# 2 1 1 0 0
# 3 1 1 1 1
A very direct base R approach is to use table along with stack and strsplit:
table(rev(stack(setNames(strsplit(a, ",", TRUE), seq_along(a)))))
# values
# ind a b c d
# 1 1 1 1 0
# 2 1 1 0 0
# 3 1 1 1 1
Another convoluted base-R solution:
x <- strsplit(a,",")
xl <- unique(unlist(x))
t(sapply(x,function(z)table(factor(z,levels=xl))))
which gives
a b c d
[1,] 1 1 1 0
[2,] 1 1 0 0
[3,] 1 1 1 1
A base R - but longer solution:
el = unique(unlist(strsplit(a, ',')))
do.call(rbind, lapply(a, function(u) setNames(el %in% strsplit(u,',')[[1]]+0L, el))
# a b c d
#[1,] 1 1 1 0
#[2,] 1 1 0 0
#[3,] 1 1 1 1
Another option is tstrsplit() from data.table:
library(data.table)
vapply(tstrsplit(a, ",", fixed = TRUE, fill = 0), ">", integer(length(a)), 0L)
# [,1] [,2] [,3] [,4]
# [1,] 1 1 1 0
# [2,] 1 1 0 0
# [3,] 1 1 1 1
After I wrote this I noticed that Colonel Beauvel's solution is quite similar but perhaps this is sufficiently distinct to be a separate solution. No packages are used.
First we split the character strings into a list of vectors, L and then we compute the union of them all, u. Finally we determine a binary vector for each list element and rbind them together, convert the result from logical to numeric using + 0 and set the column names.
L <- strsplit(a, ",")
u <- Reduce(union, L)
m <- do.call(rbind, lapply(L, `%in%`, x = u)) + 0
colnames(m) <- u
giving:
> m
a b c d
[1,] 1 1 1 0
[2,] 1 1 0 0
[3,] 1 1 1 1
Added The last two lines of code could be replaced by either of these:
do.call(rbind, lapply(lapply(L, factor, levels = u), table))
do.call(rbind, Map(function(x) sapply(u, `%in%`, x), L)) + 0
Unfortunately, base R does not offer a vectorized string matching function but the stringi package does.
library(stringi)
a=c("a,b,c", "a,b", "a,b,c,d")
1*outer(a,unique(unlist(strsplit(a,","))),stri_detect_regex)
# [,1] [,2] [,3] [,4]
#[1,] 1 1 1 0
#[2,] 1 1 0 0
#[3,] 1 1 1 1
I've had a lot of success with dummy_cols within fastDummies which can take care of this fairly simply and can be specified by variable.
library(fastDummies)
a <- c("a,b,c", "a,b", "a,b,c,d")
a <- dummy_cols(a, split = ",")
outputs
# .data .data_a .data_b .data_c .data_d
# 1 a,b,c 1 1 1 0
# 2 a,b 1 1 0 0
# 3 a,b,c,d 1 1 1 1

Split character column into several binary (0/1) columns

I have a character vector like this:
a <- c("a,b,c", "a,b", "a,b,c,d")
What I would like to do is create a data frame where the individual letters in each string are represented by dummy columns:
a b c d
1] 1 1 1 0
2] 1 1 0 0
3] 1 1 1 1
I have a feeling that I need to be using some combination of read.table and reshape but am really struggling. Any and help appreciated.
You can try cSplit_e from my "splitstackshape" package:
library(splitstackshape)
a <- c("a,b,c", "a,b", "a,b,c,d")
cSplit_e(as.data.table(a), "a", ",", type = "character", fill = 0)
# a a_a a_b a_c a_d
# 1: a,b,c 1 1 1 0
# 2: a,b 1 1 0 0
# 3: a,b,c,d 1 1 1 1
cSplit_e(as.data.table(a), "a", ",", type = "character", fill = 0, drop = TRUE)
# a_a a_b a_c a_d
# 1: 1 1 1 0
# 2: 1 1 0 0
# 3: 1 1 1 1
There's also mtabulate from "qdapTools":
library(qdapTools)
mtabulate(strsplit(a, ","))
# a b c d
# 1 1 1 1 0
# 2 1 1 0 0
# 3 1 1 1 1
A very direct base R approach is to use table along with stack and strsplit:
table(rev(stack(setNames(strsplit(a, ",", TRUE), seq_along(a)))))
# values
# ind a b c d
# 1 1 1 1 0
# 2 1 1 0 0
# 3 1 1 1 1
Another convoluted base-R solution:
x <- strsplit(a,",")
xl <- unique(unlist(x))
t(sapply(x,function(z)table(factor(z,levels=xl))))
which gives
a b c d
[1,] 1 1 1 0
[2,] 1 1 0 0
[3,] 1 1 1 1
A base R - but longer solution:
el = unique(unlist(strsplit(a, ',')))
do.call(rbind, lapply(a, function(u) setNames(el %in% strsplit(u,',')[[1]]+0L, el))
# a b c d
#[1,] 1 1 1 0
#[2,] 1 1 0 0
#[3,] 1 1 1 1
Another option is tstrsplit() from data.table:
library(data.table)
vapply(tstrsplit(a, ",", fixed = TRUE, fill = 0), ">", integer(length(a)), 0L)
# [,1] [,2] [,3] [,4]
# [1,] 1 1 1 0
# [2,] 1 1 0 0
# [3,] 1 1 1 1
After I wrote this I noticed that Colonel Beauvel's solution is quite similar but perhaps this is sufficiently distinct to be a separate solution. No packages are used.
First we split the character strings into a list of vectors, L and then we compute the union of them all, u. Finally we determine a binary vector for each list element and rbind them together, convert the result from logical to numeric using + 0 and set the column names.
L <- strsplit(a, ",")
u <- Reduce(union, L)
m <- do.call(rbind, lapply(L, `%in%`, x = u)) + 0
colnames(m) <- u
giving:
> m
a b c d
[1,] 1 1 1 0
[2,] 1 1 0 0
[3,] 1 1 1 1
Added The last two lines of code could be replaced by either of these:
do.call(rbind, lapply(lapply(L, factor, levels = u), table))
do.call(rbind, Map(function(x) sapply(u, `%in%`, x), L)) + 0
Unfortunately, base R does not offer a vectorized string matching function but the stringi package does.
library(stringi)
a=c("a,b,c", "a,b", "a,b,c,d")
1*outer(a,unique(unlist(strsplit(a,","))),stri_detect_regex)
# [,1] [,2] [,3] [,4]
#[1,] 1 1 1 0
#[2,] 1 1 0 0
#[3,] 1 1 1 1
I've had a lot of success with dummy_cols within fastDummies which can take care of this fairly simply and can be specified by variable.
library(fastDummies)
a <- c("a,b,c", "a,b", "a,b,c,d")
a <- dummy_cols(a, split = ",")
outputs
# .data .data_a .data_b .data_c .data_d
# 1 a,b,c 1 1 1 0
# 2 a,b 1 1 0 0
# 3 a,b,c,d 1 1 1 1

Aggregating every 10 columns in binary matrice

I am new to R.
I would like to transform a binary matrix like this:
example:
" 1874 1875 1876 1877 1878 .... 2009
F 1 0 0 0 0 ... 0
E 1 1 0 0 0 ... 0
D 1 1 0 0 0 ... 0
C 1 1 0 0 0 ... 0
B 1 1 0 0 0 ... 0
A 1 1 0 0 0 ... 0"
Since, columns names are years I would like to aggregate them in decades and obtain something like:
"1840-1849 1850-1859 1860-1869 .... 2000-2009
F 1 0 0 0 0 ... 0
E 1 1 0 0 0 ... 0
D 1 1 0 0 0 ... 0
C 1 1 0 0 0 ... 0
B 1 1 0 0 0 ... 0
A 1 1 0 0 0 ... 0"
I am used to python and do not know how to do this transformation without making loops!
Thanks, isabel
It is unclear what aggregation you want, but using the following dummy data
set.seed(42)
df <- data.frame(matrix(sample(0:1, 6*25, replace = TRUE), ncol = 25))
names(df) <- 1874 + 0:24
The following counts events in each 10-year period.
Get the years as a numeric variable
years <- as.numeric(names(df))
Next we need an indicator for the start of each decade
ind <- seq(from = signif(years[1], 3), to = signif(tail(years, 1), 3), by = 10)
We then apply over the indices of ind (1:(length(ind)-1)), select columns from df that are the current decade and count the 1s using rowSums.
tmp <- lapply(seq_along(ind[-1]),
function(i, inds, data) {
rowSums(data[, names(data) %in% inds[i]:(inds[i+1]-1)])
}, inds = ind, data = df)
Next we cbind the resulting vectors into a data frame and fix-up the column names:
out <- do.call(cbind.data.frame, tmp)
names(out) <- paste(head(ind, -1), tail(ind, -1) - 1, sep = "-")
out
This gives:
> out
1870-1879 1880-1889 1890-1899
1 4 5 6
2 4 6 6
3 2 5 5
4 5 5 7
5 3 3 7
6 5 5 4
If you want simply a binary matrix with a 1 indicating at least 1 event happened in that decade, then you can use:
tmp2 <- lapply(seq_along(ind[-1]),
function(i, inds, data) {
as.numeric(rowSums(data[, names(data) %in% inds[i]:(inds[i+1]-1)]) > 0)
}, inds = ind, data = df)
out2 <- do.call(cbind.data.frame, tmp2)
names(out2) <- paste(head(ind, -1), tail(ind, -1) - 1, sep = "-")
out2
which gives:
> out2
1870-1879 1880-1889 1890-1899
1 1 1 1
2 1 1 1
3 1 1 1
4 1 1 1
5 1 1 1
6 1 1 1
If you want a different aggregation, then modify the function applied in the lapply call to use something other than rowSums.
This is another option, using modular arithmetic to aggregate the columns.
# setup, borrowed from #GavinSimpson
set.seed(42)
df <- data.frame(matrix(sample(0:1, 6*25, replace = TRUE), ncol = 25))
names(df) <- 1874 + 0:24
result <- do.call(cbind,
by(t(df), as.numeric(names(df)) %/% 10 * 10, colSums))
# add -xxx9 to column names, for each decade
dimnames(result)[[2]] <- paste(colnames(result), as.numeric(colnames(result)) + 9, sep='-')
# 1870-1879 1880-1889 1890-1899
# V1 4 5 6
# V2 4 6 6
# V3 2 5 5
# V4 5 5 7
# V5 3 3 7
# V6 5 5 4
If you wanted to aggregate with something other than sum, replace the call to
colSums with something like function(cols) lapply(cols, f), where f is the aggregating
function, e.g., max.

Generate vectors using R

I would like to ask,if some of You dont know any simple way to solve this kind of problem:
I need to generate all combinations of A numbers taken from a set B (0,1,2...B), with their sum = C.
ie if A=2, B=3, C=2:
Solution in this case:
(1,1);(0,2);(2,0)
So the vectors are length 2 (A), sum of all its items is 2 (C), possible values for each of vectors elements come from the set {0,1,2,3} (maximum is B).
A functional version since I already started before SO updated:
A=2
B=3
C=2
myfun <- function(a=A, b=B, c=C) {
out <- do.call(expand.grid, lapply(1:a, function(x) 0:b))
return(out[rowSums(out)==c,])
}
> out[rowSums(out)==c,]
Var1 Var2
3 2 0
6 1 1
9 0 2
z <- expand.grid(0:3,0:3)
z[rowSums(z)==2, ]
Var1 Var2
3 2 0
5 1 1
7 0 2
If you wanted to do the expand grid programmatically this would work:
z <- expand.grid( rep( list(C), A) )
You need to expand as a list so that the items remain separate. rep(0:3, 3) would not return 3 separate sequences. So for A=3:
> z <- expand.grid(rep(list(0:3), 3))
> z[rowSums(z)==2, ]
Var1 Var2 Var3
3 2 0 0
6 1 1 0
9 0 2 0
18 1 0 1
21 0 1 1
33 0 0 2
Using the nifty partitions() package, and more interesting values of A, B, and C:
library(partitions)
A <- 2
B <- 5
C <- 7
comps <- t(compositions(C, A))
ii <- apply(comps, 1, FUN=function(X) all(X %in% 0:B))
comps[ii, ]
# [,1] [,2]
# [1,] 5 2
# [2,] 4 3
# [3,] 3 4
# [4,] 2 5

Resources