Transform variable length list into matrix in R - r

If I had a list of vectors of variable lengths :
[[1]]
[1] 1 2 3 4
[[2]]
[1] 4 5 6
[[3]]
[1] 1 2 3 4 5 6 7 8 9
[[4]]
[1] 'a' 'b' 'c'
How could I transform this into a data frame / logical matrix with elements of the list represented as columns?
i.e. a data frame like:
1 2 3 4 5 6 7 8 9 'a' 'b' 'c'
[1] 1 1 1 1 0 0 0 0 0 0 0 0
[2] 0 0 0 1 1 1 0 0 0 0 0 0
[3] 1 1 1 1 1 1 1 1 1 0 0 0
[4] 0 0 0 0 0 0 0 0 0 1 1 1
some data:
x <- list(c(1, 2, 3, 4), c(4, 5, 6), c(1, 2, 3, 4, 5, 6, 7, 8, 9), c("a", "b", "c"))

Here is a base R option:
# Collect the distinct values found anywhere in x; each one becomes a column
uv <- unique(unlist(x))
# For each list element, mark (1/0) which of the distinct values it contains,
# then stack the resulting indicator vectors row-wise into a matrix
out <- do.call(rbind, lapply(x, function(e) as.integer(uv %in% e) ))
# Convert the matrix to a data.frame and use the distinct values as column names
out <- setNames(as.data.frame(out), uv)
out
1 2 3 4 5 6 7 8 9 a b c
1 1 1 1 1 0 0 0 0 0 0 0 0
2 0 0 0 1 1 1 0 0 0 0 0 0
3 1 1 1 1 1 1 1 1 1 0 0 0
4 0 0 0 0 0 0 0 0 0 1 1 1

Here is a base R option with stack and table
table(stack(setNames(x, seq_along(x)))[2:1])
# values
#ind 1 2 3 4 5 6 7 8 9 a b c
# 1 1 1 1 1 0 0 0 0 0 0 0 0
# 2 0 0 0 1 1 1 0 0 0 0 0 0
# 3 1 1 1 1 1 1 1 1 1 0 0 0
# 4 0 0 0 0 0 0 0 0 0 1 1 1

Something like this?
library(tidyverse)
x = list(c(1, 2, 3, 4), c(4, 5, 6), c(1, 2, 3, 4, 5, 6, 7, 8, 9))
y = tibble(column1= map_chr(x, str_flatten, " "))
Where y is this:
# A tibble: 3 x 1
column1
<chr>
1 1 2 3 4
2 4 5 6
3 1 2 3 4 5 6 7 8 9

Related

How to loop ifelse function through a grouped variable with dplyr

I'm trying to apply a rule for a group of IDs that, upon the first instance where the value for a variable in one row equals 1, all values for another variable in all subsequent rows in that group equal 1.
Essentially, here is what I am trying to do:
I have:
ID D
1 1
1 0
1 0
2 0
2 0
3 1
3 0
3 0
4 1
4 0
4 1
4 1
4 1
4 0
I want:
ID D PREV
1 1 0
1 0 1
1 0 1
2 0 0
2 0 0
3 1 0
3 0 1
3 0 1
4 1 0
4 0 1
4 1 1
4 1 1
4 0 1
I'm trying to use dplyr to iterate through a series of grouped rows, in each one applying an ifelse function. My code looks like this:
data$prev = 0
data <-
data %>%
group_by(id)%>%
mutate(prev = if_else(lag(prev) == 1 | lag(d) == 1, 1, 0))
But for some reason, this is not applying the ifelse function over the whole group, resulting in data that looks something like this:
ID D PREV
1 1 0
1 0 1
1 0 0
2 0 0
2 0 0
3 1 0
3 0 1
3 0 0
4 1 0
4 0 1
4 1 0
4 1 1
4 0 1
Can anyone help me with this?
What about this:
library(dplyr)
df %>%
group_by(ID) %>%
mutate(prev = +(cumsum(c(0, D[-length(D)])) > 0)) %>%
ungroup()
#> # A tibble: 14 x 3
#> ID D prev
#> <int> <int> <int>
#> 1 1 1 0
#> 2 1 0 1
#> 3 1 0 1
#> 4 2 0 0
#> 5 2 0 0
#> 6 3 1 0
#> 7 3 0 1
#> 8 3 0 1
#> 9 4 1 0
#> 10 4 0 1
#> 11 4 1 1
#> 12 4 1 1
#> 13 4 1 1
#> 14 4 0 1
To explain what it does, let's just take a simple vector.
The calc will be the same for each group.
Let x be our vector:
x <- c(0,0,0,1,1,0,0,2,3,4)
Do the cumulative sum over x
cumsum(x)
#> [1] 0 0 0 1 2 2 2 4 7 11
You are interested only in values above zero, therefore:
cumsum(x)>0
#> [1] FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
You don't want a logical vector, but a numeric one. A unary + does the trick:
+(cumsum(x)>0)
#> [1] 0 0 0 1 1 1 1 1 1 1
However, you want the 1s delayed by one position. Thus, we add a zero in front of x:
+(cumsum(c(0,x))>0)
#> [1] 0 0 0 0 1 1 1 1 1 1 1
We need to keep the same length, so we remove the last value of x.
+(cumsum(c(0, x[-length(x)])) > 0)
#> [1] 0 0 0 0 1 1 1 1 1 1
And that does the trick.
We can use lag
library(dplyr)
# Flag every row that comes after the first D == 1 within its ID group.
# The running indicator is converted to numeric before lagging so that it has
# the same type as `default = 0`; recent dplyr versions cast `default` to the
# type of the lagged vector, which would otherwise leave prev as logical.
df %>%
  group_by(ID) %>%
  mutate(prev = lag(as.numeric(cumsum(D) > 0), default = 0))
-output
# A tibble: 14 x 3
# Groups: ID [4]
# ID D prev
# <dbl> <dbl> <dbl>
# 1 1 1 0
# 2 1 0 1
# 3 1 0 1
# 4 2 0 0
# 5 2 0 0
# 6 3 1 0
# 7 3 0 1
# 8 3 0 1
# 9 4 1 0
#10 4 0 1
#11 4 1 1
#12 4 1 1
#13 4 1 1
#14 4 0 1
data
df <- data.frame(
ID = c(1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4),
D = c(1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0)
)
You can use dplyr::group_modify(), a newer dplyr function, to apply a function over each group:
df <- data.frame(
ID = c(1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4),
D = c(1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0)
)
# For each ID group, set prev to 1 on every row that follows the first row
# with D == 1, and to 0 on that row and all rows before it.
df %>%
  group_by(ID) %>%
  group_modify(function(x, y) {
    # x holds the rows of the current group; y (the group key) is not needed.
    # Shift D down by one row and take a running sum: it turns positive as
    # soon as any earlier row had D == 1, wherever that row sits in the group.
    x$prev <- as.numeric(cumsum(c(0, head(x$D, -1))) > 0)
    x
  })
# A tibble: 14 x 3
# Groups: ID [4]
ID D prev
<dbl> <dbl> <dbl>
1 1 1 0
2 1 0 1
3 1 0 1
4 2 0 0
5 2 0 0
6 3 1 0
7 3 0 1
8 3 0 1
9 4 1 0
10 4 0 1
11 4 1 1
12 4 1 1
13 4 1 1
14 4 0 1

Rename a range of columns in a dataframe

I'm trying to rename a range of columns in a dataframe so that they have the format [V1:V5]:
result_df = data.frame(V1 = 1, V2 = 2, V3 = 3, V4 = 4, V5 = 5, colnamethatshouldntberenamed = 6)
If the existing dataframe has the range of numbers somewhere in its column names, it's relatively straightforward (although I'm thinking there's probably a way to do it with one line of code, not two):
df1 = data.frame(X1q = 1, X2q = 2, X3q = 3, X4q = 4, X5q = 5, colnamethatshouldntberenamed = 6)
names(df1) <- gsub("X", "V", names(df1))
names(df1) <- gsub("q", "", names(df1))
But what if the column names have completely random names?
df2 = data.frame(name = 1, col = 2, random = 3, alsorandom = 4, somethingelse = 5, colnamethatshouldntberenamed = 6)
Is there a way to rename all of these columns in one-go? (assuming that they are adjoining columns in the dataframe, but there may be other columns in the dataframe with names that don't need to be changed)
If you have a different number of columns and/or you want to use the pipe (%>%), you can use purrr::set_names().
For example:
Sample data with 10 columns:
# 5 x 10 data frame of random 0/1 values (argument name spelled out in full;
# `rep` only worked through partial matching of `replace`)
example1 <- data.frame(replicate(10, sample(0:1, 5, replace = TRUE)))
example1
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
1 1 1 0 1 1 1 0 1 0 1
2 0 0 1 0 1 1 1 0 0 1
3 0 0 1 0 0 1 1 1 0 0
4 0 1 0 1 1 0 1 0 0 0
5 1 0 0 1 1 0 1 1 0 1
You can use seq_along inside set_names which will rename the columns by order (with piping):
example1 %>%
set_names(c(seq_along(example1)))
Results:
1 2 3 4 5 6 7 8 9 10
1 1 1 0 1 1 1 0 1 0 1
2 0 0 1 0 1 1 1 0 0 1
3 0 0 1 0 0 1 1 1 0 0
4 0 1 0 1 1 0 1 0 0 0
5 1 0 0 1 1 0 1 1 0 1
Same idea with 15 columns and naming them using paste in set_names:
# 10 x 15 data frame of random 0/1 values
example2 <- data.frame(replicate(15, sample(0:1, 10, replace = TRUE)))
# Rename the columns by position: VarNum1, VarNum2, ...
example2 %>%
  set_names(paste0("VarNum", seq_along(example2)))
Results
VarNum1 VarNum2 VarNum3 VarNum4 VarNum5 VarNum6 VarNum7 VarNum8 VarNum9 VarNum10 VarNum11 VarNum12 VarNum13 VarNum14 VarNum15
1 0 1 0 0 0 0 0 1 1 1 1 0 1 0 1
2 1 1 0 0 0 1 0 1 1 0 0 0 1 0 1
3 1 1 0 1 0 1 1 1 1 1 1 0 1 0 1
4 0 0 0 0 1 1 1 1 0 1 1 0 0 0 1
5 1 1 0 1 0 0 1 0 0 1 1 0 0 0 0

hex2bin in data frame

I'm new to R and excited about all the possibilities of data management and presentation.
Actually I have a problem and did not find any solution:
I have built a data frame with:
require(BMS)
n = c(1, 2, 3, 4, 5, 6, 7, 8)
s = c("55aa55aa", "aa55aa55", "12345678", "9ABCDEF0", "55aa55aa", "aa55aa55", "12345678", "9ABCDEF0")
df = data.frame(n, s)
df$s <- as.character(df$s)
df
# n s
# 1 1 55aa55aa
# 2 2 aa55aa55
# 3 3 12345678
# 4 4 9ABCDEF0
# 5 5 55aa55aa
# 6 6 aa55aa55
# 7 7 12345678
# 8 8 9ABCDEF0
Column s is a 32bit hex value which I want to add as the real bit string to the data frame as new column sbin.
It should look like this afterwards:
df
# n s sbin
# 1 1 55aa55aa 01010101101010100101010110101010
# 2 2 aa55aa55 10101010010101011010101001010101
# 3 3 12345678 00010010001101000101011001111000
# 4 4 9ABCDEF0 .......
# 5 5 55aa55aa ......
# 6 6 aa55aa55
# 7 7 12345678
# 8 8 9ABCDEF0
For conversion I like to use the "hex2bin" function out of "BMS" package.
I tried this
lapply(df$s, hex2bin)
# [[1]]
# [1] 0 1 0 1 0 1 0 1 1 0 1 0 1 0 1 0 0 1 0 1 0 1 0 1 1 0 1 0 1 0 1 0
# [[2]]
# [1] 1 0 1 0 1 0 1 0 0 1 0 1 0 1 0 1 1 0 1 0 1 0 1 0 0 1 0 1 0 1 0 1
# [[3]]
# [1] 0 0 0 1 0 0 1 0 0 0 1 1 0 1 0 0 0 1 0 1 0 1 1 0 0 1 1 1 1 0 0 0
# .....
but did not get the required output.
In the end I would like to access each bit in the data frame rows. So I would like to get 32 vectors with 8 bits each in this example.
How about this?
# Convert each hex string to its bit vector and collapse the bits into one
# string. vapply() enforces exactly one character value per row and
# USE.NAMES = FALSE keeps the new column free of element names.
df$sbin <- vapply(
  df$s,
  function(x) paste(hex2bin(x), collapse = ""),
  character(1),
  USE.NAMES = FALSE
)
# n s sbin
# 1 1 55aa55aa 01010101101010100101010110101010
# 2 2 aa55aa55 10101010010101011010101001010101
# 3 3 12345678 00010010001101000101011001111000
# 4 4 9ABCDEF0 10011010101111001101111011110000
# 5 5 55aa55aa 01010101101010100101010110101010
# 6 6 aa55aa55 10101010010101011010101001010101
# 7 7 12345678 00010010001101000101011001111000
# 8 8 9ABCDEF0 10011010101111001101111011110000

Creating special matrix in R

I have a matrix as follows.
dat = matrix(c(0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 2, 3, 4, 5, 6), ncol=4)
colnames(dat)=c("m1","m2","m3","m4")
dat
m1 m2 m3 m4
1 0 1 0 2
2 0 0 0 3
3 1 1 0 4
4 1 1 1 5
5 1 1 1 6
I would like to create four 5×4 matrices, each obtained by multiplying every column of dat element-wise by one particular column: res1 = (m1*m1, m1*m2, m1*m3, m1*m4), res2 = (m1*m2, m2*m2, m2*m3, m2*m4), res3 = (m1*m3, m2*m3, m3*m3, m4*m3), res4 = (m1*m4, m2*m4, m3*m4, m4*m4), such as
res1
1 0 0 0 0
2 0 0 0 0
3 1 1 0 4
4 1 1 1 5
5 1 1 1 6
res2
1 1 1 0 2
2 0 0 0 0
3 1 1 0 4
4 1 1 1 5
5 1 1 1 6
res3
1 0 0 0 0
2 0 0 0 0
3 0 0 0 0
4 1 1 1 5
5 1 1 1 6
res4
1 0 2 0 4
2 0 0 0 9
3 4 4 0 16
4 5 5 5 25
5 6 6 6 36
How can I do it efficiently in R?
Running
# seq_len() yields an empty sequence (instead of c(1, 0)) if dat has no columns
res <- lapply(seq_len(ncol(dat)), function(i) dat * dat[, i])
will work thanks to the recycling of the element-wise multiplication. If you multiply by one column, those values will repeat over the entire matrix. And lapply will return them all in a list. You can get them out individually as res[[1]], res[[2]], etc.
# Build one matrix per column of dat: dat multiplied element-wise by column i
# (the column is recycled across the whole matrix).
# The result list is preallocated rather than grown inside the loop.
test <- vector("list", ncol(dat))
for (i in seq_len(ncol(dat))) {
  test[[i]] <- dat * dat[, i]
}
same as #Mrflick's comment
test[[2]]
m1 m2 m3 m4
[1,] 0 1 0 2
[2,] 0 0 0 0
[3,] 1 1 0 4
[4,] 1 1 1 5
[5,] 1 1 1 6

Splitting one column into multiple columns

I have a huge dataset in which there is one column including several values for each subject (row). Here is a simplified sample dataframe:
data <- data.frame(subject = c(1:8), sex = c(1, 2, 2, 1, 2, 1, 1, 2),
age = c(35, 29, 31, 46, 64, 57, 49, 58),
v1 = c("2", "0", "3,5", "2 1", "A,4", "B,1,C", "A and B,3", "5, 6 A or C"))
> data
subject sex age v1
1 1 1 35 2
2 2 2 29 0
3 3 2 31 3,5 # separated by a comma
4 4 1 46 2 1 # separated by a blank space
5 5 2 64 A,4
6 6 1 57 B,1,C
7 7 1 49 A and B,3
8 8 2 58 5, 6 A or C
I first want to remove the letters (A, B, A and B, …) in the fourth column (v1), and then split the fourth column into multiple columns just like this:
subject sex age x1 x2 x3 x4 x5 x6
1 1 1 35 0 1 0 0 0 0
2 2 2 29 0 0 0 0 0 0
3 3 2 31 0 0 1 0 1 0
4 4 1 46 1 1 0 0 0 0
5 5 2 64 0 0 0 1 0 0
6 6 1 57 1 0 0 0 0 0
7 7 1 49 0 0 1 0 0 0
8 8 2 58 0 0 0 0 1 1
where the 1st subject takes 1 at x2 because it takes 2 at v1 in the original dataset, the 3rd subject takes 1 at both x3 and x5 because it takes 3 and 5 at v1 in the original dataset, and so on.
I would appreciate any help on this question. Thanks a lot.
You can cbind this result to data[-4] and get what you need:
0+t(sapply(as.character(data$v1), function(line)
sapply(1:6, function(x) x %in% unlist(strsplit(line, split="\\s|\\,"))) ))
#----------------
[,1] [,2] [,3] [,4] [,5] [,6]
2 0 1 0 0 0 0
0 0 0 0 0 0 0
3,5 0 0 1 0 1 0
2 1 1 1 0 0 0 0
A,4 0 0 0 1 0 0
B,1,C 1 0 0 0 0 0
A and B,3 0 0 1 0 0 0
5, 6 A or C 0 0 0 0 1 1
One solution:
# Pull the numbers out of each v1 entry: split on every run of non-digit
# characters, so letters, commas and spaces all act as separators.
# lapply() always returns a list, even if every row yields the same count.
r <- lapply(strsplit(as.character(data$v1), "[^0-9]+"), as.numeric)
# One row per subject with six 0/1 indicator columns; column k is 1 when the
# number k appears in that subject's v1.
m <- as.data.frame(t(vapply(r, function(x) {
  y <- rep(0, 6)
  # Empty pieces from the split become NA and are dropped; a 0 selects nothing
  y[x[!is.na(x)]] <- 1
  y
}, numeric(6))))
data <- cbind(data[, c("subject", "sex", "age")], m)
# subject sex age V1 V2 V3 V4 V5 V6
# 1 1 1 35 0 1 0 0 0 0
# 2 2 2 29 0 0 0 0 0 0
# 3 3 2 31 0 0 1 0 1 0
# 4 4 1 46 1 1 0 0 0 0
# 5 5 2 64 0 0 0 1 0 0
# 6 6 1 57 1 0 0 0 0 0
# 7 7 1 49 0 0 1 0 0 0
# 8 8 2 58 0 0 0 0 1 1
Following DWin's awesome solution, m could be modified as:
m <- as.data.frame(t(sapply(r, function(x) {
0 + 1:6 %in% x[!is.na(x)]
})))

Resources