Suppose I have a data set like this:
dat <- tibble(id = 1:4,
col1 = c(0, 1, 1, 0),
col2 = c(1, 0, 1, 0),
col3 = c(1, 1, 0, 1))
> dat
# A tibble: 4 × 4
id col1 col2 col3
<int> <dbl> <dbl> <dbl>
1 1 0 1 1
2 2 1 0 1
3 3 1 1 0
4 4 0 0 1
I'd like to separate, for every unique id, the multiple 1s into multiple rows, i.e. the expected output is:
# A tibble: 7 × 4
id col1 col2 col3
<dbl> <dbl> <dbl> <dbl>
1 1 0 1 0
2 1 0 0 1
3 2 1 0 0
4 2 0 0 1
5 3 1 0 0
6 3 0 1 0
7 4 0 0 1
For the first id (id = 1), col2 and col3 are both 1, so I would like a separate row for each of them. It kinda is like one-hot encoding for rows.
With help from Ritchie Sacramento and RobertoT
library(tidyverse)
dat <- tibble(id = 1:4,
col1 = c(0, 1, 1, 0),
col2 = c(1, 0, 1, 0),
col3 = c(1, 1, 0, 1))
dat %>%
pivot_longer(-id) %>%
filter(value != 0) %>%
mutate(rows = 1:nrow(.)) %>%
pivot_wider(values_fill = 0,
names_sort = TRUE) %>%
select(-rows)
# A tibble: 7 × 4
id col1 col2 col3
<int> <dbl> <dbl> <dbl>
1 1 0 1 0
2 1 0 0 1
3 2 1 0 0
4 2 0 0 1
5 3 1 0 0
6 3 0 1 0
7 4 0 0 1
Here is an alternative approach using model.matrix():
From the documenation: model.matrix creates a design (or model) matrix, e.g., by expanding factors to a set of dummy variables (depending on the contrasts) and expanding interactions similarly.
library(dplyr)
library(tidyr)
dat %>%
pivot_longer(-id) %>%
filter(value == 1) %>%
cbind((model.matrix(~ name + 0, .) == 1)*1)
id name value namecol1 namecol2 namecol3
1 1 col2 1 0 1 0
2 1 col3 1 0 0 1
3 2 col1 1 1 0 0
4 2 col3 1 0 0 1
5 3 col1 1 1 0 0
6 3 col2 1 0 1 0
7 4 col3 1 0 0 1
You could do
arrange(bind_rows(lapply(2:4, function(x) {
d <- dat[dat[[x]] == 1,]
d[-c(1, x)] <- 0
d})), id)
#> # A tibble: 7 x 4
#> id col1 col2 col3
#> <int> <dbl> <dbl> <dbl>
#> 1 1 0 1 0
#> 2 1 0 0 1
#> 3 2 1 0 0
#> 4 2 0 0 1
#> 5 3 1 0 0
#> 6 3 0 1 0
#> 7 4 0 0 1
Created on 2022-07-14 by the reprex package (v2.0.1)
Using explicit loops:
nullrow <- rep(0, ncol(dat)-1)
data <- dat[,-1]
rowsums <- apply(data, 1, sum)
res <- data[0,]
ids <- c()
for(i in 1:nrow(data)) {
if(rowsums[i]>0) {
for(j in 1:rowsums[i]) {
thisrow <- nullrow
thiscolumn <- which(data[i,]==1)[j]
thisrow[thiscolumn] <- 1
res <- rbind(res, thisrow)
}
ids <- c(ids, rep(dat$id[i], rowsums[i]))
}
}
names(res) <- colnames(data)
res$id <- ids
> res
col1 col2 col3 id
1 0 1 0 1
2 0 0 1 1
3 1 0 0 2
4 0 0 1 2
5 1 0 0 3
6 0 1 0 3
7 0 0 1 4
A possible solution, based on purrr:pmap_dfr and on the following ideas:
Loop over all dataframe rows.
Use each row to create a diagonal matrix with the contents of the diagonal being the dataframe row.
Filter out the rows that only have zeros.
library(tidyverse)
pmap_dfr(dat, ~ data.frame(id = ..1, diag(c(...)[-1]))) %>%
filter(if_any(X1:X3, ~ .x != 0))
#> id X1 X2 X3
#> 1 1 0 1 0
#> 2 1 0 0 1
#> 3 2 1 0 0
#> 4 2 0 0 1
#> 5 3 1 0 0
#> 6 3 0 1 0
#> 7 4 0 0 1
Another possible solution, based on Matrix::sparseMatrix:
First, it gets the indexes where there are 1 (with which).
Second, it adjusts the row indexes, to force one 1 per row.
Third, it creates a sparse matrix, putting the 1 where the adjusted indexes specify.
library(tidyverse)
library(Matrix)
which(dat[-1] == 1, arr.ind = T) %>%
as.data.frame %>%
arrange(row) %>%
mutate(id = dat[row,"id"], row = 1:n()) %>%
{data.frame(id = .$id, as.matrix( sparseMatrix(i = .$row, j= .$col, x= 1)))}
#> id X1 X2 X3
#> 1 1 0 1 0
#> 2 1 0 0 1
#> 3 2 1 0 0
#> 4 2 0 0 1
#> 5 3 1 0 0
#> 6 3 0 1 0
#> 7 4 0 0 1
Yet another possible solution:
library(tidyverse)
f <- function(df)
{
got <- 0
for (i in 1:nrow(df))
{
got <- which.max(df[i, (got+1):ncol(df)]) + got
df[i, -got] <- 0
}
df
}
dat %>%
slice(map(1:nrow(dat), ~ rep(.x, rowSums(dat[-1])[.x])) %>% unlist) %>%
group_by(id) %>%
group_modify(~ f(.)) %>%
ungroup
#> # A tibble: 7 × 4
#> id col1 col2 col3
#> <int> <dbl> <dbl> <dbl>
#> 1 1 0 1 0
#> 2 1 0 0 1
#> 3 2 1 0 0
#> 4 2 0 0 1
#> 5 3 1 0 0
#> 6 3 0 1 0
#> 7 4 0 0 1
Related
Consider this dataframe:
dat <- structure(list(col1 = c(1, 2, 0), col2 = c(0, 3, 2), col3 = c(1, 2, 3)), class = "data.frame", row.names = c(NA, -3L))
col1 col2 col3
1 1 0 1
2 2 3 2
3 0 2 3
How can one dummify rows? i.e. whenever there is a row with more than 1 non-0 value, separate the row into multiple rows with one non-0 value per row.
In this case, this would be:
col1 col2 col3
1 1 0 0
2 0 0 1
3 2 0 0
4 0 3 0
5 0 0 2
6 0 2 0
7 0 0 3
You can do:
library(tidyverse)
dat |>
pivot_longer(everything()) |>
mutate(id = 1:n()) |>
pivot_wider(values_fill = 0) |>
filter(!if_all(-id, ~ . == 0)) |>
select(-id)
# A tibble: 7 x 3
col1 col2 col3
<dbl> <dbl> <dbl>
1 1 0 0
2 0 0 1
3 2 0 0
4 0 3 0
5 0 0 2
6 0 2 0
7 0 0 3
Another approach, here I used data.table
library(data.table)
x <- rbindlist(apply(dat, 1, function(x) {
x <- data.table(diag(x, ncol(dat)))
x[colSums(x) > 0]
}))
setnames(x, names(dat))
x
# col1 col2 col3
# 1: 1 0 0
# 2: 0 0 1
# 3: 2 0 0
# 4: 0 3 0
# 5: 0 0 2
# 6: 0 2 0
# 7: 0 0 3
A very ugly way is:
library(tidyverse)
dat %>%
apply(1, diag) %>%
matrix(nrow = 3) %>%
t() %>%
as.data.frame() %>%
rename_with(~ names(dat), everything()) %>%
filter(rowSums(.) != 0)
col1 col2 col3
1 1 0 0
2 0 0 1
3 2 0 0
4 0 3 0
5 0 0 2
6 0 2 0
7 0 0 3
I'm trying to transform each of my column factors in a column containing just 0 or 1. Probably there is a function for that, or someone else already asked, but I couldn't found it. Here is a simple example to try to show what I need:
test = data.frame(my_groups = c("A", "A", "A", "B", "B", "C", "C", "C", "C"),
measure1 = c(1:9))
#as result:
# group_A group_B group_C measure1
# 1 1 0 0 1
# 1 1 0 0 2
# 1 1 0 0 3
# 1 0 1 0 4
# 1 0 1 0 5
# 1 0 0 1 6
# 1 0 0 1 7
# 1 0 0 1 8
# 1 0 0 1 9
Any hint on how can I do that?
We may use dummy_cols from fastDummies
library(fastDummies)
library(dplyr)
test %>%
rename(group = 'my_groups') %>%
dummy_cols('group', remove_selected_columns = TRUE) %>%
select(starts_with('group'), measure1)
-output
group_A group_B group_C measure1
1 1 0 0 1
2 1 0 0 2
3 1 0 0 3
4 0 1 0 4
5 0 1 0 5
6 0 0 1 6
7 0 0 1 7
8 0 0 1 8
9 0 0 1 9
Fortunately, there's a one-function Base R solution.
This type of problem happens a lot, and model.matrix() is built exactly for this.
# the "+ 0" is to avoid adding a column for the intercept.
model.matrix(~ my_groups + measure1 + 0, data=test)
Output:
my_groupsA my_groupsB my_groupsC measure1
1 1 0 0 1
2 1 0 0 2
3 1 0 0 3
4 0 1 0 4
5 0 1 0 5
6 0 0 1 6
7 0 0 1 7
8 0 0 1 8
9 0 0 1 9
Here's a base R solution, constructing the matrix using expand.grid, then adding the required names.
res <- data.frame( t( unique( matrix( as.numeric( do.call("==", expand.grid(
test$my_groups, test$my_groups) ) ), dim(test)[1] ) ) ), test$measure1 )
colnames(res) <- c( paste0( "group_", unique(test$my_groups) ), colnames(test)[2] )
res
group_A group_B group_C measure1
1 1 0 0 1
2 1 0 0 2
3 1 0 0 3
4 0 1 0 4
5 0 1 0 5
6 0 0 1 6
7 0 0 1 7
8 0 0 1 8
9 0 0 1 9
We can try this using dplyr or purrr.
library(tidyverse)
test = data.frame(my_groups = c("A", "A", "A", "B", "B", "C", "C", "C", "C"),
measure1 = c(1:9))
dummyfy <-
as_mapper(~{
len_row <- vector('numeric', nrow(test))
len_row[.] <- c(1)
len_row}
)
data <- pivot_wider(test, names_from = my_groups, values_from = measure1)
#> Warning: Values are not uniquely identified; output will contain list-cols.
#> * Use `values_fn = list` to suppress this warning.
#> * Use `values_fn = length` to identify where the duplicates arise
#> * Use `values_fn = {summary_fun}` to summarise duplicates
map(data, ~reduce(., c)) %>%
map_dfr(dummyfy) %>%
bind_cols(test[-1])
#> # A tibble: 9 × 4
#> A B C measure1
#> <dbl> <dbl> <dbl> <int>
#> 1 1 0 0 1
#> 2 1 0 0 2
#> 3 1 0 0 3
#> 4 0 1 0 4
#> 5 0 1 0 5
#> 6 0 0 1 6
#> 7 0 0 1 7
#> 8 0 0 1 8
#> 9 0 0 1 9
#equivalent using across:
data %>% summarise(across(everything(), ~reduce(., c) %>% dummyfy)) %>% bind_cols(test[-1])
#> # A tibble: 9 × 4
#> A B C measure1
#> <dbl> <dbl> <dbl> <int>
#> 1 1 0 0 1
#> 2 1 0 0 2
#> 3 1 0 0 3
#> 4 0 1 0 4
#> 5 0 1 0 5
#> 6 0 0 1 6
#> 7 0 0 1 7
#> 8 0 0 1 8
#> 9 0 0 1 9
Created on 2021-12-03 by the reprex package (v2.0.1)
I have the following dataset:
test_df=data.frame(Group=c(1,1,1,1,2,2),var1=c(1,0,0,1,1,1),var2=c(0,0,1,1,0,0),var3=c(0,1,0,0,0,1))
Group
var1
var2
var3
1
1
0
0
1
0
0
1
1
0
1
0
1
1
1
0
2
1
0
0
2
1
0
1
I want to add 3 columns (out1-3) for var1-3, which count number of rows until the first 1, by Group,
as shown below:
Group
var1
var2
var3
out1
out2
out3
1
1
0
0
1
3
2
1
0
0
1
1
3
2
1
0
1
0
1
3
2
1
1
1
0
1
3
2
2
1
0
0
1
0
2
2
1
0
1
1
0
2
I used this R code, I repeated it for my 3 variables, and my actual dataset contains more than only 3 columns.
But it is not working:
test_var1<-select(test_df,Group,var1 )%>%
group_by(Group) %>%
mutate(out1 = row_number()) %>%
filter(var1 != 0) %>%
slice(1)
df <- data.frame(Group=c(1,1,1,1,2,2),
var1=c(1,0,0,1,1,1),
var2=c(0,0,1,1,0,0),
var3=c(0,1,0,0,0,1))
This works for any number of variables as long as the structure is the same as in the example (i.e. Group + many variables that are 0 or 1)
df %>%
mutate(rownr = row_number()) %>%
pivot_longer(-c(Group, rownr)) %>%
group_by(Group, name) %>%
mutate(out = cumsum(value != 1 & (cumsum(value) < 1)) + 1,
out = ifelse(max(out) > n(), 0, max(out))) %>%
pivot_wider(names_from = c(name, name), values_from = c(value, out)) %>%
select(-rownr)
Returns:
Group value_var1 value_var2 value_var3 out_var1 out_var2 out_var3
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 0 0 1 3 2
2 1 0 0 1 1 3 2
3 1 0 1 0 1 3 2
4 1 1 1 0 1 3 2
5 2 1 0 0 1 0 2
6 2 1 0 1 1 0 2
If you only have 3 "out" variables then you can create three rows as follows
#1- Your dataset
df=data.frame(Group=rep(1,4),var1=c(1,0,0,1),var2=c(0,0,1,1),var3=c(0,1,0,0))
#2- Count the first row number with "1" value
df$out1=min(rownames(df)[which(df$var1==1)])
df$out2=min(rownames(df)[which(df$var2==1)])
df$out3=min(rownames(df)[which(df$var3==1)])
If you have more than 3 columns, then it may be better to create a loop for example
for(i in 1:3){
df[paste("out",i,sep="")]=min(rownames(df)[which(df[,which(colnames(df)==paste("var",i,sep=""))]==1)])
}
I have a vector like this in R:
dt = data.frame(input=c(0,0,1,1,0,0,1,0,0,0,1,1,1,0,1) )
dt
input
# 1 0
# 2 0
# 3 1
# 4 1
# 5 0
# 6 0
# 7 1
# 8 0
# 9 0
# 10 0
# 11 1
# 12 1
# 13 1
# 14 0
# 15 1
I want to replace the consecutive 0, in which the length is less than three, with 1, and save it to a new column.
#update:
I also hope that the replacement only happens when less than three 0 are sandwiched between 1. So in this condition, I will ignore the two 0 in raw 1 and 2. (or also when happening in the tail or meet NA)
For example, I want to output:
input output
# 1 0 0
# 2 0 0
# 3 1 1
# 4 1 1
# 5 0 1
# 6 0 1
# 7 1 1
# 8 0 0
# 9 0 0
# 10 0 0
# 11 1 1
# 12 1 1
# 13 1 1
# 14 0 1
# 15 1 1
How can I write it in the foreach loop? (I have the data with thousands of rows)
Thanks.
Create a grouping column with rleid on the 'input' column, and if the number of rows is less than 3 and all values are 0, replace with 1 or else return input
library(dplyr)
library(data.table)
dt %>%
mutate(new = cumsum(input)) %>%
group_by(grp = rleid(input)) %>%
mutate(output = if(n() <3 & all(input == 0) & all(new > 0)) 1 else input) %>%
ungroup %>%
select(-grp, -new)
-output
# A tibble: 15 × 2
input output
<dbl> <dbl>
1 0 0
2 0 0
3 1 1
4 1 1
5 1 1
6 0 1
7 1 1
8 0 0
9 0 0
10 0 0
11 1 1
12 1 1
13 1 1
14 0 1
15 1 1
Or use base R with rle
dt$output <- inverse.rle(within.list(rle(dt$input),
values[!values & lengths < 3 & seq_along(values) != 1] <- 1))
dt$output
#[1] 0 0 1 1 1 1 1 0 0 0 1 1 1 1 1
Update after clarification:
We could now ungroup() and check if the sequence is wrapped by a 1 with lag(input==1):
dt %>%
mutate(
x= cumsum(input != lag(input, def = first(input)))
) %>%
group_by(x) %>%
mutate(x = seq_along(input),
x = last(x)) %>%
ungroup() %>%
mutate(output = case_when(input == 0 &
lag(input==1) &
x<=2 ~ 1,
TRUE ~ as.numeric(input))) %>%
select(-x)
Output:
A tibble: 15 x 2
input output
<dbl> <dbl>
1 0 0
2 0 0
3 1 1
4 1 1
5 1 1
6 0 1
7 1 1
8 0 0
9 0 0
10 0 0
11 1 1
12 1 1
13 1 1
14 0 1
15 1 1
First answer:
Here is a suggestion. But I don't understand the rows 1 and 2 in your output. "replace consecutive 0, in which the length is less than three, with 1" this is the case for row 1 and 2.
dt %>%
mutate(
x= cumsum(input != lag(input, def = first(input)))
) %>%
group_by(x) %>%
mutate(x = seq_along(input),
x = last(x)) %>%
mutate(output = case_when(input == 0 & x<=2 ~ 1,
TRUE ~ as.numeric(input))) %>%
ungroup() %>%
select(-x)
input output
<dbl> <dbl>
1 0 1
2 0 1
3 1 1
4 1 1
5 1 1
6 0 1
7 1 1
8 0 0
9 0 0
10 0 0
11 1 1
12 1 1
13 1 1
14 0 1
15 1 1
Update following reformulation of the question: This tidyverse approach simply makes use of case_when().
library(dplyr)
mutate(dt, inputX = case_when(input == 0 &
lag(input) == 1 &
lead(input) == 1 ~ 1,
input == 0 &
lag(input) == 0 &
lag(input, n = 2) == 1 &
lead(input) == 1 ~ 1,
T ~ input))
# input inputX
# 1 0 0
# 2 0 0
# 3 1 1
# 4 1 1
# 5 1 1
# 6 0 1
# 7 1 1
# 8 0 0
# 9 0 0
# 10 0 0
# 11 1 1
# 12 1 1
# 13 1 1
# 14 0 1
# 15 1 1
Previous solution: Having understood the requirements like Tarjae did, a tidyverse option could look as follows.
library(dplyr)
dt %>%
mutate(x = cumsum(input)) %>%
group_by(x) %>%
mutate(y = +(n() %in% 2:3)) %>%
ungroup() %>%
transmute(input = input,
inputX = if_else(y == 1, 1, input))
# # A tibble: 15 x 2
# input inputX
# <dbl> <dbl>
# 1 0 1
# 2 0 1
# 3 1 1
# 4 1 1
# 5 1 1
# 6 0 1
# 7 1 1
# 8 0 0
# 9 0 0
# 10 0 0
# 11 1 1
# 12 1 1
# 13 1 1
# 14 0 1
# 15 1 1
I have a list of about 100 data frames. I would like to create a new list of data frames where the first data frame is made up of the first columns of all the existing data frames, and the second data frame is made up of the second column etc...
Please see code below for an example of what I want to do.
a <- c(0, 0, 1, 1, 1)
b <- c(0, 1, 0, 0, 1)
c <- c(1, 1, 0, 0, 1)
df1 <- data.frame(a, b, c)
df2 <- data.frame(c, b, a)
df3 <- data.frame(b, a, c)
my_lst <- list(df1, df2, df3)
new_df1 <- data.frame(df1[,1], df2[,1], df3[,1])
new_df2 <- data.frame(df1[,2], df2[,2], df3[,2])
new_df3 <- data.frame(df1[,3], df2[,3], df3[,3])
new_lst <- list(new_df1, new_df2, new_df3)
Is there a more compact way of doing this with large lists containing large data frames? Thanks in advance.
This is an option:
cols <- ncol(my_lst[[1]])
lapply(1:cols, function(x) do.call(cbind, lapply(my_lst, `[`, x)))
[[1]]
a c b
1 0 1 0
2 0 1 1
3 1 0 0
4 1 0 0
5 1 1 1
[[2]]
b b a
1 0 0 0
2 1 1 0
3 0 0 1
4 0 0 1
5 1 1 1
[[3]]
c a c
1 1 0 1
2 1 0 1
3 0 1 0
4 0 1 0
5 1 1 1
A tidyverse option is to change the columns names, transpose and bind_cols
library(tidyverse)
my_lst %>%
map(setNames, letters[1:3]) %>%
purrr::transpose() %>%
map(bind_cols)
#$a
# A tibble: 5 x 3
# V1 V2 V3
# <dbl> <dbl> <dbl>
#1 0 1 0
#2 0 1 1
#3 1 0 0
#4 1 0 0
#5 1 1 1
#$b
# A tibble: 5 x 3
# V1 V2 V3
# <dbl> <dbl> <dbl>
#1 0 0 0
#2 1 1 0
#3 0 0 1
#4 0 0 1
#5 1 1 1
#$c
# A tibble: 5 x 3
# V1 V2 V3
# <dbl> <dbl> <dbl>
#1 1 0 1
#2 1 0 1
#3 0 1 0
#4 0 1 0
#5 1 1 1