Create a new dataframe with all possible combinations - r

Having a dataframe like this:
data.frame(previous = c(1,2,2,1,3,3), next = c(1,1,2,3,1,3), id = c(1,2,3,4,5,6))
How is it possible to extract a data frame which checks the previous and next columns and creates 9 new columns, each holding 1 only if the corresponding combination of previous and next exists? For example, if previous is 2 and next is 1, the combination is 2 1, so column col2_1 receives 1.
Example of expected output:
data.frame(previous = c(1,2,2,1,3,3), next = c(1,1,2,3,1,3),
col1_1 = c(1,0,0,0,0,0),
col1_2 = c(0,0,0,0,0,0),
col1_3 = c(0,0,0,1,0,0),
col2_1 = c(0,1,0,0,0,0),
col2_2 = c(0,0,1,0,0,0),
col2_3 = c(0,0,0,0,0,0),
col3_1 = c(0,0,0,0,1,0),
col3_2 = c(0,0,0,0,0,0),
col3_3 = c(0,0,0,0,0,1), id = c(1,2,3,4,5,6))

You could use expand.grid to get all the combinations.
Assuming your data frame is called df and the column next is actually called next. to avoid clashing with the keyword next:
# One indicator column per (previous, next.) pair produced by expand.grid();
# apply() walks the pairs row by row and flags the rows of df matching each pair.
as.data.frame(
  apply(expand.grid(1:3, 1:3), 1, function(pair) {
    hit <- df$previous == pair[1] & df$next. == pair[2]
    as.numeric(hit)
  })
)
#> V1 V2 V3 V4 V5 V6 V7 V8 V9
#> 1 1 0 0 0 0 0 0 0 0
#> 2 0 1 0 0 0 0 0 0 0
#> 3 0 0 0 0 1 0 0 0 0
#> 4 0 0 0 0 0 0 1 0 0
#> 5 0 0 1 0 0 0 0 0 0
#> 6 0 0 0 0 0 0 0 0 1

A step-by-step approach might be the following one. I have renamed the next column to next1 to avoid problems:
# Enumerate every (previous, next1) pair; each pair becomes one indicator column.
AllComb <- expand.grid(unique(df$previous), unique(df$next1))
# Preallocate an all-zero matrix: one row per row of df, one column per pair.
myframe <- matrix(0, nrow = nrow(df), ncol = nrow(AllComb))
colnames(myframe) <- paste0("col_", AllComb$Var1, "_", AllComb$Var2)
# Visit every ROW of df (nrow, not ncol): each row contributes exactly one flag.
for (id_row in seq_len(nrow(df))) {
  Word <- paste0("col_", df$previous[id_row], "_", df$next1[id_row])  # name of the pair's column
  Colindex <- which(colnames(myframe) == Word)  # position of that column
  myframe[id_row, Colindex] <- 1  # flag the pair observed in this row
}
dfRes <- cbind(previous = df$previous, "next" = df$next1, myframe, id = df$id)
# previous next col_1_1 col_2_1 col_3_1 col_1_2 col_2_2 col_3_2 col_1_3 col_2_3 col_3_3 id
# [1,] 1 1 1 0 0 0 0 0 0 0 0 1
# [2,] 2 1 0 1 0 0 0 0 0 0 0 2
# [3,] 2 2 0 0 0 0 1 0 0 0 0 3
# [4,] 1 3 0 0 0 0 0 0 1 0 0 4
# [5,] 3 1 0 0 1 0 0 0 0 0 0 5
# [6,] 3 3 0 0 0 0 0 0 0 0 1 6

Inside a by you could use a switch, because your values are nicely consecutive 1:3. Finally we merge to get the result.
# Split dat by the value of next. and, within each group, one-hot encode
# `previous` into three columns; the per-group results are merged afterwards.
tmp <- by(dat, dat$next., function(x) {
x1 <- x$previous
# switch(z, ...) picks the z-th vector, i.e. a one-hot encoding of previous over 1:3;
# after t() there is one row per record of the group and three columns.
# The column names are built from el(x1) -- the FIRST previous value of the group --
# followed by 1:3.
# NOTE(review): the names are therefore not keyed on each row's previous/next. pair,
# and the printed result below only has col.1_* and col.2_* columns; confirm this
# really matches the 9-column layout asked for in the question.
o <- `colnames<-`(t(sapply(x1, function(z)
switch(z, c(1, 0, 0), c(0, 1, 0), c(0, 0, 1)))),
paste(el(x1), 1:3, sep="_"))
# The matrix is attached under the prefix "col", giving names such as col.1_1.
cbind(x, col=o)
})
# Full outer merge of the per-group data frames; columns a group lacks become NA.
res <- Reduce(function(...) merge(..., all=TRUE), tmp)
res[is.na(res)] <- 0 ## set NA to zero if wanted
Result
res[order(res$id),] ## order by ID if needed
# previous next. id col.1_1 col.1_2 col.1_3 col.2_1 col.2_2 col.2_3
# 1 1 1 1 1 0 0 0 0 0
# 3 2 1 2 0 1 0 0 0 0
# 4 2 2 3 0 0 0 0 1 0
# 2 1 3 4 1 0 0 0 0 0
# 5 3 1 5 0 0 1 0 0 0
# 6 3 3 6 0 0 1 0 0 0
Data
dat <- structure(list(previous = c(1, 2, 2, 1, 3, 3), next. = c(1, 1,
2, 3, 1, 3), id = c(1, 2, 3, 4, 5, 6)), class = "data.frame", row.names = c(NA,
-6L))
Note: next as column name is not particularly a good idea, since it has a special meaning in R.

Here is a tidyverse approach:
library(tidyr)
library(dplyr)
df %>%
  # Tag every original row so there is a value column to pivot on.
  # rowid_to_column() is exported by tibble, which the library() calls above do
  # not attach, hence the explicit namespace.
  tibble::rowid_to_column() %>%
  # Add a placeholder row (rowid and id are NA) for each previous/nxt pair that
  # never occurs, so that all 9 columns get created.
  complete(previous, nxt) %>%
  unite(col, previous, nxt, sep = "_", remove = FALSE) %>%
  # Every populated cell becomes 1 (values_fn ignores the rowid itself);
  # every other cell is filled with 0.
  pivot_wider(
    names_from = col,
    values_from = rowid,
    values_fn = list(rowid = ~1),
    values_fill = list(rowid = 0)
  ) %>%
  # Drop the placeholder rows introduced by complete(): their id is NA.
  na.omit() %>%
  arrange(id)
# A tibble: 6 x 12
previous nxt id `1_1` `1_2` `1_3` `2_1` `2_2` `2_3` `3_1` `3_2` `3_3`
<dbl> <dbl> <dbl> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 1 1 1 1 0 0 0 0 0 0 0 0
2 2 1 2 0 0 0 1 0 0 0 0 0
3 2 2 3 0 0 0 0 1 0 0 0 0
4 1 3 4 0 0 1 0 0 0 0 0 0
5 3 1 5 0 0 0 0 0 0 1 0 0
6 3 3 6 0 0 0 0 0 0 0 0 1

This is another tidyverse solution that differs a little from #H1's (and is maybe more concise).
library(dplyr)
library(tidyr)
df %>%
  # Flag the pairs that are actually observed ...
  mutate(n = 1) %>%
  # ... then add every id x previous x next. combination, with n = 0 for the
  # combinations that were not observed.
  complete(id, previous, next., fill = list(n = 0)) %>%
  unite(col, previous, next.) %>%
  # One column per pair (col1_1 ... col3_3), one row per id.
  pivot_wider(names_from = col, names_prefix = "col", values_from = n) %>%
  # Bring back the original previous / next. columns. The key is spelled out so
  # the join does not depend on whichever columns happen to share a name.
  right_join(df, by = "id")
# # A tibble: 6 x 12
# id col1_1 col1_2 col1_3 col2_1 col2_2 col2_3 col3_1 col3_2 col3_3 previous next.
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 1 0 0 0 0 0 0 0 0 1 1
# 2 2 0 0 0 1 0 0 0 0 0 2 1
# 3 3 0 0 0 0 1 0 0 0 0 2 2
# 4 4 0 0 1 0 0 0 0 0 0 1 3
# 5 5 0 0 0 0 0 0 1 0 0 3 1
# 6 6 0 0 0 0 0 0 0 0 1 3 3

You can try the code below
# Build a 9-column one-hot matrix: for each row, column (Previous - 1) * 3 + Next
# is set to 1. The labels follow the same ordering (Previous slowest, Next
# fastest), so column k is named "<Previous>_<Next>" for the pair mapping to k.
dfout <- within(df,
  col <- `colnames<-`(
    t(vapply((Previous - 1) * 3 + Next,
             function(v) replace(rep(0, 9), v, 1),
             numeric(9))),
    paste(rep(1:3, each = 3), rep(1:3, times = 3), sep = "_")))
such that
> dfout
Previous Next id col.1_1 col.1_2 col.1_3 col.2_1 col.2_2 col.2_3 col.3_1 col.3_2 col.3_3
1 1 1 1 1 0 0 0 0 0 0 0 0
2 2 1 2 0 0 0 1 0 0 0 0 0
3 2 2 3 0 0 0 0 1 0 0 0 0
4 1 3 4 0 0 1 0 0 0 0 0 0
5 3 1 5 0 0 0 0 0 0 1 0 0
6 3 3 6 0 0 0 0 0 0 0 0 1

Related

How to convert a dataset where some subjects chose multiple answers into a dummy-variable format?

I have this example dataset
df <- data.frame(subjects = 1:12,
Why_are_you_not_happy =
c(1,2,"1,2,5",5,1,2,"3,4",3,2,"1,5",3,4),
why_are_you_sad =
c("1,2,3",1,2,3,"4,5,3",2,1,4,3,1,1,1) )
And would like to convert it into a dummy variables format (based on the 5 answers of each question). Can someone guide me through an effective way ? thanks.
You can separate_rows for multiple choices, convert to dummy and summarise by subjects (to get one row per subjects, with all their choices).
library(fastDummies)
library(tidyr)
library(dplyr)
# Pipeline: expand multi-answer cells, one-hot encode them, then collapse per subject.
df %>%
# Split the comma-separated answers so that each answer gets its own row.
separate_rows(Why_are_you_not_happy, why_are_you_sad) %>%
# One dummy column per distinct answer of each question; the original columns are dropped.
dummy_cols(c("Why_are_you_not_happy", "why_are_you_sad"),
remove_selected_columns = TRUE) %>%
# max() per subject keeps a 1 if any of that subject's rows carried the answer.
group_by(subjects) %>%
summarise(across(everything(), max))
output
# A tibble: 12 × 11
subjects Why_are_you…¹ Why_a…² Why_a…³ Why_a…⁴ Why_a…⁵ why_a…⁶ why_a…⁷ why_a…⁸ why_a…⁹ why_a…˟
<int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 1 1 0 0 0 0 1 1 1 0 0
2 2 0 1 0 0 0 1 0 0 0 0
3 3 1 1 0 0 1 0 1 0 0 0
4 4 0 0 0 0 1 0 0 1 0 0
5 5 1 0 0 0 0 0 0 1 1 1
6 6 0 1 0 0 0 0 1 0 0 0
7 7 0 0 1 1 0 1 0 0 0 0
8 8 0 0 1 0 0 0 0 0 1 0
9 9 0 1 0 0 0 0 0 1 0 0
10 10 1 0 0 0 1 1 0 0 0 0
11 11 0 0 1 0 0 1 0 0 0 0
12 12 0 0 0 1 0 1 0 0 0 0

Converting unique values from a data frame into a reference matrix

Hello lovely people of SO!
Guys I have the following raw dataset
ID_TRIAL<-c(1,1,1,2,3,4,5,5,5,6,6,6,7,7,8,8,8,8)
TYPE_FAIL<-c("A","B","C","F","A","A","A","B","K","T","F","A","A","B","B","Q","P","I")
ID TRIAL  TYPE_FAIL
1         A
1         B
1         C
2         F
3         A
4         A
5         A
5         B
5         K
6         T
6         F
6         A
7         A
7         B
8         B
8         Q
8         P
8         I
I need to transform this dataset in such manner that I am able to create a matrix whose columns are the TYPE OF FAILS in alphabetical order and its rows are a binary representation of all unique TYPE OF FAILS a TRIAL had for instance
all the TYPES OF FAILS are in alphabetical order: A B C F I K P Q T
So for TRIAL 8 the matrix row will look like this
A  B  C  F  I  K  P  Q  T
0  1  0  0  1  0  1  1  0
The zeros in all other cells represent that, during trial 8 for example, FAIL TYPE A did not occur, and so on.
my desired output would look like this:
TRIAL  A  B  C  F  I  K  P  Q  T
1      1  1  1  0  0  0  0  0  0
2      0  0  0  1  0  0  0  0  0
3      1  0  0  0  0  0  0  0  0
4      1  0  0  0  0  0  0  0  0
5      1  1  0  0  0  1  0  0  0
6      1  0  0  1  0  0  0  0  1
7      1  1  0  0  0  0  0  0  0
8      0  1  0  0  1  0  1  1  0
Thank you all of you guys for helping me out I will be super attentive to read and response to all of your comments
Some of my thought-process behind my solution:
First I need to group by ID TRIAL then
I need to find a function or a routine that will look for
a letter lets say "B" and add a number one to my matrix under the column B for the
row of the TRIAL in case, I can do this using multiple ifelse lines but
my real dataset is quite large and I dont know if there is a way to perform this faster so thank you so much for helping me out on this
Here's a tidyverse solution using dplyr::count and tidyr::pivot_wider.
library(dplyr)
library(tidyr)
df1 <- data.frame(ID_TRIAL = c(1, 1, 1, 2, 3, 4, 5, 5 , 5, 6, 6, 6, 7, 7, 8, 8, 8, 8),
TYPE_FAIL = c("A", "B", "C", "F", "A", "A", "A", "B", "K", "T", "F",
"A", "A", "B", "B", "Q", "P", "I"))
df1 %>%
  # One row per (trial, failure type) pair together with its occurrence count n.
  count(ID_TRIAL, TYPE_FAIL) %>%
  # Spread the failure types into alphabetically sorted columns; pairs that never
  # occur are filled with 0 directly instead of patching the NAs afterwards.
  pivot_wider(names_from = "TYPE_FAIL",
              values_from = "n",
              names_sort = TRUE,
              values_fill = 0)
Result:
# A tibble: 8 × 10
ID_TRIAL A B C F I K P Q T
<dbl> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 1 1 1 0 0 0 0 0 0
2 0 0 0 1 0 0 0 0 0
3 1 0 0 0 0 0 0 0 0
4 1 0 0 0 0 0 0 0 0
5 1 1 0 0 0 1 0 0 0
6 1 0 0 1 0 0 0 0 1
7 1 1 0 0 0 0 0 0 0
8 0 1 0 0 1 0 1 1 0
Matrix format. Generating a matrix from the table returned values
ID_TRIAL <- c(1, 1, 1, 2, 3, 4, 5, 5, 5, 6, 6, 6, 7, 7, 8, 8, 8, 8)
TYPE_FAIL <- c("A", "B", "C", "F", "A", "A", "A", "B", "K", "T", "F", "A", "A", "B", "B", "Q", "P", "I")
df <- data.frame(ID_TRIAL = ID_TRIAL, TYPE_FAIL = TYPE_FAIL)
# Cross-tabulate trials against failure types, then strip the table class to get
# a plain integer matrix. The shape and the row/column labels are taken from the
# table itself, so they stay aligned with its (sorted) levels for any number of
# trials and regardless of the order in which trials appear in the data.
tab <- table(df)
mat <- matrix(tab, nrow = nrow(tab), dimnames = unname(dimnames(tab)))
A B C F I K P Q T
1 1 1 1 0 0 0 0 0 0
2 0 0 0 1 0 0 0 0 0
3 1 0 0 0 0 0 0 0 0
4 1 0 0 0 0 0 0 0 0
5 1 1 0 0 0 1 0 0 0
6 1 0 0 1 0 0 0 0 1
7 1 1 0 0 0 0 0 0 0
8 0 1 0 0 1 0 1 1 0
I thought you meant a literal matrix.
If you meant data.frame you can do. Using the table function to generate some values we can use to pivot wider
# table(df) counts every ID_TRIAL x TYPE_FAIL pair (zero where a pair never occurs);
# data.frame() turns the table into long form with a Freq column, which is then
# spread into one column per TYPE_FAIL.
data.frame(table(df)) |>
pivot_wider(id_cols = ID_TRIAL, names_from = TYPE_FAIL, values_from = Freq)
ID_TRIAL A B C F I K P Q T
<fct> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 1 1 1 1 0 0 0 0 0 0
2 2 0 0 0 1 0 0 0 0 0
3 3 1 0 0 0 0 0 0 0 0
4 4 1 0 0 0 0 0 0 0 0
5 5 1 1 0 0 0 1 0 0 0
6 6 1 0 0 1 0 0 0 0 1
7 7 1 1 0 0 0 0 0 0 0
8 8 0 1 0 0 1 0 1 1 0

Creating a new variable and altering dependent variables in r using ifelse

Let's say we have a df as follows:
A B C D E
1 1 0 0 1
0 0 1 0 0
0 0 0 0 1
1 1 1 1 0
0 1 1 0 1
1 0 1 0 0
So I would like to make another variable F which says, if the sum of A:D is greater than 1, F is 1 and A:D are 0.
Additionally, If E == 1, then F = 0.
So here's how I wrote it but it's not working...
#Counter
df<- df %>%
mutate(case_count = A+B+C+D)
df$F <- ifelse(df$E == 1, 0,
ifelse(df$case_count > 1,
df$A == 0 &
df$B == 0 &
df$C == 0 &
df$D == 0 &
df$F == 1, 0))
And the correct result here should then be
A B C D E case_count F
1 1 0 0 1 2 0
0 0 1 0 0 1 0
0 0 0 0 1 0 0
0 0 0 0 0 4 1
0 1 1 0 1 2 0
0 0 0 0 0 2 1
Using dplyr and the new functions across and c_across
df %>%
  rowwise() %>%
  mutate(
    # Row-wise total of the four indicator columns.
    case_count = sum(c_across(A:D)),
    # E == 1 forces F_ to 0; otherwise F_ flags rows with more than one indicator set.
    F_ = ifelse(E == 1, 0, ifelse(case_count > 1, 1, 0))
  ) %>%
  # Drop the rowwise grouping so the next step, and anything done with the
  # returned tibble afterwards, operates on whole columns again.
  ungroup() %>%
  # Wherever F_ is 1, reset A:D to 0.
  mutate(across(A:D, ~ifelse(F_ == 1, 0, .)))
I named the new column F_ instead of just F because the latter may be confused with the abbreviation for FALSE.
Output
# A tibble: 6 x 7
# A B C D E case_count F_
# <dbl> <dbl> <dbl> <dbl> <int> <int> <dbl>
# 1 1 1 0 0 1 2 0
# 2 0 0 1 0 0 1 0
# 3 0 0 0 0 1 0 0
# 4 0 0 0 0 0 4 1
# 5 0 1 1 0 1 2 0
# 6 0 0 0 0 0 2 1
You can try this solution (DF is your original data):
#Create index: number of indicators set among A:D
DF$I1 <- rowSums(DF[, 1:4])
#Create F: 1 when more than one of A:D is set, but always 0 when E == 1
DF$F <- ifelse(DF$E == 1, 0, ifelse(DF$I1 > 1, 1, 0))
#Reset A:D only on the rows flagged by F, so rows with E == 1 keep their values
DF[DF$F == 1, 1:4] <- 0
A B C D E I1 F
1 1 1 0 0 1 2 0
2 0 0 1 0 0 1 0
3 0 0 0 0 1 0 0
4 0 0 0 0 0 4 1
5 0 1 1 0 1 2 0
6 0 0 0 0 0 2 1

Convert from long to wide format from categorical data

Having categorical data like this:
data.frame(id = c(1,2,3,4,5), stock1 = c(1,2,0,1,2), stock2 = c(0,1,0,1,1), end = c(0,1,3,0,3), start = c(2,3,0,1,0))
id stock1 stock2 end start
1 1 1 0 0 2
2 2 2 1 1 3
3 3 0 0 3 0
4 4 1 1 0 1
5 5 2 1 3 0
How is it possible to convert them from long to wide format in which every column will show if exist or not with specific name?
Example of expected output:
data.frame(id = c(1,2,3,4,5), stock1_0 = c(0,0,1,0,0), stock1_1 = c(1,0,0,1,0), stock1_2 = c(0,1,0,0,1), stock2_0 = c(1,0,1,0,0), stock2_1 = c(0,1,0,0,0), end_0 = c(1,0,0,1,0), end_1 = c(0,1,0,0,0), end_3 = c(0,0,1,0,1), start_0 = c(0,0,1,0,1), start_1 = c(0,0,0,1,0), start_2 = c(1,0,0,0,0), start_3 = c(0,1,0,0,0))
id stock1_0 stock1_1 stock1_2 stock2_0 stock2_1 end_0 end_1 end_3 start_0 start_1 start_2 start_3
1 1 0 1 0 1 0 1 0 0 0 0 1 0
2 2 0 0 1 0 1 0 1 0 0 0 0 1
3 3 1 0 0 1 0 0 0 1 1 0 0 0
4 4 0 1 0 0 0 1 0 0 0 1 0 0
5 5 0 0 1 0 0 0 0 1 1 0 0 0
You could use model.matrix.
# One-hot encode every column except the first (id) with model.matrix().
# Each dummy column is labelled "<column>_<level>" using the factor's actual
# levels, so e.g. end (values 0, 1, 3) yields end_0, end_1 and end_3.
data.frame(dat[1],
  do.call(cbind, lapply(names(dat)[-1], function(nm) {
    f <- as.factor(dat[[nm]])
    m <- model.matrix(~ f - 1)  # "- 1" drops the intercept: one column per level
    colnames(m) <- paste(nm, levels(f), sep = "_")
    m
  })))
# id stock1_0 stock1_1 stock1_2 stock2_0 stock2_1 end_0 end_1 end_3 start_0
# 1 1 0 1 0 1 0 1 0 0 0
# 2 2 0 0 1 0 1 0 1 0 0
# 3 3 1 0 0 1 0 0 0 1 1
# 4 4 0 1 0 0 1 1 0 0 0
# 5 5 0 0 1 0 1 0 0 1 1
# start_1 start_2 start_3
# 1 0 1 0
# 2 0 0 1
# 3 0 0 0
# 4 1 0 0
# 5 0 0 0
Data:
dat <- structure(list(id = c(1, 2, 3, 4, 5), stock1 = c(1, 2, 0, 1,
2), stock2 = c(0, 1, 0, 1, 1), end = c(0, 1, 3, 0, 3), start = c(2,
3, 0, 1, 0)), class = "data.frame", row.names = c(NA, -5L))
library(data.table)
setDT(df)  # turn df into a data.table
# Long form: one row per (id, variable, value) triple.
long <- melt(df, id.vars = "id")
# Back to wide: one column per "<variable>_<value>" label, counting occurrences.
dcast(long, id ~ paste0(variable, "_", value), fun.aggregate = length)
# id end_0 end_1 end_3 start_0 start_1 start_2 start_3 stock1_0
# 1: 1 1 0 0 0 0 1 0 0
# 2: 2 0 1 0 0 0 0 1 0
# 3: 3 0 0 1 1 0 0 0 1
# 4: 4 1 0 0 0 1 0 0 0
# 5: 5 0 0 1 1 0 0 0 0
# stock1_1 stock1_2 stock2_0 stock2_1
# 1: 1 0 1 0
# 2: 0 1 0 1
# 3: 0 0 1 0
# 4: 1 0 0 1
# 5: 0 1 0 1
One way would be to get data in long format, combine column name with value and get the data back in wide format.
library(dplyr)
library(tidyr)
df %>%
  # Long form: every column except id becomes a (name, value) pair.
  pivot_longer(cols = -id) %>%
  # Fuse column name and value into a single label such as "stock1_2".
  unite(name, name, value) %>%
  # Every label that occurs is marked with 1 ...
  mutate(value = 1) %>%
  # ... and labels that do not occur for an id are filled with 0.
  pivot_wider(
    names_from = name,
    values_from = value,
    values_fill = list(value = 0)
  )
# A tibble: 5 x 13
# id stock1_1 stock2_0 end_0 start_2 stock1_2 stock2_1 end_1 start_3 stock1_0 end_3 start_0 start_1
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 1 1 1 1 0 0 0 0 0 0 0 0
#2 2 0 0 0 0 1 1 1 1 0 0 0 0
#3 3 0 1 0 0 0 0 0 0 1 1 1 0
#4 4 1 0 1 0 0 1 0 0 0 0 0 1
#5 5 0 0 0 0 1 1 0 0 0 1 1 0

R - Common users across months

I have a transaction table with the following columns:
TransactionId UserId YearMonth Group
What I am trying to accomplish is to get unique users across different months.
Eg:
YearMonth Group UsersCountMonth1 UsersCountMonth2 UsersCountMonth3
201301 A 1000 900 800
201301 B 1200 940 700
201302 B 1300 1140 900
201303 A 12e0 970 706
Basically Month1 and Month2 are the incremental months based on YearMonth value for the record.
I am using this result to perform retention analysis.
I remember you were looking for a possibility to analyze subscription cohorts, yesterday. So I guess you can do
library(tidyverse)
# Reproducible toy transactions: n draws (with replacement) of a user id, a
# first-of-month transaction date in 2016 and a group label.
set.seed(1)
n <- 100
df <- data.frame(
  user = sample(1:20, n, replace = TRUE),
  transDate = sample(
    seq(as.Date("2016-01-01"), as.Date("2016-12-31"), "1 month"),
    n,
    replace = TRUE
  ),
  group = sample(LETTERS[1:2], n, replace = TRUE)
)
# Number of calendar months from d1 to d2, counted inclusively: two dates in the
# same month give 1, dates in consecutive months give 2, and so on.
# d1 and d2 are dates (or values as.Date() can convert); the day of month is ignored.
diffmonth <- function(d1, d2) {
  # http://stackoverflow.com/questions/1995933/number-of-months-between-two-dates
  # Running month number of a date: 12 * (years since 1900) + (0-based month).
  month_index <- function(d) {
    parts <- as.POSIXlt(as.Date(d, origin = "1900-01-01"))
    12 * parts$year + parts$mon
  }
  1L + month_index(d2) - month_index(d1)
}
df %>%
  group_by(user, group) %>%
  # cohort = earliest transaction date of the user within the group;
  # month = 1-based month offset of each transaction from that cohort date.
  mutate(cohort = min(transDate), month = diffmonth(cohort, transDate)) %>%
  # Fold the group label into the cohort key, e.g. "2016-01-01_A".
  unite(cohort, cohort, group, remove = TRUE) %>%
  group_by(month, cohort) %>%
  # NOTE(review): n() counts transaction rows; if distinct users per cell are
  # wanted, n_distinct(user) may be the intended measure -- confirm.
  summarise(n = n()) %>%
  # One column per month offset, 0 where a cohort has no activity.
  # NOTE(review): spread() is superseded by tidyr::pivot_wider(); consider migrating.
  spread(month, n, fill = 0, drop = FALSE)
# # A tibble: 16 × 12
# cohort `1` `2` `3` `4` `5` `6` `7` `8` `9` `10` `11`
# * <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2016-01-01_A 5 1 0 1 1 1 1 0 2 0 0
# 2 2016-02-01_A 1 0 0 0 0 0 0 0 1 0 1
# 3 2016-02-01_B 4 1 2 1 0 1 2 0 1 1 0
# 4 2016-03-01_A 5 0 3 1 2 2 2 0 1 2 0
# 5 2016-03-01_B 4 0 0 0 2 0 1 0 0 0 0
# 6 2016-04-01_A 4 0 2 1 0 1 0 2 1 0 0
# 7 2016-04-01_B 1 0 0 0 0 0 0 0 0 0 0
# 8 2016-05-01_A 2 0 2 2 0 0 2 0 0 0 0
# 9 2016-05-01_B 1 0 0 1 0 0 2 0 0 0 0
# 10 2016-06-01_A 1 0 2 0 0 1 0 0 0 0 0
# 11 2016-06-01_B 4 0 0 0 0 1 1 0 0 0 0
# 12 2016-07-01_A 1 0 1 0 0 0 0 0 0 0 0
# 13 2016-08-01_B 4 1 1 0 0 0 0 0 0 0 0
# 14 2016-09-01_A 1 0 0 0 0 0 0 0 0 0 0
# 15 2016-10-01_B 1 0 0 0 0 0 0 0 0 0 0
# 16 2016-12-01_A 3 0 0 0 0 0 0 0 0 0 0

Resources