Convert from long to wide format from categorical data - r

Having categorical data like this:
data.frame(id = c(1,2,3,4,5), stock1 = c(1,2,0,1,2), stock2 = c(0,1,0,1,1), end = c(0,1,3,0,3), start = c(2,3,0,1,0))
id stock1 stock2 end start
1 1 1 0 0 2
2 2 2 1 1 3
3 3 0 0 3 0
4 4 1 1 0 1
5 5 2 1 3 0
How is it possible to convert them from long to wide format, so that every variable/value pair gets its own specifically named column showing whether that value is present (1) or not (0)?
Example of expected output:
data.frame(id = c(1,2,3,4,5), stock1_0 = c(0,0,1,0,0), stock1_1 = c(1,0,0,1,0), stock1_2 = c(0,1,0,0,1), stock2_0 = c(1,0,1,0,0), stock2_1 = c(0,1,0,0,0), end_0 = c(1,0,0,1,0), end_1 = c(0,1,0,0,0), end_3 = c(0,0,1,0,1), start_0 = c(0,0,1,0,1), start_1 = c(0,0,0,1,0), start_2 = c(1,0,0,0,0), start_3 = c(0,1,0,0,0))
id stock1_0 stock1_1 stock1_2 stock2_0 stock2_1 end_0 end_1 end_3 start_0 start_1 start_2 start_3
1 1 0 1 0 1 0 1 0 0 0 0 1 0
2 2 0 0 1 0 1 0 1 0 0 0 0 1
3 3 1 0 0 1 0 0 0 1 1 0 0 0
4 4 0 1 0 0 0 1 0 0 0 1 0 0
5 5 0 0 1 0 0 0 0 1 1 0 0 0

You could use model.matrix.
# One-hot encode every column except the first (id) with model.matrix().
# Each indicator column is named <variable>_<level>, so the name carries the
# actual category value (e.g. end_3) rather than the position of the level.
data.frame(dat[1],
           do.call(cbind, lapply(seq(dat)[-1], function(x) {
             f <- as.factor(dat[[x]])
             # `- 1` drops the intercept so every level gets its own column
             m <- model.matrix(~ f - 1)
             colnames(m) <- paste(names(dat)[x], levels(f), sep = "_")
             m
           })))
# id stock1_0 stock1_1 stock1_2 stock2_0 stock2_1 end_0 end_1 end_3 start_0
# 1 1 0 1 0 1 0 1 0 0 0
# 2 2 0 0 1 0 1 0 1 0 0
# 3 3 1 0 0 1 0 0 0 1 1
# 4 4 0 1 0 0 1 1 0 0 0
# 5 5 0 0 1 0 1 0 0 1 1
# start_1 start_2 start_3
# 1 0 1 0
# 2 0 0 1
# 3 0 0 0
# 4 1 0 0
# 5 0 0 0
Data:
dat <- structure(list(id = c(1, 2, 3, 4, 5), stock1 = c(1, 2, 0, 1,
2), stock2 = c(0, 1, 0, 1, 1), end = c(0, 1, 3, 0, 3), start = c(2,
3, 0, 1, 0)), class = "data.frame", row.names = c(NA, -5L))

library(data.table)
# Reshape to long form (one row per id/variable pair), then spread each
# "<variable>_<value>" label into its own column, counting occurrences.
setDT(df)
long <- melt(df, id.vars = 'id')
dcast(long, id ~ paste0(variable, '_', value), fun.aggregate = length)
# id end_0 end_1 end_3 start_0 start_1 start_2 start_3 stock1_0
# 1: 1 1 0 0 0 0 1 0 0
# 2: 2 0 1 0 0 0 0 1 0
# 3: 3 0 0 1 1 0 0 0 1
# 4: 4 1 0 0 0 1 0 0 0
# 5: 5 0 0 1 1 0 0 0 0
# stock1_1 stock1_2 stock2_0 stock2_1
# 1: 1 0 1 0
# 2: 0 1 0 1
# 3: 0 0 1 0
# 4: 1 0 0 1
# 5: 0 1 0 1

One way would be to get data in long format, combine column name with value and get the data back in wide format.
library(dplyr)
library(tidyr)
# Go long, fuse column name and value into a single label, mark each label as
# present (1), then go wide again, filling absent labels with 0.
long <- pivot_longer(df, cols = -id)
labelled <- unite(long, name, name, value)
flagged <- mutate(labelled, value = 1)
pivot_wider(flagged, values_fill = list(value = 0))
# A tibble: 5 x 13
# id stock1_1 stock2_0 end_0 start_2 stock1_2 stock2_1 end_1 start_3 stock1_0 end_3 start_0 start_1
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 1 1 1 1 0 0 0 0 0 0 0 0
#2 2 0 0 0 0 1 1 1 1 0 0 0 0
#3 3 0 1 0 0 0 0 0 0 1 1 1 0
#4 4 1 0 1 0 0 1 0 0 0 0 0 1
#5 5 0 0 0 0 1 1 0 0 0 1 1 0

Related

Creating a new variable and altering dependent variables in r using ifelse

Let's say we have a df as follows:
A B C D E
1 1 0 0 1
0 0 1 0 0
0 0 0 0 1
1 1 1 1 0
0 1 1 0 1
1 0 1 0 0
So I would like to make another variable F which says, if the sum of A:D is greater than 1, F is 1 and A:D are 0.
Additionally, If E == 1, then F = 0.
So here's how I wrote it but it's not working...
# Counter: number of the A:D flags that are set in each row
df<- df %>%
mutate(case_count = A+B+C+D)
# NOTE(review): this assignment cannot produce the intended result:
#  - the "yes" branch of the inner ifelse() is the logical expression
#    `A == 0 & B == 0 & C == 0 & D == 0 & F == 1`; `==` compares, it does
#    not assign, so A:D are never set to 0;
#  - df$F is read on the right-hand side while it is only being created by
#    this very statement (assuming no F column exists beforehand).
df$F <- ifelse(df$E == 1, 0,
ifelse(df$case_count > 1,
df$A == 0 &
df$B == 0 &
df$C == 0 &
df$D == 0 &
df$F == 1, 0))
And the correct result here should then be
A B C D E case_count F
1 1 0 0 1 2 0
0 0 1 0 0 1 0
0 0 0 0 1 0 0
0 0 0 0 0 4 1
0 1 1 0 1 2 0
0 0 0 0 0 2 1
Using dplyr and the new functions across and c_across
# Row by row: count the A:D flags, derive F_ (forced to 0 whenever E is 1),
# then reset A:D to 0 on the rows where F_ ended up 1.
df %>%
  rowwise() %>%
  mutate(case_count = sum(c_across(A:D))) %>%
  mutate(F_ = ifelse(E == 1, 0, ifelse(case_count > 1, 1, 0))) %>%
  mutate(across(A:D, ~ ifelse(F_ == 1, 0, .x)))
I named the new column F_ instead of just F because the latter may be confused with the abbreviation for FALSE.
Output
# A tibble: 6 x 7
# Rowwise:
# A B C D E case_count F_
# <dbl> <dbl> <dbl> <dbl> <int> <int> <dbl>
# 1 1 1 0 0 1 2 0
# 2 0 0 1 0 0 1 0
# 3 0 0 0 0 1 0 0
# 4 0 0 0 0 0 4 1
# 5 0 1 1 0 1 2 0
# 6 0 0 0 0 0 2 1
You can try this solution (DF is your original data):
# Create index: how many of the first four columns (A:D) are set in each row
DF$I1 <- rowSums(DF[, 1:4])
# Create F: 1 when more than one of A:D is set, but E == 1 always forces 0
DF$F <- ifelse(DF$E == 1, 0, ifelse(DF$I1 > 1, 1, 0))
# Reset A:D only on the rows that were actually flagged (F == 1); rows with
# E == 1 keep their original A:D values. which() skips NA flags safely.
DF[which(DF$F == 1), 1:4] <- 0
# A B C D E I1 F
# 1 1 1 0 0 1 2 0
# 2 0 0 1 0 0 1 0
# 3 0 0 0 0 1 0 0
# 4 0 0 0 0 0 4 1
# 5 0 1 1 0 1 2 0
# 6 0 0 0 0 0 2 1

Create a new dataframe with the all possible combinations

Having a dataframe like this:
data.frame(previous = c(1,2,2,1,3,3), next = c(1,1,2,3,1,3), id = c(1,2,3,4,5,6))
How is it possible to extract a data frame which checks the previous and next columns and creates 9 new columns, each holding 1 only if that combination of previous and next exists? For example, if previous is 2 and next is 1, the combination is 2 1 and the corresponding column (col2_1) receives a 1.
Example of expected output:
data.frame(previous = c(1,2,2,1,3,3), next = c(1,1,2,3,1,3),
col1_1 = c(1,0,0,0,0,0),
col1_2 = c(0,0,0,0,0,0),
col1_3 = c(0,0,0,1,0,0),
col2_1 = c(0,1,0,0,0,0),
col2_2 = c(0,0,1,0,0,0),
col2_3 = c(0,0,0,0,0,0),
col3_1 = c(0,0,0,0,1,0),
col3_2 = c(0,0,0,0,0,0),
col3_3 = c(0,0,0,0,0,1), id = c(1,2,3,4,5,6))
You could use expand.grid to get all the combinations.
Assuming your data frame is called df and the column next is actually called next. to avoid clashing with the keyword next:
# For each of the 9 (previous, next.) pairs build a 0/1 vector marking the
# rows of df that hold exactly that pair; one output column per pair.
combos <- expand.grid(1:3, 1:3)
as.data.frame(apply(combos, 1, function(pair) {
  hit <- pair[1] == df$previous & pair[2] == df$next.
  as.numeric(hit)
}))
#> V1 V2 V3 V4 V5 V6 V7 V8 V9
#> 1 1 0 0 0 0 0 0 0 0
#> 2 0 1 0 0 0 0 0 0 0
#> 3 0 0 0 0 1 0 0 0 0
#> 4 0 0 0 0 0 0 1 0 0
#> 5 0 0 1 0 0 0 0 0 0
#> 6 0 0 0 0 0 0 0 0 1
A step-by-step approach might be the following one. I have renamed the next column to next1 to avoid problems:
# All possible (previous, next1) combinations; Var1 = previous, Var2 = next1
AllComb <- expand.grid(unique(df$previous), unique(df$next1))
# Indicator matrix: one row per observation, one column per combination
myframe <- matrix(0, nrow = nrow(df), ncol = nrow(AllComb))
colnames(myframe) <- paste0("col_", AllComb$Var1, "_", AllComb$Var2)
# One iteration per ROW of df (nrow, not ncol), so every observation is
# visited; seq_len() also copes with an empty data frame.
for (id_row in seq_len(nrow(df))) {
  # Column label matching this row's combination
  Word <- paste0("col_", df$previous[id_row], "_", df$next1[id_row])
  Colindex <- which(colnames(myframe) == Word) # position of that column
  myframe[id_row, Colindex] <- 1 # flag the combination for this row
}
dfRes <- cbind(previous = df$previous, "next" = df$next1, myframe, id = df$id)
# previous next col_1_1 col_2_1 col_3_1 col_1_2 col_2_2 col_3_2 col_1_3 col_2_3 col_3_3 id
# [1,] 1 1 1 0 0 0 0 0 0 0 0 1
# [2,] 2 1 0 1 0 0 0 0 0 0 0 2
# [3,] 2 2 0 0 0 0 1 0 0 0 0 3
# [4,] 1 3 0 0 0 0 0 0 1 0 0 4
# [5,] 3 1 0 0 1 0 0 0 0 0 0 5
# [6,] 3 3 0 0 0 0 0 0 0 0 1 6
Inside a by you could use a switch, because your values are nicely consecutive 1:3. Finally we merge to get the result.
# Split dat by `next.`; inside each group, one-hot encode `previous` with
# switch(), which relies on the values being exactly 1, 2 or 3.
tmp <- by(dat, dat$next., function(x) {
x1 <- x$previous
# t(sapply(...)): one row per observation, three indicator columns
o <- `colnames<-`(t(sapply(x1, function(z)
switch(z, c(1, 0, 0), c(0, 1, 0), c(0, 0, 1)))),
# NOTE(review): the label prefix is el(x1), i.e. the first `previous` value
# in the group, not the group's `next.` value — confirm these labels are
# what is intended before relying on the column names.
paste(el(x1), 1:3, sep="_"))
cbind(x, col=o)
})
# Outer-merge the per-group frames; columns absent from a group come back NA
res <- Reduce(function(...) merge(..., all=TRUE), tmp)
res[is.na(res)] <- 0 ## set NA to zero if wanted
Result
res[order(res$id),] ## order by ID if needed
# previous next. id col.1_1 col.1_2 col.1_3 col.2_1 col.2_2 col.2_3
# 1 1 1 1 1 0 0 0 0 0
# 3 2 1 2 0 1 0 0 0 0
# 4 2 2 3 0 0 0 0 1 0
# 2 1 3 4 1 0 0 0 0 0
# 5 3 1 5 0 0 1 0 0 0
# 6 3 3 6 0 0 1 0 0 0
Data
dat <- structure(list(previous = c(1, 2, 2, 1, 3, 3), next. = c(1, 1,
2, 3, 1, 3), id = c(1, 2, 3, 4, 5, 6)), class = "data.frame", row.names = c(NA,
-6L))
Note: next as column name is not particularly a good idea, since it has a special meaning in R.
Here is a tidyverse approach:
library(tidyr)
library(dplyr)
# Tag each row with a rowid, add the previous/nxt combinations that do not
# occur in the data, build a "<previous>_<nxt>" label and spread the labels
# into indicator columns.
df %>%
rowid_to_column() %>%
complete(previous, nxt) %>%
unite(col , previous, nxt, sep = "_", remove = FALSE) %>%
# values_fn replaces each cell's rowid value(s) with the constant 1;
# cells with no entry are filled with 0 via values_fill
pivot_wider(names_from = col, values_from = rowid, values_fn = list(rowid = ~1), values_fill = list(rowid = 0)) %>%
# rows added by complete() carry NA in the remaining columns (e.g. id)
# and are removed here
na.omit() %>%
arrange(id)
# A tibble: 6 x 12
previous nxt id `1_1` `1_2` `1_3` `2_1` `2_2` `2_3` `3_1` `3_2` `3_3`
<dbl> <dbl> <dbl> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1 1 1 1 1 0 0 0 0 0 0 0 0
2 2 1 2 0 0 0 1 0 0 0 0 0
3 2 2 3 0 0 0 0 1 0 0 0 0
4 1 3 4 0 0 1 0 0 0 0 0 0
5 3 1 5 0 0 0 0 0 0 1 0 0
6 3 3 6 0 0 0 0 0 0 0 0 1
This is another tidyverse solution that differs a little (and is maybe more concise) from #H1's one.
library(dplyr)
library(tidyr)
# Flag every observed row with n = 1, then expand to all
# id x previous x next. combinations, giving the unobserved ones n = 0.
df %>%
mutate(n = 1) %>%
complete(id, previous, next., fill = list(n = 0)) %>%
# fuse previous and next. into a single label column `col`
unite(col, previous, next.) %>%
# one indicator column per label, each name prefixed with "col"
pivot_wider(names_from = col, names_prefix = "col", values_from = n) %>%
# no `by` is given, so the join uses the columns shared with df; this
# brings previous and next. back alongside the indicator columns
right_join(df)
# # A tibble: 6 x 12
# id col1_1 col1_2 col1_3 col2_1 col2_2 col2_3 col3_1 col3_2 col3_3 previous next.
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 1 0 0 0 0 0 0 0 0 1 1
# 2 2 0 0 0 1 0 0 0 0 0 2 1
# 3 3 0 0 0 0 1 0 0 0 0 2 2
# 4 4 0 0 1 0 0 0 0 0 0 1 3
# 5 5 0 0 0 0 0 0 1 0 0 3 1
# 6 6 0 0 0 0 0 0 0 0 1 3 3
You can try the code below
# Add a 9-column indicator matrix `col` to df. Position (Previous-1)*3+Next
# is set to 1 for each row, so Next varies fastest across the columns; the
# labels are generated in that same order so that col.<p>_<n> really means
# Previous == p and Next == n.
dfout <- within(df,
  col <- `colnames<-`(
    # vapply() always returns a 9 x nrow matrix, even for a single row
    t(vapply((Previous - 1) * 3 + Next,
             function(v) replace(rep(0, 9), v, 1),
             numeric(9))),
    paste(rep(1:3, each = 3), rep(1:3, times = 3), sep = "_")))
# such that
# > dfout
# Previous Next id col.1_1 col.1_2 col.1_3 col.2_1 col.2_2 col.2_3 col.3_1 col.3_2 col.3_3
# 1 1 1 1 1 0 0 0 0 0 0 0 0
# 2 2 1 2 0 0 0 1 0 0 0 0 0
# 3 2 2 3 0 0 0 0 1 0 0 0 0
# 4 1 3 4 0 0 1 0 0 0 0 0 0
# 5 3 1 5 0 0 0 0 0 0 1 0 0
# 6 3 3 6 0 0 0 0 0 0 0 0 1

Rename a range of columns in a dataframe

I'm trying to rename a range of columns in a dataframe so that they have the format [V1:V5]:
result_df = data.frame(V1 = 1, V2 = 2, V3 = 3, V4 = 4, V5 = 5, colnamethatshouldntberenamed = 6)
If the existing dataframe has the range of numbers somewhere in its column names, it's relatively straightforward (although I'm thinking there's probably a way to do it with one line of code, not two):
df1 = data.frame(X1q = 1, X2q = 2, X3q = 3, X4q = 4, X5q = 5, colnamethatshouldntberenamed = 6)
names(df1) <- gsub("X", "V", names(df1))
names(df1) <- gsub("q", "", names(df1))
But what if the column names have completely random names?
df2 = data.frame(name = 1, col = 2, random = 3, alsorandom = 4, somethingelse = 5, colnamethatshouldntberenamed = 6)
Is there a way to rename all of these columns in one-go? (assuming that they are adjoining columns in the dataframe, but there may be other columns in the dataframe with names that don't need to be changed)
If you have a different number of columns and/or you want to %>%, you can use purrr::set_names().
For example:
Sample data with 10 columns:
# 5 x 10 data frame of random 0/1 values: each of the 10 columns is an
# independent draw of five values from 0:1, sampled with replacement.
draw_column <- function() sample(0:1, 5, replace = TRUE)
example1 <- data.frame(replicate(10, draw_column()))
example1
X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
1 1 1 0 1 1 1 0 1 0 1
2 0 0 1 0 1 1 1 0 0 1
3 0 0 1 0 0 1 1 1 0 0
4 0 1 0 1 1 0 1 0 0 0
5 1 0 0 1 1 0 1 1 0 1
You can use seq_along inside set_names which will rename the columns by order (with piping):
# Rename the columns to their positions 1, 2, ..., ncol(example1)
new_names <- seq_along(example1)
example1 %>% set_names(new_names)
Results:
1 2 3 4 5 6 7 8 9 10
1 1 1 0 1 1 1 0 1 0 1
2 0 0 1 0 1 1 1 0 0 1
3 0 0 1 0 0 1 1 1 0 0
4 0 1 0 1 1 0 1 0 0 0
5 1 0 0 1 1 0 1 1 0 1
Same idea with 15 columns and naming them using paste in set_names:
# 10 x 15 data frame of random 0/1 values, then rename the columns to
# VarNum1 ... VarNum15 by position.
example2 <- data.frame(replicate(15, sample(0:1, 10, replace = TRUE)))
example2 %>%
  set_names(paste0("VarNum", seq_along(example2)))
Results
VarNum1 VarNum2 VarNum3 VarNum4 VarNum5 VarNum6 VarNum7 VarNum8 VarNum9 VarNum10 VarNum11 VarNum12 VarNum13 VarNum14 VarNum15
1 0 1 0 0 0 0 0 1 1 1 1 0 1 0 1
2 1 1 0 0 0 1 0 1 1 0 0 0 1 0 1
3 1 1 0 1 0 1 1 1 1 1 1 0 1 0 1
4 0 0 0 0 1 1 1 1 0 1 1 0 0 0 1
5 1 1 0 1 0 0 1 0 0 1 1 0 0 0 0

R - Common users across months

I have a transaction table with the following columns:
TransactionId UserId YearMonth Group
What I am trying to accomplish is to get unique users across different months.
Eg:
YearMonth Group UsersCountMonth1 UsersCountMonth2 UsersCountMonth3
201301 A 1000 900 800
201301 B 1200 940 700
201302 B 1300 1140 900
201303 A 12e0 970 706
Basically Month1 and Month2 are the incremental months based on YearMonth value for the record.
I am using this result to perform retention analysis.
I remember you were looking for a possibility to analyze subscription cohorts, yesterday. So I guess you can do
library(tidyverse)
# Reproducible toy transaction table: n rows, each with a user id (1-20),
# a transaction date (first day of a 2016 month) and a group (A or B).
set.seed(1)
n <- 100
month_starts <- seq(as.Date("2016-01-01"), as.Date("2016-12-31"), "1 month")
df <- data.frame(
  # replace = TRUE spelled out: `T` is an ordinary, reassignable variable
  user = sample(1:20, n, replace = TRUE),
  transDate = sample(month_starts, n, replace = TRUE),
  group = sample(LETTERS[1:2], n, replace = TRUE)
)
# Number of calendar months spanned from d1 to d2, counted inclusively:
# two dates in the same month give 1, the following month gives 2, etc.
# Vectorised over d1 and d2. Approach taken from
# http://stackoverflow.com/questions/1995933/number-of-months-between-two-dates
diffmonth <- function(d1, d2) {
  # Running month index of a date: 12 * (years since 1900) + month (0-11)
  month_index <- function(d) {
    parts <- as.POSIXlt(as.Date(d, origin = "1900-01-01"))
    parts$year * 12 + parts$mon
  }
  first_month <- month_index(d1)
  last_month <- month_index(d2)
  last_month - first_month + 1L
}
# Cohort table: a cohort is the first transaction month of a user within a
# group; `month` is the 1-based month offset of each transaction from it.
df %>%
  group_by(user, group) %>%
  mutate(cohort = min(transDate), month = diffmonth(cohort, transDate)) %>%
  # label cohorts as "<first month>_<group>"; TRUE/FALSE are written out
  # because T and F are ordinary variables that can be reassigned
  unite(cohort, cohort, group, remove = TRUE) %>%
  group_by(month, cohort) %>%
  summarise(n = n()) %>%
  # one column per month offset, 0 where a cohort has no transactions
  spread(month, n, fill = 0, drop = FALSE)
# # A tibble: 16 × 12
# cohort `1` `2` `3` `4` `5` `6` `7` `8` `9` `10` `11`
# * <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 2016-01-01_A 5 1 0 1 1 1 1 0 2 0 0
# 2 2016-02-01_A 1 0 0 0 0 0 0 0 1 0 1
# 3 2016-02-01_B 4 1 2 1 0 1 2 0 1 1 0
# 4 2016-03-01_A 5 0 3 1 2 2 2 0 1 2 0
# 5 2016-03-01_B 4 0 0 0 2 0 1 0 0 0 0
# 6 2016-04-01_A 4 0 2 1 0 1 0 2 1 0 0
# 7 2016-04-01_B 1 0 0 0 0 0 0 0 0 0 0
# 8 2016-05-01_A 2 0 2 2 0 0 2 0 0 0 0
# 9 2016-05-01_B 1 0 0 1 0 0 2 0 0 0 0
# 10 2016-06-01_A 1 0 2 0 0 1 0 0 0 0 0
# 11 2016-06-01_B 4 0 0 0 0 1 1 0 0 0 0
# 12 2016-07-01_A 1 0 1 0 0 0 0 0 0 0 0
# 13 2016-08-01_B 4 1 1 0 0 0 0 0 0 0 0
# 14 2016-09-01_A 1 0 0 0 0 0 0 0 0 0 0
# 15 2016-10-01_B 1 0 0 0 0 0 0 0 0 0 0
# 16 2016-12-01_A 3 0 0 0 0 0 0 0 0 0 0

Count how many times a vector/row matches data frame

I have a large data frame with "positive" (1) or "negative" (0) data points.
data example
my_data <- data.frame(cell = 1:4, marker_a = c(1, 0, 0, 0),
marker_b = c(0,1,1,1), marker_c = c(0,1,1,0), marker_d = c(0,1,0,1))
cell marker_a marker_b marker_c marker_d
1 1 1 0 0 0
2 2 0 1 1 1
3 3 0 1 1 0
4 4 0 1 0 1
...
I have a different data.frame with all the possible combinations of positive and negative markers any my_data$cell can have
combinations_df <- expand.grid(
marker_a = c(0, 1),
marker_b = c(0, 1),
marker_c = c(0, 1),
marker_d = c(0, 1)
)
marker_a marker_b marker_c marker_d
1 0 0 0 0
2 1 0 0 0
3 0 1 0 0
4 1 1 0 0
5 0 0 1 0
6 1 0 1 0
7 0 1 1 0
8 1 1 1 0
9 0 0 0 1
10 1 0 0 1
11 0 1 0 1
12 1 1 0 1
13 0 0 1 1
14 1 0 1 1
15 0 1 1 1
16 1 1 1 1
How can I get a data.frame where each row/combination is matched vs every row of my_data and return the final count for each combination
Example of expected output:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
1 14969 15223 15300 14779 14844 16049 15374 15648 15045 15517 15116 15405 14990 15347 14432 15569
I'm guessing the data.table way is fairly efficient:
library(data.table)
# Convert my_data to a data.table in place, then join it against every row of
# combinations_df on the marker columns; by = .EACHI evaluates .N (the number
# of matching my_data rows) once per combinations_df row, so combinations
# without a match are reported with N = 0.
setDT(my_data)
my_data[ combinations_df, on = names(combinations_df), .N, by = .EACHI ]
marker_a marker_b marker_c marker_d N
1: 0 0 0 0 0
2: 1 0 0 0 1
3: 0 1 0 0 0
4: 1 1 0 0 0
5: 0 0 1 0 0
6: 1 0 1 0 0
7: 0 1 1 0 1
8: 1 1 1 0 0
9: 0 0 0 1 0
10: 1 0 0 1 0
11: 0 1 0 1 1
12: 1 1 0 1 0
13: 0 0 1 1 0
14: 1 0 1 1 0
15: 0 1 1 1 1
16: 1 1 1 1 0
If you only care about combinations that show up in the data, "chain" a filtering command:
# Same per-combination count, chained with a filter that keeps only the
# combinations matched by at least one row of my_data
my_data[ combinations_df, on = names(combinations_df), .N, by = .EACHI ][ N > 0 ]
marker_a marker_b marker_c marker_d N
1: 1 0 0 0 1
2: 0 1 1 0 1
3: 0 1 0 1 1
4: 0 1 1 1 1
Alternately, in this case you don't even need combinations_df...
# Group directly on the marker columns (marker_a through marker_d) and count
# the rows in each group; only combinations present in my_data appear
my_data[, .N, by = marker_a:marker_d ]
marker_a marker_b marker_c marker_d N
1: 1 0 0 0 1
2: 0 1 1 1 1
3: 0 1 1 0 1
4: 0 1 0 1 1
You are writing your combinations in "binary", so no need of any join, but just little math. Try this:
# Read each row's four markers as a binary number (marker_a is the least
# significant bit); adding 1 gives the 1-based row index of that combination
# in combinations_df. Then count how often each of the 16 indices occurs.
bits <- as.matrix(my_data[,2:5])
combo_id <- bits %*% 2^(0:3) + 1
setNames(tabulate(combo_id, 16), 1:16)
# 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
# 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1 0
Perhaps you may need
# Collapse every row to a key string such as "0110", then, for each
# combination key, count the rows of my_data that carry the same key.
combo_keys <- do.call(paste0, combinations_df)
data_keys <- do.call(paste0, my_data[-1])
counts <- sapply(combo_keys, function(x) sum(data_keys == x))
setNames(counts, 1:nrow(combinations_df))

Resources