Related
My data:
data <- structure(list(col1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), col2 = c(0L, 1L, 1L, 0L, 0L,
1L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L)), class = "data.frame", row.names = c(NA,
-18L))
I want to get 2 new columns based on col1 and col2.
column 3 is obtained: We leave units if there is zero in the second column, 2 are simply transferred.
column 4 will turn out: We leave units if there is one in the second column, 2 are simply transferred.
What I want to get:
data <- structure(list(col1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), col2 = c(0L, 1L, 1L, 0L, 0L,
1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), group1 = c(1L,
NA, NA, 1L, 1L, NA, 1L, NA, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), group2 = c(NA, 1L, 1L, NA, NA, 1L, NA, 1L, NA, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L)), class = "data.frame", row.names = c(NA,
-18L))
A solution that uses tidyr::pivot_wider():
library(dplyr)
data %>%
mutate(id = 1:n(), name = paste0("group", col2 + 1), value = 1) %>%
tidyr::pivot_wider() %>%
mutate(col2 = replace(col2, col1 == 2, 0),
across(starts_with("group"), replace, col1 == 2, 2)) %>%
select(-id)
# A tibble: 18 x 4
col1 col2 group1 group2
<int> <dbl> <dbl> <dbl>
1 1 0 1 NA
2 1 1 NA 1
3 1 1 NA 1
4 1 0 1 NA
5 1 0 1 NA
6 1 1 NA 1
7 1 0 1 NA
8 1 1 NA 1
9 1 0 1 NA
10 2 0 2 2
11 2 0 2 2
12 2 0 2 2
13 2 0 2 2
14 2 0 2 2
15 2 0 2 2
16 2 0 2 2
17 2 0 2 2
18 2 0 2 2
You can use ifelse to get group1 and group2.
transform(data
, group1 = ifelse(col1==2, 2, ifelse(col2==0, 1, NA))
, group2 = ifelse(col1==2, 2, ifelse(col2==1, 1, NA))
)
# col1 col2 group1 group2
#1 1 0 1 NA
#2 1 1 NA 1
#3 1 1 NA 1
#4 1 0 1 NA
#5 1 0 1 NA
#6 1 1 NA 1
#7 1 0 1 NA
#8 1 1 NA 1
#9 1 0 1 NA
#10 2 0 2 2
#11 2 1 2 2
#12 2 1 2 2
#13 2 0 2 2
#14 2 0 2 2
#15 2 1 2 2
#16 2 0 2 2
#17 2 1 2 2
#18 2 0 2 2
I am having a data frame named inputDf which have the binary values in all the columns other than Rating column.
inputDf <- structure(list(Q1 = c(0L, 1L, 1L, 1L, 1L, 0L, 0L, 1L), Q2 = c(1L,
1L, 1L, 1L, 1L, 0L, 1L, 0L), Q3 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), Q4 = c(1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L), Q5 = c(1L, 1L, 1L,
1L, 1L, 0L, 0L, 1L), Q6 = c(1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L),
Q7 = c(1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L), Q8 = c(1L, 1L, 1L,
1L, 1L, 1L, 0L, 0L), Q9 = c(1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L
), Q10 = c(0L, 1L, 0L, 1L, 1L, 0L, 0L, 0L), Q11 = c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), Q12 = c(1L, 1L, 1L, 1L, 1L, 1L,
0L, 1L), Rating = c(7L, 7L, 6L, 5L, 6L, 6L, 7L, 5L), RatingBinary = c(1L,
1L, 1L, 0L, 1L, 1L, 1L, 0L)), row.names = c(13L, 17L, 26L,
71L, 72L, 55L, 56L, 69L), class = "data.frame")
I am having another similar data frame named combinationDf
combinationDf <- structure(list(Q1 = c(0L, 0L), Q2 = c(0L, 0L), Q3 = 1:0, Q4 = c(1L,
1L), Q5 = c(0L, 0L), Q6 = c(0L, 0L), Q7 = c(0L, 0L), Q8 = c(0L,
0L), Q9 = c(0L, 0L), Q10 = c(0L, 0L), Q11 = c(1L, 1L), Q12 = 0:1), row.names = 1:2, class =
"data.frame")
The problem statement is for every combination of 1's in each row in combinationDf, I need to filter rows from inputDf
I implemented the logic by hard-coding the number of columns to be considered for filtering out the data.
finalDf <- data.frame()
for(i in 1:nrow(combinationDf)){
ind <- which(combinationDf[i,] == 1)
ind <- paste("Q",ind, sep = "")
sample <- inputDf %>%
dplyr::filter(eval(parse(text=ind[1])) == 1 & eval(parse(text=ind[2])) == 1 & eval(parse(text=ind[3])) == 1) %>%
as.data.frame()
finalDf <<- rbind(finalDf,sample)
}
However, I'm looking for the general code to filter out the data using N # of columns. i.e, the above code works for filtering using 3 columns. If I need to filter based on 4 columns, I need to add a condition. To overcome that, I used the code below,
sample <- inputDf %>%
dplyr::filter(as.logical(paste(paste0("eval(parse(text = ind[", 1:length(ind), "])) == 1"), collapse = " & "))) %>%
as.data.frame()
This snippet doesn't filter the rows as expected. Can anyone point me out the mistake I have done in the above code? Or can provide the best approach to achieve the same?
It may make sense to subset and then do a semi join for filtering
finalDf <- data.frame()
for(i in 1:nrow(combinationDf)){
sample <- inputDf %>%
semi_join(combinationDf %>% slice(i) %>% select(where(~.x==1)))
finalDf2 <- rbind(finalDf ,sample)
}
At each loop iteration we select all the columns that are 1 and then just join to extract the matching values from inputDf. This will work with any number of columns. Another way of expressing this without the loop in dplyr is
combinationDf %>%
group_by(id=1:n()) %>%
group_map(~.x %>%
select(where(~.x==1)) %>%
semi_join(inputDf, .)
) %>%
bind_rows()
This may be more readable.
Base R approach :
Use apply in rowwise fashion to go through each row in combinationDf.
Get the column names which has value as 1 in a row.
Subset those columns in inputDf and select rows where all the values are 1.
Combine the list of dataframes into one dataframe.
result <- do.call(rbind, apply(combinationDf, 1, function(x)
inputDf[rowSums(inputDf[names(x)[x == 1]] != 1) == 0, ]))
rownames(result) <- NULL
result
# Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 Q11 Q12 Rating RatingBinary
#1 0 1 1 1 1 1 1 1 1 0 1 1 7 1
#2 1 1 1 1 1 1 1 1 1 1 1 1 7 1
#3 1 1 1 1 1 1 1 1 1 0 1 1 6 1
#4 1 1 1 1 1 1 1 1 1 1 1 1 5 0
#5 1 1 1 1 1 1 1 1 1 1 1 1 6 1
#6 0 1 1 1 1 1 1 1 1 0 1 1 7 1
#7 1 1 1 1 1 1 1 1 1 1 1 1 7 1
#8 1 1 1 1 1 1 1 1 1 0 1 1 6 1
#9 1 1 1 1 1 1 1 1 1 1 1 1 5 0
#10 1 1 1 1 1 1 1 1 1 1 1 1 6 1
combinationDf %>%
apply(1, function(x) paste0(names(inputDf)[x == 1], "==1", collapse = "&")) %>%
lapply(function(x) filter(inputDf, eval(parse(text = x)))) %>%
Reduce(rbind, .)
My dataset look like this
ID choice_situation Alternative Attr1 Attr2 Attr3 choice
ID_1 1 1 0 0 0 0
ID_1 1 2 1 1 0 1
ID_1 2 1 1 1 0 0
ID_1 2 2 1 1 1 1
ID_1 3 1 2 1 0 1
ID_1 3 2 3 1 0 0
ID_2 1 1 3 0 1 1
ID_2 1 2 0 0 0 0
ID_2 2 1 2 1 1 0
ID_2 2 2 2 1 1 1
ID_2 3 1 0 0 0 1
ID_2 3 2 0 0 1 0
.....
Every time I run the code of mlogit function
DCE_data<- mlogit.data(data=dataset, choice = "choice", shape = "long", alt.var = "Alternative", id.var = "ID") #ok
model<- mlogit(choice ~ Attr1 + Attr2 + Attr3 | 0, DCE_data)#error
I get the error below :
Error in dfidx(x, .idx, pkg = pkg) :
the two indexes don't define unique observations
The problem is from the transformed data : DCE_data ?
Thanks in advance!
For me your code works:
library(tidyverse)
df <- tibble::tribble(
~ID, ~choice_situation, ~Alternative, ~Attr1, ~Attr2, ~Attr3, ~choice,
"ID_1", 1L, 1L, 0L, 0L, 0L, 0L,
"ID_1", 1L, 2L, 1L, 1L, 0L, 1L,
"ID_1", 2L, 1L, 1L, 1L, 0L, 0L,
"ID_1", 2L, 2L, 1L, 1L, 1L, 1L,
"ID_1", 3L, 1L, 2L, 1L, 0L, 1L,
"ID_1", 3L, 2L, 3L, 1L, 0L, 0L,
"ID_2", 1L, 1L, 3L, 0L, 1L, 1L,
"ID_2", 1L, 2L, 0L, 0L, 0L, 0L,
"ID_2", 2L, 1L, 2L, 1L, 1L, 0L,
"ID_2", 2L, 2L, 2L, 1L, 1L, 1L,
"ID_2", 3L, 1L, 0L, 0L, 0L, 1L,
"ID_2", 3L, 2L, 0L, 0L, 1L, 0L
)
library(mlogit)
DCE_data<- mlogit.data(data=df, choice = "choice", shape = "long", alt.var = "Alternative", id.var = "ID") #ok
model<- mlogit(choice ~ Attr1 + Attr2 + Attr3 | 0, DCE_data)#error
summary(model)
> model
Call:
mlogit(formula = choice ~ Attr1 + Attr2 + Attr3 | 0, data = DCE_data, method = "nr")
Coefficients:
Attr1 Attr2 Attr3
0.34137 14.86152 0.39473
Here is some mock data related to this problem:
structure(list(HHID = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 4L, 4L, 4L, 4L, 4L), PERS = c(1L, 2L, 3L, 4L, 5L, 1L,
2L, 3L, 4L, 1L, 2L, 3L, 1L, 2L, 3L, 4L, 5L), MARSTAT = c(2L,
2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 5L, 1L, 1L
), SEX = c(1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L,
1L, 2L, 2L, 1L), VAR1 = c(NA, 1L, 4L, 4L, 4L, NA, 1L, 5L, 4L,
NA, 4L, 4L, NA, 1L, 8L, 4L, 4L), VAR2 = c(NA, NA, 4L, 4L, 4L,
NA, NA, 4L, 5L, NA, NA, 6L, NA, NA, 12L, 4L, 4L), VAR3 = c(NA,
NA, NA, 6L, 6L, NA, NA, NA, 7L, NA, NA, NA, NA, NA, NA, 11L,
11L), VAR4 = c(NA, NA, NA, NA, 6L, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, 6L), VAR5 = c(NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_), FLAG = c(0L,
0L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L
)), .Names = c("HHID", "PERS", "MARSTAT", "SEX", "VAR1", "VAR2",
"VAR3", "VAR4", "VAR5", "FLAG"), row.names = c(NA, 17L), class = "data.frame")
For each household in my data, I want to transpose the values in the lower triangle into the upper triangle so that for each household I essentially have a symmetrical matrix with the diagonal either NA or 0 (for this analysis, 0 and NA are interchangeable). So based on the above example, I would be looking for the following dataset:
structure(list(HHID = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 4L, 4L, 4L, 4L, 4L), PERS = c(1L, 2L, 3L, 4L, 5L, 1L,
2L, 3L, 4L, 1L, 2L, 3L, 1L, 2L, 3L, 4L, 5L), MARSTAT = c(2L,
2L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 5L, 1L, 1L
), SEX = c(1L, 2L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 2L,
1L, 2L, 2L, 1L), VAR1 = c(NA, 1L, 4L, 4L, 4L, NA, 1L, 5L, 4L,
NA, 4L, 4L, NA, 1L, 8L, 4L, 4L), VAR2 = c(1L, NA, 4L, 4L, 4L,
1L, NA, 4L, 5L, 4L, NA, 6L, 1L, NA, 12L, 4L, 4L), VAR3 = c(4L,
4L, NA, 6L, 6L, 5L, 4L, NA, 7L, 4L, 6L, NA, 8L, 12L, NA, 11L,
11L), VAR4 = c(4L, 4L, 6L, NA, 6L, 4L, 5L, 7L, NA, NA, NA, NA,
4L, 4L, 11L, NA, 6L), VAR5 = c(4L, 4L, 6L, 6L, NA, NA, NA, NA,
NA, NA, NA, NA, 4L, 4L, 11L, 6L, NA), FLAG = c(0L, 0L, 0L, 1L,
0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 4L, 4L, 11L, 1L, 1L)), .Names = c("HHID",
"PERS", "MARSTAT", "SEX", "VAR1", "VAR2", "VAR3", "VAR4", "VAR5",
"FLAG"), class = "data.frame", row.names = c(NA, -17L))
I have been able to do this for one household, as follows (though it misses the HHID which I would need to distinguish between households):
HH1 <- df %>%
filter(HHID == 1) %>%
select(VAR1, VAR2, VAR3, VAR4, VAR5)
HH1 <- as.matrix(HH1)
HH1[is.na(HH1)] <- 0
T_HH1 <- t(HH1)
T_HH1[is.na(T_HH1)] <- 0
combo <- HH1 + T_HH1
A <- combo
However, how would I go about doing this for multiple households across my dataset, also keeping the "HHID" and "PERS" information so that I can link on any extra info if needed?
Thank you so much in advance!
One approach is:
Split your data frame by HHID into groups
Create a custom function to take VAR columns, make it a square matrix, and transpose
Use rbindlist to reconstruct into rows again using fill to add NA as lengths in the list differ
Replace VAR columns (5 through 9) with new VAR columns
Let me know if this works for you.
f <- function(m) {
m <- m[, 1:nrow(m)]
m[upper.tri(m)] <- t(m)[upper.tri(m)]
m
}
df1[,5:9] <- rbindlist(lapply(split(df1[,5:9], df1$HHID), f), fill = TRUE)
Output
HHID PERS MARSTAT SEX VAR1 VAR2 VAR3 VAR4 VAR5 FLAG
1 1 1 2 1 NA 1 4 4 4 0
2 1 2 2 2 1 NA 4 4 4 0
3 1 3 1 2 4 4 NA 6 6 0
4 1 4 1 1 4 4 6 NA 6 1
5 1 5 1 1 4 4 6 6 NA 0
6 2 1 2 2 NA 1 5 4 NA 0
7 2 2 2 1 1 NA 4 5 NA 0
8 2 3 1 2 5 4 NA 7 NA 1
9 2 4 1 1 4 5 7 NA NA 1
10 3 1 1 2 NA 4 4 NA NA 0
11 3 2 1 2 4 NA 6 NA NA 1
12 3 3 1 1 4 6 NA NA NA 0
13 4 1 2 2 NA 1 8 4 4 0
14 4 2 2 1 1 NA 12 4 4 0
15 4 3 5 2 8 12 NA 11 11 0
16 4 4 1 2 4 4 11 NA 6 1
17 4 5 1 1 4 4 11 6 NA 1
additional solution
library(purrr)
library(tidyverse)
df %>%
mutate_all(~ replace_na(., 0)) %>%
select(HHID, starts_with("VAR")) %>%
group_by(HHID) %>%
nest %>%
mutate(data = map(data, ~ .x + t(.x))) %>%
unnest(data) %>%
bind_cols(select(df, -starts_with("VAR"), -HHID))
You can split the data on the HHID, apply an anonymous function to do the matrix stuff, then unsplit it.
vars <- grep("^VAR", names(df))
df[, vars] <- unsplit(lapply(split(df[, vars], df$HHID), tt), df$HHID)
# HHID PERS MARSTAT SEX VAR1 VAR2 VAR3 VAR4 VAR5 FLAG
# 1 1 1 2 1 0 1 4 4 4 0
# 2 1 2 2 2 1 0 4 4 4 0
# 3 1 3 1 2 4 4 0 6 6 0
# 4 1 4 1 1 4 4 6 0 6 1
# 5 1 5 1 1 4 4 6 6 0 0
# 6 2 1 2 2 0 1 5 4 0 0
# 7 2 2 2 1 1 0 4 5 0 0
# 8 2 3 1 2 5 4 0 7 0 0
# 9 2 4 1 1 4 5 7 0 0 0
# 10 3 1 1 2 0 4 4 0 0 0
# 11 3 2 1 2 4 0 6 0 0 0
# 12 3 3 1 1 4 6 0 0 0 0
# 13 4 1 2 2 0 1 8 4 4 0
# 14 4 2 2 1 1 0 12 4 4 0
# 15 4 3 5 2 8 12 0 11 11 0
# 16 4 4 1 2 4 4 11 0 6 1
# 17 4 5 1 1 4 4 11 6 0 1
Here's the anonymous function:
tt <- function(x) {
x <- x[, 1:nrow(x)] # Make it square
x[upper.tri(x)] <- 0 # replace upper triangle with 0
x + t(x) # add them together
}
I have two data frames: households and individuals.
This is households:
structure(list(ID = 1:5), class = "data.frame", row.names = c(NA,
-5L))
This is individuals:
structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L,
3L, 4L, 4L, 4L, 4L, 5L, 5L), Yesno = c(1L, 0L, 1L, 0L, 0L, 0L,
1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L)), class = "data.frame", row.names = c(NA,
-17L))
I'm trying to to add a new column to households that counts the number of times variable Yesno is equal to 1, grouping results by ID.
I have tried
households$Count <- as.numeric(ave(individuals$Yesno[individuals$Yesno == 1], households$ID, FUN = count))
households should look like this:
ID Count
1 2
2 3
3 0
4 2
5 1
Option 1: In base R
Using merge and aggregate
aggregate(Yesno ~ ID, merge(households, individuals), FUN = sum)
# ID Yesno
#1 1 2
#2 2 3
#3 3 0
#4 4 2
#5 5 1
Option 2: With dplyr
Using left_join and group_by+summarise
library(dplyr)
left_join(households, individuals) %>%
group_by(ID) %>%
summarise(Count = sum(Yesno))
#Joining, by = "ID"
## A tibble: 5 x 2
# ID Count
# <int> <int>
#1 1 2
#2 2 3
#3 3 0
#4 4 2
#5 5 1
Option 3: With data.table
library(data.table)
setDT(households)
setDT(individuals)
households[individuals, on = "ID"][, .(Count = sum(Yesno)), by = ID]
# ID Count
#1: 1 2
#2: 2 3
#3: 3 0
#4: 4 2
#5: 5 1
Sample data
households <- structure(list(ID = 1:5), class = "data.frame", row.names = c(NA,
-5L))
individuals <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L,
3L, 4L, 4L, 4L, 4L, 5L, 5L), Yesno = c(1L, 0L, 1L, 0L, 0L, 0L,
1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L)), class = "data.frame", row.names = c(NA,
-17L))
Another base R approach using sapply is to loop over each ID in households and subset that ID from individuals and count how many of them have 1 in Yesno column.
households$Count <- sapply(households$ID, function(x)
sum(individuals$Yesno[individuals$ID == x] == 1))
households
# ID Count
#1 1 2
#2 2 3
#3 3 0
#4 4 2
#5 5 1
The == 1 part in the function can be removed if the Yesno column has only 0's and 1's.