How to find the statistical mode of each ID - r

Here are the observations of two individuals of my dataset.
data=structure(list(id = c(2L, 2L, 2L, 3L, 3L, 3L), trt = c(1L, 1L,
1L, 1L, 1L, 1L), status = c(0L, 0L, 0L, 2L, 2L, 2L), stage = c(3L,
3L, 3L, 4L, 4L, 4L), spiders = c(1L, 1L, 1L, 0L, 1L, 0L), sex = structure(c(2L,
2L, 2L, 1L, 1L, 1L), .Label = c("m", "f"), class = "factor"),
hepato = c(1L, 1L, 1L, 0L, 1L, 0L), edema = c(0, 0, 0, 0.5,
0, 0.5), ascites = c(0L, 0L, 0L, 0L, 0L, 0L)), row.names = c(NA,
-6L), class = "data.frame")
I want to calculate the the statistical mode for each individual after grouping by id. I used this code below:
library(dplyr)
library(modeest)
data%>%
group_by(id)%>%mutate(edema2=mlv(edema))
And I get an error message when calculating the mode, while this method work well with other statistical parameters such as mean, sd, min, max....

The warnings that you are getting are suggesting two things.
You have not specified what method to choose so default method 'shorth' is used.
It is suggesting that there is a tie in selection of Mode value.
Alternatively, why not use the Mode function from here :
Mode <- function(x) {
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
To apply by group you can use it with dplyr as :
library(dplyr)
data%>% group_by(id)%>% mutate(edema2= Mode(edema))
# id trt status stage spiders sex hepato edema ascites edema2
# <int> <int> <int> <int> <int> <fct> <int> <dbl> <int> <dbl>
#1 2 1 0 3 1 f 1 0 0 0
#2 2 1 0 3 1 f 1 0 0 0
#3 2 1 0 3 1 f 1 0 0 0
#4 3 1 2 4 0 m 0 0.5 0 0.5
#5 3 1 2 4 1 m 1 0 0 0.5
#6 3 1 2 4 0 m 0 0.5 0 0.5

Related

Get new columns based on data from other columns

My data:
data <- structure(list(col1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), col2 = c(0L, 1L, 1L, 0L, 0L,
1L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L)), class = "data.frame", row.names = c(NA,
-18L))
I want to get 2 new columns based on col1 and col2.
column 3 is obtained: We leave units if there is zero in the second column, 2 are simply transferred.
column 4 will turn out: We leave units if there is one in the second column, 2 are simply transferred.
What I want to get:
data <- structure(list(col1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), col2 = c(0L, 1L, 1L, 0L, 0L,
1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), group1 = c(1L,
NA, NA, 1L, 1L, NA, 1L, NA, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), group2 = c(NA, 1L, 1L, NA, NA, 1L, NA, 1L, NA, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L)), class = "data.frame", row.names = c(NA,
-18L))
A solution that uses tidyr::pivot_wider():
library(dplyr)
data %>%
mutate(id = 1:n(), name = paste0("group", col2 + 1), value = 1) %>%
tidyr::pivot_wider() %>%
mutate(col2 = replace(col2, col1 == 2, 0),
across(starts_with("group"), replace, col1 == 2, 2)) %>%
select(-id)
# A tibble: 18 x 4
col1 col2 group1 group2
<int> <dbl> <dbl> <dbl>
1 1 0 1 NA
2 1 1 NA 1
3 1 1 NA 1
4 1 0 1 NA
5 1 0 1 NA
6 1 1 NA 1
7 1 0 1 NA
8 1 1 NA 1
9 1 0 1 NA
10 2 0 2 2
11 2 0 2 2
12 2 0 2 2
13 2 0 2 2
14 2 0 2 2
15 2 0 2 2
16 2 0 2 2
17 2 0 2 2
18 2 0 2 2
You can use ifelse to get group1 and group2.
transform(data
, group1 = ifelse(col1==2, 2, ifelse(col2==0, 1, NA))
, group2 = ifelse(col1==2, 2, ifelse(col2==1, 1, NA))
)
# col1 col2 group1 group2
#1 1 0 1 NA
#2 1 1 NA 1
#3 1 1 NA 1
#4 1 0 1 NA
#5 1 0 1 NA
#6 1 1 NA 1
#7 1 0 1 NA
#8 1 1 NA 1
#9 1 0 1 NA
#10 2 0 2 2
#11 2 1 2 2
#12 2 1 2 2
#13 2 0 2 2
#14 2 0 2 2
#15 2 1 2 2
#16 2 0 2 2
#17 2 1 2 2
#18 2 0 2 2

Filtering rows based on dynamic column count & column name in R

I am having a data frame named inputDf which have the binary values in all the columns other than Rating column.
inputDf <- structure(list(Q1 = c(0L, 1L, 1L, 1L, 1L, 0L, 0L, 1L), Q2 = c(1L,
1L, 1L, 1L, 1L, 0L, 1L, 0L), Q3 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), Q4 = c(1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L), Q5 = c(1L, 1L, 1L,
1L, 1L, 0L, 0L, 1L), Q6 = c(1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L),
Q7 = c(1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L), Q8 = c(1L, 1L, 1L,
1L, 1L, 1L, 0L, 0L), Q9 = c(1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L
), Q10 = c(0L, 1L, 0L, 1L, 1L, 0L, 0L, 0L), Q11 = c(1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L), Q12 = c(1L, 1L, 1L, 1L, 1L, 1L,
0L, 1L), Rating = c(7L, 7L, 6L, 5L, 6L, 6L, 7L, 5L), RatingBinary = c(1L,
1L, 1L, 0L, 1L, 1L, 1L, 0L)), row.names = c(13L, 17L, 26L,
71L, 72L, 55L, 56L, 69L), class = "data.frame")
I am having another similar data frame named combinationDf
combinationDf <- structure(list(Q1 = c(0L, 0L), Q2 = c(0L, 0L), Q3 = 1:0, Q4 = c(1L,
1L), Q5 = c(0L, 0L), Q6 = c(0L, 0L), Q7 = c(0L, 0L), Q8 = c(0L,
0L), Q9 = c(0L, 0L), Q10 = c(0L, 0L), Q11 = c(1L, 1L), Q12 = 0:1), row.names = 1:2, class =
"data.frame")
The problem statement is for every combination of 1's in each row in combinationDf, I need to filter rows from inputDf
I implemented the logic by hard-coding the number of columns to be considered for filtering out the data.
finalDf <- data.frame()
for(i in 1:nrow(combinationDf)){
ind <- which(combinationDf[i,] == 1)
ind <- paste("Q",ind, sep = "")
sample <- inputDf %>%
dplyr::filter(eval(parse(text=ind[1])) == 1 & eval(parse(text=ind[2])) == 1 & eval(parse(text=ind[3])) == 1) %>%
as.data.frame()
finalDf <<- rbind(finalDf,sample)
}
However, I'm looking for the general code to filter out the data using N # of columns. i.e, the above code works for filtering using 3 columns. If I need to filter based on 4 columns, I need to add a condition. To overcome that, I used the code below,
sample <- inputDf %>%
dplyr::filter(as.logical(paste(paste0("eval(parse(text = ind[", 1:length(ind), "])) == 1"), collapse = " & "))) %>%
as.data.frame()
This snippet doesn't filter the rows as expected. Can anyone point me out the mistake I have done in the above code? Or can provide the best approach to achieve the same?
It may make sense to subset and then do a semi join for filtering
finalDf <- data.frame()
for(i in 1:nrow(combinationDf)){
sample <- inputDf %>%
semi_join(combinationDf %>% slice(i) %>% select(where(~.x==1)))
finalDf2 <- rbind(finalDf ,sample)
}
At each loop iteration we select all the columns that are 1 and then just join to extract the matching values from inputDf. This will work with any number of columns. Another way of expressing this without the loop in dplyr is
combinationDf %>%
group_by(id=1:n()) %>%
group_map(~.x %>%
select(where(~.x==1)) %>%
semi_join(inputDf, .)
) %>%
bind_rows()
This may be more readable.
Base R approach :
Use apply in rowwise fashion to go through each row in combinationDf.
Get the column names which has value as 1 in a row.
Subset those columns in inputDf and select rows where all the values are 1.
Combine the list of dataframes into one dataframe.
result <- do.call(rbind, apply(combinationDf, 1, function(x)
inputDf[rowSums(inputDf[names(x)[x == 1]] != 1) == 0, ]))
rownames(result) <- NULL
result
# Q1 Q2 Q3 Q4 Q5 Q6 Q7 Q8 Q9 Q10 Q11 Q12 Rating RatingBinary
#1 0 1 1 1 1 1 1 1 1 0 1 1 7 1
#2 1 1 1 1 1 1 1 1 1 1 1 1 7 1
#3 1 1 1 1 1 1 1 1 1 0 1 1 6 1
#4 1 1 1 1 1 1 1 1 1 1 1 1 5 0
#5 1 1 1 1 1 1 1 1 1 1 1 1 6 1
#6 0 1 1 1 1 1 1 1 1 0 1 1 7 1
#7 1 1 1 1 1 1 1 1 1 1 1 1 7 1
#8 1 1 1 1 1 1 1 1 1 0 1 1 6 1
#9 1 1 1 1 1 1 1 1 1 1 1 1 5 0
#10 1 1 1 1 1 1 1 1 1 1 1 1 6 1
combinationDf %>%
apply(1, function(x) paste0(names(inputDf)[x == 1], "==1", collapse = "&")) %>%
lapply(function(x) filter(inputDf, eval(parse(text = x)))) %>%
Reduce(rbind, .)

How do I convert this adjacency matrix into a graph object?

I have a matrix that represents social interaction data on a CSV, which looks like below:
`0` `1` `2` `3` `4` `5` `6` `7` `8` `9`
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
0 0 29 1 0 1 9 3 0 1 4
1 1 0 0 1 3 1 0 1 1 1
2 1 1 0 13 4 0 1 1 15 0
3 3 0 1 0 1 1 7 1 1 1
4 1 0 1 98 0 1 1 1 1 2
5 2 5 1 1 3 0 2 0 1 5
6 1 1 0 0 12 1 0 2 1 1
7 1 1 0 1 0 1 9 0 1 2
8 1 1 17 13 145 1 39 1 0 1
9 88 23 1 5 1 2 1 7 1 0
I am new to social network analysis, so I am not sure of my terminology, but this seems like a weighted adjacency matrix to me, as we can say from this that student 1 has had 29 interactions with student 0 in the last year. I had this object stored as a data-frame in my RStudio, but when I ran the following code, I received the below error:
> fn <- graph_from_adjacency_matrix(output, weighted = T)
Error in mde(x) : 'list' object cannot be coerced to type 'double'
I've tried converting it to matrix, but that does not seem to work either. Any help concerning this would be really appreciated.
You need to convert your data.frame to matrix first and then apply graph_from_adjacency_matrix, e.g.,
g <- graph_from_adjacency_matrix(as.matrix(df),weighted = TRUE)
and plot(g) gives
Data
> dput(df)
structure(list(``0`` = c(0L, 1L, 1L, 3L, 1L, 2L, 1L, 1L, 1L,
88L), ``1`` = c(29L, 0L, 1L, 0L, 0L, 5L, 1L, 1L, 1L, 23L), ``2`` = c(1L,
0L, 0L, 1L, 1L, 1L, 0L, 0L, 17L, 1L), ``3`` = c(0L, 1L, 13L,
0L, 98L, 1L, 0L, 1L, 13L, 5L), ``4`` = c(1L, 3L, 4L, 1L, 0L,
3L, 12L, 0L, 145L, 1L), ``5`` = c(9L, 1L, 0L, 1L, 1L, 0L, 1L,
1L, 1L, 2L), ``6`` = c(3L, 0L, 1L, 7L, 1L, 2L, 0L, 9L, 39L, 1L
), ``7`` = c(0L, 1L, 1L, 1L, 1L, 0L, 2L, 0L, 1L, 7L), ``8`` = c(1L,
1L, 15L, 1L, 1L, 1L, 1L, 1L, 0L, 1L), ``9`` = c(4L, 1L, 0L, 1L,
2L, 5L, 1L, 2L, 1L, 0L)), class = "data.frame", row.names = c("0",
"1", "2", "3", "4", "5", "6", "7", "8", "9"))

Error with using mlogit R function: "The two indexes don't define unique observations"

My dataset look like this
ID choice_situation Alternative Attr1 Attr2 Attr3 choice
ID_1 1 1 0 0 0 0
ID_1 1 2 1 1 0 1
ID_1 2 1 1 1 0 0
ID_1 2 2 1 1 1 1
ID_1 3 1 2 1 0 1
ID_1 3 2 3 1 0 0
ID_2 1 1 3 0 1 1
ID_2 1 2 0 0 0 0
ID_2 2 1 2 1 1 0
ID_2 2 2 2 1 1 1
ID_2 3 1 0 0 0 1
ID_2 3 2 0 0 1 0
.....
Every time I run the code of mlogit function
DCE_data<- mlogit.data(data=dataset, choice = "choice", shape = "long", alt.var = "Alternative", id.var = "ID") #ok
model<- mlogit(choice ~ Attr1 + Attr2 + Attr3 | 0, DCE_data)#error
I get the error below :
Error in dfidx(x, .idx, pkg = pkg) :
the two indexes don't define unique observations
The problem is from the transformed data : DCE_data ?
Thanks in advance!
For me your code works:
library(tidyverse)
df <- tibble::tribble(
~ID, ~choice_situation, ~Alternative, ~Attr1, ~Attr2, ~Attr3, ~choice,
"ID_1", 1L, 1L, 0L, 0L, 0L, 0L,
"ID_1", 1L, 2L, 1L, 1L, 0L, 1L,
"ID_1", 2L, 1L, 1L, 1L, 0L, 0L,
"ID_1", 2L, 2L, 1L, 1L, 1L, 1L,
"ID_1", 3L, 1L, 2L, 1L, 0L, 1L,
"ID_1", 3L, 2L, 3L, 1L, 0L, 0L,
"ID_2", 1L, 1L, 3L, 0L, 1L, 1L,
"ID_2", 1L, 2L, 0L, 0L, 0L, 0L,
"ID_2", 2L, 1L, 2L, 1L, 1L, 0L,
"ID_2", 2L, 2L, 2L, 1L, 1L, 1L,
"ID_2", 3L, 1L, 0L, 0L, 0L, 1L,
"ID_2", 3L, 2L, 0L, 0L, 1L, 0L
)
library(mlogit)
DCE_data<- mlogit.data(data=df, choice = "choice", shape = "long", alt.var = "Alternative", id.var = "ID") #ok
model<- mlogit(choice ~ Attr1 + Attr2 + Attr3 | 0, DCE_data)#error
summary(model)
> model
Call:
mlogit(formula = choice ~ Attr1 + Attr2 + Attr3 | 0, data = DCE_data, method = "nr")
Coefficients:
Attr1 Attr2 Attr3
0.34137 14.86152 0.39473

Counting incidences from one data frame, entering results into a different data frame

I have two data frames: households and individuals.
This is households:
structure(list(ID = 1:5), class = "data.frame", row.names = c(NA,
-5L))
This is individuals:
structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L,
3L, 4L, 4L, 4L, 4L, 5L, 5L), Yesno = c(1L, 0L, 1L, 0L, 0L, 0L,
1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L)), class = "data.frame", row.names = c(NA,
-17L))
I'm trying to to add a new column to households that counts the number of times variable Yesno is equal to 1, grouping results by ID.
I have tried
households$Count <- as.numeric(ave(individuals$Yesno[individuals$Yesno == 1], households$ID, FUN = count))
households should look like this:
ID Count
1 2
2 3
3 0
4 2
5 1
Option 1: In base R
Using merge and aggregate
aggregate(Yesno ~ ID, merge(households, individuals), FUN = sum)
# ID Yesno
#1 1 2
#2 2 3
#3 3 0
#4 4 2
#5 5 1
Option 2: With dplyr
Using left_join and group_by+summarise
library(dplyr)
left_join(households, individuals) %>%
group_by(ID) %>%
summarise(Count = sum(Yesno))
#Joining, by = "ID"
## A tibble: 5 x 2
# ID Count
# <int> <int>
#1 1 2
#2 2 3
#3 3 0
#4 4 2
#5 5 1
Option 3: With data.table
library(data.table)
setDT(households)
setDT(individuals)
households[individuals, on = "ID"][, .(Count = sum(Yesno)), by = ID]
# ID Count
#1: 1 2
#2: 2 3
#3: 3 0
#4: 4 2
#5: 5 1
Sample data
households <- structure(list(ID = 1:5), class = "data.frame", row.names = c(NA,
-5L))
individuals <- structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L,
3L, 4L, 4L, 4L, 4L, 5L, 5L), Yesno = c(1L, 0L, 1L, 0L, 0L, 0L,
1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L)), class = "data.frame", row.names = c(NA,
-17L))
Another base R approach using sapply is to loop over each ID in households and subset that ID from individuals and count how many of them have 1 in Yesno column.
households$Count <- sapply(households$ID, function(x)
sum(individuals$Yesno[individuals$ID == x] == 1))
households
# ID Count
#1 1 2
#2 2 3
#3 3 0
#4 4 2
#5 5 1
The == 1 part in the function can be removed if the Yesno column has only 0's and 1's.

Resources