I'm trying to get a frequency count per column, keeping only counts above a certain threshold. My data has multiple columns, and I want code that reports the frequency of "0" in each column where that frequency is greater than 3.
My dataset is like this:
```
a b c d e f g h
0 1 0 1 1 1 1 1
2 0 0 0 0 0 0 0
0 1 2 2 2 1 0 1
0 0 0 0 1 0 0 0
1 0 2 1 1 0 0 0
1 1 0 0 1 0 0 0
0 1 2 2 2 2 2 2
```
The output I need is:
```
Variable Frequency
a 4
c 4
f 4
g 5
h 4
```
```
So this shows the count of "0"s per column of the data frame wherever that count is greater than 3.
Thank you.
You can use colSums to count the number of 0s in each column, then subset the values that are greater than 3.
subset(stack(colSums(df == 0, na.rm = TRUE)), values > 3)
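To see what each step returns (a quick illustration, using the df from the data block at the end of this answer):
```
colSums(df == 0, na.rm = TRUE)  # named vector of zero counts per column
# a b c d e f g h
# 4 3 4 3 1 4 5 4

# stack() turns the named vector into a two-column data frame
# (values, ind), which subset() then filters on values > 3
```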
A tidyverse way would be:
library(dplyr)
df %>%
  summarise(across(.fns = ~ sum(. == 0, na.rm = TRUE))) %>%
  tidyr::pivot_longer(cols = everything()) %>%
  filter(value > 3)
# name value
# <chr> <int>
#1 a 4
#2 c 4
#3 f 4
#4 g 5
#5 h 4
data
df <- structure(list(a = c(0L, 2L, 0L, 0L, 1L, 1L, 0L), b = c(1L, 0L,
1L, 0L, 0L, 1L, 1L), c = c(0L, 0L, 2L, 0L, 2L, 0L, 2L), d = c(1L,
0L, 2L, 0L, 1L, 0L, 2L), e = c(1L, 0L, 2L, 1L, 1L, 1L, 2L), f = c(1L,
0L, 1L, 0L, 0L, 0L, 2L), g = c(1L, 0L, 0L, 0L, 0L, 0L, 2L), h = c(1L,
0L, 1L, 0L, 0L, 0L, 2L)), class = "data.frame", row.names = c(NA, -7L))
My dataset looks like this:
ID choice_situation Alternative Attr1 Attr2 Attr3 choice
ID_1 1 1 0 0 0 0
ID_1 1 2 1 1 0 1
ID_1 2 1 1 1 0 0
ID_1 2 2 1 1 1 1
ID_1 3 1 2 1 0 1
ID_1 3 2 3 1 0 0
ID_2 1 1 3 0 1 1
ID_2 1 2 0 0 0 0
ID_2 2 1 2 1 1 0
ID_2 2 2 2 1 1 1
ID_2 3 1 0 0 0 1
ID_2 3 2 0 0 1 0
.....
Every time I run the mlogit code below
DCE_data<- mlogit.data(data=dataset, choice = "choice", shape = "long", alt.var = "Alternative", id.var = "ID") #ok
model<- mlogit(choice ~ Attr1 + Attr2 + Attr3 | 0, DCE_data)#error
I get the error below:
Error in dfidx(x, .idx, pkg = pkg) :
the two indexes don't define unique observations
Is the problem in the transformed data, DCE_data?
Thanks in advance!
For me your code works:
library(tidyverse)
df <- tibble::tribble(
~ID, ~choice_situation, ~Alternative, ~Attr1, ~Attr2, ~Attr3, ~choice,
"ID_1", 1L, 1L, 0L, 0L, 0L, 0L,
"ID_1", 1L, 2L, 1L, 1L, 0L, 1L,
"ID_1", 2L, 1L, 1L, 1L, 0L, 0L,
"ID_1", 2L, 2L, 1L, 1L, 1L, 1L,
"ID_1", 3L, 1L, 2L, 1L, 0L, 1L,
"ID_1", 3L, 2L, 3L, 1L, 0L, 0L,
"ID_2", 1L, 1L, 3L, 0L, 1L, 1L,
"ID_2", 1L, 2L, 0L, 0L, 0L, 0L,
"ID_2", 2L, 1L, 2L, 1L, 1L, 0L,
"ID_2", 2L, 2L, 2L, 1L, 1L, 1L,
"ID_2", 3L, 1L, 0L, 0L, 0L, 1L,
"ID_2", 3L, 2L, 0L, 0L, 1L, 0L
)
library(mlogit)
DCE_data<- mlogit.data(data=df, choice = "choice", shape = "long", alt.var = "Alternative", id.var = "ID") #ok
model<- mlogit(choice ~ Attr1 + Attr2 + Attr3 | 0, DCE_data) # no error here
summary(model)
> model
Call:
mlogit(formula = choice ~ Attr1 + Attr2 + Attr3 | 0, data = DCE_data, method = "nr")
Coefficients:
Attr1 Attr2 Attr3
0.34137 14.86152 0.39473
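If the full dataset still triggers the error, it is often because ID and Alternative alone don't uniquely identify rows once each ID has several choice situations. A possible workaround (a sketch, untested against the actual data) is to build an explicit choice-situation index and pass it via chid.var:
```
# hypothetical fix: one unique id per (person, choice situation) pair
dataset$chid <- paste(dataset$ID, dataset$choice_situation, sep = "_")
DCE_data <- mlogit.data(data = dataset, choice = "choice", shape = "long",
                        alt.var = "Alternative", chid.var = "chid", id.var = "ID")
```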
I would like to calculate the mean of every SACCADIC_RT where COMMISSION_ERROR = 1 and the row follows 5 consecutive rows with HITS = 1, per ID and per CONDITION.
ID | TRIAL | TRIAL_TYPE | CONDITION | COMMISSION_ERROR | HITS | SACCADIC_RT
1 183 nogo square_1 1 -1 175
1 54 go square_1 -1 1 259
1 26 nogo square_1 1 -1 365
1 3 nogo square_1 1 -1 346
1 100 nogo square_1 1 -1 287
1 11 go square_1 -1 1 164
1 52 go square_1 -1 1 244
1 8 go square_1 -1 1 223
1 10 go square_1 -1 1 183
1 21 go square_1 -1 1 234
1 32 go square_1 1 -1 221
1 2 go square_1 -1 1 183
1 13 nogo square_1 0 -1 -1
1 87 nogo square_2 1 -1 228
1 95 nogo square_2 1 -1 274
1 111 go square_2 -1 1 305
1 28 nogo square_2 0 -1 309
1 65 go square_2 -1 0 -1
1 40 nogo square_1 0 -1 199
1 19 nogo square_1 0 -1 207
1 28 go square_1 -1 1 257
2 45 nogo square_1 1 -1 169
2 197 nogo square_1 1 -1 350
2 115 nogo square_1 1 -1 321
2 65 go square_2 -1 1 298
2 24 go square_2 -1 0 -1
2 1 nogo square_2 1 -1 183
2 77 go square_2 -1 1 225
2 90 go square_2 -1 1 305
2 89 go square_2 -1 1 210
2 104 go square_2 -1 1 199
2 116 go square_2 -1 1 175
2 29 nogo square_2 1 -1 99
2 41 go square_2 -1 1 104
The sample table can be recreated in R as:
structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), TRIAL = c(183L, 54L, 26L,
3L, 100L, 11L, 52L, 8L, 10L, 21L, 32L, 2L, 13L, 87L, 95L, 111L,
28L, 65L, 40L, 19L, 28L, 45L, 197L, 115L, 65L, 24L, 1L, 77L,
90L, 89L, 104L, 116L, 29L, 41L), TRIAL_TYPE = structure(c(2L,
1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L,
1L, 2L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L,
1L), .Label = c("go", "nogo"), class = "factor"), CONDITION = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), .Label = c("square_1", "square_2"), class = "factor"), COMMISSION_ERROR = c(1L,
-1L, 1L, 1L, 1L, -1L, -1L, -1L, -1L, -1L, 1L, -1L, 0L, 1L, 1L,
-1L, 0L, -1L, 0L, 0L, -1L, 1L, 1L, 1L, -1L, -1L, 1L, -1L, -1L,
-1L, -1L, -1L, 1L, -1L), X..HITS. = c(-1L, 1L, -1L, -1L, -1L,
1L, 1L, 1L, 1L, 1L, -1L, 1L, -1L, -1L, -1L, 1L, -1L, 0L, -1L,
-1L, 1L, -1L, -1L, -1L, 1L, 0L, -1L, 1L, 1L, 1L, 1L, 1L, -1L,
1L), SACCADIC_RT = c(175L, 259L, 365L, 346L, 287L, 164L, 244L,
223L, 183L, 234L, 221L, 183L, -1L, 228L, 274L, 305L, 309L, -1L,
199L, 207L, 257L, 169L, 350L, 321L, 298L, -1L, 183L, 225L, 305L,
210L, 199L, 175L, 99L, 104L)), .Names = c("ID", "TRIAL",
"TRIAL_TYPE", "CONDITION", "COMMISSION_ERROR", "HITS",
"SACCADIC_RT"), class = "data.frame", row.names = c(NA, -34L))
So the result for this example would be:
ID | CONDITION | x
1 square_1 221
2 square_2 99
You can use the data.table package to perform this task.
The steps are as follows:
1) for each ID and CONDITION, calculate the rolling sum of HITS over 5 rows
2) keep only rows satisfying two conditions: COMMISSION_ERROR = 1 on the current row, and a value of 5 in the rolling-sum column on the previous row
3) calculate the mean of SACCADIC_RT for each ID and CONDITION in the table created in step 2
# load your data
data <- read.csv("./yourData.csv")
# load data table library
library(data.table)
# convert your data to data.table object
data <- data.table(data)
# group data by ID and Condition, calculate rolling sum over 5 rows
data[, roll := Reduce('+', shift(HITS, 0:4)), by = list(ID, CONDITION)]
# take only rows where there were 5 hits in a row and commission error is 1
newData <- data[shift(roll, 1) == 5 & COMMISSION_ERROR == 1]
# calculate mean of SACCADIC_RT for each ID and Condition in the new dataset
newData[, meanSacc := mean(SACCADIC_RT), by = list(ID, CONDITION)]
Code Explained:
roll := Reduce('+', shift(HITS, 0:4))
The shift function lets you compute the value in the current row from values in previous rows. Here Reduce('+', shift(HITS, 0:4)) takes the value of HITS in the given row and adds the values of HITS in the 4 previous rows, giving a rolling sum over 5 rows. This value is written to a new column called roll.
newData <- data[shift(roll, 1) == 5 & COMMISSION_ERROR == 1]
The above code keeps only rows from the original dataset where the previous row has the value 5 in the roll column and the current row has COMMISSION_ERROR equal to 1.
newData[, meanSacc := mean(SACCADIC_RT), by = list(ID, CONDITION)]
The above snippet calculates the mean of SACCADIC_RT for each ID and CONDITION, using only the rows in the new dataset created above. The means are written to a new column called meanSacc.
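As a quick check against the sample data (a sketch; it assumes the column names from the dput in the question):
```
# collapse to one row per group; this should match the expected result:
# ID 1 / square_1 -> 221, ID 2 / square_2 -> 99
unique(newData[, list(ID, CONDITION, meanSacc)])
```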
This is my matrix in R:
[,1] [,2] [,3]
[1,] 5 0 0
[2,] 0 0 5
[3,] 0 2 3
[4,] 1 2 2
[5,] 5 0 0
[6,] 1 4 0
[7,] 4 1 0
[8,] 0 0 5
[9,] 1 2 2
[10,] 3 2 0
[11,] 4 0 1
mat <- structure(c(5L, 0L, 0L, 1L, 5L, 1L, 4L, 0L, 1L, 3L, 4L, 0L, 0L,
2L, 2L, 0L, 4L, 1L, 0L, 2L, 2L, 0L, 0L, 5L, 3L, 2L, 0L, 0L, 0L,
5L, 2L, 0L, 1L), .Dim = c(11L, 3L))
I need to find how many rows do not contain a zero; in this case the answer is 2:
the 4th row (1,2,2)
the 9th row, also (1,2,2)
Is there a command for this, or should I write a routine? I tried two for() loops, but it was clumsy.
Quick answer:
sum( 0 < apply(mat, 1, prod) )
(This relies on the entries being nonnegative; a row with an even number of negative values would also have a positive product.)
also:
nonzerorows <- 0 < apply(mat,1,prod) # logical selector of rows
mat[ nonzerorows, ]
mat[!nonzerorows, ]
which(nonzerorows)
sum(nonzerorows)
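For the matrix in the question, the last two commands give:
```
which(nonzerorows)
# [1] 4 9
sum(nonzerorows)
# [1] 2
```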
OP's data:
mat <- structure(c(5L, 0L, 0L, 1L, 5L, 1L, 4L, 0L, 1L, 3L, 4L, 0L, 0L,
2L, 2L, 0L, 4L, 1L, 0L, 2L, 2L, 0L, 0L, 5L, 3L, 2L, 0L, 0L, 0L,
5L, 2L, 0L, 1L), .Dim = c(11L, 3L))
Another way, demonstrated on a random example matrix (no seed is set, so the values differ between runs):
mat <- matrix(sample(0:4, 16, replace = TRUE), 4, 4)
mat
# [,1] [,2] [,3] [,4]
# [1,] 4 1 2 2
# [2,] 3 3 1 1
# [3,] 1 2 4 4
# [4,] 0 4 4 4
apply(mat, 1, function(x) all(x!=0))
# [1] TRUE TRUE TRUE FALSE
which(apply(mat, 1, function(x) all(x!=0)))
# [1] 1 2 3
A more general approach, e.g. when you're checking for a value other than zero, could be this:
sum(apply(mat, 1, function(x) !(0 %in% x)))
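For the zero test specifically, a vectorized alternative (a common base R idiom, not from the answers above) avoids apply entirely:
```
# a row counts if none of its entries equal 0
sum(rowSums(mat == 0) == 0)
```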
I came across a table of frequency counts today that I had to expand into a data frame of raw values. I was able to do it, but wonder whether there's a faster way using the reshape package or data.table.
The original table looked like this:
i1 i2 i3 i4 m f
1 0 0 0 0 22 29
2 1 0 0 0 30 50
3 0 1 0 0 13 15
4 0 0 1 0 1 6
5 1 1 0 0 24 67
6 1 0 1 0 5 12
7 0 1 1 0 1 2
8 1 1 1 0 10 22
9 0 0 0 1 10 7
10 1 0 0 1 27 30
11 0 1 0 1 14 4
12 0 0 1 1 1 0
13 1 1 0 1 54 63
14 1 0 1 1 8 10
15 0 1 1 1 8 6
16 1 1 1 1 57 51
Here's an easy grab of the data using dput:
dat <- structure(list(i1 = c(0L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 1L,
0L, 0L, 1L, 1L, 0L, 1L), i2 = c(0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L,
0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L), i3 = c(0L, 0L, 0L, 1L, 0L, 1L,
1L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 1L), i4 = c(0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), m = c(22L, 30L,
13L, 1L, 24L, 5L, 1L, 10L, 10L, 27L, 14L, 1L, 54L, 8L, 8L, 57L
), f = c(29L, 50L, 15L, 6L, 67L, 12L, 2L, 22L, 7L, 30L, 4L, 0L,
63L, 10L, 6L, 51L)), .Names = c("i1", "i2", "i3", "i4", "m",
"f"), class = "data.frame", row.names = c(NA, -16L))
My approaches to reshaping the data (is there a faster way?):
#step 1: method 1 (in this case binding and stacking uses less code than reshape)
dat2 <- data.frame(rbind(dat[, 1:4], dat[, 1:4]),
                   sex = rep(c('m', 'f'), each = 16),
                   n = c(dat$m, dat$f))
dat2
#step 1: method 2
dat3 <- reshape(dat, direction = "long", idvar = 1:4,
                varying = list(c("m", "f")),
                v.names = "n",
                timevar = "sex",
                times = c("m", "f"))
rownames(dat3) <- 1:nrow(dat3)
dat3 <- data.frame(dat3)
dat3$sex <- as.factor(dat3$sex)
all.equal(dat3, dat2) #just to show both method 1 and 2 give the same data frame
#step 2
dat4 <- dat2[rep(seq_len(nrow(dat2)), dat2$n), 1:5]
rownames(dat4) <- 1:nrow(dat4)
dat4
I assume this is a common problem, since reproducing a table from an article requires this kind of unpacking. I find myself doing it more and more and want to make sure I'm being efficient.
Here is a one-liner with plyr.
library(plyr)
dat2 <- ddply(dat, 1:4, summarize, sex = c(rep('m', m), rep('f', f)))
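A quick sanity check (a sketch): the expanded frame should contain one row per counted individual.
```
nrow(dat2) == sum(dat$m) + sum(dat$f)
# [1] TRUE  (659 rows: 285 m + 374 f)
```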
And here's a base R one-liner.
dat2 <- cbind(dat[c(rep(1:nrow(dat), dat$m), rep(1:nrow(dat), dat$f)), 1:4],
              sex = c(rep("m", sum(dat$m)), rep("f", sum(dat$f))))
Or, a little more generally:
d1 <- dat[,1:4]
d2 <- as.matrix(dat[,5:6])
dat2 <- cbind(d1[rep(rep(1:nrow(dat), ncol(d2)), d2), ],
              sex = rep(colnames(d2), colSums(d2)))
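To unpack the indexing (a small illustration, not part of the original answer): rep(1:nrow(dat), ncol(d2)) lists the row indices once per count column, and the outer rep(..., d2) repeats each index by the matching count in d2, taken column-major (all the m counts, then all the f counts).
```
idx <- rep(rep(1:nrow(dat), ncol(d2)), d2)
length(idx)  # 659: one index entry per counted individual
```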
Given that nobody has posted a data.table solution (as suggested in the original question):
library(data.table)
DT <- as.data.table(dat)
DT[,list(sex = rep(c('m','f'),c(m,f))), by= list(i1,i2,i3,i4)]
Or, even more succinctly
DT[,list(sex = rep(c('m','f'),c(m,f))), by= 'i1,i2,i3,i4']
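In data.table, .() is shorthand for list(), so the same call can also be written as (an equivalent spelling, not from the original answer):
```
DT[, .(sex = rep(c('m', 'f'), c(m, f))), by = .(i1, i2, i3, i4)]
```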
I would use melt for the first step and ddply for the second.
library(reshape2)
library(plyr)
d <- ddply(
  melt(dat, id.vars = c("i1", "i2", "i3", "i4"), variable.name = "sex"),
  c("i1", "i2", "i3", "i4", "sex"),
  summarize,
  id = rep(1, value)  # emit one row (a 1) per counted individual
)
d$id <- cumsum(d$id)  # turn the 1s into a running row id