I have a dataset like this:
> dput(head(BurnData))
structure(list(Treatment = c(0L, 0L, 0L, 0L, 0L, 0L), Gender = c(0L,
0L, 0L, 0L, 0L, 0L), Race = c(0L, 1L, 1L, 0L, 1L, 1L), Surface = c(15L,
20L, 15L, 20L, 70L, 20L), head = c(0L, 0L, 0L, 1L, 1L, 1L), buttock = c(0L,
0L, 0L, 0L, 1L, 0L), trunk = c(1L, 1L, 0L, 1L, 1L, 1L), `upper leg` = c(1L,
0L, 1L, 0L, 1L, 0L), `lower leg` = c(0L, 0L, 1L, 0L, 0L, 0L),
`respiratory tract` = c(0L, 0L, 0L, 0L, 0L, 0L), type = c(2L,
4L, 2L, 2L, 2L, 4L), `excision time` = c(12L, 9L, 13L, 11L,
28L, 11L), excision = c(0L, 0L, 0L, 1L, 1L, 0L), `antibiotic time` = c(12L,
9L, 13L, 29L, 31L, 11L), antibiotic = c(0L, 0L, 0L, 0L, 0L,
0L), infection_t = c(12L, 9L, 7L, 29L, 4L, 8L), infection = c(0L,
0L, 1L, 0L, 1L, 1L)), .Names = c("Treatment", "Gender", "Race",
"Surface", "head", "buttock", "trunk", "upper leg", "lower leg",
"respiratory tract", "type", "excision time", "excision", "antibiotic time",
"antibiotic", "infection_t", "infection"), row.names = c(NA,
6L), class = "data.frame")
I am trying to create a new variable that combines the indicators head, buttock, trunk, upper leg, lower leg, and respiratory tract into ONE new variable, where 0 means all indicators are zero, 1 means only head, 2 means only buttock, and so on up to 6 for only respiratory tract, with 8 for any combination of them.
I have been trying to do this with mutate from dplyr but I cannot get it right. I am not very good at this.
Here is an approach in base R using nested ifelse statements (here d1 is your data frame, and columns 5:10 are the six indicator columns):
ifelse(rowSums(d1[5:10]) > 1, 8,
       ifelse(rowSums(d1[5:10]) == 0, 0, max.col(d1[5:10])))
#1 2 3 4 5 6
#8 3 8 8 8 8
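To keep the result with your data, assign it to a new column (the column name region is just illustrative):
d1$region <- ifelse(rowSums(d1[5:10]) > 1, 8,
                    ifelse(rowSums(d1[5:10]) == 0, 0, max.col(d1[5:10])))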
You can also try case_when from the tidyverse:
library(tidyverse)
d %>%
  select(head:`respiratory tract`) %>%
  mutate(res = case_when(rowSums(.) == 0 ~ 0,
                         rowSums(.) > 1 ~ 8,
                         head == 1 ~ 1,
                         buttock == 1 ~ 2,
                         trunk == 1 ~ 3,
                         `upper leg` == 1 ~ 4,
                         `lower leg` == 1 ~ 5,
                         `respiratory tract` == 1 ~ 6)) %>%
  select(res) %>%
  bind_cols(d, .)
Treatment Gender Race Surface head buttock trunk upper leg lower leg respiratory tract type
1 0 0 0 15 0 0 1 1 0 0 2
2 0 0 1 20 0 0 1 0 0 0 4
3 0 0 1 15 0 0 0 1 1 0 2
4 0 0 0 20 1 0 1 0 0 0 2
5 0 0 1 70 1 1 1 1 0 0 2
6 0 0 1 20 1 0 1 0 0 0 4
excision time excision antibiotic time antibiotic infection_t infection res
1 12 0 12 0 12 0 8
2 9 0 9 0 9 0 3
3 13 0 13 0 7 1 8
4 11 1 29 0 29 0 8
5 28 1 31 0 4 1 8
6 11 0 11 0 8 1 8
Or, making full use of Sotos' elegant max.col() solution:
mutate(res = case_when(rowSums(.) == 0 ~ 0L,
                       rowSums(.) > 1 ~ 8L,
                       TRUE ~ max.col(.)))
Note the integer literals 0L and 8L: case_when() requires all right-hand sides to share a common type, and max.col() returns an integer.
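Both steps can also be written against the full data frame in a single mutate(), without the separate select() and bind_cols() afterwards. A sketch of the same logic, where ind simply holds the six indicator column names:
library(dplyr)
ind <- c("head", "buttock", "trunk", "upper leg", "lower leg", "respiratory tract")
BurnData %>%
  mutate(res = case_when(rowSums(.[ind]) == 0 ~ 0L,   # . is BurnData via the magrittr pipe
                         rowSums(.[ind]) > 1 ~ 8L,
                         TRUE ~ max.col(.[ind])))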
I am following Exercise 3 of the mlogit package vignette, https://cran.r-project.org/web/packages/mlogit/vignettes/e3mxlogit.html, but attempting to use my own data (see below):
structure(list(Choice.Set = c(4L, 5L, 7L, 8L, 10L, 12L), Alternative = c(2L,
1L, 1L, 2L, 2L, 2L), respondent = c(1L, 1L, 1L, 1L, 1L, 1L),
code = c(7L, 9L, 13L, 15L, 19L, 23L), Choice = c(1L, 1L,
1L, 1L, 1L, 1L), price1 = c(0L, 0L, 1L, 1L, 0L, 0L), price2 = c(0L,
1L, 0L, 0L, 1L, 1L), price3 = c(0L, 0L, 0L, 0L, 0L, 0L),
price4 = c(1L, 0L, 0L, 0L, 0L, 0L), price5 = c(0L, 0L, 0L,
0L, 0L, 0L), zone1 = c(0L, 0L, 0L, 1L, 1L, 1L), zone2 = c(0L,
0L, 0L, 0L, 0L, 0L), zone3 = c(1L, 0L, 1L, 0L, 0L, 0L), zone4 = c(0L,
1L, 0L, 0L, 0L, 0L), lic1 = c(0L, 0L, 0L, 0L, 0L, 0L), lic2 = c(1L,
0L, 1L, 0L, 1L, 1L), lic3 = c(0L, 1L, 0L, 1L, 0L, 0L), enf1 = c(0L,
0L, 1L, 0L, 1L, 0L), enf2 = c(0L, 0L, 0L, 1L, 0L, 1L), enf3 = c(1L,
1L, 0L, 0L, 0L, 0L), chid = 1:6), row.names = c(4L, 5L, 7L,
8L, 10L, 12L), class = "data.frame")
I have run into an error when running the code:
dfml <- dfidx(df, idx = list(c("chid", "respondent")),
              choice = "Alternative", varying = 6:20, sep = "")
"Error in reshapeLong(data, idvar = idvar, timevar = timevar, varying = varying, :
'varying' arguments must be the same length"
I have checked the data and each column from 6:20 is the same length; however, some respondents chose some of the options more often than others. Can someone point out where I have gone wrong? It's my first attempt at analyzing choice experiment data.
The error means that your price attribute has five options, whereas the others (zone, lic, enf) have fewer; dfidx can't handle that mismatch. You need to provide the missing columns, at least as NA columns.
df <- transform(df, zone5=NA, lic4=NA, lic5=NA, enf4=NA, enf5=NA)
library(mlogit)
dfml <- dfidx(df, idx=list(c("chid","respondent")), choice="Alternative",
varying=grep('^price|^zone|^lic|^enf', names(df)), sep="")
dfml
# ~~~~~~~
# first 10 observations out of 30
# ~~~~~~~
# Choice.Set Alternative code Choice price zone lic enf idx
# 1 4 FALSE 7 1 0 0 0 0 1:1
# 2 4 TRUE 7 1 0 0 1 0 1:2
# 3 4 FALSE 7 1 0 1 0 1 1:3
# 4 4 FALSE 7 1 1 0 NA NA 1:4
# 5 4 FALSE 7 1 0 NA NA NA 1:5
# 6 5 TRUE 9 1 0 0 0 0 2:1
# 7 5 FALSE 9 1 1 0 0 0 2:2
# 8 5 FALSE 9 1 0 0 1 1 2:3
# 9 5 FALSE 9 1 0 1 NA NA 2:4
# 10 5 FALSE 9 1 0 NA NA NA 2:5
#
# ~~~ indexes ~~~~
# chid respondent id2
# 1 1 1 1
# 2 1 1 2
# 3 1 1 3
# 4 1 1 4
# 5 1 1 5
# 6 2 1 1
# 7 2 1 2
# 8 2 1 3
# 9 2 1 4
# 10 2 1 5
# indexes: 1, 1, 2
I use grep here to identify the varying= columns. Get out of the habit of lazily specifying variables by position number; it's dangerous, since column order can easily change with small edits to the script.
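If you want to check which columns a pattern picks up before relying on it, value = TRUE returns the matching names instead of their positions:
grep('^price|^zone|^lic|^enf', names(df), value = TRUE)
# prints the matching column names, so the selection keeps working if columns move around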
I'm trying to get frequency counts per column and keep them only when they exceed a certain number. My data has multiple columns, and I want code that reports the frequency of "0" in each column whenever that frequency is greater than 3.
My dataset is like this:
a b c d e f g h
0 1 0 1 1 1 1 1
2 0 0 0 0 0 0 0
0 1 2 2 2 1 0 1
0 0 0 0 1 0 0 0
1 0 2 1 1 0 0 0
1 1 0 0 1 0 0 0
0 1 2 2 2 2 2 2
The output that I need from the code is:
Variable Frequency
a 4
c 4
f 4
g 5
h 4
So this shows the number of "0"s in each column of the data frame whenever that count is greater than 3.
Thank you.
You can use colSums to count the number of 0's in each column and then subset the values which are greater than 3.
subset(stack(colSums(df == 0, na.rm = TRUE)), values > 3)
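If a plain named vector is enough, you can also filter the counts directly:
counts <- colSums(df == 0, na.rm = TRUE)
counts[counts > 3]
#a c f g h 
#4 4 4 5 4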
The tidyverse way would be:
library(dplyr)
df %>%
  summarise(across(.fns = ~sum(. == 0, na.rm = TRUE))) %>%
  tidyr::pivot_longer(cols = everything()) %>%
  filter(value > 3)
# name value
# <chr> <int>
#1 a 4
#2 c 4
#3 f 4
#4 g 5
#5 h 4
data
df <- structure(list(a = c(0L, 2L, 0L, 0L, 1L, 1L, 0L), b = c(1L, 0L,
1L, 0L, 0L, 1L, 1L), c = c(0L, 0L, 2L, 0L, 2L, 0L, 2L), d = c(1L,
0L, 2L, 0L, 1L, 0L, 2L), e = c(1L, 0L, 2L, 1L, 1L, 1L, 2L), f = c(1L,
0L, 1L, 0L, 0L, 0L, 2L), g = c(1L, 0L, 0L, 0L, 0L, 0L, 2L), h = c(1L,
0L, 1L, 0L, 0L, 0L, 2L)), class = "data.frame", row.names = c(NA, -7L))
I have a dataset that looks something like this:
> head(BurnData)
Treatment Gender Race Surface head buttock trunk up.leg low.leg resp.tract type ex.time excision antib.time antibiotic
1 0 0 0 15 0 0 1 1 0 0 2 12 0 12 0
2 0 0 1 20 0 0 1 0 0 0 4 9 0 9 0
3 0 0 1 15 0 0 0 1 1 0 2 13 0 13 0
4 0 0 0 20 1 0 1 0 0 0 2 11 1 29 0
5 0 0 1 70 1 1 1 1 0 0 2 28 1 31 0
6 0 0 1 20 1 0 1 0 0 0 4 11 0 11 0
inf.time infection
1 12 0
2 9 0
3 7 1
4 29 0
5 4 1
6 8 1
I want to run a Cox regression on the variables Surface, ex.time, antib.time, and Treatment. Treatment is an indicator variable, Surface denotes the % of body burned, and ex.time and antib.time both record time to event in days.
I am aware that to run a time-dependent Cox regression I need to convert the data to a longitudinal (start-stop) structure, but how can I do it in R?
Then I will use the formula:
coxph(formula = Surv(tstart, tstop, infection) ~ covariate)
DATA
> dput(head(BurnData))
structure(list(Treatment = c(0L, 0L, 0L, 0L, 0L, 0L), Gender = c(0L,
0L, 0L, 0L, 0L, 0L), Race = c(0L, 1L, 1L, 0L, 1L, 1L), Surface = c(15L,
20L, 15L, 20L, 70L, 20L), head = c(0L, 0L, 0L, 1L, 1L, 1L), buttock = c(0L,
0L, 0L, 0L, 1L, 0L), trunk = c(1L, 1L, 0L, 1L, 1L, 1L), up.leg = c(1L,
0L, 1L, 0L, 1L, 0L), low.leg = c(0L, 0L, 1L, 0L, 0L, 0L), resp.tract = c(0L,
0L, 0L, 0L, 0L, 0L), type = c(2L, 4L, 2L, 2L, 2L, 4L), ex.time = c(12L,
9L, 13L, 11L, 28L, 11L), excision = c(0L, 0L, 0L, 1L, 1L, 0L),
antib.time = c(12L, 9L, 13L, 29L, 31L, 11L), antibiotic = c(0L,
0L, 0L, 0L, 0L, 0L), inf.time = c(12L, 9L, 7L, 29L, 4L, 8L
), infection = c(0L, 0L, 1L, 0L, 1L, 1L), Surface_discr = structure(c(1L,
1L, 1L, 1L, 2L, 1L), .Label = c("1", "2"), class = "factor"),
ex.time_discr = c(1L, 1L, 1L, 1L, 2L, 1L), antib.time_discr = c(1L,
1L, 1L, 2L, 2L, 1L)), .Names = c("Treatment", "Gender", "Race",
"Surface", "head", "buttock", "trunk", "up.leg", "low.leg", "resp.tract",
"type", "ex.time", "excision", "antib.time", "antibiotic", "inf.time",
"infection", "Surface_discr", "ex.time_discr", "antib.time_discr"
), row.names = c(NA, 6L), class = "data.frame")
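One common way to build that start-stop structure is survival::tmerge(). Below is a minimal sketch under the assumption that infection is the outcome and that excision and antibiotic enter as time-dependent 0/1 covariates (rather than using the raw times directly); the helper columns and the names excision_td / antibiotic_td are purely illustrative.
library(survival)

burn <- BurnData
burn$id <- seq_len(nrow(burn))

# only switch a covariate on for subjects where that event actually happened;
# tmerge() ignores additions whose time is NA
burn$ex.evt    <- ifelse(burn$excision == 1, burn$ex.time, NA)
burn$antib.evt <- ifelse(burn$antibiotic == 1, burn$antib.time, NA)

# one row per subject with follow-up (0, inf.time] and infection as the event
long <- tmerge(burn, burn, id = id, infect = event(inf.time, infection))
# add covariates that switch from 0 to 1 at their respective times
long <- tmerge(long, burn, id = id,
               excision_td = tdc(ex.evt),
               antibiotic_td = tdc(antib.evt))

coxph(Surv(tstart, tstop, infect) ~ Treatment + Surface + excision_td + antibiotic_td,
      data = long)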
I have a data table:
> COUNT_ID_CATEGORY
id 706 799 1703 1726 2119 2202 3203 3504 3509 4401 4517 5122 5558 5616 5619 5824 6202 7205 9115 9909
1: 86246 9 0 15 4 28 0 15 63 39 5 7 25 27 43 12 64 1 16 0 96
2: 86252 3 0 17 6 21 0 6 62 24 6 7 12 25 32 6 49 1 26 0 103
3: 12262064 3 0 1 1 12 0 0 2 1 0 0 0 2 4 0 4 0 0 0 12
4: 12277270 2 0 0 0 1 0 3 0 3 0 0 0 0 24 0 6 2 5 0 60
5: 12332190 2 0 2 0 4 0 1 2 0 0 0 1 0 3 0 1 3 2 0 46
---
310661: 4837642552 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0
310662: 4843417324 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 2 0 0 0 0
310663: 4847628950 2 0 1 1 16 0 0 2 3 0 0 2 9 5 0 3 3 2 3 14
310664: 4847787712 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
310665: 4853598737 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0
> class(COUNT_ID_CATEGORY)
[1] "data.table" "data.frame"
>
and I wish to read the data as quickly as possible as follows:
COUNT_ID_CATEGORY for (id == 86246) & (category == 706)
which should return the value 9 (top left in the table), for example.
I can get the row with:
COUNT_ID_CATEGORY[id==86246,]
but how do I get the column?
> dput(head(COUNT_ID_CATEGORY))
structure(list(id = c(86246, 86252, 12262064, 12277270, 12332190,
12524696), `706` = c(9L, 3L, 3L, 2L, 2L, 0L), `799` = c(0L, 0L,
0L, 0L, 0L, 0L), `1703` = c(15L, 17L, 1L, 0L, 2L, 0L), `1726` = c(4L,
6L, 1L, 0L, 0L, 0L), `2119` = c(28L, 21L, 12L, 1L, 4L, 0L), `2202` = c(0L,
0L, 0L, 0L, 0L, 0L), `3203` = c(15L, 6L, 0L, 3L, 1L, 0L), `3504` = c(63L,
62L, 2L, 0L, 2L, 11L), `3509` = c(39L, 24L, 1L, 3L, 0L, 3L),
`4401` = c(5L, 6L, 0L, 0L, 0L, 1L), `4517` = c(7L, 7L, 0L,
0L, 0L, 1L), `5122` = c(25L, 12L, 0L, 0L, 1L, 0L), `5558` = c(27L,
25L, 2L, 0L, 0L, 1L), `5616` = c(43L, 32L, 4L, 24L, 3L, 18L
), `5619` = c(12L, 6L, 0L, 0L, 0L, 0L), `5824` = c(64L, 49L,
4L, 6L, 1L, 10L), `6202` = c(1L, 1L, 0L, 2L, 3L, 6L), `7205` = c(16L,
26L, 0L, 5L, 2L, 4L), `9115` = c(0L, 0L, 0L, 0L, 0L, 0L),
`9909` = c(96L, 103L, 12L, 60L, 46L, 1L)), .Names = c("id",
"706", "799", "1703", "1726", "2119", "2202", "3203", "3504",
"3509", "4401", "4517", "5122", "5558", "5616", "5619", "5824",
"6202", "7205", "9115", "9909"), sorted = "id", class = c("data.table",
"data.frame"), row.names = c(NA, -6L), .internal.selfref = <pointer: 0x043a24a0>)
First setkey for fast lookup using data.table's binary search/subset feature:
setkey(COUNT_ID_CATEGORY, id)
Then you can do:
COUNT_ID_CATEGORY[J(86246)][, '706', with=FALSE]
The first part, COUNT_ID_CATEGORY[J(86246)], performs a fast subset using binary search. You can read more about J(.) and what it does in the data.table documentation.
The next part, [, '706', with=FALSE], takes the subset result, which is itself a data.table, and selects just the column 706.
Just to be complete, this post shows more ways of selecting/subsetting columns from a data.table.
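On current data.table versions you can also skip setkey and join with on=, then pull the single cell out with [[ (a minimal sketch):
COUNT_ID_CATEGORY[.(86246), '706', on = 'id', with = FALSE][[1]]
# 9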
I came across a table of frequency counts today that I had to expand into a data frame of raw values. I was able to do it, but was wondering if there's a faster way using the reshape package or data.table?
The original table looked like this:
i1 i2 i3 i4 m f
1 0 0 0 0 22 29
2 1 0 0 0 30 50
3 0 1 0 0 13 15
4 0 0 1 0 1 6
5 1 1 0 0 24 67
6 1 0 1 0 5 12
7 0 1 1 0 1 2
8 1 1 1 0 10 22
9 0 0 0 1 10 7
10 1 0 0 1 27 30
11 0 1 0 1 14 4
12 0 0 1 1 1 0
13 1 1 0 1 54 63
14 1 0 1 1 8 10
15 0 1 1 1 8 6
16 1 1 1 1 57 51
Here's an easy grab of the data using dput:
dat <- structure(list(i1 = c(0L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 1L,
0L, 0L, 1L, 1L, 0L, 1L), i2 = c(0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L,
0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L), i3 = c(0L, 0L, 0L, 1L, 0L, 1L,
1L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 1L), i4 = c(0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), m = c(22L, 30L,
13L, 1L, 24L, 5L, 1L, 10L, 10L, 27L, 14L, 1L, 54L, 8L, 8L, 57L
), f = c(29L, 50L, 15L, 6L, 67L, 12L, 2L, 22L, 7L, 30L, 4L, 0L,
63L, 10L, 6L, 51L)), .Names = c("i1", "i2", "i3", "i4", "m",
"f"), class = "data.frame", row.names = c(NA, -16L))
My approaches to reshaping the data (is there a faster way?):
#step 1: method 1 (in this case binding and stacking uses less code than reshape)
dat2 <- data.frame(rbind(dat[, 1:4], dat[, 1:4]),
                   sex = rep(c('m', 'f'), each = 16),
                   n = c(dat$m, dat$f))
dat2
#step 1: method 2
dat3 <- reshape(dat, direction = "long", idvar = 1:4,
                varying = list(c("m", "f")),
                v.names = c("n"),
                timevar = "sex",
                times = c("m", "f"))
rownames(dat3) <- 1:nrow(dat3)
dat3 <- data.frame(dat3)
dat3$sex <- as.factor(dat3$sex)
all.equal(dat3, dat2) #just to show both method 1 and 2 give the same data frame
#step 2
dat4 <- dat2[rep(seq_len(nrow(dat2)), dat2$n), 1:5]
rownames(dat4) <- 1:nrow(dat4)
dat4
I assume this is a common problem, since taking a table from an article and reproducing it requires some unpacking. I find myself doing this more and more and want to make sure I'm being efficient.
Here is a one-liner with plyr.
library(plyr)
dat2 <- ddply(dat, 1:4, summarize, sex = c(rep('m', m), rep('f', f)))
And here's a base R one-liner.
dat2 <- cbind(dat[c(rep(1:nrow(dat), dat$m), rep(1:nrow(dat), dat$f)), 1:4],
              sex = c(rep("m", sum(dat$m)), rep("f", sum(dat$f))))
Or, a little more generally:
d1 <- dat[,1:4]
d2 <- as.matrix(dat[,5:6])
dat2 <- cbind(d1[rep(rep(1:nrow(dat), ncol(d2)), d2), ],
              sex = rep(colnames(d2), colSums(d2)))
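Whichever expansion you use, a quick sanity check is to re-tabulate the long data and compare it against the original counts (a sketch using the dat2 built above):
# should reproduce the m/f counts of the original table
xtabs(~ i1 + i2 + i3 + i4 + sex, data = dat2)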
Given that nobody has posted a data.table solution (as suggested in the original question), here is one:
library(data.table)
DT <- as.data.table(dat)
DT[,list(sex = rep(c('m','f'),c(m,f))), by= list(i1,i2,i3,i4)]
Or, even more succinctly
DT[,list(sex = rep(c('m','f'),c(m,f))), by= 'i1,i2,i3,i4']
I would use melt for the first step and ddply for the second.
library(reshape2)
library(plyr)
d <- ddply(
  melt(dat, id.vars = c("i1", "i2", "i3", "i4"), variable.name = "sex"),
  c("i1", "i2", "i3", "i4", "sex"),
  summarize,
  id = rep(1, value)
)
d$id <- cumsum(d$id)