Calculate value with two columns based on four conditions in R - r

I have a large dataset uploaded in r (see below for a short version): I want to calculate a value for each Cruiseid, Samplenr, Species and Age (so based on four conditions):
Cruiseid Samplenr Species Age Length LK TNumStat TNumLK
197502 37 154 0 12,5 2 2,791666667 5,583333
197502 37 154 0 17,5 3 2,166666667 6,5
197502 37 154 2 172,5 34 11,54166667 392,4167
197502 37 154 2 177,5 35 12,0625 422,1875
197502 37 154 2 182,5 36 2,083333333 75
197502 35 154 0 112,5 22 11,85654008 260,8439
197502 35 154 2 197,5 39 2,109704641 82,27848
197502 35 154 2 217,5 43 2,109704641 90,7173
197502 35 154 2 232,5 46 2,109704641 97,04641
197502 36 154 0 12,5 2 4,685314685 9,370629
197502 36 154 2 182,5 36 3,496503497 125,8741
197502 41 154 0 17,5 3 2,260869565 6,782609
197502 41 154 2 202,5 40 4,347826087 173,913
197502 41 154 2 212,5 42 2,173913043 91,30435
197502 41 154 2 242,5 48 2,173913043 104,3478
197503 56 154 0 17,5 3 7,428571429 22,28571
197503 56 154 0 147,5 29 10,30952381 298,9762
197503 56 154 2 172,5 34 13,19047619 448,4762
197503 56 154 2 187,5 37 2,380952381 88,09524
197503 54 154 0 12,5 2 3,35 6,7
197503 54 154 0 157,5 31 12 372
197503 54 154 0 167,5 33 13,25 437,25
197503 54 154 2 172,5 34 13,85 470,9
197503 54 154 2 187,5 37 2,5 92,5
197503 54 154 2 217,5 43 2,5 107,5
197503 53 154 0 12,5 2 2,875536481 5,751073
197503 53 154 0 97,5 19 4,806866953 91,33047
197503 53 154 0 107,5 21 5,622317597 118,0687
197503 53 154 0 142,5 28 8,776824034 245,7511
I want to calculate: ((TNumStat$TNumLK/TNumStat$TNumStat)*0.5+0.25)*10 for each Cruiseid, Samplenr, Species and Age.
I have already tried something in a loop construction:
#######################
# Distinct cruise ids and sample numbers found in the data; the loops below
# iterate over every cruise/sample combination.
Cruise <- unique(TNumStat$Cruiseid)
Track <- unique(TNumStat$Samplenr)
#######################
# Result vector with one slot per row of TNumStat, pre-filled with NA.
AvrLengthCr <- c()
AvrLengthCr <- rep(NA, length(TNumStat$Species))
#######################
for(j in 1:length(Cruise)){
# Row positions in TNumStat that belong to the current cruise.
t1.ss <- which(TNumStat$Cruiseid == Cruise[j])
###
for(i in 1:length(Track)){
# Positions *within t1.ss* whose sample number matches the current track.
t2.ss <- which(TNumStat$Samplenr[t1.ss] == Track[i])
###
# Row-wise formula for the selected rows. Only Cruiseid and Samplenr are used
# to select rows here; Species and Age do not enter the subsetting.
AvrLengthCr[t1.ss][t2.ss] <- ((TNumStat$TNumLK[t1.ss][t2.ss]/TNumStat$TNumStat[t1.ss][t2.ss])*0.5+0.25)*10
}}
But it doesn't seem to work. And I've also been looking at something with dcast:
TNumStat2<-dcast(TNumStat,Cruiseid+Samplenr+Species+Age,formula = (((TNumStat$TNumLK/TNumStat$TNumStat*0.5+0.25)*10) )),na.rm=TRUE)
None of the options I have tried seem to work, and I don't know how to solve this. Can someone please help me?
Thank you

Good Morning,
the question is not totally clear in my opinion. But you could try something like (with dplyr)
# Step 1: add the per-row value as column `calculate`, then group the rows by
# the four key columns. `sample` presumably stands for the asker's data frame
# (TNumStat in the question) -- confirm before running.
sample <- group_by(
  mutate(sample, calculate = ((TNumLK/TNumStat) * 0.5 + 0.25) * 10),
  Cruiseid, Samplenr, Species, Age
)
# Step 2: collapse each group to the mean of the per-row values.
summarisedDF <- summarise(sample, avg.calculate = mean(calculate))

What strikes me is that your columns "Length", "TNumStat" and "TNumLK" use , instead of . as the decimal mark, so they are stored as text (factors in the data below) and cannot be coerced to numeric directly.
# The two measurement columns use a decimal comma. For each of them, replace
# "," with "." and parse the resulting text as numeric, writing the converted
# columns back into the data frame.
TNumStat[c("TNumStat", "TNumLK")] <- lapply(
  TNumStat[c("TNumStat", "TNumLK")],
  function(column) {
    with_dots <- gsub(",", ".", column)
    as.numeric(with_dots)
  }
)
This may depend on your system locale, so just ignore this step if it works for you.
Then, you could use by to apply your formula.
# Split TNumStat into one piece per Cruiseid/Samplenr/Species combination.
# For each piece, keep its key columns (the first three) once and attach
# `value`, computed from the ratio of the group means of TNumLK and TNumStat.
# NOTE(review): the question also asks for a split by Age; this groups by
# three keys only -- confirm whether "Age" should be added.
l <- by(
  TNumStat,
  TNumStat[c("Cruiseid", "Samplenr", "Species")],
  function(grp) {
    keys <- unique(grp[1:3])
    ratio <- mean(grp$TNumLK) / mean(grp$TNumStat)
    cbind(keys, value = (ratio * 0.5 + 0.25) * 10)
  }
)
This gives you a list that you rbind to get the result.
TNumStat.new <- do.call(rbind, l)
TNumStat.new
# Cruiseid Samplenr Species value
# 6 197502 35 154 148.46288
# 10 197502 36 154 85.14956
# 1 197502 37 154 149.61421
# 12 197502 41 154 174.24600
# 26 197503 53 154 106.86347
# 20 197503 54 154 159.17545
# 16 197503 56 154 131.26698
Data
TNumStat <- structure(list(Cruiseid = c(197502L, 197502L, 197502L, 197502L,
197502L, 197502L, 197502L, 197502L, 197502L, 197502L, 197502L,
197502L, 197502L, 197502L, 197502L, 197503L, 197503L, 197503L,
197503L, 197503L, 197503L, 197503L, 197503L, 197503L, 197503L,
197503L, 197503L, 197503L, 197503L), Samplenr = c(37L, 37L, 37L,
37L, 37L, 35L, 35L, 35L, 35L, 36L, 36L, 41L, 41L, 41L, 41L, 56L,
56L, 56L, 56L, 54L, 54L, 54L, 54L, 54L, 54L, 53L, 53L, 53L, 53L
), Species = c(154L, 154L, 154L, 154L, 154L, 154L, 154L, 154L,
154L, 154L, 154L, 154L, 154L, 154L, 154L, 154L, 154L, 154L, 154L,
154L, 154L, 154L, 154L, 154L, 154L, 154L, 154L, 154L, 154L),
Age = c(0L, 0L, 2L, 2L, 2L, 0L, 2L, 2L, 2L, 0L, 2L, 0L, 2L,
2L, 2L, 0L, 0L, 2L, 2L, 0L, 0L, 0L, 2L, 2L, 2L, 0L, 0L, 0L,
0L), Length = structure(c(3L, 8L, 9L, 10L, 11L, 2L, 13L,
16L, 17L, 3L, 11L, 8L, 14L, 15L, 18L, 8L, 5L, 9L, 12L, 3L,
6L, 7L, 9L, 12L, 16L, 3L, 19L, 1L, 4L), .Label = c("107,5",
"112,5", "12,5", "142,5", "147,5", "157,5", "167,5", "17,5",
"172,5", "177,5", "182,5", "187,5", "197,5", "202,5", "212,5",
"217,5", "232,5", "242,5", "97,5"), class = "factor"), LK = c(2L,
3L, 34L, 35L, 36L, 22L, 39L, 43L, 46L, 2L, 36L, 3L, 40L,
42L, 48L, 3L, 29L, 34L, 37L, 2L, 31L, 33L, 34L, 37L, 43L,
2L, 19L, 21L, 28L), TNumStat = structure(c(16L, 11L, 2L,
5L, 9L, 3L, 10L, 10L, 10L, 21L, 19L, 13L, 20L, 12L, 12L,
24L, 1L, 6L, 14L, 18L, 4L, 7L, 8L, 15L, 15L, 17L, 22L, 23L,
25L), .Label = c("10,30952381", "11,54166667", "11,85654008",
"12", "12,0625", "13,19047619", "13,25", "13,85", "2,083333333",
"2,109704641", "2,166666667", "2,173913043", "2,260869565",
"2,380952381", "2,5", "2,791666667", "2,875536481", "3,35",
"3,496503497", "4,347826087", "4,685314685", "4,806866953",
"5,622317597", "7,428571429", "8,776824034"), class = "factor"),
TNumLK = structure(c(16L, 18L, 11L, 12L, 21L, 8L, 22L, 25L,
29L, 24L, 4L, 20L, 5L, 26L, 1L, 6L, 9L, 14L, 23L, 19L, 10L,
13L, 15L, 28L, 2L, 17L, 27L, 3L, 7L), .Label = c("104,3478",
"107,5", "118,0687", "125,8741", "173,913", "22,28571", "245,7511",
"260,8439", "298,9762", "372", "392,4167", "422,1875", "437,25",
"448,4762", "470,9", "5,583333", "5,751073", "6,5", "6,7",
"6,782609", "75", "82,27848", "88,09524", "9,370629", "90,7173",
"91,30435", "91,33047", "92,5", "97,04641"), class = "factor")), class = "data.frame", row.names = c(NA,
-29L))

Related

How to create a cross tabulation table between two variables with the counts in R?

I have a data frame with two columns that I want to cross tabulate. The data also includes the counts for the combination. I am trying to create the cross table and include those counts within the table. I am struggling to use the counts from the dataframe into the cross table.
> df %>% arrange(d1)%>% head()
count d1 d2
1 3 1 15
2 86 1 14
3 13 1 12
4 186 1 16
5 29 1 9
6 86 1 13
> table(df$d1,df$d2)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
2 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
3 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1
Expecting [1,15] and [1,14] to show 3, 86 based on the counts in df table.
Right now it shows 0s and 1s only based on if the combinations exists.
Here is my sample data:
structure(list(count = c(37L, 6L, 44L, 21L, 8L, 3L, 9L, 17L,
13L, 32L, 106L, 34L, 505L, 173L, 12L, 2L, 4L, 45L, 3L, 43L, 5L,
16L, 1L, 27L, 17L, 3L, 4L, 1L, 27L, 86L, 79L, 10L, 161L, 32L,
3L, 209L, 9L, 83L, 23L, 108L, 161L, 22L, 4L, 16L, 2L, 6L, 67L,
86L, 3L, 1L, 14L, 14L, 111L, 5L, 5L, 44L, 105L, 13L, 269L, 186L,
3L, 5L, 5L, 27L, 3L, 186L, 58L, 29L, 34L, 43L, 8L, 92L, 9L, 455L,
22L, 32L, 4L, 14L, 58L, 22L, 190L, 94L, 27L, 152L, 264L, 36L,
1L, 505L, 86L, 44L, 3L, 1L, 79L, 75L, 12L, 32L, 11L, 197L, 90L,
269L, 9L, 6L, 47L, 14L, 158L, 303L, 335L, 37L, 33L, 3L, 83L,
15L, 31L, 124L, 146L, 26L, 36L, 27L, 37L, 31L, 108L, 121L, 111L,
11L, 5L, 26L, 166L, 11L, 18L, 11L, 8L, 15L, 18L, 165L, 80L, 14L,
5L, 3L, 492L, 7L, 90L, 146L, 130L, 197L, 165L, 34L, 22L, 122L,
29L, 74L, 455L, 303L, 45L, 5L, 173L, 33L, 24L, 229L, 79L, 43L,
68L, 16L, 10L, 73L, 35L, 99L, 229L, 94L, 23L, 492L, 18L, 84L,
92L, 86L, 35L, 31L, 1L, 23L, 8L, 121L, 1L, 173L, 400L, 124L,
20L, 11L, 6L, 3L, 166L, 84L, 31L, 122L, 15L, 24L, 70L, 43L, 74L,
209L, 45L, 158L, 44L, 15L, 37L, 35L, 27L, 68L, 20L, 15L, 11L,
21L, 4L, 18L, 44L, 234L, 80L, 10L, 44L, 4L, 47L, 7L, 67L, 10L,
3L, 173L, 99L, 79L, 130L, 3L, 75L, 1L, 335L, 14L, 106L, 15L,
34L, 190L, 152L, 16L, 73L, 45L, 1L, 3L, 264L, 160L, 23L, 1L,
160L, 400L, 105L, 234L, 70L, 35L), d1 = c(10L, 17L, 5L, 3L, 12L,
1L, 10L, 10L, 12L, 7L, 14L, 6L, 16L, 3L, 7L, 9L, 7L, 13L, 4L,
8L, 9L, 2L, 7L, 16L, 8L, 15L, 12L, 12L, 2L, 1L, 16L, 15L, 14L,
5L, 8L, 14L, 11L, 11L, 4L, 4L, 13L, 7L, 12L, 11L, 17L, 8L, 4L,
13L, 15L, 15L, 12L, 13L, 4L, 5L, 5L, 5L, 2L, 1L, 2L, 1L, 2L,
13L, 12L, 5L, 3L, 16L, 10L, 1L, 14L, 2L, 7L, 9L, 15L, 16L, 3L,
11L, 8L, 12L, 9L, 9L, 14L, 11L, 8L, 11L, 16L, 10L, 17L, 6L, 1L,
3L, 5L, 1L, 3L, 11L, 10L, 14L, 5L, 3L, 6L, 16L, 15L, 15L, 4L,
14L, 14L, 16L, 16L, 8L, 3L, 7L, 1L, 15L, 6L, 11L, 6L, 5L, 1L,
15L, 2L, 7L, 14L, 2L, 13L, 10L, 6L, 1L, 3L, 15L, 2L, 3L, 9L,
7L, 11L, 3L, 10L, 16L, 17L, 7L, 3L, 15L, 1L, 2L, 10L, 13L, 4L,
5L, 8L, 4L, 9L, 16L, 13L, 4L, 10L, 17L, 6L, 8L, 7L, 11L, 8L,
9L, 16L, 7L, 14L, 9L, 4L, 3L, 13L, 4L, 8L, 16L, 8L, 6L, 14L,
14L, 9L, 13L, 17L, 12L, 10L, 1L, 17L, 11L, 16L, 2L, 1L, 7L, 14L,
12L, 2L, 9L, 8L, 6L, 4L, 13L, 9L, 6L, 5L, 6L, 12L, 11L, 4L, 2L,
14L, 12L, 11L, 7L, 8L, 6L, 1L, 12L, 9L, 12L, 5L, 3L, 6L, 15L,
13L, 8L, 10L, 4L, 1L, 13L, 17L, 13L, 1L, 10L, 14L, 17L, 9L, 2L,
10L, 17L, 2L, 12L, 5L, 3L, 6L, 7L, 3L, 16L, 15L, 5L, 9L, 2L,
6L, 5L, 13L, 11L, 4L, 6L, 13L, 4L), d2 = c(2L, 14L, 4L, 12L,
10L, 15L, 15L, 8L, 1L, 14L, 2L, 5L, 6L, 11L, 10L, 17L, 8L, 10L,
17L, 6L, 5L, 7L, 15L, 15L, 10L, 1L, 9L, 17L, 5L, 14L, 8L, 14L,
13L, 11L, 5L, 6L, 15L, 1L, 8L, 14L, 14L, 3L, 8L, 7L, 9L, 15L,
1L, 1L, 2L, 5L, 13L, 12L, 13L, 12L, 9L, 3L, 4L, 12L, 16L, 16L,
15L, 17L, 5L, 2L, 17L, 1L, 9L, 9L, 5L, 9L, 9L, 14L, 11L, 13L,
7L, 5L, 12L, 14L, 10L, 8L, 3L, 4L, 11L, 6L, 9L, 1L, 1L, 16L,
13L, 5L, 8L, 17L, 10L, 9L, 7L, 7L, 10L, 13L, 1L, 2L, 10L, 8L,
10L, 12L, 11L, 4L, 10L, 14L, 8L, 12L, 11L, 6L, 7L, 2L, 2L, 1L,
10L, 16L, 10L, 6L, 4L, 1L, 4L, 5L, 17L, 5L, 2L, 3L, 8L, 15L,
7L, 4L, 12L, 4L, 6L, 17L, 6L, 5L, 16L, 4L, 6L, 6L, 14L, 3L, 3L,
14L, 9L, 6L, 1L, 5L, 16L, 16L, 13L, 13L, 13L, 3L, 13L, 13L, 16L,
2L, 7L, 2L, 15L, 3L, 12L, 1L, 11L, 11L, 4L, 3L, 2L, 9L, 9L, 1L,
4L, 8L, 12L, 6L, 12L, 2L, 2L, 3L, 11L, 11L, 8L, 1L, 17L, 7L,
3L, 6L, 13L, 4L, 7L, 7L, 13L, 8L, 16L, 14L, 16L, 14L, 5L, 12L,
8L, 4L, 8L, 16L, 1L, 15L, 7L, 3L, 12L, 11L, 13L, 6L, 10L, 13L,
5L, 7L, 4L, 15L, 4L, 15L, 4L, 6L, 3L, 3L, 10L, 3L, 11L, 17L,
16L, 16L, 14L, 2L, 6L, 14L, 11L, 11L, 9L, 12L, 7L, 7L, 16L, 13L,
12L, 15L, 2L, 16L, 2L, 3L, 9L, 9L)), row.names = c(NA, 252L), class = "data.frame")
xtabs may be useful here
> xtabs(count ~ d1 + d2, df)
d2
d1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
1 0 121 99 67 26 90 11 20 29 36 83 13 86 86 3 186 1
2 121 0 166 105 27 146 16 18 43 37 124 15 160 106 3 269 1
3 99 166 0 165 44 234 22 33 73 79 173 21 197 190 11 492 3
4 67 105 165 0 44 122 15 23 35 47 94 35 111 108 7 303 3
5 26 27 44 44 0 34 3 3 5 11 32 5 44 34 1 74 0
6 90 146 234 122 34 0 31 43 84 80 152 23 173 209 15 505 5
7 11 16 22 15 3 31 0 4 8 12 16 3 24 32 1 68 0
8 20 18 33 23 3 43 4 0 22 17 27 4 31 37 6 79 0
9 29 43 73 35 5 84 8 22 0 58 75 4 70 92 0 264 2
10 36 37 79 47 11 80 12 17 58 0 0 8 45 130 9 335 0
11 83 124 173 94 32 152 16 27 75 0 0 18 229 158 9 400 0
12 13 15 21 35 5 23 3 4 4 8 18 0 14 14 0 45 1
13 86 160 197 111 44 173 24 31 70 45 229 14 0 161 10 455 5
14 86 106 190 108 34 209 32 37 92 130 158 14 161 0 10 0 6
15 3 3 11 7 1 15 1 6 0 9 9 0 10 10 0 27 0
16 186 269 492 303 74 505 68 79 264 335 400 45 455 0 27 0 14
17 1 1 3 3 0 5 0 0 2 0 0 1 5 6 0 14 0
Convert to data.frame if required
as.data.frame.matrix(xtabs(count ~ d1 + d2, df))
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
1 0 121 99 67 26 90 11 20 29 36 83 13 86 86 3 186 1
2 121 0 166 105 27 146 16 18 43 37 124 15 160 106 3 269 1
3 99 166 0 165 44 234 22 33 73 79 173 21 197 190 11 492 3
4 67 105 165 0 44 122 15 23 35 47 94 35 111 108 7 303 3
5 26 27 44 44 0 34 3 3 5 11 32 5 44 34 1 74 0
6 90 146 234 122 34 0 31 43 84 80 152 23 173 209 15 505 5
7 11 16 22 15 3 31 0 4 8 12 16 3 24 32 1 68 0
8 20 18 33 23 3 43 4 0 22 17 27 4 31 37 6 79 0
9 29 43 73 35 5 84 8 22 0 58 75 4 70 92 0 264 2
10 36 37 79 47 11 80 12 17 58 0 0 8 45 130 9 335 0
11 83 124 173 94 32 152 16 27 75 0 0 18 229 158 9 400 0
12 13 15 21 35 5 23 3 4 4 8 18 0 14 14 0 45 1
13 86 160 197 111 44 173 24 31 70 45 229 14 0 161 10 455 5
14 86 106 190 108 34 209 32 37 92 130 158 14 161 0 10 0 6
15 3 3 11 7 1 15 1 6 0 9 9 0 10 10 0 27 0
16 186 269 492 303 74 505 68 79 264 335 400 45 455 0 27 0 14
17 1 1 3 3 0 5 0 0 2 0 0 1 5 6 0 14 0
Or may use dcast
library(data.table)
dcast(df, d1 ~ d2, value.var = 'count')
Key: <d1>
d1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
<int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
1: 1 NA 121 99 67 26 90 11 20 29 36 83 13 86 86 3 186 1
2: 2 121 NA 166 105 27 146 16 18 43 37 124 15 160 106 3 269 1
3: 3 99 166 NA 165 44 234 22 33 73 79 173 21 197 190 11 492 3
4: 4 67 105 165 NA 44 122 15 23 35 47 94 35 111 108 7 303 3
5: 5 26 27 44 44 NA 34 3 3 5 11 32 5 44 34 1 74 NA
6: 6 90 146 234 122 34 NA 31 43 84 80 152 23 173 209 15 505 5
7: 7 11 16 22 15 3 31 NA 4 8 12 16 3 24 32 1 68 NA
8: 8 20 18 33 23 3 43 4 NA 22 17 27 4 31 37 6 79 NA
9: 9 29 43 73 35 5 84 8 22 NA 58 75 4 70 92 NA 264 2
10: 10 36 37 79 47 11 80 12 17 58 NA NA 8 45 130 9 335 NA
11: 11 83 124 173 94 32 152 16 27 75 NA NA 18 229 158 9 400 NA
12: 12 13 15 21 35 5 23 3 4 4 8 18 NA 14 14 NA 45 1
13: 13 86 160 197 111 44 173 24 31 70 45 229 14 NA 161 10 455 5
14: 14 86 106 190 108 34 209 32 37 92 130 158 14 161 NA 10 NA 6
15: 15 3 3 11 7 1 15 1 6 NA 9 9 NA 10 10 NA 27 NA
16: 16 186 269 492 303 74 505 68 79 264 335 400 45 455 NA 27 NA 14
17: 17 1 1 3 3 NA 5 NA NA 2 NA NA 1 5 6 NA 14 NA
If you are looking for a more publishable solution, you might want to try crosstable::crosstable() as it would let you output a nice HTML table.
This would require a few parameters though, as it is not meant for crossing long vectors of numbers in the first place.
Here is the code:
library(dplyr)
library(crosstable)
ct = df %>%
crosstable(d1, by=d2, percent_pattern="{n}", unique_numeric=Inf)
as_flextable(ct)

How to convert a multidimensional contingency table to tidy data?

I have a raw multidimensional contingency table that I want to convert to tidy data or other long form so that I can fit a logistic regression on it. I have found great methods for portions of it. But I'd like a strategy for dealing with whole thing iteratively.
Here's a half of it formatted:
White
<35 35-44 >44
Region M F M F M F
Northeast
Satisfied 288 60 224 35 337 70
Not satisfied 177 57 166 19 172 30
Mid-Atlantic
Satisfied 90 19 96 12 124 17
Not satisfied 45 12 42 5 39 2
Southern
Satisfied 226 88 189 44 156 70
Not satisfied 128 57 117 34 73 25
Here's the full raw data, stripped of its headers:
> dput(df_raw)
structure(list(V1 = c(288L, 177L, 90L, 45L, 226L, 128L), V2 = c(60L,
57L, 19L, 12L, 88L, 57L), V3 = c(224L, 166L, 96L, 42L, 189L,
117L), V4 = c(35L, 19L, 12L, 5L, 44L, 34L), V5 = c(337L, 172L,
124L, 39L, 156L, 73L), V6 = c(70L, 30L, 17L, 2L, 70L, 25L), V7 = c(38L,
33L, 18L, 6L, 45L, 31L), V8 = c(19L, 35L, 13L, 7L, 47L, 35L),
V9 = c(32L, 11L, 7L, 2L, 18L, 3L), V10 = c(22L, 20L, 0L,
3L, 13L, 7L), V11 = c(21L, 8L, 9L, 2L, 11L, 2L), V12 = c(15L,
10L, 1L, 1L, 9L, 2L)), class = "data.frame", row.names = c(NA,
-6L))
Here's how I can take care of one section:
# One 2x2 section of the contingency table (Northeast, <35): rows are the
# satisfaction level, columns the sex, cells the observed counts.
ne35 <- data.frame(c(288, 177), c(60, 57))
colnames(ne35) <- c("Male", "Female")
rownames(ne35) <- c("Sat", "Unsat")
# Expand the counts into one row per observation (long form).
ne35 %>%
  rownames_to_column() %>% # set row names as a variable
  gather(rowname2,value,-rowname) %>% # reshape
  rowwise() %>% # for every row
  # seq_len() rather than 1:value: a cell count of 0 must expand to zero rows,
  # whereas 1:0 is c(1, 0) and would create two spurious observations.
  mutate(value = list(seq_len(value))) %>% # numbers based on the value
  unnest(value) %>% # unnest the counter
  select(-value) # remove the counts
# A tibble: 582 x 2
rowname rowname2
<chr> <chr>
1 Sat Male
2 Sat Male
3 Sat Male
4 Sat Male
5 Sat Male
6 Sat Male
7 Sat Male
8 Sat Male
9 Sat Male
10 Sat Male
# … with 572 more rows
I am stumped, however, on how to apply this to a few tiers of categorical variables.

Multiple imputation separated by group

In my data example
data=structure(list(groupvar = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L,
2L, 1L), v1 = c(27L, 52L, 92L, 86L, NA, 19L, 94L, NA, 26L, 94L,
NA, 58L, 96L, 74L, 8L, 66L, 65L, 41L, 70L, 21L, 64L, 40L, 17L,
7L, NA, 14L, 63L), v2 = c(59L, 91L, 45L, 40L, 56L, 17L, 72L,
78L, 19L, 62L, 87L, NA, 79L, 62L, 40L, 67L, 93L, 1L, 64L, 22L,
NA, 98L, 44L, 85L, 67L, 88L, 92L), v3 = c(97L, 15L, 27L, 55L,
86L, 66L, NA, 61L, 27L, 47L, 93L, 68L, 72L, 4L, 35L, 69L, 65L,
NA, 83L, 60L, 42L, NA, 90L, 81L, NA, 27L, 60L)), .Names = c("groupvar",
"v1", "v2", "v3"), class = "data.frame", row.names = c(NA, -27L
))
There is a grouping variable, groupvar (group 1 and group 2). I have many variables, but only three are shown here.
There are many missing values in these variables.
How can I perform multiple imputation for each variable (the variables can be numeric, integer and so on), but for each group separately, using MICE?
Edit
A simple imp <- mice(data) does not give the needed output, because I need the imputation done by group.
I want the result to look like this:
groupvar v1 v2 v3
1 27 59 97
1 52 91 15
1 92 45 27
1 86 40 55
1 *64* 56 86
2 7 85 81
2 58*61,8* 68
2 64 *61,8* 42
The values between asterisks (*...*) are examples of imputed values.
Group 'groupvar' as a factor.
data <- structure(list(groupvar = as.factor(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L,
2L, 1L)),
v1 = c(27L, 52L, 92L, 86L, NA, 19L, 94L, NA, 26L, 94L,
NA, 58L, 96L, 74L, 8L, 66L, 65L, 41L, 70L, 21L, 64L, 40L, 17L,
7L, NA, 14L, 63L),
v2 = c(59L, 91L, 45L, 40L, 56L, 17L, 72L,
78L, 19L, 62L, 87L, NA, 79L, 62L, 40L, 67L, 93L, 1L, 64L, 22L,
NA, 98L, 44L, 85L, 67L, 88L, 92L),
v3 = c(97L, 15L, 27L, 55L,
86L, 66L, NA, 61L, 27L, 47L, 93L, 68L, 72L, 4L, 35L, 69L, 65L,
NA, 83L, 60L, 42L, NA, 90L, 81L, NA, 27L, 60L)),
.Names = c("groupvar",
"v1", "v2", "v3"), class = "data.frame", row.names = c(NA, -27L
))
Then use the mice package assuming the mice package is properly installed.
library(mice)
imp <- mice(data)
complete(imp)
groupvar v1 v2 v3
1 1 27 59 97
2 1 52 91 15
3 1 92 45 27
4 1 86 40 55
5 1 21 56 86
6 1 19 17 66
7 1 94 72 4
8 1 66 78 61
9 1 26 19 27
10 2 94 62 47
11 2 8 87 93
12 2 58 72 68
13 2 96 79 72
14 2 74 62 4
15 2 8 40 35
16 2 66 67 69
17 2 65 93 65
18 2 41 1 47
19 2 70 64 83
20 2 21 22 60
21 2 64 62 42
22 1 40 98 27
23 1 17 44 90
24 2 7 85 81
25 1 63 67 55
26 2 14 88 27
27 1 63 92 60

add new column in a certain order [duplicate]

This question already has answers here:
Add (insert) a column between two columns in a data.frame
(18 answers)
Closed 4 years ago.
Suppose i have dataset
df=structure(list(SaleCount = c(7L, 35L, 340L, 260L, 3L, 31L, 420L,
380L, 45L, 135L, 852L, 1L, 34L, 360L, 140L, 14L, 62L, 501L, 560L,
0L, 640L, 0L, 0L, 16L, 0L), DocumentNum = c(36L, 4L, 41L, 41L,
36L, 4L, 41L, 41L, 33L, 33L, 33L, 36L, 4L, 41L, 41L, 33L, 33L,
33L, 62L, 63L, 62L, 63L, 36L, 4L, 41L)), .Names = c("SaleCount",
"DocumentNum"), class = "data.frame", row.names = c(NA, -25L))
I need to create a new column, but it must be the second column.
If i do so:
df["MY_NEW_COLUMN"] <- NA .
The new column is the third one.
How can I create it so that it is the second column?
I.E. i expect output
SaleCount newcolumn DocumentNum
1 7 NA 36
2 35 NA 4
3 340 NA 41
4 260 NA 41
5 3 NA 36
6 31 NA 4
7 420 NA 41
8 380 NA 41
9 45 NA 33
10 135 NA 33
11 852 NA 33
12 1 NA 36
13 34 NA 4
14 360 NA 41
15 140 NA 41
16 14 NA 33
17 62 NA 33
18 501 NA 33
19 560 NA 62
20 0 NA 63
21 640 NA 62
22 0 NA 63
23 0 NA 36
24 16 NA 4
25 0 NA 41
Of course sometimes I need to create a fourth column by order and so on.
You can use the dplyr library and the select function.
library(dplyr)
df=structure(list(SaleCount = c(7L, 35L, 340L, 260L, 3L, 31L, 420L,
380L, 45L, 135L, 852L, 1L, 34L, 360L, 140L, 14L, 62L, 501L, 560L,
0L, 640L, 0L, 0L, 16L, 0L), DocumentNum = c(36L, 4L, 41L, 41L,
36L, 4L, 41L, 41L, 33L, 33L, 33L, 36L, 4L, 41L, 41L, 33L, 33L,
33L, 62L, 63L, 62L, 63L, 36L, 4L, 41L)), .Names = c("SaleCount",
"DocumentNum"), class = "data.frame", row.names = c(NA, -25L))
df["MY_NEW_COLUMN"] <- NA
select(df,SaleCount, MY_NEW_COLUMN, DocumentNum)

R- Create function that selects entire row in data frame by column name

This is a question for an R Programming class, but I have been working on it for several hours, over a period of a few days. I have done internet searches and referenced three different books. I have tried very hard to solve it on my own. I am finally asking for help.
I was given a csv, which I read into the program. This is the resulting dataframe, named df:
name hw0 hw1 hw2 hw3 hw4 hw5 hw6 quiz1 quiz2 quiz3 quiz4 quiz5 quiz6 term1
1 20 14 30 100 50 60 36 12 15 30 15 25 25 100
2 A 20 13 30 100 50 60 30 11 15 0 14 25 25 100
3 B 20 14 30 100 50 60 36 8 11 24 8 13 9 95
4 C 20 14 28 100 50 60 36 12 4 25 13 24 14 95
5 D 20 12 30 100 50 0 33 7 15 26 12 22 0 100
6 E 20 14 30 90 30 0 0 10 15 30 15 21 15 100
7 F 20 13 30 100 48 0 36 12 15 30 15 25 23 95
8 G 20 14 26 85 40 42 33 11 15 23 11 17 16 90
9 H 20 0 0 85 50 0 0 0 15 0 0 15 10 85
10 I 20 14 15 0 10 48 30 11 0 27 11 14 16 60
11 J 20 14 29 80 35 0 36 11 13 24 12 14 0 70
12 K 20 14 29 97 50 60 36 4 7 19 11 20 15 100
13 L 20 14 30 100 45 0 36 10 6 26 8 16 7 80
14 M 20 14 30 100 50 60 36 7 15 28 14 25 25 100
15 N 20 11 0 95 20 0 0 8 14 26 7 9 0 95
16 O 20 12 28 97 0 40 0 11 10 27 11 15 15 70
17 P 20 13 0 90 45 0 20 4 13 30 10 20 17 90
18 Q 20 14 30 100 45 0 36 0 12 21 11 14 17 75
term2 term3 exam1 exam2 exam3 final
1 100 100 100 100 95 100
2 100 100 97 97 80 97
3 100 100 83 85 73 73
4 100 100 88 75 56 77
5 100 0 90 87 72 81
6 100 80 92 82 69 79
7 100 100 90 95 87 90
8 100 0 89 79 81 78
9 90 100 62 83 42 75
10 90 72 78 78 66 81
11 0 0 79 77 51 78
12 100 100 79 77 57 81
13 0 100 68 74 76 76
14 100 100 99 98 82 99
15 0 0 70 70 52 61
16 0 0 63 66 0 0
17 100 100 75 72 56 64
18 90 75 72 84 54 63
QUESTION:
checkStudent <- function(df, studentName);
This function extracts a particular student's grades data from a data frame and returns them.
REQUIRED FORMAT:
checkStudent <- function(df, studentName)
{
}
TIPS PROVIDED:
inputs:
df -- a data frame that contains all the grades data
studentName -- name of a student
return:
all the grades for the student whose name is given as studentName
purpose:
extracting a particular student's grades data from a data frame and returning them
PROJECT TESTER- line of code and expected results:
checkStudent(df,"A")
name hw0 hw1 hw2 hw3 hw4 hw5 hw6 quiz1 quiz2 quiz3 quiz4 quiz5
2 A 20 13 30 100 50 60 30 11 15 0 14 25
quiz6 term1 term2 term3 exam1 exam2 exam3 final
2 25 100 100 100 97 97 80 97
I feel like I have been given everything and still can't get it right. I have tried:
checkStudent <- function(df, studentName)
{
name <- studentName
df["name", ]
}
and
checkStudent <- function(df, studentName)
{
subset(df, "name" == studentName, 1:21)
}
and numerous other lines of code, too many to list.
Please help. I am truly stuck.
Again, this needs to be done strictly in R. If it matters, I'm using RStudio. Thank you so much.
You're really close.
Variables in R should never be encapsulated in quotes, but always are free standing. Additionally your code is just printing the row, it is not returning it.
Here's a slightly modified version of your first attempt, without the quotes.
checkStudent <- function(df, studentName)
{
name <- studentName
return(df[name, ])
}
Edit: Oops, I realized your rows aren't named as the students.
You'll need to make it more like this:
# Look up a student's record by name.
#
# df          -- data frame of grades with a `name` column
# studentName -- the name to search for
# Returns the row(s) of `df` whose `name` equals `studentName`, with all
# columns; zero rows if the name does not occur.
checkStudent <- function(df, studentName)
{
  is_student <- df$name == studentName
  # which() keeps only the positions that compared TRUE.
  df[which(is_student), ]
}
Try with logical subsetting:
# Return the row(s) of data frame `x` whose `name` column equals `y`.
#
# x -- data frame with a `name` column
# y -- the name to look for
# The comparison is wrapped in which() so that rows with a missing name are
# dropped; indexing with the raw logical result would turn every NA
# comparison into an extra all-NA row in the output.
checkStudent <- function(x, y) {
  x[which(x["name"] == y), ]
}
Test:
checkStudent(df,"A")
# name hw0 hw1 hw2 hw3 hw4 hw5 hw6 quiz1 quiz2 quiz3 quiz4 quiz5 quiz6 term1 term2 term3 exam1 exam2 exam3 final
#1 A 20 13 30 100 50 60 30 11 15 0 14 25 25 100 100 100 97 97 80 97
data:
df <- structure(list(name = structure(1:17, .Label = c("A", "B", "C",
"D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P",
"Q"), class = "factor"), hw0 = c(20L, 20L, 20L, 20L, 20L, 20L,
20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L, 20L), hw1 = c(13L,
14L, 14L, 12L, 14L, 13L, 14L, 0L, 14L, 14L, 14L, 14L, 14L, 11L,
12L, 13L, 14L), hw2 = c(30L, 30L, 28L, 30L, 30L, 30L, 26L, 0L,
15L, 29L, 29L, 30L, 30L, 0L, 28L, 0L, 30L), hw3 = c(100L, 100L,
100L, 100L, 90L, 100L, 85L, 85L, 0L, 80L, 97L, 100L, 100L, 95L,
97L, 90L, 100L), hw4 = c(50L, 50L, 50L, 50L, 30L, 48L, 40L, 50L,
10L, 35L, 50L, 45L, 50L, 20L, 0L, 45L, 45L), hw5 = c(60L, 60L,
60L, 0L, 0L, 0L, 42L, 0L, 48L, 0L, 60L, 0L, 60L, 0L, 40L, 0L,
0L), hw6 = c(30L, 36L, 36L, 33L, 0L, 36L, 33L, 0L, 30L, 36L,
36L, 36L, 36L, 0L, 0L, 20L, 36L), quiz1 = c(11L, 8L, 12L, 7L,
10L, 12L, 11L, 0L, 11L, 11L, 4L, 10L, 7L, 8L, 11L, 4L, 0L), quiz2 = c(15L,
11L, 4L, 15L, 15L, 15L, 15L, 15L, 0L, 13L, 7L, 6L, 15L, 14L,
10L, 13L, 12L), quiz3 = c(0L, 24L, 25L, 26L, 30L, 30L, 23L, 0L,
27L, 24L, 19L, 26L, 28L, 26L, 27L, 30L, 21L), quiz4 = c(14L,
8L, 13L, 12L, 15L, 15L, 11L, 0L, 11L, 12L, 11L, 8L, 14L, 7L,
11L, 10L, 11L), quiz5 = c(25L, 13L, 24L, 22L, 21L, 25L, 17L,
15L, 14L, 14L, 20L, 16L, 25L, 9L, 15L, 20L, 14L), quiz6 = c(25L,
9L, 14L, 0L, 15L, 23L, 16L, 10L, 16L, 0L, 15L, 7L, 25L, 0L, 15L,
17L, 17L), term1 = c(100L, 95L, 95L, 100L, 100L, 95L, 90L, 85L,
60L, 70L, 100L, 80L, 100L, 95L, 70L, 90L, 75L), term2 = c(100L,
100L, 100L, 100L, 100L, 100L, 100L, 90L, 90L, 0L, 100L, 0L, 100L,
0L, 0L, 100L, 90L), term3 = c(100L, 100L, 100L, 0L, 80L, 100L,
0L, 100L, 72L, 0L, 100L, 100L, 100L, 0L, 0L, 100L, 75L), exam1 = c(97L,
83L, 88L, 90L, 92L, 90L, 89L, 62L, 78L, 79L, 79L, 68L, 99L, 70L,
63L, 75L, 72L), exam2 = c(97L, 85L, 75L, 87L, 82L, 95L, 79L,
83L, 78L, 77L, 77L, 74L, 98L, 70L, 66L, 72L, 84L), exam3 = c(80L,
73L, 56L, 72L, 69L, 87L, 81L, 42L, 66L, 51L, 57L, 76L, 82L, 52L,
0L, 56L, 54L), final = c(97L, 73L, 77L, 81L, 79L, 90L, 78L, 75L,
81L, 78L, 81L, 76L, 99L, 61L, 0L, 64L, 63L)), .Names = c("name",
"hw0", "hw1", "hw2", "hw3", "hw4", "hw5", "hw6", "quiz1", "quiz2",
"quiz3", "quiz4", "quiz5", "quiz6", "term1", "term2", "term3",
"exam1", "exam2", "exam3", "final"), row.names = c(NA, -17L), class = "data.frame")

Resources