Multiple imputation separated by group - r

In my data example
data=structure(list(groupvar = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L,
2L, 1L), v1 = c(27L, 52L, 92L, 86L, NA, 19L, 94L, NA, 26L, 94L,
NA, 58L, 96L, 74L, 8L, 66L, 65L, 41L, 70L, 21L, 64L, 40L, 17L,
7L, NA, 14L, 63L), v2 = c(59L, 91L, 45L, 40L, 56L, 17L, 72L,
78L, 19L, 62L, 87L, NA, 79L, 62L, 40L, 67L, 93L, 1L, 64L, 22L,
NA, 98L, 44L, 85L, 67L, 88L, 92L), v3 = c(97L, 15L, 27L, 55L,
86L, 66L, NA, 61L, 27L, 47L, 93L, 68L, 72L, 4L, 35L, 69L, 65L,
NA, 83L, 60L, 42L, NA, 90L, 81L, NA, 27L, 60L)), .Names = c("groupvar",
"v1", "v2", "v3"), class = "data.frame", row.names = c(NA, -27L
))
There is a grouping variable, groupvar, with two groups (1 and 2). I have many variables, but only three are shown here.
There are many missing values in these variables.
How can I perform multiple imputation for each variable (the variables can be numeric, integer, and so on), but separately for each group, using MICE?
Edit
A simple imp <- mice(data) does not give the output I need, because I need the imputation done by group.
I want the result to look like this:
groupvar v1 v2 v3
1 27 59 97
1 52 91 15
1 92 45 27
1 86 40 55
1 *64* 56 86
2 7 85 81
2 58 *61,8* 68
2 64 *61,8* 42
** i marked example of imputed value

Group 'groupvar' as a factor.
data <- structure(list(groupvar = as.factor(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L,
2L, 1L)),
v1 = c(27L, 52L, 92L, 86L, NA, 19L, 94L, NA, 26L, 94L,
NA, 58L, 96L, 74L, 8L, 66L, 65L, 41L, 70L, 21L, 64L, 40L, 17L,
7L, NA, 14L, 63L),
v2 = c(59L, 91L, 45L, 40L, 56L, 17L, 72L,
78L, 19L, 62L, 87L, NA, 79L, 62L, 40L, 67L, 93L, 1L, 64L, 22L,
NA, 98L, 44L, 85L, 67L, 88L, 92L),
v3 = c(97L, 15L, 27L, 55L,
86L, 66L, NA, 61L, 27L, 47L, 93L, 68L, 72L, 4L, 35L, 69L, 65L,
NA, 83L, 60L, 42L, NA, 90L, 81L, NA, 27L, 60L)),
.Names = c("groupvar",
"v1", "v2", "v3"), class = "data.frame", row.names = c(NA, -27L
))
Then use the mice package assuming the mice package is properly installed.
library(mice)
# Impute each level of groupvar on its own: split() yields one data frame per
# group and mice() is run on each, so imputed values are drawn only from rows
# of the same group rather than from the pooled data.
imp_by_group <- lapply(split(data, data$groupvar), mice, printFlag = FALSE)
# Fill in the missing values of every group and put the rows back in their
# original order. mice::complete is spelled out in full because tidyr also
# exports a function called complete().
unsplit(lapply(imp_by_group, mice::complete), data$groupvar)
groupvar v1 v2 v3
1 1 27 59 97
2 1 52 91 15
3 1 92 45 27
4 1 86 40 55
5 1 21 56 86
6 1 19 17 66
7 1 94 72 4
8 1 66 78 61
9 1 26 19 27
10 2 94 62 47
11 2 8 87 93
12 2 58 72 68
13 2 96 79 72
14 2 74 62 4
15 2 8 40 35
16 2 66 67 69
17 2 65 93 65
18 2 41 1 47
19 2 70 64 83
20 2 21 22 60
21 2 64 62 42
22 1 40 98 27
23 1 17 44 90
24 2 7 85 81
25 1 63 67 55
26 2 14 88 27
27 1 63 92 60

Related

plotting specific columns of a data frame in R

ind
set
inst_0
inst_1
inst_2
Inst_3
inst_4
inst_5
0
1
20
30
50
55
58
60
0
2
34
44
46
67
89
70
0
3
37
89
78
80
90
98
0
4
23
45
67
89
87
89
1
1
34
56
65
78
77
89
1
2
23
32
45
55
66
77
1
3
35
69
88
99
98
57
1
4
23
45
56
78
89
99
2
1
23
34
55
55
77
88
2
2
12
44
55
67
88
90
2
3
12
66
77
91
44
99
2
4
45
55
88
31
56
100
I have a data frame like the one above, and I would like to make a plot showing a trend like the one in the graph below (which was made for only 4 individuals in the same set) for combinations such as Ind0-set1, Ind1-set1, Ind2-set2, ..., Ind0-set2, Ind1-set2. My second question is how to plot multiple line graphs, one for each set, in a single figure.
I am not sure whether to use ggplot2 or whether it can also be done with the plot function.
If you want to do this using ggplot2 then the first step would be to reshape your data to long or tidy format using e.g. tidyr::pivot_longer:
library(tidyr)
library(dplyr)
library(ggplot2)
# Reshape to long: lower-case the column names first (so "Inst_3" lines up
# with the other inst_* columns), then stack every column except ind and set
# into inst/value pairs with the "inst_" prefix stripped from the names.
dat <- pivot_longer(
  rename_with(dat, tolower),
  -c(ind, set),
  names_prefix = "inst_",
  names_to = "inst",
  values_to = "value"
)
After doing so you could create a plot showing all individuals for all sets by using facetting:
# inst on the x axis, value on the y axis; rows are grouped and coloured by ind,
# drawn as lines with points, and facet_wrap(~set) gives each set its own panel.
ggplot(dat, aes(inst, value, color = factor(ind), group = ind)) +
geom_line() +
geom_point() +
facet_wrap(~set)
Or you could filter your data for your desired combinations to create a plot for e.g. just one set like so:
# Keep only the rows of set 1, then draw one coloured line (with points) per ind.
dat_filtered <- dplyr::filter(dat, set == 1)
ggplot(
  dat_filtered,
  aes(inst, value, color = factor(ind), group = ind)
) +
  geom_line() +
  geom_point()
DATA
dat <- data.frame(
ind = c(0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L),
set = c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L),
inst_0 = c(20L, 34L, 37L, 23L, 34L, 23L, 35L, 23L, 23L, 12L, 12L, 45L),
inst_1 = c(30L, 44L, 89L, 45L, 56L, 32L, 69L, 45L, 34L, 44L, 66L, 55L),
inst_2 = c(50L, 46L, 78L, 67L, 65L, 45L, 88L, 56L, 55L, 55L, 77L, 88L),
Inst_3 = c(55L, 67L, 80L, 89L, 78L, 55L, 99L, 78L, 55L, 67L, 91L, 31L),
inst_4 = c(58L, 89L, 90L, 87L, 77L, 66L, 98L, 89L, 77L, 88L, 44L, 56L),
inst_5 = c(60L, 70L, 98L, 89L, 89L, 77L, 57L, 99L, 88L, 90L, 99L, 100L)
)

Is there an R function that will remove the last value of each row?

I have a data set that contains the steps a consumer took on their way to make a purchase and a value showing how much each step was worth. I want to get rid of the last value in each row to look at how much the non-last steps helped. I need help replacing or subsetting out those values.
This data is of varying lengths and has many different values:
My data looks somewhat like this.
df <- data.frame(
weight_1 = c(43L, 2L, 6L, 30L, 69L, 82L, 98L, 79L, 68L),
weight_2 = c(60L, 40L, 78L, 48L, 75L, 77L, 55L, 3L, 66L),
weight_3 = c(22L, 4L, 77L, 40L, 91L, 57L, 34L, 84L, NA),
weight_4 = c(88L, 47L, 77L, 82L, 31L, 19L, 11L, NA, NA),
weight_5 = c(80L, 65L, 12L, 17L, 62L, 95L, NA, NA, NA),
weight_6 = c(95L, 71L, 14L, 29L, 66L, 83L, NA, NA, NA),
weight_7 = c(64L, 20L, 69L, 57L, NA, NA, NA, NA, NA),
weight_8 = c(45L, 19L, NA, NA, NA, NA, NA, NA, NA)
)
I have found the last row value using
# Value of the last non-missing column in every row of df.
# max.col(..., ties.method = "last") returns, per row, the position of the last
# TRUE in !is.na(df); paired with the row number it forms a two-column
# (row, column) matrix index into df.
# seq_len() is used instead of 1:nrow(df) so a zero-row df yields an empty
# index rather than the count-down c(1, 0).
last_idx <- cbind(seq_len(nrow(df)), max.col(!is.na(df), ties.method = "last"))
final_row <- data.frame(last_value = df[last_idx])
Now I'm just looking for a way to delete those values from the dataset so I can look at non last step values.
You perhaps want this?
df <- data.frame(
weight_1 = c(43L, 2L, 6L, 30L, 69L, 82L, 98L, 79L, 68L),
weight_2 = c(60L, 40L, 78L, 48L, 75L, 77L, 55L, 3L, 66L),
weight_3 = c(22L, 4L, 77L, 40L, 91L, 57L, 34L, 84L, NA),
weight_4 = c(88L, 47L, 77L, 82L, 31L, NA, 19L, 11L, NA),
weight_5 = c(80L, 65L, 12L, 17L, 62L, NA, 40L, 95L, NA),
weight_6 = c(95L, 71L, 14L, NA, 29L, NA, 66L, 83L, NA),
weight_7 = c(64L, 20L, NA, NA, 69L, NA, 57L, NA, NA),
weight_8 = c(45L, NA, NA, NA, NA, NA, 19L, NA, NA)
)
library(dplyr, warn.conflicts = FALSE)
# Name of the last non-NA column for every row, computed once up front rather
# than once per column inside across().
last_col <- names(df)[max.col(!is.na(df), ties.method = "last")]
df %>%
  # Within each column, set to NA the rows whose last non-NA column is this one.
  mutate(across(everything(), ~ replace(.x, cur_column() == last_col, NA)))
#> weight_1 weight_2 weight_3 weight_4 weight_5 weight_6 weight_7 weight_8
#> 1 43 60 22 88 80 95 64 NA
#> 2 2 40 4 47 65 71 NA NA
#> 3 6 78 77 77 12 NA NA NA
#> 4 30 48 40 82 NA NA NA NA
#> 5 69 75 91 31 62 29 NA NA
#> 6 82 77 NA NA NA NA NA NA
#> 7 98 55 34 19 40 66 57 NA
#> 8 79 3 84 11 95 NA NA NA
#> 9 68 NA NA NA NA NA NA NA
Created on 2021-07-19 by the reprex package (v2.0.0)
earlier answer
df <- data.frame(
weight_1 = c(43L, 2L, 6L, 30L, 69L, 82L, 98L, 79L, 68L),
weight_2 = c(60L, 40L, 78L, 48L, 75L, 77L, 55L, 3L, 66L),
weight_3 = c(22L, 4L, 77L, 40L, 91L, 57L, 34L, 84L, NA),
weight_4 = c(88L, 47L, 77L, 82L, 31L, NA, 19L, 11L, NA),
weight_5 = c(80L, 65L, 12L, 17L, 62L, NA, 40L, 95L, NA),
weight_6 = c(95L, 71L, 14L, NA, 29L, NA, 66L, 83L, NA),
weight_7 = c(64L, 20L, NA, NA, 69L, NA, 57L, NA, NA),
weight_8 = c(45L, NA, NA, NA, NA, NA, 19L, NA, NA)
)
df
#> weight_1 weight_2 weight_3 weight_4 weight_5 weight_6 weight_7 weight_8
#> 1 43 60 22 88 80 95 64 45
#> 2 2 40 4 47 65 71 20 NA
#> 3 6 78 77 77 12 14 NA NA
#> 4 30 48 40 82 17 NA NA NA
#> 5 69 75 91 31 62 29 69 NA
#> 6 82 77 57 NA NA NA NA NA
#> 7 98 55 34 19 40 66 57 19
#> 8 79 3 84 11 95 83 NA NA
#> 9 68 66 NA NA NA NA NA NA
library(dplyr, warn.conflicts = FALSE)
# For each row, gather all columns into one vector, drop the NAs and keep the
# final remaining element, i.e. the last non-missing value of that row.
df %>%
  rowwise() %>%
  summarise(last_col = last(na.omit(c_across(everything()))), .groups = "drop")
#> # A tibble: 9 x 1
#> last_col
#> <int>
#> 1 45
#> 2 20
#> 3 14
#> 4 17
#> 5 69
#> 6 57
#> 7 19
#> 8 83
#> 9 66
Created on 2021-07-17 by the reprex package (v2.0.0)
To replace last non-NA value in each row with NA you can use apply in base R -
# Replace the last non-NA value of every row with NA.
# apply() walks the rows and returns them as columns, hence the t(); assigning
# into df[] keeps df a data frame with its original column names.
df[] <- t(apply(df, 1, function(row_values) {
  observed <- which(!is.na(row_values))
  # A row with no observed values has nothing to blank out; skipping it avoids
  # calling max() on an empty index (which warns and returns -Inf).
  if (length(observed) > 0) {
    row_values[max(observed)] <- NA
  }
  row_values
}))

add new column in a certain order [duplicate]

This question already has answers here:
Add (insert) a column between two columns in a data.frame
(18 answers)
Closed 4 years ago.
Suppose i have dataset
df=structure(list(SaleCount = c(7L, 35L, 340L, 260L, 3L, 31L, 420L,
380L, 45L, 135L, 852L, 1L, 34L, 360L, 140L, 14L, 62L, 501L, 560L,
0L, 640L, 0L, 0L, 16L, 0L), DocumentNum = c(36L, 4L, 41L, 41L,
36L, 4L, 41L, 41L, 33L, 33L, 33L, 36L, 4L, 41L, 41L, 33L, 33L,
33L, 62L, 63L, 62L, 63L, 36L, 4L, 41L)), .Names = c("SaleCount",
"DocumentNum"), class = "data.frame", row.names = c(NA, -25L))
I need to create a new column, but it must be the second column in the data frame.
If I do this:
df["MY_NEW_COLUMN"] <- NA .
With this, the new column is third.
How can I create it so that it is second?
I.E. i expect output
SaleCount newcolumn DocumentNum
1 7 NA 36
2 35 NA 4
3 340 NA 41
4 260 NA 41
5 3 NA 36
6 31 NA 4
7 420 NA 41
8 380 NA 41
9 45 NA 33
10 135 NA 33
11 852 NA 33
12 1 NA 36
13 34 NA 4
14 360 NA 41
15 140 NA 41
16 14 NA 33
17 62 NA 33
18 501 NA 33
19 560 NA 62
20 0 NA 63
21 640 NA 62
22 0 NA 63
23 0 NA 36
24 16 NA 4
25 0 NA 41
Of course sometimes I need to create a fourth column by order and so on.
You can use the dplyr library and the select function.
library(dplyr)
df=structure(list(SaleCount = c(7L, 35L, 340L, 260L, 3L, 31L, 420L,
380L, 45L, 135L, 852L, 1L, 34L, 360L, 140L, 14L, 62L, 501L, 560L,
0L, 640L, 0L, 0L, 16L, 0L), DocumentNum = c(36L, 4L, 41L, 41L,
36L, 4L, 41L, 41L, 33L, 33L, 33L, 36L, 4L, 41L, 41L, 33L, 33L,
33L, 62L, 63L, 62L, 63L, 36L, 4L, 41L)), .Names = c("SaleCount",
"DocumentNum"), class = "data.frame", row.names = c(NA, -25L))
df["MY_NEW_COLUMN"] <- NA
# Move the new column directly after SaleCount. Unlike select(), relocate()
# does not need every other column listed, so the same call works for any
# target position (change .after) and any number of columns.
relocate(df, MY_NEW_COLUMN, .after = SaleCount)

Remove rows where at least x percent of the values are zero in both groups

Given the following dataframe i would like to remove all rows with at least x percent (e.g 50%) of values = 0 in at least one group.
For example if a row has less than 50% of values in both groups (control and treatment) it will be removed.
If the row has 50% of non zero value in group control(or treatment) and no values in the other group it will be kept since there is still one group with at least 50% values.
Hope it´s clear.
treatment control control treatment control treatment
row1 0 21 21 21 45 34
row2 0 21 78 321 93 0
row3 34 32 98 87 34 0
row4 75 21 12 54 45 34
row5 46 21 13 45 0 0
row6 85 21 87 45 0 23
row7 24 84 0 0 45 5
row8 87 21 0 98 87 76
row9 43 2 0 45 12 9
row10 12 12 0 0 23 0
Here below the dataframe
df <- structure(list(structure(c(1L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L,
2L), .Label = c("row1", "row10", "row2", "row3", "row4", "row5",
"row6", "row7", "row8", "row9"), class = "factor"), treatment = c(0L,
0L, 34L, 75L, 46L, 85L, 24L, 87L, 43L, 12L), control = c(21L,
21L, 32L, 21L, 21L, 21L, 84L, 21L, 2L, 12L), control = c(21L,
78L, 98L, 12L, 13L, 87L, 0L, 0L, 0L, 0L), treatment = c(21L,
321L, 87L, 54L, 45L, 45L, 0L, 98L, 45L, 0L), control = c(45L,
93L, 34L, 45L, 0L, 0L, 45L, 87L, 12L, 23L), treatment = c(34L,
0L, 0L, 34L, 0L, 23L, 5L, 76L, 9L, 0L)), .Names = c("", "treatment",
"control", "control", "treatment", "control", "treatment"), class = "data.frame", row.names = c(NA,
-10L))
Based on what you want, if a row has 3 or more "0" values (at least 50% of its six values), you want to remove the row.
# Use the first column (the row labels) as row names, then drop it so that
# only the measurement columns remain.
rownames(df) <- df[, 1]
df <- df[, -1]
# Count the zeros in each row and keep only the rows with fewer than 3 of them.
df <- df[rowSums(df == 0) < 3, ]
Row 10 is removed.

Removing an observation

This is very basic, but i have been stuck on this for a while now.
I want to remove the observation -Steven Sax from Dataset hitters.txt:
> dput(hitters[280:290,])
structure(list(AtBat = c(439L, 453L, 528L, 633L, 16L, 562L, 281L,
593L, 687L, 368L, 263L), Hits = c(96L, 103L, 122L, 210L, 2L,
169L, 76L, 152L, 213L, 103L, 70L), HmRun = c(0L, 8L, 1L, 6L,
0L, 17L, 3L, 23L, 10L, 3L, 1L), Runs = c(44L, 53L, 67L, 91L,
1L, 88L, 42L, 69L, 91L, 48L, 26L), RBI = c(36L, 33L, 45L, 56L,
0L, 73L, 25L, 75L, 65L, 28L, 23L), Walks = c(65L, 52L, 51L, 59L,
0L, 53L, 20L, 53L, 27L, 54L, 30L), Years = c(4L, 2L, 4L, 6L,
2L, 8L, 8L, 6L, 4L, 8L, 4L), CAtBat = c(711L, 507L, 1716L, 3070L,
28L, 3181L, 2658L, 2765L, 1518L, 1897L, 888L), CHits = c(148L,
123L, 403L, 872L, 4L, 841L, 657L, 686L, 448L, 493L, 220L), CHmRun = c(1L,
8L, 12L, 19L, 0L, 61L, 48L, 133L, 15L, 9L, 9L), CRuns = c(68L,
63L, 211L, 420L, 1L, 450L, 324L, 369L, 196L, 207L, 83L), CRBI = c(56L,
39L, 146L, 230L, 0L, 342L, 300L, 384L, 137L, 162L, 82L), CWalks = c(99L,
58L, 155L, 274L, 0L, 373L, 179L, 321L, 89L, 198L, 86L), League = structure(c(2L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L), .Label = c("A", "N"), class = "factor"),
Division = structure(c(1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L,
2L, 1L), .Label = c("E", "W"), class = "factor"), PutOuts = c(229L,
289L, 209L, 367L, 247L, 351L, 106L, 315L, 294L, 209L, 81L
), Assists = c(406L, 407L, 372L, 432L, 4L, 442L, 144L, 10L,
445L, 246L, 147L), Errors = c(22L, 6L, 17L, 16L, 8L, 17L,
7L, 6L, 13L, 3L, 4L), Salary = c(150, 105, 350, 90, NA, 530,
341.667, 940, 350, 326.667, 250), NewLeague = structure(c(2L,
1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L), .Label = c("A",
"N"), class = "factor")), .Names = c("AtBat", "Hits", "HmRun",
"Runs", "RBI", "Walks", "Years", "CAtBat", "CHits", "CHmRun",
"CRuns", "CRBI", "CWalks", "League", "Division", "PutOuts", "Assists",
"Errors", "Salary", "NewLeague"), row.names = c("-Steve Jeltz",
"-Steve Lombardozzi", "-Spike Owen", "-Steve Sax", "-Tony Armas",
"-Tony Bernazard", "-Tom Brookens", "-Tom Brunansky", "-Tony Fernandez",
"-Tim Flannery", "-Tom Foley"), class = "data.frame")
If i knew the name of the first column i would have used:
hitters <- hitters[!hitters$Colname == "-Steve Sax",]
or
hitters <- hitters[hitters$AtBat != "-Steve Sax", ]
But i don't know the name of the first column:
. I have tried: read.table("hitters.txt", head = F)
and
read.table("hitters.txt", head = T)
My questions are:
How can I remove the observation?
Why head = T didnt work?
The first "column" represents the row names (this is not an actual column in the data set but appears as such in the output). You can access row names with the function rownames:
# Keep every row whose row name is not "-Steve Sax"
hitters[rownames(hitters) != "-Steve Sax", ]
will remove the observation from the data set.
Output:
AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns
-Steve Jeltz 439 96 0 44 36 65 4 711 148 1 68
-Steve Lombardozzi 453 103 8 53 33 52 2 507 123 8 63
-Spike Owen 528 122 1 67 45 51 4 1716 403 12 211
-Tony Armas 16 2 0 1 0 0 2 28 4 0 1
-Tony Bernazard 562 169 17 88 73 53 8 3181 841 61 450
-Tom Brookens 281 76 3 42 25 20 8 2658 657 48 324
-Tom Brunansky 593 152 23 69 75 53 6 2765 686 133 369
-Tony Fernandez 687 213 10 91 65 27 4 1518 448 15 196
-Tim Flannery 368 103 3 48 28 54 8 1897 493 9 207
-Tom Foley 263 70 1 26 23 30 4 888 220 9 83

Resources