Can you specify multiple value columns in pivot_longer()?
My original data (posted below) I had to transpose to be in a wider format. Then I want to take this new transposed data and return it to the original format (lets assume I did some transformations/ and can't use the original data). To re-transpose back into a longer format I have to use both pivot_longer() then pivot_wider() because there are multiple values I want to be their own columns.
I would like to avoid the pivot_wider() and just use pivot_longer() when re-transposing the data back if possible.
As a side note the unique identifier for each row is the combination of id and report.
Code
dfa <- dfx %>%
pivot_wider(
id_cols = id,
names_from = report,
values_from = c(pts,
p1, p2, p3,p4,p5,
d1,d2,d3,d4,d5)
)
df_return <- dfa %>%
pivot_longer(cols = !id,
names_to = c('vars','report'),
names_pattern = "([a-z0-9]+)_(.*)",
values_drop_na = TRUE) %>%
pivot_wider(id_cols = c(id, report),
names_from = vars,
values_from = value)
Data
structure(list(pts = c(431L, 167L, 167L, 760L, 348L, 768L, 619L,
169L, 416L, 155L, 47L, 37L, 6L, 17L, 22L, 1L, 149L, 3L, 284L,
7L), d1 = c(129L, 48L, 52L, 166L, 90L, 178L, 184L, 20L, 158L,
42L, 3L, 15L, 2L, 7L, 9L, 0L, 54L, 1L, 69L, 6L), d2 = c(172L,
67L, 64L, 257L, 132L, 255L, 261L, 30L, 201L, 61L, 9L, 20L, 2L,
9L, 12L, 0L, 69L, 1L, 123L, 6L), d3 = c(205L, 77L, 73L, 312L,
153L, 307L, 310L, 39L, 235L, 70L, 12L, 21L, 2L, 10L, 12L, 0L,
77L, 2L, 139L, 6L), d4 = c(227L, 81L, 82L, 363L, 177L, 350L,
342L, 52L, 257L, 75L, 15L, 24L, 2L, 12L, 13L, 0L, 86L, 2L, 151L,
6L), d5 = c(248L, 88L, 92L, 414L, 192L, 387L, 374L, 66L, 279L,
86L, 16L, 26L, 2L, 12L, 15L, 0L, 90L, 3L, 164L, 7L), report = c("2006",
"2006", "2006", "2006", "2006", "2006", "2006", "2006", "2006",
"2006", "2006", "2006", "2006", "2006", "2006", "2006", "2006",
"2006", "2006", "2006"), p1 = c(1.0360364394094, 1.22979866735429,
1.21423740998677, 0.87891144382145, 0.810310827130179, 0.965901663505148,
1.02621739486337, 0.69319116444678, 1.18938130906092, 1.04220816515009,
0.683545688193799, 1.05179228560845, 1.51468104603873, 1.15200888955888,
0.948041330809858, 0, 1.23227405154205, 3.11155226007598, 0.908056299174703,
1.57712371536702), p2 = c(0.986884800185635, 1.23066225499351,
1.07336930339221, 0.966734485786667, 0.87421381769247, 0.974775549615439,
1.06274655160121, 0.705150638862953, 1.12934487417415, 1.10234720984265,
1.11084642794988, 1.06558505521222, 1.0197697665798, 1.15605466288868,
1.01469386643771, 0, 1.17689541437029, 1.42783711234222, 1.16124019281912,
1.27756288696848), p3 = c(0.993575954694177, 1.17968893104311,
1.02608313159672, 0.965200422661265, 0.862910478266102, 0.976436243011877,
1.06679768502287, 0.722966824498357, 1.12591016481614, 1.05867627021151,
1.11227024088529, 0.98275117259764, 0.803738347803303, 1.09341228936369,
0.878291424560146, 0, 1.10500006213832, 1.93128861370172, 1.0949534752299,
1.14755029569502), p4 = c(0.986244633210798, 1.08520792731261,
1.01128789684232, 0.977245321880205, 0.89785754450165, 0.981536130349165,
1.04454959427709, 0.807825580390444, 1.1035817255901, 1.00192975678877,
1.14371311954082, 1.02812279984398, 0.66742040677939, 1.15526702119886,
0.878479047328667, 0, 1.10559111180852, 1.4717526513624, 1.05479137550321,
1.07005088091939), p5 = c(0.992583778223324, 1.06016737802091,
1.02253158347207, 1.00026491073882, 0.896290873874826, 0.985549150023704,
1.04187931404895, 0.886647217836043, 1.09837506943384, 1.0323002052873,
1.05833769015682, 1.05042831618603, 0.592515872759586, 1.05106420250504,
0.961672664191663, 0, 1.05868657273466, 1.81304485775152, 1.04168095802127,
1.19437925124365), id = c("ID 1", "ID 2", "ID 3", "ID 4", "ID 5",
"ID 6", "ID 7", "ID 8", "ID 9", "ID 10", "ID 11", "ID 12", "ID 13",
"ID 14", "ID 15", "ID 16", "ID 17", "ID 18", "ID 19", "ID 20"
)), row.names = c(NA, 20L), class = "data.frame")
We may need the .value in the names_to, which selects the prefix part of the column name before the _ as the column value and the 'report' will return the suffix column name
library(tidyr)
pivot_longer(dfa, cols = -id, names_to = c(".value", "report"),
names_sep = "_")
-output
# A tibble: 20 × 13
id report pts p1 p2 p3 p4 p5 d1 d2 d3 d4 d5
<chr> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int> <int> <int> <int>
1 ID 1 2006 431 1.04 0.987 0.994 0.986 0.993 129 172 205 227 248
2 ID 2 2006 167 1.23 1.23 1.18 1.09 1.06 48 67 77 81 88
3 ID 3 2006 167 1.21 1.07 1.03 1.01 1.02 52 64 73 82 92
4 ID 4 2006 760 0.879 0.967 0.965 0.977 1.00 166 257 312 363 414
5 ID 5 2006 348 0.810 0.874 0.863 0.898 0.896 90 132 153 177 192
6 ID 6 2006 768 0.966 0.975 0.976 0.982 0.986 178 255 307 350 387
7 ID 7 2006 619 1.03 1.06 1.07 1.04 1.04 184 261 310 342 374
8 ID 8 2006 169 0.693 0.705 0.723 0.808 0.887 20 30 39 52 66
9 ID 9 2006 416 1.19 1.13 1.13 1.10 1.10 158 201 235 257 279
10 ID 10 2006 155 1.04 1.10 1.06 1.00 1.03 42 61 70 75 86
11 ID 11 2006 47 0.684 1.11 1.11 1.14 1.06 3 9 12 15 16
12 ID 12 2006 37 1.05 1.07 0.983 1.03 1.05 15 20 21 24 26
13 ID 13 2006 6 1.51 1.02 0.804 0.667 0.593 2 2 2 2 2
14 ID 14 2006 17 1.15 1.16 1.09 1.16 1.05 7 9 10 12 12
15 ID 15 2006 22 0.948 1.01 0.878 0.878 0.962 9 12 12 13 15
16 ID 16 2006 1 0 0 0 0 0 0 0 0 0 0
17 ID 17 2006 149 1.23 1.18 1.11 1.11 1.06 54 69 77 86 90
18 ID 18 2006 3 3.11 1.43 1.93 1.47 1.81 1 1 2 2 3
19 ID 19 2006 284 0.908 1.16 1.09 1.05 1.04 69 123 139 151 164
20 ID 20 2006 7 1.58 1.28 1.15 1.07 1.19 6 6 6 6 7
Related
I have two dataframes, with a similar strucure:
df_I <- structure(list(year = c("2006", "2006", "2006", "2006", "2006",
"2006", "2006", "2006", "2006"), code = c(0, 1110,
1120, 1130, 1220, 1230, 1310, 1320, 1330), `1` = c(1L,
8L, 2L, 2L, 0L, 2L, 0L, 1L, 0L), `2` = c(0L, 10L, 0L, 0L,
0L, 2L, 1L, 3L, 1L), `3` = c(4L, 2L, 1L, 2L, 0L, 4L,
0L, 0L, 3L), `4` = c(4L, 6L, 0L, 3L, 1L, 3L, 0L, 0L, 3L),
totaal = c(11, 26, 3, 7, 1, 9, 7, 7, 6)), row.names = c(NA,
-9L), class = c("tbl_df", "tbl", "data.frame"))
# A tibble: 9 × 7
year code `1` `2` `3` `4` totaal
<chr> <dbl> <int> <int> <int> <int> <dbl>
1 2006 0 1 0 4 4 11
2 2006 1110 8 10 2 6 26
3 2006 1120 2 0 1 0 3
4 2006 1130 2 0 2 3 7
5 2006 1220 0 0 0 1 1
6 2006 1230 2 2 4 3 9
7 2006 1310 0 1 0 0 7
8 2006 1320 1 3 0 0 7
9 2006 1330 0 1 3 3 6
df_II <- structure(list(year = c("2006", "2006", "2006", "2006", "2006",
"2006", "2006", "2006", "2006", "2006"), code = c(0, 1110,
1120, 1130, 1210, 1220, 1230, 1310, 1320, 1330), `1` = c(15806L,
655L, 105L, 328L, 138L, 452L, 445L, 471L, 672L, 615L), `2` = c(9681L,
337L, 68L, 215L, 97L, 357L, 366L, 245L, 440L, 360L), `3` = c(10457L,
221L, 40L, 123L, 65L, 325L, 322L, 151L, 352L, 332L), `4` = c(7109L,
128L, 5L, 64L, 56L, 256L, 240L, 83L, 274L, 192L), totaal = c(43053,
1341, 218, 730, 356, 1390, 1373, 950, 1738, 1499)), row.names = c(NA,
-10L), class = c("tbl_df", "tbl", "data.frame"))
# A tibble: 10 × 7
year code `1` `2` `3` `4` totaal
<chr> <dbl> <int> <int> <int> <int> <dbl>
1 2006 0 15806 9681 10457 7109 43053
2 2006 1110 655 337 221 128 1341
3 2006 1120 105 68 40 5 218
4 2006 1130 328 215 123 64 730
5 2006 1210 138 97 65 56 356
6 2006 1220 452 357 325 256 1390
7 2006 1230 445 366 322 240 1373
8 2006 1310 471 245 151 83 950
9 2006 1320 672 440 352 274 1738
10 2006 1330 615 360 332 192 1499
I would like to create a new data.frame df_out, which divides df_I by df_II, for columns 1,2,3,4, totaal by year and code. The issue is that not every code is available for each year.
What is the best way to divide this unequal dataframe?
Desired outcome:
# A tibble: 10 × 7
year code `1` `2` `3` `4` totaal
<chr> <dbl> <int> <int> <int> <int> <dbl>
1 2006 0 1 /15806 0/9681 4/10457 4/7109 11/43053
You could subset the second data frame using %in%, assuming both code columns are properly ordered.
cols <- as.character(1:4)
cbind(df_I[setdiff(names(df_I), cols)], df_I[cols] / subset(df_II, code %in% df_I$code, cols))
# year code totaal 1 2 3 4
# 1 2006 0 11 6.326711e-05 0.000000000 0.0003825189 0.000562667
# 2 2006 1110 26 1.221374e-02 0.029673591 0.0090497738 0.046875000
# 3 2006 1120 3 1.904762e-02 0.000000000 0.0250000000 0.000000000
# 4 2006 1130 7 6.097561e-03 0.000000000 0.0162601626 0.046875000
# 5 2006 1220 1 0.000000e+00 0.000000000 0.0000000000 0.003906250
# 6 2006 1230 9 4.494382e-03 0.005464481 0.0124223602 0.012500000
# 7 2006 1310 7 0.000000e+00 0.004081633 0.0000000000 0.000000000
# 8 2006 1320 7 1.488095e-03 0.006818182 0.0000000000 0.000000000
# 9 2006 1330 6 0.000000e+00 0.002777778 0.0090361446 0.015625000
You could use complete to make the number of rows between the two data frames equal, and then do the division:
library(tidyr)
df_I %<>%
complete(code = df_II$code) %>%
fill(year) %>%
replace(is.na(.), 0)
cbind(df_I[c(1, 2)], df_I[-c(1, 2)] / df_II[-c(1, 2)])
code year `1` `2` `3` `4` totaal
<dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 0 2006 0.0000633 0 0.000383 0.000563 0.000255
2 1110 2006 0.0122 0.0297 0.00905 0.0469 0.0194
3 1120 2006 0.0190 0 0.025 0 0.0138
4 1130 2006 0.00610 0 0.0163 0.0469 0.00959
5 1210 2006 0 0 0 0 0
6 1220 2006 0 0 0 0.00391 0.000719
7 1230 2006 0.00449 0.00546 0.0124 0.0125 0.00655
8 1310 2006 0 0.00408 0 0 0.00737
9 1320 2006 0.00149 0.00682 0 0 0.00403
10 1330 2006 0 0.00278 0.00904 0.0156 0.00400
I am working with a wide data set resembling the following:
I am looking to write a function that I can iterate over sets of columns with similar names, but with different names. For the sake of simplicity here in terms of the function itself, I'll just create a function that takes the mean of two columns.
avg <- function(data, scorecol, distcol) {
ScoreDistanceAvg = (scorecol + distcol)/2
data$ScoreDistanceAvg <- ScoreDistanceAvg
return(data)
}
avg(data = dat, scorecol = dat$ScoreGame0, distcol = dat$DistanceGame0)
How can I apply the new function to sets of columns with repeated names but different numbers? That is, how could I create a column that takes the mean of ScoreGame0 and DistanceGame0, then create a column that takes the mean of ScoreGame5 and DistanceGame5, and so on? This would be the final output:
Of course, I could just run the function multiple times, but since my full data set is much larger, how could I automate this process? I imagine it involves apply, but I'm not sure how to use apply with a repeated pattern like that. Additionally, I imagine it may involve rewriting the function to better automate the naming of columns.
Data:
structure(list(Player = c("Lebron James", "Lebron James", "Lebron James",
"Lebron James", "Lebron James", "Lebron James", "Lebron James",
"Lebron James", "Lebron James", "Lebron James", "Lebron James",
"Lebron James", "Steph Curry", "Steph Curry", "Steph Curry",
"Steph Curry", "Steph Curry", "Steph Curry", "Steph Curry", "Steph Curry",
"Steph Curry", "Steph Curry", "Steph Curry", "Steph Curry"),
Game = c(0L, 1L, 2L, 3L, 4L, 5L, 0L, 1L, 2L, 3L, 4L, 5L,
0L, 1L, 2L, 3L, 4L, 5L, 0L, 1L, 2L, 3L, 4L, 5L), ScoreGame0 = c(32L,
32L, 32L, 32L, 32L, 32L, 44L, 44L, 44L, 44L, 44L, 44L, 45L,
45L, 45L, 45L, 45L, 45L, 76L, 76L, 76L, 76L, 76L, 76L), ScoreGame5 = c(27L,
27L, 27L, 27L, 27L, 27L, 12L, 12L, 12L, 12L, 12L, 12L, 76L,
76L, 76L, 76L, 76L, 76L, 32L, 32L, 32L, 32L, 32L, 32L), DistanceGame0 = c(12L,
12L, 12L, 12L, 12L, 12L, 79L, 79L, 79L, 79L, 79L, 79L, 18L,
18L, 18L, 18L, 18L, 18L, 88L, 88L, 88L, 88L, 88L, 88L), DistanceGame5 = c(13L,
13L, 13L, 13L, 13L, 13L, 34L, 34L, 34L, 34L, 34L, 34L, 42L,
42L, 42L, 42L, 42L, 42L, 54L, 54L, 54L, 54L, 54L, 54L)), class = "data.frame", row.names = c(NA,
-24L))
Rewrite your function slightly and use it in mapply by greping over the columns. sort makes this even safer.
avg <- function(scorecol, distcol) {
(scorecol + distcol)/2
}
mapply(avg, dat[sort(grep('ScoreGame', names(dat)))], dat[sort(grep('DistanceGame', names(dat)))])
# ScoreGame0 ScoreGame5
# [1,] 22.0 20
# [2,] 22.0 20
# [3,] 22.0 20
# [4,] 22.0 20
# [5,] 22.0 20
# [6,] 22.0 20
# [7,] 61.5 23
# [8,] 61.5 23
# [9,] 61.5 23
# [10,] 61.5 23
# [11,] 61.5 23
# [12,] 61.5 23
# [13,] 31.5 59
# [14,] 31.5 59
# [15,] 31.5 59
# [16,] 31.5 59
# [17,] 31.5 59
# [18,] 31.5 59
# [19,] 82.0 43
# [20,] 82.0 43
# [21,] 82.0 43
# [22,] 82.0 43
# [23,] 82.0 43
# [24,] 82.0 43
To see what grep does try
grep('DistanceGame', names(dat), value=TRUE)
# [1] "DistanceGame0" "DistanceGame5"
in Base R:
cols_used <- names(df[, -(1:2)])
f <- sub("[^0-9]+", 'ScoreDistance', cols_used)
data.frame(lapply(split.default(df[cols_used], f), rowMeans))
ScoreDistance0 ScoreDistance5
1 22.0 20
2 22.0 20
3 22.0 20
4 22.0 20
5 22.0 20
6 22.0 20
7 61.5 23
8 61.5 23
9 61.5 23
10 61.5 23
11 61.5 23
12 61.5 23
13 31.5 59
14 31.5 59
15 31.5 59
16 31.5 59
17 31.5 59
18 31.5 59
19 82.0 43
20 82.0 43
21 82.0 43
22 82.0 43
23 82.0 43
24 82.0 43
Using tidyverse:
Here's a solution with a forloop and readr:
library(readr)
game_num <- names(dat) |>
readr::parse_number() |>
na.omit()
for(i in unique(game_num)) {
avg <- paste0("ScoreDistanceAvg", i)
score <- paste0("ScoreGame", i)
distance <- paste0("DistanceGame", i)
dat[[avg]] <- (dat[[score]] + dat[[distance]])/2
}
Which gives:
Player Game ScoreGame0 ScoreGame5 DistanceGame0 DistanceGame5 ScoreDistanceAvg0 ScoreDistanceAvg5
1 Lebron James 0 32 27 12 13 22.0 20
2 Lebron James 1 32 27 12 13 22.0 20
3 Lebron James 2 32 27 12 13 22.0 20
4 Lebron James 3 32 27 12 13 22.0 20
5 Lebron James 4 32 27 12 13 22.0 20
6 Lebron James 5 32 27 12 13 22.0 20
7 Lebron James 0 44 12 79 34 61.5 23
8 Lebron James 1 44 12 79 34 61.5 23
9 Lebron James 2 44 12 79 34 61.5 23
10 Lebron James 3 44 12 79 34 61.5 23
11 Lebron James 4 44 12 79 34 61.5 23
12 Lebron James 5 44 12 79 34 61.5 23
13 Steph Curry 0 45 76 18 42 31.5 59
I have data frames with counts from a series of years, 1970-2020, generated by a subset command from a larger data set, i.e. resulting in two columns "Year" and "Count":
Year Count
1987 8
1989 1
1991 1
1992 4
1995 11
1996 3
1997 7
.
.
.
2019 2
2020 5
There are missing years where Count=0, and I need a procedure to fill these df's with the missing years and Count=0. I have this script that I can't get to work:
library(tidyr)
aug <- subset(mainframe, month==8)
complete(aug, year = 1987:2020, fill = list(Count = 0))
Here's a sample dataframe 'aug':
dput(aug)
structure(list(month = structure(c(8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L,
8L, 8L, 8L, 8L, 8L), .Label = c("1", "2", "3", "4", "5", "6",
"7", "8", "9", "10", "11", "12"), class = "factor"), year = structure(1:28, .Label = c("1987",
"1988", "1989", "1990", "1991", "1992", "1993", "1994", "1995",
"1996", "1998", "2000", "2001", "2002", "2003", "2004", "2005",
"2006", "2007", "2008", "2009", "2010", "2011", "2013", "2015",
"2016", "2018", "2020"), class = "factor"), Count = c(4L, 0L,
3L, 3L, 0L, 0L, 1L, 0L, 1L, 1L, 3L, 0L, 0L, 0L, 0L, 2L, 0L, 0L,
0L, 2L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L)), row.names = c(8L, 20L,
32L, 44L, 56L, 68L, 80L, 92L, 104L, 116L, 128L, 140L, 152L, 164L,
176L, 188L, 200L, 212L, 224L, 236L, 248L, 260L, 272L, 284L, 296L,
308L, 320L, 332L), class = "data.frame")
If I get your question correctly, you want to have a complete dataframe containing year 1987 to 2020, but there are some missing years in your aug dataframe, and you want to fill in the missing years with month = "8" and Count = 0.
Here's a tidyverse approach (in your original aug dataframe, your year is factor, so at the end of my solution I also transformed it into factor):
Your dataset
month year Count
8 8 1987 4
20 8 1988 0
32 8 1989 3
44 8 1990 3
56 8 1991 0
68 8 1992 0
80 8 1993 1
92 8 1994 0
104 8 1995 1
116 8 1996 1
128 8 1998 3
140 8 2000 0
152 8 2001 0
164 8 2002 0
176 8 2003 0
188 8 2004 2
200 8 2005 0
212 8 2006 0
224 8 2007 0
236 8 2008 2
248 8 2009 0
260 8 2010 1
272 8 2011 1
284 8 2013 0
296 8 2015 0
308 8 2016 1
320 8 2018 0
332 8 2020 1
Solution
library(tidyverse)
aug %>% mutate(year = as.numeric(as.character(year))) %>%
complete(year = first(year):max(year), # or year = 1987:2020
fill = list(month = "8", Count = 0)) %>%
mutate(year = as.factor(year))
Output
year month Count
1987 8 4
1988 8 0
1989 8 3
1990 8 3
1991 8 0
1992 8 0
1993 8 1
1994 8 0
1995 8 1
1996 8 1
1997 8 0
1998 8 3
1999 8 0
2000 8 0
2001 8 0
2002 8 0
2003 8 0
2004 8 2
2005 8 0
2006 8 0
2007 8 0
2008 8 2
2009 8 0
2010 8 1
2011 8 1
2012 8 0
2013 8 0
2014 8 0
2015 8 0
2016 8 1
2017 8 0
2018 8 0
2019 8 0
2020 8 1
I want to transform my data from this:
current data.frame
to this: desired data.frame
I have no clue how to start, any help is welcome!
Thanks in advance,
Mitch
One solution with reshape() melt()
library(readr)
library(reshape)
Data:
df<-structure(list(age_group = c("<20", ">70", "20-29", "30-39",
"40-49", "50-59", "60-69"), no = c(19L, 1L, 447L, 196L, 92L,
55L, 24L), yes = c(21L, 1L, 664L, 371L, 204L, 137L, 63L), total = c(2L,
0L, 217L, 175L, 112L, 82L, 39L)), class = "data.frame", row.names = c(NA,
-7L))
Code:
df<-melt(C0001)
df<-as.data.frame(df)
df[order(df$age_group),]
age_group variable value
1 <20 no 19
8 <20 yes 21
15 <20 total 2
2 >70 no 1
9 >70 yes 1
16 >70 total 0
3 20-29 no 447
10 20-29 yes 664
17 20-29 total 217
4 30-39 no 196
11 30-39 yes 371
18 30-39 total 175
5 40-49 no 92
12 40-49 yes 204
19 40-49 total 112
6 50-59 no 55
13 50-59 yes 137
20 50-59 total 82
7 60-69 no 24
14 60-69 yes 63
21 60-69 total 39
I was wondering how I could go about trying to take outliers from Boxplot$out (returns the outliers within the data) and put them into a table which shows the class they belong to e.g. if outlier is from class "Van", "Bus, "Saab" etc..
I have tried using which() function but this returns only the index of the outlier and not the class. I am not sure how to go about putting this into a table.
Any help would be greatly appreciated!
library(reshape2)
vehData <-
structure(
list(
Samples = 1:6,
Comp = c(95L, 91L, 104L, 93L, 85L,
107L),
Circ = c(48L, 41L, 50L, 41L, 44L, 57L),
D.Circ = c(83L,
84L, 106L, 82L, 70L, 106L),
Rad.Ra = c(178L, 141L, 209L, 159L,
205L, 172L),
Pr.Axis.Ra = c(72L, 57L, 66L, 63L, 103L, 50L),
Max.L.Ra = c(10L,
9L, 10L, 9L, 52L, 6L),
Scat.Ra = c(162L, 149L, 207L, 144L, 149L,
255L),
Elong = c(42L, 45L, 32L, 46L, 45L, 26L),
Pr.Axis.Rect = c(20L,
19L, 23L, 19L, 19L, 28L),
Max.L.Rect = c(159L, 143L, 158L, 143L,
144L, 169L),
Sc.Var.Maxis = c(176L, 170L, 223L, 160L, 241L, 280L),
Sc.Var.maxis = c(379L, 330L, 635L, 309L, 325L, 957L),
Ra.Gyr = c(184L,
158L, 220L, 127L, 188L, 264L),
Skew.Maxis = c(70L, 72L, 73L,
63L, 127L, 85L),
Skew.maxis = c(6L, 9L, 14L, 6L, 9L, 5L),
Kurt.maxis = c(16L,
14L, 9L, 10L, 11L, 9L),
Kurt.Maxis = c(187L, 189L, 188L, 199L,
180L, 181L),
Holl.Ra = c(197L, 199L, 196L, 207L, 183L, 183L),
Class = c("van", "van", "saab", "van", "bus", "bus")
),
row.names = c(NA,
6L), class = "data.frame")
#Remove outliers
removeOutliers <- function(data) {
OutVals <- boxplot(data)$out
remOutliers <- sapply(data, function(x) x[!x %in% OutVals])
return (remOutliers)
}
vehDataRemove1 <- vehData[, -1]
vehDataRemove2 <- vehDataRemove1[,-19]
vehData <- vehDataRemove2
vehClass <- vehData$Class
boxplot(vehData)
#Begin removing outliers
removeOutliers1 <- removeOutliers(vehData)
removeOutliers2 <- removeOutliers(removeOutliers1)
This can be simplified. Starting with your data frame vehData. First get the rownumbers of the outliers. In my comment I accidentally left out the seq() function:
vehDataRemove <- vehData[, -c(1, 20)]
OutVals <- boxplot(vehDataRemove)
idx <- sapply(seq(length(OutVals$out)), function(x) which(vehDataRemove[, OutVals$group[x]] == OutVals$out[x]))
idx
# [1] 5 5 6 5 3
Notice that three outliers are in the 5th row. Now remove the rows with outliers:
NoOuts <- vehDataRemove[-unique(idx), ]
NoOuts
# Comp Circ D.Circ Rad.Ra Pr.Axis.Ra Max.L.Ra Scat.Ra Elong Pr.Axis.Rect Max.L.Rect Sc.Var.Maxis Sc.Var.maxis Ra.Gyr Skew.Maxis Skew.maxis Kurt.maxis Kurt.Maxis Holl.Ra
# 1 95 48 83 178 72 10 162 42 20 159 176 379 184 70 6 16 187 197
# 2 91 41 84 141 57 9 149 45 19 143 170 330 158 72 9 14 189 199
# 4 93 41 82 159 63 9 144 46 19 143 160 309 127 63 6 10 199 207
So you have lost half of your data! Alternatively set the outliers to missing values:
Outs2NA <- vehDataRemove
Outs2NA[cbind(idx, OutVals$group)] <- NA
Outs2NA
# Comp Circ D.Circ Rad.Ra Pr.Axis.Ra Max.L.Ra Scat.Ra Elong Pr.Axis.Rect Max.L.Rect Sc.Var.Maxis Sc.Var.maxis Ra.Gyr Skew.Maxis Skew.maxis Kurt.maxis Kurt.Maxis Holl.Ra
# 1 95 48 83 178 72 10 162 42 20 159 176 379 184 70 6 16 187 197
# 2 91 41 84 141 57 9 149 45 19 143 170 330 158 72 9 14 189 199
# 3 104 50 106 209 66 10 207 32 23 158 223 635 220 73 NA 9 188 196
# 4 93 41 82 159 63 9 144 46 19 143 160 309 127 63 6 10 199 207
# 5 85 44 70 205 NA NA 149 45 19 144 241 325 188 NA 9 11 180 183
# 6 107 57 106 172 50 NA 255 26 28 169 280 957 264 85 5 9 181 183