Boxplot outlier values into a table - r

I was wondering how I could go about trying to take outliers from Boxplot$out (returns the outliers within the data) and put them into a table which shows the class they belong to e.g. if outlier is from class "Van", "Bus, "Saab" etc..
I have tried using which() function but this returns only the index of the outlier and not the class. I am not sure how to go about putting this into a table.
Any help would be greatly appreciated!
library(reshape2)
vehData <-
structure(
list(
Samples = 1:6,
Comp = c(95L, 91L, 104L, 93L, 85L,
107L),
Circ = c(48L, 41L, 50L, 41L, 44L, 57L),
D.Circ = c(83L,
84L, 106L, 82L, 70L, 106L),
Rad.Ra = c(178L, 141L, 209L, 159L,
205L, 172L),
Pr.Axis.Ra = c(72L, 57L, 66L, 63L, 103L, 50L),
Max.L.Ra = c(10L,
9L, 10L, 9L, 52L, 6L),
Scat.Ra = c(162L, 149L, 207L, 144L, 149L,
255L),
Elong = c(42L, 45L, 32L, 46L, 45L, 26L),
Pr.Axis.Rect = c(20L,
19L, 23L, 19L, 19L, 28L),
Max.L.Rect = c(159L, 143L, 158L, 143L,
144L, 169L),
Sc.Var.Maxis = c(176L, 170L, 223L, 160L, 241L, 280L),
Sc.Var.maxis = c(379L, 330L, 635L, 309L, 325L, 957L),
Ra.Gyr = c(184L,
158L, 220L, 127L, 188L, 264L),
Skew.Maxis = c(70L, 72L, 73L,
63L, 127L, 85L),
Skew.maxis = c(6L, 9L, 14L, 6L, 9L, 5L),
Kurt.maxis = c(16L,
14L, 9L, 10L, 11L, 9L),
Kurt.Maxis = c(187L, 189L, 188L, 199L,
180L, 181L),
Holl.Ra = c(197L, 199L, 196L, 207L, 183L, 183L),
Class = c("van", "van", "saab", "van", "bus", "bus")
),
row.names = c(NA,
6L), class = "data.frame")
#Remove outliers
removeOutliers <- function(data) {
OutVals <- boxplot(data)$out
remOutliers <- sapply(data, function(x) x[!x %in% OutVals])
return (remOutliers)
}
vehDataRemove1 <- vehData[, -1]
vehDataRemove2 <- vehDataRemove1[,-19]
vehData <- vehDataRemove2
vehClass <- vehData$Class
boxplot(vehData)
#Begin removing outliers
removeOutliers1 <- removeOutliers(vehData)
removeOutliers2 <- removeOutliers(removeOutliers1)

This can be simplified. Starting with your data frame vehData. First get the rownumbers of the outliers. In my comment I accidentally left out the seq() function:
vehDataRemove <- vehData[, -c(1, 20)]
OutVals <- boxplot(vehDataRemove)
idx <- sapply(seq(length(OutVals$out)), function(x) which(vehDataRemove[, OutVals$group[x]] == OutVals$out[x]))
idx
# [1] 5 5 6 5 3
Notice that three outliers are in the 5th row. Now remove the rows with outliers:
NoOuts <- vehDataRemove[-unique(idx), ]
NoOuts
# Comp Circ D.Circ Rad.Ra Pr.Axis.Ra Max.L.Ra Scat.Ra Elong Pr.Axis.Rect Max.L.Rect Sc.Var.Maxis Sc.Var.maxis Ra.Gyr Skew.Maxis Skew.maxis Kurt.maxis Kurt.Maxis Holl.Ra
# 1 95 48 83 178 72 10 162 42 20 159 176 379 184 70 6 16 187 197
# 2 91 41 84 141 57 9 149 45 19 143 170 330 158 72 9 14 189 199
# 4 93 41 82 159 63 9 144 46 19 143 160 309 127 63 6 10 199 207
So you have lost half of your data! Alternatively set the outliers to missing values:
Outs2NA <- vehDataRemove
Outs2NA[cbind(idx, OutVals$group)] <- NA
Outs2NA
# Comp Circ D.Circ Rad.Ra Pr.Axis.Ra Max.L.Ra Scat.Ra Elong Pr.Axis.Rect Max.L.Rect Sc.Var.Maxis Sc.Var.maxis Ra.Gyr Skew.Maxis Skew.maxis Kurt.maxis Kurt.Maxis Holl.Ra
# 1 95 48 83 178 72 10 162 42 20 159 176 379 184 70 6 16 187 197
# 2 91 41 84 141 57 9 149 45 19 143 170 330 158 72 9 14 189 199
# 3 104 50 106 209 66 10 207 32 23 158 223 635 220 73 NA 9 188 196
# 4 93 41 82 159 63 9 144 46 19 143 160 309 127 63 6 10 199 207
# 5 85 44 70 205 NA NA 149 45 19 144 241 325 188 NA 9 11 180 183
# 6 107 57 106 172 50 NA 255 26 28 169 280 957 264 85 5 9 181 183

Related

use pivot_longer to created multiple value columns

Can you specify multiple value columns in pivot_longer()?
My original data (posted below) I had to transpose to be in a wider format. Then I want to take this new transposed data and return it to the original format (lets assume I did some transformations/ and can't use the original data). To re-transpose back into a longer format I have to use both pivot_longer() then pivot_wider() because there are multiple values I want to be their own columns.
I would like to avoid the pivot_wider() and just use pivot_longer() when re-transposing the data back if possible.
As a side note the unique identifier for each row is the combination of id and report.
Code
dfa <- dfx %>%
pivot_wider(
id_cols = id,
names_from = report,
values_from = c(pts,
p1, p2, p3,p4,p5,
d1,d2,d3,d4,d5)
)
df_return <- dfa %>%
pivot_longer(cols = !id,
names_to = c('vars','report'),
names_pattern = "([a-z0-9]+)_(.*)",
values_drop_na = TRUE) %>%
pivot_wider(id_cols = c(id, report),
names_from = vars,
values_from = value)
Data
structure(list(pts = c(431L, 167L, 167L, 760L, 348L, 768L, 619L,
169L, 416L, 155L, 47L, 37L, 6L, 17L, 22L, 1L, 149L, 3L, 284L,
7L), d1 = c(129L, 48L, 52L, 166L, 90L, 178L, 184L, 20L, 158L,
42L, 3L, 15L, 2L, 7L, 9L, 0L, 54L, 1L, 69L, 6L), d2 = c(172L,
67L, 64L, 257L, 132L, 255L, 261L, 30L, 201L, 61L, 9L, 20L, 2L,
9L, 12L, 0L, 69L, 1L, 123L, 6L), d3 = c(205L, 77L, 73L, 312L,
153L, 307L, 310L, 39L, 235L, 70L, 12L, 21L, 2L, 10L, 12L, 0L,
77L, 2L, 139L, 6L), d4 = c(227L, 81L, 82L, 363L, 177L, 350L,
342L, 52L, 257L, 75L, 15L, 24L, 2L, 12L, 13L, 0L, 86L, 2L, 151L,
6L), d5 = c(248L, 88L, 92L, 414L, 192L, 387L, 374L, 66L, 279L,
86L, 16L, 26L, 2L, 12L, 15L, 0L, 90L, 3L, 164L, 7L), report = c("2006",
"2006", "2006", "2006", "2006", "2006", "2006", "2006", "2006",
"2006", "2006", "2006", "2006", "2006", "2006", "2006", "2006",
"2006", "2006", "2006"), p1 = c(1.0360364394094, 1.22979866735429,
1.21423740998677, 0.87891144382145, 0.810310827130179, 0.965901663505148,
1.02621739486337, 0.69319116444678, 1.18938130906092, 1.04220816515009,
0.683545688193799, 1.05179228560845, 1.51468104603873, 1.15200888955888,
0.948041330809858, 0, 1.23227405154205, 3.11155226007598, 0.908056299174703,
1.57712371536702), p2 = c(0.986884800185635, 1.23066225499351,
1.07336930339221, 0.966734485786667, 0.87421381769247, 0.974775549615439,
1.06274655160121, 0.705150638862953, 1.12934487417415, 1.10234720984265,
1.11084642794988, 1.06558505521222, 1.0197697665798, 1.15605466288868,
1.01469386643771, 0, 1.17689541437029, 1.42783711234222, 1.16124019281912,
1.27756288696848), p3 = c(0.993575954694177, 1.17968893104311,
1.02608313159672, 0.965200422661265, 0.862910478266102, 0.976436243011877,
1.06679768502287, 0.722966824498357, 1.12591016481614, 1.05867627021151,
1.11227024088529, 0.98275117259764, 0.803738347803303, 1.09341228936369,
0.878291424560146, 0, 1.10500006213832, 1.93128861370172, 1.0949534752299,
1.14755029569502), p4 = c(0.986244633210798, 1.08520792731261,
1.01128789684232, 0.977245321880205, 0.89785754450165, 0.981536130349165,
1.04454959427709, 0.807825580390444, 1.1035817255901, 1.00192975678877,
1.14371311954082, 1.02812279984398, 0.66742040677939, 1.15526702119886,
0.878479047328667, 0, 1.10559111180852, 1.4717526513624, 1.05479137550321,
1.07005088091939), p5 = c(0.992583778223324, 1.06016737802091,
1.02253158347207, 1.00026491073882, 0.896290873874826, 0.985549150023704,
1.04187931404895, 0.886647217836043, 1.09837506943384, 1.0323002052873,
1.05833769015682, 1.05042831618603, 0.592515872759586, 1.05106420250504,
0.961672664191663, 0, 1.05868657273466, 1.81304485775152, 1.04168095802127,
1.19437925124365), id = c("ID 1", "ID 2", "ID 3", "ID 4", "ID 5",
"ID 6", "ID 7", "ID 8", "ID 9", "ID 10", "ID 11", "ID 12", "ID 13",
"ID 14", "ID 15", "ID 16", "ID 17", "ID 18", "ID 19", "ID 20"
)), row.names = c(NA, 20L), class = "data.frame")
We may need the .value in the names_to, which selects the prefix part of the column name before the _ as the column value and the 'report' will return the suffix column name
library(tidyr)
pivot_longer(dfa, cols = -id, names_to = c(".value", "report"),
names_sep = "_")
-output
# A tibble: 20 × 13
id report pts p1 p2 p3 p4 p5 d1 d2 d3 d4 d5
<chr> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int> <int> <int> <int>
1 ID 1 2006 431 1.04 0.987 0.994 0.986 0.993 129 172 205 227 248
2 ID 2 2006 167 1.23 1.23 1.18 1.09 1.06 48 67 77 81 88
3 ID 3 2006 167 1.21 1.07 1.03 1.01 1.02 52 64 73 82 92
4 ID 4 2006 760 0.879 0.967 0.965 0.977 1.00 166 257 312 363 414
5 ID 5 2006 348 0.810 0.874 0.863 0.898 0.896 90 132 153 177 192
6 ID 6 2006 768 0.966 0.975 0.976 0.982 0.986 178 255 307 350 387
7 ID 7 2006 619 1.03 1.06 1.07 1.04 1.04 184 261 310 342 374
8 ID 8 2006 169 0.693 0.705 0.723 0.808 0.887 20 30 39 52 66
9 ID 9 2006 416 1.19 1.13 1.13 1.10 1.10 158 201 235 257 279
10 ID 10 2006 155 1.04 1.10 1.06 1.00 1.03 42 61 70 75 86
11 ID 11 2006 47 0.684 1.11 1.11 1.14 1.06 3 9 12 15 16
12 ID 12 2006 37 1.05 1.07 0.983 1.03 1.05 15 20 21 24 26
13 ID 13 2006 6 1.51 1.02 0.804 0.667 0.593 2 2 2 2 2
14 ID 14 2006 17 1.15 1.16 1.09 1.16 1.05 7 9 10 12 12
15 ID 15 2006 22 0.948 1.01 0.878 0.878 0.962 9 12 12 13 15
16 ID 16 2006 1 0 0 0 0 0 0 0 0 0 0
17 ID 17 2006 149 1.23 1.18 1.11 1.11 1.06 54 69 77 86 90
18 ID 18 2006 3 3.11 1.43 1.93 1.47 1.81 1 1 2 2 3
19 ID 19 2006 284 0.908 1.16 1.09 1.05 1.04 69 123 139 151 164
20 ID 20 2006 7 1.58 1.28 1.15 1.07 1.19 6 6 6 6 7

How to apply function to specific columns based upon column name?

I am working with a wide data set resembling the following:
I am looking to write a function that I can iterate over sets of columns with similar names, but with different names. For the sake of simplicity here in terms of the function itself, I'll just create a function that takes the mean of two columns.
avg <- function(data, scorecol, distcol) {
ScoreDistanceAvg = (scorecol + distcol)/2
data$ScoreDistanceAvg <- ScoreDistanceAvg
return(data)
}
avg(data = dat, scorecol = dat$ScoreGame0, distcol = dat$DistanceGame0)
How can I apply the new function to sets of columns with repeated names but different numbers? That is, how could I create a column that takes the mean of ScoreGame0 and DistanceGame0, then create a column that takes the mean of ScoreGame5 and DistanceGame5, and so on? This would be the final output:
Of course, I could just run the function multiple times, but since my full data set is much larger, how could I automate this process? I imagine it involves apply, but I'm not sure how to use apply with a repeated pattern like that. Additionally, I imagine it may involve rewriting the function to better automate the naming of columns.
Data:
structure(list(Player = c("Lebron James", "Lebron James", "Lebron James",
"Lebron James", "Lebron James", "Lebron James", "Lebron James",
"Lebron James", "Lebron James", "Lebron James", "Lebron James",
"Lebron James", "Steph Curry", "Steph Curry", "Steph Curry",
"Steph Curry", "Steph Curry", "Steph Curry", "Steph Curry", "Steph Curry",
"Steph Curry", "Steph Curry", "Steph Curry", "Steph Curry"),
Game = c(0L, 1L, 2L, 3L, 4L, 5L, 0L, 1L, 2L, 3L, 4L, 5L,
0L, 1L, 2L, 3L, 4L, 5L, 0L, 1L, 2L, 3L, 4L, 5L), ScoreGame0 = c(32L,
32L, 32L, 32L, 32L, 32L, 44L, 44L, 44L, 44L, 44L, 44L, 45L,
45L, 45L, 45L, 45L, 45L, 76L, 76L, 76L, 76L, 76L, 76L), ScoreGame5 = c(27L,
27L, 27L, 27L, 27L, 27L, 12L, 12L, 12L, 12L, 12L, 12L, 76L,
76L, 76L, 76L, 76L, 76L, 32L, 32L, 32L, 32L, 32L, 32L), DistanceGame0 = c(12L,
12L, 12L, 12L, 12L, 12L, 79L, 79L, 79L, 79L, 79L, 79L, 18L,
18L, 18L, 18L, 18L, 18L, 88L, 88L, 88L, 88L, 88L, 88L), DistanceGame5 = c(13L,
13L, 13L, 13L, 13L, 13L, 34L, 34L, 34L, 34L, 34L, 34L, 42L,
42L, 42L, 42L, 42L, 42L, 54L, 54L, 54L, 54L, 54L, 54L)), class = "data.frame", row.names = c(NA,
-24L))
Rewrite your function slightly and use it in mapply by greping over the columns. sort makes this even safer.
avg <- function(scorecol, distcol) {
(scorecol + distcol)/2
}
mapply(avg, dat[sort(grep('ScoreGame', names(dat)))], dat[sort(grep('DistanceGame', names(dat)))])
# ScoreGame0 ScoreGame5
# [1,] 22.0 20
# [2,] 22.0 20
# [3,] 22.0 20
# [4,] 22.0 20
# [5,] 22.0 20
# [6,] 22.0 20
# [7,] 61.5 23
# [8,] 61.5 23
# [9,] 61.5 23
# [10,] 61.5 23
# [11,] 61.5 23
# [12,] 61.5 23
# [13,] 31.5 59
# [14,] 31.5 59
# [15,] 31.5 59
# [16,] 31.5 59
# [17,] 31.5 59
# [18,] 31.5 59
# [19,] 82.0 43
# [20,] 82.0 43
# [21,] 82.0 43
# [22,] 82.0 43
# [23,] 82.0 43
# [24,] 82.0 43
To see what grep does try
grep('DistanceGame', names(dat), value=TRUE)
# [1] "DistanceGame0" "DistanceGame5"
in Base R:
cols_used <- names(df[, -(1:2)])
f <- sub("[^0-9]+", 'ScoreDistance', cols_used)
data.frame(lapply(split.default(df[cols_used], f), rowMeans))
ScoreDistance0 ScoreDistance5
1 22.0 20
2 22.0 20
3 22.0 20
4 22.0 20
5 22.0 20
6 22.0 20
7 61.5 23
8 61.5 23
9 61.5 23
10 61.5 23
11 61.5 23
12 61.5 23
13 31.5 59
14 31.5 59
15 31.5 59
16 31.5 59
17 31.5 59
18 31.5 59
19 82.0 43
20 82.0 43
21 82.0 43
22 82.0 43
23 82.0 43
24 82.0 43
Using tidyverse:
Here's a solution with a forloop and readr:
library(readr)
game_num <- names(dat) |>
readr::parse_number() |>
na.omit()
for(i in unique(game_num)) {
avg <- paste0("ScoreDistanceAvg", i)
score <- paste0("ScoreGame", i)
distance <- paste0("DistanceGame", i)
dat[[avg]] <- (dat[[score]] + dat[[distance]])/2
}
Which gives:
Player Game ScoreGame0 ScoreGame5 DistanceGame0 DistanceGame5 ScoreDistanceAvg0 ScoreDistanceAvg5
1 Lebron James 0 32 27 12 13 22.0 20
2 Lebron James 1 32 27 12 13 22.0 20
3 Lebron James 2 32 27 12 13 22.0 20
4 Lebron James 3 32 27 12 13 22.0 20
5 Lebron James 4 32 27 12 13 22.0 20
6 Lebron James 5 32 27 12 13 22.0 20
7 Lebron James 0 44 12 79 34 61.5 23
8 Lebron James 1 44 12 79 34 61.5 23
9 Lebron James 2 44 12 79 34 61.5 23
10 Lebron James 3 44 12 79 34 61.5 23
11 Lebron James 4 44 12 79 34 61.5 23
12 Lebron James 5 44 12 79 34 61.5 23
13 Steph Curry 0 45 76 18 42 31.5 59

Is there a way to replace NAs in R using horizontal order?

I have the following data frame:
df <-structure(list(time = c("12:00:00", "12:05:00", "12:10:00", "12:15:00",
"12:20:00", "12:25:00", "12:30:00", "12:35:00", "12:40:00", "12:45:00",
"12:50:00", "12:55:00", "13:00:00", "13:05:00", "13:10:00", "13:15:00",
"13:20:00", "13:25:00"), speedA = c(60L, 75L, 65L, 45L, 12L,
15L, 20L, 45L, 65L, 60L, 60L, 30L, 35L, 45L, 25L, 15L, 10L, 5L
), speedB = c(50L, 30L, NA, 40L, NA, NA, 18L, NA, NA, NA, 15L,
10L, 25L, NA, NA, 12L, NA, NA), speedC = c(60L, 25L, NA, NA,
30L, 15L, 50L, 60L, NA, 35L, 34L, NA, 15L, 64L, 10L, 7L, 60L,
60L), speedD = c(NA, 10L, 60L, NA, 50L, 55L, 45L, 35L, NA, NA,
45L, 60L, 35L, 34L, 36L, 39L, 48L, 47L)), class = "data.frame", row.names = c(NA,
-18L))
I want to replace the NAs with values using interpolation between the horizontal values at the same row of each NA.
The expected result:
df2<- structure(list(time = c("12:00:00", "12:05:00", "12:10:00", "12:15:00",
"12:20:00", "12:25:00", "12:30:00", "12:35:00", "12:40:00", "12:45:00",
"12:50:00", "12:55:00", "13:00:00", "13:05:00", "13:10:00", "13:15:00",
"13:20:00", "13:25:00"), speedA = c(60L, 75L, 65L, 45L, 12L,
15L, 20L, 45L, 65L, 60L, 60L, 30L, 35L, 45L, 25L, 15L, 10L, 5L
), speedB = c(50, 30, 63.33333, 40, 21, 15, 18, 52.5, 65, 47.5,
15, 10, 25, 54.5, 17.5, 12, 35, 32.5), speedC = c(60, 25, 61.66667,
40, 30, 15, 50, 60, 65, 35, 34, 35, 15, 64, 10, 7, 60, 60), speedD = c(60L,
10L, 60L, 40L, 50L, 55L, 45L, 35L, 65L, 35L, 45L, 60L, 35L, 34L,
36L, 39L, 48L, 47L)), class = "data.frame", row.names = c(NA,
-18L))
We can use zoo::na.approx to interpolate values. For values which we are not able to interpolate (NA values at the last) we use tidyr::fill to fill it.
library(dplyr)
library(tidyr)
df %>%
pivot_longer(cols = -time) %>%
group_by(time) %>%
mutate(value = zoo::na.approx(value, na.rm = FALSE)) %>%
fill(value) %>%
pivot_wider()
# time speedA speedB speedC speedD
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 12:00:00 60 50 60 60
# 2 12:05:00 75 30 25 10
# 3 12:10:00 65 63.333 61.667 60
# 4 12:15:00 45 40 40 40
# 5 12:20:00 12 21 30 50
# 6 12:25:00 15 15 15 55
# 7 12:30:00 20 18 50 45
# 8 12:35:00 45 52.5 60 35
# 9 12:40:00 65 65 65 65
#10 12:45:00 60 47.5 35 35
#11 12:50:00 60 15 34 45
#12 12:55:00 30 10 35 60
#13 13:00:00 35 25 15 35
#14 13:05:00 45 54.5 64 34
#15 13:10:00 25 17.5 10 36
#16 13:15:00 15 12 7 39
#17 13:20:00 10 35 60 48
#18 13:25:00 5 32.5 60 47
You can use zoo::na.approx() row-wise with c_across().
library(dplyr)
library(tidyr)
library(zoo)
df %>%
rowwise() %>%
mutate(speed = list(na.locf(na.approx(c_across(-time), na.rm = FALSE))), .keep = "unused") %>%
unnest_wider(speed, names_sep = "")
# # A tibble: 18 x 5
# time speed1 speed2 speed3 speed4
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 12:00:00 60 50 60 60
# 2 12:05:00 75 30 25 10
# 3 12:10:00 65 63.3 61.7 60
# 4 12:15:00 45 40 40 40
# 5 12:20:00 12 21 30 50
# 6 12:25:00 15 15 15 55
# 7 12:30:00 20 18 50 45
# 8 12:35:00 45 52.5 60 35
# 9 12:40:00 65 65 65 65
# 10 12:45:00 60 47.5 35 35
# 11 12:50:00 60 15 34 45
# 12 12:55:00 30 10 35 60
# 13 13:00:00 35 25 15 35
# 14 13:05:00 45 54.5 64 34
# 15 13:10:00 25 17.5 10 36
# 16 13:15:00 15 12 7 39
# 17 13:20:00 10 35 60 48
# 18 13:25:00 5 32.5 60 47

Split data frame by class regarding to OID

I try to split dataframe by 50% by class. However, I do not want to split fields with the same OID (object identifier). I would like the fields with the same OID to be in the same set.
#Data frame:
"b1""b2""b3""CLASS" "OID"
110 134 119 "tree" 1
112 133 118 "tree" 1
105 125 110 "tree" 2
112 132 117 "tree" 2
109 125 115 "meadow" 6
93 110 101 "meadow" 6
86 106 95 "meadow" 7
105 136 116 "meadow" 7
102 128 111 "meadow" 8
108 129 115 "meadow" 8
113 134 119 "meadow" 8
Expected data:
#Expected:
"b1""b2""b3""CLASS" "OID"
110 134 119 "tree" 1
112 133 118 "tree" 1
109 125 115 "meadow" 6
93 110 101 "meadow" 6
86 106 95 "meadow" 7
105 136 116 "meadow" 7
This selects the top half of rows in each group, plus any rows which have the same OID as the rows in that top half.
library(dplyr)
df %>%
group_by(CLASS) %>%
filter(OID %in% head(OID, n() %/% 2)) %>%
ungroup
# # A tibble: 6 x 5
# b1 b2 b3 CLASS OID
# <int> <int> <int> <chr> <int>
# 1 110 134 119 tree 1
# 2 112 133 118 tree 1
# 3 109 125 115 meadow 6
# 4 93 110 101 meadow 6
# 5 86 106 95 meadow 7
# 6 105 136 116 meadow 7
If your real data is arranged by OID like this example, you could also use top_frac
df %>%
group_by(CLASS) %>%
top_frac(.5, -OID)
# # A tibble: 6 x 5
# b1 b2 b3 CLASS OID
# <int> <int> <int> <chr> <int>
# 1 110 134 119 tree 1
# 2 112 133 118 tree 1
# 3 109 125 115 meadow 6
# 4 93 110 101 meadow 6
# 5 86 106 95 meadow 7
# 6 105 136 116 meadow 7
Your data:
df = structure(list(b1 = c(110L, 112L, 105L, 112L, 109L, 93L, 86L,
105L, 102L, 108L, 113L), b2 = c(134L, 133L, 125L, 132L, 125L,
110L, 106L, 136L, 128L, 129L, 134L), b3 = c(119L, 118L, 110L,
117L, 115L, 101L, 95L, 116L, 111L, 115L, 119L), CLASS = structure(c(2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("meadow",
"tree"), class = "factor"), OID = c(1L, 1L, 2L, 2L, 6L, 6L, 7L,
7L, 8L, 8L, 8L)), class = "data.frame", row.names = c(NA, -11L
))
First create a function to take 1/2 according to OID
func = function(x){
x[x$OID %in% x$OID[1:round(nrow(x)/2)],]
}
We randomize the way the OID are sorted
df$OID = factor(df$OID,levels=sample(unique(df$OID)))
df = df[order(df$OID),]
do.call(rbind,by(df,df$CLASS,func))
This will ensure you get random ~ 50% everytime, with complete OID

R software histogram with all printed x values

I have the following data
meterpiuimportanti[row_sub,]
Meter Numero Nodi
[1,] 608 107
[2,] 51 89
[3,] 197 81
[4,] 52 81
[5,] 192 21
[6,] 110 14
[7,] 171 13
[8,] 114 12
[9,] 252 11
[10,] 121 10
[11,] 94 10
[12,] 295 9
[13,] 341 9
[14,] 113 7
[15,] 118 5
[16,] 196 4
[17,] 91 3
[18,] 92 3
[19,] 96 3
[20,] 112 3
[21,] 345 3
[22,] 378 3
[23,] 386 3
[24,] 90 2
[25,] 105 2
[26,] 204 2
[27,] 374 2
[28,] 104 1
[29,] 287 1
[30,] 328 1
[31,] 414 1
I would like to have a full page (1024x768) histogram with x axis being the first column and Y as second column.
The problems are:
1) I don't know how to enlarge the page
2) I want that all x values must be printed on x axis and on the top of each box of the histogram I want to print the value of the y
Thanks for your help
See the code below. It uses the grDevices package. I can't remember for sure, but I think it comes with the base install.
df <-read.csv("/Data/test1.csv") #read
png(filename="output.png", width=1024, height=768) #open graphics
df <- df[order(df$x),] #order data source
mp <- barplot(df$y,axes=F) #plot w/o labels
#add value labels
text(cex=1.5, x=mp, y=df$y+par("cxy")[2]/2+1, round(df$y,2), xpd=TRUE)
axis(1,at=mp,labels=df$x, las=2) #add x labels, make'm vertical
axis(2,seq(0,max(df$y),round(max(df$y)/20))) #add y labels
dev.off()
You can make use of the ggplot2 package:
Code
library(ggplot2)
png('~/x.png',width=1024,height=768)
ggplot(d) +
aes(x=factor(V1,levels=V1),y=V2) +
geom_bar(position='dodge',stat='identity') +
xlab('V1') +
geom_text(aes(label=V2), position=position_dodge(width=0.9), vjust=-0.25)
dev.off()
Result
Data Set
d <- structure(list(V1 = c(608L, 51L, 197L, 52L, 192L, 110L, 171L,
114L, 252L, 121L, 94L, 295L, 341L, 113L, 118L, 196L, 91L, 92L,
96L, 112L, 345L, 378L, 386L, 90L, 105L, 204L, 374L, 104L, 287L,
328L, 414L), V2 = c(107L, 89L, 81L, 81L, 21L, 14L, 13L, 12L,
11L, 10L, 10L, 9L, 9L, 7L, 5L, 4L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L)), .Names = c("V1", "V2"), class = "data.frame", row.names = c(NA,
-31L))

Resources