I am working with a wide data set resembling the following:
I am looking to write a function that I can iterate over sets of columns with similar names, but with different names. For the sake of simplicity here in terms of the function itself, I'll just create a function that takes the mean of two columns.
avg <- function(data, scorecol, distcol) {
ScoreDistanceAvg = (scorecol + distcol)/2
data$ScoreDistanceAvg <- ScoreDistanceAvg
return(data)
}
avg(data = dat, scorecol = dat$ScoreGame0, distcol = dat$DistanceGame0)
How can I apply the new function to sets of columns with repeated names but different numbers? That is, how could I create a column that takes the mean of ScoreGame0 and DistanceGame0, then create a column that takes the mean of ScoreGame5 and DistanceGame5, and so on? This would be the final output:
Of course, I could just run the function multiple times, but since my full data set is much larger, how could I automate this process? I imagine it involves apply, but I'm not sure how to use apply with a repeated pattern like that. Additionally, I imagine it may involve rewriting the function to better automate the naming of columns.
Data:
structure(list(Player = c("Lebron James", "Lebron James", "Lebron James",
"Lebron James", "Lebron James", "Lebron James", "Lebron James",
"Lebron James", "Lebron James", "Lebron James", "Lebron James",
"Lebron James", "Steph Curry", "Steph Curry", "Steph Curry",
"Steph Curry", "Steph Curry", "Steph Curry", "Steph Curry", "Steph Curry",
"Steph Curry", "Steph Curry", "Steph Curry", "Steph Curry"),
Game = c(0L, 1L, 2L, 3L, 4L, 5L, 0L, 1L, 2L, 3L, 4L, 5L,
0L, 1L, 2L, 3L, 4L, 5L, 0L, 1L, 2L, 3L, 4L, 5L), ScoreGame0 = c(32L,
32L, 32L, 32L, 32L, 32L, 44L, 44L, 44L, 44L, 44L, 44L, 45L,
45L, 45L, 45L, 45L, 45L, 76L, 76L, 76L, 76L, 76L, 76L), ScoreGame5 = c(27L,
27L, 27L, 27L, 27L, 27L, 12L, 12L, 12L, 12L, 12L, 12L, 76L,
76L, 76L, 76L, 76L, 76L, 32L, 32L, 32L, 32L, 32L, 32L), DistanceGame0 = c(12L,
12L, 12L, 12L, 12L, 12L, 79L, 79L, 79L, 79L, 79L, 79L, 18L,
18L, 18L, 18L, 18L, 18L, 88L, 88L, 88L, 88L, 88L, 88L), DistanceGame5 = c(13L,
13L, 13L, 13L, 13L, 13L, 34L, 34L, 34L, 34L, 34L, 34L, 42L,
42L, 42L, 42L, 42L, 42L, 54L, 54L, 54L, 54L, 54L, 54L)), class = "data.frame", row.names = c(NA,
-24L))
Rewrite your function slightly and use it in mapply by greping over the columns. sort makes this even safer.
avg <- function(scorecol, distcol) {
(scorecol + distcol)/2
}
mapply(avg, dat[sort(grep('ScoreGame', names(dat)))], dat[sort(grep('DistanceGame', names(dat)))])
# ScoreGame0 ScoreGame5
# [1,] 22.0 20
# [2,] 22.0 20
# [3,] 22.0 20
# [4,] 22.0 20
# [5,] 22.0 20
# [6,] 22.0 20
# [7,] 61.5 23
# [8,] 61.5 23
# [9,] 61.5 23
# [10,] 61.5 23
# [11,] 61.5 23
# [12,] 61.5 23
# [13,] 31.5 59
# [14,] 31.5 59
# [15,] 31.5 59
# [16,] 31.5 59
# [17,] 31.5 59
# [18,] 31.5 59
# [19,] 82.0 43
# [20,] 82.0 43
# [21,] 82.0 43
# [22,] 82.0 43
# [23,] 82.0 43
# [24,] 82.0 43
To see what grep does try
grep('DistanceGame', names(dat), value=TRUE)
# [1] "DistanceGame0" "DistanceGame5"
in Base R:
cols_used <- names(df[, -(1:2)])
f <- sub("[^0-9]+", 'ScoreDistance', cols_used)
data.frame(lapply(split.default(df[cols_used], f), rowMeans))
ScoreDistance0 ScoreDistance5
1 22.0 20
2 22.0 20
3 22.0 20
4 22.0 20
5 22.0 20
6 22.0 20
7 61.5 23
8 61.5 23
9 61.5 23
10 61.5 23
11 61.5 23
12 61.5 23
13 31.5 59
14 31.5 59
15 31.5 59
16 31.5 59
17 31.5 59
18 31.5 59
19 82.0 43
20 82.0 43
21 82.0 43
22 82.0 43
23 82.0 43
24 82.0 43
Using tidyverse:
Here's a solution with a forloop and readr:
library(readr)
game_num <- names(dat) |>
readr::parse_number() |>
na.omit()
for(i in unique(game_num)) {
avg <- paste0("ScoreDistanceAvg", i)
score <- paste0("ScoreGame", i)
distance <- paste0("DistanceGame", i)
dat[[avg]] <- (dat[[score]] + dat[[distance]])/2
}
Which gives:
Player Game ScoreGame0 ScoreGame5 DistanceGame0 DistanceGame5 ScoreDistanceAvg0 ScoreDistanceAvg5
1 Lebron James 0 32 27 12 13 22.0 20
2 Lebron James 1 32 27 12 13 22.0 20
3 Lebron James 2 32 27 12 13 22.0 20
4 Lebron James 3 32 27 12 13 22.0 20
5 Lebron James 4 32 27 12 13 22.0 20
6 Lebron James 5 32 27 12 13 22.0 20
7 Lebron James 0 44 12 79 34 61.5 23
8 Lebron James 1 44 12 79 34 61.5 23
9 Lebron James 2 44 12 79 34 61.5 23
10 Lebron James 3 44 12 79 34 61.5 23
11 Lebron James 4 44 12 79 34 61.5 23
12 Lebron James 5 44 12 79 34 61.5 23
13 Steph Curry 0 45 76 18 42 31.5 59
Related
Can you specify multiple value columns in pivot_longer()?
My original data (posted below) I had to transpose to be in a wider format. Then I want to take this new transposed data and return it to the original format (lets assume I did some transformations/ and can't use the original data). To re-transpose back into a longer format I have to use both pivot_longer() then pivot_wider() because there are multiple values I want to be their own columns.
I would like to avoid the pivot_wider() and just use pivot_longer() when re-transposing the data back if possible.
As a side note the unique identifier for each row is the combination of id and report.
Code
dfa <- dfx %>%
pivot_wider(
id_cols = id,
names_from = report,
values_from = c(pts,
p1, p2, p3,p4,p5,
d1,d2,d3,d4,d5)
)
df_return <- dfa %>%
pivot_longer(cols = !id,
names_to = c('vars','report'),
names_pattern = "([a-z0-9]+)_(.*)",
values_drop_na = TRUE) %>%
pivot_wider(id_cols = c(id, report),
names_from = vars,
values_from = value)
Data
structure(list(pts = c(431L, 167L, 167L, 760L, 348L, 768L, 619L,
169L, 416L, 155L, 47L, 37L, 6L, 17L, 22L, 1L, 149L, 3L, 284L,
7L), d1 = c(129L, 48L, 52L, 166L, 90L, 178L, 184L, 20L, 158L,
42L, 3L, 15L, 2L, 7L, 9L, 0L, 54L, 1L, 69L, 6L), d2 = c(172L,
67L, 64L, 257L, 132L, 255L, 261L, 30L, 201L, 61L, 9L, 20L, 2L,
9L, 12L, 0L, 69L, 1L, 123L, 6L), d3 = c(205L, 77L, 73L, 312L,
153L, 307L, 310L, 39L, 235L, 70L, 12L, 21L, 2L, 10L, 12L, 0L,
77L, 2L, 139L, 6L), d4 = c(227L, 81L, 82L, 363L, 177L, 350L,
342L, 52L, 257L, 75L, 15L, 24L, 2L, 12L, 13L, 0L, 86L, 2L, 151L,
6L), d5 = c(248L, 88L, 92L, 414L, 192L, 387L, 374L, 66L, 279L,
86L, 16L, 26L, 2L, 12L, 15L, 0L, 90L, 3L, 164L, 7L), report = c("2006",
"2006", "2006", "2006", "2006", "2006", "2006", "2006", "2006",
"2006", "2006", "2006", "2006", "2006", "2006", "2006", "2006",
"2006", "2006", "2006"), p1 = c(1.0360364394094, 1.22979866735429,
1.21423740998677, 0.87891144382145, 0.810310827130179, 0.965901663505148,
1.02621739486337, 0.69319116444678, 1.18938130906092, 1.04220816515009,
0.683545688193799, 1.05179228560845, 1.51468104603873, 1.15200888955888,
0.948041330809858, 0, 1.23227405154205, 3.11155226007598, 0.908056299174703,
1.57712371536702), p2 = c(0.986884800185635, 1.23066225499351,
1.07336930339221, 0.966734485786667, 0.87421381769247, 0.974775549615439,
1.06274655160121, 0.705150638862953, 1.12934487417415, 1.10234720984265,
1.11084642794988, 1.06558505521222, 1.0197697665798, 1.15605466288868,
1.01469386643771, 0, 1.17689541437029, 1.42783711234222, 1.16124019281912,
1.27756288696848), p3 = c(0.993575954694177, 1.17968893104311,
1.02608313159672, 0.965200422661265, 0.862910478266102, 0.976436243011877,
1.06679768502287, 0.722966824498357, 1.12591016481614, 1.05867627021151,
1.11227024088529, 0.98275117259764, 0.803738347803303, 1.09341228936369,
0.878291424560146, 0, 1.10500006213832, 1.93128861370172, 1.0949534752299,
1.14755029569502), p4 = c(0.986244633210798, 1.08520792731261,
1.01128789684232, 0.977245321880205, 0.89785754450165, 0.981536130349165,
1.04454959427709, 0.807825580390444, 1.1035817255901, 1.00192975678877,
1.14371311954082, 1.02812279984398, 0.66742040677939, 1.15526702119886,
0.878479047328667, 0, 1.10559111180852, 1.4717526513624, 1.05479137550321,
1.07005088091939), p5 = c(0.992583778223324, 1.06016737802091,
1.02253158347207, 1.00026491073882, 0.896290873874826, 0.985549150023704,
1.04187931404895, 0.886647217836043, 1.09837506943384, 1.0323002052873,
1.05833769015682, 1.05042831618603, 0.592515872759586, 1.05106420250504,
0.961672664191663, 0, 1.05868657273466, 1.81304485775152, 1.04168095802127,
1.19437925124365), id = c("ID 1", "ID 2", "ID 3", "ID 4", "ID 5",
"ID 6", "ID 7", "ID 8", "ID 9", "ID 10", "ID 11", "ID 12", "ID 13",
"ID 14", "ID 15", "ID 16", "ID 17", "ID 18", "ID 19", "ID 20"
)), row.names = c(NA, 20L), class = "data.frame")
We may need the .value in the names_to, which selects the prefix part of the column name before the _ as the column value and the 'report' will return the suffix column name
library(tidyr)
pivot_longer(dfa, cols = -id, names_to = c(".value", "report"),
names_sep = "_")
-output
# A tibble: 20 × 13
id report pts p1 p2 p3 p4 p5 d1 d2 d3 d4 d5
<chr> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <int> <int> <int> <int>
1 ID 1 2006 431 1.04 0.987 0.994 0.986 0.993 129 172 205 227 248
2 ID 2 2006 167 1.23 1.23 1.18 1.09 1.06 48 67 77 81 88
3 ID 3 2006 167 1.21 1.07 1.03 1.01 1.02 52 64 73 82 92
4 ID 4 2006 760 0.879 0.967 0.965 0.977 1.00 166 257 312 363 414
5 ID 5 2006 348 0.810 0.874 0.863 0.898 0.896 90 132 153 177 192
6 ID 6 2006 768 0.966 0.975 0.976 0.982 0.986 178 255 307 350 387
7 ID 7 2006 619 1.03 1.06 1.07 1.04 1.04 184 261 310 342 374
8 ID 8 2006 169 0.693 0.705 0.723 0.808 0.887 20 30 39 52 66
9 ID 9 2006 416 1.19 1.13 1.13 1.10 1.10 158 201 235 257 279
10 ID 10 2006 155 1.04 1.10 1.06 1.00 1.03 42 61 70 75 86
11 ID 11 2006 47 0.684 1.11 1.11 1.14 1.06 3 9 12 15 16
12 ID 12 2006 37 1.05 1.07 0.983 1.03 1.05 15 20 21 24 26
13 ID 13 2006 6 1.51 1.02 0.804 0.667 0.593 2 2 2 2 2
14 ID 14 2006 17 1.15 1.16 1.09 1.16 1.05 7 9 10 12 12
15 ID 15 2006 22 0.948 1.01 0.878 0.878 0.962 9 12 12 13 15
16 ID 16 2006 1 0 0 0 0 0 0 0 0 0 0
17 ID 17 2006 149 1.23 1.18 1.11 1.11 1.06 54 69 77 86 90
18 ID 18 2006 3 3.11 1.43 1.93 1.47 1.81 1 1 2 2 3
19 ID 19 2006 284 0.908 1.16 1.09 1.05 1.04 69 123 139 151 164
20 ID 20 2006 7 1.58 1.28 1.15 1.07 1.19 6 6 6 6 7
I was wondering how I could go about trying to take outliers from Boxplot$out (returns the outliers within the data) and put them into a table which shows the class they belong to e.g. if outlier is from class "Van", "Bus, "Saab" etc..
I have tried using which() function but this returns only the index of the outlier and not the class. I am not sure how to go about putting this into a table.
Any help would be greatly appreciated!
library(reshape2)
vehData <-
structure(
list(
Samples = 1:6,
Comp = c(95L, 91L, 104L, 93L, 85L,
107L),
Circ = c(48L, 41L, 50L, 41L, 44L, 57L),
D.Circ = c(83L,
84L, 106L, 82L, 70L, 106L),
Rad.Ra = c(178L, 141L, 209L, 159L,
205L, 172L),
Pr.Axis.Ra = c(72L, 57L, 66L, 63L, 103L, 50L),
Max.L.Ra = c(10L,
9L, 10L, 9L, 52L, 6L),
Scat.Ra = c(162L, 149L, 207L, 144L, 149L,
255L),
Elong = c(42L, 45L, 32L, 46L, 45L, 26L),
Pr.Axis.Rect = c(20L,
19L, 23L, 19L, 19L, 28L),
Max.L.Rect = c(159L, 143L, 158L, 143L,
144L, 169L),
Sc.Var.Maxis = c(176L, 170L, 223L, 160L, 241L, 280L),
Sc.Var.maxis = c(379L, 330L, 635L, 309L, 325L, 957L),
Ra.Gyr = c(184L,
158L, 220L, 127L, 188L, 264L),
Skew.Maxis = c(70L, 72L, 73L,
63L, 127L, 85L),
Skew.maxis = c(6L, 9L, 14L, 6L, 9L, 5L),
Kurt.maxis = c(16L,
14L, 9L, 10L, 11L, 9L),
Kurt.Maxis = c(187L, 189L, 188L, 199L,
180L, 181L),
Holl.Ra = c(197L, 199L, 196L, 207L, 183L, 183L),
Class = c("van", "van", "saab", "van", "bus", "bus")
),
row.names = c(NA,
6L), class = "data.frame")
#Remove outliers
removeOutliers <- function(data) {
OutVals <- boxplot(data)$out
remOutliers <- sapply(data, function(x) x[!x %in% OutVals])
return (remOutliers)
}
vehDataRemove1 <- vehData[, -1]
vehDataRemove2 <- vehDataRemove1[,-19]
vehData <- vehDataRemove2
vehClass <- vehData$Class
boxplot(vehData)
#Begin removing outliers
removeOutliers1 <- removeOutliers(vehData)
removeOutliers2 <- removeOutliers(removeOutliers1)
This can be simplified. Starting with your data frame vehData. First get the rownumbers of the outliers. In my comment I accidentally left out the seq() function:
vehDataRemove <- vehData[, -c(1, 20)]
OutVals <- boxplot(vehDataRemove)
idx <- sapply(seq(length(OutVals$out)), function(x) which(vehDataRemove[, OutVals$group[x]] == OutVals$out[x]))
idx
# [1] 5 5 6 5 3
Notice that three outliers are in the 5th row. Now remove the rows with outliers:
NoOuts <- vehDataRemove[-unique(idx), ]
NoOuts
# Comp Circ D.Circ Rad.Ra Pr.Axis.Ra Max.L.Ra Scat.Ra Elong Pr.Axis.Rect Max.L.Rect Sc.Var.Maxis Sc.Var.maxis Ra.Gyr Skew.Maxis Skew.maxis Kurt.maxis Kurt.Maxis Holl.Ra
# 1 95 48 83 178 72 10 162 42 20 159 176 379 184 70 6 16 187 197
# 2 91 41 84 141 57 9 149 45 19 143 170 330 158 72 9 14 189 199
# 4 93 41 82 159 63 9 144 46 19 143 160 309 127 63 6 10 199 207
So you have lost half of your data! Alternatively set the outliers to missing values:
Outs2NA <- vehDataRemove
Outs2NA[cbind(idx, OutVals$group)] <- NA
Outs2NA
# Comp Circ D.Circ Rad.Ra Pr.Axis.Ra Max.L.Ra Scat.Ra Elong Pr.Axis.Rect Max.L.Rect Sc.Var.Maxis Sc.Var.maxis Ra.Gyr Skew.Maxis Skew.maxis Kurt.maxis Kurt.Maxis Holl.Ra
# 1 95 48 83 178 72 10 162 42 20 159 176 379 184 70 6 16 187 197
# 2 91 41 84 141 57 9 149 45 19 143 170 330 158 72 9 14 189 199
# 3 104 50 106 209 66 10 207 32 23 158 223 635 220 73 NA 9 188 196
# 4 93 41 82 159 63 9 144 46 19 143 160 309 127 63 6 10 199 207
# 5 85 44 70 205 NA NA 149 45 19 144 241 325 188 NA 9 11 180 183
# 6 107 57 106 172 50 NA 255 26 28 169 280 957 264 85 5 9 181 183
I have the following data frame:
df <-structure(list(time = c("12:00:00", "12:05:00", "12:10:00", "12:15:00",
"12:20:00", "12:25:00", "12:30:00", "12:35:00", "12:40:00", "12:45:00",
"12:50:00", "12:55:00", "13:00:00", "13:05:00", "13:10:00", "13:15:00",
"13:20:00", "13:25:00"), speedA = c(60L, 75L, 65L, 45L, 12L,
15L, 20L, 45L, 65L, 60L, 60L, 30L, 35L, 45L, 25L, 15L, 10L, 5L
), speedB = c(50L, 30L, NA, 40L, NA, NA, 18L, NA, NA, NA, 15L,
10L, 25L, NA, NA, 12L, NA, NA), speedC = c(60L, 25L, NA, NA,
30L, 15L, 50L, 60L, NA, 35L, 34L, NA, 15L, 64L, 10L, 7L, 60L,
60L), speedD = c(NA, 10L, 60L, NA, 50L, 55L, 45L, 35L, NA, NA,
45L, 60L, 35L, 34L, 36L, 39L, 48L, 47L)), class = "data.frame", row.names = c(NA,
-18L))
I want to replace the NAs with values using interpolation between the horizontal values at the same row of each NA.
The expected result:
df2<- structure(list(time = c("12:00:00", "12:05:00", "12:10:00", "12:15:00",
"12:20:00", "12:25:00", "12:30:00", "12:35:00", "12:40:00", "12:45:00",
"12:50:00", "12:55:00", "13:00:00", "13:05:00", "13:10:00", "13:15:00",
"13:20:00", "13:25:00"), speedA = c(60L, 75L, 65L, 45L, 12L,
15L, 20L, 45L, 65L, 60L, 60L, 30L, 35L, 45L, 25L, 15L, 10L, 5L
), speedB = c(50, 30, 63.33333, 40, 21, 15, 18, 52.5, 65, 47.5,
15, 10, 25, 54.5, 17.5, 12, 35, 32.5), speedC = c(60, 25, 61.66667,
40, 30, 15, 50, 60, 65, 35, 34, 35, 15, 64, 10, 7, 60, 60), speedD = c(60L,
10L, 60L, 40L, 50L, 55L, 45L, 35L, 65L, 35L, 45L, 60L, 35L, 34L,
36L, 39L, 48L, 47L)), class = "data.frame", row.names = c(NA,
-18L))
We can use zoo::na.approx to interpolate values. For values which we are not able to interpolate (NA values at the last) we use tidyr::fill to fill it.
library(dplyr)
library(tidyr)
df %>%
pivot_longer(cols = -time) %>%
group_by(time) %>%
mutate(value = zoo::na.approx(value, na.rm = FALSE)) %>%
fill(value) %>%
pivot_wider()
# time speedA speedB speedC speedD
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 12:00:00 60 50 60 60
# 2 12:05:00 75 30 25 10
# 3 12:10:00 65 63.333 61.667 60
# 4 12:15:00 45 40 40 40
# 5 12:20:00 12 21 30 50
# 6 12:25:00 15 15 15 55
# 7 12:30:00 20 18 50 45
# 8 12:35:00 45 52.5 60 35
# 9 12:40:00 65 65 65 65
#10 12:45:00 60 47.5 35 35
#11 12:50:00 60 15 34 45
#12 12:55:00 30 10 35 60
#13 13:00:00 35 25 15 35
#14 13:05:00 45 54.5 64 34
#15 13:10:00 25 17.5 10 36
#16 13:15:00 15 12 7 39
#17 13:20:00 10 35 60 48
#18 13:25:00 5 32.5 60 47
You can use zoo::na.approx() row-wise with c_across().
library(dplyr)
library(tidyr)
library(zoo)
df %>%
rowwise() %>%
mutate(speed = list(na.locf(na.approx(c_across(-time), na.rm = FALSE))), .keep = "unused") %>%
unnest_wider(speed, names_sep = "")
# # A tibble: 18 x 5
# time speed1 speed2 speed3 speed4
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 12:00:00 60 50 60 60
# 2 12:05:00 75 30 25 10
# 3 12:10:00 65 63.3 61.7 60
# 4 12:15:00 45 40 40 40
# 5 12:20:00 12 21 30 50
# 6 12:25:00 15 15 15 55
# 7 12:30:00 20 18 50 45
# 8 12:35:00 45 52.5 60 35
# 9 12:40:00 65 65 65 65
# 10 12:45:00 60 47.5 35 35
# 11 12:50:00 60 15 34 45
# 12 12:55:00 30 10 35 60
# 13 13:00:00 35 25 15 35
# 14 13:05:00 45 54.5 64 34
# 15 13:10:00 25 17.5 10 36
# 16 13:15:00 15 12 7 39
# 17 13:20:00 10 35 60 48
# 18 13:25:00 5 32.5 60 47
I have a matrix that has been ordered by rowSums(). I now want to take a selected few of these rows, by passing a char vector of row names, and easily move them back at the top of the matrix while keeping the moved rows in the same order as they are in the selection vector.
I've tried to do this with various combinations of subset() or just straight index selection, but I can never get the resulting matrix in the order I want, if it works at all. I feel like there has to be a more straightforward way to do this.
Let's say I have a matrix mat ordered by rowSums():
sam1 sam2 sam3 sam4 sam5
sig1 1 2 3 4 5
sig2 6 7 8 9 10
sig3 11 12 13 14 15
sig4a 16 17 18 19 20
sig4b 21 22 23 24 25
sig4c 26 27 28 29 30
sig5 31 32 33 34 35
sig6 36 37 38 39 40
sig7a 41 42 43 44 45
aig7b 46 47 48 49 50
And I want to take a select number of rows I'm interested in:
select = c('sig6','sig4a','sig2')
And move them back to the top of the matrix, while keeping them in the order in the select vector, while leaving the remaining unselected rows below them to get a new matrix:
sam1 sam2 sam3 sam4 sam5
sig6 36 37 38 39 40 *
sig4a 16 17 18 19 20 *
sig2 6 7 8 9 10 *
sig1 1 2 3 4 5
sig3 11 12 13 14 15
sig4b 21 22 23 24 25
sig4c 26 27 28 29 30
sig5 31 32 33 34 35
sig7a 41 42 43 44 45
aig7b 46 47 48 49 50
Is there a straightforward way to do this that doesn't involve making intermediate matrices or complicated workarounds? It seems like there should be, but I haven't been able to find a solution. Maybe I am overlooking something.
An option is to specify the vector of row names first followed by the ones that are left with setdiff
mat[c(select, setdiff(row.names(mat), select)),]
#. sam1 sam2 sam3 sam4 sam5
#sig6 36 37 38 39 40
#sig4a 16 17 18 19 20
#sig2 6 7 8 9 10
#sig1 1 2 3 4 5
#sig3 11 12 13 14 15
#sig4b 21 22 23 24 25
#sig4c 26 27 28 29 30
#sig5 31 32 33 34 35
#sig7a 41 42 43 44 45
#aig7b 46 47 48 49 50
data
mat <- structure(c(1L, 6L, 11L, 16L, 21L, 26L, 31L, 36L, 41L, 46L, 2L,
7L, 12L, 17L, 22L, 27L, 32L, 37L, 42L, 47L, 3L, 8L, 13L, 18L,
23L, 28L, 33L, 38L, 43L, 48L, 4L, 9L, 14L, 19L, 24L, 29L, 34L,
39L, 44L, 49L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 40L, 45L, 50L
), .Dim = c(10L, 5L), .Dimnames = list(c("sig1", "sig2", "sig3",
"sig4a", "sig4b", "sig4c", "sig5", "sig6", "sig7a", "aig7b"),
c("sam1", "sam2", "sam3", "sam4", "sam5")))
Suppose I have the following dataframe:
dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
4: 50 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
5: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
6: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
lc wc li yd yr nF factdcx
1: 1 3 TRUE 1 2010 2 24
2: 1 3 TRUE 1 2010 8 41
3: 2 3 TRUE 1 2010 0 48
4: 2 3 TRUE 1 2010 0 50
5: 2 3 TRUE 1 2010 0 52
6: 3 3 FALSE 1 2010 0 57
I'd like to turn it into a new dataframe like the following:
dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
4: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
5: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
lc wc li yd yr nF factdcx
1: 1 3 TRUE 1 2010 2 24
2: 1 3 TRUE 1 2010 8 41
3: 2 3 TRUE 1 2010 0 (sum of nF for 48 and 50, factdcx) 48
4: 2 3 TRUE 1 2010 0 52
5: 3 3 FALSE 1 2010 0 57
How can I do it? (Surely, the dataframe, abc, is much larger, but I want the sum of all categories of 48 and 50 and group it into a new category, say '48').
Many thanks!
> dput(head(abc1))
structure(list(dc = c(24L, 41L, 48L, 50L, 52L, 57L), tmin = c(-1L,
-3L, 0L, 0L, 3L, -2L), tmax = c(4L, 5L, 5L, 5L, 5L, 5L), cint = c(5L,
8L, 5L, 5L, 2L, 7L), wcmin = c(-5L, -8L, -4L, -4L, -3L, -6L),
wcmax = c(-2L, -3L, 0L, 0L, 1L, -1L), wsmin = c(20L, 15L,
30L, 30L, 20L, 25L), wsmax = c(25L, 20L, 35L, 35L, 25L, 30L
), gsmin = c(35L, 35L, 45L, 45L, 35L, 35L), gsmax = c(40L,
40L, 50L, 50L, 40L, 40L), wd = c(90L, 90L, 45L, 45L, 45L,
315L), rmin = c(11.8, 10, 7.3, 7.3, 6.7, 4.4), rmax = c(26.6,
23.5, 19, 19, 17.4, 13.8), cir = c(14.8, 13.5, 11.7, 11.7,
10.7, 9.4), lr = c(3L, 3L, 6L, 6L, 6L, 7L), lc = c(1L, 1L,
2L, 2L, 2L, 3L), wc = c(3L, 3L, 3L, 3L, 3L, 3L), li = c(TRUE,
TRUE, TRUE, TRUE, TRUE, FALSE), yd = c(1L, 1L, 1L, 1L, 1L,
1L), yr = c(2010L, 2010L, 2010L, 2010L, 2010L, 2010L), nF = c(2L,
8L, 0L, 0L, 0L, 0L), factdcx = structure(1:6, .Label = c("24",
"41", "48", "50", "52", "57", "70"), class = "factor")), .Names = c("dc",
"tmin", "tmax", "cint", "wcmin", "wcmax", "wsmin", "wsmax", "gsmin",
"gsmax", "wd", "rmin", "rmax", "cir", "lr", "lc", "wc", "li",
"yd", "yr", "nF", "factdcx"), class = c("data.table", "data.frame"
), row.names = c(NA, -6L), .internal.selfref = <pointer: 0x054b24a0>)
Still got a problem, sir/madam:
> head(abc1 (updated))
dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
4: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
5: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
6: 70 -2 3 5 -4 -1 20 25 30 35 360 3.6 10.2 6.6 7
lc wc li yd yr nF factdcx
1: 1 3 TRUE 1 2010 2 24
2: 1 3 TRUE 1 2010 8 41
3: 2 3 TRUE 1 2010 57 48
4: 2 3 TRUE 1 2010 0 52
5: 3 3 FALSE 1 2010 0 57
6: 3 2 TRUE 1 2010 1 70
The sum of nF was incorrect, it should be zero.
Try
library(data.table)
unique(setDT(df1)[, factdcx:= as.character(factdcx)][factdcx %chin%
c('48','50'), c('dc', 'factdcx', 'nF') := list('48', '48', sum(nF))])
# dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
#1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
#2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
#3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
#4: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
#5: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
# lc wc li yd yr nF factdcx
#1: 1 3 TRUE 1 2010 2 24
#2: 1 3 TRUE 1 2010 8 41
#3: 2 3 TRUE 1 2010 0 48
#4: 2 3 TRUE 1 2010 0 52
#5: 3 3 FALSE 1 2010 0 57
For abc1,
res1 <- unique(setDT(abc1)[, factdcx:= as.character(factdcx)][factdcx %chin%
c('48','50'), c('dc', 'factdcx', 'nF') := list(48, '48', sum(nF))])
res1
# dc tmin tmax cint wcmin wcmax wsmin wsmax gsmin gsmax wd rmin rmax cir lr
#1: 24 -1 4 5 -5 -2 20 25 35 40 90 11.8 26.6 14.8 3
#2: 41 -3 5 8 -8 -3 15 20 35 40 90 10.0 23.5 13.5 3
#3: 48 0 5 5 -4 0 30 35 45 50 45 7.3 19.0 11.7 6
#4: 52 3 5 2 -3 1 20 25 35 40 45 6.7 17.4 10.7 6
#5: 57 -2 5 7 -6 -1 25 30 35 40 315 4.4 13.8 9.4 7
# lc wc li yd yr nF factdcx
#1: 1 3 TRUE 1 2010 2 24
#2: 1 3 TRUE 1 2010 8 41
#3: 2 3 TRUE 1 2010 0 48
#4: 2 3 TRUE 1 2010 0 52
#5: 3 3 FALSE 1 2010 0 57
data
df1 <- structure(list(dc = structure(1:6, .Label = c("24", "41",
"48",
"50", "52", "57"), class = "factor"), tmin = c(-1L, -3L, 0L,
0L, 3L, -2L), tmax = c(4L, 5L, 5L, 5L, 5L, 5L), cint = c(5L,
8L, 5L, 5L, 2L, 7L), wcmin = c(-5L, -8L, -4L, -4L, -3L, -6L),
wcmax = c(-2L, -3L, 0L, 0L, 1L, -1L), wsmin = c(20L, 15L,
30L, 30L, 20L, 25L), wsmax = c(25L, 20L, 35L, 35L, 25L, 30L
), gsmin = c(35L, 35L, 45L, 45L, 35L, 35L), gsmax = c(40L,
40L, 50L, 50L, 40L, 40L), wd = c(90L, 90L, 45L, 45L, 45L,
315L), rmin = c(11.8, 10, 7.3, 7.3, 6.7, 4.4), rmax = c(26.6,
23.5, 19, 19, 17.4, 13.8), cir = c(14.8, 13.5, 11.7, 11.7,
10.7, 9.4), lr = c(3L, 3L, 6L, 6L, 6L, 7L), lc = c(1L, 1L,
2L, 2L, 2L, 3L), wc = c(3L, 3L, 3L, 3L, 3L, 3L), li = c(TRUE,
TRUE, TRUE, TRUE, TRUE, FALSE), yd = c(1L, 1L, 1L, 1L, 1L,
1L), yr = c(2010L, 2010L, 2010L, 2010L, 2010L, 2010L), nF = c(2L,
8L, 0L, 0L, 0L, 0L), factdcx = structure(1:6, .Label = c("24",
"41", "48", "50", "52", "57"), class = "factor")), .Names = c("dc",
"tmin", "tmax", "cint", "wcmin", "wcmax", "wsmin", "wsmax", "gsmin",
"gsmax", "wd", "rmin", "rmax", "cir", "lr", "lc", "wc", "li",
"yd", "yr", "nF", "factdcx"), row.names = c("1:", "2:", "3:",
"4:", "5:", "6:"), class = "data.frame")