I have a large data set with variables like this:
df <- data.frame(ID = c(1, 2),
                 Hei3ght1 = c(180, 192),
                 Weight1 = c(70, 90),
                 Hip1 = c(25, 29),
                 hei5ght1 = c(160, 150),
                 Hei3ght2 = c(167, 168),
                 Weight2 = c(50, 50),
                 Hip2 = c(23, 27),
                 hei5ght2 = c(160, 150),
                 Hei3ght3 = c(175, 176),
                 Weight3 = c(50, 70),
                 Hip3 = c(28, 28),
                 hei5ght3 = c(160, 150))
I would like to order the variables as follows:
ID, Hei3ght1, Hei3ght2, Hei3ght3, Weight1, Weight2, Weight3, Hip1, Hip2, Hip3, Hei5ght1, Hei5ght2, Hei5ght3
I have tried with:
df <- df[sort(names(df))]
But I do not want all the variables sorted alphabetically.
Thank you so much in advance.
UPDATE 2
df <- data.frame(ID = c(1, 2),
                 Hei3ght1 = c(180, 192),
                 Weight1 = c(70, 90),
                 Hip1 = c(25, 29),
                 hei5ght1 = c(160, 150),
                 hei5ght21 = c(160, 150),
                 Hei3ght2 = c(167, 168),
                 Weight2 = c(50, 50),
                 Hip2 = c(23, 27),
                 hei5ght2 = c(160, 150),
                 hei5ght22 = c(160, 150),
                 Hei3ght3 = c(175, 176),
                 Weight3 = c(50, 70),
                 Hip3 = c(28, 28),
                 hei5ght3 = c(160, 150),
                 hei5ght23 = c(160, 150))
An option in base R would be to convert the column names to a matrix and then back to a vector: strip the trailing timepoint digits to count the distinct measures, then fill the names into a matrix one timepoint per row:
n <- length(unique(sub("\\d+$", "", names(df)[-1])))
df[c('ID', c(matrix(names(df)[-1], ncol = n, byrow = TRUE)))]
Output:
  ID Hei3ght1 Hei3ght2 Hei3ght3 Weight1 Weight2 Weight3 Hip1 Hip2 Hip3 hei5ght1 hei5ght2 hei5ght3
1  1      180      167      175      70      50      50   25   23   28      160      160      160
2  2      192      168      176      90      50      70   29   27   28      150      150      150
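To see why this works (using the first df): matrix(..., byrow = TRUE) fills one timepoint per row, and c() then reads the matrix back column by column, i.e. one measure at a time:
# The intermediate matrix: one timepoint per row, one measure per column
m <- matrix(names(df)[-1], ncol = n, byrow = TRUE)
m[, 1]  # "Hei3ght1" "Hei3ght2" "Hei3ght3"
c(m)    # all Hei3ght*, then Weight*, then Hip*, then hei5ght*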
Or, when the trailing index is the only number in each name (underscore-separated names such as Height_1), you may use
library(data.table)
library(dplyr)
df %>%
  select(ID, order(rowid(readr::parse_number(names(.)[-1]))) + 1)
Output:
ID Height_1 Height_2 Height_3 Weight_1 Weight_2 Weight_3 Hip_1 Hip_2 Hip_3
1 1 180 167 175 70 50 50 25 23 28
2 2 192 168 176 90 50 70 29 27 28
Update
For the names in the question, where other digits appear inside the names, extract only the trailing number:
library(stringr)
df %>%
  select(ID, order(rowid(str_extract(names(.)[-1], "\\d+$"))) + 1)
Output:
ID Hei3ght1 Hei3ght2 Hei3ght3 Weight1 Weight2 Weight3 Hip1 Hip2 Hip3 hei5ght1 hei5ght2 hei5ght3
1 1 180 167 175 70 50 50 25 23 28 160 160 160
2 2 192 168 176 90 50 70 29 27 28 150 150 150
Update 2
For the second set of data (UPDATE 2 above), match only the final digit:
df %>%
  select(ID, order(rowid(str_extract(names(.)[-1], "\\d$"))) + 1)
ID Hei3ght1 Hei3ght2 Hei3ght3 Weight1 Weight2 Weight3 Hip1 Hip2 Hip3 hei5ght1 hei5ght2 hei5ght3 hei5ght21 hei5ght22 hei5ght23
1 1 180 167 175 70 50 50 25 23 28 160 160 160 160 160 160
2 2 192 168 176 90 50 70 29 27 28 150 150 150 150 150 150
This is the long version of @akrun's solution. The core idea is to pivot longer, convert the names to a factor with the desired level order, then arrange and pivot back (written for underscore-separated names such as Height_1):
library(tidyverse)
df %>%
  pivot_longer(-ID) %>%
  mutate(helper = str_replace_all(name, "[[:punct:]][0-9]+", ""),
         helper = factor(helper, levels = c("Height", "Weight", "Hip"))) %>%
  group_by(ID) %>%
  arrange(helper, .by_group = TRUE) %>%
  select(-helper) %>%
  pivot_wider(names_from = name, values_from = value)
ID Height_1 Height_2 Height_3 Weight_1 Weight_2 Weight_3 Hip_1 Hip_2 Hip_3
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 180 167 175 70 50 50 25 23 28
2 2 192 168 176 90 50 70 29 27 28
Related
I have data that looks like this:
library(dplyr)
Data <- tibble(
  ID = c("Code001", "Code001", "Code001", "Code002", "Code002", "Code002", "Code002", "Code002", "Code003", "Code003", "Code003", "Code003"),
  Value = c(107, 107, 107, 346, 346, 346, 346, 346, 123, 123, 123, 123))
I need to work out the average value per row within each group. However, the values need to be rounded (no decimal places), and within each group the rounded values need to sum back to the group's Value.
So solutions like this won't work:
Data %>%
  add_count(ID) %>%
  group_by(ID) %>%
  mutate(Prop_Value_1 = Value/n,
         Prop_Value_2 = round(Value/n))
Is there a solution that can produce an output like this:
Data %>%
  mutate(Prop_Value = c(35,36,36,69,69,69,69,70,30,31,31,31))
You can use ceiling and then row_number to get there:
Data %>%
  group_by(ID) %>%
  mutate(count = n(),
         ceil_avg = ceiling(Value/count)) %>%
  mutate(sum_ceil_avg = sum(ceil_avg),
         diff_sum = sum_ceil_avg - Value,
         rn = row_number()) %>%
  mutate(new_avg = ifelse(rn <= diff_sum,
                          ceil_avg - 1,
                          ceil_avg))
# A tibble: 12 × 8
# Groups: ID [3]
ID Value count ceil_avg sum_ceil_avg diff_sum rn new_avg
<chr> <dbl> <int> <dbl> <dbl> <dbl> <int> <dbl>
1 Code001 107 3 36 108 1 1 35
2 Code001 107 3 36 108 1 2 36
3 Code001 107 3 36 108 1 3 36
4 Code002 346 5 70 350 4 1 69
5 Code002 346 5 70 350 4 2 69
6 Code002 346 5 70 350 4 3 69
7 Code002 346 5 70 350 4 4 69
8 Code002 346 5 70 350 4 5 70
9 Code003 123 4 31 124 1 1 30
10 Code003 123 4 31 124 1 2 31
11 Code003 123 4 31 124 1 3 31
12 Code003 123 4 31 124 1 4 31
A simple solution is to use integer division:
Data %>%
  group_by(ID) %>%
  mutate(Prop_Value = ifelse(row_number() <= Value %% n(), Value %/% n() + 1, Value %/% n()))
# A tibble: 12 × 3
# Groups: ID [3]
ID Value Prop_Value
<chr> <dbl> <dbl>
1 Code001 107 36
2 Code001 107 36
3 Code001 107 35
4 Code002 346 70
5 Code002 346 69
6 Code002 346 69
7 Code002 346 69
8 Code002 346 69
9 Code003 123 31
10 Code003 123 31
11 Code003 123 31
12 Code003 123 30
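Both answers implement the same "largest remainder" idea: give every row the integer part of the group average, then hand out the leftover units one per row. If you need this in several places, it can be wrapped in a small base R helper (a sketch; distribute_evenly is a made-up name):
# Split `total` into `n` integer parts that sum exactly to `total`;
# the last (total %% n) parts get one extra unit
distribute_evenly <- function(total, n) {
  base <- total %/% n   # integer part of the average
  extra <- total %% n   # number of rows that get one extra unit
  c(rep(base, n - extra), rep(base + 1, extra))
}
distribute_evenly(107, 3)  # 35 36 36
distribute_evenly(346, 5)  # 69 69 69 69 70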
I have a df like this:
ID <- c("A01","B20","C3","D4")
Nb_data <- c(2,2,2,3)
Weight_t1 <- c(70,44,98,65)
Weight_t2 <- c(75,78,105,68)
Weight_t3 <- c(72,52,107,NA)
year1 <- c(20,28,32,50)
year2 <- c(28,32,35,60)
year3 <- c(29,35,38,NA)
LENGTHt1 <- c(175,155,198,165)
LENGTHt2 <- c(175,155,198,163)
LENGTHt3 <- c(176,154,198,NA)
df <- data.frame(ID,Nb_data,Weight_t1,Weight_t2,Weight_t3,year1,year2,year3,LENGTHt1,LENGTHt2,LENGTHt3)
In my real data, the weight/year/length columns run from t1 to t28.
I want to tidy my data like this:
ID   Nb_data  Weight  Year  Length
A01  3        70      20    175
A01  3        75      28    175
A01  3        72      29    176
B20  3        44      28    155
B20  3        78      32    155
B20  3        52      35    154
I tried
df1 <- df %>%
  pivot_longer(cols = -c('ID','Nb_data'),
               names_to = c('Weight','Year','Length'),
               names_pattern = '(Weight_t[0-9]*|year[0-9]*|LENGTHt[0-9]*)',
               values_drop_na = TRUE)
or names_pattern = '(.t[0-9])(.t[0-9])(.t[0-9])'
I have some difficulty with the regex, or maybe pivot_longer is not suitable...
You need to extract the common timepoint information from the variable names. Make this information consistent first, with a clear separator (_ in this case), then it becomes much easier.
I would do something like this
library(tidyr)
library(dplyr)
df1 <- df
names(df1) <- gsub("year", "Year_t", names(df1))
names(df1) <- gsub("LENGTH", "Length_", names(df1))
df1 %>%
  pivot_longer(cols = -c('ID','Nb_data'),
               names_to = c("name", "timepoint"),
               names_sep = "_",
               values_drop_na = TRUE) %>%
  pivot_wider(names_from = name, values_from = value)
EDIT: or shorter, using ".value" in the names_to argument (as @onyambu showed in his answer):
df1 %>%
  pivot_longer(cols = -c('ID','Nb_data'),
               names_to = c(".value", "timepoint"),
               names_sep = "_",
               values_drop_na = TRUE)
Output:
ID Nb_data timepoint Weight Year Length
<chr> <dbl> <chr> <dbl> <dbl> <dbl>
1 A01 2 t1 70 20 175
2 A01 2 t2 75 28 175
3 A01 2 t3 72 29 176
4 B20 2 t1 44 28 155
5 B20 2 t2 78 32 155
6 B20 2 t3 52 35 154
7 C3 2 t1 98 32 198
8 C3 2 t2 105 35 198
9 C3 2 t3 107 38 198
10 D4 3 t1 65 50 165
11 D4 3 t2 68 60 163
You could use pivot_longer directly, though with a bit of complex regex, as follows:
df %>%
  pivot_longer(matches("\\d+$"), names_to = c(".value", "grp"),
               names_pattern = "(.*?)[_t]{0,2}(\\d+$)",
               values_drop_na = TRUE)
# A tibble: 11 × 6
ID Nb_data grp Weight year LENGTH
<chr> <dbl> <chr> <dbl> <dbl> <dbl>
1 A01 2 1 70 20 175
2 A01 2 2 75 28 175
3 A01 2 3 72 29 176
4 B20 2 1 44 28 155
5 B20 2 2 78 32 155
6 B20 2 3 52 35 154
7 C3 2 1 98 32 198
8 C3 2 2 105 35 198
9 C3 2 3 107 38 198
10 D4 3 1 65 50 165
11 D4 3 2 68 60 163
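In the pattern, the lazy (.*?) captures the measure name, [_t]{0,2} swallows the optional _t or t separator, and (\d+$) captures the trailing timepoint. If you want to check what it pulls out of each name, you can test it directly (a quick sanity check, not part of the reshaping):
# Inspect the captures per column name: full match, measure name, timepoint
regmatches(names(df), regexec("(.*?)[_t]{0,2}(\\d+$)", names(df)))
# e.g. "Weight_t1" -> "Weight" "1", "year1" -> "year" "1", "LENGTHt1" -> "LENGTH" "1"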
I have a dataframe with columns that have 'x1' and 'x1_fit' with the numbers going up to 5 in some cases.
date <- seq(as.Date('2019-11-04'), by = "days", length.out = 7)
x1 <- c(100,120,111,152,110,112,111)
x1_fit <- c(150,142,146,148,123,120,145)
x2 <- c(110,130,151,152,150,142,161)
x2_fit <- c(170,172,176,178,173,170,175)
df <- data.frame(date,x1,x1_fit,x2,x2_fit)
How can I do x1_fit - x1, and so on? The number of x's will change every time.
You can select those columns with regular expressions (supposing the columns are in the appropriate order):
> df[, grep('^x\\d+_fit$', colnames(df))] - df[, grep('^x\\d+$', colnames(df))]
x1_fit x2_fit
1 50 60
2 22 42
3 35 25
4 -4 26
5 13 23
6 8 28
7 34 14
If you want to assign the differences to the original df:
df[, paste0(grep('^x\\d+$', colnames(df), value = TRUE), '_diff')] <-
df[, grep('^x\\d+_fit$', colnames(df))] - df[, grep('^x\\d+$', colnames(df))]
# > df
# date x1 x1_fit x2 x2_fit x1_diff x2_diff
# 1 2019-11-04 100 150 110 170 50 60
# 2 2019-11-05 120 142 130 172 22 42
# 3 2019-11-06 111 146 151 176 35 25
# 4 2019-11-07 152 148 152 178 -4 26
# 5 2019-11-08 110 123 150 173 13 23
# 6 2019-11-09 112 120 142 170 8 28
# 7 2019-11-10 111 145 161 175 34 14
@mt1022's solution is straightforward; however, since you have tagged this as dplyr, here is an approach where we convert the data to long format, subtract the corresponding values, and reshape the data to wide format again. Note that diff(value) yields fit minus raw because, within each (date, name) group, the x column comes before its _fit counterpart.
library(dplyr)
library(tidyr)
df %>%
  pivot_longer(cols = -date) %>%
  mutate(name = sub('_.*', '', name)) %>%
  group_by(date, name) %>%
  summarise(diff = diff(value)) %>%
  pivot_wider(names_from = name, values_from = diff) %>%
  rename_at(-1, ~paste0(., "_diff")) %>%
  left_join(df, by = "date")
# date x1_diff x2_diff x1 x1_fit x2 x2_fit
# <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 2019-11-04 50 60 100 150 110 170
#2 2019-11-05 22 42 120 142 130 172
#3 2019-11-06 35 25 111 146 151 176
#4 2019-11-07 -4 26 152 148 152 178
#5 2019-11-08 13 23 110 123 150 173
#6 2019-11-09 8 28 112 120 142 170
#7 2019-11-10 34 14 111 145 161 175
In base R, you could loop over the unique column names and subtract the fitted column using
> lapply(setNames(nm = unique(gsub("_.*", "", names(df)))), function(nm) {
    fit <- paste0(nm, "_fit")  # name of the matching fitted column
    df[, nm] - df[, fit]       # raw minus fitted
  })
# $x1
# [1] -50 -22 -35 4 -13 -8 -34
#
# $x2
# [1] -60 -42 -25 -26 -23 -28 -14
Here, I set the Date column as the row names and removed the column using
df <- data.frame(date,x1,x1_fit,x2,x2_fit)
row.names(df) <- df$date
df$date <- NULL
but you could just loop over the column names without the Date column.
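For instance, a minimal sketch that keeps date in place by picking out only the x columns up front (base_names is a made-up helper name):
# Derive the measure names, then subtract fitted from raw for each
base_names <- unique(sub("_.*", "", grep("^x\\d+", names(df), value = TRUE)))
lapply(setNames(nm = base_names), function(nm) df[[nm]] - df[[paste0(nm, "_fit")]])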
We can also do it with a split in base R (within each split, x[, 1] is the raw column and x[, 2] the fitted one):
out <- sapply(split.default(df[-1], sub("_.*", "", names(df)[-1])),
              function(x) x[, 2] - x[, 1])
df[paste0(colnames(out), "_diff")] <- out
df
# date x1 x1_fit x2 x2_fit x1_diff x2_diff
#1 2019-11-04 100 150 110 170 50 60
#2 2019-11-05 120 142 130 172 22 42
#3 2019-11-06 111 146 151 176 35 25
#4 2019-11-07 152 148 152 178 -4 26
#5 2019-11-08 110 123 150 173 13 23
#6 2019-11-09 112 120 142 170 8 28
#7 2019-11-10 111 145 161 175 34 14
I'm currently using R, trying to create for a DF multiple columns containing the cumulative sum of the previous ones. Imagine I have a DF like this:
df=
sep-2016 oct-2016 nov-2016 dec-2016 jan-2017
1 70 153 NA 28 19
2 57 68 73 118 16
3 29 NA 19 32 36
4 177 36 3 54 53
and I want to add at the end, for each month, the sum of that month and all the previous ones: for October you end up with the sum of sep and oct, for November with the sum of sep, oct, and nov, and so on, ending up with something like this:
df=
  sep-2016 oct-2016 nov-2016 dec-2016 jan-2017 status-oct-2016 status-nov-2016
1       70      153       NA       28       19             223             223
2       57       68       73      118       16             125             198
3       29       NA       19       32       36              29              48
4      177       36        3       54       53             213             216
I want to know an efficient way to do this instead of writing lots of rowSums() lines, and if I could get the label for each month in the iteration, that would be amazing!
Thanks!
We can use lapply to loop through the columns, applying rowSums to the first i columns each time.
dat2 <- as.data.frame(lapply(2:ncol(dat), function(i){
  rowSums(dat[, 1:i], na.rm = TRUE)
}))
names(dat2) <- paste0("status-", names(dat[, -1]))
dat3 <- cbind(dat, dat2)
dat3
# sep-2016 oct-2016 nov-2016 dec-2016 jan-2017 status-oct-2016 status-nov-2016 status-dec-2016 status-jan-2017
# 1 70 153 NA 28 19 223 223 251 270
# 2 57 68 73 118 16 125 198 316 332
# 3 29 NA 19 32 36 29 48 80 116
# 4 177 36 3 54 53 213 216 270 323
DATA
dat <- read.table(text = " 'sep-2016' 'oct-2016' 'nov-2016' 'dec-2016' 'jan-2017'
1 70 153 NA 28 19
2 57 68 73 118 16
3 29 NA 19 32 36
4 177 36 3 54 53",
header = TRUE, stringsAsFactors = FALSE)
names(dat) <- c("sep-2016", "oct-2016", "nov-2016", "dec-2016", "jan-2017")
Honestly I have no idea why you would want your data in this format, but here is a tidyverse method of accomplishing it. It involves transforming the data to a tidy format before spreading it back out into your wide format. The key thing to note is that in a tidy format, where month is a variable in a single column instead of spread across multiple columns, you can simply use group_by(rowid) and cumsum to calculate all the values you want. The last few lines construct the status- column names and spread the data back out into a wide format.
library(tidyverse)
df <- read_table2(
"sep-2016 oct-2016 nov-2016 dec-2016 jan-2017
70 153 NA 28 19
57 68 73 118 16
29 NA 19 32 36
177 36 3 54 53"
)
df %>%
  rowid_to_column() %>%
  gather("month", "value", -rowid) %>%
  arrange(rowid) %>%
  group_by(rowid) %>%
  mutate(
    value = replace_na(value, 0),
    status = cumsum(value)
  ) %>%
  gather("vartype", "number", value, status) %>%
  mutate(colname = ifelse(vartype == "value", month, str_c("status-", month))) %>%
  select(rowid, number, colname) %>%
  spread(colname, number)
#> # A tibble: 4 x 11
#> # Groups: rowid [4]
#> rowid `dec-2016` `jan-2017` `nov-2016` `oct-2016` `sep-2016`
#> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 28.0 19.0 0 153 70.0
#> 2 2 118 16.0 73.0 68.0 57.0
#> 3 3 32.0 36.0 19.0 0 29.0
#> 4 4 54.0 53.0 3.00 36.0 177
#> # ... with 5 more variables: `status-dec-2016` <dbl>,
#> # `status-jan-2017` <dbl>, `status-nov-2016` <dbl>,
#> # `status-oct-2016` <dbl>, `status-sep-2016` <dbl>
Created on 2018-02-16 by the reprex package (v0.2.0).
A clean way to do it is to convert your data to a long format.
library(tibble)
library(tidyr)
library(dplyr)
your_data <- tribble(~"sep_2016", ~"oct_2016", ~"nov_2016", ~"dec_2016", ~"jan_2017",
                     70, 153, NA, 28, 19,
                     57, 68, 73, 118, 16,
                     29, NA, 19, 32, 36,
                     177, 36, 3, 54, 53)
You can change the format of your data.frame with gather from the tidyr package.
your_data_long <- your_data %>%
  rowid_to_column() %>%
  gather(key = month_year, value = the_value, -rowid)
head(your_data_long)
#> # A tibble: 6 x 3
#> rowid month_year the_value
#> <int> <chr> <dbl>
#> 1 1 sep_2016 70
#> 2 2 sep_2016 57
#> 3 3 sep_2016 29
#> 4 4 sep_2016 177
#> 5 1 oct_2016 153
#> 6 2 oct_2016 68
Once your data.frame is in a long format, you can compute the cumulative sum with cumsum and the dplyr functions mutate and group_by.
result <- your_data_long %>%
  group_by(rowid) %>%
  mutate(cumulative_value = cumsum(the_value))
result
#> # A tibble: 20 x 4
#> # Groups: rowid [4]
#> rowid month_year the_value cumulative_value
#> <int> <chr> <dbl> <dbl>
#> 1 1 sep_2016 70 70
#> 2 2 sep_2016 57 57
#> 3 3 sep_2016 29 29
#> 4 4 sep_2016 177 177
#> 5 1 oct_2016 153 223
#> 6 2 oct_2016 68 125
#> 7 3 oct_2016 NA NA
#> 8 4 oct_2016 36 213
#> 9 1 nov_2016 NA NA
#> 10 2 nov_2016 73 198
#> 11 3 nov_2016 19 NA
#> 12 4 nov_2016 3 216
#> 13 1 dec_2016 28 NA
#> 14 2 dec_2016 118 316
#> 15 3 dec_2016 32 NA
#> 16 4 dec_2016 54 270
#> 17 1 jan_2017 19 NA
#> 18 2 jan_2017 16 332
#> 19 3 jan_2017 36 NA
#> 20 4 jan_2017 53 323
If you want to retrieve the starting form, you can do it with spread.
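A minimal sketch of that round trip (note that spread will order the restored month columns alphabetically):
# Back to the original wide format: drop the cumulative column and spread
result %>%
  select(-cumulative_value) %>%
  spread(key = month_year, value = the_value)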
My preferred solution would be:
# library(matrixStats)
DF <- as.matrix(df)
DF[is.na(DF)] <- 0
RES <- matrixStats::rowCumsums(DF)
colnames(RES) <- paste0("status-", colnames(DF))
cbind.data.frame(df, RES)
This is closest to what you are looking for with the rowSums.
One option could be to use the spread and gather functions from the tidyverse.
Note: the status column has been added even for the 1st month, and the status columns are not in order, but the values are correct.
The approach is:
# Data
df <- read.table(text = "sep-2016 oct-2016 nov-2016 dec-2016 jan-2017
70 153 NA 28 19
57 68 73 118 16
29 NA 19 32 36
177 36 3 54 53", header = T, stringsAsFactors = F)
library(tidyverse)
# Just add an row number as sl
df <- df %>% mutate(sl = row_number())
#Calculate the cumulative sum after gathering and arranging by date
mod_df <- df %>%
  gather(key, value, -sl) %>%
  mutate(key = as.Date(paste("01", key, sep = "."), format = "%d.%b.%Y")) %>%
  arrange(sl, key) %>%
  group_by(sl) %>%
  mutate(status = cumsum(ifelse(is.na(value), 0L, value))) %>%
  select(-value) %>%
  mutate(key = paste("status", as.character(key, format = "%b.%Y"))) %>%
  spread(key, status)
# Finally join cumulative calculated sum columns with original df and then
# remove sl column
inner_join(df, mod_df, by = "sl") %>% select(-sl)
# sep.2016 oct.2016 nov.2016 dec.2016 jan.2017 status Dec.2016 status Jan.2017 status Nov.2016 status Oct.2016 status Sep.2016
#1 70 153 NA 28 19 251 270 223 223 70
#2 57 68 73 118 16 316 332 198 125 57
#3 29 NA 19 32 36 80 116 48 29 29
#4 177 36 3 54 53 270 323 216 213 177
Another base solution, where we build a matrix accumulating the row sums:
status <- setNames(
  as.data.frame(t(apply(dat, 1, function(x) Reduce(sum, '[<-'(x, is.na(x), 0), accumulate = TRUE)))),
  paste0("status-", names(dat)))
status
# status-sep-2016 status-oct-2016 status-nov-2016 status-dec-2016 status-jan-2017
# 1 70 223 223 251 270
# 2 57 125 198 316 332
# 3 29 29 48 80 116
# 4 177 213 216 270 323
Then bind it to your original data if needed :
cbind(dat,status[-1])
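To unpack the one-liner for a single row: '[<-'(x, is.na(x), 0) zeroes the NAs, and Reduce(sum, ..., accumulate = TRUE) is then just a cumulative sum:
# One row of dat, spelled out step by step
x <- c(70, 153, NA, 28, 19)
x[is.na(x)] <- 0                   # same effect as '[<-'(x, is.na(x), 0)
Reduce(sum, x, accumulate = TRUE)  # 70 223 223 251 270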
I have a dataset with 15 numeric columns, col1 to col15. I have 100 rows of data, with a name attached to each row as a factor. I want to do a summary for each row across all 15 columns.
head(df2phcl[,c(1:16)])
col1 col2 col3 col4 col5 col6 col7 col8 col9 col10 col11 col12 col13 col14 col15 NAME
78 95 101 100 84 93 93 85 81 97 80 94 81 79 87 R04-001
100 61 96 75 98 92 99 99 102 83 84 NA 101 93 96 R04-002
81 84 82 83 77 86 90 92 92 78 86 91 59 80 84 R04-003
91 84 87 95 103 93 92 95 86 92 107 96 94 87 97 R04-004
72 79 66 98 84 75 85 83 75 80 91 65 90 81 73 R04-005
72 75 68 44 79 64 83 71 81 82 85 63 87 94 60 R04-006
My code for this is:
library(dplyr)
####Rachis
SUMCL <- df2phcl %>%
group_by(name) %>%
summarise(CL = mean(c(col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14,col15), na.rm=T),
CLMAX = max(c(col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14,col15), na.rm=T),
CLMIN = min(c(col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14,col15), na.rm=T),
CLSTD = sd(c(col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14,col15), na.rm=T),
OUT = outliers(c(col1,col2,col3,col4,col5,col6,col7,col8,col9,col10,col11,col12,col13,col14,col15), na.rm=T))
head(SUMCL)
tail(SUMCL)
My resulting analysis comes out as...
Error:
Evaluation error: missing value where TRUE/FALSE needed.
I've also tried this...
df2phcl$col1+col2+col3+col4+col5+col6+col7+col8+col9+col10+col11+col12+col13+co114+col15[!df2phcl$col1+col2+col3+col4+col5+col6+col7+col8+col9+col10+col11+col12+col13+col14+col15%in%boxplot.stats(df2phcl$col1+col2+col3+col4+col5+col6+col7+col8+col9+col10+co111+col12+col13+col14+col15)$out]
This returns ....
Error: object 'col2' not found
Not sure what I'm doing wrong; this works with mean, max, min, and sd.
> head(SUMCL)
# A tibble: 6 x 11
# Groups: ENTRY, NAME, HEADCODE, RHTGENES, HEAD, PL [6]
ENTRY NAME HEADCODE RHTGENES HEAD PL PH CL CLMAX CLMIN CLSTD
<int> <fctr> <fctr> <fctr> <fctr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 R04-001 CAW Rht1 Club 319 83 88.53333 101 78 7.989875
2 2 R04-002 LBW Wildtype Common 330 102 91.35714 102 61 11.770936
3 3 R04-003 CBW Rht2 Club 230 82 83.00000 92 59 8.220184
4 4 R04-004 LBW Rht1 Common 328 117 93.26667 107 84 6.192930
5 5 R04-005 CBW Rht1 Club 280 97 79.80000 98 65 9.182281
6 6 R04-006 LAW Rht1 Common 310 92 73.86667 94 44 12.749603
I just want to filter the outliers at 3 SD or more and then use the dplyr package to do my statistics...
I'm not exactly sure what you're trying to do, so let me know if the code below is on the right track.
The approach below is to convert the data from wide to long format, which makes it much easier to do the summaries for each level of name.
library(tidyverse)
# Fake data
set.seed(2)
dat = as.data.frame(replicate(15, rnorm(100)))
names(dat) = paste0("col", 1:15)
dat$name = paste0(rep(LETTERS[1:10], each=10), rep(letters[1:10], 10))
# Convert data to long format, remove outliers and summarize
dat %>%
  gather(column, value, -name) %>%  # reshape from wide to long
  group_by(name) %>%                # summarize by name
  mutate(value = replace(value, abs(value - mean(value)) > 2*sd(value), NA)) %>%  # set outliers to NA
  summarise(mean = mean(value, na.rm=TRUE),
            max = max(value, na.rm=TRUE),
            sd = sd(value, na.rm=TRUE))
name mean max sd
1 Aa 0.007848188 1.238744 0.8510016
2 Ab -0.208536464 1.980401 1.2764606
3 Ac -0.152986713 1.587845 0.8443106
4 Ad -0.413543054 0.965692 0.7225872
5 Ae -0.112648322 1.178716 0.7269527
6 Af 0.442268890 2.048040 1.0350119
7 Ag 0.390627994 1.978260 0.8716681
8 Ah 0.080505879 2.396349 1.3128403
9 Ai 0.257925059 1.984474 1.0196722
10 Aj 0.137469703 1.470177 0.7192616
# ... with 90 more rows
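If you want the 3 SD cutoff mentioned in the question rather than 2 SD, only the threshold inside replace() changes (a sketch on the same fake data):
# Same pattern with a 3 SD cutoff instead of 2 SD
dat %>%
  gather(column, value, -name) %>%
  group_by(name) %>%
  mutate(value = replace(value, abs(value - mean(value)) > 3*sd(value), NA)) %>%
  summarise(mean = mean(value, na.rm=TRUE),
            max = max(value, na.rm=TRUE),
            sd = sd(value, na.rm=TRUE))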
I managed to get some of the column standard deviations changed; however, I'm not sure how many observations it took out. I wanted to take out an even amount from the top and the bottom of the distribution. A trimmed mean, for example, would take out 20% of the obs. from the top and bottom. What I was after was just dropping the observations beyond +-3 SD at the top and bottom of the distribution.
> SUMCL <- df2phcl %>%
+ gather(column, value, -c(ENTRY, NAME, HEADCODE, RHTGENES, HEAD,PL,PH)) %>% # reshape from wide to long
+ group_by(ENTRY, NAME, HEADCODE, RHTGENES, HEAD,PL,PH) %>% # summarize by name
+ mutate(value = replace(value, abs(value - mean(value)) > 2*sd(value), NA)) %>% # set outliers to NA
+ summarise(CL = mean(value, na.rm=TRUE),
+ CLMAX = max(value, na.rm=TRUE),
+ CLMIN = min(value, na.rm=TRUE),
+ N = sum(!is.na(value), na.rm=TRUE),
+ CLSTD= sd(value, na.rm=TRUE),
+ CLSE = (CLSTD / sqrt(N)))
> head(SUMCL)
# A tibble: 6 x 13
# Groups: ENTRY, NAME, HEADCODE, RHTGENES, HEAD, PL [6]
ENTRY NAME HEADCODE RHTGENES HEAD PL PH CL CLMAX CLMIN N CLSTD CLSE
<int> <fctr> <fctr> <fctr> <fctr> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl>
1 1 R04-001 CAW Rht1 Club 319 83 88.53333 101 78 15 7.989875 2.062977
2 2 R04-002 LBW Wildtype Common 330 102 91.35714 102 61 14 11.770936 3.145915
3 3 R04-003 CBW Rht2 Club 230 82 84.71429 92 77 14 5.029583 1.344213
4 4 R04-004 LBW Rht1 Common 328 117 92.28571 103 84 14 5.075258 1.356420
5 5 R04-005 CBW Rht1 Club 280 97 79.80000 98 65 15 9.182281 2.370855
6 6 R04-006 LAW Rht1 Common 310 92 76.00000 94 60 14 10.076629 2.693093