Hello, I have a data frame that is 2000 x 56. I would like to do a simple subtraction of specific columns: for example, subtract column 3 from column 1, column 7 from column 5, and so on.
Here is a sample of the data set.
df= structure(list(c(48.9518, 47.9639, 47.5751, 46.5795, 46.6301,
45.0705, 43.7893, 43.8325, 46.507, 45.1127, 46.2437, 44.6545,
43.5113, 43.2287, 43.6998, 41.44, 41.44, 41.8239, 43.2681, 42.5079,
40.315), c(51.9657, 50.928, 50.559, 50.477, 51.8529, 47.506,
49.0126, 47.8382, 57.6266, 59.9311, 71.9462, 44.6545, 43.5113,
43.2287, 43.6998, 41.44, 41.44, 41.7783, 43.6673, 42.915, 40.4284
), c(42.0552, 40.141, 40.07, 40.3302, 39.7687, 39.3804, 40.5853,
40.2478, 40.7404, 36.0079, 39.3361, 38.6883, 33.1306, 34.2174,
34.0593, 34.4541, 32.1919, 36.2109, 37.0591, 35.7394, 34.8065
), c(43.5527, 40.6115, 41.1305, 42.6484, 42.1938, 41.2828, 41.8979,
41.9331, 47.0511, 48.0175, 49.5343, 45.5063, 33.1306, 34.2174,
34.0593, 34.4541, 32.0264, 36.1705, 37.2596, 35.5938, 34.3885
), c(56.3464, 53.5964, 55.2791, 54.7751, 53.6983, 48.2984, 46.8343,
50.339, 54.6205, 54.6327, 53.7313, 51.839, 49.9128, 60.1649,
64.1637, 57.4661, 57.4661, 57.9187, 51.9147, 51.5786, 49.357),
c(61.6417, 57.054, 58.8402, 60.6182, 58.3043, 48.7071, 47.5466,
52.9527, 67.9061, 64.3576, 63.6387, 61.2588, 43.1908, 59.254,
63.8611, 57.4661, 57.4661, 58.6671, 54.097, 53.8527, 51.4929
), c(62.3702, 58.9045, 58.1827, 59.4045, 57.7552, 50.4304,
45.2969, 51.3944, 55.3861, 54.3857, 50.634, 49.1729, 51.0196,
56.8711, 59.2268, 56.1792, 56.812, 53.9583, 52.6343, 49.8832,
47.8319)), row.names = c(NA, -21L), class = c("tbl_df", "tbl",
"data.frame"))
head(df)
A tibble: 6 x 7
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 49.0 52.0 42.1 43.6 56.3 61.6 62.4
2 48.0 50.9 40.1 40.6 53.6 57.1 58.9
3 47.6 50.6 40.1 41.1 55.3 58.8 58.2
4 46.6 50.5 40.3 42.6 54.8 60.6 59.4
5 46.6 51.9 39.8 42.2 53.7 58.3 57.8
6 45.1 47.5 39.4 41.3 48.3 48.7 50.4
I start by creating 2 vectors with the column numbers I would like to subtract.
First = seq(1, ncol(df), 4)
Second = seq(3, ncol(df), 4)
print(First)
[1] 1 5
print(Second)
[1] 3 7
Now I iterate over these pairs with map2 from purrr. I would like the output to be a data frame, so I use map2_dfr():
map2_dfr(First, Second, ~df[,.x]-df[,.y])
The result is an empty tibble.
I have also tried passing a function to map2_dfr(), with no luck.
map2_dfr(First, Second, function(x, y){df[,x]-df[,y]})
My expected output is a data frame where
Column1 = df[,1]-df[,3]
Column2 = df[,5]-df[,7]
Thank you.
The issue is that the dataset doesn't have any column names. Assign names first:
colnames(df) <- paste0("col", seq_along(df))
Now, applying the OP's code should work fine.
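For reference, a minimal sketch of the full fix. Note that map2_dfc() below is my substitution for map2_dfr(): _dfr row-binds the pieces, while _dfc binds them column-wise, which matches the expected output of two side-by-side difference columns.
library(purrr)
colnames(df) <- paste0("col", seq_along(df))
First <- seq(1, ncol(df), 4)   # 1 5
Second <- seq(3, ncol(df), 4)  # 3 7
# one difference column per index pair, bound column-wise
map2_dfc(First, Second, ~ df[, .x] - df[, .y])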
Related
What I'm having trouble with is this: I'd like the first row of mat.a to become the first row of matrix 1 in my array, the second row of mat.a to become the first row of matrix 2, and so on. Likewise, the first row of mat.b should become the second row of the first matrix in the array, the second row of mat.b the second row of the second matrix, and so on; the same pattern continues for mat.c. The fourth row of each matrix should hold the averages of the values in each column. Also, I'm not allowed to use a for loop.
mat.a <- matrix(c(scores$A1, scores$A2, scores$avgA), ncol = 3,
byrow = FALSE)
mat.b <- matrix(c(scores$B1, scores$B2, scores$avgB), ncol = 3,
byrow = FALSE)
mat.c <- matrix(c(scores$C1, scores$C2, scores$avgC), ncol = 3,
byrow = FALSE)
scores.array<- array(c(mat.a,mat.b, mat.c), dim = c(3,3,21))
> dim(mat.a)
[1] 21 3
> dim(scores)
[1] 21 10
> dim(mat.b)
[1] 21 3
> dim(mat.c)
[1] 21 3
Here is a natural (I think) approach to this problem:
Use array to construct an array with A, B, and C lying along the third dimension.
Use aperm to transpose the array so that A, B, and C lie along the first dimension.
Use colMeans to compute means over the first dimension ("columnwise").
Use abind to attach the means to the transposed array.
nms <- c("A1", "A2", "avgA", "B1", "B2", "avgB", "C1", "C2", "avgC")
z <- array(unlist(scores[nms]), dim = c(21L, 3L, 3L))
zz <- aperm(z, 3:1)
zzz <- abind::abind(zz, colMeans(zz, dims = 1L), along = 1L)
zzz[, , 1:2]
, , 1
[,1] [,2] [,3]
[1,] 28.75775 69.28034 49.01905
[2,] 41.37243 27.43836 34.40540
[3,] 10.28646 89.03502 49.66074
[4,] 26.80555 61.91791 44.36173
, , 2
[,1] [,2] [,3]
[1,] 78.83051 64.05068 71.44060
[2,] 36.88455 81.46400 59.17427
[3,] 43.48927 91.44382 67.46655
[4,] 53.06811 78.98617 66.02714
I have used scores as (very helpfully!) defined by @langtang.
Try this:
library(tidyverse)
# add the averages
scores <- scores %>%
rowwise() %>%
mutate(avg1 = mean(c_across(ends_with("1"))),
avg2 = mean(c_across(ends_with("2"))),
avg3 = mean(c_across(starts_with("avg")))) %>%
# relocate the columns
relocate(ini, A1,B1,C1,avg1, A2,B2,C2,avg2, avgA,avgB,avgC, avg3)
# create scores array
scores.array = array(scores %>% pivot_longer(cols = A1:avg3) %>% pull(value), dim=c(4,3,21))
# add dim names
dimnames(scores.array) = list(c("A","B","C","mean"), c("Midterm", "Final", "mean"), scores$ini)
Output (first two):
> scores.array[,,1:2]
, , ZO
Midterm Final mean
A 28.75775 69.28034 49.01905
B 41.37243 27.43836 34.40540
C 10.28646 89.03502 49.66074
mean 26.80555 61.91791 44.36173
, , UE
Midterm Final mean
A 78.83051 64.05068 71.44060
B 36.88455 81.46400 59.17427
C 43.48927 91.44382 67.46655
mean 53.06811 78.98617 66.02714
Input Data (fake data):
set.seed(123)
scores = data.frame(
A1 = runif(21)*100,
A2 = runif(21)*100,
B1 = runif(21)*100,
B2 = runif(21)*100,
C1 = runif(21)*100,
C2 = runif(21)*100
)
scores <- scores %>% rowwise() %>%
mutate(ini = paste0(sample(LETTERS,2), collapse="")) %>%
relocate(ini)
scores$avgA = apply(scores[,c("A1","A2")],1,mean)
scores$avgB = apply(scores[,c("B1","B2")],1,mean)
scores$avgC = apply(scores[,c("C1","C2")],1,mean)
ini A1 A2 B1 B2 C1 C2 avgA avgB avgC
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 ZO 28.8 69.3 41.4 27.4 10.3 89.0 49.0 34.4 49.7
2 UE 78.8 64.1 36.9 81.5 43.5 91.4 71.4 59.2 67.5
3 HS 40.9 99.4 15.2 44.9 98.5 60.9 70.2 30.0 79.7
4 JR 88.3 65.6 13.9 81.0 89.3 41.1 76.9 47.4 65.2
5 JL 94.0 70.9 23.3 81.2 88.6 14.7 82.4 52.3 51.7
6 BJ 4.56 54.4 46.6 79.4 17.5 93.5 29.5 63.0 55.5
7 VL 52.8 59.4 26.6 44.0 13.1 30.1 56.1 35.3 21.6
8 TN 89.2 28.9 85.8 75.4 65.3 6.07 59.1 80.6 35.7
9 QN 55.1 14.7 4.58 62.9 34.4 94.8 34.9 33.8 64.6
10 VC 45.7 96.3 44.2 71.0 65.7 72.1 71.0 57.6 68.9
# ... with 11 more rows
I am using R programming language. Suppose I have the following data ("my_data"):
student first_run second_run third_run fourth_run fifth_run sixth_run seventh_run eight_run ninth_run tenth_run
1 student1 19.70847 21.79771 16.49083 19.51691 13.97987 14.60733 13.89703 15.24651 20.75679 18.44020
2 student2 11.22369 15.36253 16.90215 20.20724 15.90227 15.14539 13.74945 18.30090 19.55124 17.24132
3 student3 15.93649 17.03599 14.20214 13.17548 14.70327 15.49697 13.08945 19.94142 22.41674 17.37958
4 student4 16.18733 15.13197 14.79481 16.75177 14.51287 17.71816 13.45054 14.25553 19.89091 18.88981
5 student5 18.71084 18.85453 17.15864 19.38880 15.68862 18.39169 15.26428 16.04526 18.92532 16.62409
6 student6 19.75246 12.74605 18.52214 17.92626 14.48501 17.20780 13.10512 12.46502 20.68583 15.87711
7 student7 14.75144 23.82376 18.51366 20.77424 14.22155 16.08186 12.95981 12.67820 20.12166 15.66006
8 student8 17.06516 15.63075 13.72026 15.02068 14.21098 15.99414 14.64818 16.15603 21.74607 17.07382
9 student9 20.27611 12.44592 12.26502 15.13456 14.61552 18.72192 15.11129 17.60746 18.83831 17.55257
10 student10 17.70736 16.21620 14.10861 17.20014 16.59376 19.50027 13.05073 15.80002 18.09781 18.34313
I want to add 2 columns to this data:
my_mean: the mean of each row
my_median: the median of each row
I tried the following code in R:
my_data$median = apply(my_data, 1, median, na.rm=T)
my_data$mean = apply(my_data, 1, mean, na.rm=T)
But I don't think this code is correct. For instance, when using this code, the median of the second row of data is returned as "16.90215"
But when I manually take the median of this row:
median(11.22369 , 15.36253 , 16.90215 , 20.20724, 15.90227 , 15.14539 , 13.74945 , 18.30090 , 19.55124 , 17.24132)
I get an answer of
11.22
Can someone please show me what I am doing wrong?
Thanks
The calculation is incorrect because the first argument of median() is 'x', which can be a vector; the second argument is na.rm, followed by the variadic arguments (...). So when you write median(11.22369, 15.36253, ...), 'x' is taken as 11.22369 and that single value is returned. Instead, pass one vector built by concatenating with c():
median(c(11.22369 , 15.36253 , 16.90215 , 20.20724, 15.90227 , 15.14539 , 13.74945 , 18.30090 , 19.55124 , 17.24132))
[1] 16.40221
Also, based on the OP's data, the first column, which is character or factor, should be dropped:
apply(my_data[-1], 1, median, na.rm=TRUE)
1 2 3 4 5 6 7 8 9 10
17.46551 16.40221 15.71673 15.65965 17.77517 16.54246 15.87096 15.81245 16.34356 16.89695
The value for the second row matches the manual calculation above.
library(dplyr)
df %>%
rowwise() %>%
mutate(median = median(c_across(where(is.numeric))),
mean = mean(c_across(where(is.numeric))))
c_across and rowwise were created for this type of situation. Most dplyr verbs work column-wise; to change this behavior, pipe to rowwise() first.
c_across will then combine all values in a row that are numeric (hence where(is.numeric)) into a numeric vector, to which mean or median can be applied.
Note: You will likely want to pipe the output to ungroup since rowwise creates a rowwise grouped data frame.
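A minimal sketch of that full pipeline, including the ungroup() step. The na.rm arguments mirror the OP's apply() calls, and I select the run columns explicitly so the freshly created my_median column cannot be swept into the mean:
library(dplyr)
my_data %>%
  rowwise() %>%
  mutate(my_median = median(c_across(first_run:tenth_run), na.rm = TRUE),
         my_mean   = mean(c_across(first_run:tenth_run), na.rm = TRUE)) %>%
  ungroup()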
Here is an alternative using pmap, passing all the arguments simultaneously via the ellipsis (...). The output then needs to be unnested with unnest_wider from tidyr:
library(tidyr)
library(dplyr)
library(purrr)
df %>%
mutate(res = pmap(across(where(is.numeric)),
~ list(median = median(c(...)),
avg = mean(c(...))))) %>%
unnest_wider(res)
output:
student first_run second_run third_run fourth_run fifth_run sixth_run seventh_run eight_run ninth_run tenth_run median avg
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 student1 19.7 21.8 16.5 19.5 14.0 14.6 13.9 15.2 20.8 18.4 17.5 17.4
2 student2 11.2 15.4 16.9 20.2 15.9 15.1 13.7 18.3 19.6 17.2 16.4 16.4
3 student3 15.9 17.0 14.2 13.2 14.7 15.5 13.1 19.9 22.4 17.4 15.7 16.3
4 student4 16.2 15.1 14.8 16.8 14.5 17.7 13.5 14.3 19.9 18.9 15.7 16.2
5 student5 18.7 18.9 17.2 19.4 15.7 18.4 15.3 16.0 18.9 16.6 17.8 17.5
6 student6 19.8 12.7 18.5 17.9 14.5 17.2 13.1 12.5 20.7 15.9 16.5 16.3
7 student7 14.8 23.8 18.5 20.8 14.2 16.1 13.0 12.7 20.1 15.7 15.9 17.0
8 student8 17.1 15.6 13.7 15.0 14.2 16.0 14.6 16.2 21.7 17.1 15.8 16.1
9 student9 20.3 12.4 12.3 15.1 14.6 18.7 15.1 17.6 18.8 17.6 16.3 16.3
10 student10 17.7 16.2 14.1 17.2 16.6 19.5 13.1 15.8 18.1 18.3 16.9 16.7
You could definitely benefit from the speed of the matrixStats library.
matrixStats::rowMedians(as.matrix(d[-1]))
# [1] 17.46551 16.40221 15.71673 15.65965 17.77517 16.54246 15.87096 15.81245 16.34356 16.89695
matrixStats::rowMeans2(as.matrix(d[-1]))
# [1] 17.44417 16.35862 16.33775 16.15837 17.50521 16.27728 16.95862 16.12661 16.25687 16.66180
stopifnot(all.equal(matrixStats::rowMedians(as.matrix(d[-1])),
as.numeric(apply(d[-1], 1, median, na.rm=T))))
stopifnot(all.equal(matrixStats::rowMeans2(as.matrix(d[-1])),
as.numeric(apply(d[-1], 1, mean, na.rm=T))))
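To attach these as columns, mirroring the OP's my_mean / my_median request (a small addition of mine; both matrixStats functions accept na.rm):
d$my_median <- matrixStats::rowMedians(as.matrix(d[-1]), na.rm = TRUE)
d$my_mean   <- matrixStats::rowMeans2(as.matrix(d[-1]), na.rm = TRUE)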
Data:
d <- structure(list(student = c("student1", "student2", "student3",
"student4", "student5", "student6", "student7", "student8", "student9",
"student10"), first_run = c(19.70847, 11.22369, 15.93649, 16.18733,
18.71084, 19.75246, 14.75144, 17.06516, 20.27611, 17.70736),
second_run = c(21.79771, 15.36253, 17.03599, 15.13197, 18.85453,
12.74605, 23.82376, 15.63075, 12.44592, 16.2162), third_run = c(16.49083,
16.90215, 14.20214, 14.79481, 17.15864, 18.52214, 18.51366,
13.72026, 12.26502, 14.10861), fourth_run = c(19.51691, 20.20724,
13.17548, 16.75177, 19.3888, 17.92626, 20.77424, 15.02068,
15.13456, 17.20014), fifth_run = c(13.97987, 15.90227, 14.70327,
14.51287, 15.68862, 14.48501, 14.22155, 14.21098, 14.61552,
16.59376), sixth_run = c(14.60733, 15.14539, 15.49697, 17.71816,
18.39169, 17.2078, 16.08186, 15.99414, 18.72192, 19.50027
), seventh_run = c(13.89703, 13.74945, 13.08945, 13.45054,
15.26428, 13.10512, 12.95981, 14.64818, 15.11129, 13.05073
), eight_run = c(15.24651, 18.3009, 19.94142, 14.25553, 16.04526,
12.46502, 12.6782, 16.15603, 17.60746, 15.80002), ninth_run = c(20.75679,
19.55124, 22.41674, 19.89091, 18.92532, 20.68583, 20.12166,
21.74607, 18.83831, 18.09781), tenth_run = c(18.4402, 17.24132,
17.37958, 18.88981, 16.62409, 15.87711, 15.66006, 17.07382,
17.55257, 18.34313)), class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9", "10"))
I'm trying to create a function that summarizes several vectors, and the prompt is:
Write a function data_summary which takes four inputs:
`dataset`: A data frame
`vars`: A character vector whose elements are names of columns from dataset which the user wants summaries for
`group.name`: A length-one character vector which gives the name of the column from dataset that contains the factor to be used as the grouping variable
`var.names`: A character vector of the same length as vars which gives the names the user would like used as the entries under "Variable" in the resulting output. This should be set equal to vars by default, so the default behavior is to use the column names from dataset.
The output of the function should be a data frame with the following structure:
Column names of the data frame will be:
`Variable`
`Missing`
The first level of the factor group.name
The second level of the factor group.name
…
The kth level of the factor group.name
`p-value`
I've set up the code already,
data_summary <- function(dataset,vars,group.name,var.names) {
}
but I'm unsure how to proceed because I do not understand what this is trying to accomplish and what the output should look like. There is an example that shows
#data_summary<-function(dataset, vars,group.name, var.name){}
#example
#data_summary(titanic4, c("survived", "female", "age", "sibsp", "parch", "fare", "cabin"), "pclass")
#data_summary(titanic4, c("survived", "female", "age", "sibsp", "parch", "fare", "cabin"), "pclass", c("Survival rate", "% Female", "Age", "# siblings/spouses aboard", "# children/parents aboard", "Fare ($)", "Cabin"))
But it really did not help me outside of inputting the arguments for the function.
You can use the dplyr package for this. I don't know which functions you want to summarise your data frame with, so I use all of the statistics that the base summary() function returns.
My data:
> NewSKUMatrix
# A tibble: 268,918 x 4
LagerID FilialID CSBID Price
<int> <int> <int> <dbl>
1 233 2578 1005 38.3
2 333 2543 NA 61.0
3 334 2543 NA 15.0
4 335 2543 NA 11.0
5 337 2301 NA 71.0
6 338 2031 NA 37.0
7 338 2044 NA 35.0
8 338 2054 NA 36.0
9 338 2060 NA 37.0
10 338 2063 NA 36.0
# ... with 268,908 more rows
Function:
data_summary <- function(data,
variables,
values,
names = NULL) {
if (is.null(x = names)) {
names <- variables
}
data %>%
group_by_at(.vars = variables) %>%
summarise_at(
.vars = values,
.funs = list(
Min. = min,
`1st Qu.` = ~ quantile(x = ., probs = 0.25),
Median = median,
Mean = mean,
`3rd Qu.` = ~ quantile(x = ., probs = 0.75),
Max. = max
)
) %>%
rename_at(.vars = variables,
.funs = ~ names)
}
Output:
data_summary(NewSKUMatrix,
c('LagerID'),
c('Price'),
c('SKU'))
# A tibble: 32,454 x 7
SKU Min. `1st Qu.` Median Mean `3rd Qu.` Max.
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 17 39.0 39.0 39.0 39.0 39.0 39.0
2 18 120. 120. 120. 121. 120. 140.
3 21 289. 289. 289. 289. 289. 289.
4 24 37.0 37.0 37.0 45.2 45.2 70.0
5 25 14.0 14.0 14.0 14.0 14.0 14.0
6 55 30.9 30.9 30.9 30.9 30.9 30.9
7 117 26.9 26.9 26.9 26.9 26.9 26.9
8 118 24.8 24.9 24.9 25.1 25.1 25.7
9 119 24.8 24.8 24.9 25.1 25.3 25.7
10 158 104. 108. 108. 107. 108. 108.
# ... with 32,444 more rows
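For reference, here is a rough sketch closer to the layout the prompt describes (Variable, Missing, one column per level of group.name, p-value). It assumes the vars are numeric and uses a one-way ANOVA for the p-value; neither choice comes from the prompt or the answer above.
data_summary <- function(dataset, vars, group.name, var.names = vars) {
  grp <- factor(dataset[[group.name]])
  rows <- lapply(seq_along(vars), function(i) {
    x <- dataset[[vars[i]]]
    level_means <- tapply(x, grp, mean, na.rm = TRUE)   # one summary per factor level
    pval <- anova(lm(x ~ grp))[["Pr(>F)"]][1]           # assumed test: one-way ANOVA
    data.frame(Variable = var.names[i],
               Missing = sum(is.na(x)),
               t(level_means),
               `p-value` = pval,
               check.names = FALSE)
  })
  do.call(rbind, rows)
}
# Example call mirroring the prompt (titanic4 and its columns are assumed to exist):
# data_summary(titanic4, c("age", "fare"), "pclass", c("Age", "Fare ($)"))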
I'm trying to reshape my data from a long format into a wide format based on multiple groupings, without success. With this data:
id <- 1:20
month <- rep(4:7, 50)
name <- rep(c("sam", "mike", "tim", "jill", "max"), 40)
cost <- sample(1:100, 200, replace=TRUE)
df <- data.frame(id, month, name, cost)
df.mo.mean <- aggregate(df$cost ~ df$name + df$month, FUN="mean")
df.mo.sd <- aggregate(df$cost ~ df$name + df$month, FUN="sd")
df.mo <- data.frame(df.mo.mean, df.mo.sd)
df.mo <- df.mo[,-c(4,5)]
df.mo[3:4] <- round(df.mo[3:4],2)
head(df)
id month name cost
1 1 4 sam 29
2 2 5 mike 93
3 3 6 tim 27
4 4 7 jill 67
5 5 4 max 28
6 6 5 sam 69
I'm trying to get my data to look like the example below, and to generalize it for an unknown number of names (at most 15):
month name1.cost.mean name1.cost.sd name2.cost.mean name2.cost.sd
1 45 4 40 6
2 ...
I've tried reshape and do.call with rbind without success. The only other way I can think of doing it is with a loop, which means I'm doing something wrong. I don't have any experience with plyr and would prefer to solve this problem with base packages (for learning purposes), but if that's not possible, any other suggestions would be very helpful.
set.seed(1)
library(plyr)
kk<-ddply(df,.(month,name),summarize,mean=mean(cost),sd=sd(cost))
reshape(kk,timevar="name",idvar="month",direction="wide")
month mean.jill sd.jill mean.max sd.max mean.mike sd.mike mean.sam sd.sam mean.tim sd.tim
1 4 55.3 34.62834 63.3 23.35261 57.6 22.91627 63.4 28.89906 43.3 25.42112
6 5 49.3 25.00689 51.1 27.85059 48.4 23.16223 43.0 24.33562 47.6 32.13928
11 6 60.4 23.61826 52.1 29.74503 38.6 34.39703 53.0 23.28567 52.4 20.88700
16 7 50.0 30.76073 62.7 23.98634 51.7 32.10763 52.8 32.27589 49.5 23.00845
> means <- with( df, tapply(cost, list(month, name), FUN=mean) )
> sds <- with( df, tapply(cost, list(month, name), FUN=sd) )
> colnames(means) <- paste0(colnames(means), ".mean")
> colnames(sds) <- paste0(colnames(sds), ".sd")
> comb.df <- as.data.frame( cbind(means, sds) )
> comb.df <- comb.df[order(names(comb.df))]
> comb.df
jill.mean  jill.sd  max.mean  max.sd  mike.mean  mike.sd
4 62.1 22.29823 39.7 25.53016 39.6 30.11164
5 40.7 30.72838 44.4 29.12502 54.2 23.91095
6 47.3 31.54556 46.9 32.30910 65.3 30.05569
7 55.5 33.16038 45.9 28.13637 59.7 31.79815
sam.mean  sam.sd  tim.mean  tim.sd
4 40.9 23.54877 58.5 21.69613
5 51.5 30.76163 34.2 32.16900
6 69.1 18.26016 55.2 32.99764
7 46.9 29.90150 55.8 27.17352
I'm not sure what you are asking for, but maybe something like this could be useful
> set.seed(1)
> df <- data.frame(id=1:20, month=rep(4:7, 50),
+ name=rep(c("sam", "mike", "tim", "jill", "max"), 40),
+ cost= sample(1:100, 200, replace=TRUE))
>
> DF.mean <- aggregate(cost ~ name + month, FUN=mean, data=df) ## mean
> DF.sd <- aggregate(cost ~ name + month, FUN=sd, data=df) ## sd
>
> x1 <- as.data.frame.matrix(xtabs(cost~month+name, data=DF.mean)) # reshaping mean
> colnames(x1) <- paste0(colnames(x1), ".mean")
> x2 <- as.data.frame.matrix(xtabs(cost~month+name, data=DF.sd)) # reshaping sd
> colnames(x2) <- paste0(colnames(x2), ".sd")
>
> cbind(x1, x2)
jill.mean max.mean mike.mean sam.mean tim.mean jill.sd max.sd mike.sd sam.sd tim.sd
4 55.3 63.3 57.6 63.4 43.3 34.62834 23.35261 22.91627 28.89906 25.42112
5 49.3 51.1 48.4 43.0 47.6 25.00689 27.85059 23.16223 24.33562 32.13928
6 60.4 52.1 38.6 53.0 52.4 23.61826 29.74503 34.39703 23.28567 20.88700
7 50.0 62.7 51.7 52.8 49.5 30.76073 23.98634 32.10763 32.27589 23.00845
Also, note that @Metrics' approach can be done using base R functions without any extra packages:
> kk <- aggregate(cost ~ name + month, FUN=function(x) c(mean=mean(x), sd=sd(x)), data=df)
> reshape(kk,timevar="name",idvar="month",direction="wide")
month cost.jill.mean cost.jill.sd cost.max.mean cost.max.sd cost.mike.mean cost.mike.sd cost.sam.mean cost.sam.sd cost.tim.mean cost.tim.sd
1 4 55.30000 34.62834 63.30000 23.35261 57.60000 22.91627 63.40000 28.89906 43.30000 25.42112
6 5 49.30000 25.00689 51.10000 27.85059 48.40000 23.16223 43.00000 24.33562 47.60000 32.13928
11 6 60.40000 23.61826 52.10000 29.74503 38.60000 34.39703 53.00000 23.28567 52.40000 20.88700
16 7 50.00000 30.76073 62.70000 23.98634 51.70000 32.10763 52.80000 32.27589 49.50000 23.00845
You can run dcast() twice (once for the mean and once for the sd) and then merge the results:
library(reshape2)
> dcast(df, month ~ name, mean, value.var="cost")
month jill max mike sam tim
1 4 39.5 54.6 45.6 48.4 57.4
2 5 45.1 61.7 45.4 54.5 50.8
3 6 41.9 45.7 56.4 43.1 52.1
4 7 51.6 38.6 43.6 65.1 51.5
> dcast(df, month ~ name, sd, value.var="cost")
month jill max mike sam tim
1 4 29.31154 25.25954 28.96051 31.32695 29.82989
2 5 31.02848 27.96049 34.32589 30.08599 23.95273
3 6 32.09517 32.50316 37.16988 27.03681 30.42094
4 7 19.56300 31.50026 28.65969 36.53750 26.73429
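To actually combine those two wide tables into one, a possible final step (the merge and the suffix names are my addition, not part of the original answer):
library(reshape2)
means <- dcast(df, month ~ name, mean, value.var = "cost")
sds   <- dcast(df, month ~ name, sd,   value.var = "cost")
# merge() appends the suffixes to the duplicated per-name columns
merge(means, sds, by = "month", suffixes = c(".mean", ".sd"))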
I have data that looks like this.
Name|ID|p72|p78|p51|p49|c36.1|c32.1|c32.2|c36.2|c37
hsa-let-7a-5p|MIMAT0000062|9.1|38|12.7|185|8|4.53333333333333|17.9|23|63.3
hsa-let-7b-5p|MIMAT0000063|11.3|58.6|27.5|165.6|20.4|8.5|21|30.2|92.6
hsa-let-7c|MIMAT0000064|7.8|40.2|9.6|147.8|11.8|4.53333333333333|15.4|17.7|62.3
hsa-let-7d-5p|MIMAT0000065|4.53333333333333|27.7|13.4|158.1|8.5|4.53333333333333|14.2|13.5|50.5
hsa-let-7e-5p|MIMAT0000066|6.2|4.53333333333333|4.53333333333333|28|4.53333333333333|4.53333333333333|5.6|4.7|12.8
hsa-let-7f-5p|MIMAT0000067|4.53333333333333|4.53333333333333|4.53333333333333|78.2|4.53333333333333|4.53333333333333|6.8|4.53333333333333|8.9
hsa-miR-15a-5p|MIMAT0000068|4.53333333333333|70.3|10.3|147.6|4.53333333333333|4.53333333333333|21.1|30.2|100.8
hsa-miR-16-5p|MIMAT0000069|9.5|562.6|60.5|757|25.1|4.53333333333333|89.4|142.9|613.9
hsa-miR-17-5p|MIMAT0000070|10.5|71.6|27.4|335.1|6.3|10.1|51|51|187.1
hsa-miR-17-3p|MIMAT0000071|4.53333333333333|4.53333333333333|4.53333333333333|17.2|4.53333333333333|4.53333333333333|9.5|4.53333333333333|7.3
hsa-miR-18a-5p|MIMAT0000072|4.53333333333333|14.6|4.53333333333333|53.4|4.53333333333333|4.53333333333333|9.5|25.5|29.7
hsa-miR-19a-3p|MIMAT0000073|4.53333333333333|11.6|4.53333333333333|42.8|4.53333333333333|4.53333333333333|4.53333333333333|5.5|17.9
hsa-miR-19b-3p|MIMAT0000074|8.3|93.3|15.8|248.3|4.53333333333333|6.3|44.7|53.2|135
hsa-miR-20a-5p|MIMAT0000075|4.53333333333333|75.2|23.4|255.7|6.6|4.53333333333333|43.8|38|130.3
hsa-miR-21-5p|MIMAT0000076|6.2|19.7|18|299.5|6.8|4.53333333333333|49.9|68.5|48
hsa-miR-22-3p|MIMAT0000077|40.4|128.4|65.4|547.1|56.5|33.4|104.9|84.1|248.3
hsa-miR-23a-3p|MIMAT0000078|58.3|99.3|58.6|617.9|36.6|21.4|107.1|125.5|120.9
hsa-miR-24-1-5p|MIMAT0000079|4.53333333333333|4.53333333333333|4.53333333333333|9.2|4.53333333333333|4.53333333333333|4.53333333333333|4.9|4.53333333333333
hsa-miR-24-3p|MIMAT0000080|638.2|286.9|379.5|394.4|307.8|240.4|186|234.2|564
What I want to do is simply pick the rows where all the values are greater than 10.
But why does this code of mine report only the last one?
The data clearly show that more rows satisfy this condition.
> dat<-read.delim("http://dpaste.com/1215552/plain/",sep="|",na.strings="",header=TRUE,blank.lines.skip=TRUE,fill=FALSE)
> dat[apply(dat[, -1], MARGIN = 1, function(x) all(x > 10)), ]
Name ID p72 p78 p51 p49 c36.1 c32.1 c32.2 c36.2 c37
19 hsa-miR-24-3p MIMAT0000080 638.2 286.9 379.5 394.4 307.8 240.4 186 234.2 564
What is the right way to do it?
Update:
alexwhan's solution works. But I wonder how I can generalize the approach
so that it can handle data with missing values (NA):
dat<-read.delim("http://dpaste.com/1215354/plain/",sep="\t",na.strings="",header=FALSE,blank.lines.skip=TRUE,fill=FALSE)
Since you're including your ID column (which is a factor) in the all(), it's getting messed up. Try:
dat[apply(dat[, -c(1,2)], MARGIN = 1, function(x) all(x > 10)), ]
# Name ID p72 p78 p51 p49 c36.1 c32.1 c32.2 c36.2 c37
# 16 hsa-miR-22-3p MIMAT0000077 40.4 128.4 65.4 547.1 56.5 33.4 104.9 84.1 248.3
# 17 hsa-miR-23a-3p MIMAT0000078 58.3 99.3 58.6 617.9 36.6 21.4 107.1 125.5 120.9
# 19 hsa-miR-24-3p MIMAT0000080 638.2 286.9 379.5 394.4 307.8 240.4 186.0 234.2 564.0
EDIT
For the case where you have NA, you can just use the na.rm argument of all(). Using your new data (from the comment):
dat<-read.delim("http://dpaste.com/1215354/plain/",sep="\t",na.strings="",header=FALSE,blank.lines.skip=TRUE,fill=FALSE)
dat[apply(dat[, -c(1,2)], MARGIN = 1, function(x) all(x > 10, na.rm = T)), ]
# V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11
# 7 hsa-miR-15a-5p MIMAT0000068 NA 70.3 10.3 147.6 NA NA 21.1 30.2 100.8
# 16 hsa-miR-22-3p MIMAT0000077 40.4 128.4 65.4 547.1 56.5 33.4 104.9 84.1 248.3
# 17 hsa-miR-23a-3p MIMAT0000078 58.3 99.3 58.6 617.9 36.6 21.4 107.1 125.5 120.9
# 19 hsa-miR-24-3p MIMAT0000080 638.2 286.9 379.5 394.4 307.8 240.4 186.0 234.2 564.0
# 20 hsa-miR-25-3p MIMAT0000081 19.3 78.6 25.6 84.3 14.9 16.9 19.1 27.2 113.8
# 21 hsa-miR-26a-5p MIMAT0000082 NA 22.8 31.0 561.2 12.4 NA 67.0 55.8 48.9
Another idea is to transform your data to long (molten) format. I think it is even better for avoiding the missing-values problem:
library(reshape2)
dat.m <- melt(dat,id.vars=c('Name','ID'))
dat.m$value <- as.numeric(dat.m$value)
library(plyr)
res <- ddply(dat.m,.(Name,ID), summarise, keepme = all(value > 10))
res[res$keepme,]
# Name ID keepme
# 16 hsa-miR-22-3p MIMAT0000077 TRUE
# 17 hsa-miR-23a-3p MIMAT0000078 TRUE
# 19 hsa-miR-24-3p MIMAT0000080 TRUE
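If the full original rows are wanted rather than just the Name/ID keys, one extra step (my addition, not part of the answer above) is to merge the kept keys back onto dat:
merge(dat, res[res$keepme, c("Name", "ID")], by = c("Name", "ID"))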