Using the across function in dplyr on a subset of variables - r

I have a data frame like this:
require(dplyr)
x_1=rnorm(10,0,1)
x_2=rnorm(10,0,1)
x_3=rnorm(10,0,1)
y_1=rnorm(10,0,1)
y_2=rnorm(10,0,1)
data=data.frame(cbind(x_1,x_2,x_3,y_1,y_2))
data[1,1]=NA
data[2,1]=NA
data[5,2]=NA
> data
x_1 x_2 x_3 y_1 y_2
1 NA 0.9272000 0.29439845 -1.7856567 1.6579091
2 NA 0.2346621 1.09837343 0.3731092 0.6111779
3 0.7315300 -0.5579094 -0.08524311 -2.8661310 1.1545358
4 -0.9469221 0.6929277 -2.67173898 0.6391045 -0.5114099
5 1.5408777 NA 1.33386146 -0.5581233 -2.5733381
6 -0.2852210 -0.9532492 0.03750860 -1.0129503 0.3929722
7 -1.3821487 -2.1865094 -0.03039062 0.3960388 -1.5332137
8 -0.9447420 0.2669902 0.65167163 0.4310705 -1.5300816
9 -0.9023479 0.2068130 0.10868635 -1.1652238 -0.4892178
10 -0.9739177 -0.8094084 0.64103491 0.6063812 0.7248394
I need to create a new variable that counts the number of non-missing values in each row for the variables starting with "x_". To do that I used the mutate and across functions from dplyr.
data=data %>% mutate(sum_no_miss=across(.cols = starts_with("x_"),~ sum(is.na(.x))))
The code ran without error, but I am not getting the output that I want.
Could someone tell me what I'm doing wrong?

We may use rowSums, which is vectorized and efficient compared to rowwise with sum.
library(dplyr)
data %>%
mutate(sum_no_miss = rowSums(!is.na(across(starts_with("x_")))))
-output
x_1 x_2 x_3 y_1 y_2 sum_no_miss
1 NA 0.9272000 0.29439845 -1.7856567 1.6579091 2
2 NA 0.2346621 1.09837343 0.3731092 0.6111779 2
3 0.7315300 -0.5579094 -0.08524311 -2.8661310 1.1545358 3
4 -0.9469221 0.6929277 -2.67173898 0.6391045 -0.5114099 3
5 1.5408777 NA 1.33386146 -0.5581233 -2.5733381 2
6 -0.2852210 -0.9532492 0.03750860 -1.0129503 0.3929722 3
7 -1.3821487 -2.1865094 -0.03039062 0.3960388 -1.5332137 3
8 -0.9447420 0.2669902 0.65167163 0.4310705 -1.5300816 3
9 -0.9023479 0.2068130 0.10868635 -1.1652238 -0.4892178 3
10 -0.9739177 -0.8094084 0.64103491 0.6063812 0.7248394 3
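As a side note, in recent dplyr (1.1.0 and later) pick() is the documented way to select columns for a computation like this, replacing across() called without a function; a minimal sketch assuming dplyr >= 1.1.0:
data %>%
  mutate(sum_no_miss = rowSums(!is.na(pick(starts_with("x_")))))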
If we want to use sum, then we need rowwise
data %>%
rowwise %>%
mutate(sum_no_miss = sum(!is.na(c_across(starts_with('x_'))))) %>%
ungroup
-output
# A tibble: 10 × 6
x_1 x_2 x_3 y_1 y_2 sum_no_miss
<dbl> <dbl> <dbl> <dbl> <dbl> <int>
1 NA 0.927 0.294 -1.79 1.66 2
2 NA 0.235 1.10 0.373 0.611 2
3 0.732 -0.558 -0.0852 -2.87 1.15 3
4 -0.947 0.693 -2.67 0.639 -0.511 3
5 1.54 NA 1.33 -0.558 -2.57 2
6 -0.285 -0.953 0.0375 -1.01 0.393 3
7 -1.38 -2.19 -0.0304 0.396 -1.53 3
8 -0.945 0.267 0.652 0.431 -1.53 3
9 -0.902 0.207 0.109 -1.17 -0.489 3
10 -0.974 -0.809 0.641 0.606 0.725 3
In the OP's code, sum is used within across, and across loops over each column; thus each sum is computed column-wise (here, the count of NA values in each column) instead of across a row.
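To see concretely what the OP's call computes, note that across() here returns a tibble of per-column NA counts, which mutate() recycles down all the rows; a minimal sketch:
data %>%
  mutate(sum_no_miss = across(starts_with("x_"), ~ sum(is.na(.x)))) %>%
  head(3)
# sum_no_miss is packed as three columns: x_1 = 2, x_2 = 1, x_3 = 0 on
# every row, i.e. per-column NA totals rather than a row-wise count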

First, you're close.
Second, your variables are coded in a misleading way: you should rename your variable sum_is_miss to something like sum_no_miss if you want readers to understand that you're calculating the number of non-missing values. (Edit: I believe the OP has since fixed this, so this comment may no longer apply.)
Third, this is how you could calculate the number of non-missing entries per row, first using apply and then adding the resulting column to your dataset:
library(tidyverse)
x_1=rnorm(10,0,1)
x_2=rnorm(10,0,1)
x_3=rnorm(10,0,1)
y_1=rnorm(10,0,1)
y_2=rnorm(10,0,1)
data=data.frame(cbind(x_1,x_2,x_3,y_1,y_2))
data[1,1]=NA
data[2,1]=NA
data[5,2]=NA
sum_no_miss_vec <- apply(data %>% dplyr::select(starts_with("x_")), MARGIN = 1, FUN = function(r){
sum(!is.na(r))
})
data2 <- data %>% mutate(sum_no_miss = sum_no_miss_vec)
data2
#> x_1 x_2 x_3 y_1 y_2 sum_no_miss
#> 1 NA 0.74234418 1.06515091 -0.313359946 -0.81266805 2
#> 2 NA 2.13222122 0.78392737 2.109171065 0.69459821 2
#> 3 0.9322299 -0.52545325 0.67377319 2.025281430 0.99975832 3
#> 4 0.9634517 0.38985353 1.20940016 -0.007232240 -1.61104902 3
#> 5 0.4454230 NA 0.02420848 -1.743636503 -0.59597234 2
#> 6 -1.7305822 2.07163152 -0.52849895 0.830802138 -1.40573549 3
#> 7 0.2382603 0.20427098 0.22184048 0.806113977 -0.36726054 3
#> 8 -0.3972436 1.61183785 -0.26835072 0.419459671 -0.05723072 3
#> 9 0.3703195 -0.05354607 -1.19558014 -0.852003930 0.64032424 3
#> 10 0.3003434 -0.82513981 0.19782771 0.001526784 0.89393655 3
Created on 2022-04-15 by the reprex package (v2.0.1)

As akrun already provided the best answer, here is a different one using base R: we use apply with grep to apply the function to the specific columns.
data$sum_no_miss <- apply(data[, grep("x_", names(data))], 1, function(x) sum(!is.na(x)))
x_1 x_2 x_3 y_1 y_2 sum_no_miss
1 NA -0.5659449 -1.44792814 0.1659370 0.8040186 2
2 NA 2.6873635 -0.70704189 1.2647756 -0.1238085 2
3 -0.3239291 0.6206436 0.04374401 -0.6476829 1.5228775 3
4 0.7245148 1.6632621 -0.39304104 -0.9305281 1.1328385 3
5 -0.5994830 NA 0.06037891 -1.7654617 0.3073035 2
6 -0.1848746 0.3694963 -1.13622715 0.9252195 0.1072250 3
7 -0.1147132 0.4042102 1.56730477 0.3262673 -0.6369951 3
8 -0.8631230 0.2888508 -2.20030009 -0.9873629 0.2561348 3
9 -0.9384460 -0.8739620 -1.59174131 0.7559146 -1.4229472 3
10 -0.9352575 1.3151532 -0.11439843 -0.5451860 0.9334084 3
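One small caution about the pattern: grep("x_", ...) matches "x_" anywhere in a name, so a hypothetical column such as max_1 would be picked up too; anchoring the regex avoids that:
data$sum_no_miss <- apply(data[, grep("^x_", names(data))], 1, function(x) sum(!is.na(x)))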

Here is a possible data.table solution, using Reduce and lapply on the specific columns selected with grep.
library(data.table)
dt <- as.data.table(data)
dt[, num_obs := Reduce(`+`, lapply(.SD, \(x) !is.na(x))), .SDcols=grep("x_", names(dt))]
Output
x_1 x_2 x_3 y_1 y_2 num_obs
1: NA 1.30228879 -0.586898083 -1.02679871 -0.9280488 2
2: NA -1.00846632 -0.260183508 -0.78828113 -0.8712957 2
3: -0.40475601 0.22961832 0.004414558 -1.04496673 -0.1032775 3
4: 0.09559518 -0.58875533 1.360528488 -0.48798151 -0.6350380 3
5: -0.39312997 NA 0.292025300 1.13544025 -0.2487097 2
6: -1.15802973 1.01589098 0.445829196 -0.02029337 0.9758154 3
7: -0.02524740 -0.17334510 -1.455821490 -0.12165396 -0.4441740 3
8: 0.93627901 -0.92913166 0.407038460 2.04054914 -0.8347571 3
9: 1.20218530 0.54453181 0.513222262 0.05571475 -0.4858128 3
10: 0.84765702 0.07472934 1.367745731 -1.49924113 -1.3170490 3
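An equivalent and arguably simpler data.table form uses rowSums over .SD; a sketch assuming data.table >= 1.12.0, where patterns() is accepted in .SDcols:
dt[, num_obs := rowSums(!is.na(.SD)), .SDcols = patterns("^x_")]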

Related

R: How to simply compare values of columns in 2 data frames

I am comparing two data frames: FU and FO
Here are short samples of what they look like
"Model_ID" "FU_Lin_Period" "FU_Growth_rate"
2 0.72127 0.0093333
3 0.69281 0.015857
4 0.66735 0.021103
5 0.64414 0.024205
6 0.62288 0.026568
7 0.60318 0.027749
8 0.58472 0.028161
9 0.56734 0.028008
10 0.55085 0.027309
11 0.53522 0.026068
12 0.52029 0.024684
13 0.50603 0.022866
14 0.49237 0.020991
15 0.47928 0.018773
"Model_ID" "FO_Lin_Period" "FO_Growth_rate"
7 0.44398 0.008868
8 0.43114 0.01674
9 0.41896 0.023248
10 0.40728 0.028641
11 0.39615 0.032192
12 0.38543 0.03543
13 0.37517 0.03692
14 0.36525 0.038427
15 0.35573 0.038195
As you can tell, they do not all share the same Model_ID values.
Basically, what I want to do is go through every Model_ID in the two tables, compare whether FU or FO's growth rate is larger for a given model ID, and...
if FU's is larger (or FU exists for the model number and FO does not), place the model number in a vector called selected_FU
if FO's is larger (or FO exists for the model number and FU does not), place the model number in a vector called selected_FO
Is there a way to do this without using loops?
A data.table alternative using logic similar to the tidyverse answer.
Replace NAs with -Infinity, do the comparison of the two FU/FO_Growth_rate variables, flag which group had the larger value, and select the Model_ID into the variables requested.
library(data.table)
setDT(FU)
setDT(FO)
out <- merge(FU, FO, by="Model_ID", all=TRUE)[,
"gr_sel" := c("FO","FU")[(nafill(FU_Growth_rate, fill=-Inf) >
nafill(FO_Growth_rate, fill=-Inf)) + 1],
]
selected_FU <- out[gr_sel == "FU", Model_ID]
selected_FO <- out[gr_sel == "FO", Model_ID]
Data used:
FU <- read.table(text="Model_ID FU_Lin_Period FU_Growth_rate\n2 0.72127 0.0093333\n3 0.69281 0.015857\n4 0.66735 0.021103\n5 0.64414 0.024205\n6 0.62288 0.026568\n7 0.60318 0.027749\n8 0.58472 0.028161\n9 0.56734 0.028008\n10 0.55085 0.027309\n11 0.53522 0.026068\n12 0.52029 0.024684\n13 0.50603 0.022866\n14 0.49237 0.020991\n15 0.47928 0.018773", header=TRUE)
FO <- read.table(text="Model_ID FO_Lin_Period FO_Growth_rate\n7 0.44398 0.008868\n8 0.43114 0.01674\n9 0.41896 0.023248\n10 0.40728 0.028641\n11 0.39615 0.032192\n12 0.38543 0.03543\n13 0.37517 0.03692\n14 0.36525 0.038427\n15 0.35573 0.038195", header=TRUE)
With dplyr, tidyr, and readr:
library(dplyr)
library(tidyr)
library(readr)
FU <- read_table2("test.FU.LINA.table")
FO <- read_table2("test.FO.LINA.table")
df_compared <-
full_join(FU, FO, by = "model_id") %>%
replace_na(list(fo_growth_rate = -1, fu_growth_rate = -1)) %>%
mutate(select_fufo = if_else(fu_growth_rate >= fo_growth_rate, true = "fu", false = "fo"))
df_compared
# A tibble: 6,166 x 6
model_id fu_lin_period fu_growth_rate fo_lin_period fo_growth_rate select_fufo
<dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 2 0.721 0.00933 NA -1 fu
2 3 0.693 0.0159 NA -1 fu
3 4 0.667 0.0211 NA -1 fu
4 5 0.644 0.0242 NA -1 fu
5 6 0.623 0.0266 NA -1 fu
6 7 0.603 0.0277 0.444 0.00887 fu
7 8 0.585 0.0282 0.431 0.0167 fu
8 9 0.567 0.0280 0.419 0.0232 fu
9 10 0.551 0.0273 0.407 0.0286 fo
10 11 0.535 0.0261 0.396 0.0322 fo
# ... with 6,156 more rows
selected_fu <- df_compared %>% filter(select_fufo == "fu") %>% .$model_id
selected_fo <- df_compared %>% filter(select_fufo == "fo") %>% .$model_id
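As a usage note, pull() is the more idiomatic modern equivalent of the .$model_id idiom (assuming dplyr >= 0.7.0):
selected_fu <- df_compared %>% filter(select_fufo == "fu") %>% pull(model_id)
selected_fo <- df_compared %>% filter(select_fufo == "fo") %>% pull(model_id)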

Developing a row extraction rule

I want to develop a rule to extract certain rows from a matrix. I set up the example as follows:
mat1 = data.frame(matrix(nrow=508, ncol =5))
mat1[1:20,1] = rep(1,20)
mat1[1:20,2:5] = rnorm(20*4,0,1)
mat2 = data.frame(matrix(nrow=508, ncol =5))
seq1 <- seq(1,3,1)
mat2[1:27,1] = rep(seq1,9)
mat2[1:27,2:5] = rnorm(27*4,0,1)
mat3 = data.frame(matrix(nrow=508, ncol =5))
mat3[1:32,1] = rep(seq(1,4,1),8)
mat3[1:32,2:5] = rnorm(32*4,0,1)
colnames(mat1) = colnames(mat2) = colnames(mat3) = c("Cohort Number", "Alpha(t-1)", "date1", "date2", "date3")
mat.list <- list(mat1,mat2,mat3)
Example matrix
Cohort Number Alpha(t-1) date1 date2 date3
1 1 -1.76745451 -1.3227308 2.7099501 -0.13797329
2 1 -0.72651808 -0.8714317 1.3200554 0.76964663
3 1 -0.50325892 0.0742336 -0.6460628 0.30148135
4 1 0.79592650 0.1353875 -0.5694022 -0.59019913
5 1 1.94064961 0.2255595 0.3156252 -0.90996475
6 1 0.27134932 0.3966957 -1.9198976 0.23998928
7 1 -1.13272507 -0.8603225 -1.2042036 0.06609958
8 1 -2.12392748 1.0905405 -0.3788234 0.92850110
9 1 0.22038996 0.4500683 -1.4617004 0.58498275
10 1 0.26348734 -0.8340913 1.2631368 -1.48490518
11 1 0.26931077 -0.5230622 -0.6615288 1.45668453
12 1 -2.03067695 -0.6432484 0.4801026 0.01808834
13 1 1.25915656 -0.1116544 -0.3004298 -1.04072722
14 1 -2.27894271 -2.1058424 -0.3351053 -1.04132045
15 1 0.47742052 2.1564274 -0.4733351 -0.53152019
16 1 -1.57680089 -0.1340645 -0.3134633 0.53223567
17 1 0.25245813 -0.8243152 0.5998211 -1.01892301
18 1 0.18391447 -1.3500645 1.6059798 1.43359399
19 1 -0.09602031 1.4921338 -0.6455687 0.66385823
20 1 -0.13613759 2.2474816 0.7311762 -2.46849071
mat2[1:27,]
Cohort Number Alpha(t-1) date1 date2 date3
1 1 -0.76033920 1.317636591 -0.09684526 -0.08796725
2 2 0.05123185 -0.731591674 -0.37247406 0.04470346
3 3 -0.78460201 0.890336570 1.26737475 -0.39062992
4 1 -0.14111920 1.255008475 -0.32799815 -0.77277716
5 2 -0.46044451 1.175157970 0.82187906 0.54326905
6 3 -0.46804365 0.704203273 -2.04539007 -1.74782065
7 1 0.42009824 0.488807461 3.21093186 -0.13745029
8 2 1.27083389 -1.316989452 0.43565921 0.07870330
9 3 -0.16581119 1.872955624 -0.22399155 -0.79334562
10 1 -1.33436656 0.589311311 -1.03871415 -1.06221057
11 2 1.56584985 0.020699064 0.45691456 0.15858065
12 3 1.07756426 -0.045200151 0.05124461 -1.86633279
13 1 -1.01264994 -0.229406681 1.24954420 0.88846407
14 2 -0.09950713 -0.515798138 1.62560454 -0.20191909
15 3 -0.28319479 0.450854419 1.42963386 -1.11964154
16 1 0.51771608 -1.407248379 0.62626313 0.97775246
17 2 -0.43951262 -0.368739441 0.66564013 -0.79980882
18 3 -0.15865277 -0.231475146 0.37582330 0.93685867
19 1 -0.57758129 0.235550070 0.42480442 -0.14379249
20 2 -0.81726414 -1.207593079 -0.30000514 0.68967230
21 3 -0.72926703 -0.458849409 1.51162785 1.40921409
22 1 -0.32220454 0.334996561 1.26073381 -2.03405958
23 2 -0.51450039 -0.305634241 1.51021957 0.39775430
24 3 1.15476297 -1.040126709 -0.36192432 -0.37346894
25 1 -0.88053587 -0.006829769 -0.89855797 -0.39840858
26 2 -0.64435448 0.209561006 -0.13986834 -0.61308957
27 3 1.22492942 0.812693992 -1.32371617 -1.21852365
and
> mat3[1:32,]
Cohort Number Alpha(t-1) date1 date2 date3
1 1 -0.7657871 -0.35390862 -0.23539987 -1.8365309
2 2 -0.6631690 1.36450837 0.78403072 -0.8344993
3 3 -1.0134022 -0.28380021 0.72149463 -0.7890273
4 4 2.6419455 0.26998803 2.03606725 0.8099134
5 1 -0.1383910 0.90845134 1.09273919 0.4651443
6 2 -0.7549340 -0.23185551 2.21119705 -0.1386960
7 3 0.7296121 -1.09145187 -1.18092505 0.1510642
8 4 -0.5583415 0.71988405 0.09454476 -0.8661514
9 1 -0.2420894 -0.03215026 -2.51249946 1.1659027
10 2 -0.6434337 -0.13910557 -1.10373674 1.2377968
11 3 -0.6297123 2.09797419 0.87128407 -0.1351845
12 4 0.6674166 0.48707847 0.36373509 1.0680623
13 1 0.6254708 -0.61311671 0.82542494 1.7320687
14 2 -2.4704173 0.98460064 -1.10416042 2.9627952
15 3 -0.2544887 0.63177246 -0.39138717 1.6942072
16 4 -0.9807623 1.11882794 -0.47669974 1.2383798
17 1 -0.6900549 1.68086482 -0.01405476 -1.3099288
18 2 1.4510505 -0.04752782 1.49735258 0.2963673
19 3 -1.1355194 -1.76263532 -1.49318214 1.3524114
20 4 0.7168833 -0.76833639 0.60752304 -1.0647885
21 1 2.0004745 2.13931057 -1.35036048 -0.7694501
22 2 2.0985591 0.01569677 0.33975952 -1.4979973
23 3 0.1703261 -1.47625208 -1.13228671 0.5686501
24 4 0.2632233 -0.55672667 0.33428217 0.5341078
25 1 -0.2741324 -1.61301237 0.78861248 0.4982554
26 2 -0.8793897 -1.07266362 -0.78158128 0.9127354
27 3 0.3920579 -0.59869834 -0.76775259 1.8137107
28 4 -1.4088488 -0.54954542 0.32421016 0.7284813
29 1 -1.2421837 0.50599077 1.62464999 0.6801672
30 2 -2.8980422 0.42197236 0.45243582 1.4939070
31 3 0.3965108 -1.35877353 1.52230797 -1.6552039
32 4 0.8112229 0.51970084 0.30830797 -2.0563928
What I want to do:
For every matrix in mat.list I want to extract 6 rows of data, according to certain criteria, and place these rows as a data.frame in a list labelled Output1. I want to store all remaining rows as a data.frame in Output2.
The process:
1) Group data by cohort number.
2a. If there is 1 group (Cohort Number can only = 1), move to column 2 and extract the 6 rows of the matrix with the highest value for "Alpha(t-1)". Store these rows as a data.frame in a list named "Output1". Store all remaining rows as a data.frame in a list named "Output2".
2b. If there are 2 groups (Cohort Number can = 1 or 2), move to column 2 and extract the 3 rows with the largest "Alpha(t-1)" corresponding to Cohort Number == 1 and the 3 rows with the largest "Alpha(t-1)" corresponding to Cohort Number == 2. Place the 6 extracted rows as a data.frame in "Output1". Place all remaining rows as a data.frame in "Output2".
2c. If there are 3 groups (Cohort Number can = 1, 2, or 3), move to column 2 and extract the 2 rows with the largest "Alpha(t-1)" for each of Cohort Number == 1, 2, and 3. Place the 6 extracted rows as a data.frame in "Output1" and all remaining rows as a data.frame in "Output2".
2d. If there are 4 groups (Cohort Number can = 1, 2, 3, or 4), move to column 2. Extract the 2 rows with the largest "Alpha(t-1)" for Cohort Number == 1, the 2 rows with the largest "Alpha(t-1)" for Cohort Number == 2, the 1 row with the largest "Alpha(t-1)" for Cohort Number == 3, and the 1 row with the largest "Alpha(t-1)" for Cohort Number == 4. Store the 6 key rows as a data.frame in Output1. Store all remaining rows as a data.frame in Output2.
Desired Output:
Output1 <- c()
Output2 <- c()
Output1[[1]] = mat1 %>% group_by(`Cohort Number`) %>% top_n(6, `Alpha(t-1)`)
Output1[[2]] = mat2 %>% group_by(`Cohort Number`) %>% top_n(2, `Alpha(t-1)`)
> Output1[[1]]
# A tibble: 6 x 5
# Groups: Cohort Number [1]
`Cohort Number` `Alpha(t-1)` date1 date2 date3
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 0.796 0.135 -0.569 -0.590
2 1 1.94 0.226 0.316 -0.910
3 1 0.271 0.397 -1.92 0.240
4 1 0.269 -0.523 -0.662 1.46
5 1 1.26 -0.112 -0.300 -1.04
6 1 0.477 2.16 -0.473 -0.532
> Output1[[2]]
# A tibble: 6 x 5
# Groups: Cohort Number [3]
`Cohort Number` `Alpha(t-1)` date1 date2 date3
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 0.420 0.489 3.21 -0.137
2 2 1.27 -1.32 0.436 0.0787
3 2 1.57 0.0207 0.457 0.159
4 1 0.518 -1.41 0.626 0.978
5 3 1.15 -1.04 -0.362 -0.373
6 3 1.22 0.813 -1.32 -1.22
Overall I need a function to do this because I have over 1000 matrices in my actual application and can't do this manually.
We can count the number of distinct values in Cohort Number and, based on that, select the value of n in top_n. When there are more than 3 distinct values, we create a vector of the counts to select in top_n for each Cohort Number.
library(tidyverse)
output1 <- map(mat.list, function(x) {
dist <- n_distinct(x$`Cohort Number`, na.rm = TRUE)
if(dist <= 3)
x %>%
group_by(`Cohort Number`) %>%
top_n(6/dist, `Alpha(t-1)`)
else
map2_df(list(2, 2, 1, 1), x %>% na.omit %>% group_split(`Cohort Number`),
~.y %>% top_n(.x, `Alpha(t-1)`))
})
and for output2, we use map2 with anti_join
output2 <- map2(mat.list, output1, anti_join)
Confirming the output
map_dbl(output1, nrow)
#[1] 6 6 6
map_dbl(output2, nrow)
#[1] 502 502 502

A compact way to perform multiple pairwise tests (e.g. t-test) with a single variable split in multiple categories in long-format

I am interested in performing multiple tests for a single variable with an associated factor that splits the values into multiple groups. It is related to this question and I would like a solution of that kind, but it is not exactly the same.
In my case, I have a single variable and multiple groups (eventually many). Expanding on this example:
library(reshape)
# Create a dataset
mu=34
stdv=5
Location=rep(c("Area_A","Area_B","Area_C"),5)
distro=rnorm(length(Location),mu,stdv)
id=seq(1:length(Location))
sample_long=data.frame(id,Location,distro)
sample_long
id Location distro
1 1 Area_A 34.95737
2 2 Area_B 31.30298
3 3 Area_C 35.86569
4 4 Area_A 40.45378
5 5 Area_B 36.12060
6 6 Area_C 28.29649
7 7 Area_A 30.64495
8 8 Area_B 29.70668
9 9 Area_C 33.22874
10 10 Area_A 25.29148
11 11 Area_B 32.35511
12 12 Area_C 34.69159
13 13 Area_A 26.89791
14 14 Area_B 35.30717
15 15 Area_C 40.64628
I would like to perform all-against-all tests among Areas, i.e. test(Area_A,Area_B), test(Area_A,Area_C) and test(Area_B,Area_C) (in a more general case, all the i<j possible tests).
A simple way to go is to transform the data into wide format:
# Reshape to wide format
sample_wide=reshape(sample_long,direction="wide",idvar="id",timevar="Location")
sample_wide
id distro.Area_A distro.Area_B distro.Area_C
1 1 34.95737 NA NA
2 2 NA 31.30298 NA
3 3 NA NA 35.86569
4 4 40.45378 NA NA
5 5 NA 36.12060 NA
6 6 NA NA 28.29649
7 7 30.64495 NA NA
8 8 NA 29.70668 NA
9 9 NA NA 33.22874
10 10 25.29148 NA NA
11 11 NA 32.35511 NA
12 12 NA NA 34.69159
13 13 26.89791 NA NA
14 14 NA 35.30717 NA
15 15 NA NA 40.64628
and then loop over all pairs of columns, for which I've seen several approaches more R-like than the following one, which uses for loops:
# Now compute the test
test.out=list()
k=0
for(i in 2:(dim(sample_wide)[2]-1)){ # All against all var groups
for(j in (i+1):dim(sample_wide)[2]){
k=k+1
test.out[[k]]=t.test(sample_wide[,i],
sample_wide[,j]) # store results in a list
}
}
But my question is not about which is the best solution given the wide format, but whether it is possible to find a solution for the problem working from the original long format, in line with the solutions found for the links I provided above that use dplyr, broom, etc.
This is a little trickier and less straightforward than I hoped. You can first figure out the combinations of locations and, to make it a little simpler, save that in a lookup table. I turned that into a long shape with an ID for each pair, which I'll use as a grouping variable on the data.
library(dplyr)
library(tidyr)
library(purrr)
set.seed(111)
# same data creation code
grps <- as.data.frame(t(combn(levels(sample_long$Location), 2))) %>%
mutate(pair = row_number()) %>%
gather(key, value = loc, -pair) %>%
select(-key)
grps
#> pair loc
#> 1 1 Area_A
#> 2 2 Area_A
#> 3 3 Area_B
#> 4 1 Area_B
#> 5 2 Area_C
#> 6 3 Area_C
Joining the lookup to the data frame doubles the rows—that will differ depending on how many levels you're combining. Note also I dropped your ID column since it didn't seem necessary right now. Nest, do the t-test, and tidy the results.
sample_long %>%
select(-id) %>%
inner_join(grps, by = c("Location" = "loc")) %>%
group_by(pair) %>%
nest() %>%
mutate(t_test = map(data, ~t.test(distro ~ Location, data = .)),
tidied = map(t_test, broom::tidy)) %>%
unnest(tidied)
#> # A tibble: 3 x 13
#> pair data t_test estimate estimate1 estimate2 statistic p.value
#> <int> <lis> <list> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 <tib… <htes… -0.921 31.8 32.7 -0.245 0.816
#> 2 2 <tib… <htes… -1.48 31.8 33.3 -0.383 0.716
#> 3 3 <tib… <htes… -0.563 32.7 33.3 -0.305 0.769
#> # … with 5 more variables: parameter <dbl>, conf.low <dbl>,
#> # conf.high <dbl>, method <chr>, alternative <chr>
If you needed to, you could do something to show which locations are in each pair—joining with the lookup table would be one way to do this.
I'm realizing also that you mentioned wanting to use broom functions afterwards, but didn't specify that you need a broom::tidy call. In that case, just drop the last 2 lines.
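A sketch of that pair-labelling join (the names pair_labels and comparison are mine): collapse the lookup to one label per pair, then join it onto the unnested results.
pair_labels <- grps %>%
  group_by(pair) %>%
  summarise(comparison = paste(loc, collapse = " vs "))
sample_long %>%
  select(-id) %>%
  inner_join(grps, by = c("Location" = "loc")) %>%
  group_by(pair) %>%
  nest() %>%
  mutate(t_test = map(data, ~ t.test(distro ~ Location, data = .x)),
         tidied = map(t_test, broom::tidy)) %>%
  unnest(tidied) %>%
  inner_join(pair_labels, by = "pair")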
A little bit of base R will do the trick:
combn(x=unique(sample_long$Location), m=2, simplify=FALSE,
FUN=function(l) {
t.test(distro ~ Location, data=subset(sample_long, Location %in% l))
})
combn will generate all combinations of elements of x taken m at a time (sic). Combined with subset, you will apply your test to subsets of your data.frame.
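To keep track of which pair produced each result, one possible refinement (the naming scheme is mine):
pairs <- combn(x=unique(sample_long$Location), m=2, simplify=FALSE)
tests <- lapply(pairs, function(l) {
  t.test(distro ~ Location, data=subset(sample_long, Location %in% l))
})
names(tests) <- sapply(pairs, paste, collapse="_vs_")
tests[["Area_A_vs_Area_B"]]  # assuming Area_A appears before Area_B in the data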

R is not ordering data correctly - skips E values

I am trying to order data by the column weightFisher. However, it is almost as if R does not treat the scientific-notation values as small: all the e values are skipped when I try to order from smallest to greatest.
Code:
resultTable_bon <- GenTable(GOdata_bon,
weightFisher = resultFisher_bon,
weightKS = resultKS_bon,
topNodes = 15136,
ranksOf = 'weightFisher'
)
head(resultTable_bon)
#create Fisher ordered df
indF <- order(resultTable_bon$weightFisher)
resultTable_bonF <- resultTable_bon[indF, ]
what resultTable_bon looks like:
GO.ID Term Annotated Significant Expected Rank in weightFisher
1 GO:0019373 epoxygenase P450 pathway 19 13 1.12 1
2 GO:0097267 omega-hydroxylase P450 pathway 9 7 0.53 2
3 GO:0042738 exogenous drug catabolic process 10 7 0.59 3
weightFisher weightKS
1 1.9e-12 0.79744
2 7.9e-08 0.96752
3 2.5e-07 0.96336
what "ordered" resultTable_bonF looks like:
GO.ID Term Annotated Significant Expected Rank in weightFisher
17 GO:0014075 response to amine 33 7 1.95 17
18 GO:0034372 very-low-density lipoprotein particle re... 11 5 0.65 18
19 GO:0060710 chorio-allantoic fusion 6 4 0.35 19
weightFisher weightKS
17 0.00014 0.96387
18 0.00016 0.83624
19 0.00016 0.92286
As @bhas says, it appears to be working precisely as you want it to. Maybe it's the use of head() that's confusing you?
To put your mind at ease, try it with something simpler
dtf <- data.frame(a=c(1, 8, 6, 2)^-10, b=c(7, 2, 1, 6))
dtf
# a b
# 1 1.000000e+00 7
# 2 9.313226e-10 2
# 3 1.653817e-08 1
# 4 9.765625e-04 6
dtf[order(dtf$a), ]
# a b
# 2 9.313226e-10 2
# 3 1.653817e-08 1
# 4 9.765625e-04 6
# 1 1.000000e+00 7
Try the following:
resultTable_bon$weightFisher <- as.numeric(resultTable_bon$weightFisher)
Then:
resultTable_bonF <- resultTable_bon[order(resultTable_bon$weightFisher),]
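The reason this can happen: GenTable typically returns the p-value column as character, and order() on character data sorts lexicographically rather than numerically; a small base R illustration:
x <- c("0.00014", "1.9e-12", "7.9e-08")
sort(x)             # "0.00014" "1.9e-12" "7.9e-08"  (lexicographic, character by character)
sort(as.numeric(x)) # 1.9e-12 7.9e-08 1.4e-04        (numeric)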

Selecting a fixed number of top rows for each unique value of a column in a data frame in R

I have a data frame with 3 columns: a, b, c. There are multiple rows corresponding to each unique value of column a, and I want to select the top 5 rows for each unique value of column a. Column c holds the value of interest, and the data frame is already sorted by it in descending order, so that is not a problem. Can anyone please suggest how I can do this in R?
Stealing @ptocquin's example, here's how you can use the base function by. You can flatten the result using do.call (see below).
> by(data = data, INDICES = data$a, FUN = function(x) head(x, 5))
# or by(data = data, INDICES = data$a, FUN = head, 5)
data$a: 1
a b c
21 1 0.1188552 1.6389895
41 1 1.0182033 1.4811359
61 1 -0.8795879 0.7784072
81 1 0.6485745 0.7734652
31 1 1.5102255 0.7107957
------------------------------------------------------------
data$a: 2
a b c
15 2 -1.09704040 1.1710693
85 2 0.42914795 0.8826820
65 2 -1.01480957 0.6736782
45 2 -0.07982711 0.3693384
35 2 -0.67643885 -0.2170767
------------------------------------------------------------
A similar thing could be achieved by splitting your data.frame based on a and then using lapply to step through each element, subsetting the first n rows.
split.data <- split(data, data$a)
subsetted.data <- lapply(split.data, FUN = function(x) head(x, 5)) # or ..., FUN = head, 5) like above
flatten.data <- do.call("rbind", subsetted.data)
head(flatten.data)
a b c
1.21 1 0.11885516 1.63898947
1.41 1 1.01820329 1.48113594
1.61 1 -0.87958790 0.77840718
1.81 1 0.64857445 0.77346517
1.31 1 1.51022545 0.71079568
2.15 2 -1.09704040 1.17106930
2.85 2 0.42914795 0.88268205
2.65 2 -1.01480957 0.67367823
2.45 2 -0.07982711 0.36933837
2.35 2 -0.67643885 -0.21707668
Here is my try:
library(plyr)
data <- data.frame(a=rep(sample(1:20,10),10),b=rnorm(100),c=rnorm(100))
data <- data[rev(order(data$c)),]
head(data, 15)
a b c
28 6 1.69611039 1.720081
91 11 1.62656460 1.651574
70 9 -1.17808386 1.641954
6 15 1.23420550 1.603140
23 7 0.70854914 1.588352
51 11 -1.41234359 1.540738
19 10 2.83730734 1.522825
49 10 0.39313579 1.370831
80 9 -0.59445323 1.327825
59 10 -0.55538404 1.214901
18 6 0.08445888 1.152266
86 15 0.53027267 1.066034
69 10 -1.89077464 1.037447
62 1 -0.43599566 1.026505
3 7 0.78544009 1.014770
result <- ddply(data, .(a), "head", 5)
head(result, 15)
a b c
1 1 -0.43599566 1.02650544
2 1 -1.55113486 0.36380251
3 1 0.68608364 0.30911430
4 1 -0.85406406 0.05555500
5 1 -1.83894595 -0.11850847
6 5 -1.79715809 0.77760033
7 5 0.82814909 0.22401278
8 5 -1.52726859 0.06745849
9 5 0.51655092 -0.02737905
10 5 -0.44004646 -0.28106808
11 6 1.69611039 1.72008079
12 6 0.08445888 1.15226601
13 6 -1.99465060 0.82214319
14 6 0.43855489 0.76221979
15 6 -2.15251353 0.64417757
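For completeness, a dplyr equivalent, assuming dplyr >= 1.0.0 where slice_head() was introduced:
library(dplyr)
data %>%
  arrange(desc(c)) %>%  # already sorted in the OP's case
  group_by(a) %>%
  slice_head(n = 5) %>%
  ungroup()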
