Related
I have a dataframe of gene expression scores (cells x genes). I also have the cluster that each cell belongs to, stored as a column.
I want to calculate the mean expression values per cluster for a group of genes (columns), however, I only want to include values > 0 in these calculations.
My attempt at this is as follows:
test <- gene_scores_df2 %>%
select(all_of(gene_list), Clusters) %>%
group_by(Clusters) %>%
summarize(across(c(1:13), ~mean(. > 0)))
This produces the following tibble:
# A tibble: 16 x 14
Clusters SLC17A7 GAD1 GAD2 SLC32A1 GLI3 TNC PROX1 SCGN LHX6 NXPH1 MEIS2 ZFHX3 C3
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 C1 0.611 0.605 0.817 0.850 0.979 0.590 0.725 0.434 0.275 0.728 0.949 0.886 0.332
2 C10 0.484 0.401 0.434 0.401 0.791 0.387 0.431 0.362 0.204 0.652 0.715 0.580 0.186
3 C11 0.495 0.5 0.538 0.412 0.847 0.437 0.516 0.453 0.187 0.764 0.804 0.640 0.160
4 C12 0.807 0.626 0.559 0.703 0.942 0.448 0.644 0.366 0.403 0.702 0.917 0.859 0.228
5 C13 0.489 0.578 0.709 0.719 0.796 0.409 0.565 0.371 0.367 0.773 0.716 0.776 0.169
6 C14 0.541 0.347 0.330 0.388 0.731 0.281 0.438 0.279 0.198 0.577 0.777 0.633 0.128
7 C15 0.152 0.306 0.337 0.198 0.629 0.304 0.331 0.179 0.132 0.496 0.509 0.405 0.0556
8 C16 0.402 0.422 0.542 0.418 0.813 0.514 0.614 0.287 0.267 0.729 0.574 0.737 0.279
9 C2 0.152 0.480 0.458 0.297 0.883 0.423 0.511 0.195 0.152 0.722 0.692 0.598 0.0632
10 C3 0.585 0.679 0.659 0.711 0.996 0.886 0.801 0.297 0.305 0.789 0.992 0.963 0.346
11 C4 0.567 0.756 0.893 0.940 0.892 0.334 0.797 0.750 0.376 0.686 0.897 0.885 0.240
12 C5 0.220 0.516 0.560 0.625 0.673 0.250 0.466 0.275 0.358 0.590 0.571 0.641 0.112
13 C6 0.558 0.908 0.836 0.973 0.725 0.280 0.830 0.642 0.871 0.927 0.830 0.916 0.202
14 C7 0.380 0.743 0.749 0.772 0.825 0.415 0.480 0.211 0.199 0.614 0.860 0.901 0.135
15 C8 0.616 0.348 0.312 0.334 0.749 0.271 0.451 0.520 0.129 0.542 0.743 0.735 0.147
16 C9 0.406 0.381 0.400 0.265 0.679 0.266 0.465 0.233 0.0820 0.648 0.565 0.557 0.119
However, when I check this against (what I assume is) a similar procedure on a single column I get different mean values.
Here is the code for SLC17A7:
gene_scores_df2 %>%
select(SLC17A7, Clusters) %>%
group_by(Clusters) %>%
filter(SLC17A7 > 0) %>%
summarize(SLC17A7 = mean(SLC17A7))
And the result:
# A tibble: 16 x 2
Clusters SLC17A7
<chr> <dbl>
1 C1 0.780
2 C10 1.42
3 C11 1.21
4 C12 1.64
5 C13 1.09
6 C14 1.83
7 C15 1.61
8 C16 0.968
9 C2 1.09
10 C3 0.512
11 C4 0.920
12 C5 1.53
13 C6 0.814
14 C7 1.22
15 C8 2.24
16 C9 1.72
I'm unsure what exactly is wrong with the first attempt above.
Any suggestions would be greatly appreciated.
Code snippet for the original df:
# First 20 cols of:
gene_scores_df2 %>%
select(all_of(gene_list), Clusters) %>%
group_by(Clusters)
structure(list(SLC17A7 = c(0.273, 0.722, 0.699, 0.71, 0.333,
0.674, 0.63, 0.481, 0.274, 0.981, 0.586, 0.401, 0.325, 0.583,
0, 0.348, 0.287, 0, 0.295, 0.351), GAD1 = c(0.355, 0.392, 0.455,
0.34, 0.108, 1.169, 0, 0.426, 2.219, 0.099, 1.16, 0.332, 0.404,
0.284, 0, 5.297, 0.518, 0.027, 1.19, 0.346), GAD2 = c(0.12, 0.562,
0.337, 0.49, 0.095, 0.958, 0.09, 1.518, 1.464, 0.175, 0.419,
0.536, 0.501, 1.103, 0.343, 0, 0.247, 0, 0.635, 0.906), SLC32A1 = c(0,
0.97, 0.067, 0.999, 0.224, 1.04, 0, 2.569, 1.544, 0.059, 2.177,
3.227, 3.603, 1.229, 0.102, 2.421, 0.055, 0.826, 2.646, 0.228
), GLI3 = c(1.527, 0.487, 0.341, 3.352, 0.346, 0.694, 1.395,
0.767, 1.334, 1.373, 1.7, 2.216, 0.394, 1.029, 1.235, 0.55, 2.043,
4.469, 2.901, 4.139), TNC = c(0, 0, 0.448, 0.03, 1.377, 0.045,
0, 0.169, 0.123, 0, 0.188, 0.075, 0, 1.074, 0, 1.272, 0.124,
0.505, 0.173, 0.889), PROX1 = c(0, 0.075, 0.167, 0.782, 0.802,
0.561, 0.098, 0.734, 0.448, 1.645, 0.735, 0.795, 0.102, 0.317,
0.124, 0.324, 0.352, 0.236, 0.826, 0.308), SCGN = c(0.696, 0.234,
0, 0.202, 0.059, 0.162, 0, 0.653, 0.383, 0.42, 0.094, 0.779,
0.228, 0.248, 0.171, 0.089, 0.081, 0.026, 0.159, 0), LHX6 = c(0,
0, 0.134, 0.1, 0.829, 1.489, 0, 0.38, 0.526, 0.117, 0, 0.205,
0.299, 2.235, 0, 1.335, 0, 0.115, 0.454, 0.108), NXPH1 = c(0.792,
0.143, 0.175, 0.658, 0, 1.034, 1.798, 0.219, 0.896, 0.249, 1.336,
1.507, 0.26, 0.242, 1.235, 2.16, 0.235, 0.349, 1.297, 2.234),
MEIS2 = c(4.337, 0.559, 0.978, 1.972, 0.964, 0.657, 0.162,
0.827, 0.882, 0.157, 1.494, 1.171, 2.524, 2.458, 0.205, 0.448,
2.027, 4.767, 1.514, 2.077), ZFHX3 = c(1.48, 1.38, 2.323,
1.039, 1.343, 1.354, 0.238, 1.224, 1.676, 0.811, 0.316, 2.012,
2.298, 1.869, 0.201, 0.176, 1.829, 1.081, 0.522, 0.959),
C3 = c(0.52, 0.527, 0, 0.073, 0, 0.15, 0.094, 0.315, 0.174,
0, 0, 0.17, 0.165, 0, 0.237, 0, 0.091, 0.095, 0, 0.081),
Clusters = c("C12", "C5", "C13", "C4", "C12", "C13", "C13",
"C4", "C6", "C8", "C4", "C4", "C4", "C12", "C5", "C6", "C1",
"C3", "C4", "C3")), row.names = c(NA, -20L), groups = structure(list(
Clusters = c("C1", "C12", "C13", "C3", "C4", "C5", "C6",
"C8"), .rows = structure(list(17L, c(1L, 5L, 14L), c(3L,
6L, 7L), c(18L, 20L), c(4L, 8L, 11L, 12L, 13L, 19L), c(2L,
15L), c(9L, 16L), 10L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
What you want is:
library(tidyverse)
df %>%
group_by(Clusters) %>%
summarize(across(everything(), ~mean(.[. > 0])))
~mean(. > 0) checks whether each element is greater than 0 and thus returns TRUE/FALSE, and then gives you the mean of the underlying 0/1's. Instead you want to filter each column, which can be achieved with the usual [] approach.
I have data with the cumulative households against the cumulative wealth they possess. I've attached an image of a small amount of the data. Using the R diff() function allows me to get what % of households hold what % of wealth which is good.
I aim to find the Gini index of my data which I first need to get in a format where the households are evenly spaced. There are 20000 rows or so meaning I need to standardise the wealth owned to 0.005% at a time or something like that so as to attain a true distribution of wealth with households (1,2, etc) and not the percentage of households.
EDIT:
structure(list(ï..0.002 = c(0.005, 0.007, 0.017, 0.025, 0.027,
0.037, 0.047, 0.057, 0.067, 0.075, 0.081, 0.09, 0.1, 0.107, 0.116,
0.124, 0.13, 0.138, 0.145, 0.151), X.0.002 = c(-0.004, -0.005,
-0.008, -0.01, -0.01, -0.013, -0.015, -0.017, -0.019, -0.02,
-0.021, -0.022, -0.024, -0.025, -0.026, -0.027, -0.027, -0.028,
-0.029, -0.03)), row.names = c(NA, 20L), class = "data.frame")
Data OCR'd with https://ocr.space/ :
Obs wealth households
1 -0.002 0.002
2 -0.004 0.005
3 -0.005 0.007
4 -0.008 0.017
5 -0.01 0.025
6 -0.01 0.027
7 -0.013 0.037
8 -0.015 0.047
9 -0.017 0.057
10 -0.019 0.067
11 -0.02 0.075
12 -0.021 0.081
13 -0.022 0.09
14 -0.024 0.1
I suggest you use interpolation to get your data into an evenly spaced form, using the approx function.
interpolation <- approx(x = df$cum_hh, y = df$cum_wealth, xout = seq(0, 1, by = 0.00005))
interpolation$x ## evenly spaced cumulative households
interpolation$y ## interpolated cumulative wealth
This question already has answers here:
Aggregate / summarize multiple variables per group (e.g. sum, mean)
(10 answers)
Group by multiple columns and sum other multiple columns
(7 answers)
Closed 3 years ago.
I have a large tibble, an example of which is shown below. It has seven predictors (V4 to V10) and nine outcomes (w1, w2, w3, mw, i1, i2, i3, mi, p2).
What I am trying to do is to create confidence intervals for the outcomes in columns 2 (w1) to 10 (p2)
vars w1 w2 w3 mw i1 i2 i3 mi p2
V4 0.084 0.017 0.061 0.054 22.800 4.570 16.700 14.700 0.367
V5 0.032 0.085 0.039 0.052 8.840 23.100 10.700 14.200 0.367
V6 0.026 0.066 0.022 0.038 7.030 18.000 6.070 10.400 0.367
V7 0.097 0.020 0.066 0.061 26.300 5.420 18.100 16.600 0.367
V8 0.048 0.071 0.043 0.054 13.100 19.300 11.800 14.700 0.367
V9 0.018 0.111 0.020 0.050 4.800 30.300 5.440 13.500 0.367
V10 0.053 0.020 0.103 0.058 14.300 5.330 28.000 15.900 0.367
V4 0.084 0.017 0.060 0.054 22.400 4.420 16.200 14.300 0.373
V5 0.032 0.072 0.036 0.047 8.630 19.300 9.760 12.500 0.373
V6 0.030 0.076 0.023 0.043 8.080 20.500 6.070 11.500 0.373
V7 0.080 0.021 0.087 0.063 21.500 5.720 23.300 16.800 0.373
V8 0.053 0.090 0.034 0.059 14.100 24.000 9.110 15.700 0.373
V9 0.016 0.101 0.025 0.048 4.410 27.100 6.790 12.800 0.373
V10 0.060 0.022 0.100 0.061 16.000 5.950 26.800 16.300 0.373
When I group_by variables (vars) in dplyr and run quantiles on three of the outcomes (as a test), it does not give me what I'm looking for. Instead of giving me the confidence intervals for the three outcomes, it just gives me one confidence interval as
seen below:
+ group_by(vars) %>%
+ do(data.frame(t(quantile(c(.$w1, .$w2, .$w3), probs = c(0.025, 0.975)))))
# A tibble: 7 x 3
# Groups: variables [7]
variables X2.5 X97.5
1 V10 0.0202 0.103
2 V4 0.017 0.084
3 V5 0.032 0.0834
4 V6 0.0221 0.0748
5 V7 0.0201 0.0958
6 V8 0.0351 0.0876
7 V9 0.0162 0.110
In short, what I'm looking for is something like the table below, where I get the confidence intervals for each outcome.
w1 w2 w3
vars X2.5 X97.5 vars X2.5 X97.5 vars X2.5 X97.5
V10 0.020 0.103 V10 0.020 0.103 V10 0.020 0.103
V4 0.017 0.084 V4 0.017 0.084 V4 0.017 0.084
V5 0.032 0.083 V5 0.032 0.083 V5 0.032 0.083
V6 0.022 0.075 V6 0.022 0.075 V6 0.022 0.075
V7 0.020 0.096 V7 0.020 0.096 V7 0.020 0.096
V8 0.035 0.088 V8 0.035 0.088 V8 0.035 0.088
V9 0.016 0.110 V9 0.016 0.110 V9 0.016 0.110
Any pointers in the right direction would be greatly appreciated. I've read on StackOverflow, but can't seem to find an answer that addresses what I want to do.
Here are two ways.
Base R.
aggregate(df1[-1], list(df1[[1]]), quantile, probs = c(0.025, 0.975))
With the tidyverse.
library(dplyr)
df1 %>%
group_by(vars) %>%
mutate_at(vars(w1:p2), quantile, probs = c(0.025, 0.975))
Note that in the second way, the output format is different, the first quantile (0.025) is in the first rows and the second (0.975) in the last rows.
Data.
df1 <-
structure(list(vars = structure(c(2L, 3L, 4L,
5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L),
.Label = c("V10", "V4", "V5", "V6", "V7", "V8",
"V9"), class = "factor"), w1 = c(0.084, 0.032,
0.026, 0.097, 0.048, 0.018, 0.053, 0.084,
0.032, 0.03, 0.08, 0.053, 0.016, 0.06),
w2 = c(0.017, 0.085, 0.066, 0.02, 0.071, 0.111,
0.02, 0.017, 0.072, 0.076, 0.021, 0.09, 0.101,
0.022), w3 = c(0.061, 0.039, 0.022, 0.066,
0.043, 0.02, 0.103, 0.06, 0.036, 0.023, 0.087,
0.034, 0.025, 0.1), mw = c(0.054, 0.052, 0.038,
0.061, 0.054, 0.05, 0.058, 0.054, 0.047, 0.043,
0.063, 0.059, 0.048, 0.061), i1 = c(22.8, 8.84,
7.03, 26.3, 13.1, 4.8, 14.3, 22.4, 8.63, 8.08,
21.5, 14.1, 4.41, 16), i2 = c(4.57, 23.1, 18, 5.42,
19.3, 30.3, 5.33, 4.42, 19.3, 20.5, 5.72, 24, 27.1,
5.95), i3 = c(16.7, 10.7, 6.07, 18.1, 11.8, 5.44,
28, 16.2, 9.76, 6.07, 23.3, 9.11, 6.79, 26.8),
mi = c(14.7, 14.2, 10.4, 16.6, 14.7, 13.5, 15.9,
14.3, 12.5, 11.5, 16.8, 15.7, 12.8, 16.3),
p2 = c(0.367, 0.367, 0.367, 0.367, 0.367, 0.367,
0.367, 0.373, 0.373, 0.373, 0.373, 0.373, 0.373,
0.373)), class = "data.frame",
row.names = c(NA, -14L))
Another possibility: melt/pivot to long format; compute summaries; then cast/pivot to wide format
library(tidyverse)
df2 <- (df1
%>% pivot_longer(-vars,"outcome","value")
%>% group_by(vars,outcome)
%>% summarise(lwr=quantile(value,0.025),upr=quantile(value,0.975))
)
df2 %>% pivot_wider(names_from=outcome,values_from=c(lwr,upr))
Unfortunately the columns aren't in the order you want; I can't think of a quick fix (you can select() with the variables in the order you want ...).
I have following list of colors:
ddf = structure(list(r = c(0.374, 0.374, 0.051, 0.096, 0.876, 0.415,
+ 0.596, 0.724, 0.847, 0.588, 0.481, 0.142, 0.819, 0.91, 0.969,
+ 0.887, 0.432, 0.927, 0.381, 0.04), g = c(0.183, 0.905, 0.662,
+ 0.706, 0.461, 0.845, 0.07, 0.101, 0.434, 0.885, 0.366, 0.075,
+ 0.737, 0.722, 0.012, 0.536, 0.967, 0.125, 0.646, 0.898), b = c(0.528,
+ 0.337, 0.028, 0.898, 0.628, 0.286, 0.523, 0.673, 0.937, 0.604,
+ 0.337, 0.276, 0.658, 0.979, 0.451, 0.123, 0.446, 0.332, 0.656,
+ 0.798)), .Names = c("r", "g", "b"), class = "data.frame", row.names = c(NA,
+ -20L))
>
> ddf
r g b
1 0.374 0.183 0.528
2 0.374 0.905 0.337
3 0.051 0.662 0.028
4 0.096 0.706 0.898
5 0.876 0.461 0.628
6 0.415 0.845 0.286
7 0.596 0.070 0.523
8 0.724 0.101 0.673
9 0.847 0.434 0.937
10 0.588 0.885 0.604
11 0.481 0.366 0.337
12 0.142 0.075 0.276
13 0.819 0.737 0.658
14 0.910 0.722 0.979
15 0.969 0.012 0.451
16 0.887 0.536 0.123
17 0.432 0.967 0.446
18 0.927 0.125 0.332
19 0.381 0.646 0.656
20 0.040 0.898 0.798
>
How can I use it in the same manner as following command for rainbow() color palette:
barplot(rep(1,100), yaxt='n',col=rainbow(100))
Following works:
barplot(rep(1,100), yaxt='n',col=with(ddf, rgb(r,g,b)))
However, here color sequence is repeating (5 times) rather than palette getting stretched from one end to another.
In this case, the colorRampPalette() function is your friend. First, let's define
mycols <- with(ddf, rgb(r, g, b))
Now we start, without repeating
barplot(rep(1,20), yaxt='n',col=mycols)
now, with repeating
barplot(rep(1,100), yaxt='n',col=mycols)
now, with interpolation from colorRampPalette
barplot(rep(1,100), yaxt='n',col=colorRampPalette(mycols)(100))
I have following data.frame with rgb values. Hence each row indicates a color.
> ddf
r g b
1 0.374 0.183 0.528
2 0.374 0.905 0.337
3 0.051 0.662 0.028
4 0.096 0.706 0.898
5 0.876 0.461 0.628
6 0.415 0.845 0.286
7 0.596 0.070 0.523
8 0.724 0.101 0.673
9 0.847 0.434 0.937
10 0.588 0.885 0.604
11 0.481 0.366 0.337
12 0.142 0.075 0.276
13 0.819 0.737 0.658
14 0.910 0.722 0.979
15 0.969 0.012 0.451
16 0.887 0.536 0.123
17 0.432 0.967 0.446
18 0.927 0.125 0.332
19 0.381 0.646 0.656
20 0.040 0.898 0.798
>
> dput(ddf)
structure(list(r = c(0.374, 0.374, 0.051, 0.096, 0.876, 0.415,
0.596, 0.724, 0.847, 0.588, 0.481, 0.142, 0.819, 0.91, 0.969,
0.887, 0.432, 0.927, 0.381, 0.04), g = c(0.183, 0.905, 0.662,
0.706, 0.461, 0.845, 0.07, 0.101, 0.434, 0.885, 0.366, 0.075,
0.737, 0.722, 0.012, 0.536, 0.967, 0.125, 0.646, 0.898), b = c(0.528,
0.337, 0.028, 0.898, 0.628, 0.286, 0.523, 0.673, 0.937, 0.604,
0.337, 0.276, 0.658, 0.979, 0.451, 0.123, 0.446, 0.332, 0.656,
0.798)), .Names = c("r", "g", "b"), class = "data.frame", row.names = c(NA,
-20L))
How can I visualize these colors? This can be either as bars of color or a palette or a pie chart. I tried to use following method but could not fit it in my data:
pie(rep(1,20), col=rainbow(20))
I think the simplest option is scales. This also has the advantage of showing the hex values in the colours.
library(scales)
pal <- rgb(ddf$r, ddf$g, ddf$b)
show_col(pal)
image() will work well here if you convert the colors via rgb()
image(1:nrow(ddf), 1, as.matrix(1:nrow(ddf)),
col=rgb(ddf$r, ddf$g, ddf$b),
xlab="", ylab = "", xaxt = "n", yaxt = "n", bty = "n")
As an alternative to the solution using image you could also use polygon and create a quite similar plot:
plot(NA, xlim=c(0, nrow(ddf)), ylim=c(0,1))
for (i in 1:nrow(ddf)) {
row <- ddf[i,]
color <- rgb(red=row$r, green=row$g, blue=row$b)
polygon(x=c(i-1, i, i, i-1), y=c(0, 0, 1, 1), col = color)
}
You could also use ggplot2:
library(ggplot2)
qplot(x=1:nrow(ddf), y = 1, fill=factor(1:nrow(ddf)), geom="tile") +
scale_fill_manual(values = rgb(ddf$r, ddf$g, ddf$b)) +
theme_void()+
theme(legend.position="none")