I have the following tibble:
df <- structure(list(treatment = c("control", "control", "control",
"control", "control", "control", "treated", "treated", "treated",
"treated", "treated", "treated"), `0610005C13Rik` = c(5L, 2L,
2L, 5L, 1L, 0L, 6L, 1L, 0L, 5L, 1L, 2L), `0610007P14Rik` = c(300L,
249L, 166L, 104L, 248L, 136L, 164L, 121L, 191L, 187L, 289L, 169L
), `0610009B22Rik` = c(251L, 158L, 92L, 82L, 239L, 107L, 147L,
96L, 153L, 200L, 211L, 80L), `0610009L18Rik` = c(42L, 17L, 16L,
17L, 10L, 6L, 18L, 1L, 15L, 8L, 19L, 13L), `0610009O20Rik` = c(187L,
77L, 86L, 37L, 81L, 24L, 83L, 57L, 98L, 83L, 113L, 48L), `0610010B08Rik` = c(16L,
3L, 6L, 3L, 2L, 3L, 3L, 2L, 3L, 2L, 3L, 1L)), .Names = c("treatment",
"0610005C13Rik", "0610007P14Rik", "0610009B22Rik", "0610009L18Rik",
"0610009O20Rik", "0610010B08Rik"), row.names = c(NA, -12L), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), vars = "treatment", drop = TRUE, indices = list(
0:5, 6:11), group_sizes = c(6L, 6L), biggest_group_size = 6L, labels = structure(list(
treatment = c("control", "treated")), row.names = c(NA, -2L
), class = "data.frame", vars = "treatment", drop = TRUE, .Names = "treatment"))
That looks like this:
Source: local data frame [12 x 7]
Groups: treatment [2]
treatment `0610005C13Rik` `0610007P14Rik` `0610009B22Rik` `0610009L18Rik` `0610009O20Rik` `0610010B08Rik`
<chr> <int> <int> <int> <int> <int> <int>
1 control 5 300 251 42 187 16
2 control 2 249 158 17 77 3
3 control 2 166 92 16 86 6
4 control 5 104 82 17 37 3
5 control 1 248 239 10 81 2
6 control 0 136 107 6 24 3
7 treated 6 164 147 18 83 3
8 treated 1 121 96 1 57 2
9 treated 0 191 153 15 98 3
10 treated 5 187 200 8 83 2
11 treated 1 289 211 19 113 3
12 treated 2 169 80 13 48 1
What I want to do is to calculate mean and coefficient variation (cv) based on grouped treatment. The CV is basically mean / sd sd / mean. The final expected result looks like this:
gene_symbol control.mean treated.mean control.cv treated.cv
0610005C13Rik 2.5000 2.500000 0.829457 ...
0610007P14Rik 200.5000 186.833333 ... ...
... etc ...
How can I do that using dplyr?
We can gather and then get the mean/sd
library(tidyverse)
df %>%
gather(gene_symbol, Val, -treatment) %>%
group_by(treatment, gene_symbol) %>%
summarise(Mean = mean(Val), cv = sd(Val)/mean(Val)) %>%
gather(Var1, Val, -treatment,-gene_symbol) %>%
unite(new, treatment, Var1) %>%
spread(new, Val)
# A tibble: 6 × 5
# gene_symbol control_cv control_Mean treated_cv treated_Mean
#* <chr> <dbl> <dbl> <dbl> <dbl>
#1 0610005C13Rik 0.8294577 2.5000 0.9715966 2.500000
#2 0610007P14Rik 0.3809605 200.5000 0.2992429 186.833333
#3 0610009B22Rik 0.4823019 154.8333 0.3582799 147.833333
#4 0610009L18Rik 0.6983225 18.0000 0.5515103 12.333333
#5 0610009O20Rik 0.6996217 82.0000 0.3040676 80.333333
#6 0610010B08Rik 0.9672317 5.5000 0.3499271 2.333333
Or another option is to get the mean, cv with summarise_all, then reshape into 'long' format and reconvert it back to 'wide'
df %>%
group_by(treatment) %>%
summarise_all(funs(mean = mean(.), cv = sd(.)/mean(.))) %>%
gather(Var, Val, -treatment) %>%
separate(Var, into = c('gene_symbol', 'Var2')) %>%
unite(new, treatment, Var2) %>%
spread(new, Val)
# A tibble: 6 × 5
# gene_symbol control_cv control_mean treated_cv treated_mean
#* <chr> <dbl> <dbl> <dbl> <dbl>
#1 0610005C13Rik 0.8294577 2.5000 0.9715966 2.500000
#2 0610007P14Rik 0.3809605 200.5000 0.2992429 186.833333
#3 0610009B22Rik 0.4823019 154.8333 0.3582799 147.833333
#4 0610009L18Rik 0.6983225 18.0000 0.5515103 12.333333
#5 0610009O20Rik 0.6996217 82.0000 0.3040676 80.333333
#6 0610010B08Rik 0.9672317 5.5000 0.3499271 2.333333
Or we can do this with melt/dcast from data.table
library(data.table)
dcast(melt(setDT(df), id.var = "treatment", variable.name = "gene_symbol"
)[, .(mean = mean(value), cv = sd(value)/mean(value)), .(treatment, gene_symbol)
], gene_symbol~treatment, value.var = c('mean', 'cv'))
# gene_symbol mean_control mean_treated cv_control cv_treated
#1: 0610005C13Rik 2.5000 2.500000 0.8294577 0.9715966
#2: 0610007P14Rik 200.5000 186.833333 0.3809605 0.2992429
#3: 0610009B22Rik 154.8333 147.833333 0.4823019 0.3582799
#4: 0610009L18Rik 18.0000 12.333333 0.6983225 0.5515103
#5: 0610009O20Rik 82.0000 80.333333 0.6996217 0.3040676
#6: 0610010B08Rik 5.5000 2.333333 0.9672317 0.3499271
EDIT: To reflect the changes in the OP's formula
Here is an approach using a join
library("tidyverse")
df %>% gather(key = gene_symbol, value = value,-treatment) %>%
group_by(treatment, gene_symbol) %>%
summarise(mean = mean(value), cv = mean / sd(value)) %>%
ungroup() %>%
left_join(
x = filter(., treatment == "control"),
y = filter(., treatment == "treated"),
by = "gene_symbol",
suffix = c(".control", ".treated")
) %>%
select(-starts_with("treatment"))
Related
I have a dataframe df like so
height age dept
69 18 A
44 8 B
72 19 B
58 34 C
I want to one-hot encode only the factor variables (only dept is a factor). How can i do this?
Currently right now I'm selecting everything..
and getting this warning:
Warning message:
The following variables are not factor vectors and will be ignored: height, age
ohe <- df %>%
recipes::recipe(~ .) %>%
recipes::step_dummy(tidyselect::everything()) %>%
recipes::prep() %>%
recipes::bake(df)
Use the where with is.factor instead of everything
library(dplyr)
df %>%
recipes::recipe(~ .) %>%
recipes::step_dummy(tidyselect:::where(is.factor)) %>%
recipes::prep() %>%
recipes::bake(df)
-output
# A tibble: 4 × 4
height age dept_B dept_C
<int> <int> <dbl> <dbl>
1 69 18 0 0
2 44 8 1 0
3 72 19 1 0
4 58 34 0 1
data
df <- structure(list(height = c(69L, 44L, 72L, 58L), age = c(18L, 8L,
19L, 34L), dept = structure(c(1L, 2L, 2L, 3L), .Label = c("A",
"B", "C"), class = "factor")), row.names = c(NA, -4L), class = "data.frame")
I have a dataframe such as
Groups NAMES start end
G1 A 1 50
G1 A 25 45
G1 B 20 51
G1 A 51 49
G2 A 200 400
G2 B 1 1600
G2 A 2000 3000
G2 B 4000 5000
and the idea is within each Groups to look at NAMES where start & end coordinates of A are within coordinates of B
for instance here in the example :
Groups NAMES start end
G1 A 1 50 <- A is outside any B coordinate
G1 A 25 45 <- A is **inside** the B coord `20-51`,then I remove these B row.
G1 B 20 51
G1 A 51 49 <- A is outside any B coordinate
G2 A 200 400 <- A is **inside** the B coordinate 1-1600, then I romove this B row.
G2 B 1 1600
G2 A 2000 3000 <- A is outside any B coordinate
G2 B 4000 5000 <- this one does not have any A inside it, then it will be kept in the output.
Then I should get as output :
Groups NAMES start end
G1 A 1 50
G1 A 25 45
G1 A 51 49
G2 A 200 400
G2 A 2000 3000
G2 B 4000 5000
Does someone have an idea please ?
Here is the dataframe in dput format if it can help you ? :
structure(list(Groups = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L), .Label = c("G1", "G2"), class = "factor"), NAMES = structure(c(1L,
1L, 2L, 1L, 1L, 2L, 1L, 2L), .Label = c("A", "B"), class = "factor"),
start = c(1L, 25L, 20L, 51L, 200L, 1L, 2000L, 4000L), end = c(50L,
45L, 51L, 49L, 400L, 1600L, 3000L, 5000L)), class = "data.frame", row.names = c(NA,
-8L))
Here's a possible approach. We'll split the df by NAMES and join the two parts to each other by Groups to do within-group comparisons. Only B rows can get dropped, so those are the only ones whose row numbers we want to keep track of.
We can then just group by rowid to tag the B rows by whether or not they have any A inside them. Finally, filter to the B to keep and concatenate back to the A rows.
library(tidyverse)
df <- structure(list(Groups = structure(c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), .Label = c("G1", "G2"), class = "factor"), NAMES = structure(c(1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L), .Label = c("A", "B"), class = "factor"), start = c(1L, 25L, 20L, 51L, 200L, 1L, 2000L, 4000L), end = c(50L, 45L, 51L, 49L, 400L, 1600L, 3000L, 5000L)), class = "data.frame", row.names = c(NA, -8L))
A <- filter(df, NAMES == "A")
B <- df %>%
filter(NAMES == "B") %>%
rowid_to_column()
comparison <- inner_join(A, B, by = "Groups") %>%
mutate(A_in_B = start.x >= start.y & end.x <= end.y) %>%
group_by(rowid) %>%
summarise(keep_B = !any(A_in_B))
B %>%
inner_join(comparison, by = "rowid") %>%
filter(keep_B) %>%
select(-rowid, -keep_B) %>%
bind_rows(A) %>%
arrange(Groups, NAMES)
#> Groups NAMES start end
#> 1 G1 A 1 50
#> 2 G1 A 25 45
#> 3 G1 A 51 49
#> 4 G2 A 200 400
#> 5 G2 A 2000 3000
#> 6 G2 B 4000 5000
Created on 2021-07-27 by the reprex package (v1.0.0)
This will also do using purrr::map_dfr
library(tidyverse)
df %>%
group_split(Groups) %>%
map_dfr(~ .x %>% mutate(r = row_number()) %>%
full_join(.x %>%
filter(NAMES == 'B'),
by = 'Groups') %>%
group_by(r) %>%
filter(any(NAMES.x == 'B' | start.x > start.y & end.x < end.y)) %>%
ungroup %>%
select(Groups, ends_with('.x')) %>%
distinct %>%
rename_with(~ gsub('\\.x', '', .), everything())
)
#> # A tibble: 6 x 4
#> Groups NAMES start end
#> <fct> <fct> <int> <int>
#> 1 G1 A 25 45
#> 2 G1 B 20 51
#> 3 G1 A 51 49
#> 4 G2 A 200 400
#> 5 G2 B 1 1600
#> 6 G2 B 4000 5000
Created on 2021-07-27 by the reprex package (v2.0.0)
This question already has answers here:
Complete dataframe with missing combinations of values
(2 answers)
Closed 2 years ago.
I have a data frame/tibble that includes a factor variable of bins. There are missing bins because the original data did not include an observation in those 5-year ranges. Is there a way to easily complete the series without having to deconstruct the interval?
Here's a sample df.
library(tibble)
df <- structure(list(bin = structure(c(1L, 3L, 5L, 6L, 7L, 8L, 9L,
10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L), .Label = c("[1940,1945]",
"(1945,1950]", "(1950,1955]", "(1955,1960]", "(1960,1965]", "(1965,1970]",
"(1970,1975]", "(1975,1980]", "(1980,1985]", "(1985,1990]", "(1990,1995]",
"(1995,2000]", "(2000,2005]", "(2005,2010]", "(2010,2015]", "(2015,2020]",
"(2020,2025]"), class = "factor"), Values = c(2L, 4L, 14L, 11L,
8L, 26L, 30L, 87L, 107L, 290L, 526L, 299L, 166L, 502L, 8L)), row.names = c(NA,
-15L), class = c("tbl_df", "tbl", "data.frame"))
df
# A tibble: 15 x 2
bin Values
<fct> <int>
1 [1940,1945] 2
2 (1950,1955] 4
3 (1960,1965] 14
4 (1965,1970] 11
5 (1970,1975] 8
6 (1975,1980] 26
7 (1980,1985] 30
8 (1985,1990] 87
9 (1990,1995] 107
10 (1995,2000] 290
11 (2000,2005] 526
12 (2005,2010] 299
13 (2010,2015] 166
14 (2015,2020] 502
15 (2020,2025] 8
I would like to add the missing (1945,1950] and (1955,1960] bins.
bins already has the levels that you want. So you can use complete in your df as :
tidyr::complete(df, bin = levels(bin), fill = list(Values = 0))
# A tibble: 17 x 2
# bin Values
# <chr> <dbl>
# 1 (1945,1950] 0
# 2 (1950,1955] 4
# 3 (1955,1960] 0
# 4 (1960,1965] 14
# 5 (1965,1970] 11
# 6 (1970,1975] 8
# 7 (1975,1980] 26
# 8 (1980,1985] 30
# 9 (1985,1990] 87
#10 (1990,1995] 107
#11 (1995,2000] 290
#12 (2000,2005] 526
#13 (2005,2010] 299
#14 (2010,2015] 166
#15 (2015,2020] 502
#16 (2020,2025] 8
#17 [1940,1945] 2
df <- orig_df %>%
mutate(bin = cut_width(Year, width = 5, center = 2.5))
df2 <- df %>%
group_by(bin) %>%
summarize(Values = n()) %>%
ungroup()
tibble(bin = levels(df$bin)) %>%
left_join(df2) %>%
replace_na(list(Values = 0))
I have an example dataset:
Road Start End Cat
1 0 50 a
1 50 60 b
1 60 90 b
1 70 75 a
2 0 20 a
2 20 25 a
2 25 40 b
Trying to output following:
Road Start End Cat
1 0 50 a
1 50 90 b
1 70 75 a
2 0 25 a
2 25 40 b
My code doesn't work:
df %>% group_by(Road, cat)
%>% summarise(
min(Start),
max(End)
)
How can I achieve the results I wanted?
We can use rleid from data.table to get the run-length-id-encoding for grouping and then do the summarise
library(dplyr)
library(data.table)
df %>%
group_by(Road, grp = rleid(Cat)) %>%
summarise(Cat = first(Cat), Start = min(Start), End = max(End)) %>%
select(-grp)
# A tibble: 5 x 4
# Groups: Road [2]
# Road Cat Start End
# <int> <chr> <int> <int>
#1 1 a 0 50
#2 1 b 50 90
#3 1 a 70 75
#4 2 a 0 25
#5 2 b 25 40
Or using data.table methods
library(data.table)
setDT(df)[, .(Start = min(Start), End = max(End)), .(Road, Cat, grp = rleid(Cat))]
data
df <- structure(list(Road = c(1L, 1L, 1L, 1L, 2L, 2L, 2L), Start = c(0L,
50L, 60L, 70L, 0L, 20L, 25L), End = c(50L, 60L, 90L, 75L, 20L,
25L, 40L), Cat = c("a", "b", "b", "a", "a", "a", "b")),
class = "data.frame", row.names = c(NA,
-7L))
When i selected NA value here
data[data=="na"] <- NA
data[!complete.cases(data),]
i must replace it, but depending on type of distribution.
If using Shapiro.test the distribution by variables not normal,
then missing value must be replace by median,
If it's normal, than replace by mean.
But distribution for each gender(1 girl, 2 -man)
data=structure(list(sex = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L), emotion = c(20L,
15L, 49L, NA, 34L, 35L, 54L, 45L), IQ = c(101L, 98L, 105L, NA,
123L, 120L, 115L, NA)), .Names = c("sex", "emotion", "IQ"), class = "data.frame", row.names = c(NA,
-8L))
the desired output
sex emotion IQ
1 20 101
1 15 98
1 49 105
1 28 101
2 34 123
2 35 120
2 54 115
2 45 119
Following code will replace NA values according to the Shapiro Test:
library(dplyr)
data %>%
group_by(sex) %>%
mutate(
emotion = ifelse(!is.na(emotion), emotion,
ifelse(shapiro.test(emotion)$p.value > 0.05,
mean(emotion, na.rm=TRUE), quantile(emotion, na.rm=TRUE, probs=0.5) ) ),
IQ = ifelse(!is.na(IQ), IQ,
ifelse(shapiro.test(IQ)$p.value > 0.05,
mean(IQ, na.rm=TRUE), quantile(IQ, na.rm=TRUE, probs=0.5) )
)
)