Subset column and compute operations for each subset [duplicate] - r

This question already has answers here:
How to sum a variable by group
(18 answers)
Closed 5 years ago.
Here is a minimal example of dataframe to reproduce.
df <- structure(list(Gene = structure(c(147L, 147L, 148L, 148L, 148L,
87L, 87L, 87L, 87L, 87L), .Label = c("genome", "k141_1189_101",
"k141_1189_104", "k141_1189_105", "k141_1189_116", "k141_1189_13",
"k141_1189_14", "k141_1189_146", "k141_1189_150", "k141_1189_18",
"k141_1189_190", "k141_1189_194", "k141_1189_215", "k141_1189_248",
"k141_1189_251", "k141_1189_252", "k141_1189_259", "k141_1189_274",
"k141_1189_283", "k141_1189_308", "k141_1189_314", "k141_1189_322",
"k141_1189_353", "k141_1189_356", "k141_1189_372", "k141_1189_373",
"k141_1189_43", "k141_1189_45", "k141_1189_72", "k141_1597_15",
"k141_1597_18", "k141_1597_23", "k141_1597_41", "k141_1597_55",
"k141_1597_66", "k141_1597_67", "k141_1597_68", "k141_1597_69",
"k141_2409_34", "k141_2409_8", "k141_3390_69", "k141_3390_83",
"k141_3390_84", "k141_3726_25", "k141_3726_31", "k141_3726_49",
"k141_3726_50", "k141_3726_62", "k141_3726_8", "k141_3726_80",
"k141_3790_1", "k141_3993_114", "k141_3993_122", "k141_3993_162",
"k141_3993_172", "k141_3993_183", "k141_3993_186", "k141_3993_188",
"k141_3993_24", "k141_3993_25", "k141_3993_28", "k141_3993_32",
"k141_3993_44", "k141_3993_47", "k141_3993_53", "k141_3993_57",
"k141_3993_68", "k141_4255_80", "k141_4255_81", "k141_4255_87",
"k141_5079_107", "k141_5079_110", "k141_5079_130", "k141_5079_14",
"k141_5079_141", "k141_5079_16", "k141_5079_184", "k141_5079_185",
"k141_5079_202", "k141_5079_24", "k141_5079_39", "k141_5079_63",
"k141_5079_65", "k141_5079_70", "k141_5079_77", "k141_5079_87",
"k141_5079_9", "k141_5313_16", "k141_5313_17", "k141_5313_20",
"k141_5313_23", "k141_5313_39", "k141_5313_5", "k141_5313_51",
"k141_5313_52", "k141_5313_78", "k141_5545_101", "k141_5545_103",
"k141_5545_104", "k141_5545_105", "k141_5545_106", "k141_5545_107",
"k141_5545_108", "k141_5545_109", "k141_5545_110", "k141_5545_111",
"k141_5545_112", "k141_5545_113", "k141_5545_114", "k141_5545_119",
"k141_5545_128", "k141_5545_130", "k141_5545_139", "k141_5545_141",
"k141_5545_145", "k141_5545_16", "k141_5545_169", "k141_5545_17",
"k141_5545_172", "k141_5545_6", "k141_5545_60", "k141_5545_62",
"k141_5545_63", "k141_5545_86", "k141_5545_87", "k141_5545_88",
"k141_5545_89", "k141_5545_91", "k141_5545_92", "k141_5545_93",
"k141_5545_94", "k141_5545_96", "k141_5545_97", "k141_5545_98",
"k141_5545_99", "k141_5734_13", "k141_5734_2", "k141_5734_4",
"k141_5734_5", "k141_5734_6", "k141_6014_124", "k141_6014_2",
"k141_6014_34", "k141_6014_75", "k141_6014_96", "k141_908_14",
"k141_908_2", "k141_908_5", "k141_957_126", "k141_957_135", "k141_957_136",
"k141_957_14", "k141_957_140", "k141_957_141", "k141_957_148",
"k141_957_179", "k141_957_191", "k141_957_35", "k141_957_47",
"k141_957_55", "k141_957_57", "k141_957_59", "k141_957_6", "k141_957_63",
"k141_957_65", "k141_957_68", "k141_957_77", "k141_957_95"), class = "factor"),
depth = c(9L, 10L, 9L, 10L, 11L, 14L, 15L, 16L, 17L, 18L),
bases_covered = c(6L, 3L, 4L, 7L, 4L, 59L, 54L, 70L, 34L,
17L), gene_length = c(1140L, 1140L, 591L, 591L, 591L, 690L,
690L, 690L, 690L, 690L), regioncoverage = c(54L, 30L, 36L,
70L, 44L, 826L, 810L, 1120L, 578L, 306L)), .Names = c("Gene",
"depth", "bases_covered", "gene_length", "regioncoverage"), row.names = c(1L,
2L, 33L, 34L, 35L, 78L, 79L, 80L, 81L, 82L), class = "data.frame")
The dataframe looks like this:
Gene depth bases_covered gene_length regioncoverage
1 k141_908_2 9 6 1140 54
2 k141_908_2 10 3 1140 30
33 k141_908_5 9 4 591 36
34 k141_908_5 10 7 591 70
35 k141_908_5 11 4 591 44
78 k141_5079_9 14 59 690 826
79 k141_5079_9 15 54 690 810
80 k141_5079_9 16 70 690 1120
81 k141_5079_9 17 34 690 578
82 k141_5079_9 18 17 690 306
What I want is, for each Gene (e.g. k141_908_2), to sum regioncoverage and divide by unique(gene_length). In fact, gene_length is always the same value for each gene.
For example, for Gene k141_908_2 I would do: (54+30)/1140 = 0.07
For example, for Gene k141_908_5 I would do: (36+70+44)/591 = 0.25
The final dataframe should report two columns.
Gene Newcoverage
1 k141_908_2 0.07
2 k141_908_5 0.25
3 ......
and so on .
Thanks for your help

This is straightforward with dplyr:
library(dplyr)
# Collapse to one row per Gene: total region coverage divided by the gene's
# length (taken from the first row of each group).
df_final <- summarize(
  group_by(df, Gene),
  Newcoverage = sum(regioncoverage) / first(gene_length)
)
df_final
# # A tibble: 3 × 2
# Gene Newcoverage
# <fctr> <dbl>
# 1 k141_5079_9 5.27536232
# 2 k141_908_2 0.07368421
# 3 k141_908_5 0.25380711

I needed to set the first column to character and others to numeric. But after that you can just split the df by gene and then do the necessary calculations.
# Coerce the measurement columns to numeric and Gene to character before
# splitting the data frame by gene.
df[, 2:5] <- lapply(df[, 2:5], as.numeric)
df$Gene <- as.character(df$Gene)
# For each per-gene subset: column 5 is regioncoverage, column 4 is
# gene_length (its first value is used as the divisor). vapply() pins the
# result to exactly one double per gene, so the return type cannot silently
# change with the input the way it can with sapply().
vapply(
  split(df, df$Gene),
  function(x) sum(x[, 5] / x[1, 4]),
  numeric(1)
)
#k141_5079_9 k141_908_2 k141_908_5
# 5.27536232 0.07368421 0.25380711

We can use tidyverse
library(tidyverse)
# One row per Gene: summed regioncoverage over the first gene_length value.
summarise(
  group_by(df, Gene),
  Newcoverage = sum(regioncoverage) / gene_length[1]
)
# A tibble: 3 × 2
# Gene Newcoverage
# <fctr> <dbl>
#1 k141_5079_9 5.27536232
#2 k141_908_2 0.07368421
#3 k141_908_5 0.25380711
Or a base R option is
# Base R: split the gene_length / regioncoverage columns by Gene and, for each
# piece, divide the summed coverage by the first gene_length value.
by(
  df[c("gene_length", "regioncoverage")],
  list(as.character(df[["Gene"]])),
  FUN = function(x) sum(x[["regioncoverage"]]) / x[["gene_length"]][1]
)

A quick approach is:
# library() errors immediately if data.table is missing, whereas require()
# only returns FALSE and lets the later calls fail with a confusing message.
library(data.table)
# setDT() converts df to a data.table by reference, so DT and df refer to the
# same object afterwards.
DT <- setDT(df)
# sum(regioncoverage) / gene_length yields one value per row of the group;
# unique() collapses them to a single row when gene_length is constant per Gene.
DT[, .(New_Coverage = unique(sum(regioncoverage) / gene_length)), by = .(Gene)]
output
Gene New_Coverage
1: k141_908_2 0.07368421
2: k141_908_5 0.25380711
3: k141_5079_9 5.27536232

I use dplyr a lot. So here's one way:
library(dplyr)
# Keep every original row and attach the per-Gene coverage to each of them.
mutate(
  group_by(df, Gene),
  Newcoverage = sum(regioncoverage) / unique(gene_length)
)
If you want only unique values per Gene:
# transmute() keeps just the new column (plus the grouping variable Gene);
# unique() then drops the repeated rows within each Gene.
unique(
  transmute(
    group_by(df, Gene),
    Newcoverage = sum(regioncoverage) / unique(gene_length)
  )
)

Related

Rowwise proportion test and add p value as new column

My data:
c5 =structure(list(comorbid = c("heart", "ihd", "cabg", "angio",
"cerebrovasc", "diabetes", "pvd", "amputation", "liver", "malig",
"smoke", "ulcers"), AVF_Y = c(626L, 355L, 266L, 92L, 320L, 1175L,
199L, 89L, 75L, 450L, 901L, 114L), AVG_Y = c(54L, 14L, 18L, 5L,
21L, 37L, 5L, 7L, 5L, 29L, 33L, 3L), AVF_tot = c(2755L, 1768L,
2770L, 2831L, 2844L, 2877L, 1745L, 2823L, 2831L, 2823L, 2798L,
2829L), AVG_tot = c(161L, 61L, 161L, 165L, 166L, 167L, 61L, 165L,
165L, 165L, 159L, 164L)), row.names = c(NA, -12L), class = "data.frame")
I want to perform a prop.test for each row ( a two-proportions z-test) and add the p value as a new column.
I've tried using the following code, but this gives me 24 1-sample proportions test results instead of 12 2-sample test for equality of proportions.
Map(prop.test, x = c(c5$AVF_Y, c5$AVG_Y), n = c(c5$AVF_tot, c5$AVG_tot))
Use a lambda function and extract the p-value. When we concatenate the columns, the result is a single vector whose length is 2 times the number of rows of the data. We would need to concatenate within the loop to create a vector of length 2 for each x and n from the corresponding '_Y' and '_tot' columns.
# Two-sample prop.test per row: successes and totals are paired up inside the
# function so each call receives vectors of length 2.
mapply(
  function(y_avf, y_avg, tot_avf, tot_avg) {
    prop.test(c(y_avf, y_avg), c(tot_avf, tot_avg))$p.value
  },
  c5$AVF_Y, c5$AVG_Y, c5$AVF_tot, c5$AVG_tot
)
-output
[1] 2.218376e-03 6.985883e-01 6.026012e-01 1.000000e+00 6.695440e-01 2.425781e-06 5.672322e-01 5.861097e-01 9.627050e-01 6.546286e-01 3.360300e-03 2.276857e-01
Or use do.call with Map or mapply
# Build the mapply() argument list programmatically: FUN first, followed by
# the four count columns of c5 (names stripped so they are matched by position).
do.call(
  mapply,
  c(
    list(FUN = function(a, b, n_a, n_b) {
      prop.test(c(a, b), c(n_a, n_b))$p.value
    }),
    unname(c5[-1])
  )
)
[1] 2.218376e-03 6.985883e-01 6.026012e-01 1.000000e+00 6.695440e-01 2.425781e-06 5.672322e-01 5.861097e-01 9.627050e-01 6.546286e-01 3.360300e-03 2.276857e-01
Or with apply
# Row-wise two-sample test: the first two values of each row are passed as the
# success counts and the last two as the totals.
apply(
  c5[-1],
  1,
  function(counts) prop.test(x = counts[1:2], n = counts[3:4])$p.value
)
[1] 2.218376e-03 6.985883e-01 6.026012e-01 1.000000e+00 6.695440e-01 2.425781e-06 5.672322e-01 5.861097e-01 9.627050e-01 6.546286e-01 3.360300e-03 2.276857e-01
Or use rowwise
library(dplyr)
# Evaluate prop.test once per row and store its p-value in a new column.
ungroup(
  mutate(
    rowwise(c5),
    pval = prop.test(
      c(AVF_Y, AVG_Y),
      n = c(AVF_tot, AVG_tot)
    )$p.value
  )
)
-output
# A tibble: 12 × 6
comorbid AVF_Y AVG_Y AVF_tot AVG_tot pval
<chr> <int> <int> <int> <int> <dbl>
1 heart 626 54 2755 161 0.00222
2 ihd 355 14 1768 61 0.699
3 cabg 266 18 2770 161 0.603
4 angio 92 5 2831 165 1.00
5 cerebrovasc 320 21 2844 166 0.670
6 diabetes 1175 37 2877 167 0.00000243
7 pvd 199 5 1745 61 0.567
8 amputation 89 7 2823 165 0.586
9 liver 75 5 2831 165 0.963
10 malig 450 29 2823 165 0.655
11 smoke 901 33 2798 159 0.00336
12 ulcers 114 3 2829 164 0.228

Drop observations if there are inconsistent variables within same ID [duplicate]

This question already has answers here:
Select groups based on number of unique / distinct values
(4 answers)
Closed 7 months ago.
df <- structure(list(id = c(123L, 123L, 123L, 45L, 45L, 9L, 103L, 103L,
22L, 22L, 22L), age = c(69L, 23L, 70L, 29L, 29L, 37L, 25L, 54L,
40L, 40L, 41L)), class = "data.frame", row.names = c(NA, -11L
))
id age
1 123 69
2 123 23
3 123 70
4 45 29
5 45 29
6 9 37
7 103 25
8 103 54
9 22 40
10 22 40
11 22 41
I would like to drop all observations for an id if it is associated with different values for age. How can I do that?
I would be left with:
id age
45 29
45 29
9 37
A dplyr approach:
library(dplyr)
# Keep only the ids whose rows all share a single age value. The input is the
# question's data frame `df`; ungroup() returns a plain (ungrouped) result so
# later verbs do not silently operate per id.
df |>
  group_by(id) |>
  filter(n_distinct(age) == 1) |>
  ungroup()
Without external packages, you could use ave():
# ave() returns, for every row, the number of distinct ages within that row's
# id; only rows whose id has exactly one distinct age are kept.
subset(
  df,
  ave(age, id, FUN = function(v) length(unique(v))) == 1
)
# id age
# 4 45 29
# 5 45 29
# 6 9 37

Assign value in vector based on presence in another vector in R?

I have tried to look for a similar question and I'm sure other people have encountered this problem, but I still couldn't find something that helped me. I have a dataset1 with 37.000 observations like this:
id hours
130 12
165 56
250 13
11 15
17 42
and another dataset2 with 38.000 observations like this:
id hours
130 6
165 23
250 9
11 14
17 11
I want to do the following: if an id of dataset1 is in dataset2, the hours of dataset1 should override the hours of dataset2. For the ids that are in dataset1 but not in dataset2, the value for dataset2$hours should be NA.
I tried the %in% operator, ifelse(), a loop, and some base R commands but I can't figure it out. I always get the error that the vectors don't have the same length.
Thanks for any help!
You can replace hours with NAs for id that don't match between df1 and df2. Since both your data sets had the same values for ids, I added one row in df1 with id = 123 and hours = 12.
# Blank out hours for every df1 id that has no counterpart in df2.
df1$hours[!(df1$id %in% df2$id)] <- NA
df1
id hours
1 130 12
2 165 56
3 250 13
4 11 15
5 17 42
6 123 NA
data
# Input data before the replacement step: id 123 (absent from df2) starts with
# hours = 12, matching the printed table that follows.
df1 <- structure(list(id = c(130L, 165L, 250L, 11L, 17L, 123L), hours = c(12L,
56L, 13L, 15L, 42L, 12L)), row.names = c(NA, -6L), class = "data.frame")
id hours
1 130 12
2 165 56
3 250 13
4 11 15
5 17 42
6 123 12
df2 <- structure(list(id = c(130L, 165L, 250L, 11L, 17L), hours = c(6L,
23L, 9L, 14L, 11L)), class = "data.frame", row.names = c(NA,
-5L))
First match ID's of replacement data with ID's of original data while using na.omit() for the case when replacement ID's are not contained in original data. Replace with replacement data whose ID's are in original ID's.
I expanded both data sets to fabricate cases with no matches.
dat1
# id hours
# 1 130 12
# 2 165 56
# 3 250 13
# 4 11 15
# 5 17 42
# 6 12 232
# 7 35 456
dat2
# id hours
# 1 11 14
# 2 17 11
# 3 165 23
# 4 999 99
# 5 130 6
# 6 250 9
Replacement
# Rows of dat1 whose id occurs in dat2 receive the matching dat2 hours;
# dat2 ids that are missing from dat1 are skipped on both sides.
dat1$hours[na.omit(match(dat2$id, dat1$id))] <-
  dat2$hours[dat2$id %in% dat1$id]
dat1
# id hours
# 1 130 6
# 2 165 23
# 3 250 9
# 4 11 14
# 5 17 11
# 6 12 232
# 7 35 456
Data:
dat1 <- structure(list(id = c(130L, 165L, 250L, 11L, 17L, 12L, 35L),
hours = c(12L, 56L, 13L, 15L, 42L, 232L, 456L)), class = "data.frame", row.names = c(NA,
-7L))
dat2 <- structure(list(id = c(11L, 17L, 165L, 999L, 130L, 250L), hours = c(14L,
11L, 23L, 99L, 6L, 9L)), class = "data.frame", row.names = c(NA,
-6L))

Is there a way to complete or expand an interval factor variable [duplicate]

This question already has answers here:
Complete dataframe with missing combinations of values
(2 answers)
Closed 2 years ago.
I have a data frame/tibble that includes a factor variable of bins. There are missing bins because the original data did not include an observation in those 5-year ranges. Is there a way to easily complete the series without having to deconstruct the interval?
Here's a sample df.
library(tibble)
df <- structure(list(bin = structure(c(1L, 3L, 5L, 6L, 7L, 8L, 9L,
10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L), .Label = c("[1940,1945]",
"(1945,1950]", "(1950,1955]", "(1955,1960]", "(1960,1965]", "(1965,1970]",
"(1970,1975]", "(1975,1980]", "(1980,1985]", "(1985,1990]", "(1990,1995]",
"(1995,2000]", "(2000,2005]", "(2005,2010]", "(2010,2015]", "(2015,2020]",
"(2020,2025]"), class = "factor"), Values = c(2L, 4L, 14L, 11L,
8L, 26L, 30L, 87L, 107L, 290L, 526L, 299L, 166L, 502L, 8L)), row.names = c(NA,
-15L), class = c("tbl_df", "tbl", "data.frame"))
df
# A tibble: 15 x 2
bin Values
<fct> <int>
1 [1940,1945] 2
2 (1950,1955] 4
3 (1960,1965] 14
4 (1965,1970] 11
5 (1970,1975] 8
6 (1975,1980] 26
7 (1980,1985] 30
8 (1985,1990] 87
9 (1990,1995] 107
10 (1995,2000] 290
11 (2000,2005] 526
12 (2005,2010] 299
13 (2010,2015] 166
14 (2015,2020] 502
15 (2020,2025] 8
I would like to add the missing (1945,1950] and (1955,1960] bins.
`bin` already has the levels that you want, so you can use complete on your df as:
# `bin` is already a factor carrying every interval as a level, so pass the
# column itself: complete() expands a factor to its full set of levels and
# keeps both the factor type and the chronological level order. Passing
# levels(bin) instead would turn the column into character and sort the bins
# alphabetically (putting "[1940,1945]" last). The integer fill 0L keeps
# Values an integer column.
tidyr::complete(df, bin, fill = list(Values = 0L))
# A tibble: 17 x 2
#   bin         Values
#   <fct>        <int>
# 1 [1940,1945]      2
# 2 (1945,1950]      0
# 3 (1950,1955]      4
# 4 (1955,1960]      0
# 5 (1960,1965]     14
# 6 (1965,1970]     11
# 7 (1970,1975]      8
# 8 (1975,1980]     26
# 9 (1980,1985]     30
#10 (1985,1990]     87
#11 (1990,1995]    107
#12 (1995,2000]    290
#13 (2000,2005]    526
#14 (2005,2010]    299
#15 (2010,2015]    166
#16 (2015,2020]    502
#17 (2020,2025]      8
# Alternative: rebuild the bins from the raw data, count rows per bin, then
# join the counts onto the full list of bin levels and zero-fill the gaps.
# NOTE(review): `orig_df` and its `Year` column are not shown here; presumably
# this is the un-binned source data — confirm before running.
library(dplyr)
library(ggplot2) # cut_width()
library(tidyr)   # replace_na()

df <- orig_df %>%
  mutate(bin = cut_width(Year, width = 5, center = 2.5))
df2 <- df %>%
  group_by(bin) %>%
  summarize(Values = n()) %>%
  ungroup()
# Name the join key explicitly instead of relying on dplyr's guess.
tibble(bin = levels(df$bin)) %>%
  left_join(df2, by = "bin") %>%
  replace_na(list(Values = 0))

Aggregating monthly data to quarterly (averages)

I have data, which has multiple monthly variables. I would like to aggregate these variables to quarterly level. My initial data is:
Time A B C D . . . . . K
Jan-2004 42 57 53 28
Feb-2004 40 78 56 28
Mar-2004 68 77 53 20
Apr-2004 97 96 80 16
May-2004 84 93 76 17
Jun-2004 57 100 100 21
Jul-2004 62 100 79 22
.
.
.
.
N
So the goal is to calculate quarters as monthly averages ((jan + feb + mar) / 3). In other words, the goal is to end up with data like this:
Time A B C D . . . . . K
2004Q1 50,0 70,7 54,0 25,3
2004Q2 79,3 96,3 85,3 18,0
2004Q3
.
.
.
N
Could anyone help me with this problem?
Thank you very much.
An option would be to convert the 'Time' to yearqtr class with as.yearqtr from zoo and do a summarise_all
library(zoo)
library(dplyr)
# Turn each "Mon-YYYY" label into its "YYYYQq" quarter, group on it, and
# average every remaining column.
summarise_all(
  group_by(df1, Time = format(as.yearqtr(Time, "%b-%Y"), "%YQ%q")),
  mean
)
# A tibble: 3 x 5
# Time A B C D
# <chr> <dbl> <dbl> <dbl> <dbl>
#1 2004Q1 50 70.7 54 25.3
#2 2004Q2 79.3 96.3 85.3 18
#3 2004Q3 62 100 79 22
data
df1 <- structure(list(Time = c("Jan-2004", "Feb-2004", "Mar-2004", "Apr-2004",
"May-2004", "Jun-2004", "Jul-2004"), A = c(42L, 40L, 68L, 97L,
84L, 57L, 62L), B = c(57L, 78L, 77L, 96L, 93L, 100L, 100L), C = c(53L,
56L, 53L, 80L, 76L, 100L, 79L), D = c(28L, 28L, 20L, 16L, 17L,
21L, 22L)), class = "data.frame", row.names = c(NA, -7L))
data.table has a quarter function, so you can do:
library(data.table)
# Convert my_data to a data.table by reference.
setDT(my_data)
# Average the columns in .SD within each (year, quarter) pair derived from Time.
# NOTE(review): year()/quarter() presumably need Time to be a Date-like value,
# not a "Jan-2004" string — confirm and convert first if necessary.
my_data[ , lapply(.SD, mean), by = .(year = year(Time), quarter = quarter(Time))]
This is the gist of it. Getting it to work exactly would require a reproducible example.

Resources