Comparing columns from different rows in R

In R using dplyr I'm struggling to accumulate two columns through a sequence.
What I'd like to do:
Within each Outlet, I'm trying to calculate the cumulative DFLSEcr (cumu_DFLSEcr) and the cumulative count (cumu_count) for each row, based on a sequence defined by ZHYD and NextDown. Each row has a value for NextDown which corresponds to the row that comes before it, identified by a matching ZHYD. This makes a sequence along which DFLSEcr and count accumulate. Where Exutoire == 1, then cumu_DFLSEcr == 0 and cumu_count == 0. If DFLSEcr == 1 or NA, then don't include it in the sum. I've used lag(), but I don't think this is correct...
Input:
input <- structure(list(ZHYD = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L,
10L, 8L, 9L, 11L), .Label = c("B020006183", "B020006184", "B020006185",
"B020006190", "B020006199", "B020006212", "B020006228", "B020006278",
"B020006285", "B020006290", "B020006325"), class = "factor"),
Outlet = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 3L), .Label = c("BSO0001727", "BSO0001746", "BSO0001756"
), class = "factor"), NextDown = structure(c(1L, 1L, 2L,
2L, 3L, 3L, NA, NA, 4L, 4L, 5L), .Label = c("B020006190",
"B020006199", "B020006228", "B020006290", "B020006335"), class = "factor"),
count = c(15L, 55L, 42L, 19L, 32L, 6L, 19L, 49L, 4L, 82L,
5L), DFLSEcr = c(0.07, 0.02, 0.02, 0.05, 0.03, 0.17, 0.05,
0.02, 0.25, 0.01, NA), Exutoire = c(0L, 0L, 0L, 0L, 0L, 0L,
1L, 1L, 0L, 0L, 0L)), class = "data.frame", row.names = c(NA,
-11L), .Names = c("ZHYD", "Outlet", "NextDown", "count", "DFLSEcr",
"Exutoire"))
Method tried so far:
input %>%
  group_by(Outlet) %>%
  select(ZHYD, NextDown, Outlet, Exutoire, count, DFLSEcr) %>%
  mutate(cleanDFLSEcr = replace(DFLSEcr, DFLSEcr == 1, 0),
         cleanDFLSEcr = replace(cleanDFLSEcr, is.na(cleanDFLSEcr), 0),
         cumu_DFLSEcr = if_else(Exutoire == 1, 0, cumsum(cleanDFLSEcr)) %>% lag(1, default = 0),
         cumu_count = if_else(Exutoire == 1, 0, as.numeric(cumsum(count))) %>% lag(1, default = 0)) %>%
  select(-cleanDFLSEcr)
Desired output:
ZHYD Outlet NextDown count DFLSEcr Exutoire cumu_count cumu_DFLSEcr
<fct> <fct> <fct> <int> <dbl> <int> <int> <dbl>
1 B020006183 BSO0001727 B020006190 15 0.0700 0 70 0.130
2 B020006184 BSO0001727 B020006190 55 0.0200 0 70 0.130
3 B020006185 BSO0001727 B020006199 42 0.0200 0 51 0.0800
4 B020006190 BSO0001727 B020006199 19 0.0500 0 51 0.0800
5 B020006199 BSO0001727 B020006228 32 0.0300 0 19 0.0500
6 B020006212 BSO0001727 B020006228 6 0.170 0 19 0.0500
7 B020006228 BSO0001727 <NA> 19 0.0500 1 0 0.
8 B020006290 BSO0001746 <NA> 49 0.0200 1 0 0.
9 B020006278 BSO0001746 B020006290 4 0.250 0 1 0.0200
10 B020006285 BSO0001746 B020006290 82 0.0100 0 1 0.0200
11 B020006325 BSO0001756 B020006335 5 NA 0 1 0.0200
cumu_count and cumu_DFLSEcr are the same for some rows because those rows share the same NextDown.
The final row in the sequence (the row itself) shouldn't be included in cumu_count, so for the first row cumu_count == 19 + 32 + 19 == 70.
Edit
It turns out I needed igraph, as this is a routing problem.
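For anyone landing here with the same problem, here is a minimal igraph sketch of that routing idea, assuming each ZHYD -> NextDown pair is a directed edge toward the outlet, so a row's cumulative values are sums over everything strictly downstream of it. downstream_sum is a hypothetical helper, and the treatment of Exutoire and of DFLSEcr == 1/NA follows my reading of the rules above:
library(igraph)
library(dplyr)

# Directed edges: each ZHYD flows to its NextDown; outlet rows have no NextDown
edges <- input %>%
  filter(!is.na(NextDown)) %>%
  transmute(from = as.character(ZHYD), to = as.character(NextDown))
g <- graph_from_data_frame(edges)

# Named per-node values; DFLSEcr values of 1 or NA are excluded from the sums
counts <- setNames(as.numeric(input$count), as.character(input$ZHYD))
dfl    <- setNames(ifelse(is.na(input$DFLSEcr) | input$DFLSEcr == 1, 0, input$DFLSEcr),
                   as.character(input$ZHYD))

# Hypothetical helper: sum a named value over every node strictly downstream of zhyd
downstream_sum <- function(zhyd, values) {
  if (!zhyd %in% V(g)$name) return(0)
  down <- setdiff(names(subcomponent(g, zhyd, mode = "out")), zhyd)
  sum(values[down], na.rm = TRUE)  # nodes outside the input (e.g. B020006335) drop out as NA
}

out <- input %>%
  mutate(cumu_count   = ifelse(Exutoire == 1, 0,
                               vapply(as.character(ZHYD), downstream_sum, numeric(1), values = counts)),
         cumu_DFLSEcr = ifelse(Exutoire == 1, 0,
                               vapply(as.character(ZHYD), downstream_sum, numeric(1), values = dfl)))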

OK, your problem is not simple: you have nested data, which is a bit complicated to manipulate.
Here is one answer. There are surely better ways to do it, but it may give you some ideas.
library(dplyr)
library(tidyr)
father_son_table <- select(input, actual = ZHYD, father = NextDown)
sequences <- rename(input, actual = ZHYD, father = NextDown) %>%
  left_join(father_son_table, by = c("father" = "actual"), suffix = c(".1", ".2")) %>%
  left_join(father_son_table, by = c("father.2" = "actual")) %>%
  # the second join's new column keeps the name "father" (no name clash, so no
  # suffix is applied); rename it so the case_when below can match it
  rename(father.3 = father) %>%
  tibble::rowid_to_column(var = "sequence_number")
table_order <- sequences %>%
  select(-count, -DFLSEcr, -Exutoire, -Outlet) %>%
  gather(key = height, value = node, -sequence_number) %>%
  mutate(order = case_when(height == "actual"   ~ 0,
                           height == "father.1" ~ 1,
                           height == "father.2" ~ 2,
                           height == "father.3" ~ 3)) %>%
  na.omit() %>%
  select(sequence_number, node, order)
result <- left_join(table_order, input, by = c("node" = "ZHYD")) %>%
  arrange(sequence_number, order) %>%
  group_by(sequence_number, Outlet) %>%
  mutate(cumu_count = sum(count) - count,
         cumu_DFLSE_cr = sum(DFLSEcr) - DFLSEcr) %>%
  filter(order == 0)

Related

How can I create my own factor column in a dataframe?

I have a dataframe and this task: "Define your own criterion of income level, and split the data according to levels of this criterion."
dput(head(creditcard))
structure(list(card = structure(c(2L, 2L, 2L, 2L, 2L, 2L), levels = c("no",
"yes"), class = "factor"), reports = c(0L, 0L, 0L, 0L, 0L, 0L
), age = c(37.66667, 33.25, 33.66667, 30.5, 32.16667, 23.25),
income = c(4.52, 2.42, 4.5, 2.54, 9.7867, 2.5), share = c(0.03326991,
0.005216942, 0.004155556, 0.06521378, 0.06705059, 0.0444384
), expenditure = c(124.9833, 9.854167, 15, 137.8692, 546.5033,
91.99667), owner = structure(c(2L, 1L, 2L, 1L, 2L, 1L), levels = c("no",
"yes"), class = "factor"), selfemp = structure(c(1L, 1L,
1L, 1L, 1L, 1L), levels = c("no", "yes"), class = "factor"),
dependents = c(3L, 3L, 4L, 0L, 2L, 0L), days = c(54L, 34L,
58L, 25L, 64L, 54L), majorcards = c(1L, 1L, 1L, 1L, 1L, 1L
), active = c(12L, 13L, 5L, 7L, 5L, 1L), income_fam = c(1.13,
0.605, 0.9, 2.54, 3.26223333333333, 2.5)), row.names = c("1",
"2", "3", "4", "5", "6"), class = "data.frame")
I defined this criterion in this way:
inc_l <- c("low", "average", "above average", "high")
grad_fact <- function(x){
  ifelse(x >= 10, 'high',
         ifelse(x > 6 && x < 10, 'above average',
                ifelse(x >= 3 && x <= 6, 'average',
                       ifelse(x < 3, 'low'))))
}
And added a column like this
creditcard <- transform(creditcard, incom_level = factor(sapply(creditcard$income, grad_fact), inc_l, ordered = TRUE))
But I need to avoid sapply for this, so I tried to do it this way:
creditcard<-transform(creditcard, incom_level=factor(grad_fact(creditcard$income),inc_l, ordered = TRUE))
But in this case all the elements of the column take the value "average", and I don't understand why. Please help me figure out the problem.
We may need to change the && to &, as && will return a single TRUE/FALSE. According to ?"&&":
& and && indicate logical AND and | and || indicate logical OR. The shorter forms perform elementwise comparisons in much the same way as arithmetic operators. The longer forms evaluate left to right, proceeding only until the result is determined. The longer form is appropriate for programming control-flow and typically preferred in if clauses.
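A quick illustration of the difference (in R >= 4.3, && on vectors longer than one is an error; older versions silently compared only the first elements, which is why the column collapsed to "average" above):
x <- c(2, 7, 12)
x > 6 & x < 10    # elementwise: FALSE  TRUE FALSE
x > 6 && x < 10   # error in R >= 4.3; older R returned a single FALSE, taken from x[1]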
In addition, the last ifelse didn't have a 'no' case:
grad_fact <- function(x){
  ifelse(x >= 10, 'high',
         ifelse(x > 6 & x < 10, 'above average',
                ifelse(x >= 3 & x <= 6, 'average',
                       ifelse(x < 3, 'low', NA_character_))))
}
and then use
creditcard <- transform(creditcard,
                        incom_level = factor(grad_fact(income), inc_l, ordered = TRUE))
Output
creditcard
card reports age income share expenditure owner selfemp dependents days majorcards active income_fam incom_level
1 yes 0 37.66667 4.5200 0.033269910 124.983300 yes no 3 54 1 12 1.130000 average
2 yes 0 33.25000 2.4200 0.005216942 9.854167 no no 3 34 1 13 0.605000 low
3 yes 0 33.66667 4.5000 0.004155556 15.000000 yes no 4 58 1 5 0.900000 average
4 yes 0 30.50000 2.5400 0.065213780 137.869200 no no 0 25 1 7 2.540000 low
5 yes 0 32.16667 9.7867 0.067050590 546.503300 yes no 2 64 1 5 3.262233 above average
6 yes 0 23.25000 2.5000 0.044438400 91.996670 no no 0 54 1 1 2.500000 low

Find the "top N" in a group and find the average of the "top N" in R

Rank Laps Average_Time
1 1 1 30
2 2 1 34
3 3 1 35
4 1 2 32
5 2 2 33
6 3 2 56
7 4 1 43
8 5 1 23
9 6 1 31
10 4 2 23
11 5 2 88
12 6 2 54
I would like to know how I can group ranks 1-3 and ranks 4-6 and get an average of the "average time" for each lap. Also, I would like this to extend if I have groups 7-9, 10-13, etc.
One option is to use cut to put the different ranks into groups, and add Laps as a grouping variable. Then, you can summarize the data to get the mean.
library(tidyverse)
df %>%
  group_by(gr = cut(Rank, breaks = seq(0, 6, by = 3)), Laps) %>%
  summarize(avg = mean(Average_Time))
Output
gr Laps avg
<fct> <int> <dbl>
1 (0,3] 1 33
2 (0,3] 2 40.3
3 (3,6] 1 32.3
4 (3,6] 2 55
Or another option if you want the range of ranks displayed for the group:
df %>%
  group_by(gr = cut(Rank, breaks = seq(0, 6, by = 3))) %>%
  mutate(Rank_gr = paste0(min(Rank), "-", max(Rank))) %>%
  group_by(Rank_gr, Laps) %>%
  summarize(avg = mean(Average_Time))
Output
Rank_gr Laps avg
<chr> <int> <dbl>
1 1-3 1 33
2 1-3 2 40.3
3 4-6 1 32.3
4 4-6 2 55
Since you will have uneven groups, you might want to use case_when to make the groups:
df %>%
  group_by(gr = case_when(Rank %in% 1:3   ~ "1-3",
                          Rank %in% 4:6   ~ "4-6",
                          Rank %in% 7:9   ~ "7-9",
                          Rank %in% 10:13 ~ "10-13"),
           Laps) %>%
  summarize(avg = mean(Average_Time))
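If the groups keep a fixed width (three ranks each, as here), integer division is a compact alternative sketch that needs no explicit breaks; truly uneven groups such as 10-13 would still need case_when:
df %>%
  group_by(gr = (Rank - 1) %/% 3, Laps) %>%  # gr 0 covers ranks 1-3, gr 1 covers 4-6, ...
  summarize(avg = mean(Average_Time))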
Data
df <- structure(list(Rank = c(1L, 2L, 3L, 1L, 2L, 3L, 4L, 5L, 6L, 4L,
5L, 6L), Laps = c(1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 2L,
2L), Average_Time = c(30L, 34L, 35L, 32L, 33L, 56L, 43L, 23L,
31L, 23L, 88L, 54L)), class = "data.frame", row.names = c(NA,
-12L))

R: Using qvalue, how do I find the number of SNPs that have FDR adjusted p-values of p < .05?

I want to find the number of SNPs that have FDR adjusted p-values of p < .05. However, my for loop and if statement did not effectively find the number of SNPs with p < .05.
My dataset has a P column which indicates p-value and 1422 observations.
> dput(dat[1:5,])
structure(list(CHR = c(6L, 6L, 6L, 6L, 6L), SNP = c("rs9257319",
"rs2269553", "rs2894066", "rs3763338", "rs1233508"), BP = c(28959616L,
28984488L, 29001906L, 29002290L, 29005612L), A1 = c(2L, 1L, 1L,
1L, 2L), A2 = c(1L, 2L, 2L, 2L, 1L), T = c(6L, 9L, 13L, 4L, 8L
), U = c(7L, 9L, 9L, 3L, 13L), OR = c(0.8571, 1, 1.444, 1.333,
0.6154), L95 = c(0.2881, 0.397, 0.6174, 0.2984, 0.2551), U95 = c(2.55,
2.519, 3.379, 5.957, 1.485), CHISQ = c(0.07692, 0, 0.7273, 0.1429,
1.19), P = c(0.7815, 1, 0.3938, 0.7055, 0.2752)), row.names = c(NA,
5L), class = "data.frame")
I calculated the q-values using the qvalue library.
library(qvalue)
library(dplyr)
fdr <- qvalue(dat$P, fdr.level=0.05)
Finally, I want to find the number of SNPs with FDR adjusted p-values of p<.05.
# SNPs that have FDR adjusted p-values of p < .05
for(i in fdr$qvalues){
  if(i > 0.05){
    fdr[!fdr$qvalues %in% i]
  }
}
I found that there is one q-value > 0.05 and tried to remove it. However, as shown below, the length of fdr$qvalues remains the same, meaning that I did not remove the element with q-value > 0.05.
length(fdr$qvalues)
[1] 1422
library(tidyverse)
# slightly modified p values to see the result
data <- structure(list(CHR = c(6L, 6L, 6L, 6L, 6L), SNP = c("rs9257319",
  "rs2269553", "rs2894066", "rs3763338", "rs1233508"), BP = c(28959616L,
  28984488L, 29001906L, 29002290L, 29005612L), A1 = c(2L, 1L, 1L, 1L, 2L),
  A2 = c(1L, 2L, 2L, 2L, 1L), T = c(6L, 9L, 13L, 4L, 8L),
  U = c(7L, 9L, 9L, 3L, 13L), OR = c(0.8571, 1, 1.444, 1.333, 0.6154),
  L95 = c(0.2881, 0.397, 0.6174, 0.2984, 0.2551),
  U95 = c(2.55, 2.519, 3.379, 5.957, 1.485),
  CHISQ = c(0.07692, 0, 0.7273, 0.1429, 1.19),
  P = c(0.001, 1, 0.3, 0.01, 0.5)), row.names = c(NA, 5L),
  class = "data.frame")
data
#> CHR SNP BP A1 A2 T U OR L95 U95 CHISQ P
#> 1 6 rs9257319 28959616 2 1 6 7 0.8571 0.2881 2.550 0.07692 0.001
#> 2 6 rs2269553 28984488 1 2 9 9 1.0000 0.3970 2.519 0.00000 1.000
#> 3 6 rs2894066 29001906 1 2 13 9 1.4440 0.6174 3.379 0.72730 0.300
#> 4 6 rs3763338 29002290 1 2 4 3 1.3330 0.2984 5.957 0.14290 0.010
#> 5 6 rs1233508 29005612 2 1 8 13 0.6154 0.2551 1.485 1.19000 0.500
data %>%
mutate(q = p.adjust(P, method = "fdr")) %>%
filter(q < 0.05)
#> CHR SNP BP A1 A2 T U OR L95 U95 CHISQ P q
#> 1 6 rs9257319 28959616 2 1 6 7 0.8571 0.2881 2.550 0.07692 0.001 0.005
#> 2 6 rs3763338 29002290 1 2 4 3 1.3330 0.2984 5.957 0.14290 0.010 0.025
data %>%
mutate(q = p.adjust(P, method = "fdr")) %>%
filter(q < 0.05) %>%
count()
#> n
#> 1 2
Created on 2022-02-10 by the reprex package (v2.0.0)
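If you would rather keep the qvalue object built in the question instead of switching to p.adjust, the count can be read off it directly, with no loop (a sketch; as far as I recall, supplying fdr.level makes qvalue() also store a logical significant component):
library(qvalue)
fdr <- qvalue(dat$P, fdr.level = 0.05)
sum(fdr$qvalues < 0.05)    # number of SNPs below the FDR threshold
dat[fdr$qvalues < 0.05, ]  # the significant SNPs themselves
sum(fdr$significant)       # same count, using qvalue's own indicator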

Subsetting a data frame according to recursive rows and creating a column for ordering

Consider the sample data
df <-
structure(
list(
id = c(1L, 1L, 1L, 1L, 2L, 2L, 3L),
A = c(20L, 12L, 13L, 8L, 11L, 21L, 17L),
B = c(1L, 1L, 0L, 0L, 1L, 0L, 0L)
),
.Names = c("id", "A", "B"),
class = "data.frame",
row.names = c(NA,-7L)
)
Each id (stored in column 1) has a varying number of entries for columns A and B. In the example data, there are four observations with id = 1. I am looking for a way to subset this data in R so that there are at most 3 entries for each id, and finally to create another column (labelled C) which contains the order of each row within its id. The expected output would look like:
df <-
structure(
list(
id = c(1L, 1L, 1L, 2L, 2L, 3L),
A = c(20L, 12L, 13L, 11L, 21L, 17L),
B = c(1L, 1L, 0L, 1L, 0L, 0L),
C = c(1L, 2L, 3L, 1L, 2L, 1L)
),
.Names = c("id", "A", "B","C"),
class = "data.frame",
row.names = c(NA,-6L)
)
Your help is much appreciated.
Like this?
library(data.table)
dt <- as.data.table(df)
dt[, C := seq(.N), by = id]
dt <- dt[C <= 3,]
dt
# id A B C
# 1: 1 20 1 1
# 2: 1 12 1 2
# 3: 1 13 0 3
# 4: 2 11 1 1
# 5: 2 21 0 2
# 6: 3 17 0 1
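The same row-number logic can be written with dplyr (a sketch; slice_head() needs dplyr >= 1.0.0):
library(dplyr)
df %>%
  group_by(id) %>%
  mutate(C = row_number()) %>%  # order of each row within its id
  slice_head(n = 3) %>%         # keep at most 3 rows per id
  ungroup()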
Here is one option with dplyr, considering the top 3 values based on A (based on the comments of @Ronak Shah).
library(dplyr)
df %>%
  group_by(id) %>%
  top_n(n = 3, wt = A) %>%                    # top 3 values based on A
  mutate(C = rank(id, ties.method = "first")) # C consists of the order within each id
# A tibble: 6 x 4
# Groups: id [3]
id A B C
<int> <int> <int> <int>
1 1 20 1 1
2 1 12 1 2
3 1 13 0 3
4 2 11 1 1
5 2 21 0 2
6 3 17 0 1

Subsetting a dataframe based on summation of rows of a given column

I am dealing with data with three variables (i.e. id, time, gender). It looks like
df <-
structure(
list(
id = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L),
time = c(21L, 3L, 4L, 9L, 5L, 9L, 10L, 6L, 27L, 3L, 4L, 10L),
gender = c(1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L)
),
.Names = c("id", "time", "gender"),
class = "data.frame",
row.names = c(NA,-12L)
)
That is, each id has four observations for time and gender. I want to subset this data in R based on the cumulative sum of the variable time, keeping rows for each id up to and including the first row at which this sum is greater than or equal to 25. Notice that for id 2 all observations will be included and for id 3 only the first observation is involved. The expected results would look like:
df <-
structure(
list(
id = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L ),
time = c(21L, 3L, 4L, 5L, 9L, 10L, 6L, 27L ),
gender = c(1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L)
),
.Names = c("id", "time", "gender"),
class = "data.frame",
row.names = c(NA,-8L)
)
Any help on this is highly appreciated.
One option is to use the lag of the cumsum:
library(dplyr)
df %>%
  group_by(id, gender) %>%
  filter(lag(cumsum(time), default = 0) < 25)
# # A tibble: 8 x 3
# # Groups: id, gender [3]
# id time gender
# <int> <int> <int>
# 1 1 21 1
# 2 1 3 1
# 3 1 4 1
# 4 2 5 0
# 5 2 9 0
# 6 2 10 0
# 7 2 6 0
# 8 3 27 1
Using data.table (updated based on feedback from @Renu):
library(data.table)
setDT(df)
df[, .SD[shift(cumsum(time), fill = 0) < 25], by = .(id, gender)]
Another option would be to create a logical vector for each 'id', cumsum(time) >= 25, that is TRUE once the cumulative sum of 'time' is equal to or greater than 25.
Then you can filter for rows where the cumsum of this vector is less than or equal to 1, i.e. keep entries up to and including the first TRUE for each 'id'.
df %>%
group_by(id) %>%
filter(cumsum( cumsum(time) >= 25 ) <= 1)
# A tibble: 8 x 3
# Groups: id [3]
# id time gender
# <int> <int> <int>
# 1 1 21 1
# 2 1 3 1
# 3 1 4 1
# 4 2 5 0
# 5 2 9 0
# 6 2 10 0
# 7 2 6 0
# 8 3 27 1
You can try this dplyr construction:
dt <- group_by(df, id) %>%
  # sum time within groups
  mutate(sum_time = cumsum(time)) %>%
  # keep rows until the running total first reaches 25
  filter(lag(sum_time, default = 0) < 25) %>%
  # exclude the sum_time column from the result
  select(-sum_time)
