Create new column from conditions on multiple columns in R

What I'm trying to write would be written with the apply function in Python:
def categorise(row):
    if row['colC'] > 0 and row['colC'] <= 99:
        return 'A'
    elif row['colC'] > 100 and row['colC'] <= 199:
        return 'B'
    elif row['colC'] > 200 and row['colC'] <= 299:
        return 'C'
    return 'D'

df['colF'] = df.apply(lambda row: categorise(row), axis=1)
This is the R code I have at the moment:
myf <- function(x) {
  count <- 0
  if (x[, "BMICat"] == 4) {
    count <- count + 1
  }
  if (x[, "SleepTimeCat"] == 1 | x[, "SleepTimeCat"] == 4) {
    count <- count + 1
  }
  if (x[, "MentalHealthCat"] == 3) {
    count <- count + 1
  }
  if (x[, "Smoking"] == TRUE) {
    count <- count + 1
  }
  if (x[, "PhysicalActivity"] == FALSE) {
    count <- count + 1
  }
  return(count)
}

dfAugment %>%
  mutate(BadHabits = myf(.))
I often get stuck trying to apply this pattern in R. Is my approach not common in R?

If I understand your question correctly, a possible solution is to create dummy variables and then add them together.
library(dplyr)
dfAugment <- data.frame(BMICat = c(1, 2, 4, 4),
                        SleepTimeCat = c(1, 2, 3, 4))

dfAugment |>
  mutate(risk_sum = if_else(BMICat == 4, 1, 0) +
           if_else(SleepTimeCat == 1 | SleepTimeCat == 4, 1, 0))
Output
#>   BMICat SleepTimeCat risk_sum
#> 1      1            1        1
#> 2      2            2        0
#> 3      4            3        1
#> 4      4            4        2
Created on 2022-06-22 by the reprex package (v2.0.1)
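If you need all five conditions from the question, the same idea scales without writing one if_else() per condition, because logical comparisons coerce to 1/0 when summed. A sketch (an addition here, not part of the original answer), assuming the column names referenced in the question's myf():

library(dplyr)

# Column names below are taken from the question's myf(); each comparison
# yields TRUE/FALSE, which sum as 1/0.
dfAugment |>
  mutate(BadHabits = (BMICat == 4) +
                     (SleepTimeCat == 1 | SleepTimeCat == 4) +
                     (MentalHealthCat == 3) +
                     (Smoking == TRUE) +
                     (PhysicalActivity == FALSE))

Unlike myf(), every term here is vectorised, so mutate() computes the count for all rows at once.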

Related

Prune list of pairs in R dataframe by removing redundancies based on genomic distance and p-value

I have an R dataframe containing pairs of SNPs (with chromosome number and base pair location in separate columns) and p-values for each pair. The data looks like this:
chr_snp1  loc_snp1  chr_snp2   loc_snp2     pval
       1 278474050         2   57386477 7.43e-08
       1 275620856         2   57386662 1.08e-07
       1 144075771         3  109909704 1.02e-06
       1 144075771         3  111344453 2.06e-06
       2 103701229         7   56369738 3.83e-08
       2 102990566         7   56407691 1.07e-07
I want to remove redundancies in pairs, where a pair counts as redundant if it is close to another pair, as determined by the values in the loc_snp1 and loc_snp2 columns together with chr_snp1 and chr_snp2. The numeric limit I want to impose on loc_snp1 and loc_snp2 is 10,000,000 for pairs on the same combination of chromosomes. This data is sorted by chromosome (chr_snp1 and chr_snp2), then base pair location (loc_snp1 and loc_snp2), then by p-value (pval).
Basically I want the script to check if the value in loc_snp1 is within 10,000,000 of the value in loc_snp1 in the row above and if they both have the same value in chr_snp1. Then, also check if the value in loc_snp2 is within 10,000,000 of the value in loc_snp2 in the row above and if they both have the same value in chr_snp2. If all four conditions are true, only keep the row with the lowest pval and discard the other.
In other words, only keep the best pair and remove all the others that are close (have the same chromosome combination and are within 10,000,000 base pairs of other pairs on their respective chromosomes).
This would result in the dataframe looking like:
chr_snp1  loc_snp1  chr_snp2   loc_snp2     pval
       1 278474050         2   57386477 7.43e-08
       1 144075771         3  109909704 1.02e-06
       2 103701229         7   56369738 3.83e-08
Of course, the script doesn't have to actually check the row above. I assume there are more elegant ways to approach the problem.
Here is an option. If you already know that chr_snp1 and chr_snp2 need to be identical, you can start by grouping on them and then calculating the differences. Admittedly, this might fail if you have a duplicate chr_snp1 somewhere else in the dataset.
library(tidyverse)
df |>
  group_by(chr_snp1, chr_snp2) |>
  mutate(grp = (abs(loc_snp1 - lag(loc_snp1, default = first(loc_snp1))) < 10e6) &
               (abs(loc_snp2 - lag(loc_snp2, default = first(loc_snp2))) < 10e6)) |>
  group_by(grp, .add = TRUE) |>
  filter(pval == min(pval)) |>
  ungroup() |>
  select(-grp)
#> # A tibble: 3 x 5
#>   chr_snp1  loc_snp1 chr_snp2  loc_snp2         pval
#>      <dbl>     <dbl>    <dbl>     <dbl>        <dbl>
#> 1        1 278474050        2  57386477 0.0000000743
#> 2        1 144075771        3 109909704 0.00000102
#> 3        2 103701229        7  56369738 0.0000000383
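One caveat with the lag()-based flag above: it compares each row only to its immediate predecessor, so two separate runs of nearby pairs within one chromosome combination can collapse into the same grp. A more defensive sketch (an addition here, not from the thread), assuming the sort order described in the question, uses cumsum() to open a new group whenever a row is not close to the row before it:

library(dplyr)

df |>
  group_by(chr_snp1, chr_snp2) |>
  # start a new run whenever either location jumps by 10 Mb or more
  mutate(new_run = abs(loc_snp1 - lag(loc_snp1, default = first(loc_snp1))) >= 1e7 |
                   abs(loc_snp2 - lag(loc_snp2, default = first(loc_snp2))) >= 1e7,
         grp = cumsum(new_run)) |>
  group_by(grp, .add = TRUE) |>
  # keep the single best pair per run
  slice_min(pval, n = 1, with_ties = FALSE) |>
  ungroup() |>
  select(-new_run, -grp)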
df <- textConnection('
chr_snp1, loc_snp1, chr_snp2, loc_snp2, pval
1, 278474050, 2, 57386477, 7.43e-08
1, 275620856, 2, 57386662, 1.08e-07
1, 194406449, 2, 182907442, 2.22e-06
1, 144075771, 3, 109909704, 1.02e-06
1, 144075771, 3, 111344453, 1.02e-06
2, 103701229, 7, 56369738, 1.02e-06
2, 102990566, 7, 56407691, 1.02e-06
') |> read.csv(header = TRUE)
# filter based on the first 4 criteria
threshold <- 1e7
idx <- (
  ((diff(df$loc_snp1) |> abs()) <= threshold) &
  (diff(df$chr_snp1) == 0) &
  ((diff(df$loc_snp2) |> abs()) <= threshold) &
  (diff(df$chr_snp2) == 0)
) |> which()
df <- df[idx, ]
# now retain only the pair with the lowest p-value
df <- split(df, paste0(df$chr_snp1, df$chr_snp2)) |>
  lapply(function(x) {
    idx <- which.min(x$pval)
    x[idx, ]
  }) |>
  data.table::rbindlist() |>
  as.data.frame()

# now retain only the pair with the lowest p-value (mirror redundancies)
df <- split(df, paste0(df$chr_snp2, df$chr_snp1)) |>
  lapply(function(x) {
    idx <- which.min(x$pval)
    x[idx, ]
  }) |>
  data.table::rbindlist() |>
  as.data.frame()
print(df)
  chr_snp1  loc_snp1 chr_snp2  loc_snp2     pval
1        1 278474050        2  57386477 7.43e-08
3        1 144075771        3 109909704 1.02e-06
5        2 103701229        7  56369738 1.02e-06
A friend was able to code how to handle mirror duplicates after AndS's initial pruning step.
Cart_Inter_Pruned$skip <- FALSE
threshold <- 1e7  # 10,000,000 base pairs, matching the limit stated in the question

for (i in 1:(nrow(Cart_Inter_Pruned) - 1)) {
  if (Cart_Inter_Pruned[i, ]$skip) next
  ordered_key1 <- sort(unlist(Cart_Inter_Pruned[i, c(1, 3)]))
  for (j in (i + 1):nrow(Cart_Inter_Pruned)) {
    if (Cart_Inter_Pruned[j, ]$skip) next
    ordered_key2 <- sort(unlist(Cart_Inter_Pruned[j, c(1, 3)]))
    # same chromosome combination, regardless of order
    if (paste(ordered_key1, collapse = ",") == paste(ordered_key2, collapse = ",")) {
      # mirrored when the first chromosomes differ despite matching sorted keys
      if_switch <- Cart_Inter_Pruned[i, 1] != Cart_Inter_Pruned[j, 1]
      print(paste(i, j, paste(ordered_key1, collapse = ",")))
      if (if_switch) {
        # mirrored pair: compare locations crosswise
        if (abs(Cart_Inter_Pruned[i, 2] - Cart_Inter_Pruned[j, 4]) <= threshold &&
            abs(Cart_Inter_Pruned[i, 4] - Cart_Inter_Pruned[j, 2]) <= threshold) {
          Cart_Inter_Pruned[j, ]$skip <- TRUE
          Cart_Inter_Pruned[i, 5] <- min(as.numeric(Cart_Inter_Pruned[i, 5]),
                                         as.numeric(Cart_Inter_Pruned[j, 5]))
        }
      } else {
        # same orientation: compare locations directly
        if (abs(Cart_Inter_Pruned[i, 2] - Cart_Inter_Pruned[j, 2]) <= threshold &&
            abs(Cart_Inter_Pruned[i, 4] - Cart_Inter_Pruned[j, 4]) <= threshold) {
          Cart_Inter_Pruned[j, ]$skip <- TRUE
          Cart_Inter_Pruned[i, 5] <- min(as.numeric(Cart_Inter_Pruned[i, 5]),
                                         as.numeric(Cart_Inter_Pruned[j, 5]))
        }
      }
    }
  }
}
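A tidier way to fold the mirror duplicates into AndS's grouped approach, sketched here as a suggestion rather than taken from the thread, is to normalise each row so that the smaller chromosome always comes first; (1, 2) and (2, 1) then share one orientation, and the proximity pruning shown earlier applies unchanged:

library(dplyr)

# Normalisation sketch: swap the two SNPs wherever chr_snp1 > chr_snp2,
# so mirrored pairs end up in one canonical orientation.
df_norm <- df |>
  mutate(swap   = chr_snp1 > chr_snp2,
         chr_lo = if_else(swap, chr_snp2, chr_snp1),
         loc_lo = if_else(swap, loc_snp2, loc_snp1),
         chr_hi = if_else(swap, chr_snp1, chr_snp2),
         loc_hi = if_else(swap, loc_snp1, loc_snp2)) |>
  select(chr_snp1 = chr_lo, loc_snp1 = loc_lo,
         chr_snp2 = chr_hi, loc_snp2 = loc_hi, pval)
# df_norm can now go through the group_by()/filter() pruning shown above.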

Calculation between vectors in nested for loops

I am struggling with an issue involving nested for loops and conditional calculations.
Let's say I have a data frame like this:
df = data.frame("a" = c(2, 3, 3, 4),
                "b" = c(4, 4, 4, 4),
                "c" = c(5, 5, 4, 4),
                "d" = c(3, 4, 4, 2))
With this df, I want to compare corresponding elements between vectors under a condition: if the absolute difference between two elements is lower than 2 (so a difference of 0 or 1), I'd like to record a 1 in a newly created vector, while if the absolute difference is >= 2, I'd like to append a 0.
For example, for a calculation between the vector "a" and the other vectors "b", "c", "d", I want this result: 0 0 1. The first 0 comes from the difference of 2 between a1 and b1; the second 0 from the difference of 3 between a1 and c1; the 1 from the difference of 1 between a1 and d1. So I tried to write a nested for loop to apply the same procedure to the elements in the following rows as well.
So my first trial was like this:
list_all = list(df$a, df$b, df$c, df$d)
v0 <- c()
for (i in list_all) {
  for (j in list_all) {
    if (i != j) {
      if (abs(i - j) < 2) {
        v0 <- c(v0, 1)
      } else {
        v0 <- append(v0, 0)
      }
    } else {
      next
    }
  }
}
The result is like this:
v0
[1] 0 0 1 0 1 1 0 1 0 1 1 0
But it seems that the calculation has been made only among the first elements but not among the following elements.
So my second trial was like this:
list = list(df$b, df$c, df$d)
v1 <- c()
for (i in df$a) {
  for (j in list) {
    if (abs(i - j) < 2) {
      v1 <- append(v1, 1)
    } else {
      v1 <- append(v1, 0)
    }
  }
}
v1
[1] 0 0 1 1 0 1 1 0 1 1 1 1
It seems like the calculations were made between all elements of df$a and ONLY the first elements of the others, so this is not what I need either.
When I put df$b instead of list in the nested for loop, the result is even messier.
v2 <- c()
for (i in df$a) {
  for (j in df$b) {
    if (abs(i - j) < 2) {
      v2 <- append(v2, 1)
    } else {
      v2 <- append(v2, 0)
    }
  }
}
v2
[1] 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1
It seems like the calculation has not been made between corresponding elements (in the same rows), but between all pairs of elements regardless of position.
Could anyone tell me how to fix this? I don't understand why the nested for loops work only on the first elements.
Thank you in advance.
I'm not sure if I understood it all correctly, but how about this?
df = data.frame("a" = c(2, 3, 3, 4),
                "b" = c(4, 4, 4, 4),
                "c" = c(5, 5, 4, 4),
                "d" = c(3, 4, 4, 2))

as.vector(apply(df, 1, \(x) ifelse(abs(x[1] - x[2:4]) < 2, 1, 0)))
#> [1] 0 0 1 1 0 1 1 1 1 1 1 0
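In case the shape of that result is surprising: apply(df, 1, ...) collects each row's three comparisons as a column of a 3-by-4 matrix, and as.vector() reads that matrix column by column, so the output is grouped row-of-df by row-of-df (b, c, d for row 1, then for row 2, and so on). Wrap the apply() call in t() first if you want a matrix with one row per row of df instead.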
I think you're making life unnecessarily complicated for yourself. If I understand you correctly, you can do what you want without nesting loops at all.
The key thing to remember is that R is vectorised by default: R operates on all elements of a vector at once, so there's no need to loop. For example, if a is a vector with values 1 and 2 and I write a + 1, the result is a vector with values 2 and 3.
Applying this logic to your case, you can write:
df$diffB <- ifelse(abs(df$a-df$b) < 2, 1, 0)
df$diffC <- ifelse(abs(df$a-df$c) < 2, 1, 0)
df$diffD <- ifelse(abs(df$a-df$d) < 2, 1, 0)
df
Giving
  a b c d diffB diffC diffD
1 2 4 5 3     0     0     1
2 3 4 5 4     1     0     1
3 3 4 4 4     1     1     1
4 4 4 4 2     1     1     0
You can write a loop to loop over columns if you wish, and Aron has given you one option to do this in his answer.
Personally, I find that using the tidyverse results in code that's easier to understand than code written in base R, because I can read tidyverse code from left to right, whereas base R code (often) needs to be read from the inside out. The tidyverse's syntax is also more consistent than base R's.
Here's how I would solve your problem using the tidyverse:
library(tidyverse)
df %>%
mutate(
diffB=ifelse(abs(a-b) < 2, 1, 0),
diffC=ifelse(abs(a-c) < 2, 1, 0),
diffD=ifelse(abs(a-d) < 2, 1, 0)
)
And the "loop over columns" becomes
df %>%
mutate(
across(
c(b, c, d),
~ifelse(abs(a-.x) < 2, 1, 0),
.names="diff{.col}"
)
)

Creating a Basic R Dice Rolling Function to Sum Dice Values

I'm trying to write a function that combines up to 4 (fair, 6-sided) dice rolls to create a specific value (named 'target.mountain') as many times as possible given the numbers shown on the dice.
It should then return these values along with any dice that aren't used in such a combination. If the other numbers that aren't used to form the 'target.mountain' can sum to a value within the range 5-10, they should do so.
So, as an example, say I roll 4, 3, 2, 5 and my target.mountain value is 9.
I would do
4 + 5 -> 9, and since 2 + 3 = 5, my function would return 9, 5
Another example could be
Roll = (2, 3, 6, 4) --> (6 + 3), (4 + 2) --> 9, 6
Once these values have been found, list them so the output appears like:
[1] 9, 5 (example 1)
[1] 9, 6 (example 2)
How do I go about doing this?
If you have ever played the board game 'Mountain Goats', then that may shed some light on how I need the dice to work, as I just cannot figure it out!
Let's make the problem a bit harder, say 5 dice.
library(tidyverse)

rolls <- sample(1:6, replace = TRUE, size = 5)
target.mountain <- 7

# Make all possible combinations of the dice:
map_dfr(seq_along(rolls),
        ~ combn(seq_along(rolls), .x, simplify = FALSE) %>%
            map(~ tibble(dice = list(.), sum = sum(rolls[.]),
                         rolls = list(rolls[.]), length = length(.)))) %>%
  # filter to only those combinations which equal the target
  filter(sum == target.mountain) %>%
  # Now make all possible combinations of the sets that equal the target
  {map2(.x = list(.),
        .y = nrow(.) %>% map(.x = seq(.), .f = combn, x = ., simplify = FALSE) %>%
              unlist(recursive = FALSE),
        ~ .x[unlist(.y), ])} %>%
  # Subset to non-overlapping sets
  subset(map_lgl(., ~ length(reduce(.x$dice, union)) == length(unlist(.x$dice)))) -> part1

map(part1, as.data.frame)
map(part1, as.data.frame)
#[[1]]
# dice sum rolls length
#1 1, 3 7 3, 4 2
#
#[[2]]
# dice sum rolls length
#1 4, 5 7 6, 1 2
#
#[[3]]
# dice sum rolls length
#1 2, 3, 5 7 2, 4, 1 3
#
#[[4]]
# dice sum rolls length
#1 1, 3 7 3, 4 2
#2 4, 5 7 6, 1 2
From here you can apply whatever rules you want:
part1 %>%
  # subset to the largest number of sets
  subset(map_dbl(., nrow) == max(map_dbl(., nrow))) %>%
  # subset to the fewest number of total dice
  subset(map_dbl(., ~ sum(.x$length)) == min(map_dbl(., ~ sum(.x$length)))) %>%
  # if there are still ties, pick the first
  `[[`(1) -> part2
as.data.frame(part2)
# dice sum rolls length
#1 1, 3 7 3, 4 2
#2 4, 5 7 6, 1 2
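Both answers lean on the same core step: enumerating subsets of the dice with combn() and keeping those that sum to the target. As a minimal base-R sketch of just that step (using the question's example roll, not code from either answer):

# Enumerate every subset of dice indices and keep those summing to the target.
rolls  <- c(4, 3, 2, 5)
target <- 9

subsets <- unlist(
  lapply(seq_along(rolls),
         function(k) combn(seq_along(rolls), k, simplify = FALSE)),
  recursive = FALSE
)
hits <- Filter(function(ix) sum(rolls[ix]) == target, subsets)
hits
#> [[1]] is indices 1, 4 (4 + 5 = 9); [[2]] is indices 1, 2, 3 (4 + 3 + 2 = 9)

Picking a non-overlapping collection of hits and summing the leftover dice is then the bookkeeping that the two answers above handle in their different ways.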
A possible solution to the problem:
target.mountain <- 9
dice <- c(4, 3, 2, 5)

library(tidyverse)

fn <- function(target.mountain, dice) {
  fltr <- map(seq_along(dice), ~ combn(dice, .x, sum) == target.mountain)
  out <- map(seq_along(dice), ~ combn(dice, .x))
  sum_target <- map2(out, fltr, ~ .x[, .y]) %>%
    purrr::discard(.x = ., function(x) length(x) == 0) %>%
    keep(.x = ., .p = function(x) length(x) == min(lengths(.))) %>%
    flatten_dbl()
  no_sum_target <- dice[!(dice %in% sum_target)]
  result <- toString(c(sum(sum_target), no_sum_target))
  return(result)
}
fn(target.mountain = target.mountain, dice = dice)
#> [1] "9, 3, 2"
Created on 2021-03-29 by the reprex package (v1.0.0)

Count values in a table (in R)

I have a table. I would like to count how many entries have a name starting with 11_ and, at the same time, a value equal to 1.
11_AAACCCAAGAGCTGCA 11_AAACCCACAAAGACGC 11_AAACCCAGTCACTTAG 11_AAACGAACAAAGGCTG
6 3 1 1
11_AAACGAATCCACACAA 13_AAACGCTCACATGAAA 13_AAACGCTCAGCGGTCT 11_AAACGCTCATGGAAGC
7 1 3 1
Do you have a named vector?
You can combine the two conditions to filter name and value.
x <- c('11_AAACCCAAGAGCTGCA' = 6,
       '11_AAACCCACAAAGACGC' = 3,
       '11_AAACCCAGTCACTTAG' = 1,
       '11_AAACGAACAAAGGCT' = 1,
       '11_AAACGAATCCACACAA' = 7,
       '13_AAACGCTCACATGAAA' = 1,
       '13_AAACGCTCAGCGGTCT' = 3,
       '11_AAACGCTCATGGAAGC' = 1)
x[startsWith(names(x), '11_') & x == 1]
#11_AAACCCAGTCACTTAG  11_AAACGAACAAAGGCT 11_AAACGCTCATGGAAGC
#                  1                   1                   1
#To count
sum(startsWith(names(x), '11_') & x == 1)
#[1] 3
We can use grepl:
sum(grepl("^11_", names(x)) & x == 1)
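If the object is the output of table() rather than a hand-built named vector, the same indexing applies unchanged, since a contingency table is a named integer vector. A small sketch with made-up values:

# table() returns a named integer vector, so the same test works directly.
tab <- table(c("11_A", "11_A", "11_B", "13_C"))
sum(startsWith(names(tab), "11_") & tab == 1)
#> [1] 1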

Recoding a nominal variable with multiple categories into a dummy variable

I am trying to condense a party ID seven point scale variable (pid_x) from the ANES 2012 data to a dummy variable (democrat = 1 and republican = 0). This entails removing all missing values and excluding independents (4). I can remove NAs, but how would I filter out independents and properly mutate the new variable? Yes, I am very new to R. Much appreciated!
The code below provides the following error:
"Error: Problem with mutate() input party_id_recode.
x Can't recycle ..1 (size 2054) to match ..2 (size 3).
i Input party_id_recode is `ifelse(pid_x == 1:3, 1, ifelse(pid_x == 5:7, 0))"
library(tidyverse)
anesnew <- anes %>%
  na.omit(anes$pid_x) %>%
  mutate(party_id_recode = ifelse(pid_x == 1:3, 1,
                                  ifelse(pid_x == 5:7, 0)))
Reproducible data and expected output would be very useful, but it looks like your ifelse() statement hasn't been constructed properly and could be simplified:
anesnew <- anes %>%
  filter(!is.na(pid_x), pid_x != 4) %>%
  mutate(party_id_recode = case_when(pid_x < 4 ~ 1,
                                     pid_x > 4 ~ 0))
With the following sample data:
anes <- tibble(pid_x = c(1, 2, 3, 4, 5, 6, 7, NA))
The results are:
# A tibble: 6 x 2
  pid_x party_id_recode
  <dbl>           <dbl>
1     1               1
2     2               1
3     3               1
4     5               0
5     6               0
6     7               0
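For completeness: the recycling error comes from pid_x == 1:3, which compares element-wise against the recycled vector 1, 2, 3 rather than testing membership. %in% is the membership test the original ifelse() was reaching for; a sketch using it with the same filtering as above:

library(dplyr)

# %in% tests membership, so no recycling occurs.
anesnew <- anes %>%
  filter(!is.na(pid_x), pid_x != 4) %>%
  mutate(party_id_recode = if_else(pid_x %in% 1:3, 1, 0))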
