Remove rows based on a custom category order - R

# The order of my `class` variable is:
complete.cases <- c("Class_0_1","Class_1_3","Class_3_9", "Class_9_25","Class_25_50")
# In my ds:
nest <- c(2,2,2,2,2,3,3,3,3)
TA <- c(2.3,5.9,8.7,11.8,14.9,11.9,8.8,5.7,2.4)
class <- c("Class_0_1","Class_1_3","Class_3_9","Class_3_9","Class_9_25","Class_1_3","Class_1_3","Class_1_3","Class_3_9")
my.ds <- data.frame(nest,TA,class)
my.ds
nest TA class
1 2 2.3 Class_0_1
2 2 5.9 Class_1_3
3 2 8.7 Class_3_9
4 2 11.8 Class_3_9
5 2 14.9 Class_9_25
6 3 11.9 Class_1_3
7 3 8.8 Class_1_3
8 3 5.7 Class_1_3
9 3 2.4 Class_3_9
# About ds information:
nest <- c(2,3)
class_max <- c("Class_3_9","Class_1_3")
info.ds <- data.frame(nest,class_max)
Now I'd like to remove rows based on the following rule: within each nest, if my.ds$class is higher than info.ds$class_max for that nest (in the custom order above), remove the row; otherwise keep it.
My final data frame must be:
# new.ds
# nest TA class
# 1 2 2.3 Class_0_1
# 2 2 5.9 Class_1_3
# 3 2 8.7 Class_3_9
# 4 2 11.8 Class_3_9
# 5 3 11.9 Class_1_3
# 6 3 8.8 Class_1_3
# 7 3 5.7 Class_1_3
Please, can anyone help with this?

You could convert class to an ordered factor object and define a custom order for it with ordered(x, levels) or factor(x, levels, ordered = TRUE).
library(dplyr)
my.ds %>%
  mutate(class = ordered(class, levels = complete.cases)) %>%
  left_join(info.ds, by = 'nest') %>%
  filter(class <= class_max) %>%
  select(-class_max)
# nest TA class
# 1 2 2.3 Class_0_1
# 2 2 5.9 Class_1_3
# 3 2 8.7 Class_3_9
# 4 2 11.8 Class_3_9
# 5 3 11.9 Class_1_3
# 6 3 8.8 Class_1_3
# 7 3 5.7 Class_1_3
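For completeness, here is a base-R sketch of the same ordered-factor idea, using merge() for the join and subset() for the comparison (it assumes the objects defined above):
# Base R: ordered factors with the same levels compare directly with <=
tmp <- merge(my.ds, info.ds, by = "nest")
tmp$class <- ordered(tmp$class, levels = complete.cases)
tmp$class_max <- ordered(tmp$class_max, levels = complete.cases)
new.ds <- subset(tmp, class <= class_max, select = -class_max)
new.ds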

Here is one option: join the two datasets and then do a group-by slice.
library(dplyr)
my.ds %>%
  arrange(nest, factor(class, levels = complete.cases)) %>%
  left_join(info.ds, by = 'nest') %>%
  group_by(nest) %>%
  slice(seq_len(last(which(class == class_max)))) %>%
  ungroup() %>%
  select(-class_max)
Output:
# A tibble: 7 × 3
nest TA class
<dbl> <dbl> <chr>
1 2 2.3 Class_0_1
2 2 5.9 Class_1_3
3 2 8.7 Class_3_9
4 2 11.8 Class_3_9
5 3 11.9 Class_1_3
6 3 8.8 Class_1_3
7 3 5.7 Class_1_3
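The same cutoff can also be written with filter() instead of slice(); a sketch under the same arrangement (it assumes every nest has at least one row at its class_max, as in the example):
my.ds %>%
  arrange(nest, factor(class, levels = complete.cases)) %>%
  left_join(info.ds, by = 'nest') %>%
  group_by(nest) %>%
  filter(row_number() <= max(which(class == class_max))) %>%  # keep rows up to the last class_max
  ungroup() %>%
  select(-class_max)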

Related

If Group Contains Value Logic R

I'm trying to get the age given certain criteria. For example, if RNA is not NA, I would like each ID (AvatarKey) to be associated with the minimum Age among rows where RNA is present. If RNA is NA, I would like to take the minimum Age where DNA is not NA. If both are NA, remove the row.
Input:
ID DNA RNA Age
2 NA SL43 22.2
2 SL333 NA 55.7
2 SL333 SL43 43.7
6 SL333 NA 10.3
6 SL333 NA 65.6
6 NA NA 35.5
5 NA SL43 78.0
5 NA SL43 23.3
5 NA SL43 35.8
7 SL333 SL43 13.5
7 SL333 SL43 98.1
1 NA NA 55.6
Desired Output
ID DNA RNA Age
2 NA SL43 22.2
2 SL333 SL43 43.7
6 SL333 NA 10.3
5 NA SL43 23.3
7 SL333 SL43 13.5
Different order than your output, but does this work?
library(dplyr)
my_data %>%
  filter(!is.na(DNA) | !is.na(RNA)) %>%
  group_by(ID, DNA) %>%
  arrange(DNA, -Age) %>%
  slice(n())
ID DNA RNA Age
<int> <chr> <chr> <dbl>
1 2 SL333 SL43 43.7
2 2 NA SL43 22.2
3 5 NA SL43 23.3
4 6 SL333 NA 10.3
5 7 SL333 SL43 13.5
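With dplyr 1.0 or later, the same "minimum Age per (ID, DNA) group" step can be written a bit more directly with slice_min(); a sketch, assuming the data is in my_data:
my_data %>%
  filter(!is.na(DNA) | !is.na(RNA)) %>%          # drop rows where both are NA
  group_by(ID, DNA) %>%
  slice_min(Age, n = 1, with_ties = FALSE) %>%   # one minimum-Age row per group
  ungroup()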
You can try:
library(dplyr)
df %>%
  group_by(ID, DNA, RNA) %>%
  summarise(Age = min(Age)) %>%
  ungroup() %>%
  filter(!(is.na(DNA) & is.na(RNA)))

How to find mean value using multiple columns of an R data.frame?

I am trying to find the mean of A and Z for each row and save it as a separate column, but it seems the code only averages the first row and fills the rest of the rows with that value. Any suggestion on how to fix this?
library(tidyverse)
library(lubridate)
set.seed(123)
DF <- data.frame(Date = seq(as.Date("2001-01-01"), to = as.Date("2003-12-31"), by = "day"),
                 A = runif(1095, 1, 60),
                 Z = runif(1095, 5, 100)) %>%
  mutate(MeanofAandZ = mean(A:Z))
Are you looking for this:
DF %>% rowwise() %>% mutate(MeanofAandZ = mean(c_across(A:Z)))
# A tibble: 1,095 x 4
# Rowwise:
Date A Z MeanofAandZ
<date> <dbl> <dbl> <dbl>
1 2001-01-01 26.5 7.68 17.1
2 2001-01-02 54.9 33.1 44.0
3 2001-01-03 37.1 82.0 59.5
4 2001-01-04 6.91 18.0 12.4
5 2001-01-05 53.0 8.76 30.9
6 2001-01-06 26.1 7.63 16.9
7 2001-01-07 59.3 30.8 45.0
8 2001-01-08 39.9 14.6 27.3
9 2001-01-09 59.2 93.6 76.4
10 2001-01-10 30.7 89.1 59.9
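As an aside on why the original mutate() filled every row with one value: A:Z is evaluated on the whole columns, so `:` uses only the first element of each (with a warning) and the mean of that single sequence is recycled across all rows. You can see the value it produces with, for example:
# The sequence mean(A:Z) actually computes, built from the first elements only
with(DF, mean(A[1]:Z[1]))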
You can do it with base R: rowMeans.
Full Base R:
DF$MeanofAandZ <- rowMeans(DF[c("A", "Z")])
head(DF)
#> Date A Z MeanofAandZ
#> 1 2001-01-01 17.967074 76.92436 47.44572
#> 2 2001-01-02 47.510003 99.28325 73.39663
#> 3 2001-01-03 25.129638 64.33253 44.73109
#> 4 2001-01-04 53.098027 32.42556 42.76179
#> 5 2001-01-05 56.487570 23.99162 40.23959
#> 6 2001-01-06 3.687833 81.08720 42.38751
or inside a mutate:
library(dplyr)
DF <- DF %>% mutate(MeanofAandZ = rowMeans(cbind(A,Z)))
head(DF)
#> Date A Z MeanofAandZ
#> 1 2001-01-01 17.967074 76.92436 47.44572
#> 2 2001-01-02 47.510003 99.28325 73.39663
#> 3 2001-01-03 25.129638 64.33253 44.73109
#> 4 2001-01-04 53.098027 32.42556 42.76179
#> 5 2001-01-05 56.487570 23.99162 40.23959
#> 6 2001-01-06 3.687833 81.08720 42.38751
We can also do
DF$MeanofAandZ <- Reduce(`+`, DF[c("A", "Z")])/2
Or using apply
DF$MeanofAandZ <- apply(DF[c("A", "Z")], 1, mean)

Sort a dataframe according to characters in R [duplicate]

(This question already has answers at "R Sort strings according to substring"; closed as a duplicate.)
I have the data frame below (code) and I want to sort it according to combName in numerical order.
> code
# A tibble: 1,108 x 2
combName sumLength
<chr> <dbl>
1 20-1 8.05
2 20-10 14.7
3 20-100 21.2
4 20-101 17.6
5 20-102 25.4
6 20-103 46.3
7 20-104 68.7
8 20-105 24.3
9 20-106 46.3
10 20-107 14.0
# ... with 1,098 more rows
Afterwards, the combName column should look like:
> code
# A tibble: 1,108 x 2
combName sumLength
<chr> <dbl>
1 20-1 8.05
2 20-2 ...
3 20-3 ...
4 20-4 ...
5 20-5 ...
...
10 20-10 14.7
# ... with 1,098 more rows
I do not know what I can do to reach this format.
Does this work:
library(dplyr)
library(tidyr)
df
# A tibble: 10 x 2
combName sumLength
<chr> <dbl>
1 20-102 25.4
2 20-100 21.2
3 20-101 17.6
4 20-105 24.3
5 20-10 14.7
6 20-103 46.3
7 20-104 68.7
8 20-1 8.05
9 20-106 46.3
10 20-107 14
df %>%
  separate(combName, into = c('1', '2'), sep = '-', remove = FALSE) %>%
  type.convert(as.is = TRUE) %>%
  arrange(`1`, `2`) %>%
  select(-c(`1`, `2`))
# A tibble: 10 x 2
combName sumLength
<chr> <dbl>
1 20-1 8.05
2 20-10 14.7
3 20-100 21.2
4 20-101 17.6
5 20-102 25.4
6 20-103 46.3
7 20-104 68.7
8 20-105 24.3
9 20-106 46.3
10 20-107 14
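A shorter base-R sketch is to extract the two numeric parts of the df used above with sub() and order by both (this assumes combName always has the form number-number, as in the example):
# Split "20-107" style names into numeric prefix and suffix, then order by both
prefix <- as.numeric(sub("-.*", "", df$combName))
suffix <- as.numeric(sub(".*-", "", df$combName))
df[order(prefix, suffix), ]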

Filter by observations that cumulate X% of values

I would like to keep, in every group, only the observations (after sorting each group in decreasing order) that cumulate up to X% of the values, in my case less than or equal to 80 percent of the group total.
So, starting from the data frame below:
library(dplyr)
Group <- c("A","A","A","A","A","B","B","B","B","C","C","C","C","C","C")
value <- c(2,3,6,3,1,1,3,3,5,4,3,5,3,4,2)
data1 <- data.frame(Group, value)
data1 <- data1 %>%
  arrange(Group, desc(value)) %>%
  group_by(Group) %>%
  mutate(pct = round(100 * value / sum(value), 1)) %>%
  mutate(cumPct = cumsum(pct))
I would like to get the filtered data frame below, according to the conditions I described above:
Group value pct cumPct
1 A 6 40.0 40.0
2 A 3 20.0 60.0
3 A 3 20.0 80.0
4 B 5 41.7 41.7
5 B 3 25.0 66.7
6 C 5 23.8 23.8
7 C 4 19.0 42.8
8 C 4 19.0 61.8
9 C 3 14.3 76.1
You can arrange the data in descending order of value, for each Group calculate pct and cum_pct, and select rows where cum_pct is less than or equal to 80.
library(dplyr)
data1 %>%
  arrange(Group, desc(value)) %>%
  group_by(Group) %>%
  mutate(pct = value / sum(value) * 100,
         cum_pct = cumsum(pct)) %>%
  filter(cum_pct <= 80)
# Group value pct cum_pct
# <chr> <dbl> <dbl> <dbl>
#1 A 6 40 40
#2 A 3 20 60
#3 A 3 20 80
#4 B 5 41.7 41.7
#5 B 3 25 66.7
#6 C 5 23.8 23.8
#7 C 4 19.0 42.9
#8 C 4 19.0 61.9
#9 C 3 14.3 76.2
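The same idea carries over to base R with ave() for the per-group sums and cumulative sums; a sketch, assuming data1 is the plain data.frame(Group, value) from before the dplyr steps in the question:
d <- data1[order(data1$Group, -data1$value), ]               # sort within each Group, descending value
d$pct <- with(d, 100 * value / ave(value, Group, FUN = sum)) # percentage of the group total
d$cum_pct <- ave(d$pct, d$Group, FUN = cumsum)               # cumulative percentage within the group
subset(d, cum_pct <= 80)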

Calculate row mean with condition in dplyr

My dataset looks like this:
> head(tempExp)
points.id wc2.0_30s_tavg_01 wc2.0_30s_tavg_02
1 AmsterdamGreenhouses_Calamagrostis eigejos-AM_Nhigh 3.1 3.2
2 AmsterdamGreenhouses_Molinia caerulea-AM_Nhigh 3.1 3.2
3 Bangor_Alnus-ECM/AM_Nlow 3.8 3.6
4 Bangor_Betula_pendula-ECM_Nlow 3.8 3.6
5 Bangor_Fagus-ECM_Nlow 3.8 3.6
6 BioCON_nolegumes_mixed-AM_Nlow -11.8 -7.9
wc2.0_30s_tavg_03 wc2.0_30s_tavg_04 wc2.0_30s_tavg_05 wc2.0_30s_tavg_06 wc2.0_30s_tavg_07
1 5.9 8.3 12.6 15.1 17.1
2 5.9 8.3 12.6 15.1 17.1
3 5.4 7.3 10.3 12.7 14.7
4 5.4 7.3 10.3 12.7 14.7
5 5.4 7.3 10.3 12.7 14.7
6 -1.2 7.2 14.5 19.3 21.8
For each row (id) I need to calculate the mean across the entire row, but only including those columns with value > 5.
require(dplyr)
# simulate a similar data set
set.seed(1984)
x <- rep('', 100)
for (i in 1:100) {
  x[i] <- paste(sample(c(LETTERS, 0:9), 5, replace = TRUE), collapse = '')
}
df <- data.frame(ID = x,
                 v1 = 3 * rnorm(100),
                 v2 = 5 + 3 * rnorm(100),
                 v3 = sample(1:20, 100, replace = TRUE),
                 v4 = rpois(100, 6),
                 v5 = rep(15, 100))
head(df)
# ID v1 v2 v3 v4 v5
#1 XPNL0 7.839162 -1.341105 12 5 15
#2 5BQ3H -1.241025 7.651719 1 5 15
#3 5AZZH 2.185374 2.186604 6 4 15
#4 AKX7H 3.148868 2.513623 13 5 15
#5 VAW42 2.757498 3.888333 16 5 15
#6 F4UST -1.894727 4.587320 2 2 15
df %>%
  mutate(avg = apply(df[, -1], 1, function(x) mean(x[x > 5]))) -> df
head(df)
# ID v1 v2 v3 v4 v5 avg
#1 XPNL0 7.839162 -1.341105 12 5 15 11.61305
#2 5BQ3H -1.241025 7.651719 1 5 15 11.32586
#3 5AZZH 2.185374 2.186604 6 4 15 10.50000
#4 AKX7H 3.148868 2.513623 13 5 15 14.00000
#5 VAW42 2.757498 3.888333 16 5 15 15.50000
#6 F4UST -1.894727 4.587320 2 2 15 15.00000
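If you prefer to stay within dplyr verbs, a rowwise()/c_across() sketch of the same conditional mean (assuming the simulated df above) could look like:
library(dplyr)
df %>%
  rowwise() %>%
  mutate(avg = {
    vals <- c_across(v1:v5)   # this row's numeric values
    mean(vals[vals > 5])      # average only the values greater than 5
  }) %>%
  ungroup()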
