My dataset looks like this:
> head(tempExp)
points.id wc2.0_30s_tavg_01 wc2.0_30s_tavg_02
1 AmsterdamGreenhouses_Calamagrostis eigejos-AM_Nhigh 3.1 3.2
2 AmsterdamGreenhouses_Molinia caerulea-AM_Nhigh 3.1 3.2
3 Bangor_Alnus-ECM/AM_Nlow 3.8 3.6
4 Bangor_Betula_pendula-ECM_Nlow 3.8 3.6
5 Bangor_Fagus-ECM_Nlow 3.8 3.6
6 BioCON_nolegumes_mixed-AM_Nlow -11.8 -7.9
wc2.0_30s_tavg_03 wc2.0_30s_tavg_04 wc2.0_30s_tavg_05 wc2.0_30s_tavg_06 wc2.0_30s_tavg_07
1 5.9 8.3 12.6 15.1 17.1
2 5.9 8.3 12.6 15.1 17.1
3 5.4 7.3 10.3 12.7 14.7
4 5.4 7.3 10.3 12.7 14.7
5 5.4 7.3 10.3 12.7 14.7
6 -1.2 7.2 14.5 19.3 21.8
For each row (id) I need to calculate the mean across the entire row, but only including those columns with value > 5.
library(dplyr)  # library() errors if dplyr is missing; require() only warns
# simulate a similar data set
set.seed(1984)
# 100 random 5-character IDs drawn from A-Z and 0-9
x <- vapply(
  seq_len(100),
  function(i) paste(sample(c(LETTERS, 0:9), 5, replace = TRUE), collapse = ""),
  character(1)
)
df <- data.frame(ID = x,
                 v1 = 3 * rnorm(100),
                 v2 = 5 + 3 * rnorm(100),
                 v3 = sample(1:20, 100, replace = TRUE),
                 v4 = rpois(100, 6),
                 v5 = rep(15, 100))
head(df)
# ID v1 v2 v3 v4 v5
#1 XPNL0 7.839162 -1.341105 12 5 15
#2 5BQ3H -1.241025 7.651719 1 5 15
#3 5AZZH 2.185374 2.186604 6 4 15
#4 AKX7H 3.148868 2.513623 13 5 15
#5 VAW42 2.757498 3.888333 16 5 15
#6 F4UST -1.894727 4.587320 2 2 15
# Row-wise mean over the numeric columns using only values > 5:
# mask values <= 5 as NA so rowMeans(na.rm = TRUE) averages the rest.
# A row with no value > 5 yields NaN, same as mean() of an empty vector.
# (rowMeans on a matrix avoids apply() coercing the data.frame per call.)
vals <- as.matrix(df[, -1])
df <- df %>%
  mutate(avg = rowMeans(replace(vals, vals <= 5, NA), na.rm = TRUE))
head(df)
# ID v1 v2 v3 v4 v5 avg
#1 XPNL0 7.839162 -1.341105 12 5 15 11.61305
#2 5BQ3H -1.241025 7.651719 1 5 15 11.32586
#3 5AZZH 2.185374 2.186604 6 4 15 10.50000
#4 AKX7H 3.148868 2.513623 13 5 15 14.00000
#5 VAW42 2.757498 3.888333 16 5 15 15.50000
#6 F4UST -1.894727 4.587320 2 2 15 15.00000
Related
# The order of my `class` variable is:
# Increasing order of the class categories.
# NOTE(review): this name masks base::complete.cases(); kept unchanged
# because later snippets reference it.
complete.cases <- c("Class_0_1", "Class_1_3", "Class_3_9", "Class_9_25", "Class_25_50")
# In my ds: five rows for nest 2, four rows for nest 3
nest <- rep(c(2, 3), c(5, 4))
TA <- c(2.3, 5.9, 8.7, 11.8, 14.9, 11.9, 8.8, 5.7, 2.4)
class <- c("Class_0_1", "Class_1_3", rep("Class_3_9", 2), "Class_9_25",
           rep("Class_1_3", 3), "Class_3_9")
my.ds <- data.frame(nest, TA, class)
my.ds
nest TA class
1 2 2.3 Class_0_1
2 2 5.9 Class_1_3
3 2 8.7 Class_3_9
4 2 11.8 Class_3_9
5 2 14.9 Class_9_25
6 3 11.9 Class_1_3
7 3 8.8 Class_1_3
8 3 5.7 Class_1_3
9 3 2.4 Class_3_9
# About ds information:
# Per-nest cap: rows of my.ds whose class is above class_max get removed
nest <- c(2,3)
class_max <- c("Class_3_9","Class_1_3")
info.ds <- data.frame(nest,class_max)
Now I'd like to remove rows using this rule: within each nest, if my.ds$class is higher (in the class order) than that nest's info.ds$class_max, remove the row; otherwise keep it.
My final data frame must be:
# new.ds
# nest TA class
# 1 2 2.3 Class_0_1
# 2 2 5.9 Class_1_3
# 3 2 8.7 Class_3_9
# 4 2 11.8 Class_3_9
# 5 3 11.9 Class_1_3
# 6 3 8.8 Class_1_3
# 7 3 5.7 Class_1_3
Could anyone please help with this?
You could convert class to an ordered factor object and define a custom order for it with ordered(x, levels) or factor(x, levels, ordered = TRUE).
library(dplyr)
# Attach each nest's cap, encode the custom class order as an ordered
# factor, and keep only rows at or below that cap.
my.ds %>%
  left_join(info.ds, by = 'nest') %>%
  mutate(class = factor(class, levels = complete.cases, ordered = TRUE)) %>%
  filter(class <= class_max) %>%
  select(-class_max)
# nest TA class
# 1 2 2.3 Class_0_1
# 2 2 5.9 Class_1_3
# 3 2 8.7 Class_3_9
# 4 2 11.8 Class_3_9
# 5 3 11.9 Class_1_3
# 6 3 8.8 Class_1_3
# 7 3 5.7 Class_1_3
Here is one option - join the two datasets and then do a group by slice
library(dplyr)
# Join-and-slice approach: sort rows within nest by the custom class
# order, attach each nest's class_max, then keep rows up to (and
# including) the last row matching class_max — everything after it is
# above the cap.
my.ds %>%
# sort so classes appear in increasing custom order within each nest
arrange(nest, factor(class, levels = complete.cases)) %>%
left_join(info.ds, by = 'nest') %>%
group_by(nest) %>%
# last(which(...)) = index of the final row equal to class_max;
# seq_len(...) keeps that row and everything before it
slice(seq_len(last(which(class == class_max)))) %>%
ungroup %>%
select(-class_max)
-output
# A tibble: 7 × 3
nest TA class
<dbl> <dbl> <chr>
1 2 2.3 Class_0_1
2 2 5.9 Class_1_3
3 2 8.7 Class_3_9
4 2 11.8 Class_3_9
5 3 11.9 Class_1_3
6 3 8.8 Class_1_3
7 3 5.7 Class_1_3
Convert to long format, adding a "Variable" column that groups the numeric column names: columns 1:2 = WW, 34 = MM, and 158:190 = EE.
# Toy wide data: a letter column "A" plus character columns whose names
# are the numbers 1, 2, 34, 158, 190 (check.names = FALSE stops R from
# mangling those names into X1, X2, ...).
df <- data.frame(
  A = c("A", "B", "C"),
  `1` = c("1.9", "6.8", "4.7"),
  `2` = c("1.9", "6.8", "4.7"),
  `34` = c("3.9", "0.3", "2.7"),
  `158` = c("2.9", "3", "45"),
  `190` = c("22.1", "7.4", "56"),
  check.names = FALSE
)
from my df:
1 2 34 158 190
A 1.9 1.9 3.9 2.9 22.1
B 6.8 6.8 0.3 3 7.4
C 4.7 4.7 2.7 45 56
Desired output
Letter Number Variable Value
A 1 WW 1.9
A 2 WW 1.9
A 34 MM 3.9
A 158 EE 2.9
A 190 EE 22.1
B 1 WW 6.8
B 2 WW 6.8
B 34 MM 0.3
B 158 EE 3
B 190 EE 7.4
...
I tried this, but I still need to add the new category column with the WW, MM and EE groups.
library(tidyr)
# NOTE(review): gather() is superseded by pivot_longer(), and `1:90`
# appears to select far more columns than df has (6), so this call fails
# as written — kept verbatim as the asker's attempt.
data_long <- gather(df, Letter, value, 1:90, factor_key=TRUE)
Does this work:
library(dplyr)
library(tidyr)
# Reshape wide -> long, convert the character Number/Value columns to
# numbers, then label each Number with its group (WW / MM / EE).
df %>%
  pivot_longer(cols = !A, names_to = 'Number', values_to = 'Value') %>%
  # TRUE, not T: T is an ordinary variable and can be reassigned
  type.convert(as.is = TRUE) %>%
  mutate(Variable = case_when(Number %in% c(1, 2) ~ 'WW',
                              Number == 34 ~ 'MM',
                              TRUE ~ 'EE')) %>%
  select('Letter' = A, Number, Variable, Value)
Output:
# A tibble: 15 x 4
Letter Number Variable Value
<chr> <int> <chr> <dbl>
1 A 1 WW 1.9
2 A 2 WW 1.9
3 A 34 MM 3.9
4 A 158 EE 2.9
5 A 190 EE 22.1
6 B 1 WW 6.8
7 B 2 WW 6.8
8 B 34 MM 0.3
9 B 158 EE 3
10 B 190 EE 7.4
11 C 1 WW 4.7
12 C 2 WW 4.7
13 C 34 MM 2.7
14 C 158 EE 45
15 C 190 EE 56
>
Within every group (after sorting the values in decreasing order), I would like to keep only the observations whose cumulative share of the group total is at most X% — in my case, less than or equal to 80 percent.
So from this dataframe below:
# 15 observations in three groups (A: 5 rows, B: 4 rows, C: 6 rows)
Group <- rep(c("A", "B", "C"), c(5, 4, 6))
value <- c(2, 3, 6, 3, 1, 1, 3, 3, 5, 4, 3, 5, 3, 4, 2)  # was c(c(...)) — redundant nesting
data1 <- data.frame(Group, value)
# Per group: sort by decreasing value, then add each value's rounded
# percentage of the group total and its cumulative percentage.
# NOTE(review): the result stays grouped by Group; add ungroup() if an
# ungrouped data frame is expected downstream.
data1 <- data1 %>%
  arrange(Group, desc(value)) %>%
  group_by(Group) %>%
  mutate(pct = round(100 * value / sum(value), 1)) %>%
  mutate(cumPct = cumsum(pct))
I would like to obtain the filtered data frame below, according to the conditions I described above:
Group value pct cumPct
1 A 6 40.0 40.0
2 A 3 20.0 60.0
3 A 3 20.0 80.0
4 B 5 41.7 41.7
5 B 3 25.0 66.7
6 C 5 23.8 23.8
7 C 4 19.0 42.8
8 C 4 19.0 61.8
9 C 3 14.3 76.1
You can arrange the data in descending order of value, for each Group calculate pct and cum_pct and select rows where cum_pct is less than equal to 80.
library(dplyr)
# For every Group: order by decreasing value, express each value as a
# percentage of the group total (prop.table(x) is x / sum(x)), accumulate
# it, and keep only rows whose cumulative percentage does not exceed 80.
data1 %>%
  arrange(Group, desc(value)) %>%
  group_by(Group) %>%
  mutate(pct = prop.table(value) * 100) %>%
  mutate(cum_pct = cumsum(pct)) %>%
  filter(cum_pct <= 80)
# Group value pct cum_pct
# <chr> <dbl> <dbl> <dbl>
#1 A 6 40 40
#2 A 3 20 60
#3 A 3 20 80
#4 B 5 41.7 41.7
#5 B 3 25 66.7
#6 C 5 23.8 23.8
#7 C 4 19.0 42.9
#8 C 4 19.0 61.9
#9 C 3 14.3 76.2
I want to conditionally create a new var = old var. My data looks like this:
id id2
1.1 1 1
1.2 2 2
1.3 3 3
1.4 4 4
1.5 NA 5
5.5 5 6
5.6 6 7
5.7 7 8
5.8 8 9
5.51 NA 10
9.9 9 11
9.10 10 12
9.11 11 13
9.4 NA 14
12.12 12 15
12.2 NA 16
13.13 13 17
13.14 14 18
13.15 15 19
13.16 16 20
How can I create a new var = id2 when id is missing? If id is not missing, id3 is missing.
id id2 id3
1.1 1 1
1.2 2 2
1.3 3 3
1.4 4 4
1.5 NA 5 5
5.5 5 6
5.6 6 7
5.7 7 8
5.8 8 9
5.51 NA 10 10
9.9 9 11
9.10 10 12
9.11 11 13
9.4 NA 14 14
12.12 12 15
12.2 NA 16 16
13.13 13 17
13.14 14 18
13.15 15 19
13.16 16 20
Thanks!!
Assuming that dat is your data frame, you can do the following based on ifelse in base R.
# Base-R ifelse: take id2 where id is NA, otherwise NA (modifies dat)
dat$id3 <- with(dat, ifelse(is.na(id), id2, NA))
Or
# Same logic via transform(), returning a modified copy as dat2
dat2 <- transform(dat, id3 = ifelse(is.na(id), id2, NA))
DATA
# Example data: three whitespace-separated tokens per line — the first
# becomes the row name, then columns id (with NAs) and id2.
dat <- read.table(text = " id id2
1.1 1 1
1.2 2 2
1.3 3 3
1.4 4 4
1.5 NA 5
5.5 5 6
5.6 6 7
5.7 7 8
5.8 8 9
5.51 NA 10
9.9 9 11
9.10 10 12
9.11 11 13
9.4 NA 14
12.12 12 15
12.2 NA 16
13.13 13 17
13.14 14 18
13.15 15 19
13.16 16 20",
header = TRUE)
I want to create another data frame (df) that lists only events. For example, there should be 4 events in df(XX,YY). The column XX should be sum of event value greater than zero separated by zero rows. The column YY should be Max minus Min of event value greater than zero separated by zero rows.
XX YY
1 3.0 23.6
2 0.0 23.2
3 0.0 23.7
4 0.0 25.2
5 1.3 24.5
6 4.8 24.2
7 0.2 23.1
8 0.0 23.3
9 0.0 23.9
10 0.0 24.3
11 1.8 24.6
12 3.2 23.7
13 0.0 23.2
14 0.0 23.6
15 0.0 24.1
16 0.2 24.5
17 4.8 24.1
18 3.7 22.1
19 0.0 23.4
20 0.0 23.8
From my table, I would like to get the results as following.
Event 1. XX[1] = sum(row1,row2) ; YY[1] = [Max(row1,row2)- Min(row1,row2)]
XX[1]=3, YY[1]=0.4
Event 2. XX[2] = sum(row4,row5,row6,row7,row8) ; YY[2] = [Max(row4,row5,row6,row7,row8)- Min(row4,row5,row6,row7,row8)]
XX[2]=6.3, YY[2]=2.1
Event 3. XX[3] = sum(row10,row11,row12,row13) ; YY[3] = [Max(row10,row11,row12,row13)- Min(row10,row11,row12,row13)]
XX[3]=5, YY[3]=1.4
Event 4. XX[4] = sum(row15,row16,row17,row18,row19) ; YY[4] = [Max(row15,row16,row17,row18,row19)- Min(row15,row16,row17,row18,row19)]
XX[4]=5, YY[4]=2.4
XX YY
1 3 0.4
2 6.3 2.1
3 5 1.4
4 8.7 2.4
Method 1 in base R
Split the original data.frame into a list.
# Assign consecutive group ids (event runs separated by single-row
# groups), then keep only the multi-row groups and drop their names.
lst <- split(df, rep(seq_len(8), times = c(2, 1, 5, 1, 4, 1, 5, 1)))
lst <- Filter(function(x) nrow(x) > 1, lst)
lst <- unname(lst)
Note that this is exactly the same as your original data, with the only difference that relevant rows are grouped into separate data.frames, and irrelevant rows (row3, row9, row14, row20) have been removed.
Next define a custom function
# Summarise one event data frame as a length-2 numeric vector:
# c(total of column XX, width of the YY range, i.e. max(YY) - min(YY))
calc_summary_stats <- function(df) {
  c(sum(df$XX), diff(range(df$YY)))
}
Apply the function to your list elements using sapply to get your expected outcome.
# Apply the summary function to every event data frame.
# vapply (not sapply) guarantees a numeric 2 x n result even if lst is
# empty, so t() always yields an n x 2 matrix.
m <- t(vapply(lst, calc_summary_stats, numeric(2)))
colnames(m) <- c("XX", "YY")
# XX YY
#[1,] 3.0 0.4
#[2,] 6.3 2.1
#[3,] 5.0 1.4
#[4,] 8.7 2.4
Method 2 using tidyverse
Using dplyr, we can first add an idx column by which we group the data; then filter the groups with >1 row, calculate the two summary statistics for every group, and output the ungrouped data with the idx column removed.
library(tidyverse)
# Tag each row with an event index, drop the single-row separator groups,
# summarise each remaining event, and strip the helper column.
df %>%
  mutate(idx = rep(1:8, times = c(2, 1, 5, 1, 4, 1, 5, 1))) %>%
  group_by(idx) %>%
  filter(n() > 1) %>%
  summarise(XX = sum(XX), YY = max(YY) - min(YY)) %>%
  ungroup() %>%
  select(-idx)
## A tibble: 4 x 2
# XX YY
# <dbl> <dbl>
#1 3.00 0.400
#2 6.30 2.10
#3 5.00 1.40
#4 8.70 2.40
Sample data
# Sample data: 20 rows; the leading number on each line becomes the row
# name, leaving two numeric columns XX and YY.
df <- read.table(text =
"XX YY
1 3.0 23.6
2 0.0 23.2
3 0.0 23.7
4 0.0 25.2
5 1.3 24.5
6 4.8 24.2
7 0.2 23.1
8 0.0 23.3
9 0.0 23.9
10 0.0 24.3
11 1.8 24.6
12 3.2 23.7
13 0.0 23.2
14 0.0 23.6
15 0.0 24.1
16 0.2 24.5
17 4.8 24.1
18 3.7 22.1
19 0.0 23.4
20 0.0 23.8", header = TRUE)  # TRUE, not T (T can be reassigned)