I'm trying to summarize this data set as an example and I'm trying to use multiple functions n() & mean(). How can I combine both in the same workflow?
Here is a toy dataset that mirrors my larger data:
library(tidyverse)
df <- structure(list(group_var = c(70, 72, 73, 70, 70, 71, 70, 71,
71, 70), var1_scr = c(50.5, 25.75, 50.5, 50.5, 50.5, 50.5, 75.25,
75.25, 50.5, 75.25), var2_scr = c(50.5, 50.5, NA, 75.25, 50.5,
50.5, 75.25, 75.25, 100, 75.25), var3_scr = c(NA, NA, 75.25,
NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
df
#> # A tibble: 10 x 4
#> group_var var1_scr var2_scr var3_scr
#> <dbl> <dbl> <dbl> <dbl>
#> 1 70 50.5 50.5 NA
#> 2 72 25.8 50.5 NA
#> 3 73 50.5 NA 75.2
#> 4 70 50.5 75.2 NA
#> 5 70 50.5 50.5 NA
#> 6 71 50.5 50.5 NA
#> 7 70 75.2 75.2 NA
#> 8 71 75.2 75.2 NA
#> 9 71 50.5 100 NA
#> 10 70 75.2 75.2 NA
# Summarize the scores: mean of every *_scr column per group, ignoring NAs.
# A group whose column is entirely NA yields NaN.
# `funs()` is deprecated since dplyr 0.8.0, so the function is supplied as a
# formula-style lambda inside list() instead.
df %>%
  group_by(group_var) %>%
  summarise_at(vars(ends_with("_scr")), list(~ mean(., na.rm = TRUE)))
#> # A tibble: 4 x 4
#> group_var var1_scr var2_scr var3_scr
#> <dbl> <dbl> <dbl> <dbl>
#> 1 70 60.4 65.4 NaN
#> 2 71 58.8 75.2 NaN
#> 3 72 25.8 50.5 NaN
#> 4 73 50.5 NaN 75.2
# Count all the observations in each group_var group; the tally is stored in
# an integer column called "obs".
count(df, group_var, name = "obs")
#> # A tibble: 4 x 2
#> group_var obs
#> <dbl> <int>
#> 1 70 5
#> 2 71 3
#> 3 72 1
#> 4 73 1
# my goal is to produce this dataset but using the mutate_at function
df %>% group_by(group_var) %>%
summarise(var1_scr = mean(var1_scr),
var2_scr = mean(var2_scr),
var3_scr = mean(var3_scr),
obs = n())
#> # A tibble: 4 x 5
#> group_var var1_scr var2_scr var3_scr obs
#> <dbl> <dbl> <dbl> <dbl> <int>
#> 1 70 60.4 65.4 NA 5
#> 2 71 58.8 75.2 NA 3
#> 3 72 25.8 50.5 NA 1
#> 4 73 50.5 NA 75.2 1
Created on 2019-08-15 by the reprex package (v0.3.0)
An option is to add the 'n' also in the grouping variable after grouping by 'group_var' and then do the summarise_at
library(dplyr)
# Count the rows of each group_var group, then promote that count to a second
# grouping key so it is carried through summarise_at() as a column.
# `.add = TRUE` (dplyr >= 1.0.0) replaces the deprecated `add = TRUE`.
df %>%
  group_by(group_var) %>%
  group_by(obs = n(), .add = TRUE) %>%
  summarise_at(vars(ends_with("_scr")), list(~ mean(., na.rm = TRUE)))
# A tibble: 4 x 5
# Groups: group_var [4]
# group_var obs var1_scr var2_scr var3_scr
# <dbl> <int> <dbl> <dbl> <dbl>
#1 70 5 60.4 65.4 NaN
#2 71 3 58.8 75.2 NaN
#3 72 1 25.8 50.5 NaN
#4 73 1 50.5 NaN 75.2
Another option is to create the frequency column with mutate, and get the mean by including that also in the summarise_at (e.g. mean(rep(3, 5)) -> 3)
# obs is constant within a group, so averaging it in summarise_at() simply
# returns the group size (as a double rather than an integer).
df %>%
group_by(group_var) %>%
mutate(obs = n()) %>%
summarise_at(vars(ends_with("_scr"), obs), list(~mean(., na.rm = TRUE)))
# A tibble: 4 x 5
# group_var var1_scr var2_scr var3_scr obs
# <dbl> <dbl> <dbl> <dbl> <dbl>
#1 70 60.4 65.4 NaN 5
#2 71 58.8 75.2 NaN 3
#3 72 25.8 50.5 NaN 1
#4 73 50.5 NaN 75.2 1
NOTE: Both of these provide one column for the 'obs'
Here, the OP's expected output is a summarised output for which summarise/summarise_at/summarise_all/summarise_if are efficient. However, if we need to use mutate_at (only for demonstration)
# mutate_at() overwrites each selected column with its group mean on every
# row, so all rows of a group become identical; distinct_at() then collapses
# them to one row per group (in order of first appearance).
df %>%
group_by(group_var) %>%
mutate(obs = n()) %>%
mutate_at(vars(ends_with("_scr"), obs), list(~mean(., na.rm = TRUE))) %>%
distinct_at(vars(group_var, ends_with("_scr"), obs))
# A tibble: 4 x 5
# Groups: group_var [4]
# group_var var1_scr var2_scr var3_scr obs
# <dbl> <dbl> <dbl> <dbl> <dbl>
#1 70 60.4 65.4 NaN 5
#2 72 25.8 50.5 NaN 1
#3 73 50.5 NaN 75.2 1
#4 71 58.8 75.2 NaN 3
If you need the two functions in the same call, we can do
library(dplyr)
df %>% group_by(group_var) %>%
summarise_at(vars(ends_with("_scr")), list(m=~mean(., na.rm = TRUE), n=~n()))
# A tibble: 4 x 7
group_var var1_scr_m var2_scr_m var3_scr_m var1_scr_n var2_scr_n var3_scr_n
<dbl> <dbl> <dbl> <dbl> <int> <int> <int>
1 70 60.4 65.4 NaN 5 5 5
2 71 58.8 75.2 NaN 3 3 3
3 72 25.8 50.5 NaN 1 1 1
4 73 50.5 NaN 75.2 1 1 1
Consider the OP note: my goal is to produce this dataset but using the mutate_at function
# mutate_at() with a named list adds <col>_m (group mean) and <col>_n (group
# size) columns alongside the original *_scr columns; slice(1) then keeps only
# the first row of each group.
df %>% group_by(group_var) %>%
mutate_at(vars(ends_with("_scr")), list(m=~mean(., na.rm = TRUE), n=~n())) %>%
slice(1)
Related
I need to divide columns despesatotal and despesamonetaria by the row named Total:
Let's suppose your data set is df.
# 1) Delete the last row (assumed here to be the "Total" row)
df <- df[-nrow(df),]
# 2) Build the desired data.frame by combining the CNAE names with the
#    proportion columns: prop.table(margin = 2) scales each remaining column
#    by its own column sum, and the factor 100 turns that into a percentage.
new.df <- cbind(grup_CNAE = df$grup_CNAE,
100*prop.table(df[,-1],margin = 2))
Finally, rename your columns. Be careful with matrix versus data.frame formats, because mathematical operations can behave differently on each. If you use the dput function to give us a reproducible example, the answer can be more accurate.
Here is a way to get it done. This is not the best way, but I think it is very readable.
Suppose this is your data frame:
mydf = structure(list(grup_CNAE = c("A", "B", "C", "D", "E", "Total"
), despesatotal = c(71, 93, 81, 27, 39, 311), despesamonetaria = c(7,
72, 36, 22, 73, 210)), row.names = c(NA, -6L), class = "data.frame")
mydf
# grup_CNAE despesatotal despesamonetaria
#1 A 71 7
#2 B 93 72
#3 C 81 36
#4 D 27 22
#5 E 39 73
#6 Total 311 210
To divide despesatotal values with its total value, you need to use the total value (311 in this example) as the denominator. Note that the total value is located in the last row. You can identify its position by indexing the despesatotal column and use nrow() as the index value.
mydf |> mutate(percentage1 = despesatotal/despesatotal[nrow(mydf)],
percentage2 = despesamonetaria /despesamonetaria[nrow(mydf)])
# grup_CNAE despesatotal despesamonetaria percentage1 percentage2
#1 A 71 7 0.22829582 0.03333333
#2 B 93 72 0.29903537 0.34285714
#3 C 81 36 0.26045016 0.17142857
#4 D 27 22 0.08681672 0.10476190
#5 E 39 73 0.12540193 0.34761905
#6 Total 311 210 1.00000000 1.00000000
library(tidyverse)
Sample data
# A tibble: 11 x 3
group despesatotal despesamonetaria
<chr> <int> <int>
1 1 198 586
2 2 186 525
3 3 202 563
4 4 300 562
5 5 126 545
6 6 215 529
7 7 183 524
8 8 163 597
9 9 213 592
10 10 175 530
11 Total 1961 5553
# Express each column as a share of the value in its final ("Total") row,
# then drop that final row from the result.
df %>%
  mutate(
    percentage_total = despesatotal / despesatotal[n()],
    percentage_monetaria = despesamonetaria / despesamonetaria[n()]
  ) %>%
  slice(-n())
# A tibble: 10 x 5
group despesatotal despesamonetaria percentage_total percentage_monetaria
<chr> <int> <int> <dbl> <dbl>
1 1 198 586 0.101 0.106
2 2 186 525 0.0948 0.0945
3 3 202 563 0.103 0.101
4 4 300 562 0.153 0.101
5 5 126 545 0.0643 0.0981
6 6 215 529 0.110 0.0953
7 7 183 524 0.0933 0.0944
8 8 163 597 0.0831 0.108
9 9 213 592 0.109 0.107
10 10 175 530 0.0892 0.0954
This is a good place to use dplyr::mutate(across()) to divide all relevant columns by the Total row. Note this is not sensitive to the order of the rows and will apply the manipulation to all numeric columns. You can supply any tidyselect semantics to across() instead if needed in your case.
library(tidyverse)
# make sample data
d <- tibble(grup_CNAE = paste0("Group", 1:12),
despesatotal = sample(1e6:5e7, 12),
despesamonetaria = sample(1e6:5e7, 12)) %>%
add_row(grup_CNAE = "Total", summarize(., across(where(is.numeric), sum)))
# Scale every numeric column by its entry in the row labelled "Total";
# the row is located by name, so its position in the table does not matter.
d %>%
  mutate(
    across(where(is.numeric), ~ .x / .x[grup_CNAE == "Total"])
  )
#> # A tibble: 13 × 3
#> grup_CNAE despesatotal despesamonetaria
#> <chr> <dbl> <dbl>
#> 1 Group1 0.117 0.0204
#> 2 Group2 0.170 0.103
#> 3 Group3 0.0451 0.0837
#> 4 Group4 0.0823 0.114
#> 5 Group5 0.0170 0.0838
#> 6 Group6 0.0174 0.0612
#> 7 Group7 0.163 0.155
#> 8 Group8 0.0352 0.0816
#> 9 Group9 0.0874 0.135
#> 10 Group10 0.113 0.0877
#> 11 Group11 0.0499 0.0495
#> 12 Group12 0.104 0.0251
#> 13 Total 1 1
Created on 2022-11-08 with reprex v2.0.2
I have a data frame in R (xlsx file imported) that the first column contain dates with character type:
> dat
# A tibble: 4,372 x 3
date `1` `2`
<chr> <dbl> <dbl>
1 40544 35.5 35.5
2 40545 35.6 35.8
3 40546 37.2 36.4
4 40547 36.7 35.4
5 40548 36.6 35.3
I want to convert the character type into date type in R.
I tried the below code but NA's occur:
> dat%>%
+ mutate(date2 = as.Date(date, format= "%m-%d-%Y"))
# A tibble: 4,372 x 4
date `1` `2` date2
<chr> <dbl> <dbl> <date>
1 40544 35.5 35.5 NA
2 40545 35.6 35.8 NA
3 40546 37.2 36.4 NA
4 40547 36.7 35.4 NA
5 40548 36.6 35.3 NA
How can i fix this ?
Any help ?
Is this what you're looking for?
# Convert Excel serial day numbers (read in as character) to Date.
# Excel on Windows numbers 1900-01-01 as day 1 and also counts a non-existent
# 1900-02-29, so the effective origin in R is "1899-12-30"; using
# "1900-01-01" shifts every date two days late.
# NOTE(review): assumes the workbook uses the default 1900 date system; files
# saved with the 1904 date system need origin = "1904-01-01" instead.
dat %>%
  mutate(date2 = as.numeric(date),
         date2 = as.Date(date2, origin = "1899-12-30"))
#>    date      date2
#> 1 40544 2011-01-01
#> 2 40545 2011-01-02
#> 3 40546 2011-01-03
#> 4 40547 2011-01-04
I have a data frame with four columns:
library(tibble)

# Example data: three candidate k values per year, each with a simulated
# population (pop) and the observed population for that year (pop_obs).
year <- c(2000, 2000, 2000, 2001, 2001, 2001, 2002, 2002, 2002)
k <- c(12.5, 11.5, 10.5, -8.5, -9.5, -10.5, 13.9, 14.9, 15.9)
pop <- c(143, 147, 154, 445, 429, 430, 178, 181, 211)
pop_obs <- c(150, 150, 150, 440, 440, 440, 185, 185, 185)
# tibble() replaces data_frame(), which is deprecated since tibble 2.0.0.
df <- tibble(year, k, pop, pop_obs)
df
year k pop pop_obs
<dbl> <dbl> <dbl> <dbl>
1 2000 12.5 143 150
2 2000 11.5 147 150
3 2000 10.5 154 150
4 2001 -8.5 445 440
5 2001 -9.5 429 440
6 2001 -10.5 430 440
7 2002 13.9 178 185
8 2002 14.9 181 185
9 2002 15.9 211 185
What I want is, for each year, the value of k whose pop has the minimum difference from pop_obs. Finally, I want to keep the result as a data frame with one row per year, containing year and k.
my expected output would be like this:
year k
<dbl> <dbl>
1 2000 11.5
2 2001 -8.5
3 2002 14.9
You could try with dplyr
df<- data.frame(year,k,pop,pop_obs)
library(dplyr)
df %>%
mutate(diff = abs(pop_obs - pop)) %>%
group_by(year) %>%
filter(diff == min(diff)) %>%
select(year, k)
#> # A tibble: 3 x 2
#> # Groups: year [3]
#> year k
#> <dbl> <dbl>
#> 1 2000 11.5
#> 2 2001 -8.5
#> 3 2002 14.9
Created on 2021-12-11 by the reprex package (v2.0.1)
Try tidyverse way
library(tidyverse)
# For each year, keep the k whose pop is closest to pop_obs.
# The comparison must be made within a year (not within year and k), and on
# the absolute gap, otherwise nothing is actually selected.
data_you_want <- df %>%
  mutate(dif = abs(pop - pop_obs)) %>%  # size of the gap; sign is irrelevant
  group_by(year) %>%
  filter(dif == min(dif)) %>%           # ties within a year are all kept
  ungroup() %>%
  select(year, k)
Using base R
# Within each year, flag the rows whose |pop_obs - pop| equals that year's
# minimum, then return only the year and k columns of the flagged rows.
subset(
  df,
  subset = as.logical(
    ave(abs(pop_obs - pop), year, FUN = function(gap) gap == min(gap))
  ),
  select = c("year", "k")
)
# A tibble: 3 × 2
year k
<dbl> <dbl>
1 2000 11.5
2 2001 -8.5
3 2002 14.9
I am trying to find the mean of A and Z for each row and save it as a separate column, but it seems the code only averages the first row and fills the rest of the rows with that value. Any suggestions on how to fix this?
library(tidyverse)
library(lubridate)
set.seed(123)
DF <- data.frame(Date = seq(as.Date("2001-01-01"), to = as.Date("2003-12-31"), by = "day"),
A = runif(1095, 1,60),
Z = runif(1095, 5,100)) %>%
mutate(MeanofAandZ= mean(A:Z))
Are you looking for this:
# Switch to row-wise evaluation so mean() sees one row's A-to-Z values at a
# time instead of whole columns.
mutate(
  rowwise(DF),
  MeanofAandZ = mean(c_across(A:Z))
)
# A tibble: 1,095 x 4
# Rowwise:
Date A Z MeanofAandZ
<date> <dbl> <dbl> <dbl>
1 2001-01-01 26.5 7.68 17.1
2 2001-01-02 54.9 33.1 44.0
3 2001-01-03 37.1 82.0 59.5
4 2001-01-04 6.91 18.0 12.4
5 2001-01-05 53.0 8.76 30.9
6 2001-01-06 26.1 7.63 16.9
7 2001-01-07 59.3 30.8 45.0
8 2001-01-08 39.9 14.6 27.3
9 2001-01-09 59.2 93.6 76.4
10 2001-01-10 30.7 89.1 59.9
you can do it with Base R: rowMeans
Full Base R:
# Average columns A and Z within each row and store the result as a new
# column, then preview the first rows.
DF[["MeanofAandZ"]] <- rowMeans(DF[, c("A", "Z")])
head(DF)
#> Date A Z MeanofAandZ
#> 1 2001-01-01 17.967074 76.92436 47.44572
#> 2 2001-01-02 47.510003 99.28325 73.39663
#> 3 2001-01-03 25.129638 64.33253 44.73109
#> 4 2001-01-04 53.098027 32.42556 42.76179
#> 5 2001-01-05 56.487570 23.99162 40.23959
#> 6 2001-01-06 3.687833 81.08720 42.38751
or inside a mutate:
library(dplyr)
DF <- DF %>% mutate(MeanofAandZ = rowMeans(cbind(A,Z)))
head(DF)
#> Date A Z MeanofAandZ
#> 1 2001-01-01 17.967074 76.92436 47.44572
#> 2 2001-01-02 47.510003 99.28325 73.39663
#> 3 2001-01-03 25.129638 64.33253 44.73109
#> 4 2001-01-04 53.098027 32.42556 42.76179
#> 5 2001-01-05 56.487570 23.99162 40.23959
#> 6 2001-01-06 3.687833 81.08720 42.38751
We can also do
DF$MeanofAandZ <- Reduce(`+`, DF[c("A", "Z")])/2
Or using apply
DF$MeanofAandZ <- apply(DF[c("A", "Z")], 1, mean)
I am trying to group_by a variable and then do operations per row per group. I got lost when using ifelse vs case_when. There is something basic I am failing to understand between the usage of two. I was assuming both would give me same output but that is not the case here. Using ifelse didn't give the expected output but case_when did. And I am trying to understand why ifelse didn't give me the expected output.
Here is the example df
structure(list(Pos = c(73L, 146L, 146L, 150L, 150L, 151L, 151L,
152L, 182L, 182L), Percentage = c(81.2, 13.5, 86.4, 66.1, 33.9,
48.1, 51.9, 86.1, 48, 52)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame")) -> foo
I am grouping by Pos and I want to round Percentage if their sum is 100. The following is using ifelse:
library(tidyverse)
foo %>%
group_by(Pos) %>%
mutate(sumn = n()) %>%
mutate(Val = ifelse(sumn == 1,100,
ifelse(sum(Percentage) == 100, unlist(map(Percentage,round)), 0)
# case_when(sum(Percentage) == 100 ~ unlist(map(Percentage,round)),
# TRUE ~ 0
# )
))
the output is
# A tibble: 10 x 4
# Groups: Pos [6]
Pos Percentage sumn Val
<int> <dbl> <int> <dbl>
1 73 81.2 1 100
2 146 13.5 2 0
3 146 86.4 2 0
4 150 66.1 2 66
5 150 33.9 2 66
6 151 48.1 2 48
7 151 51.9 2 48
8 152 86.1 1 100
9 182 48 2 48
10 182 52 2 48
I don't want this, rather I want the following which I get using case_when
foo %>%
group_by(Pos) %>%
mutate(sumn = n()) %>%
mutate(Val = ifelse(sumn == 1,100,
#ifelse(sum(Percentage) == 100, unlist(map(Percentage,round)), 0)
case_when(sum(Percentage) == 100 ~ unlist(map(Percentage,round)),
TRUE ~ 0
)
))
# A tibble: 10 x 4
# Groups: Pos [6]
Pos Percentage sumn Val
<int> <dbl> <int> <dbl>
1 73 81.2 1 100
2 146 13.5 2 0
3 146 86.4 2 0
4 150 66.1 2 66
5 150 33.9 2 34
6 151 48.1 2 48
7 151 51.9 2 52
8 152 86.1 1 100
9 182 48 2 48
10 182 52 2 52
What is ifelse doing different?
According to ?ifelse
A vector of the same length and attributes (including dimensions and "class") as test and data values from the values of yes or no.
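In other words, ifelse() returns a result with the same length as `test`. Here `sum(Percentage) == 100` is a single value per group, so only the first rounded value is returned and it is then recycled across the whole group. If we replicate the test to make the lengths the same, it works as expected: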
# ifelse() returns a vector shaped like its `test` argument, so the inner test
# is repeated to the group size with rep(..., n()); that way one rounded value
# is produced per row instead of a single value recycled over the group.
# NOTE(review): `sum(Percentage) == 100` is an exact floating-point
# comparison; consider dplyr::near() if sums may be off by rounding error.
foo %>%
group_by(Pos) %>%
mutate(sumn = n()) %>%
mutate(Val = ifelse(sumn == 1,100,
ifelse(rep(sum(Percentage) == 100,
n()), unlist(map(Percentage,round)), 0)
))
# A tibble: 10 x 4
# Groups: Pos [6]
Pos Percentage sumn Val
<int> <dbl> <int> <dbl>
1 73 81.2 1 100
2 146 13.5 2 0
3 146 86.4 2 0
4 150 66.1 2 66
5 150 33.9 2 34
6 151 48.1 2 48
7 151 51.9 2 52
8 152 86.1 1 100
9 182 48 2 48
10 182 52 2 52