My data take this shape:
# Reproducible toy data: revenue for groups A and B across three months,
# one row per group per month (rows alternate A, B within each month).
set.seed(666)
grouping <- rep(c("A", "B"), times = 3)
theMonth <- rep(c("2022_01", "2022_02", "2022_03"), each = 2)
revenue <- sample(100:1000, size = 6)
df <- tibble(grouping = grouping, theMonth = theMonth, revenue = revenue)
I'm being asked to spread these data by month...
# Reshape to one row per group and one column per month.
# pivot_wider() is the maintained replacement for the superseded spread().
step1 <- pivot_wider(df, names_from = theMonth, values_from = revenue)
step1
# A tibble: 2 × 4
grouping `2022_01` `2022_02` `2022_03`
<chr> <int> <int> <int>
1 A 673 707 639
2 B 737 222 753
...but also, within the same table, I'm being asked for the cumulative progress of B (and only B) toward a target, say in this case 10000. So the desired output is something like:
grouping `2022_01` `2022_02` `2022_03`
<chr> <int> <int> <int>
1 A 673 707 639
2 B 737 222 753
3 CumSumB 737 959 1712
4 Progress 7.37% 9.59% 17.12%
What's the best way to attack this? Should I do it before I spread, probably using mutate? Or is there a clean way to do it after the spread?
(Answer does not have to use dplyr, but that is my preferred package for this sort of work.)
We may filter the data first, get the cumulative sum column, bind the data with the original data and then create the row for 'Progress' with add_row
library(dplyr)
library(tidyr)
library(tibble)
# Build B's running total as an extra "CumSumB" group, stack it under the
# original rows, then reshape to one column per month.
df %>%
  filter(grouping == "B") %>%
  mutate(grouping = "CumSumB", revenue = cumsum(revenue)) %>%
  bind_rows(df, .) %>%
  pivot_wider(names_from = theMonth, values_from = revenue) %>%
  # Progress is CumSumB as a percentage of the 10000 target. The CumSumB row
  # is looked up by name rather than by row position, so the result stays
  # correct if df contains more groups than A and B.
  add_row(
    .,
    tibble(
      grouping = "Progress",
      .[.$grouping == "CumSumB", -1] / 10000 * 100
    )
  )
-output
# A tibble: 4 × 4
grouping `2022_01` `2022_02` `2022_03`
<chr> <dbl> <dbl> <dbl>
1 A 673 707 639
2 B 737 222 753
3 CumSumB 737 959 1712
4 Progress 7.37 9.59 17.1
Adding the % would make the whole column character. If needed, it can be done
library(stringr)
# Same table as before, with a "%" suffix on the Progress row. Mixing "7.37%"
# with plain numbers forces every month column to character.
df %>%
  filter(grouping == "B") %>%
  mutate(grouping = "CumSumB", revenue = cumsum(revenue)) %>%
  bind_rows(df, .) %>%
  pivot_wider(names_from = theMonth, values_from = revenue) %>%
  # Look up the CumSumB row by name rather than by row position so the result
  # stays correct if df contains more groups than A and B.
  add_row(
    .,
    tibble(
      grouping = "Progress",
      .[.$grouping == "CumSumB", -1] / 10000 * 100
    )
  ) %>%
  # Append "%" only on the Progress row (selected by label, not by being last).
  mutate(
    across(
      -grouping,
      ~ if_else(grouping == "Progress", str_c(.x, "%"), as.character(.x))
    )
  )
# A tibble: 4 × 4
grouping `2022_01` `2022_02` `2022_03`
<chr> <chr> <chr> <chr>
1 A 673 707 639
2 B 737 222 753
3 CumSumB 737 959 1712
4 Progress 7.37% 9.59% 17.12%
Here is an alternative approach:
library(dplyr)
library(tidyr)
# Alternative that carries A's revenue along on B's rows, then reshapes.
# NOTE: relies on df rows strictly alternating A, B within each month.
df %>%
# On every row, store the previous row's revenue; on a B row this is the
# A revenue of the same month (the first row falls back to its own value).
mutate(revenueA = lag(revenue, default = revenue[1])) %>%
# Keep the even-numbered rows, i.e. the B rows.
filter(row_number() %% 2 == 0) %>%
# Running total of B, and that total as a percentage of the 10000 target
# (CumSum / 10000 * 100 simplifies to CumSum / 100).
mutate(CumSum = cumsum(revenue),
Progres = paste0(CumSum/100, "%")) %>%
# Stack revenue, revenueA, CumSum and Progres into key/val pairs; values are
# converted to character so numbers and "x%" strings can share one column.
pivot_longer(-c(grouping, theMonth),
names_to = "key",
values_to = "val",
values_transform = list(val = as.character)) %>%
# One column per month.
pivot_wider(names_from = theMonth, values_from = val) %>%
# Relabel rows: revenue -> "B", revenueA -> "A", otherwise keep the key name.
mutate(grouping = case_when(key == "revenue" ~"B",
key == "revenueA" ~ "A",
TRUE ~ key)) %>%
arrange(grouping) %>%
select(-key)
grouping `2022_01` `2022_02` `2022_03`
<chr> <chr> <chr> <chr>
1 A 673 707 639
2 B 737 222 753
3 CumSum 737 959 1712
4 Progres 7.37% 9.59% 17.12%
Here is another option:
library(dplyr)
library(tidyr)
# Put each group in its own column so B's running total and its share of the
# 10000 target can be computed column-wise, then flip back so that the months
# become the columns.
df %>%
  pivot_wider(names_from = "grouping", values_from = "revenue") %>%
  mutate(CumSumB = cumsum(B)) %>%
  mutate(Progress = (CumSumB / 10000) * 100) %>%
  pivot_longer(cols = -theMonth, names_to = "grouping", values_to = "value") %>%
  pivot_wider(names_from = "theMonth", values_from = "value")
Returns:
grouping `2022_01` `2022_02` `2022_03`
<chr> <dbl> <dbl> <dbl>
1 A 673 707 639
2 B 737 222 753
3 CumSumB 737 959 1712
4 Progress 7.37 9.59 17.1
Related
The dataset is nycflights13.R. I'm unable to get the required answer:
ATL 895
BOS 422
When I run my code I can get the top for ATL but not for BOS, I'm having trouble narrowing down the resulting df to just the two answers.
# Asker's attempt (kept verbatim): list arrival delays for ATL and BOS,
# largest first.
flights %>%
filter(dest == 'ATL' | dest == 'BOS') %>%
group_by(dest) %>%
# arr_delay is renamed to max_arr_delay here; no per-group maximum is taken.
select(dest, max_arr_delay = arr_delay) %>%
# NOTE(review): this sorts by arr_delay, which was renamed on the line above,
# and the arrange( call is missing its closing parenthesis.
arrange(desc(arr_delay) %>%
as.data.frame()
I am missing the step that would limit this data frame. I get the max values for both ATL and BOS, but BOS is buried by the arrange function. How do I limit the result so that I get only the max values for ATL and BOS?
Using dplyr::slice_max you could do:
library(nycflights13)
library(dplyr)
# Keep the two destinations, then within each destination keep the row(s)
# holding the largest arrival delay.
flights %>%
  filter(dest == "ATL" | dest == "BOS") %>%
  group_by(dest) %>%
  select(dest, max_arr_delay = arr_delay) %>%
  slice_max(order_by = max_arr_delay, n = 1)
#> # A tibble: 2 × 2
#> # Groups: dest [2]
#> dest max_arr_delay
#> <chr> <dbl>
#> 1 ATL 895
#> 2 BOS 422
Or using summarise:
library(nycflights13)
library(dplyr)
# Collapse each destination to its largest arrival delay; na.rm = TRUE skips
# flights whose arr_delay is missing.
flights %>%
  filter(dest == "ATL" | dest == "BOS") %>%
  group_by(dest) %>%
  summarise(max_arr_delay = max(arr_delay, na.rm = TRUE))
#> # A tibble: 2 × 2
#> dest max_arr_delay
#> <chr> <dbl>
#> 1 ATL 895
#> 2 BOS 422
I have a data set I modified a lot, to the point where the code doesn't look very clean and tidy, and I need some help in order to put everything in a clean dplyr style, this is my code:
# Asker's original script (kept verbatim): 4G coverage per country and year,
# imputed, renamed to EMI country names, and returned in long format.
ddd_dataset <- read_excel("data/ddd_dataset.xlsx")
# Keep only the 4G-coverage indicator rows.
new_data = ddd_dataset[ddd_dataset$`Indicator name`=="Population covered by at least a 4G mobile network (%)",]
new_data = new_data[order(new_data$Country),]
# Drop rows 1553 and 1554, keep columns 1, 5 and 6, and spread Year into
# columns. NOTE(review): presumably columns 1/5/6 are Country/Year/Value and
# the dropped rows are unwanted trailing records - confirm against the file.
new_data = spread(new_data[-c(1553, 1554), c(1,5,6)], Year, value = Value)
# Data imputation: back to long, coerce to numeric, fill gaps within each
# country (downwards first, then upwards), then widen again.
new_data = new_data %>% pivot_longer(-Country, names_to = "year") %>%
mutate(value = value %>% as.numeric()) %>%
group_by(Country) %>%
fill(value, .direction = "updown") %>%
pivot_wider(names_from = year, values_from = value)
# Change column: replace the ITU country name with the EMI country name.
itu_emi_countries <- read_csv("data/itu-emi-countries.csv")
itu_emi_countries <- itu_emi_countries %>% rename(Country = `ITU Name`)
# NOTE(review): by.x / by.y are arguments of base merge(), not of left_join();
# presumably the join falls back to the shared column names - confirm.
new_data = left_join(new_data, itu_emi_countries, by.x = "Country", by.y = "Country")
new_data$Country = new_data$`EMI Name`
# Keep the first 10 columns only.
new_data = new_data[,1:10]
# Turn data into long format: one row per country and year, year as numeric.
new_long =
new_data %>%
pivot_longer(-Country, names_to = "year", values_to = "x") %>%
mutate(across(year, as.numeric))
Does anyone know how I can rewrite these functions into a single function that has the style of a dplyr function (using %>%)?
Literal, with inference and caveats:
library(dplyr)
library(tidyr) # pivot_*, complete, fill
# library(readr)
# library(readxl)
ddd_dataset <- readxl::read_excel("ddd_dataset.xlsx")
itu_emi_countries <- readr::read_csv("itu-emi-countries.csv") %>%
  rename(Country = `ITU Name`)

# Wide table of 4G coverage: one row per country, one column per year, with
# gaps filled within each country.
new_data <- ddd_dataset %>%
  filter(`Indicator name` == "Population covered by at least a 4G mobile network (%)") %>%
  # non-numeric entries become NA; the coercion warning is expected
  mutate(Value = suppressWarnings(as.numeric(Value))) %>%
  # id_cols is named explicitly: passing it by position is deprecated in tidyr
  pivot_wider(id_cols = Country, names_from = Year, values_from = Value) %>%
  # we cannot impute before here, since some countries do not have all years,
  # but now they will
  pivot_longer(-Country, names_to = "Year", values_to = "Value") %>%
  arrange(Country, Year) %>%
  group_by(Country) %>%
  fill(Value, .direction = "updown") %>%
  # drop the grouping so new_data is a plain tibble for later steps
  ungroup() %>%
  pivot_wider(id_cols = Country, names_from = Year, values_from = Value)

# Long table keyed by EMI country name: one row per country and year.
new_long <- left_join(new_data, itu_emi_countries, by = "Country") %>%
  # inferring that you want to keep names for countries in new_data not
  # present in itu
  mutate(Country = coalesce(`EMI Name`, Country)) %>%
  # inferring you want all but `EMI Name`, not just hard-coding 1:10
  select(-`EMI Name`) %>%
  pivot_longer(-Country, names_to = "year", values_to = "x") %>%
  mutate(year = as.integer(year))
new_data
# # A tibble: 196 x 10
# Country `2012` `2013` `2014` `2015` `2016` `2017` `2018` `2019` `2020`
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 Afghanistan 0 0 0 0 0 4 7 22 26
# 2 Albania 0 0 0 35 80.2 85.3 85.5 95 98.4
# 3 Algeria 0 0 0 0 3.62 30.5 52.8 53.6 76.2
# 4 Andorra 50 50 50 50 50 85 85 85 85
# 5 Angola 7 7 7 7 8 8 8 18 30
# 6 Antigua and Barbuda 65 78.6 80 98 99 99 99 99 99
# 7 Argentina 0 0 0 65 85 85 90.8 91.2 97.7
# 8 Armenia 17.5 44 46 46.5 52.5 90.0 99.1 99.3 100
# 9 Australia 52.2 85 95 94 98 99 99.2 99.4 99.5
# 10 Austria 31.6 58.4 85 98 98 98 98 98 98
# # ... with 186 more rows
new_long
# # A tibble: 1,764 x 3
# Country year x
# <chr> <int> <dbl>
# 1 Afghanistan 2012 0
# 2 Afghanistan 2013 0
# 3 Afghanistan 2014 0
# 4 Afghanistan 2015 0
# 5 Afghanistan 2016 0
# 6 Afghanistan 2017 4
# 7 Afghanistan 2018 7
# 8 Afghanistan 2019 22
# 9 Afghanistan 2020 26
# 10 Albania 2012 0
# # ... with 1,754 more rows
But it seems unnecessary and inefficient to pivot back and forth when you ultimately want the data in long format. One-step:
# One pass in long format: rename countries, coerce values, add the missing
# Country/Year combinations, then fill gaps within each country by year.
new_long2 <- ddd_dataset %>%
  filter(`Indicator name` == "Population covered by at least a 4G mobile network (%)") %>%
  left_join(itu_emi_countries, by = "Country") %>%
  # some `EMI Name` are missing, so fall back to the original name
  mutate(Country = coalesce(`EMI Name`, Country)) %>%
  # "NULL" -> NA
  mutate(Value = suppressWarnings(as.numeric(Value))) %>%
  complete(Country, Year) %>%
  arrange(Year) %>%
  group_by(Country) %>%
  fill(Value, .direction = "updown") %>%
  ungroup() %>%
  select(Country, year = Year, x = Value)
(The only difference in the data, other than order, is that Year is a numeric in this last block and is integer above. This can easily be remedied, over to you.)
I am having some issues translating a data frame into wide format using pivot_wider. My data frame looks like this:
# Example input: three observations of A-D for each of three IDs.
# header = TRUE is spelled out; T is an ordinary variable that can be reassigned.
Data <- read.table(header = TRUE, text = "
ID A B C D
1 6.01764 0.00409222 0.000500143 101.816
1 6.01769 0.00431931 0.000565946 101.334
1 6.01774 0.00454617 0.00063163 101.923
2 6.01779 0.00477308 0.000697374 101.914
2 6.01784 0.00500005 0.000763118 101.905
2 6.0179 0.00522703 0.000828803 101.926
3 6.01795 0.005454 0.000894606 101.889
3 6.018 0.00568086 0.000960231 101.895
3 6.01805 0.00590783 0.00102603 101.87
")
I would like to create unique column names by combining The "ID" with the Column name so that it looks like this:
# Desired output: one column per ID/variable pair, one row per observation.
# check.names = FALSE keeps the names exactly as written ("1A", "1B", ...);
# by default read.table would rewrite them to "X1A", "X1B", ...
Datalong <- read.table(header = TRUE, check.names = FALSE, text = "
1A 1B 1C 1D 2A 2B 2C 2D 3A 3B 3C 3D
6.01764 0.00409222 0.000500143 101.816 6.01779 0.00477308 0.000697374 101.914 6.01795 0.005454 0.000894606 101.889
6.01769 0.00431931 0.000565946 101.334 6.01784 0.00500005 0.000763118 101.905 6.018 0.00568086 0.000960231 101.895
6.01774 0.00454617 0.00063163 101.923 6.0179 0.00522703 0.000828803 101.926 6.01805 0.00590783 0.00102603 101.87
")
I am thinking I might need to add a new column that counts each instance of the ID column (as it is time series data)
I have tried:
# Asker's first attempt (kept verbatim). ID is used as the names source and is
# also listed in values_from, both explicitly and again via colnames(Data).
DataNew <- Data %>% pivot_wider(names_from = ID, values_from = c(ID, colnames(Data)))
And
# Asker's second attempt (kept verbatim). `time` numbers the observations
# within each ID and becomes the names source; values_from still lists ID
# twice (explicitly and via colnames(Data)).
Data %>% group_by(ID) %>% mutate(time = row_number()) %>% pivot_wider(names_from = time, values_from = c(ID, colnames(Data)))
but to no avail. Any support would be greatly appreciated!
How about this:
# Go long first so each value carries its ID and variable name, build the
# combined column name (e.g. "1A"), then widen on that name.
DataNew <- Data %>%
  pivot_longer(-ID, names_to = "var", values_to = "vals") %>%
  group_by(ID, var) %>%
  mutate(
    # observation index within each ID/variable pair; it keeps the rows
    # distinct when widening. row_number() is also safe on empty input,
    # unlike 1:n().
    obs = row_number(),
    vnames = paste0(ID, var)
  ) %>%
  ungroup() %>%
  select(-c(ID, var)) %>%
  pivot_wider(names_from = vnames, values_from = vals) %>%
  select(-obs)
DataNew
# # A tibble: 3 x 12
# `1A` `1B` `1C` `1D` `2A` `2B` `2C` `2D` `3A` `3B`
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 6.02 0.00409 5.00e-4 102. 6.02 0.00477 6.97e-4 102. 6.02 0.00545
# 2 6.02 0.00432 5.66e-4 101. 6.02 0.00500 7.63e-4 102. 6.02 0.00568
# 3 6.02 0.00455 6.32e-4 102. 6.02 0.00523 8.29e-4 102. 6.02 0.00591
# # … with 2 more variables: `3C` <dbl>, `3D` <dbl>
We can use dcast from data.table
library(data.table)
# Cast to wide: one row per within-ID observation number, one column per
# variable/ID pair. setDT() converts Data to a data.table by reference.
dcast(
  setDT(Data),
  formula = rowid(ID) ~ ID,
  value.var = LETTERS[1:4]
)
In my code below, I was wondering why the result of n = n() is not shown in the final output.
library(tidyverse)
hsb <- read.csv('https://raw.githubusercontent.com/rnorouzian/e/master/hsb.csv')
# Asker's code (kept verbatim): mean and sd of math per sector.
hsb %>% dplyr::select(math, sector) %>% group_by(sector) %>%
# NOTE: n = n() sits inside across()'s parentheses, so it is passed to
# across() as an argument instead of to summarise() as a new column.
summarise(across(.fns = list(mean=mean, sd=sd), n = n()))
The issue seems to be with the closing bracket of across. We want the n to be a single column instead of repeating for each case, so for that, we can close the across and use n = n() separately i.e outside the across
library(dplyr)
# Mean and sd of every non-grouping column, plus one group size column.
hsb %>%
  dplyr::select(math, sector) %>%
  group_by(sector) %>%
  summarise(
    # .cols is given explicitly; calling across() without it is deprecated
    across(.cols = everything(), .fns = list(mean = mean, sd = sd)),
    # n is a summarise() argument, outside across(), so it appears once
    n = n(),
    .groups = "drop"
  )
# A tibble: 2 x 4
# sector math_mean math_sd n
# <int> <dbl> <dbl> <int>
#1 0 11.4 7.08 3642
#2 1 14.2 6.36 3543
For completeness, here is how to get an 'n' column for each summarised variable (not really needed). Because we select only two columns and one of them is the grouping column, this still returns a single 'n' column.
# Mean, sd and group size computed per summarised column, so each column gets
# its own <name>_n (here only math_n).
hsb %>%
  dplyr::select(math, sector) %>%
  group_by(sector) %>%
  summarise(
    # .cols is given explicitly; calling across() without it is deprecated
    across(
      .cols = everything(),
      .fns = list(mean = mean, sd = sd, n = ~ n())
    ),
    .groups = "drop"
  )
# A tibble: 2 x 4
# sector math_mean math_sd math_n
# <int> <dbl> <dbl> <int>
#1 0 11.4 7.08 3642
#2 1 14.2 6.36 3543
What should I write in summarise to show the percentage of the Amount of Accidents? Thanks.
# Per area type: mean casualties per accident and the number of accidents.
summarise(
  group_by(dfc, Urban_or_Rural_Area),
  Accidents = mean(Number_of_Casualties),
  `Amount of Accidents` = n()
)
There is likely a dupe somewhere, but ...
library(dplyr)
# Count rows per cylinder class, then express each count as a percentage of
# the overall total (computed on the ungrouped summary).
mtcars %>%
  group_by(cyl) %>%
  summarise(Amt = n(), .groups = "drop") %>%
  mutate(Pct = 100 * Amt / sum(Amt))
# # A tibble: 3 x 3
# cyl Amt Pct
# <dbl> <int> <dbl>
# 1 4 11 34.4
# 2 6 7 21.9
# 3 8 14 43.8