I have a below dataframe with numbers in two of the columns and I should replace that with string using my other reference dataset.
Dataset 1:
lhs rhs
32,39,6 65
39,6,65 32
14,16,26 15
16,20,4 26
16,26,33 4
53 31
Dataset 2:
id name
4 yougurt
6 coffee
14 cream chese
15 meat spreads
16 butter
20 whole milk
26 condensed milk
31 curd
32 flour
39 rolls
53 sugar
65 soda
Expected output:
lhs rhs
flour, rolls, coffee soda
rolls, coffee, soda flour
cream chease, butter, condensed milk meat spreads
A solution using dplyr and tidyr. dat is the final output. The key is to use separate_rows to expand the lhs and then conduct left_join twice.
library(dplyr)
library(tidyr)
dat <- dat1 %>%
separate_rows(lhs, convert = TRUE) %>%
left_join(dat2, by = c("lhs" = "id")) %>%
left_join(dat2, by = c("rhs" = "id")) %>%
drop_na(name.x) %>%
group_by(name.y) %>%
summarise(lhs = paste0(name.x, collapse = ", ")) %>%
ungroup() %>%
select(lhs, rhs = name.y)
dat
# # A tibble: 6 x 2
# lhs rhs
# <chr> <chr>
# 1 butter, whole milk, yougurt condensed milk
# 2 sugar curd
# 3 rolls, coffee, soda flour
# 4 cream chese, butter, condensed milk meat spreads
# 5 flour, rolls, coffee soda
# 6 butter, condensed milk yougurt
DATA
dat1 <- read.table(text = "lhs rhs
'32,39,6' 65
'39,6,65' 32
'14,16,26' 15
'16,20,4' 26
'16,26,33' 4
53 31 ",
stringsAsFactors = FALSE, header = TRUE)
dat2 <- read.table(text = "id name
4 yougurt
6 coffee
14 'cream chese'
15 'meat spreads'
16 butter
20 'whole milk'
26 'condensed milk'
31 curd
32 flour
39 rolls
53 sugar
65 soda",
header = TRUE, stringsAsFactors = FALSE)
Another option. Here d1 is your first data frame and d2 your second.
library(tidyverse)
d1 %>% separate(lhs, sep = ',', into = c('v1', 'v2', 'v3')) %>%
mutate_all(as.numeric) %>%
left_join(d2, by = c('v1'='id')) %>%
left_join(d2, by = c('v2'='id')) %>%
left_join(d2, by = c('v3'='id')) %>%
left_join(d2, by = c('rhs'='id')) %>%
unite(lhs, name.x, name.y, name.x.x, sep = ',') %>%
mutate(lhs = str_replace_all(lhs, ',NA', '')) %>%
select(lhs, rhs = name.y.y)
OR, as pointed out by #Moody_Mudskipper in the comments
d1 %>% separate(lhs, sep = ',', into = c('v1', 'v2', 'v3')) %>%
mutate_all(as.numeric) %>%
lmap(~setNames(left_join(setNames(.x, "id"), d2)[2], names(.x))) %>%
unite(lhs, v1, v2, v3, sep = ', ') %>%
mutate(lhs = str_replace_all(lhs, ',NA', '')) %>%
select(lhs, rhs = name.y.y)
lhs rhs
1 flour, rolls, coffee soda
2 rolls, coffee, soda flour
3 cream chese, butter, condensed milk meat spreads
4 butter, whole milk, yougurt condensed milk
5 butter, condensed milk yougurt
6 sugar curd
This is almost the same as www, but appears to be a little faster. Apparently using strsplit and unnest is faster than separate_rows
require(tidyverse)
df1 %>%
mutate(lhs = sapply(lhs, strsplit, ',')) %>%
unnest %>%
mutate_at(c('lhs', 'rhs'), as.numeric) %>%
left_join(df2, by = c('lhs'= 'id')) %>%
left_join(df2, by = c('rhs'= 'id')) %>%
group_by(name.y) %>%
summarize(name.x = paste(name.x, collapse = ', ')) %>%
rename(rhs = name.y, lhs = name.x)
Then there's the data.table solution, which is much faster.
require(data.table)
setDT(df1)
df1[, .(lhs = unlist(strsplit(lhs, ','))), rhs] %>%
.[, lapply(.SD, as.numeric)] %>%
merge(df2, by.x = 'lhs', by.y = 'id') %>%
merge(df2, by.x = 'rhs', by.y = 'id') %>%
.[, .(lhs = paste0(name.x, collapse = ',')), by = .(rhs = name.y)]
Benchmark
# Results
# Unit: relative
# expr min lq mean median uq max neval
# useDT() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 300
# UseUnnest() 5.570704 5.632532 5.274552 5.374714 5.042518 9.254190 300
# UseSeparateRows() 8.640615 8.356889 7.661669 7.939593 7.401666 7.896038 300
# Method
require(tidyverse)
require(data.table)
df1 <- fread("
lhs rhs
32,39,6 65
39,6,65 32
14,16,26 15
16,20,4 26
16,26,33 4
53 31
")
df2 <- fread("
id name
4 yougurt
6 coffee
14 cream_chese
15 meat_spreads
16 butter
20 whole_milk
26 condensed_milk
31 curd
32 flour
39 rolls
53 sugar
65 soda
")
useDT <- function(x){
df1[, lapply(sapply(lhs, strsplit, ','), unlist), rhs] %>%
setNames(c('rhs', 'lhs')) %>%
.[, `:=`(lhs = as.numeric(lhs),
rhs = as.numeric(rhs))] %>%
.[df2, on = c('lhs'= 'id')] %>%
.[df2, on = c('rhs'= 'id')] %>%
.[, .(lhs = paste0(name, collapse = ',')), by = i.name] %>%
.[lhs != 'NA', .(lhs, rhs = i.name)]
}
UseUnnest <- function(x){
df1 %>%
mutate(lhs = sapply(lhs, strsplit, ',')) %>%
unnest %>%
mutate_at(c('lhs', 'rhs'), as.numeric) %>%
left_join(df2, by = c('lhs'= 'id')) %>%
left_join(df2, by = c('rhs'= 'id')) %>%
group_by(name.y) %>%
summarize(name.x = paste(name.x, collapse = ', ')) %>%
rename(rhs = name.y, lhs = name.x)
}
UseSeparateRows <- function(x){
df1 %>%
separate_rows(lhs, convert = TRUE) %>%
left_join(df2, by = c("lhs" = "id")) %>%
left_join(df2, by = c("rhs" = "id")) %>%
drop_na(name.x) %>%
group_by(name.y) %>%
summarise(lhs = paste0(name.x, collapse = ", ")) %>%
ungroup() %>%
select(lhs, rhs = name.y)
}
microbenchmark(useDT(), UseUnnest(), UseSeparateRows(), times = 300, unit = 'relative')
Here is an option using just base R and mapping the numeric values to factor labels.
Split the string, map the labels to the values and then collapse the labels back into a string.
df<-structure(list(id = c(4L, 6L, 14L, 15L, 16L, 20L, 26L, 31L, 32L,
39L, 53L, 65L), name = c("yougurt", "coffee", "cream cheese",
"meat spreads", "butter", "whole milk", "condensed milk", "curd",
"flour", "rolls", "sugar", "soda")), .Names = c("id", "name"),
class = "data.frame", row.names = c(NA, -12L))
input<-structure(list(lhs = c("32,39,6", "39,6,65", "14,16,26", "16,20,4",
"16,26,33", "53"), rhs = c(65L, 32L, 15L, 26L, 4L, 31L)),
.Names = c("lhs", "rhs"), class = "data.frame", row.names = c(NA, -6L))
#new left hand side
newlhs<-sapply(as.character(input$lhs), function(x){
strs<-unlist(strsplit(x, ","))
f<-factor(strs, levels=df$id, labels=df$name)
paste(f, collapse = ", ")
})
#new right hand side
newrhs<-sapply(as.character(input$rhs), function(x){
strs<-unlist(strsplit(x, ","))
f<-factor(strs, levels=df$id, labels=df$name)
paste(f, collapse = ", ")
})
answer<-data.frame(newlhs, newrhs)
row.names(answer)<-NULL #remove rownames
Not so idiomatic but I win the code golf :) :
as.data.frame(lapply(dat1, function(x){
for (i in seq(nrow(dat2))) x <- gsub(paste0("(^|,)",dat2$id[i],"(,|$)"),
paste0("\\1",dat2$name[i],"\\2"),x)
x}))
# lhs rhs
# 1 flour,rolls,coffee soda
# 2 rolls,coffee,soda flour
# 3 cream chese,butter,condensed milk meat spreads
# 4 butter,whole milk,yougurt condensed milk
# 5 butter,condensed milk,33 yougurt
# 6 sugar curd
May fail if you have numbers in 2nd dataset.
Related
I have a dataframe that looks like this
Fruit
2021
2022
Apples
12
29
Bananas
11
31
Apples
44
55
Oranges
30
73
Oranges
19
82
Bananas
24
78
The Fruit names are not ordered so I can't group them by taking n at a time, they're listed randomly. I need to get the mean of fruits sold in 2021 & 2022 as well as mean sold for apples, oranges & bananas for each year separately.
My code is
2021 <- c(mean(df$2021), sd(df$2021))
2022 <- c(mean(df$2022), sd(df$2022))
measure <- c('mean','standard deviation')
df1 <- data.table(measure,TE,TW,NC,SC,NWC)
and output looks like this:
Measure
2021
2022
mean
23.3
58
standard deviation
12.4
23.3
But I'm not sure where to start with grouping the rows by name. I need to get something that looks like this
Measure
2021
Apples
Bananas
Oranges
2022
Apples
Bananas
Oranges
mean
23.3
58
standard deviation
12.4
23.3
(with the appropriate numbers in the blank spaces)
I suggest this might be better (in the long run) in a long format, which this summarizing can get started. This is just 'mean', not hard to repeat for sd and combine with this:
fruits <- c(NA, "Apples", "Oranges", "Bananas")
lapply(quux[,-1], function(yr) stack(sapply(fruits, function(z) mean(yr[is.na(z) | quux$Fruit %in% z])))) |>
dplyr::bind_rows(.id = "year")
# year values ind
# 1 2021 23.33333 <NA>
# 2 2021 28.00000 Apples
# 3 2021 24.50000 Oranges
# 4 2021 17.50000 Bananas
# 5 2022 58.00000 <NA>
# 6 2022 42.00000 Apples
# 7 2022 77.50000 Oranges
# 8 2022 54.50000 Bananas
where NA in ind indicates all fruits, otherwise the individual fruit labeled.
If you put your data in long form, you could use the aggregate function:
a <- aggregate(value ~ year + fruit, data=df, FUN=function(x) c(sd(x),mean(x))
Where value is a column you could create to put the values which are now under 2021 and 2022. Then create a new column called year which has 2021 or 2022 accordingly. Long form is the way to go in R almost always.
We may use
library(dplyr)
library(tidyr)
library(data.table)
library(stringr)
df1 %>%
pivot_longer(cols = where(is.numeric), names_to = 'year') %>%
as.data.table %>%
cube( .(Mean = mean(value), SD = sd(value)),
by = c("Fruit", "year")) %>%
filter(!if_all(Fruit:year, is.na)) %>%
unite(Fruit, Fruit, year, sep = "_", na.rm = TRUE) %>%
filter(str_detect(Fruit, "_|\\d+")) %>%
data.table::transpose(make.names = "Fruit", keep.names = "Measure")
-output
Measure Apples_2021 Apples_2022 Bananas_2021 Bananas_2022 Oranges_2021 Oranges_2022 2021 2022
1: Mean 28.00000 42.00000 17.500000 54.50000 24.500000 77.500000 23.33333 58.00000
2: SD 22.62742 18.38478 9.192388 33.23402 7.778175 6.363961 12.42041 23.57965
Or if we want the duplicate column names
df1 %>%
pivot_longer(cols = where(is.numeric), names_to = 'year') %>%
as.data.table %>%
cube( .(Mean = mean(value), SD = sd(value)), by = c("Fruit", "year")) %>%
mutate(Fruit = coalesce(Fruit, year)) %>%
drop_na(year) %>%
arrange(year, str_detect(Fruit, '\\d{4}', negate = TRUE)) %>%
select(-year) %>%
data.table::transpose(make.names = "Fruit", keep.names = "Measure")
-output
Measure 2021 Apples Bananas Oranges 2022 Apples Bananas Oranges
1: Mean 23.33333 28.00000 17.500000 24.500000 58.00000 42.00000 54.50000 77.500000
2: SD 12.42041 22.62742 9.192388 7.778175 23.57965 18.38478 33.23402 6.363961
data
df1 <- structure(list(Fruit = c("Apples", "Bananas", "Apples", "Oranges",
"Oranges", "Bananas"), `2021` = c(12L, 11L, 44L, 30L, 19L, 24L
), `2022` = c(29L, 31L, 55L, 73L, 82L, 78L)),
class = "data.frame", row.names = c(NA,
-6L))
I'm trying to produce a table with summary totals and means across the whole dataset, and then by sub-category (f_grp), and show this by site.
I can use the group_by function to group by, which works well for reporting total_count and Mean_per_litre, but I would then like the same values for each category, as shown in f_grp.
|Site
|total_count
|Mean_per_litre
|1 |66 |3.33333333
|2 |77 |4.27777778
|3 |65 |3.38541667
|4 |154 |8.85057471
etc
I've tried group_by for both site and f_grp but this isn't quite right
|site
|f_grp
|total_count
|mean_per_litre
|1 |1c |3 |1.666667
|1 |1d |15 |4.166667
|1 |2a |1 |1.666667
|1 |2b |47 |11.190476
This isn't quite right as its not easy to read and I've now lost the original total columns I had in the first table (sorry about the tables, cant get them to work here).
dat$site=as.factor(dat$site)
dat$count=as.numeric(dat$count)
dat$f_grp=as.factor(dat$f_grp)
# totals across all f_grp
tabl1 <- dat %>%
group_by(site) %>%
summarise (total_count = sum(count), Mean_per_litre = mean(count_l_site))
tabl1
# totals FG 1b
tabl2 <- dat %>%
group_by(site) %>%
filter(f_grp== '1b') %>%
summarise ('1b_total_count' = sum(count))
tabl2
### BUT - this doesnt give a correct mean, as it only shows the mean of '1b' when only '1b' is present. I need a mean over the entire dataset at that site.
# table showing totals across whole dataset
tabl7 <- dat %>%
summarise (total_count = sum(count, na.rm = TRUE), Total_mean_per_litre = mean(count_l_site, na.rm = TRUE))
tabl7
# table with means for each site by fg
table6 <- dat %>%
group_by(site, f_grp) %>%
summarise (total_count = sum(count), mean_per_litre = mean(count_l_site, na.rm = TRUE))
table6
Ideally I need a way to extract the f-grp categories, put them as column headings, and then summarise means by site for those categories. But filtering the data and then joining multiple tables, gives incorrect means (as not mean of whole dataset, but a subset of that category, ie: when f_grp value is present only).
Many thanks to all who have read this far :)
> dput(head(dat))
structure(list(X = 1:6, site = structure(c(1L, 10L, 11L, 12L,
13L, 14L), levels = c("1", "2", "3", "4", "5", "6", "7", "8",
"9", "10", "11", "12", "13", "14", "15", "16", "17", "18"), class = "factor"),
count = c(0, 0, 0, 0, 0, 0), f_grp = structure(c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
), levels = c("1b", "1c", "1d", "2a", "2b"), class = "factor"),
count_l_site = c(0, 0, 0, 0, 0, 0)), row.names = c(NA, 6L
), class = "data.frame")
Updated:
Following advice here from Jon, and using the mtcars data (which worked as expected) I've tried the same method using my own data.
I can almost produce what's needed, but the totals are coming through as a row when they are needed as a column.
tabl1 <- dat %>%
group_by(site) %>%
summarise (total_count = sum(count), Mean_per_litre = mean(count_l_site)) %>%
mutate(fg = "total")
tabl1
tabl2_fg <- dat %>%
group_by(site, f_grp = as.character(f_grp)) %>%
summarize(total_count = sum(count), Mean_per_litre = mean(count_l_site))
tabl2_fg
tabl4 <-
bind_rows(tabl1, tabl2_fg) %>%
arrange(site, f_grp) %>%
tidyr::pivot_wider(names_from = f_grp, values_from = c(Mean_per_litre, total_count), names_vary = "slowest")
tabl4
Output as follows
Next steps:
move the circled outputs and put them at the beginning of the table
remove every other line
result - left with a simple table rows = sites; columns: total count; total mean; then columns for each fg count & mean: eg 1c count; 1c mean; 1d count; 1d mean.
Something like this?
library(dplyr)
avg_gear <- mtcars %>%
group_by(gear) %>%
summarize(avg_mpg = mean(mpg), n = n()) %>%
mutate(cyl = "total")
avg_gear_cyl <- mtcars %>%
group_by(gear,cyl = as.character(cyl)) %>%
summarize(avg_mpg = mean(mpg), n = n())
bind_rows(avg_gear, avg_gear_cyl) %>%
arrange(gear, cyl)
# A tibble: 11 × 4
gear avg_mpg n cyl
<dbl> <dbl> <int> <chr>
1 3 21.5 1 4
2 3 19.8 2 6
3 3 15.0 12 8
4 3 16.1 15 total
5 4 26.9 8 4
6 4 19.8 4 6
7 4 24.5 12 total
8 5 28.2 2 4
9 5 19.7 1 6
10 5 15.4 2 8
11 5 21.4 5 total
Or if you want categories as columns:
bind_rows(avg_gear, avg_gear_cyl) %>%
arrange(gear, desc(cyl)) %>%
tidyr::pivot_wider(names_from = cyl, values_from = c(avg_mpg, n), names_vary = "slowest")
# A tibble: 3 × 9
gear avg_mpg_total n_total avg_mpg_8 n_8 avg_mpg_6 n_6 avg_mpg_4 n_4
<dbl> <dbl> <int> <dbl> <int> <dbl> <int> <dbl> <int>
1 3 16.1 15 15.0 12 19.8 2 21.5 1
2 4 24.5 12 NA NA 19.8 4 26.9 8
3 5 21.4 5 15.4 2 19.7 1 28.2 2
I have a list of countries with lists inside each one of them.
Just to give you an example of a list object for one country with lists for two countries (df_DOTS):
df_DOTS <- list(BR = structure(list(`#FREQ` = "M", `#REF_AREA` = "AU", `#INDICATOR` = "TXG_FOB_USD",
`#COUNTERPART_AREA` = "BR", `#UNIT_MULT` = "6", `#TIME_FORMAT` = "P1M",
Obs = list(structure(list(`#TIME_PERIOD` = c("2019-07", "2019-08",
"2019-09"), `#OBS_VALUE` = c("55.687747", "36.076581", "57.764474"
)), class = "data.frame", row.names = c(NA, 3L)))), row.names = 2L, class = "data.frame"),
US = structure(list(`#FREQ` = "M", `#REF_AREA` = "AU", `#INDICATOR` = "TXG_FOB_USD",
`#COUNTERPART_AREA` = "US", `#UNIT_MULT` = "6", `#TIME_FORMAT` = "P1M",
Obs = list(structure(list(`#TIME_PERIOD` = c("2019-07",
"2019-08", "2019-09"), `#OBS_VALUE` = c("876.025841",
"872.02118", "787.272851")), class = "data.frame", row.names = c(NA,
3L)))), row.names = 1L, class = "data.frame"))
I can reach the matrix (matrix_DOTS) I am looking for using these lines of code:
library(dplyr)
library(rlist)
library(magrittr)
BR <- df_DOTS[["BR"]][["Obs"]] %>%
list.select(.$`#OBS_VALUE`) %>%
unlist() %>%
sapply(function(x) as.numeric(as.character(x))) %>%
mean()
US <- df_DOTS[["US"]][["Obs"]] %>%
list.select(.$`#OBS_VALUE`) %>%
unlist() %>%
sapply(function(x) as.numeric(as.character(x))) %>%
mean()
matrix_DOTS <- matrix(c(BR, US), nrow = 1, dimnames = list(c("AU"), c("BR", "US")))
Since I have a list of several countries with lists of other several countries inside them, I am looking for a more practical way of achieving matrix_DOTS. Any help is highly appreciated!
PS: This is the dput for the final matrix in this example:
matrix_DOTS <- structure(c(49.842934, 845.106624), .Dim = 1:2, .Dimnames = list(
"AU", c("BR", "US")))
EDIT
This is the procedure to obtain df_DOTS:
library(IMFData)
databaseID <- "DOT"
startdate = "2019-07-01"
enddate = "2019-09-01"
checkquery = FALSE
queryfilter <- list(CL_FREQ = "M", CL_AREA_DOT = "AU",
CL_INDICATOR_DOT = "TXG_FOB_USD",
CL_COUNTERPART_AREA_DOT = c("BR", "US"))
df_DOTS <- CompactDataMethod(databaseID, queryfilter, startdate, enddate, checkquery) %>%
split(.$`#COUNTERPART_AREA`)
Just add tidy = TRUE to the CompactDataMethod call:
library(IMFData)
databaseID <- "DOT"
startdate = "2019-07-01"
enddate = "2019-09-01"
checkquery = FALSE
queryfilter <- list(CL_FREQ = "M", CL_AREA_DOT = "AU",
CL_INDICATOR_DOT = "TXG_FOB_USD",
CL_COUNTERPART_AREA_DOT = c("BR", "US"))
df_DOTS <- CompactDataMethod(databaseID,
queryfilter,
startdate,
enddate,
checkquery,
tidy = TRUE)
df_DOTS
#TIME_PERIOD #OBS_VALUE #FREQ #REF_AREA #INDICATOR #COUNTERPART_AREA #UNIT_MULT #TIME_FORMAT
1 2019-07 876.025841 M AU TXG_FOB_USD US 6 P1M
2 2019-08 872.02118 M AU TXG_FOB_USD US 6 P1M
3 2019-09 787.272851 M AU TXG_FOB_USD US 6 P1M
4 2019-07 55.687747 M AU TXG_FOB_USD BR 6 P1M
5 2019-08 36.076581 M AU TXG_FOB_USD BR 6 P1M
6 2019-09 57.764474 M AU TXG_FOB_USD BR 6 P1M
you just need one group_by(#COUNTERPART_AREA) %>% summarise(mean = mean(#OBS_VALUE)):
library(tidyverse)
df_DOTS %>%
group_by(`#COUNTERPART_AREA`, `#REF_AREA`) %>%
summarise(mean = mean(as.numeric(`#OBS_VALUE`))) %>%
spread( `#COUNTERPART_AREA`, mean)
#output
`#REF_AREA` BR US
<chr> <dbl> <dbl>
1 AU 49.8 845.
Or if you insist on a matrix
df_DOTS %>%
group_by(`#COUNTERPART_AREA`, `#REF_AREA`) %>%
summarise(mean = mean(as.numeric(`#OBS_VALUE`))) %>%
spread( `#COUNTERPART_AREA`, mean) %>%
column_to_rownames("#REF_AREA") %>%
as.matrix
#output
BR US
AU 49.84293 845.1066
From the input data, we could loop over with map, pluck the elements that is needed, convert to numeric, get the mean, and convert to a two column tibble with enframe
library(purrr)
library(tidyr)
map(df_DOTS, ~ .x %>%
pluck("Obs", 1, "#OBS_VALUE") %>%
as.numeric %>%
mean) %>%
enframe %>%
unnest(c(value))
# A tibble: 2 x 2
# name value
# <chr> <dbl>
#1 BR 49.8
#2 US 845.
Another option would be like this:
tmp <- df_DOTS %>%
as_tibble() %>%
summarise(across(everything(), ~mean(as.numeric(.x$Obs[[1]]$`#OBS_VALUE`))))
tmp
# # A tibble: 1 x 2
# BR US
# <dbl> <dbl>
# 1 49.8 845.
Let's say I have a dataframe containing the sales for some quarters, while the values for the following quarters are missing. I would like to replace the NAs by a simple formula (with mutate/dplyr like below). The issue is that I don't want to use mutate so many times. How could I do that for all NAs at the same time? Is there a way?
structure(list(Period = c("1999Q1", "1999Q2", "1999Q3", "1999Q4",
"2000Q1", "2000Q2", "2000Q3", "2000Q4", "2001Q1", "2001Q2", "2001Q3",
"2001Q4", "2002Q1", "2002Q2", "2002Q3", "2002Q4", "2003Q1", "2003Q2",
"2003Q3", "2003Q4"), Sales= c(353.2925571, 425.9299841, 357.5204626,
363.80247, 302.8081066, 394.328576, 435.15573, 387.99768, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), class = "data.frame", row.names = c(NA,
-20L))
test %>%
mutate(Sales = ifelse(is.na(Sales), 1.05*lag(Sales, 4), Sales)) %>%
mutate(Sales = ifelse(is.na(Sales), 1.05*lag(Sales, 4), Sales)) %>%
mutate(Sales = ifelse(is.na(Sales), 1.05*lag(Sales, 4), Sales))
One dplyr and tidyr possibility could be:
df %>%
group_by(quarter = substr(Period, 5, 6)) %>%
mutate(Sales_temp = replace_na(Sales, last(na.omit(Sales)))) %>%
group_by(quarter, na = is.na(Sales)) %>%
mutate(constant = 1.05,
Sales_temp = Sales_temp * cumprod(constant),
Sales = coalesce(Sales, Sales_temp)) %>%
ungroup() %>%
select(1:2)
Period Sales
<chr> <dbl>
1 1999Q1 353.
2 1999Q2 426.
3 1999Q3 358.
4 1999Q4 364.
5 2000Q1 303.
6 2000Q2 394.
7 2000Q3 435.
8 2000Q4 388.
9 2001Q1 318.
10 2001Q2 414.
11 2001Q3 457.
12 2001Q4 407.
13 2002Q1 334.
14 2002Q2 435.
15 2002Q3 480.
16 2002Q4 428.
17 2003Q1 351.
18 2003Q2 456.
19 2003Q3 504.
20 2003Q4 449.
Or with just dplyr:
df %>%
group_by(quarter = substr(Period, 5, 6)) %>%
mutate(Sales_temp = if_else(is.na(Sales), last(na.omit(Sales)), Sales)) %>%
group_by(quarter, na = is.na(Sales)) %>%
mutate(constant = 1.05,
Sales_temp = Sales_temp * cumprod(constant),
Sales = coalesce(Sales, Sales_temp)) %>%
ungroup() %>%
select(1:2)
x <- test$Sales
# find that last non-NA data
last.valid <- tail(which(!is.na(x)),1)
# store the "base"
base <- ceiling(last.valid/4)*4 + (-3:0)
base <- base + ifelse(base > last.valid, -4, 0)
base <- x[base]
# calculate the "exponents"
expos <- ceiling( ( seq(length(x)) - last.valid ) / 4 )
test$Sales <- ifelse(is.na(x), bases * 1.05 ^ expos, x)
tail(test)
# Period Sales
# 15 2002Q3 479.7592
# 16 2002Q4 427.7674
# 17 2003Q1 350.5382
# 18 2003Q2 456.4846
# 19 2003Q3 503.7472
# 20 2003Q4 449.1558
Here's another base solution:
non_nas <- na.omit(test$Sales)
nas <- length(attr(non_nas, 'na.action'))
test$Sales <- c(non_nas, #keep non_nas
tail(non_nas, 4) * 1.05 ^(rep(1:floor(nas / 4), each = 4, length.out = nas)))
test
I have a large data set like this :
ID Number
153 31
28
31
30
104 31
30
254 31
266 31
and I want to compute sum by ID include the NA. I mean get this :
ID Number
153 120
104 61
254 31
266 31
I tried aggregate but I dont get the expected result. Some help would be appreciated
One option is to convert the blanks to NA, then fill replace the NA elements with non-NA adjacent elements above in 'ID', grouped by 'ID', get the sum of 'Number'
library(tidyverse)
df1 %>%
mutate(ID = na_if(ID, "")) %>%
fill(ID) %>%
group_by(ID) %>%
summarise(Number = sum(Number))
# A tibble: 4 x 2
# ID Number
# <chr> <int>
#1 104 61
#2 153 120
#3 254 31
#4 266 31
Or without using fill, create a grouping variable with a logical expression and cumsum, and then do the sum
df1 %>%
group_by(grp = cumsum(ID != "")) %>%
summarise(ID = first(ID), Number = sum(Number)) %>%
select(-grp)
data
df1 <- structure(list(ID = c("153", "", "", "", "104", "", "254", "266"
), Number = c(31L, 28L, 31L, 30L, 31L, 30L, 31L, 31L)), row.names = c(NA,
-8L), class = "data.frame")
Or do it straightforwardly :) by
cbind(df1[df1$ID != "", "ID", drop = FALSE],
Number = rev(diff(c(0, rev((rev(cumsum(rev(df1$Number)))[df1$ID != ""]))))))