Add new columns with custom function using mutate

Add new columns with custom function using mutate - r

I want to do something simple: add a new column using dplyr's mutate. Basically I have a data frame with lots of columns and I want to select some of them, just the ones containing hist_avg, tgt_ and monthly_X_ly. This should be simple, and adding a new column named "fct_" + metric shouldn't be an issue. However, as you can see below, it adds the column but with a weird name (fct_visits$hist_avg_visits and fct_revenue$hist_avg_revenue_lcy).
Also, I tried to do it using mutate + across, since it would save me lots of lines of code, but I couldn't figure out how to do that.
library(tidyverse)
(example <- tibble(brand = c("Brand A", "Brand A", "Brand A", "Brand A", "Brand A"),
country = c("Country A", "Country A", "Country A", "Country A", "Country A"),
date = c("2020-08-01", "2020-08-02", "2020-08-03", "2020-08-04", "2020-08-05"),
visits = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
visits_ly = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
tgt_visits = c(2491306, 2491306, 2491306, 2491306, 2491306),
hist_avg_visits = c(177185, 175758, 225311, 210871, 197405),
monthly_visits_ly = c(3765612, 3765612, 3765612, 3765612, 3765612),
revenue_lcy = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
revenue_ly = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
tgt_revenue_lcy = c(48872737, 48872737, 48872737, 48872737, 48872737),
hist_avg_revenue_lcy = c(231101, 222236, 276497, 259775, 251167),
monthly_revenue_lcy_ly = c(17838660, 17838660, 17838660, 17838660, 17838660))) %>%
print(width = Inf)
#> # A tibble: 5 x 13
#> brand country date visits visits_ly tgt_visits hist_avg_visits
#> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 Brand A Country A 2020-08-01 NA NA 2491306 177185
#> 2 Brand A Country A 2020-08-02 NA NA 2491306 175758
#> 3 Brand A Country A 2020-08-03 NA NA 2491306 225311
#> 4 Brand A Country A 2020-08-04 NA NA 2491306 210871
#> 5 Brand A Country A 2020-08-05 NA NA 2491306 197405
#> monthly_visits_ly revenue_lcy revenue_ly tgt_revenue_lcy hist_avg_revenue_lcy
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 3765612 NA NA 48872737 231101
#> 2 3765612 NA NA 48872737 222236
#> 3 3765612 NA NA 48872737 276497
#> 4 3765612 NA NA 48872737 259775
#> 5 3765612 NA NA 48872737 251167
#> monthly_revenue_lcy_ly
#> <dbl>
#> 1 17838660
#> 2 17838660
#> 3 17838660
#> 4 17838660
#> 5 17838660
# Compute a first forecast for `metric`: the historical average scaled by the
# ratio of the target to last year's monthly value.
#
# dataset: data frame holding the columns hist_avg_<metric>, tgt_<metric> and
#          monthly_<metric>_ly.
# metric:  metric name as a string, e.g. "visits".
#
# Each input is extracted with select(), so it is a one-column data frame and
# the result is a one-column data frame as well (named after the hist_avg
# column), not a plain vector.
first_forecast <- function(dataset, metric) {
  hist_avg <- select(dataset, paste0("hist_avg_", metric))
  target <- select(dataset, paste0("tgt_", metric))
  monthly_ly <- select(dataset, paste0("monthly_", metric, "_ly"))
  scaling <- target / monthly_ly
  hist_avg * scaling
}
example %>%
mutate(fct_visits = first_forecast(., "visits"),
fct_revenue = first_forecast(., "revenue_lcy")) %>%
print(width = Inf)
#> # A tibble: 5 x 15
#> brand country date visits visits_ly tgt_visits hist_avg_visits
#> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 Brand A Country A 2020-08-01 NA NA 2491306 177185
#> 2 Brand A Country A 2020-08-02 NA NA 2491306 175758
#> 3 Brand A Country A 2020-08-03 NA NA 2491306 225311
#> 4 Brand A Country A 2020-08-04 NA NA 2491306 210871
#> 5 Brand A Country A 2020-08-05 NA NA 2491306 197405
#> monthly_visits_ly revenue_lcy revenue_ly tgt_revenue_lcy hist_avg_revenue_lcy
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 3765612 NA NA 48872737 231101
#> 2 3765612 NA NA 48872737 222236
#> 3 3765612 NA NA 48872737 276497
#> 4 3765612 NA NA 48872737 259775
#> 5 3765612 NA NA 48872737 251167
#> monthly_revenue_lcy_ly fct_visits$hist_avg_visits
#> <dbl> <dbl>
#> 1 17838660 117225.
#> 2 17838660 116280.
#> 3 17838660 149064.
#> 4 17838660 139511.
#> 5 17838660 130602.
#> fct_revenue$hist_avg_revenue_lcy
#> <dbl>
#> 1 633149.
#> 2 608862.
#> 3 757521.
#> 4 711708.
#> 5 688124.
Created on 2020-07-28 by the reprex package (v0.3.0)

Following the great suggestion of #Onyambu, the final part of your code should be this:
example %>%
cbind(fct_visits = first_forecast(., "visits"),
fct_revenue = first_forecast(., "revenue_lcy")) %>%
print(width = Inf)
brand country date visits visits_ly tgt_visits hist_avg_visits monthly_visits_ly revenue_lcy
1 Brand A Country A 2020-08-01 NA NA 2491306 177185 3765612 NA
2 Brand A Country A 2020-08-02 NA NA 2491306 175758 3765612 NA
3 Brand A Country A 2020-08-03 NA NA 2491306 225311 3765612 NA
4 Brand A Country A 2020-08-04 NA NA 2491306 210871 3765612 NA
5 Brand A Country A 2020-08-05 NA NA 2491306 197405 3765612 NA
revenue_ly tgt_revenue_lcy hist_avg_revenue_lcy monthly_revenue_lcy_ly hist_avg_visits hist_avg_revenue_lcy
1 NA 48872737 231101 17838660 117224.5 633149.5
2 NA 48872737 222236 17838660 116280.4 608862.0
3 NA 48872737 276497 17838660 149064.4 757521.3
4 NA 48872737 259775 17838660 139511.0 711707.9
5 NA 48872737 251167 17838660 130601.9 688124.5

Related

Rolling weighted sum across table with NA in R

I am trying to get rolling weighted sums across a table, and have a method involving matrix multiplication, but it breaks when some of the data is missing.
So if I use
library(tidyverse)
mydata <- tibble(Country = c("Australia", "Canada"),
"1980" = c(1000, 2000),
"1981" = c(1100, 2100),
"1982" = c(1300, 2300),
"1983" = c(1200, 2400),
"1984" = c(1400, 2200),
"1985" = c(1500, 2500))
weights <- c(3, 4, 6)
n0 <- ncol(mydata) - length(weights)
matweights <- matrix(rep(c(rep(0, n0), weights), n0)[-(1:n0)], ncol=n0)
tibble(cbind(mydata[, 1], as.matrix(mydata[, -1]) %*% matweights))
I get what I want with
# A tibble: 2 x 5
Country `1` `2` `3` `4`
<chr> <dbl> <dbl> <dbl> <dbl>
1 Australia 15200 15700 17100 18200
2 Canada 28200 29900 29700 31000
where for example in the top right 18200 is 3*1200 + 4*1400 + 6*1500
But if for example one of the values is missing, say mydata[2, 3] <- NA then I would get
# A tibble: 2 x 5
Country `1` `2` `3` `4`
<chr> <dbl> <dbl> <dbl> <dbl>
1 Australia 15200 15700 17100 18200
2 Canada NA NA NA NA
when I want
# A tibble: 2 x 5
Country `1` `2` `3` `4`
<chr> <dbl> <dbl> <dbl> <dbl>
1 Australia 15200 15700 17100 18200
2 Canada NA NA 29700 31000
The problem with my matrix approach is 0 * NA giving NA when I want it to be 0. I know there are solutions using some kind of apply approach but I suspect that may be slower with a large table.

I really quite like slider for sliding functions—it's very flexible, and has a purrr-like syntax. Here, slide_index_dbl() will let us slide a function and use another variable as an index by which to decide what observations are within the window.
First, reshape to long form and group, then it's a single call within mutate(). .before here specifies how many years back to include; .complete specifies to ignore partial windows.
library(tidyverse)
out1 <- mydata %>%
gather(year, value, -Country, convert = TRUE) %>%
group_by(Country) %>%
mutate(
value_3y = slider::slide_index_dbl(
value, .i = year,
.f = ~sum(.x * weights),
.before = 2, .complete = TRUE
)
)
out1
#> # A tibble: 12 x 4
#> # Groups: Country [2]
#> Country year value value_3y
#> <chr> <int> <dbl> <dbl>
#> 1 Australia 1980 1000 NA
#> 2 Canada 1980 2000 NA
#> 3 Australia 1981 1100 NA
#> 4 Canada 1981 2100 NA
#> 5 Australia 1982 1300 15200
#> 6 Canada 1982 2300 28200
#> 7 Australia 1983 1200 15700
#> 8 Canada 1983 2400 29900
#> 9 Australia 1984 1400 17100
#> 10 Canada 1984 2200 29700
#> 11 Australia 1985 1500 18200
#> 12 Canada 1985 2500 31000
To reshape to wide form:
out1 %>%
select(-value) %>%
drop_na() %>% # omit to keep partial/empty years
spread(year, value_3y)
#> # A tibble: 2 x 5
#> # Groups: Country [2]
#> Country `1982` `1983` `1984` `1985`
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 Australia 15200 15700 17100 18200
#> 2 Canada 28200 29900 29700 31000
If the data contains NAs, the code works exactly the same:
mydata[2, 3] <- NA
out2 <- mydata %>%
gather(year, value, -Country, convert = TRUE) %>%
group_by(Country) %>%
mutate(
value_3y = slider::slide_index_dbl(
value, .i = year,
.f = ~sum(.x * weights),
.before = 2, .complete = TRUE
)
)
out2
#> # A tibble: 12 x 4
#> # Groups: Country [2]
#> Country year value value_3y
#> <chr> <int> <dbl> <dbl>
#> 1 Australia 1980 1000 NA
#> 2 Canada 1980 2000 NA
#> 3 Australia 1981 1100 NA
#> 4 Canada 1981 NA NA
#> 5 Australia 1982 1300 15200
#> 6 Canada 1982 2300 NA
#> 7 Australia 1983 1200 15700
#> 8 Canada 1983 2400 NA
#> 9 Australia 1984 1400 17100
#> 10 Canada 1984 2200 29700
#> 11 Australia 1985 1500 18200
#> 12 Canada 1985 2500 31000
out2 %>%
select(-value) %>%
drop_na() %>%
spread(year, value_3y)
#> # A tibble: 2 x 5
#> # Groups: Country [2]
#> Country `1982` `1983` `1984` `1985`
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 Australia 15200 15700 17100 18200
#> 2 Canada NA NA 29700 31000

Using rollapply we have the following matrix:
library(zoo)
t(rollapply(t(mydata[, -1]), 3, function(x) sum(x * weights)))
## [,1] [,2] [,3] [,4]
## [1,] 15200 15700 17100 18200
## [2,] NA NA 29700 31000

Linear filtering option:
t(apply(mydata[-1], 1, stats::filter, filter=rev(weights), sides=1))
# [,1] [,2] [,3] [,4] [,5] [,6]
#[1,] NA NA 15200 15700 17100 18200
#[2,] NA NA NA NA 29700 31000

Check for multiple NA columns and return another column in R

I have a dataframe that has multiple columns named as "avg_metric", "wkday_avg_metric", "event_avg_metric" and "monthly_avg_metric", in which "metric" consists of multiple metrics with these calculations (orders, revenue, etc). I have to check for multiple columns if their rows have NAs and replace them with a row from another column. For that, I created a function that does the same verification for the column "metric" I specify. The thing is that I'm getting the same value for the entire new column that I'm creating, which should not be the case.
I added below an example_fixed on what should be the outcome.
Is there an easier way of doing that? Or am I lacking some logic in the function?
Tks.
Edit: I got the errors on my function, but I'm sure there's a better solution to mine. I tried your solutions, but couldn't apply them for my dataframe. I updated the reprex so you can help me better.
library(tidyverse)
(example <- tibble(country = c("A", "B", "C", "D"),
brand = c("A", "A", "B", "B"),
event = c(1:4),
month = c(1:4),
weekday = c(1:4),
avg_visits = c(5028, NA, NA, NA),
avg_revenue = c(12345, NA, NA, NA),
wkday_avg_visits = c(1234, 4355, NA, NA),
wkday_avg_revenue = c(12345, 54321, NA, NA),
event_avg_visits = c(51271, 59212, 98773, NA),
event_avg_revenue = c(98764, 56435, 35634, NA),
monthly_avg_visits = c(5028, 5263, 6950, 8902),
monthly_avg_revenue = c(63457, 34536, 34574, 23426))) %>%
print(width = Inf)
#> # A tibble: 4 x 13
#> country brand event month weekday avg_visits avg_revenue wkday_avg_visits
#> <chr> <chr> <int> <int> <int> <dbl> <dbl> <dbl>
#> 1 A A 1 1 1 5028 12345 1234
#> 2 B A 2 2 2 NA NA 4355
#> 3 C B 3 3 3 NA NA NA
#> 4 D B 4 4 4 NA NA NA
#> wkday_avg_revenue event_avg_visits event_avg_revenue monthly_avg_visits
#> <dbl> <dbl> <dbl> <dbl>
#> 1 12345 51271 98764 5028
#> 2 54321 59212 56435 5263
#> 3 NA 98773 35634 6950
#> 4 NA NA NA 8902
#> monthly_avg_revenue
#> <dbl>
#> 1 63457
#> 2 34536
#> 3 34574
#> 4 23426
# Original attempt from the question: for `metric`, pick the first available
# value among the avg_, wkday_avg_, event_avg_ and monthly_avg_ columns.
#
# data:   data frame containing those four columns for `metric`.
# metric: metric suffix as a string, e.g. "visits".
#
# NOTE: as written this returns a single value rather than one value per row.
subs_metric <- function(data, metric) {
# Build the four column names for this metric.
avg <- paste0("avg_", metric)
wkday_avg <- paste0("wkday_avg_", metric)
event_avg <- paste0("event_avg_", metric)
monthly_avg <- paste0("monthly_avg_", metric)
# nrow(data) is a single number, so the loop body runs once, with i equal to
# the number of rows (i.e. the last row).
for (i in nrow(data)) {
# Fallback order: avg -> wkday_avg -> event_avg -> monthly_avg.
value <- if (is.na(data[[avg]][i]) & is.na(data[[wkday_avg]][i]) & is.na(data[[event_avg]][i])) {
data[[monthly_avg]][i]
} else if (is.na(data[[avg]][i]) & is.na(data[[wkday_avg]][i])) {
data[[event_avg]][i]
} else if (is.na(data[[avg]][i])) {
data[[wkday_avg]][i]
} else {
data[[avg]][i]
}
# Returning inside the loop exits the function during the first iteration.
return(value)
}
}
example %>%
mutate(avg_visits_new = subs_metric(., "visits"),
avg_revenue_new = subs_metric(., "revenue")) %>%
print(width = Inf)
#> # A tibble: 4 x 15
#> country brand event month weekday avg_visits avg_revenue wkday_avg_visits
#> <chr> <chr> <int> <int> <int> <dbl> <dbl> <dbl>
#> 1 A A 1 1 1 5028 12345 1234
#> 2 B A 2 2 2 NA NA 4355
#> 3 C B 3 3 3 NA NA NA
#> 4 D B 4 4 4 NA NA NA
#> wkday_avg_revenue event_avg_visits event_avg_revenue monthly_avg_visits
#> <dbl> <dbl> <dbl> <dbl>
#> 1 12345 51271 98764 5028
#> 2 54321 59212 56435 5263
#> 3 NA 98773 35634 6950
#> 4 NA NA NA 8902
#> monthly_avg_revenue avg_visits_new avg_revenue_new
#> <dbl> <dbl> <dbl>
#> 1 63457 8902 23426
#> 2 34536 8902 23426
#> 3 34574 8902 23426
#> 4 23426 8902 23426
(example_fixed <- tibble(country = c("A", "B", "C", "D"),
brand = c("A", "A", "B", "B"),
event = c(1:4),
month = c(1:4),
weekday = c(1:4),
avg_visits = c(5028, NA, NA, NA),
avg_revenue = c(12345, NA, NA, NA),
wkday_avg_visits = c(1234, 4355, NA, NA),
wkday_avg_revenue = c(12345, 54321, NA, NA),
event_avg_visits = c(51271, 59212, 98773, NA),
event_avg_revenue = c(98764, 56435, 35634, NA),
monthly_avg_visits = c(5028, 5263, 6950, 8902),
monthly_avg_revenue = c(63457, 34536, 34574, 23426),
avg_visits_new = c(5028, 4355, 98773, 8902),
avg_revenue_new = c(12345, 54321, 35634, 23426))) %>%
print(width = Inf)
#> # A tibble: 4 x 15
#> country brand event month weekday avg_visits avg_revenue wkday_avg_visits
#> <chr> <chr> <int> <int> <int> <dbl> <dbl> <dbl>
#> 1 A A 1 1 1 5028 12345 1234
#> 2 B A 2 2 2 NA NA 4355
#> 3 C B 3 3 3 NA NA NA
#> 4 D B 4 4 4 NA NA NA
#> wkday_avg_revenue event_avg_visits event_avg_revenue monthly_avg_visits
#> <dbl> <dbl> <dbl> <dbl>
#> 1 12345 51271 98764 5028
#> 2 54321 59212 56435 5263
#> 3 NA 98773 35634 6950
#> 4 NA NA NA 8902
#> monthly_avg_revenue avg_visits_new avg_revenue_new
#> <dbl> <dbl> <dbl>
#> 1 63457 5028 12345
#> 2 34536 4355 54321
#> 3 34574 98773 35634
#> 4 23426 8902 23426
Created on 2020-07-07 by the reprex package (v0.3.0)

We could use the following
example$avg_visits_new <- apply(example,1,function(x) x[!is.na(x)][1])
# A tibble: 4 x 5
avg_visits wkday_avg_visits event_avg_visits monthly_avg_visits avg_visits_new
<dbl> <dbl> <dbl> <dbl> <dbl>
1 5028 1234 51271 5028 5028
2 NA 4355 59212 5263 4355
3 NA NA 98773 6950 98773
4 NA NA NA 8902 8902
This just goes row-by-row and uses the first non-NA value it finds
Edit:
Here is a loop that reuses the above code for all the metrics.
metric <- unique(sub(".*_(.*)","\\1",colnames(example)[-(1:5)]))
for(i in metric){
example <- cbind(example, print(apply(example[,grepl(i,colnames(example))],1,function(x) x[!is.na(x)][1])))
}
colnames(example)[(ncol(example)-length(metric)+1):ncol(example)] <- paste0("avg_",metric,"_new")
> example
country brand event month weekday avg_visits avg_revenue wkday_avg_visits wkday_avg_revenue event_avg_visits event_avg_revenue monthly_avg_visits monthly_avg_revenue avg_visits_new avg_revenue_new
1 A A 1 1 1 5028 12345 1234 12345 51271 98764 5028 63457 5028 12345
2 B A 2 2 2 NA NA 4355 54321 59212 56435 5263 34536 4355 54321
3 C B 3 3 3 NA NA NA NA 98773 35634 6950 34574 98773 35634
4 D B 4 4 4 NA NA NA NA NA NA 8902 23426 8902 23426

There are better ways of doing this, for example you can replace the whole function with:
# Return, for each row, the first non-NA value among the columns whose names
# match `metric`, taken in the order the columns appear in `data`.
#
# data:   data frame holding the candidate columns; fcoalesce() requires them
#         to be of the same class.
# metric: pattern matched against the column names with grep().
subs_metric <- function(data, metric) {
  # Index columns, not rows: `data[i, ]` would use the matching column
  # positions as row numbers, which only works by coincidence when the data
  # has exactly as many rows as matching leading columns.
  candidates <- data[grep(metric, names(data))]
  # fcoalesce() accepts a single plain list of vectors.
  data.table::fcoalesce(as.list(candidates))
}
Which gives the correct result:
example %>%
mutate(avg_visits_new = subs_metric(., "visits"))
#> # A tibble: 4 x 5
#> avg_visits wkday_avg_visits event_avg_visits monthly_avg_visits avg_visits_new
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 5028 1234 51271 5028 5028
#> 2 NA 4355 59212 5263 4355
#> 3 NA NA 98773 6950 98773
#> 4 NA NA NA 8902 8902
However, I'm sure you would like to know where the flaws in your code were that stopped the loop working as expected.
Firstly, your loop starts with for (i in nrow(data)). Since there are 4 rows in your data frame, this means for (i in 4). That means the loop only runs once with i set to 4. I think you meant for (i in 1:nrow(data))
Secondly, you are returning value inside the loop. That means that any time the loop runs, it will only run once and the function will return value. I think this was just a misplaced curly bracket.
Thirdly, you are overwriting value in each iteration of the loop, where you want value to be the vector that will form your new column, so you need to declare value in advance and write to value[i] for each iteration of the loop.
Incorporating these changes, we have:
# For each row of `data`, return the first non-missing value for `metric`,
# looking in order at the avg_, wkday_avg_, event_avg_ and monthly_avg_
# columns. If the first three are NA, the monthly_avg_ value is returned
# as is.
#
# data:   data frame containing avg_<metric>, wkday_avg_<metric>,
#         event_avg_<metric> and monthly_avg_<metric>.
# metric: metric suffix as a string, e.g. "visits".
#
# Returns a numeric vector with one element per row of `data`.
subs_metric <- function(data, metric) {
  avg <- paste0("avg_", metric)
  wkday_avg <- paste0("wkday_avg_", metric)
  event_avg <- paste0("event_avg_", metric)
  monthly_avg <- paste0("monthly_avg_", metric)
  n_rows <- nrow(data)
  # Preallocate the result so each iteration writes into its own slot.
  value <- numeric(n_rows)
  # seq_len() yields an empty sequence for a zero-row input, whereas
  # 1:nrow(data) would iterate over c(1, 0) and fail.
  for (i in seq_len(n_rows)) {
    # Each condition is a scalar test, so a plain if/else chain is enough.
    value[i] <- if (!is.na(data[[avg]][i])) {
      data[[avg]][i]
    } else if (!is.na(data[[wkday_avg]][i])) {
      data[[wkday_avg]][i]
    } else if (!is.na(data[[event_avg]][i])) {
      data[[event_avg]][i]
    } else {
      data[[monthly_avg]][i]
    }
  }
  value
}
Which now gives the correct result:
example %>%
mutate(avg_visits_new = subs_metric(., "visits"))
#> # A tibble: 4 x 5
#> avg_visits wkday_avg_visits event_avg_visits monthly_avg_visits avg_visits_new
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 5028 1234 51271 5028 5028
#> 2 NA 4355 59212 5263 4355
#> 3 NA NA 98773 6950 98773
#> 4 NA NA NA 8902 8902
However, I'd probably stick to one of the other solutions offered, since they are considerably shorter and more efficient than a row-wise loop.

Filtering on date using conditional in R

Trying to simply filter on year, or date. See below where data2 is a tibble and Contract Date is a type date field.
recent <- data2 %>%
filter(data2,`Contract Date` >("01-01-2016"))
Error in FUN(left, right) :
operations are possible only for numeric, logical or complex types
Update:
See head of data2 below:
A tibble: 6 x 52
CONTACTID Status `Contract Date` `Country of Ori~ `CES Submitted` `Embassy Date` `Hire Date` Agent `ATT Received` `CES Issued Dat~ `CES Ready Revi~ `CES RFR 8 week~ `I-140 Sent` `I-140 Recd.` `I-140 Result D~ `I-140 Status` `CES Status` `Choice of Agen~ Client `Contact Owner`
<chr> <chr> <date> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <lgl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 zcrm_257~ Finis~ NA Philippines NA NA 6/22/2015 NA NA NA NA FALSE NA NA NA NA NA NA Oakbe~ Tyler Richards
2 zcrm_257~ Finis~ 2015-12-15 England NA 12/5/2016 2/20/2017 Adev~ NA NA NA FALSE 4/14/2016 6/29/2011 4/25/2016 Approved NA 5/25/2016 Oakbe~ Eddie Money
3 zcrm_257~ Finis~ 2015-11-09 Philippines NA NA 11/17/2015 Ulti~ NA NA NA FALSE NA NA NA NA NA NA Oakbe~ Eddie Money
4 zcrm_257~ Finis~ 2016-03-03 Philippines NA NA 3/21/2016 NA NA NA NA FALSE NA NA NA NA NA NA Oakbe~ Eddie Money
5 zcrm_257~ Finis~ 2006-08-15 Philippines NA 3/21/2016 6/27/2016 IQ NA NA NA FALSE 3/1/2007 3/2/2007 3/8/2007 Approved NA 7/10/2007 Oakbe~ alyssa Coleman
6 zcrm_257~ Relea~ 2016-07-20 Philippines NA NA 9/12/2016 Ulti~ NA NA NA FALSE NA NA NA NA NA NA Oakbe~ Eddie Money
# ... with 32 more variables: `DS260 Completed` <chr>, Elite <lgl>, `Embassy Status` <chr>, `Fee Bill Received` <chr>, `Fee Bill Returned` <chr>, `Hospital Country` <chr>, `Hospital Size` <dbl>, `IELTS Test Date` <chr>, `Initial Ready for Review` <chr>, `Instruction Pkt.

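The error occurs because you're trying to compare whether character 1 is greater than character 2. You must first coerce both Contract Date and the text you are filtering against to dates. Also note that data2 is already supplied by the pipe, so it should not be repeated as an argument to filter().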
For example:
recent <- data2 %>%
mutate(`Contract Date` = lubridate::ymd(`Contract Date`)) %>%
filter(`Contract Date` > as.Date("2016-01-01"))

gather function in R to match patterns in character strings

I want to reshape a wide table into a long table using gather. The columns I want to gather follow a pattern. For now I have only managed to gather them by their position. How can I change this to gather them by the patterns in the column names? Please only use the gather function.
I have included an example dataset, however in the real dataset there are many more columns. Therefore I would like to gather all columns that:
start with an f or m
are followed by one OR two numbers
dput(head(test1, 1))
structure(list(startdate = "2019-11-06", id = "POL55", m0_9 = NA_real_,
m10_19 = NA_real_, m20_29 = NA_real_, m30_39 = NA_real_,
m40_49 = 32, m50_59 = NA_real_, m60_69 = NA_real_, m70 = NA_real_,
f0_9 = 32, f10_19 = NA_real_, f20_29 = NA_real_, f30_39 = NA_real_,
f40_49 = NA_real_, f50_59 = NA_real_, f60_69 = NA_real_,
f70 = NA_real_), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame"))
df_age2 <- test1 %>%
gather(age_cat, count, m0_9:f70 )
df_age2
expected output (there will be many more columns that are not gathered). The count should of course count...
startdate id age_cat count
<chr> <chr> <chr> <dbl>
1 2019-11-06 POL55 m0_9 NA
2 2019-11-06 POL56 m0_9 NA
3 2019-11-06 POL57 m0_9 NA
4 2019-11-06 POL58 m0_9 NA
5 2019-11-06 POL59 m0_9 NA
6 2019-11-06 POL60 m0_9 NA
7 2019-11-06 POL61 m0_9 NA
8 2019-11-06 POL62 m0_9 NA
9 2019-11-06 POL63 m0_9 NA
10 2019-11-06 POL64 m0_9 NA

Use starts_with:
test1 %>%
gather(age_bucket, count, c(starts_with("m"), starts_with("f")))

We can use pivot_longer from tidyr
library(dplyr)
library(tidyr)
test1 %>%
pivot_longer(cols = -c(startdate, id), names_to = c('.value', 'grp'), names_sep="_")
Or it could be
test1 %>%
pivot_longer(cols = -c(startdate, id),
names_to = c( '.value', 'grp'), names_pattern = "^([a-z])(.*)")
# A tibble: 8 x 5
# startdate id grp m f
# <chr> <chr> <chr> <dbl> <dbl>
#1 2019-11-06 POL55 0_9 NA 32
#2 2019-11-06 POL55 10_19 NA NA
#3 2019-11-06 POL55 20_29 NA NA
#4 2019-11-06 POL55 30_39 NA NA
#5 2019-11-06 POL55 40_49 32 NA
#6 2019-11-06 POL55 50_59 NA NA
#7 2019-11-06 POL55 60_69 NA NA
#8 2019-11-06 POL55 70 NA NA
Or maybe
test1 %>%
pivot_longer(cols = -c(startdate, id),
names_to = c( 'grp', '.value'), names_pattern = "^([a-z])(.*)")
# A tibble: 2 x 11
# startdate id grp `0_9` `10_19` `20_29` `30_39` `40_49` `50_59` `60_69` `70`
# <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 2019-11-06 POL55 m NA NA NA NA 32 NA NA NA
#2 2019-11-06 POL55 f 32 NA NA NA NA NA NA NA
Or it can be
test1 %>%
pivot_longer(cols = matches("^(f|m)\\d+_?\\d*$"), names_to = 'age_bucket',
values_to = 'count')
# A tibble: 16 x 4
# startdate id age_bucket count
# <chr> <chr> <chr> <dbl>
# 1 2019-11-06 POL55 m0_9 NA
# 2 2019-11-06 POL55 m10_19 NA
# 3 2019-11-06 POL55 m20_29 NA
# 4 2019-11-06 POL55 m30_39 NA
# 5 2019-11-06 POL55 m40_49 32
# 6 2019-11-06 POL55 m50_59 NA
# 7 2019-11-06 POL55 m60_69 NA
# 8 2019-11-06 POL55 m70 NA
# 9 2019-11-06 POL55 f0_9 32
#10 2019-11-06 POL55 f10_19 NA
#11 2019-11-06 POL55 f20_29 NA
#12 2019-11-06 POL55 f30_39 NA
#13 2019-11-06 POL55 f40_49 NA
#14 2019-11-06 POL55 f50_59 NA
#15 2019-11-06 POL55 f60_69 NA
#16 2019-11-06 POL55 f70 NA

Spread and Gather table return duplicated rows with NA values

I have a table with categories and sub categories encoded in this format of columns name:
Date| Admissions__0 |Attendance__0 |Tri_1__0|Tri_2__0|...
Tri_1__1|Tri_2__1|...|
and I would like to change it to this format of columns using spread and gather function of tidyverse:
Date| Country code| Admissions| Attendance| Tri_1|Tri_2|...
I tried a solution that was posted, but the outcome actually returns multiple rows with NA rather than a single row.
My code used:
temp <- data %>% gather(key="columns",value ="dt",-Date)
temp <- temp %>% mutate(category = gsub(".*__","",columns)) %>% mutate(columns = gsub("__\\d","",columns))
temp %>% mutate(row = row_number()) %>% spread(key="columns",value="dt")
And my result is:
Date country_code row admissions attendance Tri_1 Tri_2 Tri_3 Tri_4 Tri_5
<chr> <chr> <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 01-APR-2014 0 275 NA 209 NA NA NA NA NA
2 01-APR-2014 0 640 84 NA NA NA NA NA NA
3 01-APR-2014 0 1005 NA NA 5 NA NA NA NA
4 01-APR-2014 0 1370 NA NA NA 33 NA NA NA
5 01-APR-2014 0 1735 NA NA NA NA 62 NA NA
6 01-APR-2014 0 2100 NA NA NA NA NA 80 NA
7 01-APR-2014 0 2465 NA NA NA NA NA NA 29
8 01-APR-2014 1 2830 NA 138 NA NA NA NA NA
9 01-APR-2014 1 3195 66 NA NA NA NA NA NA
10 01-APR-2014 1 3560 NA NA N/A NA NA NA NA
My expected results:
Date country_code row admissions attendance Tri_1 Tri_2 Tri_3 Tri_4 Tri_5
<chr> <chr> <int> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 01-APR-2014 0 275 84 209 5 33 62 80 29
8 01-APR-2014 1 2830 66 138 66 ... ... ... ...

We can do a summarise_at coalesce to remove the NA elements after the spread
library(tidyverse)
data %>%
gather(key = "columns", val = "dt", -Date, na.rm = TRUE) %>%
mutate(category = gsub(".*__","",columns)) %>%
mutate(columns = gsub("__\\d","",columns)) %>%
group_by(Date, dt, columns, category) %>%
mutate(rn = row_number()) %>%
spread(columns, dt) %>%
select(-V1) %>%
summarise_at(vars(Admissions:Tri_5),list(~ coalesce(!!! .))) # %>%
# filter if needed
#filter_at(vars(Admissions:Tri_5), all_vars(!is.na(.)))

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Add new columns with custom function using mutate - r

Related

Rolling weighted sum across table with NA in R

Check for multiple NA columns and return another column in R

Filtering on date using conditional in R

gather function in R to match patterns in character strings

Spread and Gather table return duplicated rows with NA values

Categories

Resources