Deduplicate and lengthen dataframe in R - r

After merging three datasets I've got a mess. There is a unique id field and then there can be one or more samples associated with each id. So far I've got
samples <- structure(list(id = c(1029459, 1029459, 1029459, 1029459, 1030272,
1030272, 1030272, 1032157, 1032157, 1032178, 1032178, 1032219,
1032219, 1032229, 1032229, 1032494, 1032494, 1032780, 1032780
), sample1 = c(853401, 853401, 853401, 853401, 852769, 852769,
852769, 850161, 850161, 852711, 852711, 852597, 852597, 850363,
850363, 850717, 850717, 848763, 848763), sample2 = c(853401,
853693, 853667, 853667, 852769, 853597, 853597, NA, NA, 852711,
853419, 852597, 852597, 850363, 852741, 850717, 851811, 848763,
848763), sample3 = c(NA, NA, NA, NA, NA, NA, NA, 853621, 852621,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -19L))
print(samples)
#> # A tibble: 19 × 4
#> id sample1 sample2 sample3
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1029459 853401 853401 NA
#> 2 1029459 853401 853693 NA
#> 3 1029459 853401 853667 NA
#> 4 1029459 853401 853667 NA
#> 5 1030272 852769 852769 NA
#> 6 1030272 852769 853597 NA
#> 7 1030272 852769 853597 NA
#> 8 1032157 850161 NA 853621
#> 9 1032157 850161 NA 852621
#> 10 1032178 852711 852711 NA
#> 11 1032178 852711 853419 NA
#> 12 1032219 852597 852597 NA
#> 13 1032219 852597 852597 NA
#> 14 1032229 850363 850363 NA
#> 15 1032229 850363 852741 NA
#> 16 1032494 850717 850717 NA
#> 17 1032494 850717 851811 NA
#> 18 1032780 848763 848763 NA
#> 19 1032780 848763 848763 NA
I'd like to get it so that all unique samples per id are combined into one sample column with a long dataframe. eg
id sample
1029459 853401
1029459 853693
1030272 852769
1030272 853597
1032157 850161
1032157 853621
Any ideas?

Is this what you are looking for? For example:
id 1029459 has the unique samples 853401 , 853693 , 853667.
samples %>%
pivot_longer(
c(sample1, sample2, sample3)
,names_to = "sample") %>%
count(id, sample, value) %>%
drop_na() %>%
distinct(id, value) %>%
rename(sample = value)
# A tibble: 16 × 2
id sample
<dbl> <dbl>
1 1029459 853401
2 1029459 853667
3 1029459 853693
4 1030272 852769
5 1030272 853597
6 1032157 850161
7 1032157 852621
8 1032157 853621
9 1032178 852711
10 1032178 853419
11 1032219 852597
12 1032229 850363
13 1032229 852741
14 1032494 850717
15 1032494 851811
16 1032780 848763

You can do it this way:
library(data.table)
unique(melt(setDT(samples), "id",value.name = "sample")[!is.na(sample),c(1,3)])
Output:
id sample
1: 1029459 853401
2: 1030272 852769
3: 1032157 850161
4: 1032178 852711
5: 1032219 852597
6: 1032229 850363
7: 1032494 850717
8: 1032780 848763
9: 1029459 853693
10: 1029459 853667
11: 1030272 853597
12: 1032178 853419
13: 1032229 852741
14: 1032494 851811
15: 1032157 853621
16: 1032157 852621

Related

data.table roll "nearest" left join for single best match (rest to NA)

I have two data.tables with different number of rows. I would like to left join by matching on a single column so that first dt dt1 keeps all rows. Only best nearest values from second dt2 should be joined.
Minimal data:
library(data.table)
set.seed(42)
timestamp <- sort(rnorm(10, mean = 1, sd = 1))
dt1 <- data.table(
id = letters[1:10],
timestamp = timestamp,
timestamp1 = timestamp,
other1 = 1:10,
other2 = 11:20
)
dt2 <- data.table(
timestamp = timestamp[c(3, 5, 8)] + 0.1,
timestamp2 = timestamp[c(3, 5, 8)] + 0.1,
other3 = c("x", "y", "z"),
other4 = c(333, 444, 555)
)
What I tried:
dt2[dt1, roll = "nearest", on = "timestamp"]
#> timestamp timestamp2 other3 other4 id timestamp1 other1 other2
#> 1: 0.4353018 1.005341 x 333 a 0.4353018 1 11
#> 2: 0.8938755 1.005341 x 333 b 0.8938755 2 12
#> 3: 0.9053410 1.005341 x 333 c 0.9053410 3 13
#> 4: 0.9372859 1.005341 x 333 d 0.9372859 4 14
#> 5: 1.3631284 1.463128 y 444 e 1.3631284 5 15
#> 6: 1.4042683 1.463128 y 444 f 1.4042683 6 16
#> 7: 1.6328626 1.463128 y 444 g 1.6328626 7 17
#> 8: 2.3709584 2.470958 z 555 h 2.3709584 8 18
#> 9: 2.5115220 2.470958 z 555 i 2.5115220 9 19
#> 10: 3.0184237 2.470958 z 555 j 3.0184237 10 20
I am failing to understand how roll="nearest" works. I see that it indeed matches the nearest values, but it does it with all of them. I would like to merge only those 3 rows from dt2 that have the absolute nearest values.
Using joins based on tolerance (max_dist) would also give more than three matches, but in this case I found the value of max_dist that gives the best nearest for this tiny example data.
Desired output:
library(fuzzyjoin)
fuzzyjoin::difference_left_join(as.data.frame(dt1), as.data.frame(dt2), by = "timestamp", max_dist = 0.09)
#> id timestamp.x timestamp1 other1 other2 timestamp.y timestamp2 other3 other4
#> 1 a 0.4353018 0.4353018 1 11 NA NA <NA> NA
#> 2 b 0.8938755 0.8938755 2 12 NA NA <NA> NA
#> 3 c 0.9053410 0.9053410 3 13 NA NA <NA> NA
#> 4 d 0.9372859 0.9372859 4 14 1.005341 1.005341 x 333
#> 5 e 1.3631284 1.3631284 5 15 NA NA <NA> NA
#> 6 f 1.4042683 1.4042683 6 16 1.463128 1.463128 y 444
#> 7 g 1.6328626 1.6328626 7 17 NA NA <NA> NA
#> 8 h 2.3709584 2.3709584 8 18 NA NA <NA> NA
#> 9 i 2.5115220 2.5115220 9 19 2.470958 2.470958 z 555
#> 10 j 3.0184237 3.0184237 10 20 NA NA <NA> NA
Created on 2022-08-25 with reprex v2.0.2
You can try a proper left update join and assign the desired variables from dt2 explicitely
library(data.table)
set.seed(42)
timestamp <- sort(rnorm(10, mean = 1, sd = 1))
dt1 <- data.table(
id = letters[1:10],
timestamp = timestamp,
timestamp1 = timestamp,
other1 = 1:10,
other2 = 11:20
)
dt2 <- data.table(
timestamp = timestamp[c(3, 5, 8)] + 0.1,
timestamp2 = timestamp[c(3, 5, 8)] + 0.1,
other3 = c("x", "y", "z"),
other4 = c(333, 444, 555)
)
# left join: leading table on the left
dt1[dt2,
roll = "nearest",
on = "timestamp",
# assign desired values explicitely
`:=`(other3 = i.other3,
other4 = i.other4)]
dt1[]
#> id timestamp timestamp1 other1 other2 other3 other4
#> 1: a 0.4353018 0.4353018 1 11 <NA> NA
#> 2: b 0.8938755 0.8938755 2 12 <NA> NA
#> 3: c 0.9053410 0.9053410 3 13 <NA> NA
#> 4: d 0.9372859 0.9372859 4 14 x 333
#> 5: e 1.3631284 1.3631284 5 15 <NA> NA
#> 6: f 1.4042683 1.4042683 6 16 y 444
#> 7: g 1.6328626 1.6328626 7 17 <NA> NA
#> 8: h 2.3709584 2.3709584 8 18 <NA> NA
#> 9: i 2.5115220 2.5115220 9 19 z 555
#> 10: j 3.0184237 3.0184237 10 20 <NA> NA

Show me a better way! How to unnest a heavily nested list in R

I will start off by stating that I have working code, but it is embarrassingly inefficient and clumsy. I was hoping that someone in the community might be able to show me a better way to unnest this heavily nested list.
As a background, it is transaction data on nfts that is heavily nested. I am just trying to get a data frame out, ultimately down to the daily level. I have managed to get the code working for the totalPriceUSD field, but as I mentioned, it is clumsy.
library(dplyr)
library(tidyr)
library(rlist)
library(jsonlite)
mydata <- fromJSON("https://api2.cryptoslam.io/api/nft-indexes/NFTGlobal")
#attempt at nested extraction
mydata <- rlist::list.flatten(mydata) %>% dplyr::bind_rows()
mydata <- select(mydata1, contains("totalPriceUSD"))
mydata <- select(mydata1, contains("daily"))
#change row name
rownames(mydata) <- "totalPriceUSD"
names(mydata) <- substring(names(mydata),24,33)
#change col names
names(mydata) <- format(as.Date(names(mydata), format = "%Y-%m-%d"))
mydata1 <- mydata %>%
gather(date, totalPriceUSD)
mydata <- as.data.frame(mydata)
mydata$date <- as.Date(mydata$date, format = "%Y-%m-%d")
As I said, it works, but it ain't pretty. Any suggestions on improving this?
Many thanks
library(dplyr)
mydata <- jsonlite::fromJSON("https://api2.cryptoslam.io/api/nft-indexes/NFTGlobal")
monthly <- bind_rows(lapply(mydata, `[[`, "monthlySummary"), .id = "monthly_id")
daily <- bind_rows(lapply(mydata, function(z) bind_rows(z[["dailySummaries"]], .id = "daily_id")), .id = "monthly_id")
monthly
# # A tibble: 60 x 6
# monthly_id totalTransactions uniqueBuyers uniqueSellers totalPriceUSD isRollingHoursData
# <chr> <int> <int> <int> <dbl> <lgl>
# 1 2017-06 193 33 32 11570. FALSE
# 2 2017-07 613 61 57 89111. FALSE
# 3 2017-08 113 36 31 15133. FALSE
# 4 2017-09 63 22 19 5154. FALSE
# 5 2017-10 52 17 11 3041. FALSE
# 6 2017-11 7259 1077 508 72760. FALSE
# 7 2017-12 265412 53406 23137 18804813. FALSE
# 8 2018-01 30693 7682 4582 1360558. FALSE
# 9 2018-02 34177 4142 4364 2931369. FALSE
# 10 2018-03 29051 3752 2784 987256. FALSE
# # ... with 50 more rows
daily
# # A tibble: 1,750 x 7
# monthly_id daily_id totalTransactions uniqueBuyers uniqueSellers totalPriceUSD isRollingHoursData
# <chr> <chr> <int> <int> <int> <dbl> <lgl>
# 1 2017-06 2017-06-23T00:00:00 27 9 6 1456. FALSE
# 2 2017-06 2017-06-24T00:00:00 15 7 8 846. FALSE
# 3 2017-06 2017-06-25T00:00:00 15 7 5 594. FALSE
# 4 2017-06 2017-06-26T00:00:00 23 10 12 1076. FALSE
# 5 2017-06 2017-06-27T00:00:00 35 8 15 2091. FALSE
# 6 2017-06 2017-06-28T00:00:00 15 6 5 1431. FALSE
# 7 2017-06 2017-06-29T00:00:00 41 13 11 2302. FALSE
# 8 2017-06 2017-06-30T00:00:00 22 11 7 1775. FALSE
# 9 2017-07 2017-07-01T00:00:00 12 7 10 3727. FALSE
# 10 2017-07 2017-07-02T00:00:00 34 13 12 3117. FALSE
# # ... with 1,740 more rows
An alternative to #r2evans answer using rrapply() + unnest_wider(). This should generalize to arbitrary levels of nesting as well.
library(tidyr)
library(jsonlite)
library(rrapply)
mydata <- fromJSON("https://api2.cryptoslam.io/api/nft-indexes/NFTGlobal")
monthly <- rrapply(mydata, classes = "list", condition = \(x, .xname) .xname == "monthlySummary", how = "melt") |>
unnest_wider(value)
daily <- rrapply(mydata, classes = "list", condition = \(x, .xparents) "dailySummaries" %in% head(.xparents, -1), how = "melt") |>
unnest_wider(value)
monthly
#> # A tibble: 60 × 9
#> L1 L2 totalTransactio… uniqueBuyers uniqueSellers totalPriceUSD
#> <chr> <chr> <int> <int> <int> <dbl>
#> 1 2017-06 monthlySum… 193 33 32 11570.
#> 2 2017-07 monthlySum… 613 61 57 89111.
#> 3 2017-08 monthlySum… 113 36 31 15133.
#> 4 2017-09 monthlySum… 63 22 19 5154.
#> 5 2017-10 monthlySum… 52 17 11 3041.
#> 6 2017-11 monthlySum… 7259 1077 508 72760.
#> 7 2017-12 monthlySum… 265412 53406 23137 18804813.
#> 8 2018-01 monthlySum… 30693 7682 4582 1360558.
#> 9 2018-02 monthlySum… 34177 4142 4364 2931369.
#> 10 2018-03 monthlySum… 29051 3752 2784 987256.
#> # … with 50 more rows, and 3 more variables: isRollingHoursData <lgl>,
#> # productNames <lgl>, productNamesWithoutAnySale <lgl>
daily
#> # A tibble: 1,750 × 10
#> L1 L2 L3 totalTransactio… uniqueBuyers uniqueSellers totalPriceUSD
#> <chr> <chr> <chr> <int> <int> <int> <dbl>
#> 1 2017-06 dail… 2017… 27 9 6 1456.
#> 2 2017-06 dail… 2017… 15 7 8 846.
#> 3 2017-06 dail… 2017… 15 7 5 594.
#> 4 2017-06 dail… 2017… 23 10 12 1076.
#> 5 2017-06 dail… 2017… 35 8 15 2091.
#> 6 2017-06 dail… 2017… 15 6 5 1431.
#> 7 2017-06 dail… 2017… 41 13 11 2302.
#> 8 2017-06 dail… 2017… 22 11 7 1775.
#> 9 2017-07 dail… 2017… 12 7 10 3727.
#> 10 2017-07 dail… 2017… 34 13 12 3117.
#> # … with 1,740 more rows, and 3 more variables: isRollingHoursData <lgl>,
#> # productNames <lgl>, productNamesWithoutAnySale <lgl>

Using lag function to find the last value for a specific individual

I'm trying to create a column in my spreadsheet that takes the last recorded value (IC) for a specific individual (by the Datetime column) and populates it into a column (LIC) for the current event.
A sub-sample of my data looks like this (actual dataset has 4949 rows and 37 individuals):
> head(ACdatas.scale)
Date Datetime ID.2 IC LIC
1 2019-05-25 2019-05-25 11:57 139 High NA
2 2019-06-09 2019-06-09 19:42 139 Low NA
3 2019-07-05 2019-07-05 20:12 139 Medium NA
4 2019-07-27 2019-07-27 17:27 152 Low NA
5 2019-08-04 2019-08-04 9:13 152 Medium NA
6 2019-08-04 2019-08-04 16:18 139 Medium NA
I would like to be able to populate the last value from the IC column into the current LIC column for the current event (see below)
> head(ACdatas.scale)
Date Datetime ID.2 IC LIC
1 2019-05-25 2019-05-25 11:57 139 High NA
2 2019-06-09 2019-06-09 19:42 139 Low High
3 2019-07-05 2019-07-05 20:12 139 Medium Low
4 2019-07-27 2019-07-27 17:27 152 Low NA
5 2019-08-04 2019-08-04 9:13 152 Medium Low
6 2019-08-04 2019-08-04 16:18 139 Medium Medium
I've tried the following code:
ACdatas.scale <- ACdatas.scale %>%
arrange(ID.2, Datetime) %>%
group_by(ID.2) %>%
mutate(LIC= lag(IC))
This worked some of the time, but when I checked back through the data, it seemed to have a problem when the date switched, so it could accurately populate the field within the same day, but not when the previous event was on the previous day. Just to make it super confusing, it only had issues with some of the day switches, and not all! Help please!!
Sample data,
dat <- data.frame(id=c(rep("A",5),rep("B",5)), IC=c(1:5,11:15))
dplyr
library(dplyr)
dat %>%
group_by(id) %>%
mutate(LIC = lag(IC)) %>%
ungroup()
# # A tibble: 10 x 3
# id IC LIC
# <chr> <int> <int>
# 1 A 1 NA
# 2 A 2 1
# 3 A 3 2
# 4 A 4 3
# 5 A 5 4
# 6 B 11 NA
# 7 B 12 11
# 8 B 13 12
# 9 B 14 13
# 10 B 15 14
data.table
library(data.table)
as.data.table(dat)[, LIC := shift(IC, type = "lag"), by = .(id)][]
# id IC LIC
# <char> <int> <int>
# 1: A 1 NA
# 2: A 2 1
# 3: A 3 2
# 4: A 4 3
# 5: A 5 4
# 6: B 11 NA
# 7: B 12 11
# 8: B 13 12
# 9: B 14 13
# 10: B 15 14
base R
dat$LIC <- ave(dat$IC, dat$id, FUN = function(z) c(NA, z[-length(z)]))
dat
# id IC LIC
# 1 A 1 NA
# 2 A 2 1
# 3 A 3 2
# 4 A 4 3
# 5 A 5 4
# 6 B 11 NA
# 7 B 12 11
# 8 B 13 12
# 9 B 14 13
# 10 B 15 14
By using your data:
mydat <- structure(list(Date = structure(c(18041, 18056, 18082,
18104, 18112, 18112),
class = "Date"),
Datetime = structure(c(1558760220,1560084120,
1562332320, 1564223220,
1564884780, 1564910280),
class = c("POSIXct","POSIXt"),
tzone = ""),
ID.2 = c(139, 139, 139, 152, 152, 139),
IC = c("High", "Low", "Medium", "Low", "Medium", "Medium"),
LIC = c(NA, NA, NA, NA, NA, NA)), row.names = c(NA, -6L),
class = "data.frame")
mydat %>% arrange(Datetime) %>% group_by(ID.2) %>% mutate(LIC = lag(IC))
# A tibble: 6 x 5
# Groups: ID.2 [2]
Date Datetime ID.2 IC LIC
<date> <dttm> <dbl> <chr> <chr>
1 2019-05-25 2019-05-25 11:57:00 139 High NA
2 2019-06-09 2019-06-09 19:42:00 139 Low High
3 2019-07-05 2019-07-05 20:12:00 139 Medium Low
4 2019-07-27 2019-07-27 17:27:00 152 Low NA
5 2019-08-04 2019-08-04 09:13:00 152 Medium Low
6 2019-08-04 2019-08-04 16:18:00 139 Medium Medium

Check for multiple NA columns and return another column in R

I have a dataframe that has multiple columns named as "avg_metric", "wkday_avg_metric", "event_avg_metric" and "monthly_avg_metric", in which "metric" consists of multiple metrics with these calculations (orders, revenue, etc). I have to check for multiple columns if their rows have NAs and replace them with a row from another column. For that, I created a function that does the same verification for the column "metric" I specify. The thing is that I'm getting the same value for the entire new column that I'm creating, which should not be the case.
I added below an example_fixed on what should be the outcome.
Is there an easier way of doing that? Or am I lacking some logic in the function?
Tks.
Edit: I got the errors on my function, but I'm sure there's a better solution to mine. I tried your solutions, but couldn't apply them for my dataframe. I updated the reprex so you can help me better.
library(tidyverse)
(example <- tibble(country = c("A", "B", "C", "D"),
brand = c("A", "A", "B", "B"),
event = c(1:4),
month = c(1:4),
weekday = c(1:4),
avg_visits = c(5028, NA, NA, NA),
avg_revenue = c(12345, NA, NA, NA),
wkday_avg_visits = c(1234, 4355, NA, NA),
wkday_avg_revenue = c(12345, 54321, NA, NA),
event_avg_visits = c(51271, 59212, 98773, NA),
event_avg_revenue = c(98764, 56435, 35634, NA),
monthly_avg_visits = c(5028, 5263, 6950, 8902),
monthly_avg_revenue = c(63457, 34536, 34574, 23426))) %>%
print(width = Inf)
#> # A tibble: 4 x 13
#> country brand event month weekday avg_visits avg_revenue wkday_avg_visits
#> <chr> <chr> <int> <int> <int> <dbl> <dbl> <dbl>
#> 1 A A 1 1 1 5028 12345 1234
#> 2 B A 2 2 2 NA NA 4355
#> 3 C B 3 3 3 NA NA NA
#> 4 D B 4 4 4 NA NA NA
#> wkday_avg_revenue event_avg_visits event_avg_revenue monthly_avg_visits
#> <dbl> <dbl> <dbl> <dbl>
#> 1 12345 51271 98764 5028
#> 2 54321 59212 56435 5263
#> 3 NA 98773 35634 6950
#> 4 NA NA NA 8902
#> monthly_avg_revenue
#> <dbl>
#> 1 63457
#> 2 34536
#> 3 34574
#> 4 23426
subs_metric <- function(data, metric) {
avg <- paste0("avg_", metric)
wkday_avg <- paste0("wkday_avg_", metric)
event_avg <- paste0("event_avg_", metric)
monthly_avg <- paste0("monthly_avg_", metric)
for (i in nrow(data)) {
value <- if (is.na(data[[avg]][i]) & is.na(data[[wkday_avg]][i]) & is.na(data[[event_avg]][i])) {
data[[monthly_avg]][i]
} else if (is.na(data[[avg]][i]) & is.na(data[[wkday_avg]][i])) {
data[[event_avg]][i]
} else if (is.na(data[[avg]][i])) {
data[[wkday_avg]][i]
} else {
data[[avg]][i]
}
return(value)
}
}
example %>%
mutate(avg_visits_new = subs_metric(., "visits"),
avg_revenue_new = subs_metric(., "revenue")) %>%
print(width = Inf)
#> # A tibble: 4 x 15
#> country brand event month weekday avg_visits avg_revenue wkday_avg_visits
#> <chr> <chr> <int> <int> <int> <dbl> <dbl> <dbl>
#> 1 A A 1 1 1 5028 12345 1234
#> 2 B A 2 2 2 NA NA 4355
#> 3 C B 3 3 3 NA NA NA
#> 4 D B 4 4 4 NA NA NA
#> wkday_avg_revenue event_avg_visits event_avg_revenue monthly_avg_visits
#> <dbl> <dbl> <dbl> <dbl>
#> 1 12345 51271 98764 5028
#> 2 54321 59212 56435 5263
#> 3 NA 98773 35634 6950
#> 4 NA NA NA 8902
#> monthly_avg_revenue avg_visits_new avg_revenue_new
#> <dbl> <dbl> <dbl>
#> 1 63457 8902 23426
#> 2 34536 8902 23426
#> 3 34574 8902 23426
#> 4 23426 8902 23426
(example_fixed <- tibble(country = c("A", "B", "C", "D"),
brand = c("A", "A", "B", "B"),
event = c(1:4),
month = c(1:4),
weekday = c(1:4),
avg_visits = c(5028, NA, NA, NA),
avg_revenue = c(12345, NA, NA, NA),
wkday_avg_visits = c(1234, 4355, NA, NA),
wkday_avg_revenue = c(12345, 54321, NA, NA),
event_avg_visits = c(51271, 59212, 98773, NA),
event_avg_revenue = c(98764, 56435, 35634, NA),
monthly_avg_visits = c(5028, 5263, 6950, 8902),
monthly_avg_revenue = c(63457, 34536, 34574, 23426),
avg_visits_new = c(5028, 4355, 98773, 8902),
avg_revenue_new = c(12345, 54321, 35634, 23426))) %>%
print(width = Inf)
#> # A tibble: 4 x 15
#> country brand event month weekday avg_visits avg_revenue wkday_avg_visits
#> <chr> <chr> <int> <int> <int> <dbl> <dbl> <dbl>
#> 1 A A 1 1 1 5028 12345 1234
#> 2 B A 2 2 2 NA NA 4355
#> 3 C B 3 3 3 NA NA NA
#> 4 D B 4 4 4 NA NA NA
#> wkday_avg_revenue event_avg_visits event_avg_revenue monthly_avg_visits
#> <dbl> <dbl> <dbl> <dbl>
#> 1 12345 51271 98764 5028
#> 2 54321 59212 56435 5263
#> 3 NA 98773 35634 6950
#> 4 NA NA NA 8902
#> monthly_avg_revenue avg_visits_new avg_revenue_new
#> <dbl> <dbl> <dbl>
#> 1 63457 5028 12345
#> 2 34536 4355 54321
#> 3 34574 98773 35634
#> 4 23426 8902 23426
Created on 2020-07-07 by the reprex package (v0.3.0)
We could use the following
example$avg_visits_new <- apply(example,1,function(x) x[!is.na(x)][1])
# A tibble: 4 x 5
avg_visits wkday_avg_visits event_avg_visits monthly_avg_visits avg_visits_new
<dbl> <dbl> <dbl> <dbl> <dbl>
1 5028 1234 51271 5028 5028
2 NA 4355 59212 5263 4355
3 NA NA 98773 6950 98773
4 NA NA NA 8902 8902
This just goes row-by-row and uses the first non-NA value it finds
Edit:
here is a loop that will add recycle the above code on all the metrics.
metric <- unique(sub(".*_(.*)","\\1",colnames(example)[-(1:5)]))
for(i in metric){
example <- cbind(example, print(apply(example[,grepl(i,colnames(example))],1,function(x) x[!is.na(x)][1])))
}
colnames(example)[(ncol(example)-length(metric)+1):ncol(example)] <- paste0("avg_",metric,"_new")
> example
country brand event month weekday avg_visits avg_revenue wkday_avg_visits wkday_avg_revenue event_avg_visits event_avg_revenue monthly_avg_visits monthly_avg_revenue avg_visits_new avg_revenue_new
1 A A 1 1 1 5028 12345 1234 12345 51271 98764 5028 63457 5028 12345
2 B A 2 2 2 NA NA 4355 54321 59212 56435 5263 34536 4355 54321
3 C B 3 3 3 NA NA NA NA 98773 35634 6950 34574 98773 35634
4 D B 4 4 4 NA NA NA NA NA NA 8902 23426 8902 23426
There are better ways of doing this, for example you can replace the whole function with:
subs_metric <- function(data, metric)
{
data.table::fcoalesce(data[grep(metric, names(data)), ])
}
Which gives the correct result:
example %>%
mutate(avg_visits_new = subs_metric(., "visits"))
#> # A tibble: 4 x 5
#> avg_visits wkday_avg_visits event_avg_visits monthly_avg_visits avg_visits_new
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 5028 1234 51271 5028 5028
#> 2 NA 4355 59212 5263 4355
#> 3 NA NA 98773 6950 98773
#> 4 NA NA NA 8902 8902
However, I'm sure you would like to know where the flaws in your code were that stopped the loop working as expected.
Firstly, your loop starts with for (i in nrow(data)). Since there are 4 rows in your data frame, this means for (i in 4). That means the loop only runs once with i set to 4. I think you meant for (i in 1:nrow(data))
Secondly, you are returning value inside the loop. That means that any time the loop runs, it will only run once and the function will return value. I think this was just a misplaced curly bracket.
Thirdly, you are overwriting value in each iteration of the loop, where you want value to be the vector that will form your new column, so you need to declare value in advance and write to value[i] for each iteration of the loop.
Incorporating these changes, we have:
subs_metric <- function(data, metric) {
avg <- paste0("avg_", metric)
wkday_avg <- paste0("wkday_avg_", metric)
event_avg <- paste0("event_avg_", metric)
monthly_avg <- paste0("monthly_avg_", metric)
value <- numeric(nrow(data))
for (i in 1:nrow(data)) {
value[i] <- if (is.na(data[[avg]][i]) &
is.na(data[[wkday_avg]][i]) &
is.na(data[[event_avg]][i])) {
data[[monthly_avg]][i]
} else if (is.na(data[[avg]][i]) &
is.na(data[[wkday_avg]][i])) {
data[[event_avg]][i]
} else if (is.na(data[[avg]][i])) {
data[[wkday_avg]][i]
} else {
data[[avg]][i]
}
}
return(value)
}
Which now gives the correct result:
example %>%
mutate(avg_visits_new = subs_metric(., "visits"))
#> # A tibble: 4 x 5
#> avg_visits wkday_avg_visits event_avg_visits monthly_avg_visits avg_visits_new
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 5028 1234 51271 5028 5028
#> 2 NA 4355 59212 5263 4355
#> 3 NA NA 98773 6950 98773
#> 4 NA NA NA 8902 8902
However, I'd probably stick to one of the other solutions offered, since they are considerably shorter and more efficient than a row-wise loop.

Add new columns with custom function using mutate

I want to do a simple and add a new column using dplyr mutate for that. Basically I have a DF with lots of columns and I want to select some of them, just the ones containing hist_avg, tgt_ and monthyl_X_ly. This should be simple and adding a new column starting with "fct_" + metric shouldn't be an issue. However, as you may see below, it adds the column but with a weird name (fct_visits$hist_avg_visits and fct_revenue$hist_avg_revenue_lcy).
Also, not sure but I tried to do it using mutate + across since it would save me lots of lines of code and couldn't figure out on how to do that.
library(tidyverse)
(example <- tibble(brand = c("Brand A", "Brand A", "Brand A", "Brand A", "Brand A"),
country = c("Country A", "Country A", "Country A", "Country A", "Country A"),
date = c("2020-08-01", "2020-08-02", "2020-08-03", "2020-08-04", "2020-08-05"),
visits = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
visits_ly = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
tgt_visits = c(2491306, 2491306, 2491306, 2491306, 2491306),
hist_avg_visits = c(177185, 175758, 225311, 210871, 197405),
monthly_visits_ly = c(3765612, 3765612, 3765612, 3765612, 3765612),
revenue_lcy = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
revenue_ly = c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_),
tgt_revenue_lcy = c(48872737, 48872737, 48872737, 48872737, 48872737),
hist_avg_revenue_lcy = c(231101, 222236, 276497, 259775, 251167),
monthly_revenue_lcy_ly = c(17838660, 17838660, 17838660, 17838660, 17838660))) %>%
print(width = Inf)
#> # A tibble: 5 x 13
#> brand country date visits visits_ly tgt_visits hist_avg_visits
#> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 Brand A Country A 2020-08-01 NA NA 2491306 177185
#> 2 Brand A Country A 2020-08-02 NA NA 2491306 175758
#> 3 Brand A Country A 2020-08-03 NA NA 2491306 225311
#> 4 Brand A Country A 2020-08-04 NA NA 2491306 210871
#> 5 Brand A Country A 2020-08-05 NA NA 2491306 197405
#> monthly_visits_ly revenue_lcy revenue_ly tgt_revenue_lcy hist_avg_revenue_lcy
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 3765612 NA NA 48872737 231101
#> 2 3765612 NA NA 48872737 222236
#> 3 3765612 NA NA 48872737 276497
#> 4 3765612 NA NA 48872737 259775
#> 5 3765612 NA NA 48872737 251167
#> monthly_revenue_lcy_ly
#> <dbl>
#> 1 17838660
#> 2 17838660
#> 3 17838660
#> 4 17838660
#> 5 17838660
first_forecast <- function(dataset, metric) {
avg_metric <- select(dataset, paste0("hist_avg_", metric))
tgt_metric <- select(dataset, paste0("tgt_", metric))
monthly_metric <- select(dataset, paste0("monthly_", metric, "_ly"))
output <- avg_metric * (tgt_metric / monthly_metric)
return(output)
}
example %>%
mutate(fct_visits = first_forecast(., "visits"),
fct_revenue = first_forecast(., "revenue_lcy")) %>%
print(width = Inf)
#> # A tibble: 5 x 15
#> brand country date visits visits_ly tgt_visits hist_avg_visits
#> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 Brand A Country A 2020-08-01 NA NA 2491306 177185
#> 2 Brand A Country A 2020-08-02 NA NA 2491306 175758
#> 3 Brand A Country A 2020-08-03 NA NA 2491306 225311
#> 4 Brand A Country A 2020-08-04 NA NA 2491306 210871
#> 5 Brand A Country A 2020-08-05 NA NA 2491306 197405
#> monthly_visits_ly revenue_lcy revenue_ly tgt_revenue_lcy hist_avg_revenue_lcy
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 3765612 NA NA 48872737 231101
#> 2 3765612 NA NA 48872737 222236
#> 3 3765612 NA NA 48872737 276497
#> 4 3765612 NA NA 48872737 259775
#> 5 3765612 NA NA 48872737 251167
#> monthly_revenue_lcy_ly fct_visits$hist_avg_visits
#> <dbl> <dbl>
#> 1 17838660 117225.
#> 2 17838660 116280.
#> 3 17838660 149064.
#> 4 17838660 139511.
#> 5 17838660 130602.
#> fct_revenue$hist_avg_revenue_lcy
#> <dbl>
#> 1 633149.
#> 2 608862.
#> 3 757521.
#> 4 711708.
#> 5 688124.
Created on 2020-07-28 by the reprex package (v0.3.0)
Pointing to the great sugestion of #Onyambu the final part of your code should be this:
example %>%
cbind(fct_visits = first_forecast(., "visits"),
fct_revenue = first_forecast(., "revenue_lcy")) %>%
print(width = Inf)
brand country date visits visits_ly tgt_visits hist_avg_visits monthly_visits_ly revenue_lcy
1 Brand A Country A 2020-08-01 NA NA 2491306 177185 3765612 NA
2 Brand A Country A 2020-08-02 NA NA 2491306 175758 3765612 NA
3 Brand A Country A 2020-08-03 NA NA 2491306 225311 3765612 NA
4 Brand A Country A 2020-08-04 NA NA 2491306 210871 3765612 NA
5 Brand A Country A 2020-08-05 NA NA 2491306 197405 3765612 NA
revenue_ly tgt_revenue_lcy hist_avg_revenue_lcy monthly_revenue_lcy_ly hist_avg_visits hist_avg_revenue_lcy
1 NA 48872737 231101 17838660 117224.5 633149.5
2 NA 48872737 222236 17838660 116280.4 608862.0
3 NA 48872737 276497 17838660 149064.4 757521.3
4 NA 48872737 259775 17838660 139511.0 711707.9
5 NA 48872737 251167 17838660 130601.9 688124.5

Resources