I am using src_postgres to connect and dplyr::tbl function to fetch data from redshift database. I have applied some filters and top function to it using the dplyr itself. Now my data looks as below:
riid day hour
<dbl> <chr> <chr>
1 5542. "THURSDAY " 12
2 5862. "FRIDAY " 15
3 5982. "TUESDAY " 15
4 6022. WEDNESDAY 16
My final output should be as below:
riid MON TUES WED THUR FRI SAT SUN
5542 12
5862 15
5988 15
6022 16
I have tried spread. It throws the below error because of the class type:
Error in UseMethod("spread_") : no applicable method for 'spread_'
applied to an object of class "c('tbl_dbi', 'tbl_sql', 'tbl_lazy',
'tbl')"
Since this is a really big table, I do not want to use dataframe as it takes a longer time.
I was able to use as below:
df_mon <- df2 %>% filter(day == 'MONDAY') %>% mutate(MONDAY = hour) %>% select(riid,MONDAY)
df_tue <- df2 %>% filter(day == 'TUESDAY') %>% mutate(TUESDAY = hour) %>% select(riid,TUESDAY)
df_wed <- df2 %>% filter(day == 'WEDNESDAY') %>% mutate(WEDNESDAY = hour) %>% select(riid,WEDNESDAY)
df_thu <- df2 %>% filter(day == 'THURSDAY') %>% mutate(THURSDAY = hour) %>% select(riid,THURSDAY)
df_fri <- df2 %>% filter(day == 'FRIDAY') %>% mutate(FRIDAY = hour) %>% select(riid,FRIDAY)
Is it possible to write all above in one statement?
Any help to transpose this in a faster manner is really appreciated.
EDIT
Adding the dput of the tbl object:
structure(list(src = structure(list(con = <S4 object of class structure("PostgreSQLConnection", package = "RPostgreSQL")>,
disco = <environment>), .Names = c("con", "disco"), class = c("src_dbi",
"src_sql", "src")), ops = structure(list(name = "select", x = structure(list(
name = "filter", x = structure(list(name = "filter", x = structure(list(
name = "group_by", x = structure(list(x = structure("SELECT riid,day,hour,sum(weightage) AS score FROM\n (SELECT riid,day,hour,\n POWER(2,(cast(datediff (seconds,convert_timezone('UTC','PKT',SYSDATE),TO_DATE(TO_CHAR(event_captured_dt,'mm/dd/yyyy hh24:mi:ss'),'mm/dd/yyyy hh24:mi:ss')) as decimal) / cast(7862400 as decimal))) AS weightage\n FROM (\n SELECT riid,convert_timezone('GMT','PKT',event_captured_dt) AS EVENT_CAPTURED_DT,\n TO_CHAR(convert_timezone('GMT','PKT',event_captured_dt),'DAY') AS day,\n TO_CHAR(convert_timezone('GMT','PKT',event_captured_dt),'HH24') AS hour\n FROM Zameen_STO_DATA WHERE EVENT_CAPTURED_DT >= TO_DATE((sysdate -30),'yyyy-mm-dd') and LIST_ID = 4282\n )) group by riid,day,hour", class = c("sql",
"character")), vars = c("riid", "day", "hour", "score"
)), .Names = c("x", "vars"), class = c("op_base_remote",
"op_base", "op")), dots = structure(list(riid = riid,
day = day), .Names = c("riid", "day")), args = structure(list(
add = FALSE), .Names = "add")), .Names = c("name",
"x", "dots", "args"), class = c("op_group_by", "op_single",
"op")), dots = structure(list(~min_rank(desc(~score)) <=
1), .Names = ""), args = list()), .Names = c("name",
"x", "dots", "args"), class = c("op_filter", "op_single",
"op")), dots = structure(list(~row_number() == 1), .Names = ""),
args = list()), .Names = c("name", "x", "dots", "args"), class = c("op_filter",
"op_single", "op")), dots = structure(list(~riid, ~day, ~hour), class = "quosures", .Names = c("",
"", "")), args = list()), .Names = c("name", "x", "dots", "args"
), class = c("op_select", "op_single", "op"))), .Names = c("src",
"ops"), class = c("tbl_dbi", "tbl_sql", "tbl_lazy", "tbl"))
I think what you're looking for is the ability to run the tidyr::spread() function against a remote source, or database. I have a PR for dbplyr that attempts to implement that here: https://github.com/tidyverse/dbplyr/pull/72, you can try it out by using: devtools::install_github("tidyverse/dbplyr", ref = devtools::github_pull(72)).
Use dcast from reshape2 package
> data
# A tibble: 4 x 3
riid day hour
<dbl> <chr> <dbl>
1 1.00 TH 12.0
2 2.00 FR 15.0
3 3.00 TU 15.0
4 4.00 WE 16.0
> dcast(data, riid~day, value.var = "hour")
riid FR TH TU WE
1 1 NA 12 NA NA
2 2 15 NA NA NA
3 3 NA NA 15 NA
4 4 NA NA NA 16
Further if you want to remove NA, then
> z <- dcast(data, riid~day, value.var = "hour")
> z[is.na(z)] <- ""
> z
riid FR TH TU WE
1 1 12
2 2 15
3 3 15
4 4 16
I tried to combine your multiple line attempts into one. Can you try this and let us know the outcome?
library(dplyr)
df %>%
rowwise() %>%
mutate(Mon = ifelse(day=='MONDAY', hour[day=='MONDAY'], NA),
Tue = ifelse(day=='TUESDAY', hour[day=='TUESDAY'], NA),
Wed = ifelse(day=='WEDNESDAY', hour[day=='WEDNESDAY'], NA),
Thu = ifelse(day=='THURSDAY', hour[day=='THURSDAY'], NA),
Fri = ifelse(day=='FRIDAY', hour[day=='FRIDAY'], NA),
Sat = ifelse(day=='SATURDAY', hour[day=='SATURDAY'], NA),
Sun = ifelse(day=='SUNDAY', hour[day=='SUNDAY'], NA)) %>%
select(-day, -hour)
Output is:
riid Mon Tue Wed Thu Fri Sat Sun
1 5542 NA NA NA 12 NA NA NA
2 5862 NA NA NA NA 15 NA NA
3 5982 NA 15 NA NA NA NA NA
4 6022 NA NA 16 NA NA NA NA
Sample data:
# A tibble: 4 x 3
riid day hour
* <dbl> <chr> <int>
1 5542 THURSDAY 12
2 5862 FRIDAY 15
3 5982 TUESDAY 15
4 6022 WEDNESDAY 16
Update:
Can you try below approach using data.table?
library(data.table)
dt <- setDT(df)[, c("Mon","Tue","Wed","Thu","Fri","Sat","Sun") :=
list(ifelse(day=='MONDAY', hour[day=='MONDAY'], NA),
ifelse(day=='TUESDAY', hour[day=='TUESDAY'], NA),
ifelse(day=='WEDNESDAY', hour[day=='WEDNESDAY'], NA),
ifelse(day=='THURSDAY', hour[day=='THURSDAY'], NA),
ifelse(day=='FRIDAY', hour[day=='FRIDAY'], NA),
ifelse(day=='SATURDAY', hour[day=='SATURDAY'], NA),
ifelse(day=='SUNDAY', hour[day=='SUNDAY'], NA))][, !c("day","hour"), with=F]
Related
I have two incomplete dataframes (df_a, df_b): Columns are missing or NA values. "by" is the merge index and df_a has "priority" over df_b.
df_a = structure(list(Datum = structure(c(1635163200, 1635166800, 1635170400, 1635174000), class = c("POSIXct", "POSIXt")), Vorhersage = c(10.297922, 10.155121, 10.044135, 9.699513), Export = c("10.912", "10.47", NA, NA), color = c("rgb(0,128,0)", "rgb(0,128,0)", NA, NA), Status = c("ok", "ok", NA, NA), Plausibilität = c("4", "4", NA, NA), min = c(7.93000000000001, 9.4, 8.7, 8.3), max = c(12.31715325, 12.42822725, 12.51326325, 12.28620625)), row.names = c(NA, -4L), class = "data.frame")
df_b = structure(list(Datum = structure(c(1632510000, 1632513600, 1632517200, 1632520800), class = c("POSIXct", "POSIXt")), Vorhersage = c(14.821988, 14.832919, 14.706179, 14.573527), Referenz = c(16.6, 16.2, 15.9, 16), DWD_Name = c("Elpersbüttel", "Elpersbüttel", "Elpersbüttel", "Elpersbüttel"), Export = c(17.198, 16.713, 16.378, 16.358), color = c("rgb(0,128,0)", "rgb(0,128,0)", "rgb(0,128,0)", "rgb(0,128,0)"), Status = c("ok", "ok", "ok", "ok"), Plausibilität = c(4, 4, 4, 4), min = c(13.05, 12.808, 11.631891, 12.312), max = c(17, 17, 16.9, 16.7)), row.names = c(NA, -4L), class = "data.frame")
desired output is:
Datum Vorhersage Export color Status Plausibilität min max Referenz
1 2021-10-25 14:00:00 10.3 10.912 rgb(0,128,0) ok 4 7.9 12 NA
2 2021-10-25 15:00:00 10.2 10.47 rgb(0,128,0) ok 4 9.4 12 NA
3 2021-10-25 16:00:00 10.0 <NA> <NA> <NA> <NA> 8.7 13 NA
4 2021-10-25 17:00:00 9.7 <NA> <NA> <NA> <NA> 8.3 12 NA
5 2021-09-24 21:00:00 14.8 17.198 rgb(0,128,0) ok 4 13.1 17 17
6 2021-09-24 22:00:00 14.8 16.713 rgb(0,128,0) ok 4 12.8 17 16
7 2021-09-24 23:00:00 14.7 16.378 rgb(0,128,0) ok 4 11.6 17 16
8 2021-09-25 00:00:00 14.6 16.358 rgb(0,128,0) ok 4 12.3 17 16
DWD_Name
1 <NA>
2 <NA>
3 <NA>
4 <NA>
5 Elpersbüttel
6 Elpersbüttel
7 Elpersbüttel
8 Elpersbüttel
# for rebuild:
structure(list(Datum = structure(c(1635163200, 1635166800, 1635170400,
1635174000, 1632510000, 1632513600, 1632517200, 1632520800), class = c("POSIXct",
"POSIXt")), Vorhersage = c(10.297922, 10.155121, 10.044135, 9.699513,
14.821988, 14.832919, 14.706179, 14.573527), Export = c("10.912",
"10.47", NA, NA, "17.198", "16.713", "16.378", "16.358"), color = c("rgb(0,128,0)",
"rgb(0,128,0)", NA, NA, "rgb(0,128,0)", "rgb(0,128,0)", "rgb(0,128,0)",
"rgb(0,128,0)"), Status = c("ok", "ok", NA, NA, "ok", "ok", "ok",
"ok"), Plausibilität = c("4", "4", NA, NA, "4", "4", "4", "4"
), min = c(7.93000000000001, 9.4, 8.7, 8.3, 13.05, 12.808, 11.631891,
12.312), max = c(12.31715325, 12.42822725, 12.51326325, 12.28620625,
17, 17, 16.9, 16.7), Referenz = c(NA, NA, NA, NA, 16.6, 16.2,
15.9, 16), DWD_Name = c(NA, NA, NA, NA, "Elpersbüttel", "Elpersbüttel",
"Elpersbüttel", "Elpersbüttel")), row.names = c(NA, -8L), class = "data.frame")
Thanks to the help of #r2evans I tried the following:
by = "Datum"
library(data.table)
colnms <- setdiff(intersect(names(df_a), names(df_b)), by)
setDT(df_a)
setDT(df_b)
merge(df_a, df_b, by = by, all = TRUE
)[, (colnms) := lapply(colnms, function(nm) fcoalesce(.SD[[paste0(nm, ".x")]], .SD[[paste0(nm, ".y")]]))
][, c(outer(colnms, c(".x", ".y"), paste0)) := NULL ][]
but I get the following error:
Error in fcoalesce(.SD[[paste0(nm, ".x")]], .SD[[paste0(nm, ".y")]]) :
Item 2 is type double but the first item is type character. Please coerce
Most of the other answers are good, but many either over-complicate the result (in my opinion) or they perform a left or right join, not the full join as expected in the OP.
Here's a quick solution that uses dynamic column names.
library(data.table)
colnms <- setdiff(intersect(names(df_a), names(df_b)), "by")
colnms
# [1] "a"
setDT(df_a)
setDT(df_b)
merge(df_a, df_b, by = "by", all = TRUE
)[, (colnms) := lapply(colnms, function(nm) fcoalesce(.SD[[paste0(nm, ".x")]], .SD[[paste0(nm, ".y")]]))
][, c(outer(colnms, c(".x", ".y"), paste0)) := NULL ][]
# by b c a
# <num> <num> <num> <num>
# 1: 1 1 NA 1
# 2: 2 NA 2 2
# 3: 3 3 3 3
# 4: 4 NA 4 4
Notes:
the normal data.table::[ merge is a left-join only, so we need to use data.table::merge in order to be able to get a full-join with all=TRUE;
because it's using merge, the repeated columns get the .x and .y suffixes, something we can easily capitalize on;
the canonical and most-performant way when using (colnms) := ... is to also include .SDcols=colnms, but that won't work as well here since we need the suffixed columns, not the colnms columns themselves; this is a slight performance penalty but certainly not an anti-pattern (I believe) given what we need to do; and since we could have more than one duplicate column, we have to be careful to do it with each pair at a time, not all of them at once;
the last [-block (using outer) is for removing the duplicate columns; without it, the output would have column names c("by", "a.x", "b", "a.y", "c", "a"). It uses outer because that's a straight-forward way to get 1-or-more colnms and combine .x and .y to each of them; it then uses data.table's := NULL shortcut for removing one-or-more columns.
This isn't the most elegant, but you can make a function that applies your rule to coalesce the values if they occur in both data frames.
# find the unique column names (not called "by")
cols <- union(names(df_a),names(df_b))
cols <- cols[!(cols == "by")]
# merge the data sets
df_merge <- merge(df_a, df_b, by = "by", all = TRUE)
# function to check for the base column names that now have a '.x' and
# a '.y' version. for the columns, fill in the NAs from '.x' with the
# value from '.y'
col_val <- function(col_base, df) {
x <- names(df)
if (all(paste0(col_base, c(".x", ".y")) %in% x)) {
na.x <- is.na(df[[paste0(col_base, ".x")]])
df[[paste0(col_base, ".x")]][na.x] <- df[[paste0(col_base, ".y")]][na.x]
df[[paste0(col_base, ".x")]]
} else {
df[[col_base]]
}
}
# apply this function to every column
cbind(df_merge["by"], sapply(cols, col_val, df = df_merge))
This will give the following result.
by a b c
1 1 1 1 NA
2 2 2 NA 2
3 3 3 3 3
4 4 4 NA 4
I know you specified base, by the natural_join() function is worth mentioning.
library(rqdatatable)
natural_join(df_a, df_b, by = "by", jointype = "FULL")
This gives exactly what you want.
by a b c
1 1 1 1 NA
2 2 2 NA 2
3 3 3 3 3
4 4 4 NA 4
Not the answer with R base. But one possible solution with the package data.table
library(data.table)
setDT(df_a)
setDT(df_b)
df_a <- rbind(df_a, list(4, NA, NA))
df_b <- rbind(list(1, NA, NA), df_b)
df_a[df_b, `:=` (a = fifelse(is.na(a), i.a, a), c = c), on = .(by)][]
#> by a b c
#> 1: 1 1 1 NA
#> 2: 2 2 NA 2
#> 3: 3 3 3 3
#> 4: 4 4 NA 4
Edit with the help of #r2evans, A much more elegant and efficient solution:
df_a[df_b, `:=` (a = fcoalesce(a, i.a), c = c), on = .(by)][]
#> by a b c
#> 1: 1 1 1 NA
#> 2: 2 2 NA 2
#> 3: 3 3 3 3
#> 4: 4 4 NA 4
Created on 2021-10-19 by the reprex package (v2.0.1)
here a dynamic solution.. not bad, but maybe someone knows how to speed it up.
get_complete_df<-function(df_a,df_b, by = "by"){
df_a = unique(df_a)
df_b = unique(df_b)
nam_a = names(df_a)[!(names(df_a) == by)]
nam_b = names(df_b)[!(names(df_b) == by)]
nums_a = unlist(lapply(df_a, is.numeric))
nums_b = unlist(lapply(df_b, is.numeric))
nums = unique(names(df_a)[nums_a],names(df_b)[nums_b])
## try to supplement NAs
x = df_b[[by]][df_b[[by]] %in% df_a[[by]]]
y = nam_b[nam_b %in% nam_a]
vna = is.na(df_a[df_a[,1] %in% x,y])
df_a[df_a[,1] %in% x ,y][vna] = df_b[df_b[,1] %in% x,y][vna]
## get complete df
all_names = c(nam_a,nam_b )
all_names = c(by, unique(all_names))
all_by = na.omit(unique(c(df_a[[by]],df_b[[by]]) ))
## build
df_o = as.data.frame(matrix(nrow = length(all_by),ncol = length(all_names)))
names(df_o) = all_names
df_o[[by]] = all_by
## fill in content
df_o[df_o[,1] %in% df_b[,1],names(df_b)] = df_b
df_o[df_o[,1] %in% df_a[,1],names(df_a)] = df_a ## df_a has priority!
# fix numeric:
# why did some(!) num fields changed to chr ?
df_o[,nums] = as.data.frame(apply(df_o[,nums], 2, as.numeric))
df_o
}
I have a list of accounts (300k plus rows), going back six years, with a user number, open and close dates, and other information, such as location. We offer a variety of accounts, and a user can have one or several, in any combination, and both in succession as well as overlapping.
I've been asked to find out how many users we have in any given month. They'd like it split by location, as well as total.
so I have a table like this:
User Open Close Area
1 A 2018-02-13 2018-07-31 West
2 B 2018-02-26 2018-06-04 North
3 B 2018-02-27 2018-03-15 North
4 C 2018-02-27 2018-05-26 South
5 C 2018-03-15 2018-06-03 South
6 D 2018-03-20 2018-07-02 East
7 E 2018-04-01 2018-06-19 West
8 E 2018-04-14 2018-05-04 West
9 F 2018-03-20 2018-04-19 North
10 G 2018-04-26 2018-07-04 South
11 H 2017-29-12 2018-03-21 East
12 I 2016-11-29 2020-04-10 West
13 J 2018-01-31 2018-12-20 West
14 K 2017-10-31 2018-10-30 North
15 K 2018-10-31 2019-10-30 North
And I want to get to one that looks something like this:
Month Total North East South West
1 Feb 18 3 1 0 1 1
2 Mar 18 5 2 1 1 1
3 Apr 18 7 2 1 2 2
4 May 18 6 1 1 2 2
5 Jun 18 6 1 1 2 2
6 Jul 18 3 0 1 1 1
I can filter the data to get to what I need for individual months using
df%>%
filter(Open <= as.Date("2018-04-30") & Close >= as.Date("2018-04-01")) %>%
distinct(PERSON_ID, .keep_all = TRUE) %>%
count(Area)
But what I can't figure out is how to repeat that for every month in the data set automatically. Is there any where of getting r to repeat the above for every month in my data set, and then pass the results into a second table?
Any and all help gratefully received, and many thanks for your time.
Edit: added examples to the source data where Matin Gal's solution returned NA for years
This is a general solution working for dates spanning over more than one year.
library(dplyr)
library(tidyr)
library(lubridate)
data %>%
group_by(rn = row_number()) %>%
mutate(seq = list(seq(month(Open), month(Close) + 12 * (year(Close) - year(Open))))) %>%
unnest(seq) %>%
mutate(
seq_2 = (seq - 1) %% 12 + 1,
month = month(seq_2, label = TRUE),
year = year(Open + months(seq - first(seq)))
) %>%
ungroup() %>%
distinct(User, month, year, Area) %>%
count(month, year, Area) %>%
pivot_wider(
names_from = "Area",
values_from = "n",
values_fill = 0
) %>%
mutate(Total = rowSums(across(c(North, South, West, East))))
returns
month year North South West East Total
<ord> <dbl> <int> <int> <int> <int> <dbl>
1 Feb 2018 1 1 1 0 3
2 Mar 2018 2 1 1 1 5
3 Apr 2018 2 2 2 1 7
4 May 2018 1 2 2 1 6
5 Jun 2018 1 2 2 1 6
6 Jul 2018 0 1 1 1 3
Data
df <- structure(list(id = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), User = c("A",
"B", "B", "C", "C", "D", "E", "E", "F", "G"), Open = structure(c(17575,
17588, 17589, 17589, 17605, 17610, 17622, 17635, 17610, 17647
), class = "Date"), Close = structure(c(17743, 17686, 17605,
17677, 17685, 17714, 17701, 17655, 17640, 17716), class = "Date"),
Area = c("West", "North", "North", "South", "South", "East",
"West", "West", "North", "South")), problems = structure(list(
row = 10L, col = "Area", expected = "", actual = "embedded null",
file = "literal data"), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame")), class = c("spec_tbl_df", "tbl_df", "tbl",
"data.frame"), row.names = c(NA, -10L), spec = structure(list(
cols = list(id = structure(list(), class = c("collector_double",
"collector")), User = structure(list(), class = c("collector_character",
"collector")), Open = structure(list(format = ""), class = c("collector_date",
"collector")), Close = structure(list(format = ""), class = c("collector_date",
"collector")), Area = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))
Here's how I'd do it:
library(tidyverse)
set.seed(14159)
## generating some data that looks roughly
## like your data
data <- tibble(
user = sample(LETTERS[1:5], size = 20, replace = TRUE),
open = sample(seq(as.Date('1999/01/01'), as.Date('2000/01/01'), by="day"), 20),
close = sample(seq(as.Date('1999/01/01'), as.Date('2000/01/01'), by="day"), 20),
area = sample(c("N", "E", "S", "W"), 20, replace = T)
) %>%
filter(
close > open
)
data
#> # A tibble: 9 × 4
#> user open close area
#> <chr> <date> <date> <chr>
#> 1 A 1999-04-03 1999-07-28 N
#> 2 B 1999-01-27 1999-05-12 W
#> 3 B 1999-06-05 1999-12-29 W
#> 4 C 1999-09-26 1999-12-30 W
#> 5 C 1999-04-21 1999-12-04 E
#> 6 C 1999-08-11 1999-12-12 N
#> 7 A 1999-02-13 1999-09-16 W
#> 8 E 1999-02-17 1999-05-21 E
#> 9 B 1999-07-26 1999-08-16 S
## figuring out what months are in between open and close
get_months_in_range <- function(open, close) {
seq.Date(
open,
close,
by = "month"
) %>%
list()
}
data %>%
rowwise() %>%
mutate(
Month = get_months_in_range(open, close)
) %>%
ungroup() %>%
unnest_longer(
col = Month
) %>%
count(Month, area) %>%
pivot_wider(
names_from = area,
values_from = n,
values_fill = 0
) %>%
rowwise() %>%
mutate(
Total = sum(
c_across(
-Month
)
)
) %>%
ungroup()
#> # A tibble: 45 × 6
#> Month W E N S Total
#> <date> <int> <int> <int> <int> <int>
#> 1 1999-01-27 1 0 0 0 1
#> 2 1999-02-13 1 0 0 0 1
#> 3 1999-02-17 0 1 0 0 1
#> 4 1999-02-27 1 0 0 0 1
#> 5 1999-03-13 1 0 0 0 1
#> 6 1999-03-17 0 1 0 0 1
#> 7 1999-03-27 1 0 0 0 1
#> 8 1999-04-03 0 0 1 0 1
#> 9 1999-04-13 1 0 0 0 1
#> 10 1999-04-17 0 1 0 0 1
#> # … with 35 more rows
Created on 2021-08-18 by the reprex package (v2.0.1)
It's not the world's sexiest solution, but I think it'll get you where you're trying to go. Basically, I just make a helper function that gives me all the dates between open and close and then you can group by those to figure out how many users you have in any given month. Let me know if you want more explanation about what the long chain of dplyr stuff is doing.
welcome to SO. I can't test this code as you haven't provided a snippet of your data in the right format (see below for a suggestion on this point), but I think the basic idea of what you want to do is extract a month-year value from Open and then use group_by. For example:
library(lubridate)
library(dplyr)
df %>% mutate(
Date = dmy(Open),
Month_Yr = format_ISO8601(Date, precision = "ym")) %>%
group_by(Month_Yr) %>%
distinct(PERSON.ID, .keep_all = TRUE) %>%
count(Area)
Generally when sharing data on SO it's best to use a dput. See ?dput for info on how to use it if you're unsure.
My data concerns a company and includes Total Sales and the amount of sales in three counties CA , TX and WI.
Data :
> dput(head(WalData))
structure(list(CA = c(11047, 9925, 11322, 12251, 16610, 14696
), TX = c(7381, 5912, 9006, 6226, 9440, 9376), WI = c(6984, 3309,
8883, 9533, 11882, 8664), Total = c(25412, 19146, 29211, 28010,
37932, 32736), date = structure(c(1296518400, 1296604800, 1296691200,
1296777600, 1296864000, 1296950400), tzone = "UTC", class = c("POSIXct",
"POSIXt")), event_type = c("NA", "NA", "NA", "NA", "NA", "Sporting"
), snap_CA = c(1, 1, 1, 1, 1, 1), snap_TX = c(1, 0, 1, 0, 1,
1), snap_WI = c(0, 1, 1, 0, 1, 1)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
With the following code i am trying to calculate the average sales share of the three states on the company's total sales.
In addition, i need the same average percentages for each year, month of the year and day of the week.
install.packages("dplyr")
install.packages("lubridate")
library(dplyr)
library(lubridate)
df1 <- df %>%
dplyr::mutate(YEAR = lubridate::year(date),
MONTH = lubridate::month(date),
WEEKDAY = lubridate::wday(date),
P_CA = CA / Total,
P_TX = TX / Total,
P_WI = WI / Total)
# Average per Year
df1 %>%
dplyr::group_by(YEAR) %>%
dplyr::summarise(AV_CA = mean(P_CA, na.rm = TRUE),
AV_TX = mean(P_TX, na.rm = TRUE),
AV_WI = mean(P_WI, na.rm = TRUE))
# Average per Month
df1 %>%
dplyr::group_by(MONTH) %>%
dplyr::summarise(AV_CA = mean(P_CA, na.rm = TRUE),
AV_TX = mean(P_TX, na.rm = TRUE),
AV_WI = mean(P_WI, na.rm = TRUE))
# Average per Weekday
df1 %>%
dplyr::group_by(WEEKDAY) %>%
dplyr::summarise(AV_CA = mean(P_CA, na.rm = TRUE),
AV_TX = mean(P_TX, na.rm = TRUE),
AV_WI = mean(P_WI, na.rm = TRUE))
Output :
> df1 <- df %>%
+ dplyr::mutate(YEAR = lubridate::year(date),
+ MONTH = lubridate::month(date),
+ WEEKDAY = lubridate::wday(date),
+ P_CA = CA / Total,
+ P_TX = TX / Total,
+ P_WI = WI / Total)
Error in UseMethod("mutate_") :
no applicable method for 'mutate_' applied to an object of class "function"
> # Average per Year
> df1 %>%
+ dplyr::group_by(YEAR) %>%
+ dplyr::summarise(AV_CA = mean(P_CA, na.rm = TRUE),
+ AV_TX = mean(P_TX, na.rm = TRUE),
+ AV_WI = mean(P_WI, na.rm = TRUE))
Error in eval(lhs, parent, parent) : object 'df1' not found
It comes with an error : Error in UseMethod("mutate_") :
no applicable method for 'mutate_' applied to an object of class "function"
I cant figure out whats wrong , i double checked the code and the correctness of the data .
Please give a solution .
The issue would be that df is not created as an object in the global env and there is a function with name df if we do ?df
df(x, df1, df2, ncp, log = FALSE)
Basically, the error is based on applying mutate on a function df rather than an object
Checking on a fresh R session with no objects created
df %>%
dplyr::mutate(YEAR = lubridate::year(date),
MONTH = lubridate::month(date),
WEEKDAY = lubridate::wday(date),
P_CA = CA / Total,
P_TX = TX / Total,
P_WI = WI / Total)
Error in UseMethod("mutate_") :
no applicable method for 'mutate_' applied to an object of class "function"
Now, we define 'df' as
df <- WalData
df %>%
dplyr::mutate(YEAR = lubridate::year(date),
MONTH = lubridate::month(date),
WEEKDAY = lubridate::wday(date),
P_CA = CA / Total,
P_TX = TX / Total,
P_WI = WI / Total)
# A tibble: 6 x 15
# CA TX WI Total date event_type snap_CA snap_TX snap_WI YEAR MONTH WEEKDAY P_CA P_TX P_WI
# <dbl> <dbl> <dbl> <dbl> <dttm> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 11047 7381 6984 25412 2011-02-01 00:00:00 NA 1 1 0 2011 2 3 0.435 0.290 0.275
#2 9925 5912 3309 19146 2011-02-02 00:00:00 NA 1 0 1 2011 2 4 0.518 0.309 0.173
#3 11322 9006 8883 29211 2011-02-03 00:00:00 NA 1 1 1 2011 2 5 0.388 0.308 0.304
#4 12251 6226 9533 28010 2011-02-04 00:00:00 NA 1 0 0 2011 2 6 0.437 0.222 0.340
#5 16610 9440 11882 37932 2011-02-05 00:00:00 NA 1 1 1 2011 2 7 0.438 0.249 0.313
#6 14696 9376 8664 32736 2011-02-06 00:00:00 Sporting 1 1 1 2011 2 1 0.449 0.286 0.265
I have a data frame in R which looks like below
Model Month Demand Inventory
A Jan 10 20
B Feb 30 40
A Feb 40 60
I want the data frame to look
Jan Feb
A_Demand 10 40
A_Inventory 20 60
A_coverage
B_Demand 30
B_Inventory 40
B_coverage
A_coverage and B_Coverage will be calculated in excel using a formula. But the problem I need help with is to pivot the data frame from wide to long format (original format).
I tried to implement the solution from the linked duplicate but I am still having difficulty:
HD_dcast <- reshape(data,idvar = c("Model","Inventory","Demand"),
timevar = "Month", direction = "wide")
Here is a dput of my data:
data <- structure(list(Model = c("A", "B", "A"), Month = c("Jan", "Feb",
"Feb"), Demand = c(10L, 30L, 40L), Inventory = c(20L, 40L, 60L
)), class = "data.frame", row.names = c(NA, -3L))
Thanks
Here's an approach with dplyr and tidyr, two popular R packages for data manipulation:
library(dplyr)
library(tidyr)
data %>%
mutate(coverage = NA_real_) %>%
pivot_longer(-c(Model,Month), names_to = "Variable") %>%
pivot_wider(id_cols = c(Model, Variable), names_from = Month ) %>%
unite(Variable, c(Model,Variable), sep = "_")
## A tibble: 6 x 3
# Variable Jan Feb
# <chr> <dbl> <dbl>
#1 A_Demand 10 40
#2 A_Inventory 20 60
#3 A_coverage NA NA
#4 B_Demand NA 30
#5 B_Inventory NA 40
#6 B_coverage NA NA
I have a reasonably complicated multi-level list:
my_list <- list(list(id = 36L, name = "Marathonbet", odds = list(data = list(
list(label = "1", value = "1.25", dp3 = "1.250", american = "-400",
winning = TRUE, handicap = NULL, total = NULL, bookmaker_event_id = "6938899",
last_update = list(date = "2018-08-12 13:12:23.000000",
timezone_type = 3L, timezone = "UTC")), list(label = "2",
value = "13.75", dp3 = "13.750", american = "1275", winning = FALSE,
handicap = NULL, total = NULL, bookmaker_event_id = "6938899",
last_update = list(date = "2018-08-12 13:12:23.000000",
timezone_type = 3L, timezone = "UTC")), list(label = "X",
value = "7.00", dp3 = "7.000", american = "600", winning = FALSE,
handicap = NULL, total = NULL, bookmaker_event_id = "6938899",
last_update = list(date = "2018-08-12 13:12:23.000000",
timezone_type = 3L, timezone = "UTC"))))), list(id = 7L,
name = "888Sport", odds = list(data = list(list(label = "1",
value = "1.23", dp3 = "1.230", american = "-435", winning = TRUE,
handicap = NULL, total = NULL, bookmaker_event_id = "1004746417",
last_update = list(date = "2018-08-12 13:12:23.000000",
timezone_type = 3L, timezone = "UTC")), list(label = "2",
value = "12.50", dp3 = "12.500", american = "1150", winning = FALSE,
handicap = NULL, total = NULL, bookmaker_event_id = "1004746417",
last_update = list(date = "2018-08-12 13:12:23.000000",
timezone_type = 3L, timezone = "UTC")), list(label = "X",
value = "6.50", dp3 = "6.500", american = "550", winning = FALSE,
handicap = NULL, total = NULL, bookmaker_event_id = "1004746417",
last_update = list(date = "2018-08-12 13:12:23.000000",
timezone_type = 3L, timezone = "UTC"))))), list(id = 9L,
name = "BetFred", odds = list(data = list(list(label = "1",
value = "1.30", dp3 = NULL, american = NULL, winning = TRUE,
handicap = NULL, total = NULL, bookmaker_event_id = "1085457020",
last_update = list(date = "2018-07-26 08:30:19.000000",
timezone_type = 3L, timezone = "UTC")), list(label = "2",
value = "9.00", dp3 = NULL, american = NULL, winning = FALSE,
handicap = NULL, total = NULL, bookmaker_event_id = "1085457020",
last_update = list(date = "2018-07-26 08:30:19.000000",
timezone_type = 3L, timezone = "UTC")), list(label = "X",
value = "5.50", dp3 = NULL, american = NULL, winning = FALSE,
handicap = NULL, total = NULL, bookmaker_event_id = "1085457020",
last_update = list(date = "2018-07-26 08:30:19.000000",
timezone_type = 3L, timezone = "UTC"))))))
I can use a combination of map and map_depth to eliminate levels of nesting, but I'm struggling then to bind those levels into a data frame and preserve all the data. For example - at level my_list[[1]][["odds"]][["data"]] there are three sub lists. When converting that level to a df I only end up with one row of data when there should be 3.
What I would like to do is convert this entire list to a data frame, where the common elements across sublists such as:
my_list[[1]][["odds"]][["data"]][[1]][["bookmaker_event_id"]] &
my_list[[2]][["odds"]][["data"]][[1]][["bookmaker_event_id"]]
appear in the same column in the resulting df.
It seems like a easy thing to achieve, but I either end up with missing rows of data or Error: Argument 1 must have names. The resulting data frame from this test list should have 9 rows and around 13 columns.
I'd like to use the map family of functions and avoid any loops please.
If you can live with a lapply solution, because I'm not very familiar with map:
DF <- bind_rows(lapply(my_list,function(ll){ #lapply over the list and bind result to tibble
id <- ll[['id']] #Extract id
name <- ll[['name']] #Extract name
#clean up date and unlist sublists
ll <- lapply(ll[['odds']][['data']],function(il)
{
il$last_update <- unlist(il$last_update)
return(unlist(il))
})
df <- as_tibble(do.call(rbind,ll)) #bind the sublists and generate tibble
df$id <- rep(id,nrow(df)) #add id
df$name <- rep(name,nrow(df)) #add name
return(df) #return df
}))
DF
A tibble: 9 x 11
label value dp3 american winning bookmaker_event~ last_update.date last_update.tim~ last_update.tim~ id name
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <int> <chr>
1 1 1.25 1.250 -400 TRUE 6938899 2018-08-12 13:1~ 3 UTC 36 Mara~
2 2 13.75 13.750 1275 FALSE 6938899 2018-08-12 13:1~ 3 UTC 36 Mara~
3 X 7.00 7.000 600 FALSE 6938899 2018-08-12 13:1~ 3 UTC 36 Mara~
4 1 1.23 1.230 -435 TRUE 1004746417 2018-08-12 13:1~ 3 UTC 7 888S~
5 2 12.50 12.500 1150 FALSE 1004746417 2018-08-12 13:1~ 3 UTC 7 888S~
6 X 6.50 6.500 550 FALSE 1004746417 2018-08-12 13:1~ 3 UTC 7 888S~
7 1 1.30 NA NA TRUE 1085457020 2018-07-26 08:3~ 3 UTC 9 BetF~
8 2 9.00 NA NA FALSE 1085457020 2018-07-26 08:3~ 3 UTC 9 BetF~
9 X 5.50 NA NA FALSE 1085457020 2018-07-26 08:3~ 3 UTC 9 BetF~
Using #shayaa's function here to replace Null with NA since unlist and flatten ignored NULL
replace_null <- function(x) {
lapply(x, function(x) {
if (is.list(x)){
replace_null(x)
} else{
if(is.null(x)) NA else(x)
}
})
}
Then use tibble and purrr::flatten
library(dplyr)
library(purrr)
my_list %>% {
tibble(
id=map_dbl(.,'id'),
name=map_chr(.,'name'),
odds=map(.,'odds') %>% map(. ,'data') %>% map(.,.%>% map(replace_null) %>% map_df(flatten))
#odds=map(.,~.x[['odds']][['data']] %>% map(replace_null) %>% map_df(flatten))
)} %>%
unnest(odds)
# A tibble: 9 x 13
id name label value dp3 american winning handicap total bookmaker_event_~ date timezone_type timezone
<dbl> <chr> <chr> <chr> <chr> <chr> <lgl> <lgl> <lgl> <chr> <chr> <int> <chr>
1 36 Marathonbet 1 1.25 1.250 -400 TRUE NA NA 6938899 2018-08-12 13:12:23.00~ 3 UTC
2 36 Marathonbet 2 13.75 13.750 1275 FALSE NA NA 6938899 2018-08-12 13:12:23.00~ 3 UTC
3 36 Marathonbet X 7.00 7.000 600 FALSE NA NA 6938899 2018-08-12 13:12:23.00~ 3 UTC
4 7 888Sport 1 1.23 1.230 -435 TRUE NA NA 1004746417 2018-08-12 13:12:23.00~ 3 UTC
5 7 888Sport 2 12.50 12.500 1150 FALSE NA NA 1004746417 2018-08-12 13:12:23.00~ 3 UTC
6 7 888Sport X 6.50 6.500 550 FALSE NA NA 1004746417 2018-08-12 13:12:23.00~ 3 UTC
7 9 BetFred 1 1.30 NA NA TRUE NA NA 1085457020 2018-07-26 08:30:19.00~ 3 UTC
8 9 BetFred 2 9.00 NA NA FALSE NA NA 1085457020 2018-07-26 08:30:19.00~ 3 UTC
9 9 BetFred X 5.50 NA NA FALSE NA NA 1085457020 2018-07-26 08:30:19.00~ 3 UTC
For more info see this purrr tutorial.