Applying a function across columns by extracting similar column names - R

My data looks like:
[[1]]
date germany france germany_mean france_mean germany_sd france_sd
1 2016-01-01 17 25 21.29429 48.57103 30.03026 47.05169
What I am trying to do is compute the following calculation over all the list elements using map.
germany_calc = (germany - germany_mean) / germany_sd
france_calc = (france - france_mean) / france_sd
However, the number of columns can change: here there are two categories/countries, but in another list element there could be 1, 3, or N. The countries always follow the same structure. That is,
"country1", "country2", ... , "countryN", "country1_mean", "country2_mean", ... , "countryN_mean", "country1_sd", "country2_sd", ... , "countryN_sd".
Expected Output (for the first list):
Germany: -0.1429988 = (17 - 21.29429) / 30.03026
France: -0.5009603 = (25 - 48.57103) / 47.05169
EDIT: Apologies - expected output:
-0.1429988
-0.5009603
Function:
Scale_Me <- function(x) {
  (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
}
Data:
my_list <- list(structure(list(date = structure(16801, class = "Date"),
germany = 17, france = 25, germany_mean = 21.2942922374429,
france_mean = 48.5710301846855, germany_sd = 30.030258443028,
france_sd = 47.0516928425878), class = "data.frame", row.names = c(NA,
-1L)), structure(list(date = structure(16802, class = "Date"),
germany = 9, france = 29, germany_mean = 21.2993150684932,
france_mean = 48.5605316914534, germany_sd = 30.0286190461173,
france_sd = 47.0543871206842), class = "data.frame", row.names = c(NA,
-1L)), structure(list(date = structure(16803, class = "Date"),
germany = 8, france = 18, germany_mean = 21.2947488584475,
france_mean = 48.551889593794, germany_sd = 30.0297291333284,
france_sd = 47.0562416513092), class = "data.frame", row.names = c(NA,
-1L)), structure(list(date = structure(16804, class = "Date"),
germany = 3, france = 11, germany_mean = 21.2778538812785,
france_mean = 48.5382545766386, germany_sd = 30.0267943793948,
france_sd = 47.0607680244109), class = "data.frame", row.names = c(NA,
-1L)), structure(list(date = structure(16805, class = "Date"),
germany = 4, france = 13, germany_mean = 21.2614155251142,
france_mean = 48.5214531240057, germany_sd = 30.0269420596686,
france_sd = 47.0676011750263), class = "data.frame", row.names = c(NA,
-1L)), structure(list(date = structure(16806, class = "Date"),
germany = 4, france = 9, germany_mean = 21.253196347032,
france_mean = 48.5055948249362, germany_sd = 30.0292032528186,
france_sd = 47.0737183354519), class = "data.frame", row.names = c(NA,
-1L)))

Why not just rbind the thing?
with(do.call(rbind, my_list),
     cbind(germany = (germany - germany_mean) / germany_sd,
           france = (france - france_mean) / france_sd))
# germany france
# [1,] -0.1429988 -0.5009603
# [2,] -0.4095864 -0.4157005
# [3,] -0.4427196 -0.6492633
# [4,] -0.6087181 -0.7976550
# [5,] -0.5748642 -0.7546901
# [6,] -0.5745473 -0.8392283
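If you do want to keep the per-element structure and use map as asked, here is a minimal sketch (an assumption on my part: every list element follows the country / country_mean / country_sd naming convention, so it works for any number of countries):
library(purrr)
map(my_list, function(df) {
  # derive the country names from the *_mean columns
  countries <- sub("_mean$", "", grep("_mean$", names(df), value = TRUE))
  # (value - mean) / sd for every country at once, column by column
  (df[countries] - df[paste0(countries, "_mean")]) / df[paste0(countries, "_sd")]
})
Each element of the result is then a one-row data frame with one normalized column per country.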

The question is unclear on the exact form of the output, so we assume that what is wanted is a data frame with a column for date and a column for each country holding the normalized value. In this case that means 3 columns in the output.
1) pivot_longer/pivot_wider. Bind the my_list components together, creating a data frame with a row from each component. Then append _root to each bare country name among the columns so that every column name except date has the form country_suffix. Then convert to long form, perform the normalization, and convert back to wide form:
library(dplyr)
library(tidyr)
library(purrr)
my_list %>%
  bind_rows %>%
  set_names(names(.)[1], sub("^([^_]*)$", "\\1_root", names(.)[-1])) %>%
  pivot_longer(-date, names_to = c("country", ".value"), names_sep = "_") %>%
  mutate(root = (root - mean) / sd) %>%
  pivot_wider(id_cols = "date", names_from = "country", values_from = "root")
giving:
# A tibble: 6 x 3
date germany france
<date> <dbl> <dbl>
1 2016-01-01 -0.143 -0.501
2 2016-01-02 -0.410 -0.416
3 2016-01-03 -0.443 -0.649
4 2016-01-04 -0.609 -0.798
5 2016-01-05 -0.575 -0.755
6 2016-01-06 -0.575 -0.839
2) Base R
After rbinding the list components together to give d, we pick out the country names, nms, as those column names not containing an underscore, except the first such name (which is date). Then perform the normalization and cbind the date column to the result.
d <- do.call("rbind", my_list)
nms <- grep("_", names(d), invert = TRUE, value = TRUE)[-1]
cbind(d[1], (d[nms] - d[paste0(nms, "_mean")]) / d[paste0(nms, "_sd")])
giving:
date germany france
1 2016-01-01 -0.1429988 -0.5009603
2 2016-01-02 -0.4095864 -0.4157005
3 2016-01-03 -0.4427196 -0.6492633
4 2016-01-04 -0.6087181 -0.7976550
5 2016-01-05 -0.5748642 -0.7546901
6 2016-01-06 -0.5745473 -0.8392283

Do you have to use map? Here I get your desired output using two for loops instead of map.
Result_list = vector("list", length(my_list))
for (i in 1:length(my_list)) {
  df = my_list[[i]]
  # identify the countries present
  countries = colnames(df)[grep('mean', colnames(df))]
  countries = gsub("_mean", "", countries)
  df_result = NULL
  for (j in 1:length(countries)) {
    country = countries[j]
    value_country = df[1, match(country, colnames(df))]
    mean_country = df[1, match(paste0(country, "_mean"), colnames(df))]
    sd_country = df[1, match(paste0(country, "_sd"), colnames(df))]
    result_country = (value_country - mean_country) / sd_country
    Sentence = paste0(country, ": ", round(result_country, 5), " = (", value_country,
                      " - ", round(mean_country, 5), ") / ", round(sd_country, 5))
    df_result = c(df_result, Sentence)
  }
  Result_list[[i]] = df_result
}
And the output Result_list looks like:
> Result_list
[[1]]
[1] "germany: -0.143 = (17 - 21.29429) / 30.03026"
[2] "france: -0.50096 = (25 - 48.57103) / 47.05169"
[[2]]
[1] "germany: -0.40959 = (9 - 21.29932) / 30.02862"
[2] "france: -0.4157 = (29 - 48.56053) / 47.05439"
[[3]]
[1] "germany: -0.44272 = (8 - 21.29475) / 30.02973"
[2] "france: -0.64926 = (18 - 48.55189) / 47.05624"
[[4]]
[1] "germany: -0.60872 = (3 - 21.27785) / 30.02679"
[2] "france: -0.79765 = (11 - 48.53825) / 47.06077"
[[5]]
[1] "germany: -0.57486 = (4 - 21.26142) / 30.02694"
[2] "france: -0.75469 = (13 - 48.52145) / 47.0676"
[[6]]
[1] "germany: -0.57455 = (4 - 21.2532) / 30.0292"
[2] "france: -0.83923 = (9 - 48.50559) / 47.07372"
Is this what you are looking for?
EDIT: Extracting only results
For extracting only the result values, you can do the following:
Df_result_value = NULL
for (i in 1:length(my_list)) {
  df = my_list[[i]]
  # identify the countries present
  countries = colnames(df)[grep('mean', colnames(df))]
  countries = gsub("_mean", "", countries)
  for (j in 1:length(countries)) {
    country = countries[j]
    value_country = df[1, match(country, colnames(df))]
    mean_country = df[1, match(paste0(country, "_mean"), colnames(df))]
    sd_country = df[1, match(paste0(country, "_sd"), colnames(df))]
    result_country = (value_country - mean_country) / sd_country
    Df_result_value = rbind(Df_result_value, c(country, result_country))
  }
}
Df_result_value = data.frame(Df_result_value)
colnames(Df_result_value) = c("Country","Result")
And get this output:
> Df_result_value
Country Result
1 germany -0.142998843835787
2 france -0.500960300483614
3 germany -0.409586436512588
4 france -0.415700488060442
5 germany -0.442719572974515
6 france -0.649263275639099
7 germany -0.608718121899195
8 france -0.797654950237258
9 germany -0.574864249939699
10 france -0.754690110335453
11 germany -0.574547256608035
12 france -0.839228262008441

We can use transform as well in base R:
transform(do.call(rbind, my_list),
          germany = (germany - germany_mean)/germany_sd,
          france = (france - france_mean)/france_sd)[c('date', 'germany', 'france')]
# date germany france
#1 2016-01-01 -0.1429988 -0.5009603
#2 2016-01-02 -0.4095864 -0.4157005
#3 2016-01-03 -0.4427196 -0.6492633
#4 2016-01-04 -0.6087181 -0.7976550
#5 2016-01-05 -0.5748642 -0.7546901
#6 2016-01-06 -0.5745473 -0.8392283
Or in dplyr, without any reshaping, this can be done:
library(dplyr)
bind_rows(my_list) %>%
  transmute(date,
            germany = (germany - germany_mean)/germany_sd,
            france = (france - france_mean)/france_sd)
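If the number of countries is not fixed, the same idea can be written without hard-coding the column names. A sketch using across() and cur_column() (assuming dplyr >= 1.0.0):
library(dplyr)
d <- bind_rows(my_list)
# the country columns are the ones without an underscore, other than date
countries <- grep("_|^date$", names(d), invert = TRUE, value = TRUE)
d %>%
  transmute(date,
            across(all_of(countries),
                   ~ (.x - .data[[paste0(cur_column(), "_mean")]]) /
                     .data[[paste0(cur_column(), "_sd")]]))
This returns the same date/germany/france output as above, but picks up however many countries are present.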

Related

Perform a series of mutations to columns in dataframe

I am trying to replace some text in my dataframe (a few rows given below)
> dput(Henry.longer[1:4,])
structure(list(N_l = c(4, 4, 4, 4), UG = c("100", "100", "100",
"100"), S = c(12, 12, 12, 12), Sample = c(NA, NA, NA, NA), EQ = c("Henry",
"Henry", "Henry", "Henry"), DF = c(0.798545454545455, 0.798545454545455,
0.798545454545455, 0.798545454545455), meow = c("Henry.Exterior.single",
"Multi", "Henry.Exterior.multi", "Henry.Interior.single"), Girder = c("Henry.Exterior.single",
"Henry.Interior.multi", "Henry.Exterior.multi", "Interior")), row.names = c(NA,
-4L), groups = structure(list(UG = "100", S = 12, .rows = list(
1:4)), row.names = c(NA, -1L), class = c("tbl_df", "tbl",
"data.frame"), .drop = FALSE), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"))
I try to mutate the dataframe as:
Henry.longer <- Henry.longer %>%
mutate(Loading = str_replace(meow, "Henry.Exterior.single", "Single")) %>%
mutate(Loading = str_replace(meow, "Henry.Exterior.multi", "Multi")) %>%
mutate(Loading = str_replace(meow, "Henry.Interior.single", "Single")) %>%
mutate(Loading = str_replace(meow, "Henry.Interior.multi", "Multi")) %>%
mutate(Girder = str_replace(meow, "Henry.Exterior.multi", "Exterior")) %>%
mutate(Girder = str_replace(meow, "Henry.Exterior.single", "Exterior")) %>%
mutate(Girder = str_replace(meow, "Henry.Interior.multi", "Interior")) %>%
mutate(Girder = str_replace(meow, "Henry.Interior.single", "Interior")) %>%
select(-meow)
But for some reason the replacements do not get applied to all the rows, and only some of them change:
N_l UG S Sample EQ DF Loading Girder
1 4 100 12 NA Henry 0.799 Henry.Exterior.single Henry.Exterior.single
2 4 100 12 NA Henry 0.799 Multi Henry.Interior.multi
3 4 100 12 NA Henry 0.799 Henry.Exterior.multi Henry.Exterior.multi
4 4 100 12 NA Henry 0.799 Henry.Interior.single Interior
The original chain keeps overwriting its own work: each mutate() rebuilds Loading (or Girder) from meow again, so only the last str_replace() in each group survives. I think we can use lookup vectors for this if it is easier or safer to use static string lookups:
tr_vec <- c(Henry.Exterior.single = "Single", Henry.Exterior.multi = "Multi", Henry.Interior.single = "Single", Henry.Interior.multi = "Multi")
tr_vec2 <- c(Henry.Exterior.multi = "Exterior", Henry.Exterior.single = "Exterior", Henry.Interior.multi = "Interior", Henry.Interior.single = "Interior")
Henry.longer %>%
  mutate(
    Loading = coalesce(tr_vec[Loading], Loading),
    Girder = coalesce(tr_vec2[Girder], Girder)
  )
# # A tibble: 4 x 8
# # Groups: UG, S [1]
# N_l UG S Sample EQ DF Loading Girder
# <dbl> <chr> <dbl> <lgl> <chr> <dbl> <chr> <chr>
# 1 4 100 12 NA Henry 0.799 Single Exterior
# 2 4 100 12 NA Henry 0.799 Multi Interior
# 3 4 100 12 NA Henry 0.799 Multi Exterior
# 4 4 100 12 NA Henry 0.799 Single Interior
The advantage of RonakShah's regex solution is that it can very easily handle many of the types of substrings you appear to need. Regexes do carry a little risk, though, in that they may (unlikely in that answer, but possible) match something unintended.
Instead of using str_replace I guess it would be easier to extract what you want using regex.
library(dplyr)
Henry.longer %>%
  mutate(Loading = sub('.*\\.', '', meow),
         Girder = sub('.*\\.(\\w+)\\..*', '\\1', meow))
where
Loading - removes everything up to the last dot
Girder - extracts the word between the two dots.
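To see what the two patterns do, you can run them on one of the meow values:
x <- "Henry.Exterior.single"
sub('.*\\.', '', x)                # "single"   - drops everything up to the last dot
sub('.*\\.(\\w+)\\..*', '\\1', x)  # "Exterior" - keeps the word between the two dots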
Oh boy, looks like you've got some answers here already but here's a super-simple one that uses stringr::str_extract:
Henry.longer <- Henry.longer %>%
mutate(Loading = str_extract(meow, "single|multi")) %>%
mutate(Girder = str_extract(meow, "Interior|Exterior"))
It's worth noting that the demo data has a weird entry for meow in one row, so it didn't run perfectly on my machine.

Calculation of "average sales share" with dplyr::mutate

My data concerns a company and includes Total Sales and the amount of sales in three states: CA, TX and WI.
Data:
> dput(head(WalData))
structure(list(CA = c(11047, 9925, 11322, 12251, 16610, 14696
), TX = c(7381, 5912, 9006, 6226, 9440, 9376), WI = c(6984, 3309,
8883, 9533, 11882, 8664), Total = c(25412, 19146, 29211, 28010,
37932, 32736), date = structure(c(1296518400, 1296604800, 1296691200,
1296777600, 1296864000, 1296950400), tzone = "UTC", class = c("POSIXct",
"POSIXt")), event_type = c("NA", "NA", "NA", "NA", "NA", "Sporting"
), snap_CA = c(1, 1, 1, 1, 1, 1), snap_TX = c(1, 0, 1, 0, 1,
1), snap_WI = c(0, 1, 1, 0, 1, 1)), row.names = c(NA, -6L), class = c("tbl_df",
"tbl", "data.frame"))
With the following code I am trying to calculate the average sales share of the three states in the company's total sales.
In addition, I need the same average percentages for each year, month of the year and day of the week.
install.packages("dplyr")
install.packages("lubridate")
library(dplyr)
library(lubridate)
df1 <- df %>%
  dplyr::mutate(YEAR = lubridate::year(date),
                MONTH = lubridate::month(date),
                WEEKDAY = lubridate::wday(date),
                P_CA = CA / Total,
                P_TX = TX / Total,
                P_WI = WI / Total)
# Average per Year
df1 %>%
  dplyr::group_by(YEAR) %>%
  dplyr::summarise(AV_CA = mean(P_CA, na.rm = TRUE),
                   AV_TX = mean(P_TX, na.rm = TRUE),
                   AV_WI = mean(P_WI, na.rm = TRUE))
# Average per Month
df1 %>%
  dplyr::group_by(MONTH) %>%
  dplyr::summarise(AV_CA = mean(P_CA, na.rm = TRUE),
                   AV_TX = mean(P_TX, na.rm = TRUE),
                   AV_WI = mean(P_WI, na.rm = TRUE))
# Average per Weekday
df1 %>%
  dplyr::group_by(WEEKDAY) %>%
  dplyr::summarise(AV_CA = mean(P_CA, na.rm = TRUE),
                   AV_TX = mean(P_TX, na.rm = TRUE),
                   AV_WI = mean(P_WI, na.rm = TRUE))
Output:
> df1 <- df %>%
+ dplyr::mutate(YEAR = lubridate::year(date),
+ MONTH = lubridate::month(date),
+ WEEKDAY = lubridate::wday(date),
+ P_CA = CA / Total,
+ P_TX = TX / Total,
+ P_WI = WI / Total)
Error in UseMethod("mutate_") :
no applicable method for 'mutate_' applied to an object of class "function"
> # Average per Year
> df1 %>%
+ dplyr::group_by(YEAR) %>%
+ dplyr::summarise(AV_CA = mean(P_CA, na.rm = TRUE),
+ AV_TX = mean(P_TX, na.rm = TRUE),
+ AV_WI = mean(P_WI, na.rm = TRUE))
Error in eval(lhs, parent, parent) : object 'df1' not found
It comes with an error:
Error in UseMethod("mutate_") :
no applicable method for 'mutate_' applied to an object of class "function"
I can't figure out what's wrong; I double-checked the code and the correctness of the data.
Please give a solution.
The issue is that df was never created as an object in the global environment, and there is already a function named df (the density of the F distribution). See ?df:
df(x, df1, df2, ncp, log = FALSE)
Basically, the error comes from applying mutate to the function df rather than to a data object.
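A quick way to see what df refers to before you create it yourself:
class(df)
# [1] "function"
environment(df)
# <environment: namespace:stats>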
Checking on a fresh R session with no objects created
df %>%
  dplyr::mutate(YEAR = lubridate::year(date),
                MONTH = lubridate::month(date),
                WEEKDAY = lubridate::wday(date),
                P_CA = CA / Total,
                P_TX = TX / Total,
                P_WI = WI / Total)
Error in UseMethod("mutate_") :
no applicable method for 'mutate_' applied to an object of class "function"
Now, we define 'df' as
df <- WalData
df %>%
  dplyr::mutate(YEAR = lubridate::year(date),
                MONTH = lubridate::month(date),
                WEEKDAY = lubridate::wday(date),
                P_CA = CA / Total,
                P_TX = TX / Total,
                P_WI = WI / Total)
# A tibble: 6 x 15
# CA TX WI Total date event_type snap_CA snap_TX snap_WI YEAR MONTH WEEKDAY P_CA P_TX P_WI
# <dbl> <dbl> <dbl> <dbl> <dttm> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 11047 7381 6984 25412 2011-02-01 00:00:00 NA 1 1 0 2011 2 3 0.435 0.290 0.275
#2 9925 5912 3309 19146 2011-02-02 00:00:00 NA 1 0 1 2011 2 4 0.518 0.309 0.173
#3 11322 9006 8883 29211 2011-02-03 00:00:00 NA 1 1 1 2011 2 5 0.388 0.308 0.304
#4 12251 6226 9533 28010 2011-02-04 00:00:00 NA 1 0 0 2011 2 6 0.437 0.222 0.340
#5 16610 9440 11882 37932 2011-02-05 00:00:00 NA 1 1 1 2011 2 7 0.438 0.249 0.313
#6 14696 9376 8664 32736 2011-02-06 00:00:00 Sporting 1 1 1 2011 2 1 0.449 0.286 0.265

Merge two data frames based on multiple columns in R

I have two data frames that look like this:
data frame 1:
P.X value
OOPA 5
POKA 4
JKIO 3
KOPP 1
data frame 2:
P.X.1  P.X.2  P.X.3  P.X.4  mass
       JKIO   UIX    HOP      56
              CX     OOPA     44
EDD           POKA            13
KOPP   FOSI                   11
and I want to merge the two data frames based on P.X in df1 and P.X.1, P.X.2, P.X.3, P.X.4 in df2. So if the JKIO in P.X.2 appears in P.X, then merge them into a new data frame in the same row as JKIO, 3, 56, as below:
data frame new:
P.X value mass
OOPA 5 44
POKA 4 13
JKIO 3 56
KOPP 1 11
Do you know how I can do it, maybe with something like
merge(df1, df2, by = (P.X == P.X.1 | P.X.2 | P.X.3 | P.X.4))
?
The following is one way to achieve your goal. You want to convert df2 to long format and keep only the rows with non-empty values. Once you have this data, you merge df1 with the updated df2.
library(dplyr)
library(tidyr)
left_join(df1,
          pivot_longer(df2, cols = P.X.1:P.X.4, names_to = "foo",
                       values_to = "P.X") %>%
            filter(nchar(P.X) > 0),
          by = "P.X") %>%
  select(-foo)
P.X value mass
1 OOPA 5 44
2 POKA 4 13
3 JKIO 3 56
4 KOPP 1 11
DATA
df1 <- structure(list(P.X = c("OOPA", "POKA", "JKIO", "KOPP"), value = c(5L,
4L, 3L, 1L)), class = "data.frame", row.names = c(NA, -4L))
df2 <- structure(list(P.X.1 = c("", "", "EDD", "KOPP"), P.X.2 = c("JKIO",
"", "", "FOSI"), P.X.3 = c("UIX", "CX", "POKA", ""), P.X.4 = c("HOP",
"OOPA", "", ""), mass = c(56, 44, 13, 11)), row.names = c(NA,
-4L), class = c("tbl_df", "tbl", "data.frame"))
You could also just do (this relies on the rows of df2 already being in the same order as the rows of df1):
df_new <- cbind(df1, df2[,5])

How to append 2 data sets one below the other having slightly different column names?

Data set1:
ID Name Territory Sales
1 Richard NY 59
8 Sam California 44
Data set2:
Terr ID Name Comments
LA 5 Rick yes
MH 11 Oly no
I want the final data set to have the columns of the 1st data set only, to recognize that Territory is the same as Terr, and not to bring forward the Comments column.
Final data should look like:
ID Name Territory Sales
1 Richard NY 59
8 Sam California 44
5 Rick LA NA
11 Oly MH NA
Thanks in advance
A possible solution:
# create a named vector with names from 'set2'
# with the positions of the matching columns in 'set1'
nms2 <- sort(unlist(sapply(names(set2), agrep, x = names(set1))))
# only keep the columns in 'set2' for which a match is found
# and give them the same names as in 'set1'
set2 <- setNames(set2[names(nms2)], names(set1[nms2]))
# bind the two dataset together
# option 1:
library(dplyr)
bind_rows(set1, set2)
# option 2:
library(data.table)
rbindlist(list(set1, set2), fill = TRUE)
which gives (dplyr-output shown):
ID Name Territory Sales
1 1 Richard NY 59
2 8 Sam California 44
3 5 Rick LA NA
4 11 Oly MH NA
Used data:
set1 <- structure(list(ID = c(1L, 8L),
Name = c("Richard", "Sam"),
Territory = c("NY", "California"),
Sales = c(59L, 44L)),
.Names = c("ID", "Name", "Territory", "Sales"), class = "data.frame", row.names = c(NA, -2L))
set2 <- structure(list(Terr = c("LA", "MH"),
ID = c(5L, 11L),
Name = c("Rick", "Oly"),
Comments = c("yes", "no")),
.Names = c("Terr", "ID", "Name", "Comments"), class = "data.frame", row.names = c(NA, -2L))

Transpose dplyr::tbl object

I am using src_postgres to connect and the dplyr::tbl function to fetch data from a Redshift database. I have applied some filters and a top function to it using dplyr itself. Now my data looks as below:
riid day hour
<dbl> <chr> <chr>
1 5542. "THURSDAY " 12
2 5862. "FRIDAY " 15
3 5982. "TUESDAY " 15
4 6022. WEDNESDAY 16
My final output should be as below:
riid  MON  TUES  WED  THUR  FRI  SAT  SUN
5542              12
5862                    15
5988        15
6022              16
I have tried spread. It throws the below error because of the class type:
Error in UseMethod("spread_") : no applicable method for 'spread_'
applied to an object of class "c('tbl_dbi', 'tbl_sql', 'tbl_lazy',
'tbl')"
Since this is a really big table, I do not want to use dataframe as it takes a longer time.
I was able to do it as below:
df_mon <- df2 %>% filter(day == 'MONDAY') %>% mutate(MONDAY = hour) %>% select(riid,MONDAY)
df_tue <- df2 %>% filter(day == 'TUESDAY') %>% mutate(TUESDAY = hour) %>% select(riid,TUESDAY)
df_wed <- df2 %>% filter(day == 'WEDNESDAY') %>% mutate(WEDNESDAY = hour) %>% select(riid,WEDNESDAY)
df_thu <- df2 %>% filter(day == 'THURSDAY') %>% mutate(THURSDAY = hour) %>% select(riid,THURSDAY)
df_fri <- df2 %>% filter(day == 'FRIDAY') %>% mutate(FRIDAY = hour) %>% select(riid,FRIDAY)
Is it possible to write all above in one statement?
Any help to transpose this in a faster manner is really appreciated.
EDIT
Adding the dput of the tbl object:
structure(list(src = structure(list(con = <S4 object of class structure("PostgreSQLConnection", package = "RPostgreSQL")>,
disco = <environment>), .Names = c("con", "disco"), class = c("src_dbi",
"src_sql", "src")), ops = structure(list(name = "select", x = structure(list(
name = "filter", x = structure(list(name = "filter", x = structure(list(
name = "group_by", x = structure(list(x = structure("SELECT riid,day,hour,sum(weightage) AS score FROM\n (SELECT riid,day,hour,\n POWER(2,(cast(datediff (seconds,convert_timezone('UTC','PKT',SYSDATE),TO_DATE(TO_CHAR(event_captured_dt,'mm/dd/yyyy hh24:mi:ss'),'mm/dd/yyyy hh24:mi:ss')) as decimal) / cast(7862400 as decimal))) AS weightage\n FROM (\n SELECT riid,convert_timezone('GMT','PKT',event_captured_dt) AS EVENT_CAPTURED_DT,\n TO_CHAR(convert_timezone('GMT','PKT',event_captured_dt),'DAY') AS day,\n TO_CHAR(convert_timezone('GMT','PKT',event_captured_dt),'HH24') AS hour\n FROM Zameen_STO_DATA WHERE EVENT_CAPTURED_DT >= TO_DATE((sysdate -30),'yyyy-mm-dd') and LIST_ID = 4282\n )) group by riid,day,hour", class = c("sql",
"character")), vars = c("riid", "day", "hour", "score"
)), .Names = c("x", "vars"), class = c("op_base_remote",
"op_base", "op")), dots = structure(list(riid = riid,
day = day), .Names = c("riid", "day")), args = structure(list(
add = FALSE), .Names = "add")), .Names = c("name",
"x", "dots", "args"), class = c("op_group_by", "op_single",
"op")), dots = structure(list(~min_rank(desc(~score)) <=
1), .Names = ""), args = list()), .Names = c("name",
"x", "dots", "args"), class = c("op_filter", "op_single",
"op")), dots = structure(list(~row_number() == 1), .Names = ""),
args = list()), .Names = c("name", "x", "dots", "args"), class = c("op_filter",
"op_single", "op")), dots = structure(list(~riid, ~day, ~hour), class = "quosures", .Names = c("",
"", "")), args = list()), .Names = c("name", "x", "dots", "args"
), class = c("op_select", "op_single", "op"))), .Names = c("src",
"ops"), class = c("tbl_dbi", "tbl_sql", "tbl_lazy", "tbl"))
I think what you're looking for is the ability to run the tidyr::spread() function against a remote source, or database. I have a PR for dbplyr that attempts to implement that here: https://github.com/tidyverse/dbplyr/pull/72, you can try it out by using: devtools::install_github("tidyverse/dbplyr", ref = devtools::github_pull(72)).
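Until something like that is available, a common workaround is to collect() the already-filtered result and reshape it locally, assuming it is small enough to fit in memory. A sketch with tidyr::pivot_wider (df2 being the lazy tbl from the question):
library(dplyr)
library(tidyr)
df2 %>%
  collect() %>%                  # bring the remote result into R as a tibble
  mutate(day = trimws(day)) %>%  # the day values carry trailing spaces
  pivot_wider(id_cols = riid, names_from = day, values_from = hour)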
Use dcast from the reshape2 package:
> data
# A tibble: 4 x 3
riid day hour
<dbl> <chr> <dbl>
1 1.00 TH 12.0
2 2.00 FR 15.0
3 3.00 TU 15.0
4 4.00 WE 16.0
> dcast(data, riid~day, value.var = "hour")
riid FR TH TU WE
1 1 NA 12 NA NA
2 2 15 NA NA NA
3 3 NA NA 15 NA
4 4 NA NA NA 16
Further, if you want to remove the NAs, then:
> z <- dcast(data, riid~day, value.var = "hour")
> z[is.na(z)] <- ""
> z
riid FR TH TU WE
1 1 12
2 2 15
3 3 15
4 4 16
I tried to combine your multiple line attempts into one. Can you try this and let us know the outcome?
library(dplyr)
df %>%
rowwise() %>%
mutate(Mon = ifelse(day=='MONDAY', hour[day=='MONDAY'], NA),
Tue = ifelse(day=='TUESDAY', hour[day=='TUESDAY'], NA),
Wed = ifelse(day=='WEDNESDAY', hour[day=='WEDNESDAY'], NA),
Thu = ifelse(day=='THURSDAY', hour[day=='THURSDAY'], NA),
Fri = ifelse(day=='FRIDAY', hour[day=='FRIDAY'], NA),
Sat = ifelse(day=='SATURDAY', hour[day=='SATURDAY'], NA),
Sun = ifelse(day=='SUNDAY', hour[day=='SUNDAY'], NA)) %>%
select(-day, -hour)
Output is:
riid Mon Tue Wed Thu Fri Sat Sun
1 5542 NA NA NA 12 NA NA NA
2 5862 NA NA NA NA 15 NA NA
3 5982 NA 15 NA NA NA NA NA
4 6022 NA NA 16 NA NA NA NA
Sample data:
# A tibble: 4 x 3
riid day hour
* <dbl> <chr> <int>
1 5542 THURSDAY 12
2 5862 FRIDAY 15
3 5982 TUESDAY 15
4 6022 WEDNESDAY 16
Update:
Can you try the below approach using data.table?
library(data.table)
dt <- setDT(df)[, c("Mon","Tue","Wed","Thu","Fri","Sat","Sun") :=
list(ifelse(day=='MONDAY', hour[day=='MONDAY'], NA),
ifelse(day=='TUESDAY', hour[day=='TUESDAY'], NA),
ifelse(day=='WEDNESDAY', hour[day=='WEDNESDAY'], NA),
ifelse(day=='THURSDAY', hour[day=='THURSDAY'], NA),
ifelse(day=='FRIDAY', hour[day=='FRIDAY'], NA),
ifelse(day=='SATURDAY', hour[day=='SATURDAY'], NA),
ifelse(day=='SUNDAY', hour[day=='SUNDAY'], NA))][, !c("day","hour"), with=F]
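For what it is worth, data.table also ships its own dcast, so the same reshape can be done natively; a sketch, assuming df is the four-row sample shown above:
library(data.table)
dcast(setDT(df), riid ~ day, value.var = "hour")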
