I have this dataset
Book2 <- structure(list(meanX3 = c(21.66666667, 21.66666667, 11, 25, 240.3333333
), meanX1 = c(23, 34.5, 10, 25, 233.5), meanX2 = c(24.5, 26.5,
20, 25, 246.5), to_select = structure(c(3L, 1L, 2L, 1L, 1L), .Label = c("meanX1",
"meanX2", "meanX3"), class = "factor"), selected = c(NA, NA,
NA, NA, NA)), .Names = c("meanX3", "meanX1", "meanX2", "to_select",
"selected"), class = "data.frame", row.names = c(NA, -5L))
I want to get the coresponding row value for the column name on variable to_select .
I have tried
Book2 %>% dplyr::mutate(selected=.[paste0(to_select)])
But it returns all the column values. How can I go about to get a data set like
structure(list(meanX3 = c(21.66666667, 21.66666667, 11, 25, 240.3333333
), meanX1 = c(23, 34.5, 10, 25, 233.5), meanX2 = c(24.5, 26.5,
20, 25, 246.5), to_select = structure(c(3L, 1L, 2L, 1L, 1L), .Label = c("meanX1",
"meanX2", "meanX3"), class = "factor"), selected = c(21.66, 34.5,
20, 25, 240.33)), .Names = c("meanX3", "meanX1", "meanX2", "to_select",
"selected"), class = "data.frame", row.names = c(NA, -5L))
With base R, a safe strategy would be something like
cols <- as.character(unique(Book2$to_select))
row_col <- match(Book2$to_select, cols)
idx <- cbind(seq_along(Book2$to_select), row_col)
selected <- Book2[, cols][idx]
Book2$selected <- selected
Or using tidyverse packages, something like
library(tidyverse)
Book2 %>% mutate(row=1:n()) %>%
gather(prop, val, meanX3:meanX2) %>%
group_by(row) %>%
mutate(selected=val[to_select==prop]) %>%
spread(prop, val) %>% select(-row)
Would be a decent strategy.
One way is to group by row using rowwise() and then get the value of the string in 'to_select' column
Book2 %>%
rowwise() %>%
mutate(selected = get(as.character(to_select)))
# A tibble: 5 × 5
# meanX3 meanX1 meanX2 to_select selected
# <dbl> <dbl> <dbl> <fctr> <dbl>
#1 21.66667 23.0 24.5 meanX3 21.66667
#2 21.66667 34.5 26.5 meanX1 34.50000
#3 11.00000 10.0 20.0 meanX2 20.00000
#4 25.00000 25.0 25.0 meanX1 25.00000
#5 240.33333 233.5 246.5 meanX1 233.50000
In base R you can use match to select the desired column and then matrix subsetting to select the particular element for each row like this
Book2$selected <- as.numeric(Book2[cbind(seq_len(nrow(Book2)),
match(Book2$to_select, names(Book2)))])
Related
I have a dataset that I want to pivot.
dataset <- data.frame(date = c("01/01/2020","02/01/2020", "02/01/2020", "03/01/2020")
, camp_type = c("acquisition", "acquisition", "newsletter", "acquisition")
, channel_type = c("email", "direct_mail","email","email")
, sent = c(100, 200, 50, 250)
, open = c(30, NA, 14, 148)
, click = c(14, NA, 1, 100)
)
PLEASE NOTE: I have many more camp_types than the ones displayed in this example.
I want to get one row per day, and the rest of the information in different columns such as the picture below (renaming the columns "sent", "open" and "click" based on "channel_type" and "camp_type").
I have tried something not very elegant, and entirely manual, but I get an error when I rename the variables (code below)
dataset %>%
filter(camp_type == 'Acquisition' & channel_type == 'direct_mail') %>%
rename (dm_acq_sent = sent
, dm_acq_open = open
, dm_acq_click = clicked
)
The problem with this code above is that (once I fix the renaming issue) it will be heavily manual because I have to repeat the same chunk of code several times and needs that someone regularly checks that there are no more combinations of camp_type and channel_type.
Any help / advise will be massively appreciated.
With tidyr you can use pivot_wider:
library(tidyr)
pivot_wider(df, id_cols = date, names_from = c(camp_type, channel_type), values_from = c(sent, open, click))
Output
# A tibble: 3 x 10
date sent_acquisition… sent_acquisition_… sent_newsletter_… open_acquisitio… open_acquisition… open_newsletter… click_acquisiti… click_acquisitio… click_newslette…
<date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2020-01-01 100 NA NA 30 NA NA 14 NA NA
2 2020-02-01 NA 200 50 NA NA 14 NA NA 1
3 2020-03-01 250 NA NA 148 NA NA 100 NA NA
Data
df <- structure(list(date = structure(c(18262, 18293, 18293, 18322), class = "Date"),
camp_type = structure(c(1L, 1L, 2L, 1L), .Label = c("acquisition",
"newsletter"), class = "factor"), channel_type = structure(c(2L,
1L, 2L, 2L), .Label = c("direct_email", "email"), class = "factor"),
sent = c(100, 200, 50, 250), open = c(30, NA, 14, 148), click = c(14,
NA, 1, 100)), class = "data.frame", row.names = c(NA, -4L
))
Data contains a column "date range" that contains 2 months i.e. Oct 31,2019-Nov 30,2019 (November) and Dec 1,2019-Dec 31, 2019(December). Need to separate them in different columns under Post Period (December) and Pre Period (October) wrt to column "Revenue". I want to automate this process when I upload a file comparing any 2 months. Earlier month under "Pre Period" and later under "Post Period". Attached an example excel screenshot of the raw data and the processed data.
x<-data.frame("A"=c("book","mobile","tablet","desktop"),
"B"=c("new york","chicago","london","paris"),
"Date.Range"=c("Oct 31,2019-Nov 30,2019","Oct 31,2019-Nov 30,2019","Dec 1,2019-Dec 31, 2019","Dec 1,2019-Dec 31, 2019"),
"Revenue"=c(542,837,1234,846))
dput(x)
structure(list(A = structure(c(1L, 3L, 4L, 2L), .Label = c("book",
"desktop", "mobile", "tablet"), class = "factor"), B = structure(c(3L,
1L, 2L, 4L), .Label = c("chicago", "london", "new york", "paris"
), class = "factor"), Date.Range = structure(c(2L, 2L, 1L, 1L
), .Label = c("Dec 1,2019-Dec 31, 2019", "Oct 31,2019-Nov 30,2019"
), class = "factor"), Revenue = c(542, 837, 1234, 846)), class = "data.frame", row.names = c(NA,
-4L))
Raw Data.
Processed Data.
Using base R's reshape function:
df = reshape(data = x,idvar = c("A","B"),direction = "wide",timevar = "DateRange")
colnames(df)=c("A","B","pre","post")
We can extract one date from Date.Range, arrange the data according to it, create a new period column and get the data in wide format.
library(dplyr)
x %>%
mutate(date = lubridate::mdy(sub("-.*", "", Date.Range))) %>%
arrange(date) %>%
mutate(period = rep(c("pre", "post"), each = 2)) %>%
tidyr::pivot_wider(names_from = period, values_from = Revenue,
values_fill = list(Revenue = 0)) %>%
select(-date)
# A tibble: 4 x 5
# A B Date.Range pre post
# <fct> <fct> <fct> <dbl> <dbl>
#1 book new york Oct 31,2019-Nov 30,2019 542 0
#2 mobile chicago Oct 31,2019-Nov 30,2019 837 0
#3 tablet london Dec 1,2019-Dec 31, 2019 0 1234
#4 desktop paris Dec 1,2019-Dec 31, 2019 0 846
I'm trying to generalize this chunk of code:
trimmedMeans %>%
mutate(Expectation_mean = paste(format(Expectation_mean, digits = 2, nsmall = 2),
"±",
format(Expectation_sd, digits = 2, nsmall = 2)),
Interesting_mean = paste(format(Interesting_mean, digits = 2, nsmall = 2),
"±",
format(Interesting_sd, digits = 2, nsmall = 2)),
Useful_mean = paste(format(Useful_mean, digits = 2, nsmall = 2),
"±",
format(Useful_sd, digits = 2, nsmall = 2)),
OralPresentation_mean = paste(format(OralPresentation_mean, digits = 2, nsmall = 2),
"±",
format(OralPresentation_sd, digits = 2, nsmall = 2))
)
I'm trying to do this:
paste.Mean.Sd <- function(m, s){
paste(format(m, digits = 2, nsmall = 2),
"±",
format(s, digits = 2, nsmall = 2)) }
trimmedMeans2 <- trimmedMeans %>%
mutate_at(vars(contains('_mean')), funs(paste.Mean.Sd(
vars(contains('_mean')), vars(contains('_sd'))
)) )
What I'm getting is something like this:
What I expected to have is this:
What am I missing?
EDIT 1
This code gives me the right result for the "left part" (mean) of the string, not for the SD part:
trimmedMeans %>%
mutate_at(vars(contains('_mean')), funs(paste.Mean.Sd(., str_replace(., "_mean", "_sd"))))
EDIT 2
The following is the code to reproduce the dataframe I used:
trimmedMeans <- structure(list(TrackName = structure(c(2L, 2L, 2L, 2L, 2L, 2L
), .Label = c("Llytse", "Mneshe", "Phrypa", "Veormi"), class = "factor"),
SpeakerName = c("Delta Shelby", "Irvine Fairburn", "Kristine Harland",
"Paislee Jež", "Rhianna Clarke", "Spencer Hargrave"), NumOfVoters = c(15L,
14L, 5L, 14L, 17L, 19L), Expectation_mean = c(4.6, 5, 4.2,
4.07142857142857, 4.41176470588235, 4.73684210526316), Interesting_mean = c(4.46666666666667,
5.5, 5, 4.78571428571429, 5.05882352941176, 5.57894736842105
), Useful_mean = c(4.6, 5.14285714285714, 4.6, 4.28571428571429,
4.52941176470588, 5.42105263157895), OralPresentation_mean = c(4.33333333333333,
5.28571428571429, 5.4, 4.85714285714286, 5.17647058823529,
5.52631578947368), Expectation_sd = c(0.736788397613007,
0.784464540552736, 0.836660026534076, 0.474631146549323,
0.870260272089029, 0.561951486949016), Interesting_sd = c(0.639940473422184,
0.518874521662771, 0.707106781186548, 0.801783725737273,
0.747545001596402, 0.507257273501788), Useful_sd = c(0.9102589898328,
1.02710518202619, 0.894427190999916, 0.913873533463375, 1.06757008311068,
0.507257273501788), OralPresentation_sd = c(0.975900072948533,
0.825420305855557, 0.547722557505166, 0.864437821507567,
0.63593377383646, 0.611775290321498)), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -6L), vars = c("TrackName",
"SpeakerName"), drop = TRUE, indices = list(0L, 1L, 2L, 3L, 4L,
5L), group_sizes = c(1L, 1L, 1L, 1L, 1L, 1L), biggest_group_size = 1L, labels = structure(list(
TrackName = structure(c(2L, 2L, 2L, 2L, 2L, 2L), .Label = c("Llytse",
"Mneshe", "Phrypa", "Veormi"), class = "factor"), SpeakerName = c("Delta Shelby",
"Irvine Fairburn", "Kristine Harland", "Paislee Jež", "Rhianna Clarke",
"Spencer Hargrave")), class = "data.frame", row.names = c(NA,
-6L), vars = c("TrackName", "SpeakerName"), drop = TRUE, .Names = c("TrackName",
"SpeakerName")), .Names = c("TrackName", "SpeakerName", "NumOfVoters",
"Expectation_mean", "Interesting_mean", "Useful_mean", "OralPresentation_mean",
"Expectation_sd", "Interesting_sd", "Useful_sd", "OralPresentation_sd"
))
I found your approach to be challenging, even after reading the Programming with dplyr vignette. Instead, I used tidyr to gather() and spread() the data to get your desired result, which was more intuitive to me.
library(tidyr)
trimmedMeans %>%
gather(key, value, -TrackName, -SpeakerName, -NumOfVoters) %>%
mutate_at('value', format, digits = 2, nsmall = 2) %>%
separate(key, c('var', 'key')) %>%
group_by(SpeakerName, var) %>%
spread(key, value) %>%
group_by(SpeakerName) %>%
unite(value, mean, sd, sep = " ± ") %>%
mutate(var = paste0(var, "_sd")) %>%
spread(var, value)
# A tibble: 6 x 7
# Groups: SpeakerName [6]
TrackName SpeakerName NumOfVoters Expectation_sd Interesting_sd
<fct> <chr> <int> <chr> <chr>
1 Mneshe Delta Shel… 15 4.60 ± 0.74 4.47 ± 0.64
2 Mneshe Irvine Fai… 14 5.00 ± 0.78 5.50 ± 0.52
3 Mneshe Kristine H… 5 4.20 ± 0.84 5.00 ± 0.71
4 Mneshe Paislee Jež 14 4.07 ± 0.47 4.79 ± 0.80
5 Mneshe Rhianna Cl… 17 4.41 ± 0.87 5.06 ± 0.75
6 Mneshe Spencer Ha… 19 4.74 ± 0.56 5.58 ± 0.51
# ... with 2 more variables: OralPresentation_sd <chr>,
# Useful_sd <chr>
I solved in the meantime with this trick:
for (characteristic in speaker.characteristcs) {
characteristic_str <- paste0(characteristic, "_str")
trimmedMeans[characteristic_str] <-
trimmedMeans %>% ungroup() %>% select( contains(characteristic) ) %>%
tidyr::unite()
}
paste.Mean.Sd <- function(s){
paste(format(as.numeric(strsplit(s, "\\_")[[1]][1]), digits = 2, nsmall = 2),
"±",
format(as.numeric(strsplit(s, "\\_")[[1]][2]), digits = 2, nsmall = 2)) }
trimmedMeans %>%
mutate_at(vars(contains('_str')),
funs(paste.Mean.Sd(.))) %>%
ungroup() %>%
select(SpeakerName, NumOfVoters, contains('_str')) %>%
I don't know if it's possible to get the result with a single statement, using dplyr programming features.
I have two data.tables A:
contract.name contract.start contract.end price
Q1-2019 2019-01-01 2019-04-01 10
Q2-2019 2019-04-01 2019-07-01 12
Q3-2019 2019-07-01 2019-10-01 11
Q4-2019 2019-10-01 2020-01-01 13
and B:
contract delivery.begin delivery.end bid ask
Q2-2018 2018-04-01 2018-06-30 9.8 10.5
Q3-2018 2018-07-01 2018-09-30 11.5 12.1
Q4-2018 2018-10-01 2018-12-31 10.5 11.3
Q1-2019 2019-01-01 2019-03-31 12.8 13.5
I want a vector with the bid values from B ordered by the contract.name values from A like so:
bid = c(12.8, 0, 0, 0)
df1 %>%
left_join(df2, by=c("contract.name"="contract")) %>%
select(bid) %>%
replace_na(list(bid=0)) %>%
as.character()
Output is:
"c(12.8, 0, 0, 0)"
Sample data:
df1 <- structure(list(contract.name = c("Q1-2019", "Q2-2019", "Q3-2019",
"Q4-2019"), contract.start = c("2019-01-01", "2019-04-01", "2019-07-01",
"2019-10-01"), contract.end = c("2019-04-01", "2019-07-01", "2019-10-01",
"2020-01-01"), price = c(10L, 12L, 11L, 13L)), .Names = c("contract.name",
"contract.start", "contract.end", "price"), class = "data.frame", row.names = c(NA,
-4L))
df2 <- structure(list(contract = c("Q2-2018", "Q3-2018", "Q4-2018",
"Q1-2019"), delivery.begin = c("2018-04-01", "2018-07-01", "2018-10-01",
"2019-01-01"), delivery.end = c("2018-06-30", "2018-09-30", "2018-12-31",
"2019-03-31"), bid = c(9.8, 11.5, 10.5, 12.8), ask = c(10.5,
12.1, 11.3, 13.5)), .Names = c("contract", "delivery.begin",
"delivery.end", "bid", "ask"), class = "data.frame", row.names = c(NA,
-4L))
library(data.table)
DT.A <- data.table(structure(list(contract.name = structure(1:4, .Label = c("Q1-2019",
"Q2-2019", "Q3-2019", "Q4-2019"), class = "factor"), contract.start = structure(1:4, .Label = c("2019-01-01",
"2019-04-01", "2019-07-01", "2019-10-01"), class = "factor"),
contract.end = structure(1:4, .Label = c("2019-04-01", "2019-07-01",
"2019-10-01", "2020-01-01"), class = "factor"), price = c(10L,
12L, 11L, 13L)), .Names = c("contract.name", "contract.start",
"contract.end", "price"), class = "data.frame", row.names = c(NA,
-4L)))
DT.B <- data.table(structure(list(contract = structure(c(2L, 3L, 4L, 1L), .Label = c("Q1-2019",
"Q2-2018", "Q3-2018", "Q4-2018"), class = "factor"), delivery.begin = structure(1:4, .Label = c("2018-04-01",
"2018-07-01", "2018-10-01", "2019-01-01"), class = "factor"),
delivery.end = structure(1:4, .Label = c("2018-06-30", "2018-09-30",
"2018-12-31", "2019-03-31"), class = "factor"), bid = c(9.8,
11.5, 10.5, 12.8), ask = c(10.5, 12.1, 11.3, 13.5)), .Names = c("contract",
"delivery.begin", "delivery.end", "bid", "ask"), class = "data.frame", row.names = c(NA,
-4L)))
# Get vector of contract names
orderVals <- DT.A$contract.name
# Key table B by contract
setkey(DT.B, contract)
# Extract rows from table B with the specified key values
output <- DT.B[.(orderVals)]
# Change the values where there was no match from NA to 0
output[is.na(bid), bid := 0]
# Get desired vector
output$bid
You can do:
library("data.table")
A <- fread(
"contract.name contract.start contract.end price
Q1-2019 2019-01-01 2019-04-01 10
Q2-2019 2019-04-01 2019-07-01 12
Q3-2019 2019-07-01 2019-10-01 11
Q4-2019 2019-10-01 2020-01-01 13")
B <- fread(
"contract delivery.begin delivery.end bid ask
Q2-2018 2018-04-01 2018-06-30 9.8 10.5
Q3-2018 2018-07-01 2018-09-30 11.5 12.1
Q4-2018 2018-10-01 2018-12-31 10.5 11.3
Q1-2019 2019-01-01 2019-03-31 12.8 13.5")
setnames(B, "contract", "contract.name")
A[B, on="contract.name", bid:=bid][, ifelse(is.na(bid), 0, bid)]
# > A[B, on="contract.name", bid:=bid][, ifelse(is.na(bid), 0, bid)]
# [1] 12.8 0.0 0.0 0.0
or (a variant without ifelse()):
setnames(B, "contract", "contract.name")
A[B, on="contract.name", bid:=bid]
A[is.na(bid), bid:=0][, bid]
I need to add a Thickness column to my Products table based on multiple conditions.
1 : Thickness should be only one of these values
Plate_Thickness <- c(5.8,25.1,27.1,32.5,55.6,98.1,120.4)
2 : Thickness should be between the ThicknessMin and ThicknessMax values already existing in table.
Current table looks like this:
Product ThicknessMin ThicknessMax
P0001 0 8
P0002 31.01 70
P0003 8.01 31
P0004 70.01 999
P0005 8.01 31
So, the idea is to pick a value for Thickness from the vector randomly but it should be between the ThicknessMin and ThicknessMax. Please help with any pointers how to go about this. Thanks.
A vectorized base R solution (df is your data.frame):
set.seed(1) #just for reproducibility
a<-findInterval(df$ThicknessMin,Plate_Thickness,all.inside=TRUE)
b<-findInterval(df$ThicknessMax,Plate_Thickness,all.inside=TRUE)
Plate_Thickness[runif(length(a)) %/% (1/(b-a+1))+a]
#[1] 5.8 32.5 25.1 98.1 5.8
Your data
Plate_Thickness <- c(5.8,25.1,27.1,32.5,55.6,98.1,120.4)
df <- structure(list(Product = c("P0001", "P0002", "P0003", "P0004",
"P0005"), ThicknessMin = c(0, 31.01, 8.01, 70.01, 8.01), ThicknessMax = c(8L,
70L, 31L, 999L, 31L), Plate_Thickness = c(5.8, 32.5, 27.1, 120.4,
25.1)), .Names = c("Product", "ThicknessMin", "ThicknessMax",
"Plate_Thickness"), row.names = c(NA, -5L), class = c("data.table",
"data.frame"))
solution
library(dplyr)
acceptable_vals <- lapply(1:nrow(df), function(x) Plate_Thickness[between(Plate_Thickness, df$ThicknessMin[x], df$ThicknessMax[x])])
set.seed(1)
df$Plate_Thickness <- sapply(acceptable_vals, function(x) x[sample(1:length(x), 1)])
Output
Product ThicknessMin ThicknessMax Plate_Thickness
1: P0001 0.00 8 5.8
2: P0002 31.01 70 32.5
3: P0003 8.01 31 27.1
4: P0004 70.01 999 120.4
5: P0005 8.01 31 25.1
We can use the rowwise function from the dplyr package to sample from the Plate_Thickness vector. Within the call to sample, we sample only from elements of Plate_Thickness which are between ThicknessMin and ThicknessMax. I put your table in a data.frame called dat:
library(dplyr)
set.seed(123)
dat %>%
rowwise() %>%
mutate(thick_sample = sample(Plate_Thickness[between(Plate_Thickness, ThicknessMin, ThicknessMax)],
1))
Product ThicknessMin ThicknessMax thick_sample
<fctr> <dbl> <int> <dbl>
1 P0001 0.00 8 2.0
2 P0002 31.01 70 55.6
3 P0003 8.01 31 25.1
4 P0004 70.01 999 120.4
5 P0005 8.01 31 27.1
Data (for reproducibility)
dat <- structure(list(Product = structure(1:5, .Label = c("P0001", "P0002",
"P0003", "P0004", "P0005"), class = "factor"), ThicknessMin = c(0,
31.01, 8.01, 70.01, 8.01), ThicknessMax = c(8L, 70L, 31L, 999L,
31L)), .Names = c("Product", "ThicknessMin", "ThicknessMax"), class = "data.frame", row.names = c(NA,
-5L))
#DATA
df = structure(list(Product = c("P0001", "P0002", "P0003", "P0004",
"P0005"), ThicknessMin = c(0, 31.01, 8.01, 70.01, 8.01), ThicknessMax = c(8L,
70L, 31L, 999L, 31L)), .Names = c("Product", "ThicknessMin",
"ThicknessMax"), class = c("data.table", "data.frame"), row.names = c(NA,
-5L))
Plate_Thickness = c(5.8,25.1,27.1,32.5,55.6,98.1,120.4)
set.seed(1)
apply(X = df[c("ThicknessMin", "ThicknessMax")],
MARGIN = 1, #Run FUN on each row of X
FUN = function(x)
#Retain only eligible values for each row and sample 1 value
sample(x = Plate_Thickness[Plate_Thickness > x[1] & Plate_Thickness < x[2]],
size = 1))
#[1] 2.0 32.5 27.1 120.4 25.1