Tidyverse change values based on name - r

I have a dataframe as follows
library(tidyverse)
library(tidymodels)
#df <- read_csv("C:\\Users\\omarl\\OneDrive\\Escritorio\\games.csv")
df <- structure(list(gameId = 3326086514, creationTime = 1504279457970,
gameDuration = 1949, seasonId = 9, winner = 1, firstBlood = 2,
firstTower = 1, firstInhibitor = 1, firstBaron = 1, firstDragon = 1,
firstRiftHerald = 2, t1_champ1id = 8, t1_champ1_sum1 = 12,
t1_champ1_sum2 = 4, t1_champ2id = 432, t1_champ2_sum1 = 3,
t1_champ2_sum2 = 4, t1_champ3id = 96, t1_champ3_sum1 = 4,
t1_champ3_sum2 = 7, t1_champ4id = 11, t1_champ4_sum1 = 11,
t1_champ4_sum2 = 6, t1_champ5id = 112, t1_champ5_sum1 = 4,
t1_champ5_sum2 = 14, t1_towerKills = 11, t1_inhibitorKills = 1,
t1_baronKills = 2, t1_dragonKills = 3, t1_riftHeraldKills = 0,
t1_ban1 = 92, t1_ban2 = 40, t1_ban3 = 69, t1_ban4 = 119,
t1_ban5 = 141, t2_champ1id = 104, t2_champ1_sum1 = 11, t2_champ1_sum2 = 4,
t2_champ2id = 498, t2_champ2_sum1 = 4, t2_champ2_sum2 = 7,
t2_champ3id = 122, t2_champ3_sum1 = 6, t2_champ3_sum2 = 4,
t2_champ4id = 238, t2_champ4_sum1 = 14, t2_champ4_sum2 = 4,
t2_champ5id = 412, t2_champ5_sum1 = 4, t2_champ5_sum2 = 3,
t2_towerKills = 5, t2_inhibitorKills = 0, t2_baronKills = 0,
t2_dragonKills = 1, t2_riftHeraldKills = 1, t2_ban1 = 114,
t2_ban2 = 67, t2_ban3 = 43, t2_ban4 = 16, t2_ban5 = 51), row.names = c(NA,
-1L), class = c("tbl_df", "tbl", "data.frame"))
df <- df %>%
mutate(winner = ifelse(winner == 1, "team1", "team2")) %>%
mutate(firstBlood = ifelse(firstBlood == 1, "team1", "team2")) %>%
mutate(firstTower = ifelse(firstTower == 1, "team1", "team2")) %>%
mutate(firstInhibitor = ifelse(firstInhibitor == 1, "team1", "team2")) %>%
mutate(firstBaron = ifelse(firstBaron == 1, "team1", "team2")) %>%
mutate(firstDragon = ifelse(firstDragon == 1, "team1", "team2")) %>%
mutate(firstRiftHerald = ifelse(firstRiftHerald == 1, "team1", "team2")) %>%
select(-gameId, -creationTime) %>%
filter(seasonId == 9) %>%
select(gameDuration, winner, firstBlood, firstTower, firstInhibitor, firstBaron, firstDragon,
firstRiftHerald)
As you can see, mutate is really redundant here, because I'm copying the code for every variable. Is there any way to apply the ifelse to columns that start with first, t1, etc. programatically?

You may try
library(dplyr)
df %>%
mutate(across(starts_with("t1")|starts_with("first"), ~ifelse(.x == 1, "team1", "team2")))

Park gave a best (one liner) solution. But if you want to look at some other options, here is how we can do it via using some other functions in dplyr:
df %>%
gather(key, value, firstBlood:t1_ban5) %>%
mutate(value = ifelse(value == 1, "team1", "team2")) %>%
spread(key, value) %>%
select(-gameId, -creationTime) %>%
filter(seasonId == 9) %>%
select(gameDuration, winner, firstBlood, firstTower, firstInhibitor, firstBaron, firstDragon,
firstRiftHerald)

Related

how do i create a bar chart to compare pre and post scores between participants?

I am trying to create a bar chart or column chart plot to compare pre and post scores between participants. I managed to do this in a line graph, however, I am struggling to visualise this within a bar chart, can anyone help me with this?
Here is the data I am using:
structure(list(Participant = c(2, 3, 5, 7), PRE_QUIP_RS = c(24,
24, 20, 20), POST_QUIP_RS = c(10, 23, 24, 14), PRE_PDQ8 = c(11,
8, 10, 4), POST_PDQ8 = c(7, 7, 9, 4), PRE_GDS = c(1, 7, 1, 0),
POST_GDS = c(1, 4, 2, 0), PRE_PERSISTENT = c(9, 13, 6, 2),
POST_PERSISTENT = c(9, 13, 11, 3), PRE_EPISODIC = c(3, 4,
2, 0), POST_EPISODIC = c(2, 5, 6, 2), PRE_AVOIDANCE = c(6,
3, 0, 2), POST_AVOIDANCE = c(3, 3, 4, 1), PRE_IPQ = c(39,
48, 40, 37), POST_IPQ = c(16, 44, 30, 17), PRE_GSE = c(28,
31, 36, 29), POST_GSE = c(29, 30, 30, 29), PRE_BCI = c(11,
9, 5, 3), POST_BCI = c(3, 15, 0, 0)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -4L))
In terms of how I roughly want it to look, I want the bars to be placed together for pre and post for each participant, kind of like this:
You may try
library(tidyverse)
df %>%
select(Participant, PRE_QUIP_RS, POST_QUIP_RS) %>%
pivot_longer(cols = c(PRE_QUIP_RS, POST_QUIP_RS), names_to = "group") %>%
mutate(group = str_split(group, "_", simplify = T)[,1],
Participant = as.factor(Participant)) %>%
ggplot(aes(x = Participant, y = value, group = group, fill = group)) +
geom_col(position = "dodge")
PRE POST order
dummy %>%
select(Participant, PRE_QUIP_RS, POST_QUIP_RS) %>%
pivot_longer(cols = c(PRE_QUIP_RS, POST_QUIP_RS), names_to = "group") %>%
mutate(group = str_split(group, "_", simplify = T)[,1] %>%
factor(., levels = c("PRE", "POST")), # HERE
Participant = as.factor(Participant)) %>%
ggplot(aes(x = Participant, y = value, group = group, fill = group)) +
geom_col(position = "dodge")

Using add_header_row from flextable to create columns of varying widths

I have data as follows:
dat <- structure(list(rn = c("type_A", "type_B", "type_C"
), freq = list(c(0, 0, 0, 5, 7, 16, 28), c(2, 1, 0, 5, 0, 8),
c(0, 0, 3, 5, 12, 53, 73)), colspan = list(c(`25` = 1, `100` = 2,
`250` = 1, `500` = 1, `1000` = 1, Infinity = 3, SUM = 1), c(`25` = 1,
`100` = 2, `250` = 1, `500` = 1, Infinity = 4, SUM = 1), c(`25` = 1,
`50` = 1, `100` = 1, `250` = 1, `500` = 1, Infinity = 4, SUM = 1
))), row.names = c(NA, 3L), class = "data.frame")
total_colspan = c(0, 25, 50, 100, 250, 500, 1000, 1500, 3000, "Infinity", "SUM")
rn freq colspan
1 type_A 0, 0, 0, 5, 7, 16, 28 1, 2, 1, 1, 1, 3, 1
2 type_B 2, 1, 0, 5, 0, 8 1, 2, 1, 1, 4, 1
3 type_C 0, 0, 3, 5, 12, 53, 73 1, 1, 1, 1, 1, 4, 1
I would like to create a table with varying column spans (but they all add up to 10), in an R-markdown Word document, like the table below:
I was advised to try flextable for this (link). I am trying to use the header options to create these varying colspan. I thought about doing something like:
dat_table <- flextable(dat)
dat_table <- lapply(dat_table, add_header_row, values = unlist(freq), colwidths = unlist(colspan))
But this is not working.
EDIT:
My second attempt:
dat <- structure(list(rn = c("type_A", "type_B", "type_C"
), freq = list(c(0, 0, 0, 5, 7, 16, 28), c(2, 1, 0, 5, 0, 8),
c(0, 0, 3, 5, 12, 53, 73)), colspan = list(c(1, 2, 1, 1, 1, 3, 1), c(1, 2, 1, 1, 4, 1), c(1, 1, 1, 1, 1, 4, 1
))), row.names = c(NA, 3L), class = "data.frame")
# The thresholds as in the picture
thresholds <- data.frame(c("Lower threshold","Upper threshold"), c(0,25), c(25,50), c(50,100), c(100,250), c(250,500),c(500,1000),c(1000,1500),c(1500,3000),c(3000, "Infinity"), c("", "SUM"))
names(thresholds) <- c("One", "Two", "Three", "Four", "Five", "Six", "Seven", "Eight", "Nine", "Ten", "Eleven")
thresholds <- flextable(thresholds)
# There was one column to few in the example
dat <- transform(dat, colspan=Map('c', 1, dat[["colspan"]] ))
dat <- transform(dat, freq=Map('c', "", dat[["freq"]] ))
# for loop to stick to the syntax
for (i in nrow(dat)) {
thresholds <- add_header_row(thresholds, values = dat[[2]][[i]], colwidths = dat[[3]][[i]])
}
For some reason it only adds one row (while it allows for more headers to be added).
Here's a solution that is perhaps way too overkill, but seems to do what you're looking for:
library(tidyverse)
library(flextable)
dat <- structure(list(rn = c("type_A", "type_B", "type_C"
), freq = list(c(0, 0, 0, 5, 7, 16, 28), c(2, 1, 0, 5, 0, 8),
c(0, 0, 3, 5, 12, 53, 73)), colspan = list(c(1, 2, 1, 1, 1, 3, 1), c(1, 2, 1, 1, 4, 1), c(1, 1, 1, 1, 1, 4, 1
))), row.names = c(NA, 3L), class = "data.frame")
# The thresholds as in the picture
thresholds <- data.frame(c("Lower threshold","Upper threshold"), c(0,25), c(25,50), c(50,100), c(100,250), c(250,500),c(500,1000),c(1000,1500),c(1500,3000),c(3000, "Infinity"), c("", "SUM"))
names(thresholds) <- c("One", "Two", "Three", "Four", "Five", "Six", "Seven", "Eight", "Nine", "Ten", "Eleven")
out <- map(1:nrow(dat), function(index){
out <- data.frame("freq" = dat$freq[[index]],
"span" = dat$colspan[[index]]) %>%
tidyr::uncount(span, .id = 'span') %>%
mutate(freq = ifelse(span>1, NA, freq)) %>%
t %>%
as.data.frame() %>%
mutate(rn = dat$rn[[index]],
across(everything(), ~as.character(.))) %>%
select(rn, everything()) %>%
set_names(nm = names(thresholds)) %>%
slice(1)
return(out)
})
combined <- thresholds %>%
mutate(across(everything(), ~as.character(.))) %>%
bind_rows(out)
spans <- map(1:length(dat$colspan), function(index){
spans <- dat$colspan[[index]] %>%
as_tibble() %>%
mutate(idx = row_number()) %>%
tidyr::uncount(value, .remove = F) %>%
group_by(idx) %>%
mutate(pos = 1:n(),
value = ifelse(pos != 1, 0, value)) %>%
ungroup() %>%
select(value) %>%
t
return(append(1, spans))
})
myft <- flextable(combined) %>%
theme_box()
myft$body$spans$rows[3:nrow(myft$body$spans$rows),] <- matrix(unlist(spans), ncol = ncol(combined), byrow = TRUE)
myft
Created on 2022-04-29 by the reprex package (v2.0.1)
This makes the table:
I don't think you can pass colspan options here without quite a bit of hacking. If at all possible, I would suggest adding the information which cells need to be combined manually. This is the only option, as far as I know, in flextable:
library(flextable)
library(tidyverse)
# clean up the object
dat_clean <- dat %>%
mutate(freq = map2(freq, colspan, ~rep(.x, .y))) %>%
select(-colspan) %>%
unnest(freq) %>%
group_by(rn) %>%
mutate(col = paste0("col_", row_number())) %>%
pivot_wider(names_from = col, values_from = freq)
flextable(dat_clean) %>%
merge_at(i = 1, j = 3:4, part = "body") %>%
merge_at(i = 1, j = 7:9, part = "body") %>%
border_inner(part="all", border = fp_border_default()) %>%
align(align = "center", part = "all")
Created on 2022-04-25 by the reprex package (v2.0.1)
It is a bit tricky to merge those two tables. This is the closest I came to reproduce your desired table. First I created your data in a suitable way:
thresholds <- data.frame(c("Lower threshold", "Upper threshold", "type_A", "type_B", "type_C"),
c(0,25, 0, 2, 0),
c(25,50, 0, 1, 0),
c(50,100, NA, NA,3),
c(100,250,0,0,5),
c(250,5005,5,5,12),
c(500,1000,7,0,53),
c(1000,1500,16,NA,NA),
c(1500,3000,NA,NA,NA),
c(3000, "Infinity",NA,NA,NA),
c("SUM", "SUM", 28,8,73))
names(thresholds) <- c("One", "Two", "Three", "Four", "Five", "Six", "Seven", "Eight", "Nine", "Ten", "Eleven")
Using the officer package you can give the horizontal and vertical lines different colors you want. Using the merge_at function you can merge certain cells. With the border_inner function you get borders in the table. You can use the following code:
library(officer)
std_border = fp_border(color="gray")
library(flextable)
library(dplyr)
thresholds %>%
flextable() %>%
merge_at(i = 3, j = 3:4, part = "body") %>%
merge_at(i = 4, j = 3:4, part = "body") %>%
merge_at(i = 3, j = 8:10, part = "body") %>%
merge_at(i = 4, j = 7:10, part = "body") %>%
merge_at(i = 5, j = 7:10, part = "body") %>%
border_inner(border = std_border) %>%
align(align = "left", part = "all")
Output:

How can I get a table in R with the mean values of t, t-1, t-2, t-3, etc. of a variable in a timeseries?

For instance, I want to get a table with the average of "value" for 1 day (already calculated), the last 2 days, the last 3 days, etc. and up to the last 10 days.
df<- data.frame(day = seq(as.Date("1910/1/1"), as.Date("1910/1/10"), "days"), value = c(1, 4, 7, 33, 2, 6, 9, 88, 5, 6))
Any hints?
df<- data.frame(day = seq(as.Date("1910/1/1"), as.Date("1910/1/10"), "days"), value = c(1, 4, 7, 33, 2, 6, 9, 88, 5, 6))
library(tidyverse)
library(slider)
df <- 1:9 %>%
map(
~slide_dbl(df$value, ~mean(.x), .before = .x, .complete = F)
) %>%
bind_cols() %>%
bind_cols(df, .) %>%
set_names(c("day", "value", paste0("value", 2:10)))

How to plot the distribution as combo chart in R?

Here is my initial data set:
data_x <- tribble(
~price, ~id, ~cost, ~revenue,
1, 10, 0.20, 0,
2, 20, 0.30, 60,
3, 20, 0.30, 0,
4, 10, 0.20, 100,
5, 30, 0.10, 40,
6, 10, 0.20, 0,
1, 20, 0.30, 80,
2 , 10, 0.20, 0,
3, 30, 0.10, 20,
3, 20, 0.30, 40,
)
Then, I have a new variable that is zet:
data_y <- data_x %>%
mutate(zet = cost/revenue) %>%
mutate_if(is.numeric, list(~na_if(., Inf))) %>%
mutate_all(funs(replace_na(.,0)))
Now, I plot the price distribution while showing the zet distribution, as well. Here is my desired plot:
To do this, I first wanted to see price and zet distribution even they are not percentage now.
price_dist <- data_y %>%
group_by(priceseg = cut(as.numeric(price), c(0, 1, 3, 5, 6))) %>%
summarise(price_n = n_distinct(price)) %>%
pivot_wider(names_from = priceseg, values_from = price_n)
zet_dist <- data_y %>%
group_by(priceseg = cut(as.numeric(price), c(0, 1, 3, 5, 6))) %>%
summarise(zet_n = n_distinct(zet)) %>%
pivot_wider(names_from = priceseg, values_from = zet_n)
I would be grateful if you could help me to plot my desired chart.
d <- data_y %>%
group_by(priceseg = cut(as.numeric(price), c(0, 1, 3, 5, 6))) %>%
summarise(price_n = n_distinct(price),
zet_n = n_distinct(zet)) %>%
mutate(price_n = 100 * prop.table(price_n),
zet_n2 = 100 * prop.table(zet_n))
ggplot(d) +
geom_col(aes(x = priceseg, y = price_n)) +
geom_line(data = d, mapping = aes(x = priceseg, y = zet_n2, group = 1)) +
geom_label(data = d, mapping = aes(x = priceseg, y = zet_n2, label = zet_n), nudge_y = 5)

My R code is taking too long and is too complex

Using big data sets, this code is taking a really long time to process. Does anyone have any simpler ways of running it?
Ran this code, locked up my machine for a while
SID_Scores <- filtered %>%
group_by(SalesPerson_SID) %>%
summarise(
Brand_Advocacy = mean(Q1, na.rm = TRUE),
Vehicle_Satisfaction = mean(Q2, na.rm = TRUE),
Dealer_Satisfaction = mean(Q3, na.rm = TRUE),
Sales_Advocacy = mean(Q6N_srvsls_Recommend_10Pt, na.rm = TRUE),
Overall_SalesCon = mean(Q5N1_ovrsls, na.rm = TRUE),
Understanding_Needs = mean(Q7N1_slsneeds, na.rm = TRUE),
Product_Features = mean(Q7N2_slsfeat, na.rm = TRUE),
Professional_Court = mean(Q7N3_slsprof, na.rm = TRUE),
Feel_Valued = mean(SlsValued, na.rm = TRUE),
Trustworthy = mean(SlsTrustworthy, na.rm = TRUE),
Financial_Arrang = mean(Q5N2_ovrfin, na.rm = TRUE),
Financial_Agreement = mean(Q8N2_finease, na.rm = TRUE),
Respect_Time = mean(Q8N3_fintime, na.rm = TRUE),
Honesty = mean(Q8N4_finhon, na.rm = TRUE),
Delivery = mean(Q5N3_ovrdlv, na.rm = TRUE),
U_Pairing = (sum(filtered$Q_UCPairing == '1', na.rm = TRUE)) / (
sum(filtered$Q_UCPairing == '1', na.rm = TRUE) +
sum(filtered$Q_UCPairing == '2', na.rm = TRUE)
),
U_Demonstrate = (sum(filtered$Q_UCDemonstrate == '1', na.rm = TRUE)) /
(
sum(filtered$Q_UCDemonstrate == '1', na.rm = TRUE) +
sum(filtered$Q_UCDemonstrate == '2', na.rm = TRUE)
),
U_FreeTrials = (sum(filtered$Q_UCFreeTrials == '1', na.rm = TRUE)) /
(
sum(filtered$Q_UCFreeTrials == '1', na.rm = TRUE) +
sum(filtered$Q_UCFreeTrials == '2', na.rm = TRUE)
),
U_Presets = (sum(filtered$Q_UCRadioPreset == '1', na.rm = TRUE)) /
(
sum(filtered$Q_UCRadioPreset == '1', na.rm = TRUE) +
sum(filtered$Q_UCRadioPreset == '2', na.rm = TRUE)
)
) %>%
group_by(SalesPerson_SID)
This has been running for several hours now. Filtered has 540000 rows with 35 variables
Here is the code to reproduce some sample data:
structure(list(EventType = c("001", "001", "001", "001", "001",
"001"), `Survey Type` = c("Sales", "Sales", "Sales", "Sales",
"Sales", "Sales"), ModelYear = c(2018, 2019, 2018, 2018, 2018,
2018), PurchaseDate = c(20181209, 20181216, 20181209, 20181215,
20181218, 20181218), `ZoneCode (BC)` = c("32", "71", "71", "51",
"63", "74"), SalesDistrict = c("G", "D", "G", "C", "T", "G"),
SalesGroupSize = c("E", "E", "B", "D", "D", "B"), DealerCode = c("60698",
"45622", "69319", "36277", "44107", "26922"), Q1 = c(9, 8,
10, 10, 10, 9), Q2 = c(9, 10, 10, 10, 10, 9), Q3 = c(8, 10,
10, 10, 10, 9), Q6N_srvsls_Recommend_10Pt = c(9, 10, 10,
10, 10, 9), Q5N1_ovrsls = c(8, 10, 10, 10, 10, 8), Q5N2_ovrfin = c(9,
10, 10, 10, 10, 7), Q5N3_ovrdlv = c(8, NA, 10, 10, 10, 6),
Q5N4_srvsls_facility = c(9, 10, 10, 10, 10, 10), Q7N1_slsneeds = c(9,
10, 10, 10, 10, 9), Q7N2_slsfeat = c(9, 10, 10, 10, 10, 9
), Q7N3_slsprof = c(10, 10, 10, 10, 10, 9), Q8N1_finneg = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), Q8N2_finease = c(9,
10, 10, 9, 10, 7), Q8N3_fintime = c(9, 10, 10, 10, 10, 10
), Q8N4_finhon = c(9, 10, 10, 10, 10, 9), Q9 = c(0, 0, 0,
0, 0, 0), SlsValued = c(9, 10, 10, 10, 10, 8), SlsTrustworthy = c(NA_real_,
NA_real_, NA_real_, NA_real_, NA_real_, NA_real_), SlsPaperwork = c(NA,
3, 2, 2, 2, NA), `SlsF&ITransaction` = c(3, 2, 2, 3, 1, 4
), SalesPerson_SID = c("S39547M", "S56830O", "S35478Q", "S61788P",
"S35680B", "S75254K"), Q_UCPairing = c(1, 1, 1, 1, 1, 1),
Q_UCDemonstrate = c(1, 1, 1, 1, NA, 1), Q_UCFreeTrials = c(1,
1, 1, 1, 1, 1), Q_UCRadioPreset = c(1, 1, 1, 2, 1, 1)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -6L), .Names = c("EventType",
"Survey Type", "ModelYear", "PurchaseDate", "ZoneCode (BC)",
"SalesDistrict", "SalesGroupSize", "DealerCode", "Q1", "Q2",
"Q3", "Q6N_srvsls_Recommend_10Pt", "Q5N1_ovrsls", "Q5N2_ovrfin",
"Q5N3_ovrdlv", "Q5N4_srvsls_facility", "Q7N1_slsneeds", "Q7N2_slsfeat",
"Q7N3_slsprof", "Q8N1_finneg", "Q8N2_finease", "Q8N3_fintime",
"Q8N4_finhon", "Q9", "SlsValued", "SlsTrustworthy", "SlsPaperwork",
"SlsF&ITransaction", "SalesPerson_SID", "Q_UCPairing", "Q_UCDemonstrate",
"Q_UCFreeTrials", "Q_UCRadioPreset"))

Resources