Match grouping variable with stripping/shading using kableExtra - r

I have a table with multiple records for each individual (ID1) and would like the row shading (i.e. kable_styling(c("striped")) to alternate by group (ID1) rather than by every other row. I was hoping I could add group_by(ID1) to the code below... Alas I am still in search of a solution. While there are lots of helpful tips are shown here, I have not been able to find a solution.
I am also wondering how to make a single outside border to the table rather than border every cell.
Below is a reproducible data set.
Many thanks in advance.
```{r echo=F, warning=F, message = FALSE}
library(tidyverse)
library(kableExtra)
set.seed(121)
Dat <- data.frame(
ID1 = sample(c("AAA", "BBB", "CCC","DDD"), 100, replace = T),
ID2 = sample(c("Cat", "Dog", "Bird"), 100, replace = T),
First = rnorm(100),
Two = sample.int(100))
ExTbl <- Dat %>%
group_by(ID1, ID2) %>%
summarize(One = mean(First),
Max = max(Two)) %>%
arrange(ID1)
kable(ExTbl) %>%
kable_styling(c("striped", "bordered"), full_width = F)
```
> head(as.data.frame(ExTbl) )
ID1 ID2 One Max
1 AAA Bird 0.15324169 86
2 AAA Cat -0.02726006 83
3 AAA Dog -0.19618126 78
4 BBB Bird 0.62176633 100
5 BBB Cat -0.35502912 77
6 BBB Dog -0.29977145 87
>

Right now there is no direct approach in kableExtra but this is the method I used last time. Maybe I should pack this into this package.
library(tidyverse)
library(kableExtra)
set.seed(121)
Dat <- data.frame(
ID1 = sample(c("AAA", "BBB", "CCC","DDD"), 100, replace = T),
ID2 = sample(c("Cat", "Dog", "Bird"), 100, replace = T),
First = rnorm(100),
Two = sample.int(100))
ExTbl <- Dat %>%
group_by(ID1, ID2) %>%
summarize(One = mean(First),
Max = max(Two)) %>%
arrange(ID1)
ind_end <- cumsum(rle(as.character(ExTbl$ID1))$lengths)
ind_start <- c(1, ind_end[-length(ind_end)] + 1)
pos <- purrr::map2(ind_start, ind_end, seq)
pos <- unlist(pos[1:length(pos) %% 2 != 0])
kable(ExTbl) %>%
kable_styling(c("bordered"), full_width = F) %>%
row_spec(pos, background = "#EEEEEE")

Related

Using a temporal inner variable in dplyr outside of the group

I need to calculate an FDR variable per group, using an expected random distribution of p values (corresponds to the "Random" type).
library(dplyr)
library(data.table)
calculate_empirical_fdr = function(control_pVal, test_pVal) {
m_control = length(control_pVal)
m_test = length(test_pVal)
unlist(lapply(test_pVal, function(significance_threshold) {
m_control = length(control_pVal)
m_test = length(test_pVal)
FP_expected = length(control_pVal[control_pVal<=significance_threshold])*m_test/m_control # number of
expected false positives in a p-value sequence with the size m_test
S = length(test_pVal[test_pVal<=significance_threshold]) # number of significant hits (FP + TP)
return(FP_expected/S)
}))
}
An example dataset with groups I need to control for in the "Group" variable:
set.seed(42)
library(dplyr); library(data.table)
dataset_test = data.table(Type = c(rep("Random", 500),
rep("test1", 500),
rep("test2", 500)),
Group = sample(c("group1", "group2", "group3"), 1500, replace = T),
Pvalue = c(runif(n = 500),
rbeta(n = 500, shape1 = 1, shape2 = 4),
rbeta(n = 500, shape1 = 1, shape2 = 6))
)
Data visualization:
I have found that the best way to use my function per group would be using a temporal variable where I can store the p values of the random type, but this does not work:
dataset_test %>%
group_by(Group) %>%
{filter(Type=="Random") %>% select(Pvalue) ->> control_set } %>%
group_by(Type, add = T) %>%
mutate(FDR_empirical = calculate_empirical_fdr(control_pVal = control_set,
test_pVal = Pvalue)) %>%
data.table()
Error in filter(Type == "Random") : object 'Type' not found
I understand that probably temporal vairables "do not see" the environment within the data.table, would be glad to hear any suggestions how to fix it.
You can do something like this, which filters the control group P-values using the data.table special .BY
setDT(dataset_test)
dataset_test[
i= Type!="Random",
j = FDR_empirical:=calculate_empirical_fdr(dataset_test[Type=="Random" & Group ==.BY$Group, Pvalue], Pvalue),
by = .(Group, Type)
]
Output:
Type Group Pvalue FDR_empirical
1: Random group1 0.70292111 NA
2: Random group1 0.72383117 NA
3: Random group1 0.76413459 NA
4: Random group1 0.87942702 NA
5: Random group2 0.71229213 NA
---
1496: test2 group3 0.34817178 0.3681791
1497: test2 group1 0.22419118 0.2308988
1498: test2 group3 0.07258545 0.2314655
1499: test2 group2 0.24687976 0.2849462
1500: test2 group1 0.12206777 0.1760657
Two possible solutions
Use the dot .
dataset_test %>%
group_by(Group) %>%
{filter(., Type=="Random") %>% select(Pvalue) ->> control_set; . } %>%
group_by(Type, add = T)
Use the tee-pipe %T>% from the magrittr package
library(magrittr)
dataset_test %>%
group_by(Group) %T>% {
filter(., Type=="Random") %>% select(Pvalue) ->> control_set} %>%
group_by(Type, add = T)

How to change the row names in R data.frame?

I would like to rename station in DF to something like DA056 to Happy and AB786 to Sad.
library(tidyverse)
DF1 <- data.frame(Station = rep("DA056",3), Level = 100:102)
DF2 <- data.frame(Station = rep("AB786",3), Level = 201:203)
DF <- bind_rows(DF1,DF2)
We can use factor with labels specified for corresponding levels
library(dplyr)
DF <- DF %>%
mutate(Station = factor(Station, levels = c("DA056", "AB786"),
labels = c("Happy", "Sad")))
DF$Station
#[1] Happy Happy Happy Sad Sad Sad
#Levels: Happy Sad
Or with recode
DF %>%
mutate(Station = recode(Station, DA056 = 'Happy', AB786 = 'Sad'))
# Station Level
#1 Happy 100
#2 Happy 101
#3 Happy 102
#4 Sad 201
#5 Sad 202
#6 Sad 203
If there are many values to be changed, a better option is a join after creating a key/val dataset
keyval <- data.frame(Station = c("DA056", "AB786"),
val = c("Happy", "Sad"), stringsAsFactors = FALSE)
DF %>%
left_join(keyval) %>%
mutate(Station = coalesce(val, Station))
Or with base R
DF$Station <- with(df, factor(Station, levels = c("DA056", "AB786"),
labels = c("Happy", "Sad")))
An option is to use dplyr::case_when:
library(dplyr)
DF1 <- data.frame(Station = rep("DA056",3), Level = 100:102, stringsAsFactors = F)
DF2 <- data.frame(Station = rep("AB786",3), Level = 201:203, stringsAsFactors = F)
DF <- bind_rows(DF1,DF2)
DF <- DF %>% mutate(Station = case_when( Station == "DA056" ~ "Happy",
Station == "AB786" ~ "Sad",
TRUE ~ Station))
Output
> DF
Station Level
1 Happy 100
2 Happy 101
3 Happy 102
4 Sad 201
5 Sad 202
6 Sad 203
You can do it using case_when:
DF %>%
mutate(Station = case_when(Station == "DA056" ~ "Happy", Station =="AB786" ~ "Sad"))
Another simple solution
DF$Station = ifelse(DF$Station == "DA056", "Happy", "Sad")

Group dataframe row and column wise based on other dataframe?

I have a dataframe that I would like to group in both directions, first rowise and columnwise after. The first part worked well, but I am stuck with the second one. I would appreciate any help or advice for a solution that does both steps at the same time.
This is the dataframe:
df1 <- data.frame(
ID = c(rep(1,5),rep(2,5)),
ID2 = rep(c("A","B","C","D","E"),2),
A = rnorm(10,20,1),
B = rnorm(10,50,1),
C = rnorm(10,10,1),
D = rnorm(10,15,1),
E = rnorm(10,5,1)
)
This is the second dataframe, which holds the "recipe" for grouping:
df2 <- data.frame (
Group_1 = c("B","C"),
Group_2 = c("D","A"),
Group_3 = ("E"), stringsAsFactors = FALSE)
Rowise grouping:
df1_grouped<-bind_cols(df1[1:2], map_df(df2, ~rowSums(df1[unique(.x)])))
Now i would like to apply the same grouping to the ID2 column and sum the values in the other columns. My idea was to mutate a another column (e.g. "group", which contains the name of the final group of ID2. After this i can use group_by() and summarise() to calculate the sum for each. However, I can't figure out an automated way to do it
bind_cols(df1_grouped,
#add group label
data.frame(
group = rep(c("Group_2","Group_1","Group_1","Group_2","Group_3"),2))) %>%
#remove temporary label column and make ID a character column
mutate(ID2=group,
ID=as.character(ID))%>%
select(-group) %>%
#summarise
group_by(ID,ID2)%>%
summarise_if(is.numeric, sum, na.rm = TRUE)
This is the final table I need, but I had to manually assign the groups, which is impossible for big datasets
I will offer such a solution
library(tidyverse)
set.seed(1)
df1 <- data.frame(
ID = c(rep(1,5),rep(2,5)),
ID2 = rep(c("A","B","C","D","E"),2),
A = rnorm(10,20,1),
B = rnorm(10,50,1),
C = rnorm(10,10,1),
D = rnorm(10,15,1),
E = rnorm(10,5,1)
)
df2 <- data.frame (
Group_1 = c("B","C"),
Group_2 = c("D","A"),
Group_3 = ("E"), stringsAsFactors = FALSE)
df2 <- df2 %>% pivot_longer(everything())
df1 %>%
pivot_longer(-c(ID, ID2)) %>%
mutate(gr_r = df2$name[match(ID2, table = df2$value)],
gr_c = df2$name[match(name, table = df2$value)]) %>%
arrange(ID, gr_r, gr_c) %>%
pivot_wider(c(ID, gr_r), names_from = gr_c, values_from = value, values_fn = list(value = sum))

How can I Loop through a dataframe with Dplyr and use mutate to change values of some cells based on a condition?

I am writing a report using Rmarkdown, I have a data frame like this:
I want to check all cells and change the markup so the values that are smaller than "0.05" are highlighted in red. I managed to do that in a simpler data frame with only 2 rows and a specific cell, so it was easy and it's working. But in this case, I need to check all cells and I have no idea how to do it. I have tried with mutate_if, mutate_all and got nowhere.
This line was all that I needed to make it work on the other data frame:
mutate(p.value = cell_spec(p.value, "html", color = ifelse(p.value[1] < 0.05, "red", "black")))
Edit: as requested..
I am using Kable and KableExtra to do some of the printing on the report. Here is a code example that I used to do the highlighting on a more basic data frame:
aov.formiga <- aov(as.formula(sprintf("%s ~ Local", v)), formigas)
d <- tidy(aov.formiga)
print(
d %>%
replace(is.na(.),"") %>%
mutate(p.value = cell_spec(p.value, "html", color = ifelse(p.value[1] < 0.05, "red", "black"))) %>%
kable(format = "html", escape = F, col.names = c("Source", "DF","Anova SS", "Mean Square", "F Value", "Pr > F")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F)
)
That is the result of this print:
I can provide any data, but I can try to simplify the idea:
DF
... A B C
1 1 2 1
2 2 1 2
3 3 4 1
Let's say I want to loop this DF and replace all 1 for "One" and leave the other values intact. If I can manage to do that with Dplyr I think I can work the other parts. Thank you!
With your example data, if you want to turn all the cells greater than or equal to 2 to red:
x = c(A = 1, B = 2, C = 1)
y = c(A = 2, B = 1, C = 2)
z = c(A = 3, B = 4, C = 1)
data=data.frame(rbind(x,y,z))
library(knitr)
library(kableExtra)
library(tidyverse)
data %>%
mutate_all(~cell_spec(.x, color = ifelse(.x >= 2, "red"," black"))) %>%
kable(escape = F) %>%
kable_styling()
Response to comment
The ifelse() breaks with NA values, so we can use case_when():
data %>%
mutate_all(~cell_spec(.x, color = case_when(.x >= 2 ~ "red",
TRUE ~ "black"))) %>%
kable(escape = F) %>%
kable_styling()

Multiple group_by with dplyr

There is a lot of questions about it on this forum but I could not do it. I got a dataframe with a bunch of categrocial variables (class factor). I got a target column (1 or 0). I want to compute the frequency of 1's within each level of the categorical variables. I want to do those 3 group_by computations in once.
library(dplyr)
# Build the toy dataset
target = sample(x = c(0,1),size = 100,replace = T)
cat1 = sample(x = c("a","b","c"),size = 100,replace = T)
cat2 = sample(x = c("x","y","z"),size = 100,replace = T)
cat3 = sample(x = c("T","U","V"),size = 100,replace = T)
df = data.frame(target,cat1,cat2,cat3)
# How to do those 3 group_by computations in once knowing that in reality I got thousands of those categorical columns?
df %>%
group_by(cat1) %>%
summarise(statistics = mean(target))
df %>%
group_by(cat2) %>%
summarise(statistics = mean(target))
df %>%
group_by(cat3) %>%
summarise(statistics = mean(target))
If I understand your question correctly, I believe this code can help you:
df %>%
group_by(cat1,cat2,cat3) %>%
summarise(statistics = mean(target)) %>% arrange(cat1,cat2)

Resources