total() in tab_cols only sums up to one, any suggestions? - r

Suppose I have dataframe 'y'
WR<-c("S",'J',"T")
B<-c("b1","b2","b3")
wgt<-c(0.3,2,3)
y<-data.frame(WR,B,wgt)
I want to make a column-percentage crosstab with B as rows, and WR plus the total of WR as columns, using expss functions
library(expss)
y %>% tab_cols(total(),WR) %>% # Columns
tab_stat_valid_n("Base") %>%
tab_weight(wgt) %>%
tab_stat_valid_n("Projection") %>%
tab_cells(mrset(B))%>% # Row
tab_stat_cpct(total_row_position = "none") %>%
tab_pivot()
Result
But the Base value in the #Total column does not match up: it shows 1 instead of 3
# #Total WR|J WR|S WR|T
# Base 1.000000 1 1.0 1
# Projection 5.300000 2 0.3 3
# b1 5.660377 NA 100.0 NA
# b2 37.735849 100 NA NA
# b3 56.603774 NA NA 100
I think I found the solution: call tab_cells() before the tab_stat_valid_n() calls
y %>% tab_cols(total(),WR) %>% # Columns
tab_cells(mrset(B))%>% # Row
tab_stat_valid_n("Base") %>%
tab_weight(wgt) %>%
tab_stat_valid_n("Projection") %>%
tab_stat_cpct(total_row_position = "none") %>%
tab_pivot()
| | | #Total | WR | | |
| | | | J | S | T |
| -- | ---------- | ------ | --- | ----- | --- |
| B | Base | 3.0 | 1 | 1.0 | 1 |
| | Projection | 5.3 | 2 | 0.3 | 3 |
| b1 | | 5.7 | | 100.0 | |
| b2 | | 37.7 | 100 | | |
| b3 | | 56.6 | | | 100 |

Related

frequency table for banner list

I am trying to create a function that generates a frequency table (showing count, valid percentage, and percentage) for each banner in a list.
I want to export the tables to xlsx files.
For example, for the variable "gear", I want to calculate the table for the banner defined below:
library(expss)
df <- mtcars
df$all<- 1
df$small<-ifelse(df$vs==1,1,NA)
df$large<-ifelse(df$am ==1,1,NA)
val_lab(df$all)<-c("Total"=1)
val_lab(df$small)<-c("Small"=1)
val_lab(df$large)<-c("Large"=1)
# Banner columns are taken from `df`, the data frame prepared in the lines
# above; no object named `dat` exists in this script.
banner <- list(df$all, df$small, df$large)
data <- df
var <- "gear"
var1 <- rlang::parse_expr(var)
expss::var_lab(data[[var]])
#tab1 <- expss::fre(data[[var1]])
table1 <- expss::fre(data[[var1]],
stat_lab = getOption("expss.fre_stat_lab", c("Count N", "Valid percent", "Percent",
"Responses, %", "Cumulative responses, %")))
table1
the output table should be like below
You need to write a custom function around fre:
library(expss)
df <- mtcars
df$all<- 1
df$small<-ifelse(df$vs==1,1,NA)
df$large<-ifelse(df$am ==1,1,NA)
val_lab(df$all)<-c("Total"=1)
val_lab(df$small)<-c("Small"=1)
val_lab(df$large)<-c("Large"=1)
# Frequency table for a single variable: run expss::fre(), keep only its
# first three columns, and give those columns fixed names.
my_fre <- function(curr_var) {
  freq_table <- expss::fre(curr_var)[, 1:3]
  names(freq_table) <- c("row_labels", "Count N", "Valid percent")
  freq_table
}
cross_fun_df(df, gear, list(all, small, large), fun = my_fre)
# | | Total | | Small | | Large | |
# | | Count N | Valid percent | Count N | Valid percent | Count N | Valid percent |
# | ------ | ------- | ------------- | ------- | ------------- | ------- | ------------- |
# | 3 | 15 | 46.88 | 3 | 21.43 | | |
# | 4 | 12 | 37.50 | 10 | 71.43 | 8 | 61.54 |
# | 5 | 5 | 15.62 | 1 | 7.14 | 5 | 38.46 |
# | #Total | 32 | 100.00 | 14 | 100.00 | 13 | 100.00 |
# | <NA> | 0 | | 0 | | 0 | |

Find maximum value of one column based on group_by multiple other columns

I have a table of the following form in R:
| COUNTRY | date_start | code | bin | ord |
| -----------------------------------------|
| Chile | 04/11/2020 | 4.5.1 | 1 | 3 |
| Chile | 04/11/2020 | 4.5.2 | 1 | 0 |
| Norway | 23/02/2021 | 4.4.1 | 1 | 2 |
| Norway | 23/02/2021 | 4.4.2 | 0 | 1 |
| Norway | 25/02/2021 | 4.4.2 | 0 | 1 |
First I want to drop the column 'code', and then I want to create an extra column 'ordMax', and populate it with the maximum value of the 'ord' column for a given 'COUNTRY' and 'date_start'. So in this example the new column would be
| COUNTRY | date_start | bin | ord | ordMax |
| ------------------------------------------|
| Chile | 04/11/2020 | 1 | 3 | 3 |
| Chile | 04/11/2020 | 1 | 0 | 3 |
| Norway | 23/02/2021 | 1 | 2 | 2 |
| Norway | 23/02/2021 | 0 | 1 | 2 |
| Norway | 25/02/2021 | 0 | 1 | 1 |
I have tried a couple of methods in R, using both 'aggregate' and the dplyr library, but nothing seemed to work. One of the things that I tried was:
df_k_reduced <- df_k %>%
group_by(COUNTRY, date_start) %>%
select(-code) %>%
summarise(ordMax = max(ord))
But this gives something like:
| COUNTRY | date_start | ordMax |
| ------------------------------|
| Chile | 04/11/2020 | 3 |
| Norway | 23/02/2021 | 2 |
| Norway | 25/02/2021 | 1 |
Note that 'bin' and the original 'ord' column have also been dropped, even though that was not the original intention.
How would I obtain the table with that extra column, where the only dropped column is 'code', and no rows are dropped?
We can use slice_max instead of summarise to return all the columns after the select step
library(dplyr)
# Keep only the row(s) holding the largest `ord` within each
# COUNTRY/date_start group; every column except `code` is retained.
df_k %>%
  group_by(COUNTRY, date_start) %>%
  select(-code) %>%
  # `order_by` must be the bare column name: a quoted 'ord' is a constant
  # string rather than the column, so rows would not be ranked by `ord`.
  slice_max(order_by = ord, n = 1)
If we need to create a new column, use mutate
df_k %>%
group_by(COUNTRY, date_start) %>%
select(-code) %>%
mutate(ordMax = max(ord, na.rm = TRUE)) %>%
ungroup
data.table way
sample data
library(data.table)
DT <- fread("COUNTRY | date_start | code | bin | ord
Chile | 04/11/2020 | 4.5.1 | 1 | 3
Chile | 04/11/2020 | 4.5.2 | 1 | 0
Norway | 23/02/2021 | 4.4.1 | 1 | 2
Norway | 23/02/2021 | 4.4.2 | 0 | 1
Norway | 25/02/2021 | 4.4.2 | 0 | 1 ")
code
DT[, ordMax := max(ord), by = .(COUNTRY, date_start)][, code := NULL][]
output
# COUNTRY date_start bin ord ordMax
# 1: Chile 04/11/2020 1 3 3
# 2: Chile 04/11/2020 1 0 3
# 3: Norway 23/02/2021 1 2 2
# 4: Norway 23/02/2021 0 1 2
# 5: Norway 25/02/2021 0 1 1

Top 3 Box without TRUE FALSE in R / Rstudio

I'm new to R. I'm able to create top 3 and bottom 3 boxes in my tables, but it displays as "TRUE" and "FALSE" like this...
The code that i used is...
library(expss)
X4607 %>%
tab_cells(qcs1a_SQ001, "Top 3 Box"=qcs1a_SQ001>7 & qcs1a_SQ001<11, "Bottom 3 Box"=qcs1a_SQ001<=2) %>%
tab_cols(total(), spcode) %>%
tab_stat_cpct() %>%
tab_last_sig_cpct() %>%
tab_pivot()
Is there any way to just have the number of 'TRUE' come in under the "top 3 box" label and get rid of the "TRUE" and "FALSE" displaying?
There is a special function subtotal for that:
library(expss)
set.seed(123)
N = 100
X4607 = data.frame(
spcode = sample(c("South", "North"), size = N, replace = TRUE),
qcs1a_SQ001 = sample(c(1:10, 99), size = N, replace = TRUE)
)
X4607 %>%
tab_cells(subtotal(qcs1a_SQ001, "Bottom 3 Box" = 1:3, "Top 3 Box" = 7:10, position = "bottom")) %>%
tab_cols(total(), spcode) %>%
tab_stat_cpct() %>%
tab_last_sig_cpct() %>%
tab_pivot()
# | | #Total | spcode | |
# | | | North | South |
# | | | A | B |
# | ------------ | ------ | ------ | ------ |
# | 1 | 5.0 | 9.3 | 1.8 |
# | 2 | 5.0 | 4.7 | 5.3 |
# | 3 | 9.0 | 4.7 | 12.3 |
# | 4 | 11.0 | 7.0 | 14.0 |
# | 5 | 6.0 | 9.3 | 3.5 |
# | 6 | 10.0 | 2.3 | 15.8 A |
# | 7 | 13.0 | 16.3 | 10.5 |
# | 8 | 14.0 | 16.3 | 12.3 |
# | 9 | 10.0 | 11.6 | 8.8 |
# | 10 | 10.0 | 9.3 | 10.5 |
# | 99 | 7.0 | 9.3 | 5.3 |
# | Bottom 3 Box | 19.0 | 18.6 | 19.3 |
# | Top 3 Box | 47.0 | 53.5 | 42.1 |
# | #Total cases | 100 | 43 | 57 |

How to generate Z-test including totals of variables in columns in expss?

Two questions in fact.
How to add totals for variables in columns in expss?
Is it possible to perform Z-test for variables in columns including total as a different category?
Below you can find a piece of code I'd run but it didn't work... I mean I couldn't even add totals on the right/left side of column variable...
test_table = tab_significance_options(data = df, compare_type = "subtable", bonferroni = TRUE, subtable_marks = "both") %>%
tab_cells(VAR1) %>%
tab_total_statistic("w_cpct") %>%
tab_cols(VAR2) %>%
tab_stat_cpct() %>%
tab_cols(total(VAR2)) %>%
tab_last_sig_cpct() %>%
tab_pivot(stat_position = "outside_columns")
I would be grateful for any advice.
To compare with the first column you additionally need to specify "first_column" in 'compare_type'. Secondly, for a correct result one of the total statistics should be cases. Taking all of the above into account:
library(expss)
data(mtcars)
test_table = mtcars %>%
tab_significance_options(compare_type = c("first_column", "subtable"), bonferroni = TRUE, subtable_marks = "both") %>%
tab_total_statistic(c("u_cases", "w_cpct")) %>%
tab_cells(gear) %>%
tab_cols(total(am), am) %>%
tab_stat_cpct() %>%
tab_last_sig_cpct() %>%
tab_pivot()
test_table
# | | | #Total | am | |
# | | | | 0 | 1 |
# | | | | A | B |
# | ---- | ---------------- | ------ | -------- | -------- |
# | gear | 3 | 46.9 | 78.9 + | |
# | | 4 | 37.5 | 21.1 < B | 61.5 > A |
# | | 5 | 15.6 | | 38.5 |
# | | #Total cases | 32 | 19 | 13 |
# | | #Total wtd. cpct | 100 | 100 | 100 |

For each combination of a set of variables in a list, calculating correlations between this combination and another variable in R

In R I want to generate correlation co-efficients by comparing 2 variables whilst also retaining a phylogenetic signal.
The initial way I thought to do this is not computationally efficient, and I think there is a much simpler approach, but I do not have the skills in R to do it.
I have a csv file which looks like this:
+-------------------------------+-----+----------+---------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+---------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
| Species | OGT | Domain | A | C | D | E | F | G | H | I | K | L | M | N | P | Q | R | S | T | V | W | Y |
+-------------------------------+-----+----------+---------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+---------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
| Aeropyrum pernix | 95 | Archaea | 9.7659115711 | 0.6720465616 | 4.3895390781 | 7.6501943794 | 2.9344881615 | 8.8666657183 | 1.5011817208 | 5.6901432494 | 4.1428307243 | 11.0604191603 | 2.21143353 | 1.9387130928 | 5.1038552753 | 1.6855017182 | 7.7664358772 | 6.266067034 | 4.2052190807 | 9.2692433532 | 1.318690698 | 3.5614200159 |
| Argobacterium fabrum | 26 | Bacteria | 11.5698896021 | 0.7985475923 | 5.5884500155 | 5.8165463343 | 4.0512504104 | 8.2643271309 | 2.0116736244 | 5.7962804605 | 3.8931525401 | 9.9250463349 | 2.5980609708 | 2.9846761128 | 4.7828063605 | 3.1262365491 | 6.5684282943 | 5.9454781844 | 5.3740045968 | 7.3382308193 | 1.2519739683 | 2.3149400984 |
| Anaeromyxobacter dehalogenans | 27 | Bacteria | 16.0337898849 | 0.8860252895 | 5.1368827707 | 6.1864992608 | 2.9730203513 | 9.3167603253 | 1.9360386851 | 2.940143349 | 2.3473650439 | 10.898494736 | 1.6343905351 | 1.5247123262 | 6.3580285706 | 2.4715303021 | 9.2639057482 | 4.1890063803 | 4.3992339725 | 8.3885969061 | 1.2890166336 | 1.8265589289 |
| Aquifex aeolicus | 85 | Bacteria | 5.8730327277 | 0.795341216 | 4.3287799008 | 9.6746388172 | 5.1386954322 | 6.7148035486 | 1.5438364179 | 7.3358775924 | 9.4641440609 | 10.5736658776 | 1.9263080969 | 3.6183861236 | 4.0518679067 | 2.0493569604 | 4.9229955632 | 4.7976564501 | 4.2005259246 | 7.9169763709 | 0.9292167138 | 4.1438942987 |
| Archaeoglobus fulgidus | 83 | Archaea | 7.8742687687 | 1.1695110027 | 4.9165979364 | 8.9548767369 | 4.568636662 | 7.2640358917 | 1.4998752909 | 7.2472039919 | 6.8957233203 | 9.4826333048 | 2.6014466253 | 3.206476915 | 3.8419576418 | 1.7789787933 | 5.7572748236 | 5.4763351139 | 4.1490633048 | 8.6330814159 | 1.0325605451 | 3.6494619148 |
+-------------------------------+-----+----------+---------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+---------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+--------------+
What I want to do is, for each possible combination of the percentages within the 20 single-letter columns (amino acids, so 10 million combinations), to calculate the correlation between that combination and the OGT variable in the CSV (whilst retaining a phylogenetic signal).
My current code is this:
library(parallel)
library(dplyr)
library(tidyr)
library(magrittr)
library(ape)
library(geiger)
library(caper)
taxonomynex <- read.nexus("taxonomyforzeldospecies.nex")
zeldodata <- read.csv("COMPLETECOPYFORR.csv")
Species <- dput(zeldodata)
SpeciesLong <-
Species %>%
gather(protein, proportion,
A:Y) %>%
arrange(Species)
S <- unique(SpeciesLong$protein)
Scombi <- unlist(lapply(seq_along(S),
function(x) combn(S, x, FUN = paste0, collapse = "")))
# Sum the `proportion` values of the rows in `data` whose `protein` entry
# matches (as a grepl pattern) somewhere inside `protein_combo`.
#
# protein_combo: a single string, e.g. "ACD".
# data: a data frame with `protein` and `proportion` columns.
joint_protein <- function(protein_combo, data) {
  # One logical per row: does this row's protein occur in the combination?
  in_combo <- vapply(
    data$protein,
    function(acid) grepl(acid, protein_combo),
    logical(1)
  )
  matched <- data$proportion[in_combo]
  sum(matched)
}
SplitSpecies <-
split(SpeciesLong,
SpeciesLong$Species)
cl <- makeCluster(detectCores() - 1)
clusterExport(cl, c("Scombi", "joint_protein"))
SpeciesAggregate <-
parLapply(cl,
X = SplitSpecies,
fun = function(data){
X <- lapply(Scombi,
joint_protein,
data)
names(X) <- Scombi
as.data.frame(X)
})
Species <- cbind(Species, SpeciesAggregate)
`
This attempts to load each combination into memory and then calculate the sum of the proportions of the acids in it, but it takes forever to run and crashes before completion.
I think it would be better to store the correlation coefficients in a vector, and then just print out the relative coefficients of each different combination for each species, but I don't know the best way of doing this in R.
I also aim to retain a phylogenetic signal using the ape package using something along the lines of this:
pglsModel <- gls(OGT ~ AminoAcidCombination, correlation = corBrownian(phy = taxonomynex),
data = zeldodata, method = "ML")
summary(pglsModel)
Apologies for how unclear this is, if anyone has any advice, much appreciated!
Edit: Link to taxonomyforzeldospecies.nex
Output from dput(Zeldodata):
1 Species OGT Domain A C D E F G H I K L M N P Q R S T V W Y
------------------------------- ----- ---------- --------------- -------------- -------------- -------------- -------------- -------------- -------------- -------------- -------------- --------------- -------------- -------------- -------------- -------------- -------------- -------------- -------------- -------------- -------------- --------------
2 Aeropyrum pernix 95 Archaea 9.7659115711 0.6720465616 4.3895390781 7.6501943794 2.9344881615 8.8666657183 1.5011817208 5.6901432494 4.1428307243 11.0604191603 2.21143353 1.9387130928 5.1038552753 1.6855017182 7.7664358772 6.266067034 4.2052190807 9.2692433532 1.318690698 3.5614200159
3 Argobacterium fabrum 26 Bacteria 11.5698896021 0.7985475923 5.5884500155 5.8165463343 4.0512504104 8.2643271309 2.0116736244 5.7962804605 3.8931525401 9.9250463349 2.5980609708 2.9846761128 4.7828063605 3.1262365491 6.5684282943 5.9454781844 5.3740045968 7.3382308193 1.2519739683 2.3149400984
4 Anaeromyxobacter dehalogenans 27 Bacteria 16.0337898849 0.8860252895 5.1368827707 6.1864992608 2.9730203513 9.3167603253 1.9360386851 2.940143349 2.3473650439 10.898494736 1.6343905351 1.5247123262 6.3580285706 2.4715303021 9.2639057482 4.1890063803 4.3992339725 8.3885969061 1.2890166336 1.8265589289
5 Aquifex aeolicus 85 Bacteria 5.8730327277 0.795341216 4.3287799008 9.6746388172 5.1386954322 6.7148035486 1.5438364179 7.3358775924 9.4641440609 10.5736658776 1.9263080969 3.6183861236 4.0518679067 2.0493569604 4.9229955632 4.7976564501 4.2005259246 7.9169763709 0.9292167138 4.1438942987
6 Archaeoglobus fulgidus 83 Archaea 7.8742687687 1.1695110027 4.9165979364 8.9548767369 4.568636662 7.2640358917 1.4998752909 7.2472039919 6.8957233203 9.4826333048 2.6014466253 3.206476915 3.8419576418 1.7789787933 5.7572748236 5.4763351139 4.1490633048 8.6330814159 1.0325605451 3.6494619148
This will give you a long data frame with each combination and its sum per Species (takes about 35 seconds on my machine)...
zeldodata <-
Species %>%
gather(protein, proportion, A:Y) %>%
group_by(Species) %>%
mutate(combo = sapply(1:n(), function(i) combn(protein, i, FUN = paste0, collapse = ""))) %>%
mutate(sum = sapply(1:n(), function(i) combn(proportion, i, FUN = sum))) %>%
unnest() %>%
select(-protein, -proportion)
An example of calculating each species separately and saving the data to disk before reading each file back in and combining them...
library(readr)
library(dplyr)
library(tidyr)
library(purrr)
# read in CSV file
zeldodata <-
read_delim(
delim = "|",
trim_ws = TRUE,
col_names = TRUE,
col_types = "cicdddddddddddddddddddd",
file = "Species | OGT | Domain | A | C | D | E | F | G | H | I | K | L | M | N | P | Q | R | S | T | V | W | Y
Aeropyrum pernix | 95 | Archaea | 9.7659115711 | 0.6720465616 | 4.3895390781 | 7.6501943794 | 2.9344881615 | 8.8666657183 | 1.5011817208 | 5.6901432494 | 4.1428307243 | 11.0604191603 | 2.21143353 | 1.9387130928 | 5.1038552753 | 1.6855017182 | 7.7664358772 | 6.266067034 | 4.2052190807 | 9.2692433532 | 1.318690698 | 3.5614200159
Argobacterium fabrum | 26 | Bacteria | 11.5698896021 | 0.7985475923 | 5.5884500155 | 5.8165463343 | 4.0512504104 | 8.2643271309 | 2.0116736244 | 5.7962804605 | 3.8931525401 | 9.9250463349 | 2.5980609708 | 2.9846761128 | 4.7828063605 | 3.1262365491 | 6.5684282943 | 5.9454781844 | 5.3740045968 | 7.3382308193 | 1.2519739683 | 2.3149400984
Anaeromyxobacter dehalogenans | 27 | Bacteria | 16.0337898849 | 0.8860252895 | 5.1368827707 | 6.1864992608 | 2.9730203513 | 9.3167603253 | 1.9360386851 | 2.940143349 | 2.3473650439 | 10.898494736 | 1.6343905351 | 1.5247123262 | 6.3580285706 | 2.4715303021 | 9.2639057482 | 4.1890063803 | 4.3992339725 | 8.3885969061 | 1.2890166336 | 1.8265589289
Aquifex aeolicus | 85 | Bacteria | 5.8730327277 | 0.795341216 | 4.3287799008 | 9.6746388172 | 5.1386954322 | 6.7148035486 | 1.5438364179 | 7.3358775924 | 9.4641440609 | 10.5736658776 | 1.9263080969 | 3.6183861236 | 4.0518679067 | 2.0493569604 | 4.9229955632 | 4.7976564501 | 4.2005259246 | 7.9169763709 | 0.9292167138 | 4.1438942987
Archaeoglobus fulgidus | 83 | Archaea | 7.8742687687 | 1.1695110027 | 4.9165979364 | 8.9548767369 | 4.568636662 | 7.2640358917 | 1.4998752909 | 7.2472039919 | 6.8957233203 | 9.4826333048 | 2.6014466253 | 3.206476915 | 3.8419576418 | 1.7789787933 | 5.7572748236 | 5.4763351139 | 4.1490633048 | 8.6330814159 | 1.0325605451 | 3.6494619148"
)
# save an RDS file for each species
for(species in unique(zeldodata$Species)) {
zeldodata %>%
filter(Species == species) %>%
gather(protein, proportion, A:Y) %>%
mutate(combo = sapply(1:n(), function(i) combn(protein, i, FUN = paste0, collapse = ""))) %>%
mutate(sum = sapply(1:n(), function(i) combn(proportion, i, FUN = sum))) %>%
unnest() %>%
select(-protein, -proportion) %>%
saveRDS(file = paste0(species, ".RDS"))
}
# read in and combine all the RDS files
zeldodata <-
list.files(pattern = "\\.RDS") %>%
map(read_rds) %>%
bind_rows()

Resources