Best way to apply code to 24 similar datasets? - r

I have 24 datasets that each have one factor and one response. I have written code to subset the 93 entries into 3 categories, but I'm not sure what the most efficient way is to run this code for all 24 of my datasets. Any ideas would be much appreciated.
Here's the data I'm working with.
dput(head(data))
structure(list(run.size.percentage = structure(c(2L, 13L, 24L,
35L, 46L, 57L), .Label = c(",2000,", "1,0.375,0.013", "10,0.868,0.11",
"11,0.953,0.12", "12,1.047,0.12", "13,1.149,0.13", "14,1.261,0.14",
"15,1.385,0.14", "16,1.520,0.15", "17,1.668,0.15", "18,1.832,0.16",
"19,2.011,0.17", "2,0.412,0.023", "20,2.207,0.17", "21,2.423,0.18",
"22,2.660,0.19", "23,2.920,0.20", "24,3.205,0.21", "25,3.519,0.22",
"26,3.863,0.24", "27,4.240,0.25", "28,4.655,0.26", "29,5.110,0.28",
"3,0.452,0.034", "30,5.610,0.30", "31,6.158,0.31", "32,6.760,0.33",
"33,7.421,0.35", "34,8.147,0.37", "35,8.943,0.39", "36,9.817,0.42",
"37,10.78,0.45", "38,11.83,0.47", "39,12.99,0.50", "4,0.496,0.049",
"40,14.26,0.53", "41,15.65,0.56", "42,17.18,0.58", "43,18.86,0.59",
"44,20.70,0.59", "45,22.73,0.58", "46,24.95,0.55", "47,27.39,0.52",
"48,30.07,0.49", "49,33.01,0.46", "5,0.545,0.061", "50,36.24,0.45",
"51,39.78,0.45", "52,43.67,0.45", "53,47.94,0.44", "54,52.62,0.42",
"55,57.77,0.38", "56,63.41,0.35", "57,69.61,0.32", "58,76.42,0.31",
"59,83.89,0.33", "6,0.598,0.072", "60,92.09,0.36", "61,101.1,0.42",
"62,111.0,0.49", "63,121.8,0.59", "64,133.7,0.74", "65,146.8,0.94",
"66,161.2,1.19", "67,176.9,1.49", "68,194.2,1.82", "69,213.2,2.18",
"7,0.656,0.083", "70,234.1,2.55", "71,256.9,2.94", "72,282.1,3.34",
"73,309.6,3.78", "74,339.9,4.25", "75,373.1,4.73", "76,409.6,5.20",
"77,449.7,5.60", "78,493.6,5.87", "79,541.9,5.93", "8,0.721,0.093",
"80,594.9,5.77", "81,653.0,5.37", "82,716.8,4.77", "83,786.9,4.03",
"84,863.9,3.21", "85,948.3,2.36", "86,1041,1.55", "87,1143,0.81",
"88,1255,0.30", "89,1377,0.056", "9,0.791,0.10", "90,1512,0.0044",
"91,1660,0", "92,1822,0"), class = "factor")), row.names = c(NA,
6L), class = "data.frame")
Here's the code that worked for each dataset.
# Split the single comma-separated column into one column per field. The new
# column names come from splitting the original column name on "."
# (e.g. "run.size.percentage" -> run, size, percentage).
# convert = TRUE turns the pieces into numbers. Without it `size` stays
# character and `size <= 2` is compared as text (e.g. "10.78" <= "2" is TRUE),
# which puts rows in the wrong group.
data2 <- tidyr::separate(
  data,
  names(data),
  unlist(strsplit(names(data), "\\.")),
  sep = ",",
  convert = TRUE
)
# Three size classes: size <= 2, 2 < size <= 50, 50 < size <= 2000.
group1 <- data2 %>% filter(size <= 2)
group2 <- data2 %>% filter(size > 2 & size <= 50)
group3 <- data2 %>% filter(size > 50 & size <= 2000)
# Total percentage per size class; na.rm = TRUE skips rows whose percentage
# field was empty (NA after conversion).
sum(group1$percentage, na.rm = TRUE)
sum(group2$percentage, na.rm = TRUE)
sum(group3$percentage, na.rm = TRUE)

Put your dataframes in a list and use lapply. I used cut to create the needed size groups, and added the convert = TRUE argument to separate so that the numbers are converted to numeric -
df_list <- list(df, df) # creating a dummy list with same df
lapply(df_list, function(x) {
  # Work on the list element `x`; referring to the global `df` here would
  # return the same result for every element of the list.
  separate(x, names(x), unlist(strsplit(names(x), "\\.")),
           sep = ",", convert = TRUE) %>%
    # Bin `size` into (0,2], (2,50], (50,2000], (2000,Inf].
    group_by(group = cut(size, breaks = c(0, 2, 50, 2000, Inf))) %>%
    # na.rm = TRUE: an empty percentage field becomes NA under convert = TRUE
    # and would otherwise turn the whole group total into NA.
    summarise(percentage = sum(percentage, na.rm = TRUE))
})
# every list element is your desired output df
[[1]]
# A tibble: 1 x 2
group percentage
<fct> <dbl>
1 (0,2] 0.252
[[2]]
# A tibble: 1 x 2
group percentage
<fct> <dbl>
1 (0,2] 0.252

Related

How to create a chord diagram in r?

I've never made a plot like this before, so sorry as this is probably a basic question, but I am stuck on how to make a chord diagram and specifically get the outer sections to be my column headings (drug mechanisms) and the inner connections between the sections to be the rows (genes) which don't need to be named in the plot as there are so many.
My data is rows of genes that are marked as interacting with columns of drug mechanisms by zeros or ones.
For example a subset of my data looks like:
Gene Diuretic Beta_blocker ACE_inhibitor
Gene1 1 0 0
Gene2 0 0 1
Gene3 1 1 1
Gene4 0 1 1
My total data is actually 700 genes for 15 columns of drug mechanisms, with all zeros and ones. I am currently just creating a chord diagram with:
# Read the gene x mechanism table from df.csv.
df <- fread('df.csv')
# Treat missing entries as "no interaction".
df[is.na(df)] <- 0
# Move the Gene column into the row names so only the 0/1 mechanism columns remain.
df <- df %>% data.frame %>% set_rownames(.$Gene) %>% dplyr::select(-Gene)
# Matrix with one row per gene and one column per mechanism.
mt <- as.matrix(df)
circos.par(gap.degree = 0.9) #set this as I was otherwise getting an error with my total data
# NOTE(review): given a matrix, chordDiagram() appears to draw a sector for
# every row name as well as every column name, which would explain why each
# gene gets its own section - confirm against the circlize documentation.
chordDiagram(mt, transparency = 0.5)
With my total data this plot looks like:
I've been getting various errors with trying to get this plot to be 15 sections only (and even just trying to get the sections to have the column names).
Is there a way for me plot a chord diagram with the sections being representative of each column? Then for genes/rows that have an interaction (a 1 in the data) for that section and any other section to be shown in the chord diagram? I don't need the gene names to be visible, I am looking to just visualize the amount of overlap between my columns/sections.
Example input data (for which my goal would be a diagram with only 3 sections, one per column, showing their overlap):
# Example input: one row per gene, one 0/1 indicator column per drug mechanism.
df <- structure(
  list(
    Gene = c("Gene1", "Gene2", "Gene3", "Gene4"),
    Diuretic = c(1L, 0L, 1L, 0L),
    Beta_blocker = c(0L, 0L, 1L, 1L),
    ACE_inhibitor = c(0L, 1L, 1L, 1L)
  ),
  row.names = c(NA, -4L),
  class = c("data.table", "data.frame")
)
If you have 15 different drug mechanisms, it would be best to count the genes that various mechanisms have in common, and use these as weightings for the links between drug effects.
Your sample data is too limited to give a feel for how this would look, but the code would be something like this:
# Build an edge list of mechanism pairs, weighted by the number of genes that
# are marked 1 for both mechanisms of the pair.
# apply(df, 1, ...) calls the function once per row (gene).
new_df <-apply(df, 1, function(x) {
# Names of the columns whose value in this row equals 1.
x <- names(df)[which(x == 1)]
# Square matrix over those names: 0 on the diagonal, 1 elsewhere.
m <- 1 - diag(length(x))
dimnames(m) <- list(x, x)
# Row/column positions of the cells below the diagonal, i.e. each unordered
# pair of this gene's mechanisms exactly once.
inds <- which(lower.tri(m), arr.ind = TRUE)
data.frame(from = x[inds[,1]], to = x[inds[,2]])}) %>%
# Stack the per-gene pair tables into one data frame.
bind_rows() %>%
# Each row counts as one shared gene ...
mutate(wt = 1) %>%
# ... and the weight of a pair is the number of rows it appears in.
group_by(from, to) %>%
summarize(wt = sum(wt), .groups = 'drop')
new_df
#> # A tibble: 3 x 3
#> from to wt
#> <chr> <chr> <dbl>
#> 1 ACE_inhibitor Beta_blocker 2
#> 2 ACE_inhibitor Diuretic 1
#> 3 Beta_blocker Diuretic 1
We can see that two genes act on both the ACE inhibitor and beta blocker mechanisms (which is what your table implies), and that a single gene links diuretic to beta blocker and diuretic to ACE inhibitor.
This produces the following rather dull chord diagram:
chordDiagram(new_df)
However, if we make a sample data set that is of the same scale as your real data, we get a more satisfactory result:
# Simulated data at the scale of the real problem: 700 genes x 15 mechanisms,
# each cell an independent 0/1 draw with probability 0.5. The seed makes the
# draw reproducible.
set.seed(123)
big_dat <- as.data.frame(matrix(rbinom(15 * 700, 1, 0.5), 700),
row.names = paste0('Gene', 1:700)) %>%
setNames(c('ACE_inhibitor', 'Diuretic', 'Beta_Blocker',
'CCB', 'Nitrate', 'K_channel', 'Aldosterone_blocker',
'Vasodilator', 'PDEI', 'Central', 'Relaxant',
'ARB', 'Alpha_blocker', 'Dopaminergic', 'Unknown'))
# Same pair-counting procedure as for the small example: for every gene (row)
# list each unordered pair of mechanisms marked 1, then count how often each
# pair occurs.
big_df <- apply(big_dat, 1, function(x) {
# Names of the columns whose value in this row equals 1.
x <- names(big_dat)[which(x == 1)]
# Square matrix over those names: 0 on the diagonal, 1 elsewhere.
m <- 1 - diag(length(x))
dimnames(m) <- list(x, x)
# Positions below the diagonal = each unordered pair once.
inds <- which(lower.tri(m), arr.ind = TRUE)
data.frame(from = x[inds[,1]], to = x[inds[,2]])}) %>%
bind_rows() %>%
mutate(wt = 1) %>%
# Keep only rows without missing values.
subset(complete.cases(.)) %>%
group_by(from, to) %>%
summarize(wt = sum(wt), .groups = 'drop')
# Draw the diagram from the from/to/wt edge list.
chordDiagram(big_df)

R - create a dual entry pivot table

I'm an R newbie, so my apologies if this is a simple question.
I use Excel a lot to create "dual entry" tables. The name "dual table" is probably not the most accurate, but I wouldn't know how else to describe it.
I basically start from big tables and then create a new one where I average the data grouping by two columns and then I display it as a matrix.
I will share with you a perfectly functional R example I coded myself.
My question is: is there an easier / better way to do it?
This is my working code:
# Build a "dual entry" table: the mean of `output_var` for every combination
# of `rows_var` (table rows) and `col_var` (table columns).
library(dplyr)
df <- mtcars
output_var <- "disp"
rows_var <- "cyl"
col_var <- "am"
output_name <- paste0("Avg. ", output_var)
# Columns are chosen by name through .data[[...]], which avoids building and
# evaluating code strings with eval(parse(text = ...)).
one_way_table <- df %>%
  group_by(.data[[rows_var]], .data[[col_var]]) %>%
  summarise(output = mean(.data[[output_var]]), .groups = "drop")
one_way_table <- data.frame(one_way_table, check.rows = FALSE,
                            check.names = FALSE, stringsAsFactors = FALSE)
colnames(one_way_table) <- c(rows_var, col_var, output_name)
unique_row_items <- unique(one_way_table[, rows_var])
unique_col_items <- unique(one_way_table[, col_var])
# Enumerate every row/column combination exactly once: each row item is
# repeated for all column items (`each`), while the column items cycle.
# Plain rep() on both sides only covers all combinations by coincidence.
x_rows <- rep(unique_row_items, each = length(unique_col_items))
y_cols <- rep(unique_col_items, times = length(unique_row_items))
new_df <- data.frame(x = x_rows, y = y_cols, check.rows = FALSE,
                     check.names = FALSE, stringsAsFactors = FALSE)
colnames(new_df) <- c(rows_var, col_var)
# all.x = TRUE keeps combinations that do not occur in the data (as NA).
new_df <- base::merge(new_df, one_way_table, by = c(rows_var, col_var),
                      all.x = TRUE)
# merge() returns the rows sorted by rows_var, then col_var, so consecutive
# values belong to the same table row: fill the matrix row by row. The
# default column-wise fill would scatter values across the wrong cells.
m <- matrix(new_df[, output_name],
            ncol = length(unique(new_df[, col_var])), byrow = TRUE)
df_matrix <- data.frame(m, check.rows = FALSE, check.names = FALSE,
                        stringsAsFactors = FALSE)
Perhaps there's a way more efficient way to do it.
Notice how, since this will be coded inside a function, I had to use variable names to define which columns I want to use for the analysis.
Thanks
A possible solution for your issue can come from tidyverse. Here an example reshaping your data and aggregating with mean:
library(tidyverse)
#Data
df <- mtcars
#Code
# 1. Stack every column except cyl and am into name/value pairs.
# 2. Keep only the rows that came from the 'disp' column.
# 3. Average the values within each cyl/am combination.
# 4. Spread the am values into columns, giving one row per cyl.
df %>% pivot_longer(cols = -c(cyl,am)) %>% filter(name=='disp') %>%
group_by(cyl,am) %>% summarise(Mean=mean(value)) %>%
pivot_wider(names_from = am,values_from=Mean)
Output:
# A tibble: 3 x 3
# Groups: cyl [3]
cyl `0` `1`
<dbl> <dbl> <dbl>
1 4 136. 93.6
2 6 205. 155
3 8 358. 326
Which is close to df_matrix the final output of your code.
If we need to pivot, this can be done in a more simple way. We select the columns of interest and use pivot_wider with values_fn specifying as mean to be applied on the columns selected on values_from
library(dplyr)
library(tidyr)
# Keep the row variable (cyl), the column variable (am) and the value (disp),
# then spread am into columns; values_fn = mean collapses the several disp
# values that fall into the same cyl/am cell into their mean.
mtcars %>%
select(cyl, am, disp) %>%
pivot_wider(names_from = am, values_from = disp, values_fn = mean)
# A tibble: 3 x 3
# cyl `1` `0`
# <dbl> <dbl> <dbl>
#1 6 155 205.
#2 4 93.6 136.
#3 8 326 358.

Expand to blank with unnest

I find purrr SUPER useful for making results tables for a bunch of different variables. I was wondering if there was a way for the unnest() function (or otherwise) to expand a high order variable into blanks, rather than just repeating.
For example, with this code:
library(tidyverse)
data <- data.frame(
group1 = sample(c('dog','cat', 'gecko'), 100, replace = T),
group2 = sample(c('hot dog', 'not hotdog', 'other'), 100, replace = T)
)
# Frequency table for one column of the global `data`.
# `var` is the column name as a string. The result is a data frame with one
# row per distinct value: `level` holds the value and `n` its count.
my_freq <- function(var) {
  counts <- table(data[[var]])
  setNames(as.data.frame(counts), c('level', 'n'))
}
the_table <- data.frame(var = c('group1', 'group2'))
the_table <- the_table %>%
mutate(
result = map(var, my_freq)
) %>%
unnest(result)
Instead of the resulting table looking like:
It would look like this:
I guess this would be a multi-level index in python, but not sure how to accomplish in r.
Extending rmagno's solution to other high order variables
...%>%
mutate_at(
.vars = vars(high_order_vars),
.funs = function(x) ifelse(duplicated(.[['var']]), NA, x)
)
Not sure what you mean by blank (in this example I am going with NA). The critical line is:
mutate(var = if_else(!duplicated(var), var, NA_integer_))
Minimal working example:
library(tidyverse)
data <- data.frame(
group1 = sample(c('dog','cat', 'gecko'), 100, replace = T),
group2 = sample(c('hot dog', 'not hotdog', 'other'), 100, replace = T)
)
# Frequency table for one column of the global `data`.
# `var` is the column name as a string. The result is a data frame with one
# row per distinct value: `level` holds the value and `n` its count.
my_freq <- function(var) {
  counts <- table(data[[var]])
  setNames(as.data.frame(counts), c('level', 'n'))
}
the_table <- data.frame(var = c('group1', 'group2'))
the_table <- the_table %>%
mutate(
result = map(var, my_freq)
) %>%
unnest(result) %>%
mutate(var = if_else(!duplicated(var), var, NA_integer_))
the_table
#> # A tibble: 6 x 3
#> var level n
#> <fct> <fct> <int>
#> 1 group1 cat 38
#> 2 <NA> dog 38
#> 3 <NA> gecko 24
#> 4 group2 hot dog 36
#> 5 <NA> not hotdog 34
#> 6 <NA> other 30
Created on 2020-02-29 by the reprex package (v0.3.0)
You could just use an lapply to get a list I call tb of the two tables. Then create a matrix with names(tb) in the first row and the rest blanks and convert it to a data frame. Finally Map assigns the desired names applying cbind on the columns of both data frames consecutively.
# One frequency table (columns level, n) per column of `data`.
tb <- lapply(data, function(x) setNames(as.data.frame(table(x)), c("level", "n")))
# Build the label columns as a matrix filled row by row: the first row holds
# the names of `tb`, every following row is "". The number of blank rows is
# taken from the first table only (el() returns the first element), and the
# factor 2 / ncol = 2 hard-code two grouping variables.
# Map() then pairs each label column with its table and cbind()s them under
# the name `var`; do.call(rbind, ...) stacks the pieces.
res <- do.call(rbind,
Map(cbind,
var=data.frame(
matrix(c(names(tb), rep("", (el(lapply(tb, nrow)) - 1)*2)),
ncol=2, byrow=TRUE)),
tb))
res
# var level n
# X1.1 group1 cat 31
# X1.2 dog 26
# X1.3 gecko 43
# X2.1 group2 hot dog 35
# X2.2 not hotdog 37
# X2.3 other 28

Count frequency of same value in several columns

I'm quite new to R and I'm facing a problem which I guess is quite easy to fix but I couldn't find the answer.
I have a dataframe called clg where basically I have 3 columns date, X1, X2.
X1 and X2 are name of country teams. X1 and X2 have the same list of countries.
I'm simply trying to count the frequency of each country in the two columns as a total.
So far, I've only been able to count the frequency of the X1 column but I didn't find a way to sum both columns.
clt <- as_tibble(na.omit(count(clg, clg$X1)))
I would like to get a data frame where in the first columns I have unique countries, and in the second column the sum of occurrences in X1 + X2.
You can use unlist() and table() to get the overall counts. Wrapping it in data.frame() will give you the desired two column output.
# Small example: three matches, each with a team in X1 and a team in X2.
clg <- data.frame(date=1:3,
X1=c("nor", "swe", "alg"),
X2=c("swe", "alg", "jpn"))
# unlist() concatenates the X1 and X2 columns into one vector, table() counts
# each country, and data.frame() turns the counts into a two-column table.
data.frame(table(unlist(clg[c("X1", "X2")])))
# Var1 Freq
# 1 alg 2
# 2 nor 1
# 3 swe 2
# 4 jpn 1
With tidyverse, we can gather into 'long' format and then do the count
library(tidyverse)
gather(clg, key, Var1, -date) %>%
count(Var1)
# A tibble: 4 x 2
# Var1 n
# <chr> <int>
#1 alg 2
#2 jpn 1
#3 nor 1
#4 swe 2
data
clg <- structure(list(date = 1:3, X1 = structure(c(2L, 3L, 1L), .Label = c("alg",
"nor", "swe"), class = "factor"), X2 = structure(c(3L, 1L, 2L
), .Label = c("alg", "jpn", "swe"), class = "factor")),
class = "data.frame", row.names = c(NA,
-3L))
You can obtain your goal with two steps. In the first step, you calculate the sum of occurrences for each country. In the next step, you're joining the two df's together and calculate the total sum.
# `df` is the data frame holding the X1 and X2 columns (`clg` in the question).
# Step 1: occurrences of each country in X1 and, separately, in X2.
X1_sum <- df %>%
  dplyr::group_by(X1) %>%
  dplyr::summarize(n_x1 = n())
X2_sum <- df %>%
  dplyr::group_by(X2) %>%
  dplyr::summarize(n_x2 = n())
# Step 2: match the country names across both tables and add up the counts.
final_summary <- X1_sum %>%
  # The key is called X1 in one table and X2 in the other, hence the named
  # `by`. full_join keeps countries that appear in only one of the columns.
  dplyr::full_join(X2_sum, by = c("X1" = "X2")) %>%
  # A country missing from one column has an NA count there; count it as 0
  # so the total is not NA.
  dplyr::mutate(n_sum = dplyr::coalesce(n_x1, 0L) + dplyr::coalesce(n_x2, 0L))

Computing average over different columns/rows in a list of data.frames

I've a list of 140 elements of type data.frame ('my.list'). I would like to compute 350 averages of certain values ranges in a certain column for a certain set of rows in a certain data.frame (this is a bit cryptic); so, 350 different averages like:
Of data.frame #1, the average of column 'Measure1', row 1:5;
Of data.frame #2, the average of column 'Measure3', row 1:4, etc. etc.
I have another data.frame ('my.dfAverage') which indicates for which data.frame, column and rows it needs the average. I want to write the 350 different averages and standard deviations to this data.frame (so with the columns: 'average_id', 'dataframe_number', 'column_name', 'row_numbers', 'average' and 'st_dev'). Some value ranges have NA's, these values can be dropped for computing the average.
What is the best way to automatically compute the 350 averages and standard deviations from the list of data.frames based on the info in this data.frame? I thought of creating a for-loop (or maybe the lapply function?), but I'm quite new to these functions, so I'm not sure what the way to go is here.
Small reproducible example of my list of data.frames:
my.df1 <- data.frame(ID = c(1:5),
Measure1 = c(2247,2247,1970,1964,1971),
Measure2 = c(2247,2247,NA,1964,1971))
my.df2 <- data.frame(ID = c(1:4),
Measure3 = c(2247,NA,1970,1964),
Measure5 = c(2247,2247,NA,1964))
my.df3 <- data.frame(ID = c(1:4),
Measure6 = c(2247,600,1970,1964),
Measure8 = c(2247,2247,NA,1964))
my.list <- list(list1 = my.df1, list2 = my.df2, list3 = my.df3)
Desired output table for the averages and standard deviation:
my.dfAverage <- data.frame(average_id = c(1:3),
dataframe_number = c(1,2,3),
column_name = c('Measure1','Measure3','Measure6'),
row_numbers = c('1:3','1:4','1:2'),
average = (NA),
st_dev = (NA))
This is a different approach than the one given above: I will use only base r functions: Point to note, ensure the data has stringsAsFactors=FALSE
Write a function, making sure you index my.list correctly, and call the summary function with NAs removed, i.e. f(..., na.rm = TRUE). Here is such a function using mapply:
# Apply a summary function `f` (e.g. mean, sd) to every range described in
# my.dfAverage and return one value per row of my.dfAverage.
#
# For each row, the data frame is picked from my.list by `dataframe_number`,
# the column by `column_name`, and the rows by `row_numbers`, a string of the
# form "start:end" (or a single row number). `f` is called with na.rm = TRUE,
# so missing values in the range are ignored.
# Reads the globals my.dfAverage and my.list.
fun1 <- function(f) {
  with(my.dfAverage, mapply(
    function(x, y, z) {
      # Parse "start:end" directly instead of evaluating the string as code.
      bounds <- as.integer(strsplit(y, ":", fixed = TRUE)[[1]])
      rows <- seq(bounds[1], bounds[length(bounds)])
      f(x[rows, z], na.rm = TRUE)
    },
    # Select the data frame named by dataframe_number rather than assuming
    # that row i of my.dfAverage refers to the i-th list element.
    my.list[dataframe_number],
    # as.character() keeps this working when the columns are factors.
    as.character(row_numbers),
    as.character(column_name)
  ))
}
transform(my.dfAverage,average=fun1(mean),st_dev=fun1(sd))
average_id dataframe_number column_name row_numbers average st_dev
1 1 1 Measure1 1:3 2154.667 159.9260
2 2 2 Measure3 1:4 2060.333 161.6859
3 3 3 Measure6 1:2 1423.500 1164.6049
Data Used:
my.dfAverage <- data.frame(average_id = c(1:3),
dataframe_number = c(1,2,3),
column_name = c('Measure1','Measure3','Measure6'),
row_numbers = c('1:3','1:4','1:2'),
average = (NA),
st_dev = (NA),stringsAsFactors = F)
A solution using tidyverse.
First, expand the my.dfAverage based on row_numbers.
library(tidyverse)
# Expand my.dfAverage to one row per requested row number.
my.dfAverage2 <- my.dfAverage %>%
# Split "start:end" into its two parts.
separate(row_numbers, into = c("start", "end")) %>%
# Replace row_numbers by the full sequence start:end (a list column) ...
mutate(row_numbers = map2(start, end, `:`)) %>%
# ... and give every element of that sequence its own row.
unnest() %>%
select(-start, -end) %>%
# Integer keys, presumably so they match the key types built for my.list.df
# before the later join - confirm.
mutate(row_numbers = as.integer(row_numbers),
dataframe_number = as.integer(dataframe_number))
Second, transform all data frames in my.list and combine them to a single data frame.
# Combine all data frames in my.list into one long data frame with the
# columns dataframe_number, row_numbers, column_name and value.
my.list.df <- my.list %>%
# Name the list elements 1, 2, ... so that .id below yields the list position.
setNames(1:length(.)) %>%
map_dfr(function(x){
# Long format: one row per (ID, measurement column) pair.
x2 <- x %>%
gather(column_name, value, -ID)
return(x2)
},.id = "dataframe_number") %>%
mutate(ID = as.integer(ID), dataframe_number = as.integer(dataframe_number)) %>%
# NOTE(review): ID is used as the row number from here on, which assumes ID
# equals the row position in every data frame - true for the example data,
# confirm for the real data.
rename(row_numbers = ID)
Third, merge my.dfAverage2 and my.list.df and calculate the mean and standard deviation. my.dfAverage3 is the final output.
# Attach the measured value to every requested (data frame, column, row)
# combination, then summarise per requested average.
my.dfAverage3 <- my.dfAverage2 %>%
left_join(my.list.df, by = c("dataframe_number", "column_name", "row_numbers")) %>%
group_by(average_id, dataframe_number, column_name) %>%
# Rebuild the "start:end" label from the smallest and largest row number of
# the group; the mean and standard deviation ignore NA values.
summarise(row_numbers = paste(min(row_numbers), max(row_numbers), sep = ":"),
average = mean(value, na.rm = TRUE),
st_dev = sd(value, na.rm = TRUE)) %>%
ungroup()
my.dfAverage3
# A tibble: 3 x 6
# average_id dataframe_number column_name row_numbers average st_dev
# <int> <int> <chr> <chr> <dbl> <dbl>
# 1 1 1 Measure1 1:3 2155 160
# 2 2 2 Measure3 1:4 2060 162
# 3 3 3 Measure6 1:2 1424 1165
DATA
my.list is the same as OP's my.list.
my.dfAverage <- data.frame(average_id = c(1:3),
dataframe_number = c(1,2,3),
column_name = c('Measure1','Measure3','Measure6'),
row_numbers = c('1:3','1:4','1:2'))

Resources