Data summary with for loop - r

I am trying to use the summarySE function from package Rmisc to generate data summaries for each column in a workbook. The first column in the worksheet is the grouping variable, and I want to loop through the other columns.
I am using the following code:
library(Rmisc)
for(i in 2:ncol(file)){
sum<-summarySE(file, measurevar = file[,i], groupvars = file[1])
}
But I keep getting the same error:
'Error in UseMethod("as.quoted") :
no applicable method for 'as.quoted' applied to an object of class
"data.frame"'
I know that file[1] is a list and should be a vector, but using unlist causes more problems. Any ideas?
Data:
structure(list(Treatment = c("SKELE", "SKELE", "SKELE", "SKELE",
"SKELE", "SKELE", "SKELE", "SKELE", "SKELE", "SKELE", "SKELE",
"SKELE", "TISSUE", "TISSUE", "TISSUE", "TISSUE", "TISSUE", "TISSUE",
"TISSUE", "TISSUE", "TISSUE", "TISSUE", "TISSUE", "TISSUE"),
`% lipid in skeleton` = c(21.8706902567934, 31.1736436075643,
62.2246234617322, 86.6248675033794, 46.5607971373041, 34.7532319115317,
32.7686161366371, 6.73685660233744, 33.7111477556584, 48.8970450055359,
54.3687328279357, 48.9086732773318, 78.1293097432066, 68.8263563924357,
37.7753765382678, 13.3751324966206, 53.4392028626959, 65.2467680884683,
67.2313838633629, 93.2631433976626, 66.2888522443416, 51.1029549944641,
45.6312671720643, 51.0913267226682), `% ash in skeleton` = c(97.370981485225,
98.6169174273543, 99.2417548180554, 99.1330769035889, 98.5523872323069,
98.0077944962001, 97.7848485294277, 98.0738823145836, 98.1567971208113,
98.8047064451889, 97.1790753033603, 98.7503991978965, 2.62901851477497,
1.38308257264571, 0.75824518194458, 0.866923096411125, 1.44761276769314,
1.99220550379987, 2.2151514705723, 1.92611768541643, 1.84320287918869,
1.19529355481109, 2.82092469663973, 1.24960080210352), `% tissue in skeleton` = c(55.2224357342865,
70.022864703591, 77.5880978578982, 83.1168129092154, 67.3012504898307,
62.1455896726595, 64.2488985210074, 67.3089347382539, 59.9276126303114,
70.5681668501146, 67.717146912379, 68.8185249866557, 44.7775642657135,
29.977135296409, 22.4119021421018, 16.8831870907846, 32.6987495101694,
37.8544103273405, 35.7511014789926, 32.6910652617461, 40.0723873696886,
29.4318331498854, 32.2828530876211, 31.1814750133443)), class = "data.frame", row.names = c(NA,
-24L), .Names = c("Treatment", "% lipid in skeleton", "% ash in skeleton",
"% tissue in skeleton"))

We need a data.frame to apply summarySE. Using lapply, loop through the sequence of columns, subset 'file', and specify measurevar and groupvars (based on the index of the columns):
# One summary table per measurement column. Each call receives a two-column
# subset (grouping column + current measurement), so positions 1 and 2 always
# denote the grouping variable and the measured variable respectively.
lapply(2:ncol(file), function(j) {
  pair <- file[c(1, j)]
  summarySE(pair, measurevar = 2, groupvars = 1)
})
#[[1]]
# Treatment N 2 sd se ci
#1 SKELE 12 42.38324 20.53836 5.928913 13.04945
#2 TISSUE 12 57.61676 20.53836 5.928913 13.04945
#[[2]]
# Treatment N 2 sd se ci
#1 SKELE 12 98.306052 0.6567242 0.1895799 0.4172626
#2 TISSUE 12 1.693948 0.6567242 0.1895799 0.4172626
#[[3]]
# Treatment N 2 sd se ci
#1 SKELE 12 67.83219 7.442443 2.148448 4.728703
#2 TISSUE 12 32.16781 7.442443 2.148448 4.728703
Or if we are using the OP's method
# Collect one summarySE table per measurement column; column 1 of `file`
# is the grouping variable, columns 2..ncol(file) are the measurements.
n_measures <- ncol(file) - 1
lst <- vector("list", n_measures)
# seq_len() yields an empty sequence when there are no measurement columns,
# whereas 2:ncol(file) would count backwards (2, 1) for a one-column frame.
for (k in seq_len(n_measures)) {
  # Measurement k lives in column k + 1. Storing at slot k (not at the column
  # index) fills the pre-allocated list exactly, with no leading NULL entry.
  lst[[k]] <- summarySE(file, measurevar = k + 1, groupvars = 1)
}
lst
Note that we can also specify the names instead of index
# Same loop as before, but addressing the columns by name instead of position.
for (i in 2:ncol(file)) {
  # Column i is the (i - 1)-th measurement: store it at slot i - 1 so the
  # results occupy lst[[1]] .. lst[[ncol(file) - 1]] without a NULL at slot 1.
  lst[[i - 1]] <- summarySE(file,
                            measurevar = names(file)[i],
                            groupvars = names(file)[1])
}
In the OP's code, the measurevar and groupvars are taking the values of the columns instead of the column name.

Related

Estimate_richness for all phyla in phyloseq

Is there an easy way to get ASV richness for each Phylum for each Station using the estimate_richness function in phyloseq? Or is there another simple way of extracting the abundance data for each taxonomic rank and calculating richness that way?
So far I have just been subsetting individual Phyla of interest using for example:
# Restrict the phyloseq object to a single phylum
ps.Prymnesiophyceae <- subset_taxa(ps, Phylum == "Prymnesiophyceae")
# Shannon index and observed richness per sample for that phylum
alpha_diversity <- estimate_richness(ps.Prymnesiophyceae,
                                     measures = c("Shannon", "Observed"))
H <- alpha_diversity$Shannon
S1 <- alpha_diversity$Observed
S <- log(S1)
# Evenness as Shannon divided by log(observed richness)
evenness <- H / S
# Attach the sample metadata of the subsetted object; the original referred to
# an undefined object `Prymnesiophyceae` instead of `ps.Prymnesiophyceae`.
alpha <- cbind(Shannon = H, Richness = S1, Evenness = evenness,
               sample_data(ps.Prymnesiophyceae))
But this is rather a pain when having to do it for e.g. the top 20 phyla.
EDIT:
The suggestion by @GTM works well until the last step. See comment + dput:
> dput(head(sample_names(ps.transect), n=2)) c("2-1-DCM_S21_L001_R1_001.fastq", "2-1-SA_S9_L001_R1_001.fastq" )
> dput(head(alpha, n=2)) structure(list(Observed = c(31, 25), Shannon = c(2.84184012598765,
2.53358345702604), taxon = c("Prymnesiophyceae", "Prymnesiophyceae" ), sample_id = c("X2.1.DCM_S21_L001_R1_001.fastq", "X2.1.SA_S9_L001_R1_001.fastq" ), S = c(3.43398720448515,
3.2188758248682), evenness = c(0.827562817437384,
0.787101955736294)), row.names = c("X2.1.DCM_S21_L001_R1_001.fastq", "X2.1.SA_S9_L001_R1_001.fastq"), class = "data.frame")
> dput(head(smpl_data, n=1)) new("sample_data", .Data = list("001_DCM", 125L, structure(1L, .Label = "DCM", class = "factor"), structure(1L, .Label = "Transect", class = "factor"), structure(1L, .Label = "STZ", class = "factor"),
structure(1L, .Label = "STFW", class = "factor"), "Oligotrophic",
16L, -149.9978333, -29.997, 130.634, 17.1252, 35.4443, 1025.835008,
1.1968, 1e-12, 5.387, 2.8469, 52.26978546, 98.0505, 0, 0,
0.02, 0.9, 0, 0, 2069.47, 8.057, 377.3), names = c("Station_neat", "Depth_our", "Depth_bin", "Loc", "Front", "Water", "Zone", "Bottle", "Lon", "Lat", "pressure..db.", "Temperature", "Salinity", "Density_kgm.3", "Fluorescence_ugL", "PAR", "BottleO2_mLL", "CTDO2._mLL", "OxygenSat_.", "Beam_Transmission", "N_umolL", "NO3_umolL", "PO4_umolL", "SIL_umolL", "NO2_umolL", "NH4_umolL", "DIC_uMkg", "pH", "pCO2_matm"), row.names = "2-1-DCM_S21_L001_R1_001.fastq",
.S3Class = "data.frame")
You can wrap your code in a for loop to do so. I've slightly modified your code to make it a bit more flexible, see below.
require("phyloseq")
require("dplyr")
# Alpha diversity for a single taxon at a given taxonomic rank.
#   ps         phyloseq object
#   taxon_name value to look for in the `tax_rank` column of the taxonomy table
#   tax_rank   name of the taxonomy-table column to match against
#   ...        forwarded unchanged to `estimate_richness`
# Returns the `estimate_richness` table with two extra columns:
# `taxon` (the requested taxon) and `sample_id` (the table's row names).
estimate_diversity_for_taxon <- function(ps, taxon_name, tax_rank = "Phylum", ...){
  # Flag the taxa whose rank entry equals the requested taxon; entries with a
  # missing rank compare to NA and are treated as non-matching.
  ranks <- as.data.frame(tax_table(ps))
  is_match <- ranks[,tax_rank] == taxon_name
  is_match <- !is.na(is_match) & is_match
  # Diversity measures on the pruned object, tagged with taxon and sample id
  richness <- estimate_richness(prune_taxa(is_match, ps), ...)
  richness$taxon <- taxon_name
  richness$sample_id <- row.names(richness)
  richness
}
# Load data
data(GlobalPatterns)
ps <- GlobalPatterns
# Unique, non-missing phylum names present in the data
phyla <- get_taxa_unique(ps, taxonomic.rank = "Phylum")
phyla <- phyla[!is.na(phyla)]
# Estimate alpha diversity for each phylum. Build one table per phylum and
# bind them once at the end; calling rbind() inside a loop re-copies all the
# rows accumulated so far on every iteration.
per_phylum <- lapply(phyla, function(phylum) {
  # `measures` is spelled out in full: it is forwarded through `...` to
  # estimate_richness(), where `measure` only worked via partial matching.
  estimate_diversity_for_taxon(ps = ps,
                               taxon_name = phylum,
                               measures = c("Shannon", "Observed"))
})
alpha <- do.call(rbind, per_phylum)
# Calculate the additional alpha diversity measures
alpha$S <- log(alpha$Observed)
alpha$evenness <- alpha$Shannon / alpha$S
# Add sample data, matching the diversity table's sample_id to X.SampleID
smpl_data <- as.data.frame(sample_data(ps))
alpha <- left_join(alpha,
                   smpl_data,
                   by = c("sample_id" = "X.SampleID"))
This is a reproducible example with GlobalPatterns. Make sure to alter the code to match your data by replacing X.SampleID in the left join with the name of the column that contains the sample IDs in your sample_data. If there is no such column, you can create it from the row names:
smpl_data <- as.data.frame(sample_data(ps))
# Copy the row names into a regular column to join on. This has to be an
# assignment (`<-`): a bare `<` merely compares and discards the result, so
# the `sample_id` column would never be created and the join would fail.
smpl_data$sample_id <- row.names(smpl_data)
alpha <- left_join(alpha,
                   smpl_data,
                   by = "sample_id")

pearson correlation for genes in gene expression data

I have two datasets:
one is actual count and other one is predicted counts. I want to do a pearson correlation between them.
My actual count data look like this:
My predicted counts data look like this:
I want to do pearson correlation for these two datasets for each geneID.
I have written this code:
install.packages("Rcpp")
library(Rcpp)
library("reshape2")
library("ggplot2")
# import in the actual expression values and the gene predicted values
act_cts <- read.delim("GVDS_normalized_counts_2021v1.txt", header = TRUE, sep="\t")
## fix the column names
colnames(act_cts)[1]<-"gene"
colnames(act_cts)<- substr(colnames(act_cts), 1, 7)
pred_cts<-read.delim("GVDS_PrediXcan_Test_2021v1.txt", header=TRUE, sep="\t")
colnames(pred_cts)<-substr(colnames(pred_cts), 1, 15)
## melt the predict counts, so the columns change to row entries FID, IID, gene
melt_pred_cts<-melt(pred_cts, id.vars=c("FID","IID"), variable.name="gene", value.name = "gene_exp")
## melts the actual counts, so it can be easily joined to the final prediction
melt_act_cts<-melt(act_cts, id.vars="gene", variable.name="IID", value.name = "act_gene_exp")
final_cts<-merge(melt_pred_cts,melt_act_cts)
## this takes a minute/ several minutes to run because it is joining on both gene and IID
# runs the Pearson correlation for each gene
all_genes<-unique(final_cts$gene)
pear_cor_all_df<- data.frame(gene=character(), pear_coeff=double())
## runs the correlation
for(g in all_genes)
{
wrk_cts_all<-final_cts[which(final_cts$gene==g),]
# temp working df for each gene
pear_coef_all<-cor(wrk_cts_all$gene_exp, wrk_cts_all$act_gene_exp, method="pearson")
# runs the correlation for each gene between gene_exp and act_gene_exp
new_row_all<-c(g, pear_coef_all)
pear_cor_all_df<-rbind(pear_cor_all_df, new_row_all)
#saves this to the df
}
But its not giving me the correct results.
This is data for act_count:
dput(act_counts[1:10, 1:10])
structure(list(gene = c("ENSG00000152931.6", "ENSG00000183696.9",
"ENSG00000139269.2", "ENSG00000169129.8", "ENSG00000134602.11",
"ENSG00000136237.12", "ENSG00000259425.1", "ENSG00000242284.2",
"ENSG00000235027.1", "ENSG00000228169.3"), Gene_Sy = c("ENSG00000152931.6",
"ENSG00000183696.9", "ENSG00000139269.2", "ENSG00000169129.8",
"ENSG00000134602.11", "ENSG00000136237.12", "ENSG00000259425.1",
"ENSG00000242284.2", "ENSG00000235027.1", "ENSG00000228169.3"
), Chr = c("5", "7", "12", "10", "X", "7", "15", "X", "11", "10"
), Coord = c(59783540, 48128225, 57846106, 116164515, 131157293,
22396763, 23096869, 134953994, 1781578, 116450393), HG00096 = c(0.101857770468582,
8.1838049456063, 1.19991028786682, 0.831939826228749, 27.6464223725999,
3.78850273139249, 0.0540590649819536, 0.351716382898523, 0.200791414339667,
96.1821778045089), HG00097 = c(0.0781095249582053, 5.68691050653862,
1.57357169691446, 0.0697777450667378, 24.3955715036476, 2.05096276937706,
0.112185357489692, 0.444540251941709, 0.190137938062251, 101.17926156721
), HG00099 = c(0.0489806714207954, 2.43465332606958, 0.521615781673147,
0.93108575037257, 16.4453735152148, 4.00031300285966, 0.00359181983091798,
0.227707651999832, 0.0929246302159905, 58.7830634918037), HG00100 = c(0.118597118618172,
3.83089421985197, 1.44722544015787, 0.620940765480242, 24.8066495438254,
3.27161920134705, 0.00049968321150251, 0.714112406249513, 0.108789749488722,
105.483527339859), HG00101 = c(0.00403496367614745, 6.61228835251498,
3.56579072437701, 1.66066836204679, 25.1133488775017, 1.79821591847768,
0.0293976115522442, 0.450911709524112, 0.23244822901371, 105.818192023699
), HG00102 = c(0.0109253485646219, 4.70964559086586, 1.98268073472144,
0.570481056180073, 19.2339882617972, 1.51668840574531, 0.0312661751488703,
0.491437808951175, 0.250905117203001, 136.140843495464)), row.names = c(NA,
-10L), class = c("tbl_df", "tbl", "data.frame"))
This is prd_counts:
dput(prd_counts[1:10, 1:10])
structure(list(FID = c("HG00096", "HG00097", "HG00099", "HG00100",
"HG00101", "HG00102", "HG00103", "HG00105", "HG00106", "HG00107"
), IID = c("HG00096", "HG00097", "HG00099", "HG00100", "HG00101",
"HG00102", "HG00103", "HG00105", "HG00106", "HG00107"), ENSG00000182902.8 = c(0.0223611610092831,
0.0385031316687293, -0.0682504384265577, 0.00018098416274239,
-0.045492721345375, -0.10473163051734, -0.0215970711860838, 0.060455638944161,
-0.00889260689717109, -0.102096211855105), ENSG00000183307.3 = c(0.129041336028238,
-0.13226906002202, 0.005409246530295, -0.0539556427088601, -0.00699884042001628,
-0.204743560777908, -0.0534359750800079, -0.235648260835705,
-0.10230402771496, -0.0914043464852205), ENSG00000237438.1 = c(-0.758838434524167,
-0.579236418964912, -0.695762357174973, -0.368416879945024, -0.339555280234214,
-0.809438763600528, -0.359798980325098, -0.417769387016999, -0.724636782037491,
-0.309671271758401), ENSG00000243156.2 = c(-0.58456094489168,
0.105851861253113, -0.275061563982305, -0.0406543077034047, -0.522672785138957,
-0.126100301787985, -0.288382571274346, -0.354309857822533, -0.314842662063296,
-0.141401921597711), ENSG00000099968.13 = c(0.135357355615122,
0.157616292043257, 0.180059097593111, 0.250009792099489, 0.170653230854707,
0.316157576642492, 0.314671674077333, 0.224102148083679, 0.232969333848649,
0.14963210689311), ENSG00000069998.8 = c(-0.0346986034383362,
-0.0173493017191681, 0, -0.0173493017191681, -0.645266014640116,
-0.0346986034383362, -0.0173493017191681, -0.0173493017191681,
-0.0346986034383362, 0), ENSG00000184979.8 = c(-0.160573318589815,
0.54683218159596, 0.3503062647549, 0.653899917577768, 0.321280544783323,
0.653727041876318, 0.822864620159811, 1.03780221621802, -0.195295753744408,
-0.228590172992798), ENSG00000070413.12 = c(0.775225873145799,
0.602092262450708, 1.0198591935485, 0.65587457098494, 0.306445027670957,
0.581202299884586, 0.836112660742631, 0.559373823767867, 0.46977171007116,
0.84426113999649)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
The provided test samples will not work because there are no genes in common between act_counts and prd_counts. I took the liberty of fixing that by reassigning column names:
library(dplyr)
library(tidyr)
## The test samples share no gene IDs between the two tables, so relabel the
## eight prediction columns with the first eight gene IDs of the actual counts
colnames(prd_counts)[3:10] <- act_counts$gene[1:8]
## Long format of the actual counts: the HG* sample columns become FID/Actual
acts <- pivot_longer(
  act_counts,
  cols = starts_with("HG"),
  names_to = "FID",
  values_to = "Actual"
)
## Long format of the predictions: the ENSG* gene columns become gene/Predicted
prds <- pivot_longer(
  prd_counts,
  cols = starts_with("ENSG"),
  names_to = "gene",
  values_to = "Predicted"
)
## Pair actual with predicted values on gene and sample, then compute one
## correlation per gene (cor() uses Pearson by default)
acts |>
  inner_join(prds, by = c("gene", "FID")) |>
  select(gene, FID, Actual, Predicted) |>
  group_by(gene) |>
  summarize(rho = cor(Actual, Predicted))
##> # A tibble: 8 × 2
##> gene rho
##> <chr> <dbl>
##> 1 ENSG00000134602.11 -0.445
##> 2 ENSG00000136237.12 0.446
##> 3 ENSG00000139269.2 0.543
##> 4 ENSG00000152931.6 0.770
##> 5 ENSG00000169129.8 -0.802
##> 6 ENSG00000183696.9 0.405
##> 7 ENSG00000242284.2 -0.503
##> 8 ENSG00000259425.1 -0.110

Plotting different rows as different lines in R with matplot

I would like to plot different rows as different lines in the same plot to illustrate the movements of the average development of 3 groups: All, Men and Women. However, I'm not getting one of the lines printed and the legend is not being filled with the rownames.
I'll be glad for a solution, either in matplot or in ggplot.
Thank you!
Code:
matplot(t(Market_Work), type = 'l', xaxt = 'n', xlab = "Time Period", ylab = "Average", main ="Market Work")
legend("right", legend = seq_len(nrow(Market_Work)), fill=seq_len(nrow(Market_Work)))
axis(1, at = 1:6, colnames(Market_Work))
Data:
2003-2005 2006-2008 2009-2010 2011-2013 2014-2016 2017-2018
All 31.48489 32.53664 30.41938 30.53870 31.15550 31.77960
Men 37.38654 38.16698 35.10247 35.65543 36.54855 36.72496
Women 31.48489 32.53664 30.41938 30.53870 31.15550 31.77960
> dput(Market_Work)
structure(list(`2003-2005` = c(31.4848853173555, 37.3865421137,
31.4848853173555), `2006-2008` = c(32.5366433161048, 38.1669798351148,
32.5366433161048), `2009-2010` = c(30.4193794808191, 35.1024661973137,
30.4193794808191), `2011-2013` = c(30.5387012166381, 35.6554329405739,
30.5387012166381), `2014-2016` = c(31.1555032381292, 36.5485451138792,
31.1555032381292), `2017-2018` = c(31.7795953402235, 36.7249638612854,
31.7795953402235)), row.names = c("All", "Men", "Women"), class = "data.frame")
Here is an example with ggplot2. I changed some of your data, as two rows were identical in your original data.
library(tidyverse)
df <- structure(list(`2003-2005` = c(31.4848853173555, 37.3865421137,
30.4848853173555), `2006-2008` = c(32.5366433161048, 38.1669798351148,
30.5366433161048), `2009-2010` = c(30.4193794808191, 35.1024661973137,
33.4193794808191), `2011-2013` = c(30.5387012166381, 35.6554329405739,
33.5387012166381), `2014-2016` = c(31.1555032381292, 36.5485451138792,
30.1555032381292), `2017-2018` = c(31.7795953402235, 36.7249638612854,
30.7795953402235)), row.names = c("All", "Men", "Women"), class = "data.frame")
# Transpose so that each time period is a row and each group a column,
# then keep the period labels (the row names) as an explicit column
df2 <- as.data.frame(t(df))
df2[["Year"]] <- rownames(df2)
# Long format: one row per (Year, Category) pair with its value
long_df <- pivot_longer(df2, c(All, Men, Women), names_to = "Category")
# One line per category; `group` is needed because Year is a discrete axis
ggplot(long_df, aes(x = Year, y = value)) +
  geom_line(aes(group = Category, color = Category))

Automatically split function output (list) into component data.frames

I have a function which yields 2 dataframes. As functions can only return one object, I combined these dataframes into a list. However, I need to work with both dataframes separately. Is there a way to automatically split the list into the component dataframes, or to write the function in a way that both objects are returned separately?
The function:
install.packages("plyr")
require(plyr)
# Join `x` and `y` and log the row counts before and after the join.
#   x, y  data frames to join
#   z     name under which the joined data frame is returned
#   crit  join key(s), passed to join() as `by`
#   typ   join type, passed to join() as `type`
#   doc   existing log table to append to (defaults to `checkmerge`)
# Returns a list with the extended log as "checkmerge" and the joined data
# frame under the name given in `z`.
fun.docmerge <- function(x, y, z, crit, typ, doc = checkmerge) {
  # Label "<x expression> + <y expression> = <z>", built from the unevaluated
  # argument expressions, identifies this merge in the log
  label <- paste(deparse(substitute(x)), "+",
                 deparse(substitute(y)), "=", z)
  # Log row: number of rows in x before the join
  before <- data.frame(mergedat = label, countdat = nrow(x))
  merged <- join(x, y, by = crit, type = typ)
  # Log row: number of rows after the join
  after <- data.frame(mergedat = label, countdat = nrow(merged))
  out <- list()
  out[["checkmerge"]] <- rbind(doc, before, after)
  out[[z]] <- merged
  out
}
This is the call to the function, saving the result list to the new object results.
results <- fun.docmerge(x = df1, y = df2, z = "df3", crit = c("id"), typ = "left")
In the following sample data to replicate the problem:
df1 <- structure(list(id = c("XXX1", "XXX2", "XXX3",
"XXX4"), tr.isincode = c("ISIN1", "ISIN2",
"ISIN3", "ISIN4")), .Names = c("id", "isin"
), row.names = c(NA, 4L), class = "data.frame")
df2 <- structure(list(id= c("XXX1", "XXX5"), wrong= c(1L,
1L)), .Names = c("id", "wrong"), row.names = 1:2, class = "data.frame")
checkmerge <- structure(list(mergedat = structure(integer(0), .Label = character(0), class = "factor"),
countdat = numeric(0)), .Names = c("mergedat", "countdat"
), row.names = integer(0), class = "data.frame")
In the example, a list with the dataframes df3 and checkmerge are returned. I would need both dataframes separately. I know that I could do it via manual assignment (e.g., checkmerge <- results$checkmerge) but I want to eliminate manual changes as much as possible and am therefore looking for an automated way.

colored categories in r wordclouds

Using the wordcloud package in R I would like to color different words according to a categorical variable in the dataset. Say my data is as follows:
name weight group
1 Aba 10 x
2 Bcd 20 y
3 Cde 30 z
4 Def 5 x
And here as a dput:
dat <- structure(list(name = c("Aba", "Bcd", "Cde", "Def"), weight = c(10,
20, 30, 5), group= c("x", "y", "z", "x")), .Names = c("name",
"weight", "group"), row.names = c(NA, -4L), class = "data.frame")
Is there a way in wordcloud() to color the names by their group (x, y, z) or should I use different software/packages?
It will automatically choose from a color list based on frequency or by word order if ordered.colors is specified.
# Parallel vectors: one word, one weight and one color per position
name <- c("Aba", "Bcd", "Cde", "Def")
weight <- c(10, 20, 30, 5)
colorlist <- c("red", "blue", "green", "red")
# ordered.colors = TRUE assigns colors to the words in order
wordcloud(name, weight, colors = colorlist, ordered.colors = TRUE)
The example above works for independent variables. In a data frame, your color specification will be stored as a factor, and it will have to be converted to text by wrapping it in as.character like this:
wordcloud(df$name, df$weight, colors=as.character(df$color), ordered.colors=TRUE)
If you just have factors and not a list of colors, you can generate a parallel colorlist with a couple of lines.
# Group label of each word; defined first because the palette below is sized
# from it (the original used `group` before it was assigned)
group <- c("x", "y", "z", "x")
# General solution for any number of categories: one color per distinct group
basecolors <- rainbow(length(unique(group)))
# Solution for known categories: a hand-picked palette, replacing the
# generated one above
basecolors <- c("red", "green", "blue")
# Find the position of each group in the list of distinct groups and select
# the color at that position, giving one color per word
colorlist <- basecolors[match(group, unique(group))]

Resources