pearson correlation for genes in gene expression data - r

I have two datasets:
one is actual count and other one is predicted counts. I want to do a pearson correlation between them.
My actual count data look like this:
My predicted counts data look like this:
I want to do pearson correlation for these two datasets for each geneID.
I have written this code:
install.packages("Rcpp")
library(Rcpp)
library("reshape2")
library("ggplot2")
# import in the actual expression values and the gene predicted values
act_cts <- read.delim("GVDS_normalized_counts_2021v1.txt", header = TRUE, sep="\t")
## fix the column names
colnames(act_cts)[1]<-"gene"
# NOTE(review): this truncates EVERY header to 7 characters, not only the
# sample IDs; the "Gene_Sy" column in the sample data below looks like a
# truncated annotation column that is still present in the table.
colnames(act_cts)<- substr(colnames(act_cts), 1, 7)
pred_cts<-read.delim("GVDS_PrediXcan_Test_2021v1.txt", header=TRUE, sep="\t")
# NOTE(review): a header such as "ENSG00000182902.8" cut to 15 characters
# becomes "ENSG00000182902", while the values in act_cts$gene keep their
# version suffix -- if so, the gene keys can no longer match in merge().
# Confirm against the real headers.
colnames(pred_cts)<-substr(colnames(pred_cts), 1, 15)
## melt the predict counts, so the columns change to row entries FID, IID, gene
melt_pred_cts<-melt(pred_cts, id.vars=c("FID","IID"), variable.name="gene", value.name = "gene_exp")
## melts the actual counts, so it can be easily joined to the final prediction
# NOTE(review): only "gene" is an id variable, so any remaining annotation
# columns (Gene_Sy, Chr, Coord in the sample data) are melted as if they were
# samples; mixing text columns with numeric ones presumably turns
# act_gene_exp into character -- verify with str(melt_act_cts).
melt_act_cts<-melt(act_cts, id.vars="gene", variable.name="IID", value.name = "act_gene_exp")
# No `by` is given, so merge() joins on every column name the two tables share.
final_cts<-merge(melt_pred_cts,melt_act_cts)
## this takes a minute/ several minutes to run because it is joining on both gene and IID
# runs the Pearson correlation for each gene
all_genes<-unique(final_cts$gene)
pear_cor_all_df<- data.frame(gene=character(), pear_coeff=double())
## runs the correlation
for(g in all_genes)
{
wrk_cts_all<-final_cts[which(final_cts$gene==g),]
# temp working df for each gene
pear_coef_all<-cor(wrk_cts_all$gene_exp, wrk_cts_all$act_gene_exp, method="pearson")
# runs the correlation for each gene between gene_exp and act_gene_exp
# NOTE(review): c() on a gene name and a number yields a character vector, so
# the coefficient is stored as text; rbind() onto the empty data frame also
# grows the result one row per iteration and may not keep the intended
# column names -- check names(pear_cor_all_df) after the loop.
new_row_all<-c(g, pear_coef_all)
pear_cor_all_df<-rbind(pear_cor_all_df, new_row_all)
#saves this to the df
}
But it's not giving me the correct results.
This is data for act_count:
dput(act_counts[1:10, 1:10])
structure(list(gene = c("ENSG00000152931.6", "ENSG00000183696.9",
"ENSG00000139269.2", "ENSG00000169129.8", "ENSG00000134602.11",
"ENSG00000136237.12", "ENSG00000259425.1", "ENSG00000242284.2",
"ENSG00000235027.1", "ENSG00000228169.3"), Gene_Sy = c("ENSG00000152931.6",
"ENSG00000183696.9", "ENSG00000139269.2", "ENSG00000169129.8",
"ENSG00000134602.11", "ENSG00000136237.12", "ENSG00000259425.1",
"ENSG00000242284.2", "ENSG00000235027.1", "ENSG00000228169.3"
), Chr = c("5", "7", "12", "10", "X", "7", "15", "X", "11", "10"
), Coord = c(59783540, 48128225, 57846106, 116164515, 131157293,
22396763, 23096869, 134953994, 1781578, 116450393), HG00096 = c(0.101857770468582,
8.1838049456063, 1.19991028786682, 0.831939826228749, 27.6464223725999,
3.78850273139249, 0.0540590649819536, 0.351716382898523, 0.200791414339667,
96.1821778045089), HG00097 = c(0.0781095249582053, 5.68691050653862,
1.57357169691446, 0.0697777450667378, 24.3955715036476, 2.05096276937706,
0.112185357489692, 0.444540251941709, 0.190137938062251, 101.17926156721
), HG00099 = c(0.0489806714207954, 2.43465332606958, 0.521615781673147,
0.93108575037257, 16.4453735152148, 4.00031300285966, 0.00359181983091798,
0.227707651999832, 0.0929246302159905, 58.7830634918037), HG00100 = c(0.118597118618172,
3.83089421985197, 1.44722544015787, 0.620940765480242, 24.8066495438254,
3.27161920134705, 0.00049968321150251, 0.714112406249513, 0.108789749488722,
105.483527339859), HG00101 = c(0.00403496367614745, 6.61228835251498,
3.56579072437701, 1.66066836204679, 25.1133488775017, 1.79821591847768,
0.0293976115522442, 0.450911709524112, 0.23244822901371, 105.818192023699
), HG00102 = c(0.0109253485646219, 4.70964559086586, 1.98268073472144,
0.570481056180073, 19.2339882617972, 1.51668840574531, 0.0312661751488703,
0.491437808951175, 0.250905117203001, 136.140843495464)), row.names = c(NA,
-10L), class = c("tbl_df", "tbl", "data.frame"))
This is prd_counts:
dput(prd_counts[1:10, 1:10])
structure(list(FID = c("HG00096", "HG00097", "HG00099", "HG00100",
"HG00101", "HG00102", "HG00103", "HG00105", "HG00106", "HG00107"
), IID = c("HG00096", "HG00097", "HG00099", "HG00100", "HG00101",
"HG00102", "HG00103", "HG00105", "HG00106", "HG00107"), ENSG00000182902.8 = c(0.0223611610092831,
0.0385031316687293, -0.0682504384265577, 0.00018098416274239,
-0.045492721345375, -0.10473163051734, -0.0215970711860838, 0.060455638944161,
-0.00889260689717109, -0.102096211855105), ENSG00000183307.3 = c(0.129041336028238,
-0.13226906002202, 0.005409246530295, -0.0539556427088601, -0.00699884042001628,
-0.204743560777908, -0.0534359750800079, -0.235648260835705,
-0.10230402771496, -0.0914043464852205), ENSG00000237438.1 = c(-0.758838434524167,
-0.579236418964912, -0.695762357174973, -0.368416879945024, -0.339555280234214,
-0.809438763600528, -0.359798980325098, -0.417769387016999, -0.724636782037491,
-0.309671271758401), ENSG00000243156.2 = c(-0.58456094489168,
0.105851861253113, -0.275061563982305, -0.0406543077034047, -0.522672785138957,
-0.126100301787985, -0.288382571274346, -0.354309857822533, -0.314842662063296,
-0.141401921597711), ENSG00000099968.13 = c(0.135357355615122,
0.157616292043257, 0.180059097593111, 0.250009792099489, 0.170653230854707,
0.316157576642492, 0.314671674077333, 0.224102148083679, 0.232969333848649,
0.14963210689311), ENSG00000069998.8 = c(-0.0346986034383362,
-0.0173493017191681, 0, -0.0173493017191681, -0.645266014640116,
-0.0346986034383362, -0.0173493017191681, -0.0173493017191681,
-0.0346986034383362, 0), ENSG00000184979.8 = c(-0.160573318589815,
0.54683218159596, 0.3503062647549, 0.653899917577768, 0.321280544783323,
0.653727041876318, 0.822864620159811, 1.03780221621802, -0.195295753744408,
-0.228590172992798), ENSG00000070413.12 = c(0.775225873145799,
0.602092262450708, 1.0198591935485, 0.65587457098494, 0.306445027670957,
0.581202299884586, 0.836112660742631, 0.559373823767867, 0.46977171007116,
0.84426113999649)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))

The provided test samples will not work because there are no genes in common between act_counts and prd_counts. I took the liberty of fixing that by reassigning column names:
library(dplyr)
library(tidyr)
## Workaround for the posted samples only: they share no gene IDs, so the
## eight predicted-gene columns are relabelled with the first eight actual IDs.
colnames(prd_counts)[3:10] <- act_counts$gene[1:8]

## Long format: one row per (gene, sample) pair in each table.
acts <- pivot_longer(
  act_counts,
  cols = starts_with("HG"),
  names_to = "FID",
  values_to = "Actual"
)
prds <- pivot_longer(
  prd_counts,
  cols = starts_with("ENSG"),
  names_to = "gene",
  values_to = "Predicted"
)

## Pair actual with predicted values, then correlate them within each gene.
paired <- inner_join(acts, prds, by = c("gene", "FID"))
paired <- select(paired, gene, FID, Actual, Predicted)
summarize(group_by(paired, gene), rho = cor(Actual, Predicted))
##> # A tibble: 8 × 2
##> gene rho
##> <chr> <dbl>
##> 1 ENSG00000134602.11 -0.445
##> 2 ENSG00000136237.12 0.446
##> 3 ENSG00000139269.2 0.543
##> 4 ENSG00000152931.6 0.770
##> 5 ENSG00000169129.8 -0.802
##> 6 ENSG00000183696.9 0.405
##> 7 ENSG00000242284.2 -0.503
##> 8 ENSG00000259425.1 -0.110

Related

Estimate_richness for all phyla in phyloseq

Is there an easy way to get ASV richness for each Phylum for each Station using the estimate_richness function in phyloseq? Or is there another simple way of extracting the abundance data for each taxonomic rank and calculating richness that way?
So far I have just been subsetting individual Phyla of interest using for example:
# Restrict the phyloseq object to a single phylum.
ps.Prymnesiophyceae <- subset_taxa(ps, Phylum == "Prymnesiophyceae")
# Per-sample Shannon index and observed richness within that phylum.
# (`measures` is spelled out instead of relying on partial matching.)
alpha_diversity <- estimate_richness(ps.Prymnesiophyceae,
                                     measures = c("Shannon", "Observed"))
H <- alpha_diversity$Shannon
S1 <- alpha_diversity$Observed
# Evenness is computed as Shannon divided by log(observed richness).
S <- log(S1)
evenness <- H / S
# Attach the sample metadata of the subsetted object; the original referred to
# an undefined `Prymnesiophyceae` object here.
alpha <- cbind(Shannon = H, Richness = S1, Evenness = evenness,
               sample_data(ps.Prymnesiophyceae))
But this is rather a pain when having to do it for e.g. the top 20 phyla.
EDIT:
suggestion by #GTM works well until last step. See comment + dput:
> dput(head(sample_names(ps.transect), n=2)) c("2-1-DCM_S21_L001_R1_001.fastq", "2-1-SA_S9_L001_R1_001.fastq" )
> dput(head(alpha, n=2)) structure(list(Observed = c(31, 25), Shannon = c(2.84184012598765,
2.53358345702604), taxon = c("Prymnesiophyceae", "Prymnesiophyceae" ), sample_id = c("X2.1.DCM_S21_L001_R1_001.fastq", "X2.1.SA_S9_L001_R1_001.fastq" ), S = c(3.43398720448515,
3.2188758248682), evenness = c(0.827562817437384,
0.787101955736294)), row.names = c("X2.1.DCM_S21_L001_R1_001.fastq", "X2.1.SA_S9_L001_R1_001.fastq"), class = "data.frame")
> dput(head(smpl_data, n=1)) new("sample_data", .Data = list("001_DCM", 125L, structure(1L, .Label = "DCM", class = "factor"), structure(1L, .Label = "Transect", class = "factor"), structure(1L, .Label = "STZ", class = "factor"),
structure(1L, .Label = "STFW", class = "factor"), "Oligotrophic",
16L, -149.9978333, -29.997, 130.634, 17.1252, 35.4443, 1025.835008,
1.1968, 1e-12, 5.387, 2.8469, 52.26978546, 98.0505, 0, 0,
0.02, 0.9, 0, 0, 2069.47, 8.057, 377.3), names = c("Station_neat", "Depth_our", "Depth_bin", "Loc", "Front", "Water", "Zone", "Bottle", "Lon", "Lat", "pressure..db.", "Temperature", "Salinity", "Density_kgm.3", "Fluorescence_ugL", "PAR", "BottleO2_mLL", "CTDO2._mLL", "OxygenSat_.", "Beam_Transmission", "N_umolL", "NO3_umolL", "PO4_umolL", "SIL_umolL", "NO2_umolL", "NH4_umolL", "DIC_uMkg", "pH", "pCO2_matm"), row.names = "2-1-DCM_S21_L001_R1_001.fastq",
.S3Class = "data.frame")
You can wrap your code in a for loop to do so. I've slightly modified your code to make it a bit more flexible, see below.
require("phyloseq")
require("dplyr")
# Alpha diversity restricted to a single taxon.
#
# ps          phyloseq object to subset.
# taxon_name  value to keep in the `tax_rank` column of the taxonomy table.
# tax_rank    taxonomy column to compare against (default "Phylum").
# ...         forwarded unchanged to `estimate_richness`.
#
# Returns the `estimate_richness` table with two extra columns: `taxon`
# (the requested name) and `sample_id` (the table's row names).
estimate_diversity_for_taxon <- function(ps, taxon_name, tax_rank = "Phylum", ...){
  taxonomy <- as.data.frame(tax_table(ps))
  # Taxa whose rank is missing compare as NA; count them as non-matching.
  is_match <- taxonomy[, tax_rank] == taxon_name
  is_match <- replace(is_match, is.na(is_match), FALSE)
  # Diversity is computed on the object pruned to the matching taxa only.
  richness <- estimate_richness(prune_taxa(is_match, ps), ...)
  richness$taxon <- taxon_name
  richness$sample_id <- row.names(richness)
  return(richness)
}
# Load data
data(GlobalPatterns)
ps <- GlobalPatterns
# Estimate alpha diversity for each phylum
phyla <- get_taxa_unique(ps, taxonomic.rank = "Phylum")
phyla <- phyla[!is.na(phyla)]
# Build one table per phylum and bind them in a single call, instead of
# growing a data frame with rbind() on every iteration.
alpha_by_phylum <- lapply(phyla, function(phylum) {
  estimate_diversity_for_taxon(ps = ps,
                               taxon_name = phylum,
                               measures = c("Shannon", "Observed"))
})
# With no phyla at all, fall back to the empty data frame the loop started from.
alpha <- if (length(alpha_by_phylum) > 0) {
  do.call(rbind, alpha_by_phylum)
} else {
  data.frame()
}
# Calculate the additional alpha diversity measures
alpha$S <- log(alpha$Observed)
alpha$evenness <- alpha$Shannon/alpha$S
# Add sample data
# sample_data() returns a phyloseq S4 object, and the dput of `smpl_data` in
# the question's edit still shows class "sample_data" after as.data.frame().
# data.frame() yields a plain data frame for left_join().
smpl_data <- data.frame(sample_data(ps))
# X.SampleID is the sample-ID column of GlobalPatterns' sample data.
alpha <- left_join(alpha,
                   smpl_data,
                   by = c("sample_id" = "X.SampleID"))
This is a reproducible example with GlobalPatterns. Make sure to alter the code to match your data by replacing X.SampleID in the left join with the name of the column that contains the sample IDs in your sample_data. If there is no such column, you can create it from the row names:
# Convert the phyloseq sample_data object into a plain data frame.
smpl_data <- data.frame(sample_data(ps))
# Create the key column with `<-`; the original `<` was a comparison, so the
# column was never created.
# NOTE(review): the row names returned by estimate_richness() appear to be
# passed through make.names() (the question's dput shows "2-1-DCM_..." turned
# into "X2.1.DCM_..."), so the same transformation is applied here to keep the
# keys comparable. It is a no-op for names that are already syntactic.
smpl_data$sample_id <- make.names(row.names(smpl_data))
alpha <- left_join(alpha,
                   smpl_data,
                   by = "sample_id")

How to cbind a list of tables by one column, and suffix headings with the list item name

I've got a list of dataframes. I'd like to cbind them by the index column, sample_id. Each table has the same column headings, so I can't just cbind them otherwise I won't know which list item the columns came from. The name of the list item gives the measure used to generate them, so I'd like to suffix the column headings with the list item name.
Here's a simplified demo list of dataframes:
list_of_tables <- list(number = structure(list(sample_id = structure(1:3, levels = c("CSF_1",
"CSF_2", "CSF_4"), class = "factor"), total = c(655, 331, 271
), max = c(12, 5, 7)), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame")), concentration_cm_3 = structure(list(sample_id = structure(1:3, levels = c("CSF_1",
"CSF_2", "CSF_4"), class = "factor"), total = c(121454697, 90959097,
43080697), max = c(2050000, 2140000, 915500)), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame")), volume_nm_3 = structure(list(
sample_id = structure(1:3, levels = c("CSF_1", "CSF_2", "CSF_4"
), class = "factor"), total = c(2412783009, 1293649395, 438426087
), max = c(103500000, 117400000, 23920000)), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame")), area_nm_2 = structure(list(
sample_id = structure(1:3, levels = c("CSF_1", "CSF_2", "CSF_4"
), class = "factor"), total = c(15259297.4, 7655352.2, 3775922
), max = c(266500, 289900, 100400)), row.names = c(NA, -3L
), class = c("tbl_df", "tbl", "data.frame")))
You'll see it's a list of 4 tables, and the list item names are "number", "concentration_cm_3", "volume_nm_3", and "area_nm_2".
Using join_all from plyr I can merge them all by sample_id. However, how do I suffix with the list item name?
# Left-join every table in the demo list on sample_id (the list is called
# `list_of_tables` in the example above).
merged_tables <- plyr::join_all(list_of_tables, by = "sample_id", type = "left")
we could do it this way:
The trick is to use .id = 'id' in bind_rows which adds the name as a column. Then we could pivot:
library(dplyr)
library(tidyr)
# Stack the tables, recording each one's list name in `id`, then spread the
# measures back out so every value column carries that name as a suffix.
stacked <- bind_rows(list_of_tables, .id = 'id')
pivot_wider(stacked, names_from = id, values_from = c(total, max))
sample_id total_number total_concentration_cm_3 total_volume_nm_3 total_area_nm_2 max_number max_concentration_cm_3 max_volume_nm_3 max_area_nm_2
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 CSF_1 655 121454697 2412783009 15259297. 12 2050000 103500000 266500
2 CSF_2 331 90959097 1293649395 7655352. 5 2140000 117400000 289900
3 CSF_4 271 43080697 438426087 3775922 7 915500 23920000 100400
Probably, we may use reduce2 here with suffix option from left_join
library(dplyr)
library(purrr)
# Suffix every non-key column with its list-item name BEFORE joining.
# left_join()'s `suffix` only renames columns that clash in that particular
# join, so relying on it leaves the third table's columns unsuffixed and then
# labels them with the first table's name in the following join.
renamed_tables <- imap(list_of_tables, function(tbl, tbl_name) {
  rename_with(tbl, function(col) paste0(col, tbl_name), .cols = -sample_id)
})
# Column names are now unique, so a plain fold of left joins is enough.
reduce(renamed_tables, left_join, by = "sample_id")
Or if we want to use join_all, probably we can rename the columns before doing the join
library(stringr)
# Append each table's list name to all of its columns except the first
# (sample_id), then left-join the renamed tables on sample_id.
suffixed_tables <- imap(list_of_tables, function(tbl, tbl_name) {
  rename_with(tbl, function(col) str_c(col, tbl_name), -1)
})
plyr::join_all(suffixed_tables, by = "sample_id", type = "left")
Or use a for loop
# Suffix each table's non-key columns with its list name, then fold the
# tables together with successive left joins on sample_id.
# Expects `list_of_tables` to be a non-empty list with unique names.
suffixed <- lapply(names(list_of_tables), function(nm) {
  tbl <- list_of_tables[[nm]]
  names(tbl)[-1] <- paste0(names(tbl)[-1], nm)
  tbl
})
tmp <- Reduce(function(acc, nxt) left_join(acc, nxt, by = "sample_id"), suffixed)
tmp

Scalar multiplying two data.frames in R

I have two data.frames in R. The first one is:
cases population urbanisation density temperature h_dev_index
Austria 563.375758 10.7969091 63.07388 134.08690 13.011172 0.9898000
Belgium 109.400000 13.5885455 99.38933 443.52112 16.185297 0.9829455
Bulgaria 0.000000 5.5320606 85.84782 52.30011 20.825068 0.9669576
Croatia 2.000000 3.4548485 64.21382 60.46082 20.855372 0.9288667
The second one is:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.916e+00 7.144e-01 -2.682 0.00785 **
population -7.327e-03 1.572e-03 -4.659 5.37e-06 ***
cases 1.473e-03 1.544e-04 9.541 < 2e-16 ***
urbanisation 3.798e-03 1.962e-03 1.936 0.05410 .
density 8.132e-05 2.512e-04 0.324 0.74641
temperature -3.518e-02 8.641e-03 -4.071 6.43e-05 ***
h_dev_index 1.842e+00 8.800e-01 2.093 0.03743 *
I need to multiply each column of the first data.frame by the matching coefficient from the Estimate column of the second data.frame, and add the intercept. The result should be (for Austria):
-1.916e+00 + 563.375758*(1.473e-03)+ 10.7969091*(-7.327e-03)+ 63.07388*(3.798e-03)+134.08690*(8.132e-05)+ 13.011172*(-3.518e-02)+ 0.9898000*1.842e+00
How should I solve this problem?
Try reshaping and merging:
library(tidyverse)
#Code
# Coefficient lookup: term name (renamed to match pivot_longer's `name`
# column) and its estimate V2.
coefs <- df2 %>% select(1, 2) %>% rename(name = V1)
# The intercept is added once per row, after the weighted sum.
intercept <- df2$V2[df2$V1 == "(Intercept)"]
res <- df1 %>%
  rownames_to_column("id") %>%
  # One row per (id, variable) pair so coefficients can be matched by name.
  pivot_longer(-id) %>%
  # Explicit key: match each variable to its coefficient by name, not position.
  left_join(coefs, by = "name") %>%
  group_by(id) %>%
  summarise(Val = sum(value * V2) + intercept)
Output:
# A tibble: 4 x 2
id Val
<chr> <dbl>
1 Austria 0.451
2 Belgium -0.200
3 Bulgaria -0.578
4 Croatia -0.712
Some data used:
#Data
df1 <- structure(list(cases = c(563.375758, 109.4, 0, 2), population = c(10.7969091,
13.5885455, 5.5320606, 3.4548485), urbanisation = c(63.07388,
99.38933, 85.84782, 64.21382), density = c(134.0869, 443.52112,
52.30011, 60.46082), temperature = c(13.011172, 16.185297, 20.825068,
20.855372), h_dev_index = c(0.9898, 0.9829455, 0.9669576, 0.9288667
)), class = "data.frame", row.names = c("Austria", "Belgium",
"Bulgaria", "Croatia"))
df2 <- structure(list(V1 = c("(Intercept)", "population", "cases", "urbanisation",
"density", "temperature", "h_dev_index"), V2 = c(-1.916, -0.007327,
0.001473, 0.003798, 8.132e-05, -0.03518, 1.842), V3 = c(0.7144,
0.001572, 0.0001544, 0.001962, 0.0002512, 0.008641, 0.88), V4 = c(-2.682,
-4.659, 9.541, 1.936, 0.324, -4.071, 2.093), V5 = c("0.00785",
"5.37e-06", "<", "0.05410", "0.74641", "6.43e-05", "0.03743"),
V6 = c("**", "***", "2e-16", ".", "", "***", "*"), V7 = c("",
"", "***", "", "", "", "")), class = "data.frame", row.names = c(NA,
-7L))
Be careful about the names of your dataframes.

Make many rows to single using an index

According to this input:
structure(list(mid = c("text11", "text12", "text21", "text22",
"text23"), term = c("test", "text", "section", "2", "sending"
)), class = "data.frame", row.names = c(NA, -5L))
How can I collapse these rows into a single row per group, using mid? In mid, the leading part (text1, text2, ...) identifies the output row, and the final digit gives the position of the term within that row. The terms belonging to the same row should be merged into one string, separated by spaces.
Example out dataframe
# Expected result: one row per group, terms joined by single spaces.
data.frame(mid = c("text1", "text2"), term = c("test text", "section 2 sending"
))
This should work
library(dplyr)
library(stringr)
df <- structure(list(mid = c("text11", "text12", "text21", "text22",
"text23"), term = c("test", "text", "section", "2", "sending"
)), class = "data.frame", row.names = c(NA, -5L))
# Keep only "text" plus the first digit as the group key, then paste each
# group's terms together with single spaces.
keyed <- mutate(df, mid = str_extract(mid, "text\\d"))
summarise(group_by(keyed, mid), term = paste(term, collapse=" "))
# # A tibble: 2 x 2
# mid term
# <chr> <chr>
# 1 text1 test text
# 2 text2 section 2 sending
EDIT - to address comment
Addressing the question in the comment, the functions below will work for any case where all of the digits except the last one identify the group (i.e., 1 and 12 in the example below).
df <- structure(list(mid = c("text11", "text12", "text121", "text122", "text123"), term = c("test", "text", "section", "2", "sending")), class = "data.frame", row.names = c(NA, -5L))
# Drop the last character of `mid` to obtain the group key, then paste each
# group's terms together with single spaces.
keyed <- mutate(df, mid = str_sub(mid, 1, -2))
summarise(group_by(keyed, mid), term = paste(term, collapse=" "))
# # A tibble: 2 x 2
# mid term
# <chr> <chr>
# 1 text1 test text
# 2 text12 section 2 sending

Data summary with for loop

I am trying to use the summarySE function from package Rmisc to generate data summaries for each column in a workbook. The first column in the worksheet is the grouping variable, and I want to loop through the other columns.
I am using the following code:
library(Rmisc)
# NOTE(review): this is the loop that raises the 'as.quoted' error quoted
# below. `measurevar` receives the contents of column i (file[,i]) and
# `groupvars` a one-column data frame (file[1]), not column names or positions.
for(i in 2:ncol(file)){
# NOTE(review): `sum` is reassigned on every pass, so only the last column's
# summary would be kept; the name is also shared with base sum().
sum<-summarySE(file, measurevar = file[,i], groupvars = file[1])
}
But I keep getting the same error:
'Error in UseMethod("as.quoted") :
no applicable method for 'as.quoted' applied to an object of class
"data.frame"'
I know that file[1] is a list and should be a vector, but using unlist causes more problems. Any ideas?
Data:
structure(list(Treatment = c("SKELE", "SKELE", "SKELE", "SKELE",
"SKELE", "SKELE", "SKELE", "SKELE", "SKELE", "SKELE", "SKELE",
"SKELE", "TISSUE", "TISSUE", "TISSUE", "TISSUE", "TISSUE", "TISSUE",
"TISSUE", "TISSUE", "TISSUE", "TISSUE", "TISSUE", "TISSUE"),
`% lipid in skeleton` = c(21.8706902567934, 31.1736436075643,
62.2246234617322, 86.6248675033794, 46.5607971373041, 34.7532319115317,
32.7686161366371, 6.73685660233744, 33.7111477556584, 48.8970450055359,
54.3687328279357, 48.9086732773318, 78.1293097432066, 68.8263563924357,
37.7753765382678, 13.3751324966206, 53.4392028626959, 65.2467680884683,
67.2313838633629, 93.2631433976626, 66.2888522443416, 51.1029549944641,
45.6312671720643, 51.0913267226682), `% ash in skeleton` = c(97.370981485225,
98.6169174273543, 99.2417548180554, 99.1330769035889, 98.5523872323069,
98.0077944962001, 97.7848485294277, 98.0738823145836, 98.1567971208113,
98.8047064451889, 97.1790753033603, 98.7503991978965, 2.62901851477497,
1.38308257264571, 0.75824518194458, 0.866923096411125, 1.44761276769314,
1.99220550379987, 2.2151514705723, 1.92611768541643, 1.84320287918869,
1.19529355481109, 2.82092469663973, 1.24960080210352), `% tissue in skeleton` = c(55.2224357342865,
70.022864703591, 77.5880978578982, 83.1168129092154, 67.3012504898307,
62.1455896726595, 64.2488985210074, 67.3089347382539, 59.9276126303114,
70.5681668501146, 67.717146912379, 68.8185249866557, 44.7775642657135,
29.977135296409, 22.4119021421018, 16.8831870907846, 32.6987495101694,
37.8544103273405, 35.7511014789926, 32.6910652617461, 40.0723873696886,
29.4318331498854, 32.2828530876211, 31.1814750133443)), class = "data.frame", row.names = c(NA,
-24L), .Names = c("Treatment", "% lipid in skeleton", "% ash in skeleton",
"% tissue in skeleton"))
We need a data.frame to apply summarySE. Using lapply, loop through the sequence of columns, subset 'file', and specify measurevar and groupvars (based on the index of the columns):
# One summary per measurement column: hand summarySE a two-column frame
# (grouping column + column j) and refer to both columns by position.
measure_cols <- 2:ncol(file)
lapply(measure_cols, function(j) {
  pair <- file[c(1, j)]
  summarySE(pair, measurevar = 2, groupvars = 1)
})
#[[1]]
# Treatment N 2 sd se ci
#1 SKELE 12 42.38324 20.53836 5.928913 13.04945
#2 TISSUE 12 57.61676 20.53836 5.928913 13.04945
#[[2]]
# Treatment N 2 sd se ci
#1 SKELE 12 98.306052 0.6567242 0.1895799 0.4172626
#2 TISSUE 12 1.693948 0.6567242 0.1895799 0.4172626
#[[3]]
# Treatment N 2 sd se ci
#1 SKELE 12 67.83219 7.442443 2.148448 4.728703
#2 TISSUE 12 32.16781 7.442443 2.148448 4.728703
Or if we are using the OP's method
# Preallocate one slot per measurement column (columns 2..ncol(file)).
lst <- vector("list", ncol(file) - 1)
# seq_len() keeps the loop empty when there is no measurement column,
# whereas 2:ncol(file) would count down from 2 to 1.
for (i in seq_len(ncol(file) - 1) + 1L) {
  # Column i fills slot i - 1; writing to lst[[i]] would leave slot 1 empty
  # and grow the list past its preallocated length.
  lst[[i - 1]] <- summarySE(file, measurevar = i, groupvars = 1)
}
lst
Note that we can also specify the names instead of index
# Same loop, passing column names instead of positions. Column i is stored
# in slot i - 1 so the results line up with the ncol(file) - 1 slots of `lst`.
for (i in seq_len(ncol(file) - 1) + 1L) {
  lst[[i - 1]] <- summarySE(file, measurevar = names(file)[i], groupvars = names(file)[1])
}
In the OP's code, the measurevar and groupvars are taking the values of the columns instead of the column name.

Resources