Okay, I have a presence/absence matrix of 6 samples with 25 possibilities of presence/absence.
I've been able to make a cluster dendrogram with the data, but I'd rather have the distance matrix plotted in a way that looks better and is easier to analyse. (Maybe a cluster plot or something similar?)
I'm really stuck with figuring out the next part - I've spent days searching on here and various other Google searches but nothing is turning up!
Here's the code I've got for the cluster dendrogram:
# Read the presence/absence table. The first column (Sample) holds the sample
# names; the remaining columns are 0/1 indicators.
matrix <- read.csv("Horizontal.csv")
# Drop the Sample column before computing distances: dist() would otherwise
# coerce it to NA (with a warning) and rescale the distances to compensate.
distance <- dist(matrix[, -1])
# Hierarchical clustering on the sample-to-sample distances.
hc.m <- hclust(distance)
# Dendrogram with the sample names as leaf labels.
plot(hc.m, labels = matrix$Sample, main = "", cex.main = 0.8, cex.lab = 1.1)
Help!
> dput(head(matrix,20))structure(list(Sample = structure(1:6, .Label = c("CL1", "CL2",
"CL3", "COL1", "COL2", "COL3"), class = "factor"), X = c(0L,
0L, 0L, 1L, 1L, 1L), X.1 = c(1L, 0L, 0L, 1L, 1L, 1L), X.2 = c(1L,
1L, 1L, 0L, 0L, 0L), X.3 = c(1L, 1L, 1L, 1L, 1L, 1L), X.4 = c(1L,
1L, 1L, 0L, 0L, 0L), X.5 = c(0L, 0L, 0L, 1L, 1L, 0L), X.6 = c(1L,
1L, 1L, 1L, 1L, 1L), X.7 = c(1L, 1L, 1L, 1L, 1L, 1L), X.8 = c(0L,
0L, 0L, 1L, 1L, 1L), X.9 = c(0L, 0L, 0L, 1L, 1L, 1L), X.10 = c(1L,
1L, 1L, 1L, 1L, 1L), X.11 = c(1L, 1L, 1L, 1L, 1L, 1L), X.12 = c(1L,
1L, 1L, 1L, 1L, 1L), X.13 = c(1L, 0L, 0L, 0L, 0L, 0L), X.14 = c(0L,
0L, 0L, 1L, 1L, 1L), X.15 = c(0L, 0L, 0L, 1L, 1L, 1L), X.16 = c(1L,
1L, 1L, 1L, 0L, 0L), X.17 = c(1L, 1L, 1L, 1L, 1L, 1L), X.18 = c(1L,
1L, 1L, 1L, 1L, 1L), X.19 = c(1L, 1L, 1L, 1L, 1L, 1L), X.20 = c(1L,
1L, 1L, 1L, 1L, 1L), X.21 = c(1L, 1L, 1L, 1L, 0L, 0L), X.22 = c(0L,
0L, 0L, 0L, 1L, 1L), X.23 = c(1L, 1L, 1L, 1L, 1L, 1L), X.24 = c(0L,
1L, 1L, 1L, 1L, 1L)), .Names = c("Sample", "X", "X.1", "X.2",
"X.3", "X.4", "X.5", "X.6", "X.7", "X.8", "X.9", "X.10", "X.11",
"X.12", "X.13", "X.14", "X.15", "X.16", "X.17", "X.18", "X.19",
"X.20", "X.21", "X.22", "X.23", "X.24"), row.names = c(NA, 6L
), class = "data.frame")
Okay with this code:
library(vegan)
library(ggplot2)
library(tidyverse)
library(MASS)
#set working directory
setwd("~/Documents/Masters/BS707/Metagenomics")
#read csv file
cookie <- read.csv("Horizontal.csv")
# Keep only the 0/1 columns and carry the sample names as row names, so the
# Sample column is not fed into dist() and the names survive into the MDS.
df <- data.frame(cookie[, -1], row.names = as.character(cookie$Sample))
# Binary (presence/absence) distance between samples.
dm <- dist(df, method = "binary")
# Classical MDS in two dimensions; the result must be assigned to be reused.
mds <- cmdscale(dm, eig = TRUE, k = 2)
# Plotting frame: MDS coordinates plus sample name and group.
coords <- as.data.frame(mds$points)
names(coords) <- c("V1", "V2")
coords$Sample <- rownames(df)
# Remove the last digit from the sample names to form groups (CL / COL).
coords$group <- gsub("\\d$", "", coords$Sample)
# Column names inside aes() must be unquoted; quoted strings are treated as
# constants, which is why every point previously ended up in one class.
ggplot(coords) +
  geom_point(aes(x = V1, y = V2, color = group)) +
  geom_text(aes(x = V1, y = V2, label = Sample), vjust = -1)
I get the plot but each group is named 'Sample' rather than CL1, CL2, CL3, COL1, COL2, COL3. I had to remove the %>% because my R didn't recognise it as a command or anything and gave an error every single time (switched to + or deleted and then it worked fine).
Here is a way to visualize your data in 2 dimensions:
library(tidyverse)
# Classical MDS (also known as principal coordinates analysis) on the binary
# distance between samples. The first column (Sample) is a label, not a
# feature, so it is removed before computing distances.
mds <- df %>%
  dplyr::select(-1) %>%            # remove first column
  dist(method = "binary") %>%      # calculate the distance matrix
  cmdscale(eig = TRUE, k = 2)      # do MDS

# cmdscale() returns an unnamed coordinate matrix; name the axes explicitly
# instead of relying on as_tibble()'s deprecated automatic V1/V2 naming.
coords <- mds$points
colnames(coords) <- c("V1", "V2")

as_tibble(coords) %>%                            # mds coordinates
  bind_cols(Sample = df$Sample) %>%              # bind sample names
  mutate(group = gsub("\\d$", "", Sample)) %>%   # drop last digit to form groups
  ggplot() +
  geom_point(aes(x = V1, y = V2, color = group)) # plot
or without tidyverse:
# Binary (presence/absence) distances between samples, leaving out the first
# (sample name) column.
df_dist <- dist(df[-1], method = "binary")
# Two-dimensional classical MDS.
mds <- cmdscale(df_dist, k = 2, eig = TRUE)
# Plotting frame: the MDS axes (named X1 and X2 by data.frame()) plus a group
# label made from the sample name without its trailing digit.
for_plot <- data.frame(
  mds$points,
  group = gsub("\\d$", "", df$Sample)
)
ggplot(for_plot, aes(x = X1, y = X2, color = group)) +
  geom_point()
Other options include isoMDS from the MASS package, which performs Kruskal's non-metric multidimensional scaling, or metaMDS from the vegan package, which performs non-metric multidimensional scaling with a stable solution from random starts, axis scaling and species scores.
Related
I want to run logistic regressions for several (n = 30) SNPs (coded as 0, 1, 2) as predictors and a case-control variable (0, 1) as the outcome. As a few of those SNPs are correlated, I cannot put all of them in one model; I have to run a separate regression for each one, i.e., I cannot simply add them together in one model like rs1 + rs2 + rs3 and so on. I need to have each one regressed separately, like below:
# One unadjusted logistic regression per SNP, each fitted separately.
test1 <- glm(formula = casecontrol ~ rs1, family = binomial, data = mydata)
test2 <- glm(formula = casecontrol ~ rs2, family = binomial, data = mydata)
test3 <- glm(formula = casecontrol ~ rs3, family = binomial, data = mydata)
While I can run all the above regressions separately, is there a way to loop them together so I could get a summary() of all tests in one go?
I will have to adjust for age and sex too, but that would come after I run an unadjusted loop.
My data head from dput(head(mydata)) for example;
structure(list(ID = 1:6, sex = c(2L, 1L, 2L, 1L, 1L, 1L), age = c(52.4725405022036,
58.4303618001286, 44.5300065923948, 61.4786037395243, 67.851808819687,
39.7451378498226), bmi = c(31.4068751083687, 32.0614937413484,
23.205021363683, 29.1445372393355, 32.6287483051419, 20.5887741968036
), casecontrol = c(0L, 1L, 0L, 1L, 1L, 1L), rs1 = c(1L, 0L, 2L,
2L, 1L, 2L), rs2 = c(2L, 1L, 2L, 0L, 1L, 1L), rs3 = c(1L, 0L,
1L, 2L, 2L, 2L), rs4 = c(1L, 1L, 1L, 1L, 0L, 2L), rs5 = c(1L,
0L, 0L, 0L, 1L, 2L), rs6 = c(1L, 1L, 1L, 1L, 1L, 2L), rs7 = c(0L,
0L, 0L, 0L, 0L, 0L), rs8 = c(0L, 0L, 1L, 0L, 2L, 1L), rs9 = c(0L,
0L, 2L, 1L, 1L, 0L), rs10 = c(2L, 0L, 0L, 2L, 2L, 1L), rs11 = c(0L,
1L, 1L, 0L, 1L, 1L), rs12 = c(1L, 2L, 0L, 1L, 2L, 2L), rs13 = c(0L,
2L, 0L, 0L, 0L, 0L), rs14 = c(1L, 1L, 1L, 1L, 2L, 2L), rs15 = c(1L,
2L, 1L, 1L, 0L, 1L), rs16 = c(0L, 2L, 1L, 2L, 2L, 1L), rs17 = c(0L,
2L, 1L, 1L, 2L, 2L), rs18 = c(1L, 2L, 2L, 1L, 1L, 1L), rs19 = c(1L,
1L, 0L, 1L, 2L, 2L), rs20 = c(2L, 1L, 0L, 2L, 2L, 1L), rs21 = c(1L,
2L, 2L, 1L, 1L, 0L), rs22 = c(1L, 1L, 2L, 2L, 0L, 1L), rs23 = c(2L,
0L, 2L, 1L, 1L, 1L), rs24 = c(0L, 0L, 0L, 2L, 2L, 2L), rs25 = c(2L,
2L, 1L, 1L, 0L, 1L), rs26 = c(1L, 1L, 0L, 2L, 0L, 1L), rs27 = c(1L,
1L, 1L, 1L, 0L, 1L), rs28 = c(0L, 1L, 1L, 2L, 0L, 2L), rs29 = c(2L,
2L, 2L, 2L, 1L, 2L), rs30 = c(0L, 2L, 1L, 2L, 1L, 0L)), row.names = c(NA,
6L), class = "data.frame")
Probably you want something like this:
# Fit one single-predictor logistic regression per SNP column rs1 ... rs30 and
# return the fitted models as a list.
lapply(seq_len(30), function(i) {
  snp_formula <- as.formula(paste0('casecontrol ~ ', 'rs', i))
  glm(snp_formula, data = mydata, family = binomial)
})
which will execute 30 logistic regressions with the selected predictor.
Instead of hard coding the overall number of predictors, you can use:
sum(grepl('rs', names(mydata))), which will return 30.
You can use tidy function from broom package to get the summary in a tidy format.
# Fit one logistic regression per SNP (rs1 ... rs30) and stack the coefficient
# tables into a single data frame; `model` records which SNP was used.
# tidy() lives in broom, so it is namespaced here: broom is never attached.
purrr::map_dfr(1:30, function(i) {
  fit <- glm(as.formula(paste0('casecontrol ~ ', 'rs', i)),
             data = mydata, family = binomial)
  data.frame(model = i, broom::tidy(fit))
})
or you can do this in a more dynamic way:
# Collect the predictor columns by name. The pattern is anchored so that only
# names starting with "rs" match, not every column that merely contains "rs".
pred <- grep('^rs', names(mydata), value = TRUE)
# Fit one logistic regression per entry of `pred` and stack the coefficient
# tables; `model` is the position of the predictor in `pred`.
# seq_along() is safe when `pred` is empty (1:length(pred) would yield 1, 0).
purrr::map_dfr(seq_along(pred), function(i) {
  fit <- glm(as.formula(paste0('casecontrol ~ ', pred[i])),
             data = mydata, family = binomial)
  data.frame(model = i, broom::tidy(fit))
})
If you want to include another variable, you simply need to adjust the pred vector.
# Append age-adjusted versions of each model. "+ age" adds age as an additive
# covariate; it is not an interaction term (that would be "rs * age").
pred <- c(pred, paste0(pred, ' + age'))
or
# Append versions of each model adjusted for age and sex. Both enter as
# additive covariates; this does not create interaction terms.
pred <- c(pred, paste0(pred, ' + age + sex'))
you can do something like this
# dplyr::select is namespaced because MASS::select masks it when MASS is
# attached after dplyr.
outcome <- mydata %>% dplyr::select("casecontrol")   # select outcome
features <- mydata %>% dplyr::select("rs1":"rs30")   # select features
features_names <- data.frame(colnames(features))     # get feature names
# Fit and report one logistic regression per feature.
for (i in seq_len(nrow(features_names))) {
  selected <- features[, i, drop = FALSE]  # keep each feature as a 1-column data frame
  total <- cbind(selected, outcome)        # combine with outcome (was undefined `response`)
  glm.fit <- glm(casecontrol ~ ., data = total, family = binomial("logit"))
  # Values are not auto-printed inside a for loop, so print explicitly.
  print(summary(glm.fit))
}
I would like to calculate the percentage of people who reported doing some work on each day. For example, I would like to know the percentage of people in the entire sample who reported doing some work on Monday.
I used the following code to calculate this, but I am not sure about my result.
df1 <- structure(list(id = c(12L, 123L, 10L), t1_1 = c(0L, 0L, 1L),
t1_2 = c(1L, 0L, 1L), t1_3 = c(1L, 0L, 1L), t2_1 = c(0L,
1L, 1L), t2_2 = c(1L, 1L, 1L), t2_3 = c(0L, 1L, 1L), t3_1 = c(1L,
0L, 1L), t3_2 = c(0L, 0L, 1L), t3_3 = c(1L, 0L, 1L), t4_1 = c(0L,
1L, 1L), t4_2 = c(1L, 1L, 1L), t4_3 = c(0L, 1L, 1L), t5_1 = c(0L,
1L, 1L), t5_2 = c(1L, 1L, 1L), t5_3 = c(0L, 1L, 1L), t6_1 = c(1L,
0L, 1L), t6_2 = c(1L, 0L, 1L), t6_3 = c(1L, 0L, 1L), t7_1 = c(0L,
1L, 1L), t7_2 = c(0L, 1L, 1L), t7_3 = c(1L, 1L, 1L)),
class = "data.frame", row.names = c(NA, -3L))
Variable description: t1 - Monday (t1_1, t1_2, t1_3 are time steps that measured whether work was done during Monday); t2 - Tuesday; t3 - Wednesday; t4 - Thursday; t5 - Friday; t6 - Saturday; and t7 - Sunday; id is an identification number.
# Long format: one row per respondent and time-step column.
df2 <- reshape2::melt(df1, id.vars = "id")
df2$variable <- as.character(df2$variable)
# The day code is the part of the column name before the underscore (t1 ... t7).
df2$day <- vapply(strsplit(df2$variable, "_"), `[`, character(1), 1)
# Ordered, labelled days. (The original `levels = variable` referred to an
# object that does not exist.)
day_names <- c(t1 = "Monday", t2 = "Tuesday", t3 = "Wednesday",
               t4 = "Thursday", t5 = "Friday", t6 = "Saturday", t7 = "Sunday")
df2$day <- factor(df2$day, levels = names(day_names), labels = day_names)

# A person "did some work" on a day if any of that day's time steps equals 1.
# The percentage per day is the share of such people among all respondents.
df3 <- df2 %>%
  group_by(day, id) %>%
  summarise(worked = any(value == 1), .groups = "drop") %>%
  group_by(day) %>%
  summarise(percent = mean(worked) * 100, .groups = "drop")

ggplot(df3, aes(day, percent / 100, fill = day)) +
  geom_col() +
  scale_fill_discrete(name = "Days") +
  scale_y_continuous(labels = scales::percent, limits = c(0, 1)) +
  ylab("relative frequencies") +
  theme_bw()
Result:
library(dplyr)
df1 <- structure(
list(id = c(12L, 123L, 10L),
t1_1 = c(0L, 0L, 1L), t1_2 = c(1L, 0L, 1L), t1_3 = c(1L, 0L, 1L),
t2_1 = c(0L, 1L, 1L), t2_2 = c(1L, 1L, 1L), t2_3 = c(0L, 1L, 1L),
t3_1 = c(1L, 0L, 1L), t3_2 = c(0L, 0L, 1L), t3_3 = c(1L, 0L, 1L),
t4_1 = c(0L, 1L, 1L), t4_2 = c(1L, 1L, 1L), t4_3 = c(0L, 1L, 1L),
t5_1 = c(0L, 1L, 1L), t5_2 = c(1L, 1L, 1L), t5_3 = c(0L, 1L, 1L),
t6_1 = c(1L, 0L, 1L), t6_2 = c(1L, 0L, 1L), t6_3 = c(1L, 0L, 1L),
t7_1 = c(0L, 1L, 1L), t7_2 = c(0L, 1L, 1L), t7_3 = c(1L, 1L, 1L)),
class = "data.frame", row.names = c(NA, -3L))
# Long format: one row per (id, time-step column).
df2 <- reshape2::melt(df1, id.vars = "id")
df2$variable <- as.character(df2$variable)
# Day code = column name up to the first underscore ("t1_2" -> "t1").
df2$day <- sub("_.*$", "", df2$variable)

# Number of worked time steps per person and day ...
df3 <- group_by(df2, id, day)
df3 <- summarize(df3, count = sum(value))
# ... expressed as a share of that person's total over all days.
df3 <- group_by(df3, id)
df3 <- mutate(df3, percent = count / sum(count))
df3 <- arrange(df3, day, id)
> df3
# A tibble: 21 x 4
# Groups: id [3]
id day count percent
<int> <chr> <int> <dbl>
1 10 t1 3 0.143
2 12 t1 2 0.182
3 123 t1 0 0
4 10 t2 3 0.143
5 12 t2 1 0.0909
6 123 t2 3 0.25
...
Is it something you are looking for?
I'm new to R. I have a data set
df <- structure(list(schoolid = c(1L, 1L, 1L, 1L, 1L, 1L),
score = c(0L, 10L, 0L, 40L, 42L, 4L),
gender = c(1L, 1L, 1L, 1L, 1L, 1L)),
.Names = c("schoolid", "score", "gender"),
row.names = c(NA, 6L),
class = "data.frame")
for which I have to run a random intercept model to see if there is an impact of the gender on the score across schools. Can anyone kindly explain what is expected from me?
Below I have code with 3 columns: a group field, a open/close field for the store, and the rolling sum of 3 month opens for the store. I also have the desired solution output.
My dataset can be thought of as an employees availability. You can assume each row to be a different time period (hour, day,month, year, whatever). In the open/closed column I have whether or not the employee was present. The 3month rolling column is a sum of the previous rows.
What I want to identify is the non-zero values in this rolling sum column following a gap of at least 3 zero rows for that particular group. While not present in this dataset, you can assume that there might be more than one 'gap' of zeros present.
structure(list(Group = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L), .Label = c("A", "B"), class = "factor"), X0_closed_1_open = c(0L,
1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L), X3month_roll_open = c(0L,
0L, 1L, 2L, 2L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 2L, 0L, 1L, 1L, 1L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L), desired_solution = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), .Label = c("no", "yes"), class ="factor")), .Names = c("Group", "X0_closed_1_open", "X3month_roll_open", "desired_solution"), class = "data.frame", row.names = c(NA,
-26L))
One option is:
# Per Group, add a logical column `Flag` marking the non-zero rolling-sum rows
# that come after a run of at least 3 consecutive zero rows. The data are split
# by Group, processed, and put back together with unsplit().
res <- unsplit(
lapply(split(df1, df1$Group), function(x) {
# Run-length encode "is the rolling sum zero?" within this group.
rl <- with(x,rle(X3month_roll_open==0))
# Keep a zero-run as TRUE only if it is at least 3 rows long, expand back to
# one value per row, and count the TRUE -> FALSE transitions (ends of long
# gaps) seen so far. indx is 0 until the first long gap has ended.
# NOTE: the count never resets, so every non-zero row after the first long gap
# is flagged, including rows that follow a later, shorter gap.
indx <- cumsum(c(0,diff(inverse.rle(within.list(rl,
values[values] <- lengths[values]>=3)))<0))
# Flag rows after a long gap whose third column is non-zero
# (X3month_roll_open in the posted data).
x$Flag <- indx!=0 & x[,3]!=0
x}),
df1$Group)
NOTE: Instead of 'yes/no', it may be better to have 'TRUE/FALSE' for easing subsetting.
# Check: map Flag FALSE/TRUE to 'no'/'yes' and compare with the expected
# desired_solution column.
identical(c('no', 'yes')[res$Flag+1L], as.character(res$desired_solution))
#[1] TRUE
I would like to compute a chi squared test for each column in a dataframe and grouping for the variable Project.
Basically I would like to compute a two by two table for each column and then store the value in a new table.
Here an example of my dataframe.
structure(list(Project = structure(c(1L, 1L, 1L, 2L, 2L, 2L), .Label = c("discovery", "validation"), class = "factor"), MLL = c(1L, 1L, 1L, 1L, 1L, 1L), CREB = c(0L, 1L, 1L, 1L, 1L, 0L), TNR = c(1L, 1L, 0L, 0L, 1L, 1L)), .Names = c("Project", "MLL", "CREB", "TNR"), row.names = c(1L, 2L, 3L, 300L, 301L, 302L), class = "data.frame")
After the comment of Jaap I have tried:
# Chi-squared test of each gene column against Project.
# Iterate over the columns directly: apply() on a data frame first coerces it
# to a matrix. The gene names go into a real column `gene`, not just the row
# names, so the result can be merged with other data sets by gene.
gene_cols <- cast_subset[-1]
pvalue <- data.frame(
  gene = names(gene_cols),
  p.value = vapply(
    gene_cols,
    function(i) chisq.test(table(cast_subset$Project, i))$p.value,
    numeric(1)
  ),
  row.names = NULL
)
but I cannot access the column with the gene names in order to merge the result with another data set.