Finding duplicates, averaging them and creating a proper table - R

Let's start with my data:
> dput(head(tbl_end))
structure(list(`Gene name` = c("at1g01050.1", "at1g01080.1",
"at1g01090.1", "at1g01220.1", "at1g01320.2", "at1g01420.1"),
`1_1` = c(0, 0, 0, 0, 0, 0), `1_2` = c(0, 0, 0, 0, 0, 0),
`1_3` = c(0, 1, 0, 0, 0, 0), `1_4` = c(0, 0.660693687777888,
0, 0, 0, 0), `1_5` = c(0, 0.521435654491704, 0, 0, 0, 1),
`1_6` = c(0, 0.437291194705566, 0, 0, 0, 1), `1_7` = c(0,
0.52204783488213, 0, 0, 0, 0), `1_8` = c(0, 0.524298383907171,
0, 0, 0, 0), `1_9` = c(1, 0.376865096972469, 0, 1, 0, 0),
`1_10` = c(0, 0, 0, 0, 0, 0), `1_11` = c(0, 0, 0, 0, 0, 0
), `1_12` = c(0, 0, 0, 0, 0, 0), `1_13` = c(0, 0, 0, 0, 0,
0), `1_14` = c(0, 0, 0, 0, 0, 0), `1_15` = c(0, 0, 0, 0,
0, 0), `1_16` = c(0, 0, 0, 0, 0, 0), `1_17` = c(0, 0, 0,
0, 0, 0), `1_18` = c(0, 0, 0.476101907006443, 0, 0, 0), `1_19` = c(0,
0, 1, 0, 0, 0), `1_20` = c(0, 0, 0, 0, 0, 0), `1_21` = c(0,
0, 0, 0, 1, 0), `1_22` = c(0, 0, 0, 0, 0, 0), `1_23` = c(0,
0, 0, 0, 0, 0), `1_24` = c(0, 0, 0, 0, 0, 0)), .Names = c("Gene name",
"1_1", "1_2", "1_3", "1_4", "1_5", "1_6", "1_7", "1_8", "1_9",
"1_10", "1_11", "1_12", "1_13", "1_14", "1_15", "1_16", "1_17",
"1_18", "1_19", "1_20", "1_21", "1_22", "1_23", "1_24"), row.names = c(NA,
6L), class = "data.frame")
so I have more than 2k rows. As the row name I set the gene name, but there is a problem. Sometimes the same gene has different "models" (they put a dot after the name followed by the number 1 or 2), but it is still the same gene, so I want to find all of those duplicates (same gene name), average the values across the columns for each such gene, and leave just one row with the averaged values.
Is it possible to do?
Just showing some of the gene names I have:
> dput(vec_names)
c("at1g01050.1", "at1g01080.1", "at1g01090.1", "at1g01220.1",
"at1g01320.2", "at1g01420.1", "at1g01470.1", "at1g01800.1", "at1g01910.5",
"at1g01920.2", "at1g01960.1", "at1g01980.1", "at1g02020.2", "at1g02100.2",
"at1g02130.1", "at1g02140.1", "at1g02150.1", "at1g02305.1", "at1g02500.2",
"at1g02560.1", "at1g02780.1", "at1g02880.3", "at1g02920.1", "at1g02930.2",
"at1g03030.1", "at1g03090.2", "at1g03110.1", "at1g03130.1", "at1g03210.1",
"at1g03220.1", "at1g03230.1", "at1g03310.2", "at1g03330.1", "at1g03475.1",
"at1g03630.2", "at1g03680.1", "at1g03870.1", "at1g03900.1", "at1g04080.2",
"at1g04130.1", "at1g04170.1", "at1g04190.1", "at1g04270.2", "at1g04350.1",
"at1g04410.1", "at1g04420.1", "at1g04530.1", "at1g04640.2", "at1g04690.1",
"at1g04750.2", "at1g04810.1", "at1g04850.1", "at1g04870.2", "at1g05010.1",
"at1g05180.1", "at1g05190.1", "at1g05320.3", "at1g05350.1", "at1g05520.1",
"at1g05560.1", "at1g05620.2", "at1g06000.1", "at1g06110.1", "at1g06130.2",
"at1g06290.1", "at1g06410.1", "at1g06550.1", "at1g06560.1", "at1g06570.1",
I think there is a function for that but I can't find it.

Using data.table
library(data.table)
dt <- data.table(dat)
# Strip the trailing ".N" model suffix (e.g. "at1g01050.1" -> "at1g01050") so
# all models of one gene share a single key.
# NOTE: the original gsub("[.]*", "", dt$Gene) was wrong twice over: the
# pattern deleted only the dot (keeping the model number, so nothing ever
# collapsed), and the column is `Gene name`, not `Gene`.
dt[, gene_unique := sub("\\.\\d+$", "", `Gene name`)]
# Average every measurement column per gene; duplicate models collapse into
# one averaged row.
cols <- colnames(dt)[2:25]
dt[, lapply(.SD, mean), by = gene_unique, .SDcols = cols]
Using aggregate as suggested in comments
# Drop the ".N" model suffix so duplicate models share one gene name
# (the original gsub("[.]*", "", dat$Gene) deleted only the dot and
# referenced a non-existent `Gene` column), then average every numeric
# column per gene with the formula interface of aggregate().
dat$`Gene name` <- sub("\\.\\d+$", "", dat$`Gene name`)
aggregate(. ~ `Gene name`, dat, mean)

Related

Changing a character column into a continuous column, by dividing them into sections (1,2,3,4)

I have a data set I'm trying to run a glm regression on, however it contains characters as age limit, race, and comorbidity class. I would like to change those columns into a continuous variable so the regression can accept it. Data below, I want to change the TBI.irace2 into (Hispanic=1, Black=2, white=3, and other=4) same with age (age 18-28=1, 29-46=2, 47-64=3, and >64=4) and with NISS (NISS 0-10=1, NISS 11-20=2, NISS 21-30=3, and NISS 31-40=4, NISS41-50=5, NISS 51-60=6, NISS 61-70=7, NISS>70= 8)
Please find summary of data below
TBI.crani = c(0, 0, 0, 0, 0, 0), TBI.vte = c(0,
0, 0, 0, 0, 0), TBI.FEMALE = c(0, 0, 1, 0, 1, 0), TBI.iracecat2 = c("Whites",
"Whites", "Whites", "Hispanics", "Whites", "Blacks"), TBI.agecat = c("Age 47-64",
"Age 29-46", "Age > 64", "Age 29-46", "Age 18-28", "Age 18-28"
), TBI.nisscategory = c("NISS 21-30", "NISS 11-20", "NISS 21-30",
"NISS 11-20", "NISS 11-20", "NISS 0-10"), TBI.LOS = c(5, 8, 1,
3, 19, 1), TBI.hospitalteach = c(0, 0, 1, 1, 1, 1), TBI.largebedsize = c(1,
1, 1, 1, 1, 1), TBI.CM_ALCOHOL = c(0, 0, 0, 1, 0, 0), TBI.CM_ANEMDEF = c(0,
0, 0, 0, 0, 0), TBI.CM_BLDLOSS = c(0, 0, 0, 0, 0, 0), TBI.CM_CHF = c(1,
0, 0, 0, 0, 0), TBI.CM_CHRNLUNG = c(0, 0, 0, 0, 0, 0), TBI.CM_COAG = c(0,
0, 0, 0, 1, 0), TBI.CM_HYPOTHY = c(0, 0, 0, 0, 0, 0), TBI.CM_LYTES = c(0,
0, 0, 0, 0, 0), TBI.CM_METS = c(0, 0, 0, 0, 0, 0), TBI.CM_NEURO = c(0,
0, 0, 0, 0, 0), TBI.CM_OBESE = c(0, 0, 0, 0, 0, 0), TBI.CM_PARA = c(0,
0, 0, 0, 0, 0), TBI.CM_PSYCH = c(0, 1, 0, 0, 0, 0), TBI.CM_TUMOR = c(0,
0, 0, 0, 0, 0), TBI.CM_WGHTLOSS = c(0, 0, 0, 0, 0, 0), TBI.UTI = c(0,
0, 0, 0, 0, 0), TBI.pneumonia = c(0, 0, 0, 0, 0, 0), TBI.AMI = c(0,
0, 0, 0, 0, 0), TBI.sepsis = c(0, 0, 0, 0, 0, 0), TBI.arrest = c(0,
0, 0, 0, 0, 0), TBI.spineinjury = c(0, 0, 0, 0, 0, 0), TBI.legfracture = c(0,
0, 0, 0, 0, 0), TBI_time_to_surg.NEW = c(0, 0, 0, 0, 0, 0)), row.names = c(NA,
6L), class = "data.frame")
A small tip: provide a sample set that is just big enough to address your question.
library(data.table)
# took a small sample and changed one value to Asian
dt <- data.table(
  TBI.FEMALE = c(0, 0, 1, 0, 1, 0),
  TBI.iracecat2 = as.character(c("Whites", "Whites", "Asian", "Hispanics", "Whites", "Blacks"))
)
# Define the race groups; anything not listed here (e.g. Asian) stays NA
# after the lookup and is mapped to the "other" code (4) below.
convert_race <- c("Hispanics" = 1, "Blacks" = 2, "Whites" = 3)
# Vectorized named-vector lookup: one subsetting call recodes the whole
# column. (The original per-row lapply() with by= built a list column and
# iterated group-by-group for no benefit.)
dt[, TBI.irace2 := unname(convert_race[TBI.iracecat2])]
dt[is.na(TBI.irace2), TBI.irace2 := 4]
dt
# TBI.FEMALE TBI.iracecat2 TBI.irace2
# 1: 0 Whites 3
# 2: 0 Whites 3
# 3: 1 Asian 4
# 4: 0 Hispanics 1
# 5: 1 Whites 3
# 6: 0 Blacks 2

Hierarchical clustering of a time-series

I am struggling with hierarchical clustering. I have the following time-series and I want to cluster it based on time. Would the transpose function work for this?
structure(list(`04:00` = c(0, 0, 0, 0, 0, 0), `04:10` = c(0,
0, 0, 0, 0, 0), `04:20` = c(0, 0, 0, 0, 0, 0), `04:30` = c(0,
0, 0, 0, 0, 0), `04:40` = c(0, 0, 0, 0, 0, 0), `04:50` = c(0,
0, 0, 0, 0, 0), `05:00` = c(0, 0, 0, 0, 0, 0), `05:10` = c(0,
0, 0, 0, 0, 0), `05:20` = c(0, 0, 0, 0, 0, 0), `05:30` = c(0,
0, 0, 0, 0, 0), `05:40` = c(0, 0, 0, 0, 0, 0), `05:50` = c(1,
0, 0, 0, 0, 0), `06:00` = c(1, 0, 0, 0, 0, 0), `06:10` = c(1,
0, 0, 0, 0, 0), `06:20` = c(2, 0, 0, 0, 0, 0), `06:30` = c(0,
0, 0, 0, 0, 0), `06:40` = c(0, 1, 0, 0, 0, 0), `06:50` = c(0,
2, 0, 0, 0, 1), `07:00` = c(0, 0, 0, 0, 0, 2), `07:10` = c(0,
0, 1, 0, 0, 2), `07:20` = c(0, 0, 0, 0, 0, 2), `07:30` = c(0,
0, 1, 0, 0, 0), `07:40` = c(1, 0, 1, 0, 0, 0), `07:50` = c(1,
0, 0, 0, 2, 0), `08:00` = c(1, 0, 0, 0, 0, 0), `08:10` = c(1,
0, 0, 0, 0, 0), `08:20` = c(2, 0, 0, 0, 0, 0), `08:30` = c(2,
0, 0, 0, 0, 0), `08:40` = c(2, 0, 0, 0, 0, 0), `08:50` = c(2,
0, 0, 0, 0, 0), `09:00` = c(0, 0, 0, 0, 0, 0), `09:10` = c(0,
0, 0, 0, 0, 0), `09:20` = c(0, 1, 0, 0, 0, 0), `09:30` = c(0,
1, 0, 2, 0, 0), `09:40` = c(0, 1, 0, 0, 0, 0), `09:50` = c(0,
1, 0, 0, 0, 0), `10:00` = c(0, 0, 0, 0, 0, 0), `10:10` = c(0,
0, 0, 0, 0, 0), `10:20` = c(0, 1, 0, 0, 0, 0), `10:30` = c(0,
1, 0, 0, 0, 0), `10:40` = c(0, 0, 0, 0, 0, 0), `10:50` = c(0,
0, 0, 0, 0, 0), `11:00` = c(2, 0, 0, 1, 0, 0), `11:10` = c(0,
0, 0, 1, 0, 0), `11:20` = c(0, 0, 0, 1, 0, 1), `11:30` = c(0,
0, 0, 1, 0, 1), `11:40` = c(0, 0, 0, 1, 0, 1), `11:50` = c(0,
0, 0, 1, 0, 0), `12:00` = c(0, 0, 0, 1, 2, 0), `12:10` = c(0,
0, 0, 1, 0, 0), `12:20` = c(0, 0, 0, 1, 0, 0), `12:30` = c(0,
0, 0, 1, 0, 0), `12:40` = c(0, 0, 0, 1, 0, 0), `12:50` = c(0,
0, 0, 1, 1, 0), `13:00` = c(0, 0, 0, 0, 1, 0), `13:10` = c(0,
0, 0, 0, 1, 0), `13:20` = c(0, 0, 0, 0, 1, 0), `13:30` = c(0,
0, 0, 0, 1, 0), `13:40` = c(0, 0, 0, 0, 1, 0), `13:50` = c(0,
0, 0, 0, 1, 0), `14:00` = c(0, 0, 0, 0, 1, 0), `14:10` = c(0,
0, 0, 0, 1, 0), `14:20` = c(0, 0, 0, 0, 1, 0), `14:30` = c(0,
0, 0, 0, 1, 0), `14:40` = c(0, 0, 0, 0, 1, 0), `14:50` = c(0,
0, 0, 0, 0, 0), `15:00` = c(0, 0, 0, 0, 0, 0), `15:10` = c(0,
2, 0, 0, 0, 0), `15:20` = c(0, 2, 0, 0, 1, 0), `15:30` = c(0,
2, 0, 0, 1, 1), `15:40` = c(0, 2, 0, 0, 1, 0), `15:50` = c(0,
2, 0, 0, 1, 0), `16:00` = c(0, 2, 0, 0, 1, 0), `16:10` = c(0,
2, 0, 0, 1, 0), `16:20` = c(2, 2, 0, 0, 1, 0), `16:30` = c(2,
2, 0, 0, 1, 2), `16:40` = c(2, 2, 0, 0, 1, 1), `16:50` = c(2,
2, 0, 0, 0, 1), `17:00` = c(0, 2, 0, 0, 2, 0), `17:10` = c(0,
0, 0, 0, 2, 0), `17:20` = c(0, 0, 0, 0, 2, 0), `17:30` = c(0,
0, 0, 0, 2, 0), `17:40` = c(0, 0, 0, 0, 0, 0), `17:50` = c(0,
0, 0, 0, 0, 0), `18:00` = c(0, 2, 0, 0, 0, 2), `18:10` = c(0,
2, 0, 0, 0, 2), `18:20` = c(0, 0, 0, 0, 2, 2), `18:30` = c(0,
0, 0, 0, 0, 2), `18:40` = c(0, 0, 0, 0, 0, 2), `18:50` = c(1,
0, 0, 0, 0, 2), `19:00` = c(1, 0, 0, 1, 1, 0), `19:10` = c(1,
0, 0, 1, 1, 0), `19:20` = c(1, 0, 0, 1, 1, 0), `19:30` = c(1,
0, 1, 1, 1, 0), `19:40` = c(1, 0, 1, 1, 1, 1), `19:50` = c(1,
0, 1, 1, 1, 1), `20:00` = c(0, 0, 1, 1, 1, 1), `20:10` = c(0,
0, 1, 1, 1, 1), `20:20` = c(0, 0, 1, 1, 1, 1), `20:30` = c(0,
1, 2, 1, 1, 1), `20:40` = c(0, 1, 0, 1, 1, 1), `20:50` = c(0,
1, 0, 1, 1, 1), `21:00` = c(0, 1, 0, 1, 1, 1), `21:10` = c(0,
1, 0, 0, 1, 1), `21:20` = c(0, 1, 0, 0, 1, 1), `21:30` = c(0,
1, 1, 0, 1, 1), `21:40` = c(0, 1, 1, 0, 1, 1), `21:50` = c(0,
1, 1, 0, 0, 1), `22:00` = c(0, 1, 1, 0, 0, 0), `22:10` = c(0,
1, 0, 0, 0, 0), `22:20` = c(0, 1, 0, 0, 0, 0), `22:30` = c(0,
1, 0, 0, 0, 0), `22:40` = c(0, 1, 0, 0, 0, 0), `22:50` = c(0,
1, 0, 0, 0, 0), `23:00` = c(0, 0, 0, 0, 1, 0), `23:10` = c(0,
0, 0, 0, 0, 1), `23:20` = c(0, 0, 0, 0, 0, 1), `23:30` = c(0,
0, 0, 0, 0, 1), `23:40` = c(0, 0, 0, 0, 0, 1), `23:50` = c(0,
0, 0, 0, 0, 0), `00:00` = c(0, 0, 0, 0, 0, 0), `00:10` = c(0,
0, 0, 0, 0, 0), `00:20` = c(0, 0, 0, 0, 0, 0), `00:30` = c(0,
0, 0, 0, 0, 0), `00:40` = c(0, 0, 0, 0, 0, 0), `00:50` = c(0,
0, 0, 0, 0, 0), `01:00` = c(0, 0, 0, 0, 0, 0), `01:10` = c(0,
0, 0, 0, 0, 0), `01:20` = c(0, 0, 0, 0, 0, 0), `01:30` = c(0,
0, 0, 0, 0, 0), `01:40` = c(0, 0, 0, 0, 0, 0), `01:50` = c(0,
0, 0, 0, 0, 0), `02:00` = c(0, 0, 0, 0, 0, 0), `02:10` = c(0,
0, 0, 0, 0, 0), `02:20` = c(0, 0, 0, 0, 0, 0), `02:30` = c(0,
0, 0, 0, 0, 0), `02:40` = c(0, 0, 0, 0, 0, 0), `02:50` = c(0,
0, 0, 0, 0, 0), `03:00` = c(0, 0, 0, 0, 0, 0), `03:10` = c(0,
0, 0, 0, 0, 0), `03:20` = c(0, 0, 0, 0, 0, 0), `03:30` = c(0,
0, 0, 0, 0, 0), `03:40` = c(0, 0, 0, 0, 0, 0), `03:50` = c(0,
0, 0, 0, 0, 0)), row.names = c("1", "2", "3", "4", "5", "6"), class = "data.frame")
I managed to run hierarchical clustering but only on cases and not on time
d_distance <- dist(as.matrix(df))
plot(hclust(d_distance))
The plot that I generated
As you can see on the plot the structure end points are indexes - how can I have instead of index time (maybe transpose)? Also I would like to plot time-series cluster separately like below plot. Would dtw be better than hierarchical clustering?

R: apply function on each cell in data frame

I have a data frame that look something like this
> dput(tes)
structure(list(path = structure(1:6, .Label = c("1893-chicago-fair",
"1960s-afghanistan", "1970s-iran", "1980s-new-york", "20-bizarre-vintage-ads",
"20-bizarre-vintage-ads?utm_campaign=6678&utm_medium=rpages&utm_source=Facebook&utm_term=1e8e704f7b587515c72e6cf7895d55fd110b652c480d98c1440f0a7acba5fb0e",
"20-photos-segregation-america-show-far-weve-come-much-farther-go",
"7-bizarre-cultural-practices", "7-creepy-abandoned-cities?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=4015a7368b588ff09694c96ba720c58f4e7f41a05b4181908b582bae682bef5e",
"a-brief-history-of-hippies", "abandoned-photographs", "albert-kahn",
"amazing-facts", "american-bison-extinction-1800s", "american-english-vs-british-english",
"andre-the-giant-photos", "andre-the-giant-photos??utm_source=facebook&sr_source=lift_facebook&utm_campaign=simplereach_andre-the-giant-photos&utm_medium=social",
"andre-the-giant-photos?grvVariant=d27feef0bfad84d60f335d3a8d241d9e",
"andre-the-giant-photos?grvVariant=d27feef0bfad84d60f335d3a8d241d9e&utm_campaign=gravityus2_142deb68f67fb1a99e7b80250fecc932&utm_medium=referral&utm_source=gravity",
"andre-the-giant-photos?grvVariant=d27feef0bfad84d60f335d3a8d241d9e&utm_campaign=gravityus2_16d63cf07ecf656f602b2d6b209344f7&utm_medium=referral&utm_source=gravity",
"andre-the-giant-photos?grvVariant=d27feef0bfad84d60f335d3a8d241d9e&utm_campaign=gravityus2_713050ecffc51540af02b2246ddf57dd&utm_medium=referral&utm_source=gravity",
"andre-the-giant-photos?grvVariant=d27feef0bfad84d60f335d3a8d241d9e&utm_campaign=gravityus2_c5bb3bc5e9408e0ad52ec9e787bd8654&utm_medium=referral&utm_source=gravity",
"andre-the-giant-photos?sr_source=lift_facebook&utm_campaign=simplereach_andre-the-giant-photos&utm_medium=social&utm_source=facebook",
"astounding-aerial-photography", "astounding-aerial-photography?utm_campaign=7002&utm_medium=rpages&utm_source=Facebook&utm_term=38e9e903d9ba59106d8b4d19be593f3de7ff8b91b12eafa03f2e382228f7b0d1",
"august-landmesser", "ben-franklin", "best-all-that-is-interesting-articles",
"bigfoot-facts", "celebrity-school-photos?grvVariant=82c0ce57a33dfd0209bdefc878665de0&utm_campaign=gravityus2_bc8646aefd6d0a16af03d7caf248f226&utm_medium=referral&utm_source=gravity",
"coolest-mushrooms?utm_campaign=taboolaINTL&utm_medium=referral&utm_source=taboola",
"craziest-ways-drugs-smuggled", "creepy-halloween-costumes",
"danakil-depression", "dark-john-lennon-quotes", "david-bowie-quotes",
"days-in-groundhog-day", "death-photos", "death-photos?utm_campaign=taboolaINTL&utm_medium=referral&utm_source=taboola",
"dr-seuss-quotes", "dream-chaser-spacecraft", "dust-bowl", "earth-two-planets",
"eixample-barcelona", "email-to-space", "evil-science-experiments",
"famous-incest", "famous-spies", "fun-facts-trivia", "golden-age-air-travel?utm_campaign=taboolaINTL&utm_medium=referral&utm_source=taboola",
"gross-foods", "gross-foods?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=106965c54919c24bf37356500ec50f0709b1de621d6950bb4c5d48759ea3677e",
"gross-foods?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=184e0ee39e66af82f9b124b904f6e07964b211e902cb0dc00c28771ff46163a2",
"gross-foods?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=1a0ddea7bed770d5473c45e9f8d81dfd0c4fdd232f207c6b88b53c41ff220c59",
"gross-foods?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=538659f1fc53f28d2c87b93ac73973681c1a46a04954964ab6c52ed1ab09b33a",
"gross-foods?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=87caf0acb91ae2b202f1b00ad9eaad3fef20bbfb23405b9047fb2b5a5462ab9c",
"gross-foods?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=91eae42c8fc9568103d46e0b6b6ec08fc34fd68b2e1918ffe2333ec73035c95a",
"gross-foods?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=a72946874b2003a8e40635c6cf10c851d4e1c0ed45e645d69663214239550602",
"gross-foods?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=ab594f0a1be002c8c3db297e8d33b04678af40e6a6469ac815884ae0a014b3a3",
"gross-foods?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=fb1e333dd58cb7bb9251ec52290aae21771149f73e083440047068a69aaeae09",
"hilarious-insults", "hippie-communes", "hippie-communes?grvVariant=fda07538efb1c25617f7cc3d09c37c79",
"hippie-communes?grvVariant=fda07538efb1c25617f7cc3d09c37c79&utm_campaign=gravityus2_e3cd42d4745768460dab4694a972fd82&utm_medium=referral&utm_source=gravity",
"hippie-communes?pp=0", "history-of-the-vibrator", "history-of-the-vibrator?utm_campaign=whfbpd&utm_medium=social&utm_source=facebook",
"homosexuality-norm", "hunger-games-facts?utm_campaign=6905&utm_medium=rpages&utm_source=Facebook&utm_term=1a9e42ac8abb6ffa90bf0542206505e74d3df12114a2c4445527fb2b88ef8880",
"influential-photographs", "ingeniously-creative-ads", "insane-cults",
"insane-rulers", "inspirational-quotes", "inspirational-quotes?utm_medium=referral&utm_source=taboolainternal",
"interesting-facts-about-the-world", "interesting-quotes", "krokodil",
"making-a-murderer-theories", "maya-angelou-greatest-quotes",
"medieval-torture-devices", "milky-way-colorado", "montreal-metro",
"most-popular-female-names-in-america", "neil-degrasse-tyson-tweets",
"new-york-city-cinemagraphs", "new-york-subways-1980s", "north-korea-photographs",
"north-korea-photographs?utm_campaign=taboolaINTL&utm_medium=referral&utm_source=taboola",
"north-korea-photographs?utm_medium=referral&utm_source=taboolainternal",
"obama-aging", "pablo-escobar", "pablo-escobar??utm_source=facebook",
"pablo-escobar??utm_source=facebook&sr_source=lift_facebook&utm_campaign=simplereach_pablo-escobar&utm_medium=social",
"pablo-escobar?utm_campaign=whfbpd&utm_medium=social&utm_source=facebook",
"panda-facts", "photo-of-the-day-nasa-releases-crystal-clear-image-of-pluto",
"pollution-in-china-photographs", "pollution-in-china-photographs?utm_campaign=3434&utm_medium=rpages&utm_source=Facebook&utm_term=1a0ddea7bed770d5473c45e9f8d81dfd0c4fdd232f207c6b88b53c41ff220c59",
"pollution-in-china-photographs?utm_campaign=3434&utm_medium=rpages&utm_source=Facebook&utm_term=e28a76c1572c36c3a13965e52b4b2ea10518eb9f9c79c4bc84cfb85db16be81e",
"pollution-in-china-photographs?utm_campaign=6806&utm_medium=rpages&utm_source=Facebook&utm_term=1a0ddea7bed770d5473c45e9f8d81dfd0c4fdd232f207c6b88b53c41ff220c59",
"pollution-in-china-photographs?utm_campaign=7048&utm_medium=rpages&utm_source=Facebook&utm_term=2ef4bd7b6cd587601d6eeb35925282a1ed095ebbd4e9e4c0337ef868c7de7a0b",
"pollution-in-china-photographs?utm_campaign=7458&utm_medium=rpages&utm_source=Facebook&utm_term=b9e79a51cd4daf4c3ec02accce75b3e1fc9a22cb3133460c9c32a4f2f9cdb68c",
"powerful-photos-of-2014", "real-x-files", "romanovs-last-days",
"science-of-human-decay", "scientific-discoveries-2015", "scully-effect",
"serial-killer-quotes", "shah-iran", "six-of-the-craziest-gods-in-mythology",
"space-facts", "sun-facts", "sunken-cities", "sunken-ships",
"super-bowl-i-facts", "superhero-movies", "surreal-places", "syrian-civil-war-photographs",
"the-five-greatest-mysteries-of-human-history", "the-four-most-important-battles-of-ancient-greece",
"the-most-colorful-cities-in-the-world", "titanic-facts", "titanic-facts?utm_campaign=6385&utm_medium=rpages&utm_source=Facebook&utm_term=f5905e878216d14e20457ee3265caf6c10022d9545609edfb9a3cb0642c1a310",
"titanic-facts?utm_campaign=6899&utm_medium=rpages&utm_source=Facebook&utm_term=b9e79a51cd4daf4c3ec02accce75b3e1fc9a22cb3133460c9c32a4f2f9cdb68c",
"titanic-facts?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=106965c54919c24bf37356500ec50f0709b1de621d6950bb4c5d48759ea3677e",
"titanic-facts?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=538659f1fc53f28d2c87b93ac73973681c1a46a04954964ab6c52ed1ab09b33a",
"titanic-facts?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=91eae42c8fc9568103d46e0b6b6ec08fc34fd68b2e1918ffe2333ec73035c95a",
"titanic-facts?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=ab594f0a1be002c8c3db297e8d33b04678af40e6a6469ac815884ae0a014b3a3",
"titanic-facts?utm_campaign=6928&utm_medium=rpages&utm_source=Facebook&utm_term=d1864657a05e5b716bb5cb16a29f068a55652eb39fb669ea9c22a6486198f227",
"titanic-facts?utm_campaign=7292&utm_medium=rpages&utm_source=Facebook&utm_term=f5905e878216d14e20457ee3265caf6c10022d9545609edfb9a3cb0642c1a310",
"us-veterans-portraits", "vintage-disneyland", "wall-street-early-20th-century",
"what-we-love-this-week-the-incredible-last-words-of-famous-historical-figures",
"woodstock-photos", "zombie-proof-house"), class = "factor"),
`0089` = c(0, 0, 0, 0, 0, 1), `0096` = c(0, 0, 0, 0, 0, 0
), `02` = c(0, 0, 0, 0, 0, 0), `0215` = c(0, 0, 0, 0, 0,
0), `0225` = c(0, 0, 0, 0, 0, 0), `0252` = c(0, 0, 0, 0,
0, 0), `0271` = c(0, 0, 0, 0, 0, 0), `0272` = c(0, 0, 0,
0, 0, 0), `03` = c(0, 0, 0, 0, 1, 1)), .Names = c("path",
"0089", "0096", "02", "0215", "0225", "0252", "0271", "0272",
"03"), row.names = c(NA, 6L), class = "data.frame")
and I need to apply the min(x, 1) function so that it scans each value in the data frame (except the first column, which is not numeric) and returns min(x, 1). That way I have only zeros and ones.
I have tried:
f <- function(x) min(1,x)
res1<-do.call(f,tes[,2:ncol(tes)])
but that does not output the right result.
Any help appreciated
We can use pmin
# pmin() is vectorized: one call caps every numeric cell at 1
# (as.matrix() lets a single pmin() cover all columns at once).
tes[,-1] <- pmin(1, as.matrix(tes[,-1]))
Or if we need only binary values
# !! coerces to logical (0 -> FALSE, nonzero -> TRUE); the unary +
# converts back to numeric, leaving strictly 0/1 values.
tes[,-1] <- +(!!tes[,-1])

Excluding one of the columns from the means calculating

I have a data.frame like this:
> dput(head(dat))
structure(list(`Gene name` = c("at1g01050", "at1g01080", "at1g01090",
"at1g01220", "at1g01320", "at1g01420"), `1_1` = c(0, 0, 0, 0,
0, 0), `1_2` = c(0, 0, 0, 0, 0, 0), `1_3` = c(0, 2.2266502274762,
0, 0, 0, 0), `1_4` = c(0, 1.42835007256373, 0, 0, 0, 0), `1_5` = c(0,
1, 0, 0, 0, 0.680307288653971), `1_6` = c(0, 0.974694551708235,
0.0703315834738149, 0, 0, 1.5411058346636), `1_7` = c(1, 1.06166030205396,
0, 0, 0, 0), `1_8` = c(1, 1.07309874414745, 0.129442847788922,
0, 0, 0), `1_9` = c(1.83566164452602, 0.770848509662441, 1.16522133036595,
1.02360016370994, 0, 0), `1_10` = c(0, 0, 0.96367393959757, 0,
0, 0), `1_11` = c(0, 1, 1.459452636222, 0, 0.992067202742928,
0), `1_12` = c(0, 0, 0.670100384155585, 0, 0.461601636474094,
0), `1_13` = c(0, 0, 1.43074917909221, 0, 1.35246977730244, 0
), `1_14` = c(0, 0, 1.13052717277684, 0, 1.27971261718285, 0),
`1_15` = c(0, 0, 0, 0, 0, 0), `1_16` = c(0, 0, 1.02186950513655,
0, 0.937805171752374, 0), `1_17` = c(0, 0, 0, 0, 1.82226410514639,
0), `1_18` = c(0, 0, 1.2057581396188, 0, 1, 0), `1_19` = c(0,
0, 2.54080080087007, 0, 1.74014162763125, 0), `1_20` = c(0,
0, 0, 0, 0, 0), `1_21` = c(0, 0, 1.85335086627868, 0, 2.93605031878879,
0), `1_22` = c(0, 0, 0, 0, 0, 0), `1_23` = c(0, 0, 0, 0,
0, 0), `1_24` = c(0, 0.59685787388353, 4.74450895485671,
0, 1.64665192735547, 0), `1_25` = c(0, 0, 0, 0, 0, 0), `1_26` = c(0,
0, 0, 0, 0, 0), `1_27` = c(0, 1.70324142554566, 0, 0, 0,
0), `1_28` = c(0, 4.02915818089525, 0, 0, 0, 0), `1_29` = c(0,
1.10050253348262, 0, 0, 0, 1.78705663080963), `1_30` = c(0,
0, 0, 0, 0, 0), `1_31` = c(0.525193634811661, 1.19203674964562,
0, 0, 0, 0), `1_32` = c(0.949695564218912, 0.511935958918944,
0.698256748091399, 0.924419021307232, 0, 0), `1_33` = c(1,
0.392202418854686, 0.981531026331928, 1, 0, 0), `1_34` = c(0,
0, 1.04480642952605, 0, 0, 0), `1_35` = c(0.875709646300199,
0.416787083481068, 0.910412293707794, 0, 0.931813162802324,
0), `1_36` = c(0.235817844851986, 0, 0.695496044366791, 0,
0, 0), `1_37` = c(0, 0, 0, 0, 0, 0), `1_38` = c(0, 0, 0,
0, 0, 0), `1_39` = c(0, 0, 0, 0, 0, 0), `1_40` = c(0, 0.426301584359177,
1.05916031917965, 0, 1.11716924423855, 0), `1_41` = c(0,
0, 0, 0, 0, 0), `1_42` = c(0, 0, 0, 0, 0, 0), `1_43` = c(0,
0, 0, 0, 0, 0), `1_44` = c(0, 0.817605484758179, 1, 0, 1,
0), `1_45` = c(0, 0, 0, 0, 1.83706702696725, 0), `1_46` = c(0,
0, 0, 0, 0, 0), `1_48` = c(0, 0, 0, 0, 0, 0), `1_49` = c(0,
0, 0, 0, 0, 0), `1_50` = c(0, 0, 0, 0, 0, 0), `1_51` = c(0,
0.822966241998042, 0, 0, 0, 0), `1_52` = c(0, 1.38548267401525,
0, 0, 0, 0), `1_53` = c(0, 0.693090058304095, 0, 0, 0, 1.200664746484
), `1_54` = c(0, 7.58136662752864, 0, 0, 0, 0), `1_55` = c(0.519878111919004,
0.530809413647805, 0.343274113384907, 0, 0, 0), `1_56` = c(1.24511715957891,
0.545097856366912, 0.397440073804376, 0, 0, 0), `1_57` = c(1.26748496499576,
0.502893153188496, 1, 1.09278985531586, 0, 0), `1_58` = c(0.696198684496234,
0.68197003689249, 1.30108437738319, 0.778091049180591, 0.533017938104689,
0), `1_59` = c(1.15255606344999, 0.294294436704185, 1.07862692616479,
1, 0.250091116406616, 0), `1_60` = c(1.95634163405497, 0,
1.1602014253913, 0, 0, 0), `1_61` = c(1.09287167009628, 0,
2.05939536537347, 1.08165521287259, 0.68027384701565, 0),
`1_62` = c(0.791776166968497, 0, 0.846107162142824, 0, 0.77013323652256,
0), `1_63` = c(0.378787010943447, 0.391876271945063, 0.623223753921758,
0, 0.651918444771296, 0), `1_64` = c(0.189585762007804, 0.361452381684218,
0.799519726870751, 0, 1.06818683719768, 0), `1_65` = c(0,
0, 2.5212953775211, 0, 0, 0), `1_66` = c(0, 0, 0, 0, 0, 0
), `1_67` = c(0, 0, 0, 0, 2.44827717262786, 0), `1_68` = c(0,
0, 0, 0, 0, 0), `1_69` = c(0, 0, 0, 0, 0, 0), `1_70` = c(0,
0, 2.36142611074334, 0, 2.391093649557, 0), `1_71` = c(0,
0, 0.35565044656798, 0, 0, 0), `1_72` = c(0, 0, 5.86951313801941,
0, 0, 0)), .Names = c("Gene name", "1_1", "1_2", "1_3", "1_4",
"1_5", "1_6", "1_7", "1_8", "1_9", "1_10", "1_11", "1_12", "1_13",
"1_14", "1_15", "1_16", "1_17", "1_18", "1_19", "1_20", "1_21",
"1_22", "1_23", "1_24", "1_25", "1_26", "1_27", "1_28", "1_29",
"1_30", "1_31", "1_32", "1_33", "1_34", "1_35", "1_36", "1_37",
"1_38", "1_39", "1_40", "1_41", "1_42", "1_43", "1_44", "1_45",
"1_46", "1_48", "1_49", "1_50", "1_51", "1_52", "1_53", "1_54",
"1_55", "1_56", "1_57", "1_58", "1_59", "1_60", "1_61", "1_62",
"1_63", "1_64", "1_65", "1_66", "1_67", "1_68", "1_69", "1_70",
"1_71", "1_72"), row.names = c(NA, 6L), class = "data.frame")
That's the code I use for calculation of the mean for 3 replicates which I have in the data frame:
## Calculating the mean of 3 "replicates"
ind <- c(1, 25, 49)
dat2 <- dat[-1]
tbl_end <- cbind(dat[1], sapply(0:23, function(i) rowMeans(dat2[ind+i])))
That's an error which comes:
Error in `[.data.frame`(dat2, ind + i) : undefined columns selected
Called from: eval(substitute(browser(skipCalls = pos), list(pos = 9 - frame)),
envir = sys.frame(frame))
I have 71 columns of results (it should be 72 because I have 24 fractions and 3 replicates, which gives 72 in total) but one column is missing. No idea why it's missing, but anyway I have to solve it. There is no 1_47, which should go together with 1_23 and 1_71. Do you have any idea how I can edit my function to just ignore fraction 1_47 and still get the mean of 1_23 and 1_71?
Why not just add in a dummy column for 1_47. That will make your data more regular and make it much easier to extract the indexes you need. To do this, try
# Insert a placeholder NA column where the missing fraction 1_47 would sit,
# so the layout is regular: column 1 = gene name, then 72 fraction columns.
# NOTE: the column name must be backtick-quoted -- a bare `1_47=` (as in the
# original) is a syntax error because R names cannot start with a digit.
dat2 <- cbind(dat[1:47], `1_47` = rep(NA_real_, nrow(dat)), dat[48:72])
ind <- c(1, 25, 49)
# +1 skips the gene-name column; na.rm = TRUE makes rowMeans() ignore the
# placeholder, so the mean for that fraction is computed from the two real
# replicates (1_23 and 1_71). Spell out TRUE -- T is reassignable.
tbl_end <- cbind(dat[1], sapply(0:23, function(i) rowMeans(dat2[ind + i + 1], na.rm = TRUE)))

Plotting all of the rows in different graph - data frame

Propably the code is very simple but I have never tried plotting in R yet.
I would like to have a linear plot for every row and all the plots on different graph.
The number in my data goes from 0 to 1. Value one is the maximum of the plot, in some cases there might be few maximums in a single row. I would like to have a pdf file as an output.
Data:
> dput(head(tbl_end))
structure(list(`NA` = structure(1:6, .Label = c("AT1G01050",
"AT1G01080", "AT1G01090", "AT1G01220", "AT1G01320", "AT1G01420",
"ATCG00800", "ATCG00810", "ATCG00820", "ATCG01090", "ATCG01110",
"ATCG01120", "ATCG01240", "ATCG01300", "ATCG01310", "ATMG01190"
), class = "factor"), `10` = c(0, 0, 0, 0, 0, 0), `20` = c(0,
0, 0, 0, 0, 0), `52.5` = c(0, 1, 0, 0, 0, 0), `81` = c(0, 0.660693687777888,
0, 0, 0, 0), `110` = c(0, 0.521435654491704, 0, 0, 0, 1), `140.5` = c(0,
0.437291194705566, 0, 0, 0, 1), `189` = c(0, 0.52204783488213,
0, 0, 0, 0), `222.5` = c(0, 0.524298383907171, 0, 0, 0, 0), `278` = c(1,
0.376865096972469, 0, 1, 0, 0), `340` = c(0, 0, 0, 0, 0, 0),
`397` = c(0, 0, 0, 0, 0, 0), `453.5` = c(0, 0, 0, 0, 0, 0
), `529` = c(0, 0, 0, 0, 0, 0), `580` = c(0, 0, 0, 0, 0,
0), `630.5` = c(0, 0, 0, 0, 0, 0), `683.5` = c(0, 0, 0, 0,
0, 0), `735.5` = c(0, 0, 0, 0, 0, 0), `784` = c(0, 0, 0.476101907006443,
0, 0, 0), `832` = c(0, 0, 1, 0, 0, 0), `882.5` = c(0, 0,
0, 0, 0, 0), `926.5` = c(0, 0, 0, 0, 1, 0), `973` = c(0,
0, 0, 0, 0, 0), `1108` = c(0, 0, 0, 0, 0, 0), `1200` = c(0,
0, 0, 0, 0, 0)), .Names = c(NA, "10", "20", "52.5", "81",
"110", "140.5", "189", "222.5", "278", "340", "397", "453.5",
"529", "580", "630.5", "683.5", "735.5", "784", "832", "882.5",
"926.5", "973", "1108", "1200"), row.names = c(NA, 6L), class = "data.frame").
Would be great to have a name of the row on the top of each page in pdf.
Here's an example using your dputed data:
# open the pdf file; each plot() call below lands on its own page
pdf(file = 'myfile.pdf')
# since I don't know what values should be on the X axis,
# I'm just using values from 1 to number of y-values
# (seq_len() instead of 1:(n-1) -- safe if the count is ever zero)
x <- seq_len(ncol(tbl_end) - 1)
for (i in seq_len(nrow(tbl_end))) {
  # plot onto a new pdf page; the row's gene name (first column)
  # becomes the page title via main=
  plot(x = x, y = tbl_end[i, -1], type = 'b', main = tbl_end[i, 1],
       xlab = 'X', ylab = 'Y')
}
# close the pdf file so the output is flushed to disk
dev.off()
where the first page is something like this:
If you want to change the style (e.g. lines without the little circles etc.) of the plot, have a look at the documentation.

Resources