how to loop through column names in R - r

genres=c("Action","Adventure","Animation","Biography","Comedy","Crime",
"Documentary","Drama","Family","Game.Show","Horror","Music","Musical",
"Mystery","Romance","Sci.Fi","Short","Thriller","War","Western")
This is my vector of genres.
Another data set has the same column names.
This is the data set column names
"Title" "Genre" "imdbRating" "Release_Year"
"Action" "Adventure" "Animation" "Biography" "Comedy"
"Crime" "Documentary" "Drama" "Family"
"Fantasy" "Game.Show" "Horror" "Music"
"Musical" "Mystery" "N.A" "Romance"
"Sci.Fi" "Short" "Sport" "Thriller"
"War" "Western"
I want to run this command for all genres replacing each genre with the value.
data_predict$genres[grepl("*genres*", data_predict$Genre)]=1
Original data set
data_predict<-structure(list(Genre = structure(c(3L, 1L, 2L), .Label = c("Action, Adventure, Sci-Fi",
"Action, Drama, War", "Sci-Fi"), class = "factor"), Action = c(0,
0, 0), Adventure = c(0, 0, 0), Animation = c(0, 0, 0), Biography = c(0,
0, 0), Comedy = c(0, 0, 0), Crime = c(0, 0, 0), Documentary = c(0,
0, 0), Drama = c(0, 0, 0), Family = c(0, 0, 0), Game.Show = c(0,
0, 0), Horror = c(0, 0, 0), Music = c(0, 0, 0), Musical = c(0,
0, 0), Mystery = c(0, 0, 0), Romance = c(0, 0, 0), Sci.Fi = c(0,
0, 0), Short = c(0, 0, 0), Thriller = c(0, 0, 0), War = c(0,
0, 0), Western = c(0, 0, 0)), .Names = c("Genre", "Action", "Adventure",
"Animation", "Biography", "Comedy", "Crime", "Documentary", "Drama",
"Family", "Game.Show", "Horror", "Music", "Musical", "Mystery",
"Romance", "Sci.Fi", "Short", "Thriller", "War", "Western"), row.names = c(NA,
3L), class = "data.frame")
Expected result
data_predicted<-structure(list(Genre = structure(c(3L, 1L, 2L), .Label = c("Action, Adventure, Sci-Fi",
"Action, Drama, War", "Sci-Fi"), class = "factor"), Action = c(0,
1, 1), Adventure = c(0, 1, 0), Animation = c(0, 0, 0), Biography = c(0,
0, 0), Comedy = c(0, 0, 0), Crime = c(0, 0, 0), Documentary = c(0,
0, 0), Drama = c(0, 0, 1), Family = c(0, 0, 0), Game.Show = c(0,
0, 0), Horror = c(0, 0, 0), Music = c(0, 0, 0), Musical = c(0,
0, 0), Mystery = c(0, 0, 0), Romance = c(0, 0, 0), Sci.Fi = c(0,
0, 0), Short = c(0, 0, 0), Thriller = c(0, 0, 0), War = c(0,
0, 1), Western = c(0, 0, 0)), .Names = c("Genre", "Action", "Adventure",
"Animation", "Biography", "Comedy", "Crime", "Documentary", "Drama",
"Family", "Game.Show", "Horror", "Music", "Musical", "Mystery",
"Romance", "Sci.Fi", "Short", "Thriller", "War", "Western"), row.names = c(NA,
3L), class = "data.frame")

Try
library(qdapTools)
mtabulate(strsplit(as.character(data_predict$Genre), ', '))
Or
# Fill every genre column (all columns except the first, `Genre`) with 1 when
# that genre is listed in the row's `Genre` string and 0 otherwise.
# The column name is matched against a complete comma-separated entry, so that
# "Music" is not flagged for a row whose genre is "Musical". Dots in column
# names are deliberately left as regex wildcards so that e.g. "Sci.Fi" still
# matches the entry "Sci-Fi".
data_predict[-1] <- lapply(names(data_predict)[-1], function(genre) {
  pattern <- paste0("(^|, *)", genre, " *(,|$)")
  as.numeric(grepl(pattern, data_predict$Genre))
})

Related

Error: Length of `labels` should be as same as number of slices

I want to use ComplexHeatmap to plot the "types" (type 1, type 2) and "subtypes" as top annotations using block annotation.
The row annotations would be the column names of met.resolv.
Code Part 1:
library(ComplexHeatmap)
library(GetoptLong)
clust.col <- c("#003f5c", "#374c80","#7a5195","#bc5090","#ef5675","#ff764a", "#ffa600", "#84E4F7", "#FFB480","#FDFD86", "#00a692")
meth.col <- c("#ff816f", "#ffbaae", "#f1f1f1", "#7e959e", "#004252")
met.immune.col <- c("#01444f", "#01575e", "#0d6a6a", "#217d74", "#38917c", "#52a482", "#6eb786", "#8eca89", "#b0dc8c", "#d5ed90", "#fdfd96")
pdf("Plots/heatmap_methylresolver.pdf")
ha = HeatmapAnnotation(
name = "Sub-Cluster", empty = anno_empty(border = TRUE, height = unit(8, "mm")),
foo = anno_block(gp = gpar(fill = clust.col), labels = unique(meta$clust))
)
la = rowAnnotation(foo = anno_block(gp = gpar(col=met.immune.col),
labels = colnames(met.resolv),
labels_gp = gpar(col = "white", fontsize = 10)))
Heatmap(met.resolv, name = "SubClust", top_annotation = ha,
left_annotation = la, column_title = NULL)
Traceback:
Error: Length of `labels` should be as same as number of slices.
Code part 2:
# Draw one rectangle (and optionally a centred label) that spans a run of
# annotation viewports, from the bottom-left corner of the first viewport to
# the top-right corner of the last one.
#
# group      - values whose min() and max() are interpolated into the viewport
#              names "annotation_<empty_anno>_<min>" and
#              "annotation_<empty_anno>_<max>".
# empty_anno - name fragment interpolated into those viewport names.
# gp         - graphical parameters passed to grid.rect().
# label      - optional text drawn at the centre of the rectangle; NULL draws
#              no text.
# label_gp   - graphical parameters passed to grid.text().
group_block_anno <- function(group, empty_anno, gp = gpar(),
                             label = NULL, label_gp = gpar()) {
  # qq() interpolates code wrapped in @{...}; a "#{...}" marker is left as
  # literal text and the viewport lookup below would then fail.
  seekViewport(qq("annotation_@{empty_anno}_@{min(group)}"))
  loc1 <- deviceLoc(x = unit(0, "npc"), y = unit(0, "npc"))
  seekViewport(qq("annotation_@{empty_anno}_@{max(group)}"))
  loc2 <- deviceLoc(x = unit(1, "npc"), y = unit(1, "npc"))

  # Draw in the "global" viewport using the device locations captured above.
  seekViewport("global")
  grid.rect(loc1$x, loc1$y,
            width = loc2$x - loc1$x, height = loc2$y - loc1$y,
            just = c("left", "bottom"), gp = gp)
  if (!is.null(label)) {
    grid.text(label,
              x = (loc1$x + loc2$x) * 0.5, y = (loc1$y + loc2$y) * 0.5,
              gp = label_gp)
  }
}
group_block_anno(meta[meta$type==1,], "empty", gp = gpar(fill = "#003f5c"), label = "type 1")
group_block_anno(meta[meta$type==2,], "empty", gp = gpar(fill = "#ffa600"), label = "type 2")
dev.off()
Data:
met.resolv
> dput(met.resolv[1:20,])
structure(list(Monocytes = c(0, 0, 0, 0, 0.0691477875220381,
0.0461824156116519, 0.00777223000960038, 0, 0, 0, 0.00165316191239164,
0.0245461060386295, 0.026342142484403, 0, 0, 0, 0.0362473177899938,
0, 0, 0.0615459951223746), `Dendritic Cells` = c(0, 0, 0.00772620422001257,
0, 0, 0, 0.0480402297895918, 0, 0, 0.00898992233305366, 0.057888955860833,
0.0362367878235371, 0, 0.0472205793224695, 0.0286203273050095,
0, 0, 0, 0, 0), Macrophages = c(0, 0.0664642500649833, 0, 0,
0.0371204658284402, 0, 0, 0.0225187084795453, 0.0603416047052193,
0, 0, 0, 0, 0, 0, 0.0313730144635087, 0.0704265029977412, 0,
0.00934366999330129, 0.0411264824824766), Neutrophils = c(0.173202855063056,
0, 0, 0.0643464479529596, 0, 0.0187142163615865, 0.0117918312263748,
0, 0, 0.115244141262919, 0.0520653071278115, 0.00997874098002133,
0, 0.00754706466322519, 0.0885236230551497, 0.0144246971006176,
0.000602296924347016, 0, 0.0195266392400734, 0.00343527794086701
), Eosinophils = c(0, 0.00809451621782635, 0, 0, 0, 0, 0, 0.0026662337469062,
0.0126433025837339, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.00796674767545607,
0), `Regulatory T cells` = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0), `Naive T cells` = c(0, 0.00984426904423764,
0, 0.111694279700795, 0.0123267828167452, 0.0121761009946946,
0.015487451006506, 0, 0.0231848393777138, 0, 0.00278269237244245,
0.0200645732264692, 0, 0.0147361795082149, 0.0526711496398388,
0, 0.00992032127196248, 0, 0, 0.030586635289606), `Memory T cells` = c(0,
0.0312258875142767, 0.124409625779986, 0, 0.0135351004994425,
0.0537156172200875, 0.0540049513012593, 0.0297542571267331, 0,
0.0363411597373587, 0.0464268327265193, 0.0397546685980086, 0.0425232243321057,
0.0491394530734343, 0, 0.0512205034016493, 0.023265025230139,
0.130162735781893, 0, 0.00172924583134173), `CD8 T cells` = c(0.00282626493694126,
0.0225524838253428, 0, 0.0030508623462426, 0, 0.0128041131453121,
0, 0.102208367313482, 0, 0, 0, 0, 0.0668565430396047, 0.0343785834326558,
0, 0.0418137510155405, 0.0039045724524825, 0.0142647475514386,
0.0757110710314276, 0), `NK cells` = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), `B cells` = c(0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0.0158949534772194, 0, 0, 0.0174286273520171,
0, 0, 0.0240398020597407, 0)), row.names = c("TCGA.2K.A9WE.01A",
"TCGA.2Z.A9J1.01A", "TCGA.2Z.A9J3.01A", "TCGA.2Z.A9J5.01A", "TCGA.2Z.A9J6.01A",
"TCGA.2Z.A9J7.01A", "TCGA.2Z.A9J8.01A", "TCGA.2Z.A9JD.01A", "TCGA.2Z.A9JI.01A",
"TCGA.2Z.A9JJ.01A", "TCGA.2Z.A9JO.01A", "TCGA.2Z.A9JQ.01A", "TCGA.4A.A93W.01A",
"TCGA.4A.A93X.01A", "TCGA.4A.A93Y.01A", "TCGA.5P.A9JU.01A", "TCGA.5P.A9JY.01A",
"TCGA.5P.A9KE.01A", "TCGA.A4.7288.01A", "TCGA.A4.7583.01A"), class = "data.frame")
meta
> dput(meta[1:20,])
structure(list(clust = c("1a", "2a", "2b", "1b", "2a", "2c",
"1c", "1c", "1b", "1d", "1e", "2c", "2b", "1c", "1e", "1c", "2c",
"1f", "1c", "2a"), type = c("1", "2", "2", "1", "2", "2", "1",
"1", "1", "1", "1", "2", "2", "1", "1", "1", "2", "1", "1", "2"
)), row.names = c("TCGA.2K.A9WE.01A", "TCGA.2Z.A9J1.01A", "TCGA.2Z.A9J3.01A",
"TCGA.2Z.A9J5.01A", "TCGA.2Z.A9J6.01A", "TCGA.2Z.A9J7.01A", "TCGA.2Z.A9J8.01A",
"TCGA.2Z.A9JD.01A", "TCGA.2Z.A9JI.01A", "TCGA.2Z.A9JJ.01A", "TCGA.2Z.A9JO.01A",
"TCGA.2Z.A9JQ.01A", "TCGA.4A.A93W.01A", "TCGA.4A.A93X.01A", "TCGA.4A.A93Y.01A",
"TCGA.5P.A9JU.01A", "TCGA.5P.A9JY.01A", "TCGA.5P.A9KE.01A", "TCGA.A4.7288.01A",
"TCGA.A4.7583.01A"), class = "data.frame")
Expected output (Example):

Subset of data frame base on the value in column or neighboring columns

I would like to find which column contains the highest number of 1s. The number 1 should appear only once per row. Once the column with the highest number of 1s has been located, the script should also check the neighboring columns (+1 / -1), and if either of them contains the number 1 it should be selected as well. All of these rows should be kept by the subset function.
Let's put part of original data:
# Example 1: four replicate rows; column `506` holds the most 1s.
structure(list( `10` = c(0, 0, 0, 0), `34` = c(0, 0, 0, 0),
`59` = c(0, 0, 0, 0), `84` = c(0, 0, 0, 0),
`110` = c(0, 0, 0, 0), `134` = c(0, 0, 0, 0),
`165` = c(0, 0, 0, 0), `199` = c(0, 0, 0, 0),
`234` = c(0, 0, 0, 0),
`257` = c(0.0160178986200301, 0, 0.0409772658686249, 0.0289710439505515),
`362` = c(0.0679054515644214, 0.126933274414494, 0.0855598028367368, 0.0596214721268868),
`433` = c(0.490914059297718, 0.604765061128296, 0.813348757670254, 1),
`506` = c(1, 1, 1, 0.971410482822965),
`581` = c(0.198244295668807, 0.234158197083517, 0.269655970224324, 0.195318383259472),
`652` = c(0.271177756524115, 0.223018854028576, 0.301352982597324, 0.142584385725234),
`733` = c(0.212426561005602, 0.212778023272942, 0.228513228045468, 0),
`818` = c(0.213816778248395, 0.168570481661511, 0.264465345538678, 0),
`896` = c(0.137102063123377, 0, 0.320234382858867, 0),
`972` = c(0.108932231179123, 0, 0.179106729705261, 0),
`1039` = c(0.101762535865555, 0, 0, 0),
EOD = c("Peter", "Peter", "Peter", "Peter"),
Complex = c("FT team", "FT team", "FT team", "FT team")),
.Names = c("10", "34", "59", "84", "110", "134", "165", "199",
"234", "257", "362", "433", "506", "581", "652", "733",
"818", "896", "972", "1039", "EOD", "Complex"),
row.names = c("Peter_1_Rep_1_E", "Peter_1_Rep_2_E",
"Peter_1_Rep_3_E", "Peter_1_Rep_4_E"),
class = "data.frame")
As you can clearly see, in the original data column 506 should be selected as the one containing the highest number of 1s, and the data should be subsetted based on it. However, the output would be exactly the same because in this data the neighboring fraction (-1, i.e. 433) also contains a 1. That's an easy example.
Situation might be more complicated, like in that case:
structure(list( `10` = c(0, 0, 0, 0, 0, 0, 0, 0),
`34` = c(0, 0, 0, 0, 0, 0, 0, 0),
`59` = c(0, 0, 0, 0, 0, 0, 0, 0),
`84` = c(0, 0, 0, 0, 0, 0, 0, 0),
`110` = c(0, 0, 0, 0, 0, 0, 0, 0),
`134` = c(0.168783347110543, 0, 0.382618775924215, 0, 0.530638724516877, 0, 0.169526042048202, 0),
`165` = c(1, 0.36380544964196, 1, 0.13979454361738, 1, 0.239652477288689, 1, 0.240341578327444),
`199` = c(0.355158938904336, 1, 0.646724265971128, 1, 0.582637073151552, 1, 0.20319390520841, 1),
`234` = c(0.0963628165627114, 0.575436312346942, 0.229853828180188, 0.433555069046817, 0.247567185011894, 0.508529485059242, 0.138356164383562, 0.389880251276011),
`257` = c(0, 0.17393595585728, 0, 0.127787133715056, 0, 0.117147323350173, 0, 0),
`362` = c(0, 0, 0, 0.0919333108790839, 0, 0, 0, 0),
`433` = c(0, 0, 0, 0.0745570899292691, 0, 0, 0, 0),
`506` = c(0, 0, 0, 0, 0, 0, 0, 0),
`581` = c(0, 0, 0, 0, 0, 0, 0, 0),
`652` = c(0, 0, 0, 0, 0, 0, 0, 0),
`733` = c(0, 0, 0, 0, 0, 0, 0, 0),
`818` = c(0, 0, 0, 0, 0, 0, 0, 0),
`896` = c(0, 0, 0, 0, 0, 0, 0, 0),
`972` = c(0, 0, 0, 0, 0, 0, 0, 0),
`1039` = c(0, 0, 0, 0, 0, 0, 0, 0),
EOD = c("Paul", "Paul", "Paul", "Paul", "Paul", "Paul", "Paul", "Paul"),
Complex = c("GG Team", "GG Team", "GG Team", "GG Team", "GG Team", "GG Team", "GG Team", "GG Team")),
.Names = c("10", "34", "59", "84", "110", "134", "165", "199", "234", "257", "362", "433", "506", "581", "652", "733", "818", "896", "972", "1039", "EOD", "Complex"),
row.names = c("PaulG_1_Rep_1_E", "Paul_1_Rep_1_E", "PaulN_1_Rep_2_E", "PaulG_1_Rep_2_E", "Paul_1_Rep_3_E", "PaulC_1_Rep_3_E", "PaulC_1_Rep_4_E", "Paul_1_Rep_4_E"),
class = "data.frame")
In that situation there are two columns which contain the same number of 1s. In this case column with bigger colsum should be selected.
let df1 be your input:
# Keep only the numeric columns to build the filter. `vapply()` guarantees a
# logical mask and `drop = FALSE` keeps a data frame even when only one column
# is numeric (otherwise colSums()/rowSums() below would fail on a vector).
df_num <- df1[, vapply(df1, is.numeric, logical(1)), drop = FALSE]
n1 <- colSums(df_num == 1) # number of 1s per column
i <- which(n1 == max(n1)) # index of cols with max 1s
if (length(i) > 1) {
  # tie on the number of 1s: pick the tied column with the largest column sum
  max_cs <- which.max(colSums(df_num[, i, drop = FALSE]))
  i <- i[max_cs] # our column index
}
# chosen column plus its left/right neighbours, clamped to valid positions
# (the lower bound is 1, not 0, so no zero index is ever generated)
neighbours <- seq(max(i - 1, 1), min(i + 1, ncol(df_num)))
# filter is TRUE if the chosen column is 1 or if any neighbour is 1
filter <- rowSums(df_num[, neighbours, drop = FALSE] == 1) > 0
df1[filter, ] # your result
In both of your examples, all rows are kept
I'd use the tidyverse to convert it to long format then pull in the column sums to determine where the first one (with the largest sum) is:
library(tidyverse)
# add rownames to the data frame
df2$id <- rownames(df2)
# make a data frame of each column's sum
thecolsums <- colSums(df2[,map_lgl(df2, is.numeric)]) %>%
  enframe(name = "colname", value = "colsum")
# change the data frame to long format
dflong <- df2 %>%
  mutate(rowid = row_number()) %>%
  gather(colname, val, -rowid)
# among the columns that contain a 1, pick the one with the largest column sum
# NOTE(review): this ranks by column sum, not by the number of 1s per column
whichcol <- dflong %>%
  group_by(colname) %>%
  filter(val == 1) %>%
  summarize(
    firstone = min(rowid, na.rm = TRUE)
  ) %>%
  left_join(thecolsums, by = 'colname') %>%
  filter(colsum == max(colsum)) %>%
  pluck('colname')
# what's the numerical index of the column
whichcolindex <- which(names(df2) == whichcol)
# get previous and next columns if they exist; 0 selects no column, so it
# serves as the "no neighbour" marker at either edge of the data frame
prevcolindex <- ifelse(whichcolindex <= 1, 0, whichcolindex - 1)
nextcolindex <- ifelse(whichcolindex == ncol(df2), 0, whichcolindex + 1)
# do the previous and next columns have 1s in them?
# na.rm = TRUE keeps the result TRUE/FALSE when a neighbour has missing values;
# an NA here would otherwise become an NA column index below
prevcolhasone <- any(df2[, prevcolindex] == 1, na.rm = TRUE)
nextcolhasone <- any(df2[, nextcolindex] == 1, na.rm = TRUE)
# create a vector with 1, 2 or 3 column indexes
finalindex <- c(
  prevcolindex[prevcolhasone],
  whichcolindex,
  nextcolindex[nextcolhasone]
)
# subset the original data frame, only preserving the columns in question
results <- df2[, finalindex]

mean for subsets using survey

I have two variables "c" and "q" in a data.frame. "c" is a number between zero and one (a level of poverty) and "q" indicates whether the household (or subject) is poor (1) or not poor (0).
How can I calculate the mean of "c" for the poor households only (q = 1)?
What I need
Important detail: I have a database for a country and I want this result by region.
I am using the svyby like this:
svyby( ~q , ~region , design = base2015_pos , na.rm=TRUE, svytotal)
That way R gives me the number of poor by region, which I don't need now. I need the mean of a subset (see image above) by region.
structure(list(domicilio = c(11000015001, 11000015003, 11000015004), agua = c(0, 0, 6), ind_agua = c(0, 0, 1), esgoto = c(1, 1, 6), ind_cond_sanitaria = c(1, 1, 1), lixo = c(0, 0, 0), ind_lixo = c(0, 0, 0), luz = c(0, 0, 0), ind_iluminacao = c(0, 0, 0), ativos = c(0, 0, 0), ind_ativos = c(0, 0, 0), emprego = c(0, 0, 0), ind_emprego = c(0, 0, 0), renda = c(0, 0, 0), ind_renda = c(0, 0, 0), casa = c(1, 1, 0), ind_riqueza = c(1, 1, 0), anos = c(0, 0, 0), ind_estudo = c(0, 0, 0), ler = c(0, 0, 0), ind_alfabetizado = c(0, 0, 0), peso = c(270, 270, 270), sexo = c(0, 1, 1), uf = c("11", "11", "11"), v4609 = c("001772940", "001772940", "001772940"), v4617 = c(110001, 110001, 110001), v4618 = c(1, 1, 1), pre_wgt = c(200, 200, 200), one = c(1L,
1L, 1L), region = c("1", "1", "1"), c = c(0.2, 0.2, 0.2), q = c(0, 0, 0)), .Names = c("domicilio", "agua", "ind_agua", "esgoto", "ind_cond_sanitaria", "lixo", "ind_lixo", "luz", "ind_iluminacao","ativos", "ind_ativos", "emprego", "ind_emprego", "renda", "ind_renda", "casa", "ind_riqueza", "anos", "ind_estudo", "ler", "ind_alfabetizado","peso", "sexo", "uf", "v4609", "v4617", "v4618", "pre_wgt", "one", "region", "c", "q"), row.names = c(NA, 3L), class = "data.frame")
# complex sample survey design
library(survey)
# your data.frame
x <- structure(list(domicilio = c(11000015001, 11000015003, 11000015004), agua = c(0, 0, 6), ind_agua = c(0, 0, 1), esgoto = c(1, 1, 6), ind_cond_sanitaria = c(1, 1, 1), lixo = c(0, 0, 0), ind_lixo = c(0, 0, 0), luz = c(0, 0, 0), ind_iluminacao = c(0, 0, 0), ativos = c(0, 0, 0), ind_ativos = c(0, 0, 0), emprego = c(0, 0, 0), ind_emprego = c(0, 0, 0), renda = c(0, 0, 0), ind_renda = c(0, 0, 0), casa = c(1, 1, 0), ind_riqueza = c(1, 1, 0), anos = c(0, 0, 0), ind_estudo = c(0, 0, 0), ler = c(0, 0, 0), ind_alfabetizado = c(0, 0, 0), peso = c(270, 270, 270), sexo = c(0, 1, 1), uf = c("11", "11", "11"), v4609 = c("001772940", "001772940", "001772940"), v4617 = c(110001, 110001, 110001), v4618 = c(1, 1, 1), pre_wgt = c(200, 200, 200), one = c(1L,
1L, 1L), region = c("1", "1", "1"), c = c(0.2, 0.2, 0.2), q = c(0, 0, 0)), .Names = c("domicilio", "agua", "ind_agua", "esgoto", "ind_cond_sanitaria", "lixo", "ind_lixo", "luz", "ind_iluminacao","ativos", "ind_ativos", "emprego", "ind_emprego", "renda", "ind_renda", "casa", "ind_riqueza", "anos", "ind_estudo", "ler", "ind_alfabetizado","peso", "sexo", "uf", "v4609", "v4617", "v4618", "pre_wgt", "one", "region", "c", "q"), row.names = c(NA, 3L), class = "data.frame")
# your survey.design (this is not the correct svydesign statement, please follow the directions specific to your data set)
y <- svydesign( ~ 1 , data = x , weights = ~ pre_wgt )
# your desired subset
z <- subset( y , q == 1 )
# your desired mean
svyby( ~ c , ~ region , z , svymean )
aggregate(df$c, by=list(df$q), FUN=mean)
Here's another possibility. To illustrate, create a dataset per your parameters:
set.seed(787)
dat.a <-runif(n=10,min=0,max=1)
dat.b <-rbinom(n=10, size=1, prob=0.5)
dat.1 <-data.frame(matrix(c(dat.a, dat.b), ncol=2, nrow=10))
colnames(dat.1) <-c("c","q")
dat.1
c q
1 0.35326234 1
2 0.45277055 0
3 0.29505270 0
4 0.78723105 1
5 0.95915348 1
6 0.17505284 0
7 0.79693672 0
8 0.01648420 1
9 0.02706417 0
10 0.93996311 1
Now subset by extracting all rows that match q=1 and compute mean for column c in resulting output:
dat.1.subset <-dat.1[dat.1$q==1,]
mean(dat.1.subset$c)
[1] 0.6112188

Calculate euclidean distance between profiles stored in data frame. Using one row as a reference

I have a data frame like that one below:
> dput(data)
structure(list(`28` = c(0, 0, 0, 0, 0, 0), `38` = c(0, 0, 0,
0, 0, 0), `45` = c(0, 0, 0, 0, 0, 0), `53` = c(0, 0, 0, 0, 0,
0), `60` = c(0, 0, 0, 0, 0, 0), `78` = c(0, 0, 0, 0, 0, 0), `116` = c(0,
0, 0, 0, 0, 0.983309489747258), `145` = c(0, 0, 0, 0, 0, 1),
`189` = c(0, 1, 0.560384508734634, 0, 0, 0.875695437927198
), `223` = c(0, 0.988158197286733, 1, 0, 0, 0.492500108379937
), `281` = c(1, 0.677856978615774, 0.448525741750624, 0,
0.362088745790311, 0.180474270603026), `362` = c(0.79151704397606,
0.763278914693033, 0.35864682503004, 1, 1, 0.114178985852806
), `440` = c(0.662841530054645, 0.818636468153598, 0.448488769756909,
0, 0.448447503793346, 0), `524` = c(0, 0.638192687974247,
0, 0, 0, 0), `634` = c(0, 0, 0, 0, 0, 0), `759` = c(0, 0,
0, 0, 0, 0), `848` = c(0, 0, 0, 0, 0, 0), `979` = c(0, 0,
0, 0, 0, 0), `1120` = c(0, 0, 0, 0, 0, 0), `1248` = c(0,
0, 0, 0, 0, 0)), .Names = c("28", "38", "45", "53", "60",
"78", "116", "145", "189", "223", "281", "362", "440", "524",
"634", "759", "848", "979", "1120", "1248"), row.names = c("Mark",
"Gregg", "Tim", "Oscar", "Tom", "Matthew"
), class = "data.frame")
I would like to calculate euclidean distance between all the profiles from this data and Tim should be used as a reference. The results can be stored in additional column.
Mark to Tim
Gregg to Tim
Oscar to Tim
and etc
You can use dist function (which actually computes all the distances between all the profiles) :
m <- as.matrix(DF)
distances <- as.matrix(dist(m, method = "euclidean", upper = TRUE,diag = TRUE))
> distances['Mark','Tim']
[1] 1.36069
> distances['Gregg','Tim']
[1] 0.9767401
> distances['Oscar','Tim']
[1] 1.458658

How to make the graph easier to read ? Editing plot

I would like to ask you for the suggestions how I can edit my plot function to make my graph more clear ?
Here I show you the code which I use for plotting:
# open the pdf file
pdf(file='LSF1_PWD_GWD.pdf')
a <- c('LSF1', 'PWD', 'GWD')
rowsToPlot<-c(1066,2269,109)
matplot(as.matrix(t(tbl_alles[rowsToPlot,])),type=rep("l", length(rowsToPlot)), col=rainbow(length(rowsToPlot)),xlab = 'Fraction Size', ylab = 'Intensity')
legend('topright',a,lty=1, bty='n', cex=.75, col = rainbow(length(rowsToPlot)))
# close the pdf file
dev.off()
and that's how the graph looks like:
It's just a basic plot because I have no idea how to edit it. The arrow indicates three lines on one position which you can't see because they overlap... and that's the most important part of this graph for me. Maybe I shouldn't use dotted line ? How to change it ?
Data:
tbl_alles <-
structure(list("10" = c(0, 0, 0, 0, 0, 0),
"20" = c(0, 0, 0, 0, 0, 0),
"52.5" = c(0, 0, 0, 0, 0, 0),
"81" = c(0, 0, 1, 0, 0, 0),
"110" = c(0, 0, 0, 0, 0, 0),
"140.5" = c(0, 0, 0, 0, 0, 0),
"189" = c(0, 0, 0, 0, 0, 0),
"222.5" = c(0, 0, 0, 0, 0, 0 ),
"278" = c(0, 0, 0, 0, 0, 0),
"340" = c(0, 0, 0, 0, 0, 0),
"397" = c(0, 1, 0, 0, 0, 0),
"453.5" = c(0, 0.66069369, 0, 0, 0, 1),
"529" = c(0, 0.521435654, 0, 0, 1, 0),
"580" = c(0, 0.437291195, 0, 0, 1, 0),
"630.5" = c(0, 0.52204783, 0, 0, 0, 0),
"683.5" = c(0, 0.52429838, 0, 0, 0, 0),
"735.5" = c(1, 0.3768651, 0, 1, 0, 0),
"784" = c(0, 0, 0, 0, 0, 0),
"832" = c(0, 0, 0, 0, 0, 0),
"882.5" = c(0, 0, 0, 0, 0, 0),
"926.5" = c(0, 0, 0, 0, 0, 0),
"973" = c(0, 0, 0, 0, 0, 0),
"1108" = c(0, 0, 0, 0, 0, 0),
"1200" = c(0, 0, 0, 0, 0, 0)),
.Names = c("10", "20", "52.5", "81",
"110", "140.5","189", "222.5",
"278", "340", "397", "453.5",
"529", "580", "630.5", "683.5",
"735.5", "784", "832", "882.5",
"926.5", "973", "1108", "1200"),
row.names = c("at1g01050.1", "at1g01080.1",
"at1g01090.1","at1g01220.1",
"at1g01420.1", "at1g01470.1"),
class = "data.frame")
RowsToPlot:
> dput(tbl_alles[rowsToPlot,])
structure(list(`10` = c(0, 0, 0), `20` = c(0, 0, 0), `52.5` = c(0,
0, 0), `81` = c(0, 0, 0), `110` = c(0, 0, 0), `140.5` = c(0,
0, 0), `189` = c(0, 0, 0), `222.5` = c(0, 0, 0), `278` = c(0,
0, 0), `340` = c(0, 0, 0), `397` = c(0, 0, 0), `453.5` = c(0,
0, 0), `529` = c(0, 0, 0), `580` = c(0, 0, 0), `630.5` = c(0,
0, 0), `683.5` = c(0, 0, 0.57073483), `735.5` = c(0, 1, 0.85691826
), `784` = c(0, 0, 0.90706982), `832` = c(1, 1, 1), `882.5` = c(0,
0, 0), `926.5` = c(0, 0, 0), `973` = c(0, 0, 0), `1108` = c(0,
0, 0), `1200` = c(0, 0, 0)), .Names = c("10", "20", "52.5", "81",
"110", "140.5", "189", "222.5", "278", "340", "397", "453.5",
"529", "580", "630.5", "683.5", "735.5", "784", "832", "882.5",
"926.5", "973", "1108", "1200"), row.names = c("at3g01510.1",
"at5g26570.1", "at1g10760.1"), class = "data.frame")
Okay, here's a way to distinguish the lines clearly, while keeping everything on one plot. I use non solid linetypes and different sizes to 'make room' for the overlayed lines.
library(reshape2)
library(ggplot2)
dat <- as.data.frame(as.matrix(t(tbl_alles)))
dat$x <- as.numeric(row.names(dat))
ggplot(melt(dat, id.vars='x'), aes(x=x, y=value, group=variable)) +
geom_line(aes(color=variable, linetype=variable, size=variable)) +
scale_linetype_manual(values=c('solid', 'dotted', 'dashed')) +
scale_size_manual(values=c(1,3,1)) +
scale_color_manual(values=c('black', 'red', 'white')) +
theme(axis.text = element_text(color='black'),
panel.background = element_rect('grey'),
legend.key = element_rect('grey'),
panel.grid = element_blank()) +
labs(title='This is not a pretty chart, but you can make out the lines')
I took as a starting point your data from the dput you pasted above:
tbl_alles <- structure(list(`10` = c(0, 0, 0), `20` = c(0, 0, 0), `52.5` = c(0, 0, 0), `81` = c(0, 0, 0), `110` = c(0, 0, 0), `140.5` = c(0, 0, 0), `189` = c(0, 0, 0), `222.5` = c(0, 0, 0), `278` = c(0, 0, 0), `340` = c(0, 0, 0), `397` = c(0, 0, 0), `453.5` = c(0, 0, 0), `529` = c(0, 0, 0), `580` = c(0, 0, 0), `630.5` = c(0, 0, 0), `683.5` = c(0, 0, 0.57073483), `735.5` = c(0, 1, 0.85691826), `784` = c(0, 0, 0.90706982), `832` = c(1, 1, 1), `882.5` = c(0, 0, 0), `926.5` = c(0, 0, 0), `973` = c(0, 0, 0), `1108` = c(0, 0, 0), `1200` = c(0, 0, 0)), .Names = c("10", "20", "52.5", "81", "110", "140.5", "189", "222.5", "278", "340", "397", "453.5", "529", "580", "630.5", "683.5", "735.5", "784", "832", "882.5", "926.5", "973", "1108", "1200"), row.names = c("at3g01510.1", "at5g26570.1", "at1g10760.1"), class = "data.frame")
This is most certainly not what you need, but perhaps it can give you another idea.
X=structure(list(`10` = c(0, 0, 0), `20` = c(0, 0, 0), `52.5` = c(0,
0, 0), `81` = c(0, 0, 0), `110` = c(0, 0, 0), `140.5` = c(0,
0, 0), `189` = c(0, 0, 0), `222.5` = c(0, 0, 0), `278` = c(0,
0, 0), `340` = c(0, 0, 0), `397` = c(0, 0, 0), `453.5` = c(0,
0, 0), `529` = c(0, 0, 0), `580` = c(0, 0, 0), `630.5` = c(0,
0, 0), `683.5` = c(0, 0, 0.57073483), `735.5` = c(0, 1, 0.85691826
), `784` = c(0, 0, 0.90706982), `832` = c(1, 1, 1), `882.5` = c(0,
0, 0), `926.5` = c(0, 0, 0), `973` = c(0, 0, 0), `1108` = c(0,
0, 0), `1200` = c(0, 0, 0)), .Names = c("10", "20", "52.5", "81",
"110", "140.5", "189", "222.5", "278", "340", "397", "453.5",
"529", "580", "630.5", "683.5", "735.5", "784", "832", "882.5",
"926.5", "973", "1108", "1200"), row.names = c("at3g01510.1",
"at5g26570.1", "at1g10760.1"), class = "data.frame");
library(ggplot2)
library(reshape2)
library(data.table)
X.dt<-as.data.table(t(X))
X.dt[,X:=1:dim(X.dt)[1]]
X.dt<-melt(X.dt, id='X')
ggplot(X.dt,aes(X, value,group=variable,color=variable))+
geom_line()+
facet_wrap(~variable, nrow=3)+
guides(color=FALSE)+labs(x="X",y="Intensity")
Since you have a discrete number of x values, I suggest using a barplot instead. This will make the categories easier to distinguish and highlight the aspect you are most interested in.
First put the data in long format
dat <- structure(list(`10` = c(0, 0, 0), `20` = c(0, 0, 0), `52.5` = c(0, 0, 0),
`81` = c(0, 0, 0), `110` = c(0, 0, 0), `140.5` = c(0, 0, 0),
`189` = c(0, 0, 0), `222.5` = c(0, 0, 0), `278` = c(0, 0, 0),
`340` = c(0, 0, 0), `397` = c(0, 0, 0), `453.5` = c(0, 0, 0),
`529` = c(0, 0, 0), `580` = c(0, 0, 0), `630.5` = c(0, 0, 0),
`683.5` = c(0, 0, 0.57073483), `735.5` = c(0, 1, 0.85691826),
`784` = c(0, 0, 0.90706982), `832` = c(1, 1, 1),
`882.5` = c(0, 0, 0), `926.5` = c(0, 0, 0), `973` = c(0, 0, 0),
`1108` = c(0, 0, 0), `1200` = c(0, 0, 0)),
.Names = c("10", "20", "52.5", "81", "110", "140.5", "189",
"222.5", "278", "340", "397", "453.5", "529", "580",
"630.5", "683.5", "735.5", "784", "832", "882.5",
"926.5", "973", "1108", "1200"),
row.names = c("at3g01510.1", "at5g26570.1", "at1g10760.1"),
class = "data.frame")
library(tidyr)
dat$rowname <- rownames(dat)
ggdat <- gather(dat, key = "colname", value = "Intensity", -rowname)
Then create the barplot using ggplot2
library(RColorBrewer)
library(ggplot2)
colors <- brewer.pal(nrow(dat), "Dark2")
ggplot(data = ggdat, aes(x = colname, y = Intensity, fill = rowname)) +
geom_bar(aes(color = rowname), stat = "identity",
position = position_dodge(), width = 0.75) +
scale_fill_manual(values = colors) +
scale_color_manual(values = colors) +
theme(axis.title.x = element_blank(),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
legend.position = "bottom")
The code could be used for more than 3 rows, although the bars will get harder to distinguish with more categories. If this is a problem, you could consider dropping/binning x values, or perhaps splitting the plot into two:
ggdat$group <- factor(ggdat$colname %in% colnames(dat)[1:12],
levels = c(TRUE, FALSE), labels = c("Low x", "High x"))
ggplot(data = ggdat, aes(x = colname, y = Intensity, fill = rowname)) +
geom_bar(aes(color = rowname), stat = "identity",
position = position_dodge(), width = 0.75) +
scale_fill_manual(values = colors) +
scale_color_manual(values = colors) +
theme(axis.title.x = element_blank(),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
legend.position = "bottom") +
facet_wrap(~ group, ncol = 1, scales = "free_x")
How many records does the dataset have? It seems you are dealing with an overplotting issue. Follow @Nikos's method to tidy the data.
Use size and alpha to change the size and transparency of the line.
ggplot(data = X.dt, aes(x = X, y = value, group = variable, color = variable)) +
geom_line(data = X.dt, aes(x = X, y = value, group = variable, color = variable),
size = 3, alpha = .25)
The color of the line changes as they overlap. However this will only work for smaller datasets. My only other suggestion is to overlay geom_line() with geom_point() that will plot points over the lines. You can use position = position_jitter() to slightly augment the position of the points, that way if they overlap you can see where they overlap.
ggplot(data = X.dt, aes(x = X, y = value, group = variable, color = variable)) +
geom_point(position = position_jitter(w = 0.001, h = 0.02), size = 3, alpha = .5) +
geom_line(data = X.dt, aes(x = X, y = value, group = variable, color = variable), size = 1, alpha = .25)
You can try to play with the line types, but this can become really difficult if you have too many lines to see: is 3 the maximum you'll have? Otherwise, you may consider another way to draw your data.
Here is an example with your data, when I plot it, I can see the 3 lines :
matplot(as.matrix(t(tbl_alles[rowsToPlot,])),type="l",lwd=2,lty=c("solid","48","36"), col=rainbow(length(rowsToPlot)),xlab = 'Fraction Size', ylab = 'Intensity')
legend('topright',c('LSF1', 'PWD', 'GWD'),lty=c("solid","48","36"),lwd=2, bty='n', cex=.75, col = rainbow(length(rowsToPlot)))
the 3 line types :
solid: this is the default type, as you already know...
48: first 4 units of line then a blank of 8 units
36: first 3 units of line then a blank of 6 units.
I also changed the width of the line with lwd=2.
There is another parameter to play with : transparency.
If (keeping the different lty) you change the colors to c("#FF000030","#0000FF50","#00FF0080") for example, it will be easier to see every lines (the two last characters of each hexadecimal code specify the transparency).
If you use transparency, then you can even specify a single color, and overlapping lines will appear darker: for example, col="#00000044".

Resources