nrow() gives more rows than original in R - r

I have a file with 20 fields as headers in the first row. The remaining rows have unequal numbers of fields; some of the rows have more columns than the header. When I tried to read it using read.delim(), it read the data without error, but the total row count is higher than the number of rows in the file.
Here are a few lines of the file:
Chromosome Position SNPid Reference Alternate QUAL Homozygosity Tool Depth MappingQuality EFFECT IMPACT FUNCTIONAL_CLASS CODON_CHANGE AMINO_ACID_CHANGE GENE_NAME GENE_BIOTYPE GENE_CODING TRANSCRIPT_ID EXON_ID
chr1 403111 . G A 24 het SAM 20 55 INTERGENIC MODIFIER _ _ _ _ _ _ _ _ _
chr1 602567 rs21953190 A G 3265.77 hom GATKSAM 91 58.46 SYNONYMOUS_CODING LOW SILENT gaT/gaC D1034 ADNP2 protein_coding CODING ENSCAFT00000000008 5 _
chr1 604894 rs21953191 A G 2869.77 hom GATKSAM 77 59.70 NON_SYNONYMOUS_CODING MODERATE MISSENSE Ttt/Ctt F259L ADNP2 protein_coding CODING ENSCAFT00000000008 5 _
chr1 758630 . T TC 1531.73 hom GATKSAM 38 46.20 INTRON MODIFIER _ _ _ PQLC1 protein_coding CODING ENSCAFT00000000011 2 _
chr1 800715 . C CT 514.73 hom GATKSAM 13 60.00 INTRON MODIFIER _ _ _ PQLC1 protein_coding CODING ENSCAFT00000000011 6 ,SPLICE_SITE_ACCEPTOR HIGH _ _ _ PQLC1 protein_coding CODING ENSCAFT00000000011 7 ,SPLICE_SITE_DONOR HIGH _ _ _ PQLC1 protein_coding CODING ENSCAFT00000000011 6 _
chr1 1104035 rs21966859 G A 3803.77 hom GATKSAM 97 57.97 INTRON MODIFIER _ _ _ NFATC1 protein_coding CODING ENSCAFT00000000013 2 ,INTRON MODIFIER _ _ _ NFATC1 protein_coding CODING ENSCAFT00000036234 2 _
chr1 1120994 . CGCG C 604.73 hom GATKSAM 21 56.55 INTERGENIC MODIFIER _ _ _ _ _ _ _ _ ,UPSTREAM MODIFIER _ _ _ NFATC1 protein_coding CODING ENSCAFT00000000013 _ ,UPSTREAM MODIFIER _ _ _ NFATC1 protein_coding CODING ENSCAFT00000036234 _ _
chr1 1136916 rs21935602 G A 3899.77 hom GATKSAM 101 59.17 DOWNSTREAM MODIFIER _ _ _ ATP9B protein_coding CODING ENSCAFT00000000014 _ ,DOWNSTREAM MODIFIER _ _ _ ATP9B protein_coding CODING ENSCAFT00000042968 _ ,UTR_3_PRIME MODIFIER _ _ _ ATP9B protein_coding CODING ENSCAFT00000046825 29 _
There are 9 rows in the file. But when it is read into R and the number of rows is counted, it shows 12.
read.delim("test.txt",header=T,sep='\t')->data
nrow(data)
Could someone help, to read the data properly?
Below is the output from dput(data)
> dput(data)
structure(list(Chromosome = structure(c(3L, 3L, 3L, 3L, 3L, 1L,
3L, 2L, 3L, 2L, 3L, 2L), .Label = c("HIGH", "MODIFIER", "chr1"
), class = "factor"), Position = structure(c(4L, 5L, 6L, 7L,
8L, 9L, 1L, 9L, 2L, 9L, 3L, 9L), .Label = c("1104035", "1120994",
"1136916", "403111", "602567", "604894", "758630", "800715",
"_"), class = "factor"), SNPid = structure(c(1L, 4L, 5L, 1L,
1L, 2L, 6L, 2L, 1L, 2L, 3L, 2L), .Label = c(".", "_", "rs21935602",
"rs21953190", "rs21953191", "rs21966859"), class = "factor"),
Reference = structure(c(4L, 1L, 1L, 5L, 2L, 6L, 4L, 6L, 3L,
6L, 4L, 6L), .Label = c("A", "C", "CGCG", "G", "T", "_"), class = "factor"),
Alternate = structure(c(1L, 5L, 5L, 8L, 4L, 7L, 1L, 6L, 3L,
6L, 1L, 2L), .Label = c("A", "ATP9B", "C", "CT", "G", "NFATC1",
"PQLC1", "TC"), class = "factor"), QUAL = structure(c(2L,
4L, 3L, 1L, 7L, 9L, 5L, 9L, 8L, 9L, 6L, 9L), .Label = c("1531.73",
"24", "2869.77", "3265.77", "3803.77", "3899.77", "514.73",
"604.73", "protein_coding"), class = "factor"), Homozygosity = structure(c(2L,
3L, 3L, 3L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L), .Label = c("CODING",
"het", "hom"), class = "factor"), Tool = structure(c(6L,
5L, 5L, 5L, 5L, 1L, 5L, 3L, 5L, 2L, 5L, 4L), .Label = c("ENSCAFT00000000011",
"ENSCAFT00000000013", "ENSCAFT00000036234", "ENSCAFT00000042968",
"GATKSAM", "SAM"), class = "factor"), Depth = structure(c(4L,
9L, 8L, 6L, 2L, 7L, 10L, 3L, 5L, 11L, 1L, 11L), .Label = c("101",
"13", "2", "20", "21", "38", "7", "77", "91", "97", "_"), class = "factor"),
MappingQuality = structure(c(5L, 8L, 10L, 4L, 11L, 1L, 7L,
12L, 6L, 2L, 9L, 3L), .Label = c(",SPLICE_SITE_DONOR", ",UPSTREAM",
",UTR_3_PRIME", "46.20", "55", "56.55", "57.97", "58.46",
"59.17", "59.70", "60.00", "_"), class = "factor"), EFFECT = structure(c(4L,
8L, 7L, 5L, 5L, 3L, 5L, 1L, 4L, 6L, 2L, 6L), .Label = c("",
"DOWNSTREAM", "HIGH", "INTERGENIC", "INTRON", "MODIFIER",
"NON_SYNONYMOUS_CODING", "SYNONYMOUS_CODING"), class = "factor"),
IMPACT = structure(c(4L, 2L, 3L, 4L, 4L, 5L, 4L, 1L, 4L,
5L, 4L, 5L), .Label = c("", "LOW", "MODERATE", "MODIFIER",
"_"), class = "factor"), FUNCTIONAL_CLASS = structure(c(4L,
3L, 2L, 4L, 4L, 4L, 4L, 1L, 4L, 4L, 4L, 4L), .Label = c("",
"MISSENSE", "SILENT", "_"), class = "factor"), CODON_CHANGE = structure(c(3L,
4L, 2L, 3L, 3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L), .Label = c("",
"Ttt/Ctt", "_", "gaT/gaC"), class = "factor"), AMINO_ACID_CHANGE = structure(c(7L,
3L, 4L, 7L, 7L, 6L, 7L, 1L, 7L, 5L, 7L, 2L), .Label = c("",
"ATP9B", "D1034", "F259L", "NFATC1", "PQLC1", "_"), class = "factor"),
GENE_NAME = structure(c(6L, 2L, 2L, 5L, 5L, 7L, 4L, 1L, 6L,
7L, 3L, 7L), .Label = c("", "ADNP2", "ATP9B", "NFATC1", "PQLC1",
"_", "protein_coding"), class = "factor"), GENE_BIOTYPE = structure(c(3L,
4L, 4L, 4L, 4L, 2L, 4L, 1L, 3L, 2L, 4L, 2L), .Label = c("",
"CODING", "_", "protein_coding"), class = "factor"), GENE_CODING = structure(c(6L,
2L, 2L, 2L, 2L, 3L, 2L, 1L, 6L, 4L, 2L, 5L), .Label = c("",
"CODING", "ENSCAFT00000000011", "ENSCAFT00000036234", "ENSCAFT00000046825",
"_"), class = "factor"), TRANSCRIPT_ID = structure(c(8L,
4L, 4L, 5L, 5L, 3L, 6L, 1L, 8L, 8L, 7L, 2L), .Label = c("",
"29", "6", "ENSCAFT00000000008", "ENSCAFT00000000011", "ENSCAFT00000000013",
"ENSCAFT00000000014", "_"), class = "factor"), EXON_ID = structure(c(5L,
3L, 3L, 2L, 4L, 5L, 2L, 1L, 5L, 5L, 5L, 5L), .Label = c("",
"2", "5", "6", "_"), class = "factor"), X = structure(c(6L,
6L, 6L, 6L, 4L, 1L, 3L, 1L, 5L, 1L, 2L, 1L), .Label = c("",
",DOWNSTREAM", ",INTRON", ",SPLICE_SITE_ACCEPTOR", ",UPSTREAM",
"_"), class = "factor")), .Names = c("Chromosome", "Position",
"SNPid", "Reference", "Alternate", "QUAL", "Homozygosity", "Tool",
"Depth", "MappingQuality", "EFFECT", "IMPACT", "FUNCTIONAL_CLASS",
"CODON_CHANGE", "AMINO_ACID_CHANGE", "GENE_NAME", "GENE_BIOTYPE",
"GENE_CODING", "TRANSCRIPT_ID", "EXON_ID", "X"), class = "data.frame", row.names = c(NA,
-12L))

R thinks you have 21 rather than 20 fields per line (maybe there are trailing tabs on each line?), and your lines 6-9 have additional fields:
count.fields("test.txt",sep="\t")
## [1] 21 21 21 21 21 41 31 41 41
This confuses the heck out of read.delim, which tries to guess what's going on from the first 5 lines (it shouldn't, but that's the way it is). You might think you could use fill=TRUE to get around this, but you can't.
I tried using colClasses along with fill=TRUE to specify the field types (I used colClasses=rep("character",41) but you can probably guess better than that), but it doesn't seem to work, probably because your header only has 21 columns.
The fread function in the data.table package can do a little better, but only if you tell it not to try to guess the format from lines after #5, and it discards the data in columns beyond 21.
library(data.table)
nrow(fread("test.txt",autostart=5)) ## 9
Hmm, even that doesn't quite work as expected (it doesn't pick up the header properly, even if I set header=TRUE, probably because column 21 doesn't have a header field) ... The bottom line is that you probably have to figure out what those extra fields are and do something more explicit with them (e.g. add header fields ...).
Basically, R expects your data to be pretty clean. It might be worth sending this example to the maintainer of the data.table package, who is trying to make fread be as robust as possible ... this would represent a challenge.

Looking at the data you can see that it is highly "mutated" with many fusion lines. These are in many cases signaled by the presence of commas. I think this data is in a different format than you expect. Your first element in the dput data was a factor with Chromosome values =c("HIGH", "MODIFIER", "chr1"). That's not a sensible result, pointing to a lack of understanding on your part about the organization of the original data. You should post the original text file somewhere that can be accessed over the Internet, so the original layout can be examined. In particular the tabs you think are the delimiters are either not there or are not being captured by the SO interface.
After being pointed to the data sample, which should have been put into the question body by you doing editing, try this to delete the comments that follow the commas:
datL <- readLines("~/Downloads/test.txt")
datLred <- gsub("[,].+$", "", datL)
read.delim(text=datLred)
> str(read.delim(text=datLred) )
'data.frame': 8 obs. of 21 variables:
$ Chromosome : Factor w/ 1 level "chr1": 1 1 1 1 1 1 1 1
$ Position : int 403111 602567 604894 758630 800715 1104035 1120994 1136916
$ SNPid : Factor w/ 5 levels ".","rs21935602",..: 1 3 4 1 1 5 1 2
$ Reference : Factor w/ 5 levels "A","C","CGCG",..: 4 1 1 5 2 4 3 4
$ Alternate : Factor w/ 5 levels "A","C","CT","G",..: 1 4 4 5 3 1 2 1
snipped remaining columns

Related

cld() output has a wrong order of factor levels

I am using the R cld() function with emmeans, but the order of the factor levels in the output is different from what I set. Before calling cld(), the by.years output is also in the desired order (screenshot), but when I call cld(), the output is in the alphabetical order Light - Moderate - No (screenshot). I also checked cld.years$Grazing.intensity, and the levels are correct. Is there a way to specify the order of factor levels in the cld() output? Any help is appreciated.
# sample data
plants <- structure(list(Grazing.intensity = structure(c(3L, 2L, 3L, 3L, 3L, 1L, 3L, 2L, 2L, 2L, 1L, 2L, 3L, 3L, 3L), .Label = c("Light-grazing", "Moderate-grazing", "No-grazing"), class = "factor"), Grazing.intensity1 = structure(c(3L, 2L, 3L, 3L, 3L, 1L, 3L, 2L, 2L, 2L, 1L, 2L, 3L, 3L, 3L), .Label = c("LG", "MG", "NG"), class = "factor"), Years = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L), .Label = c("Dry-year", "Wet-year"), class = "factor"), Month = structure(c(2L, 2L, 2L, 1L, 3L, 3L, 1L, 1L, 3L, 1L, 3L, 3L, 2L, 2L, 3L), .Label = c("Aug.", "Jul.", "Sept."), class = "factor"), Plots = c(1L, 3L, 8L, 6L, 9L, 7L, 2L, 2L, 10L, 10L, 7L, 7L, 9L, 4L, 2L), Species.richness = c(8L, 6L, 10L, 11L, 9L, 5L, 7L, 13L, 10L, 6L, 5L, 5L, 14L, 8L, 10L)), class = "data.frame", row.names = c(NA, -15L))
# set the order of factor levels
plants$Grazing.intensity <- factor(plants$Grazing.intensity, levels =
c('No-grazing','Light-grazing','Moderate-grazing'))
attach(plants)
lmer.mod <- lmer(Species.richness ~ Grazing.intensity*Years + (1|Month), data = plants)
by.years <- emmeans(lmer.mod, specs = ~ Grazing.intensity:Years, by = 'Years', type = "response")
# display cld
cld.years <- cld(by.years, Letters = letters)
This is my first time posting sample data in StackOverflow, so it may be wrong.. I used dput().
I solved the issue. The order changed because the levels are displayed in increasing order of emmean. I set sort = FALSE, and the result was displayed in the default order. I should have read the documentation more thoroughly.

How to get the rest of the rows after taking some rows randomly from a dataframe in R

I have 2 dataframe df_1 and df_2. Now I have to select some rows randomly from df_1 and then I will merge the rest of the rows (which not selected randomly) from df_1 with df_2.
I am using this code
set.seed(9999)
df_1 <- # the whole dataset
test_dataset1 <- sample_n(df_1, 10)
train_part_1 <- df_1[which(!df_1 %in% test_dataset1)] # Not working
train_1 <- rbind(df_2, train_part_1)
But when I try to extract the rows that were not randomly selected, my code does not work: I get the same data as df_1, i.e. all 20 rows (the same dataset).
Edited: Actually, I have to make 3 test and 3 train datasets. So, how can I use the seed function to get the same datasets each time, for reproducibility?
Reproducible data (only df_1):
structure(list(nodeA = structure(c(4L, 2L, 1L, 1L, 1L, 4L, 1L,
9L, 3L, 4L, 2L, 8L, 2L, 1L, 5L, 7L, 3L, 6L, 2L, 1L), .Label = c("ID00309",
"ID00361", "ID00541", "ID00570", "ID00615", "ID00696", "ID00762",
"ID01200", "ID05109"), class = "factor"), nodeB = structure(c(8L,
3L, 3L, 1L, 2L, 7L, 9L, 8L, 8L, 6L, 9L, 7L, 4L, 4L, 6L, 9L, 6L,
7L, 5L, 5L), .Label = c("ID00361", "ID00541", "ID00570", "ID00615",
"ID00696", "ID01200", "ID05109", "ID11641", "ID11691"), class = "factor"),
scr = structure(20:1, .Label = c("1.85284606048794", "1.90444166064472",
"1.90762235378507", "1.94364188077133", "1.95883206119256",
"2.08440437841349", "2.26408172709962", "2.3223132020942",
"2.46120775935034", "2.49647215035727", "2.50432367561777",
"2.57541320006514", "2.65099330092281", "2.75209155741549",
"2.93717640337986", "2.99596628688011", "3.21209741517806",
"3.21997803385465", "3.48788394772132", "3.81389707587156"
), class = "factor")), class = "data.frame", row.names = c(NA,
-20L))
Get your sample using random row numbers and then use - to get the inverse:
df_1 <- structure(list(nodeA = structure(c(4L, 2L, 1L, 1L, 1L, 4L, 1L, 9L, 3L, 4L,
2L, 8L, 2L, 1L, 5L, 7L, 3L, 6L, 2L, 1L),
.Label = c("ID00309", "ID00361", "ID00541",
"ID00570", "ID00615", "ID00696",
"ID00762", "ID01200", "ID05109"),
class = "factor"),
nodeB = structure(c(8L, 3L, 3L, 1L, 2L, 7L, 9L, 8L, 8L, 6L,
9L, 7L, 4L, 4L, 6L, 9L, 6L, 7L, 5L, 5L),
.Label = c("ID00361", "ID00541", "ID00570",
"ID00615", "ID00696", "ID01200",
"ID05109", "ID11641", "ID11691"),
class = "factor"),
scr = structure(20:1, .Label = c("1.85284606048794", "1.90444166064472",
"1.90762235378507", "1.94364188077133",
"1.95883206119256", "2.08440437841349",
"2.26408172709962", "2.3223132020942",
"2.46120775935034", "2.49647215035727",
"2.50432367561777", "2.57541320006514",
"2.65099330092281", "2.75209155741549",
"2.93717640337986", "2.99596628688011",
"3.21209741517806", "3.21997803385465",
"3.48788394772132", "3.81389707587156"
), class = "factor")),
class = "data.frame", row.names = c(NA, -20L))
set.seed(9999)
Selected <- sample.int(nrow(df_1), 10)
# index by the selected row numbers; use the [row, col] pattern to select rows
test_dataset1 <- df_1[ Selected, ]
# use -index to remove rows
train_part_1 <- df_1[-Selected, ]
test_dataset1
#> nodeA nodeB scr
#> 6 ID00570 ID05109 2.93717640337986
#> 9 ID00541 ID11641 2.57541320006514
#> 19 ID00361 ID00696 1.90444166064472
#> 3 ID00309 ID00570 3.21997803385465
#> 10 ID00570 ID01200 2.50432367561777
#> 2 ID00361 ID00570 3.48788394772132
#> 20 ID00309 ID00696 1.85284606048794
#> 8 ID05109 ID11641 2.65099330092281
#> 12 ID01200 ID05109 2.46120775935034
#> 18 ID00696 ID05109 1.90762235378507
train_part_1
#> nodeA nodeB scr
#> 1 ID00570 ID11641 3.81389707587156
#> 4 ID00309 ID00361 3.21209741517806
#> 5 ID00309 ID00541 2.99596628688011
#> 7 ID00309 ID11691 2.75209155741549
#> 11 ID00361 ID11691 2.49647215035727
#> 13 ID00361 ID00615 2.3223132020942
#> 14 ID00309 ID00615 2.26408172709962
#> 15 ID00615 ID01200 2.08440437841349
#> 16 ID00762 ID11691 1.95883206119256
#> 17 ID00541 ID01200 1.94364188077133
Created on 2021-03-14 by the reprex package (v1.0.0)

An R function to work out the preceding value

I'm trying to create a table of staff, who they report to and what level they are.
I've been working with a similar table, and @RonakShah was kind enough to help me with calculating what the lowest-level location is and what the level above it is, using the solution below.
My employee table looks like this:
input = structure(list(Level.1 = structure(c(1L, 1L, 1L, 1L, 1L, 1L,
1L), .Label = "Board", class = "factor"), Level.2 = structure(c(2L,
2L, 2L, 1L, 1L, 3L, 3L), .Label = c("Aasha", "Grace", "Marisol"
), class = "factor"), Level.3 = structure(c(4L, 4L, 3L, 1L, 1L,
2L, 2L), .Label = c("Alex", "Chandler", "Millie", "Tushad"), class = "factor"),
Level.4 = structure(c(2L, 2L, 6L, 1L, 5L, 3L, 4L), .Label = c("#",
"Frank", "Joey", "Rachel", "Sarah", "Tony"), class = "factor"),
Level.5 = structure(c(3L, 2L, 1L, 1L, 1L, 4L, 1L), .Label = c("#",
"Lela", "Millie", "Ross"), class = "factor"), Level.6 = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L), .Label = "#", class = "factor")), class = "data.frame", row.names = c(NA,
-7L))
and using the technique described here by Ronak (stackoverflow.com/questions/56903188/create-a-table-from-a-hierarchy/)
which is,
as.data.frame(t(apply(input, 1, function(x)
{new_x = x[x != "###"]; c(rev(tail(new_x, 2)), length(new_x)) })))
I can get most of the required table. But I'm having trouble trying to get "the bosses" (eg. those with employees but are not "the board").
My ideal output would look something like this (I've added colnames to make it easier to understand):
structure(list(Subordinate = structure(c(9L, 4L, 14L, 5L, 7L,
13L, 9L, 2L, 1L, 12L, 11L, 6L, 3L, 8L, 10L), .Label = c("Aasha",
"Alex", "Chandler", "Frank", "Grace", "Joey", "Lela", "Marisol",
"Millie", "Rachel", "Ross", "Sarah", "Tony", "Tushad"), class = "factor"),
Boss = structure(c(5L, 10L, 6L, 3L, 5L, 9L, 6L, 1L, 3L, 2L,
7L, 4L, 8L, 3L, 4L), .Label = c("Aasha", "Alex", "Board",
"Chandler", "Frank", "Grace", "Joey", "Marisol", "Millie",
"Tushad"), class = "factor"), Level = c(5L, 4L, 3L, 2L, 5L,
4L, 3L, 3L, 2L, 4L, 5L, 4L, 3L, 2L, 4L)), class = "data.frame", row.names = c(NA,
-15L))
I think I could maybe do it with a loop, but this doesn't seem to be the best answer. Can anyone offer any other tips?
Couldn't come up with a prettier solution but this works. Using a while loop in the apply call used previously, we can do
output <- do.call(rbind.data.frame, apply(input, 1, function(x) {
new_x = as.character(x[x != "#"])
list_df <- list()
i = 1
while(length(new_x) >= 2) {
#Get last 2 entries
list_df[[i]] <- c(rev(tail(new_x, 2)), length(new_x))
#Go one level deeper
new_x = head(new_x, -1)
i = i +1
}
do.call(rbind, list_df)
}))
#To remove duplicate entries
output[!duplicated(output), ]
# V1 V2 V3
#1 Millie Frank 5
#2 Frank Tushad 4
#3 Tushad Grace 3
#4 Grace Board 2
#5 Lela Frank 5
#9 Tony Millie 4
#10 Millie Grace 3
#12 Alex Aasha 3
#13 Aasha Board 2
#14 Sarah Alex 4
#17 Ross Joey 5
#18 Joey Chandler 4
#19 Chandler Marisol 3
#20 Marisol Board 2
#21 Rachel Chandler 4

Stacked bar graph with fill ggplot2

I've read through the ggplot2 docs website and other question but I couldn't find a solution. I'm trying to visualize some data for varying age groups. I have sort of managed to do it but it does not look like I would intend it to.
Here is the code for my plot
p <- ggplot(suggestion, aes(interaction(Age,variable), value, color = Age, fill = factor(variable), group = Age))
p + geom_bar(stat = "identity")+
facet_grid(.~Age)
![The facetting separates the age variables][1]
My ultimate goal is to create a stacked bar graph, which is why I used the fill, but it does not put the TDX values in their corresponding Age group and Year. (Sometimes TDX values == DX values, but I want to visualize when they don't)
Here's the dput(suggestion)
structure(list(Age = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L,
1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L,
5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 1L, 2L, 3L, 4L, 5L, 6L,
7L), .Label = c("0-2", "3-9", "10-19", "20-39", "40-59", "60-64",
"65+", "UNSP", "(all)"), class = "factor"), variable = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L,
5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 7L, 7L, 7L,
8L, 8L, 8L, 8L, 8L, 8L, 8L), .Label = c("Year.10.DX", "Year.11.DX",
"Year.12.DX", "Year.13.DX", "Year.10.TDX", "Year.11.TDX", "Year.12.TDX",
"Year.13.TDX"), class = "factor"), value = c(26.8648932910636,
30.487741796656, 31.9938838749782, 62.8189679326958, 72.8480838120064,
69.3044125928752, 36.9789457527416, 21.808001825378, 24.1073451428435,
40.3305134762935, 70.4486116545885, 68.8342676191755, 63.9227718107745,
34.6086468618636, 8.84033719571875, 13.2807072303835, 28.4781516422802,
55.139497471546, 59.7230544500003, 67.9448927372699, 37.7293286937066,
6.9507024051526, 17.4393054963572, 33.1485743479821, 61.198647580693,
58.6845873573852, 48.0073013177248, 28.4455801248562, 26.8648932910636,
19.8044453272475, 23.0189084635948, 53.7037832071889, 60.6516550126422,
58.1573725886767, 27.0791868812255, 21.808001825378, 19.8146296425633,
35.0587750051557, 62.3308555053346, 59.3299998610862, 56.5341245769817,
27.7229319271878, 8.84033719571875, 13.2807072303835, 22.4081606349585,
48.0252683906252, 52.7560684009579, 65.2890977685045, 32.4142337849399,
6.9507024051526, 15.2833655677215, 24.5268503180754, 52.536784326675,
51.4100599515986, 40.9609231655724, 18.1306673637441)), row.names = c(NA,
-56L), .Names = c("Age", "variable", "value"), class = "data.frame")
It's unclear what you need but perhaps this.
# Stacked bars of `value` per `variable`, coloured by and facetted on Age.
# Each `+` must END a line: a line that starts with `+` is parsed by R as a
# separate expression (unary plus), so the facet would never be added.
ggplot(a, aes(x = variable, y = value, fill = Age)) +
  geom_bar(stat = "identity") +
  facet_wrap(~Age)
If you want to visualize separately the TDX and the DX entries, we'll need to change the dataframe a bit.
> head(a)
Age variable value
1 0-2 Year.10.DX 26.86489
2 3-9 Year.10.DX 30.48774
3 10-19 Year.10.DX 31.99388
4 20-39 Year.10.DX 62.81897
5 40-59 Year.10.DX 72.84808
6 60-64 Year.10.DX 69.30441
The column of interest variable is a combination of year and of TDX/DX value. We'll use the tidyr package to separate this into two columns.
library(tidyr)
library(dplyr)
tidy_a<- a %>% separate(variable, into = c( 'nothing',"year",'label'), sep = "\\.")
This actually splits the levels of column variable into three components, since we split on . and the character . appears twice in each entry.
> head(tidy_a)
Age nothing year label value
1 0-2 Year 10 DX 26.86489
2 3-9 Year 10 DX 30.48774
3 10-19 Year 10 DX 31.99388
4 20-39 Year 10 DX 62.81897
5 40-59 Year 10 DX 72.84808
6 60-64 Year 10 DX 69.30441
So the column nothing is rather useless, just a necessary result of using separate and separating on .. Now this will allow us to visualize TDX/DX separately.
ggplot(tidy_a,aes(x=year,y=value,fill=label)) + geom_bar(stat='identity') + facet_wrap(~Age)

Melting data resulting in incorrect Y-values when plotting geom_bar(position="dodge")?

I have a dataframe called split2_data (actually a drop-leveled subset of a bigger data frame).
It contains a column "Loci", which are factors that I want as x-axes, and several columns of y-values (note: All of these values are <=1) that I would like to plot beside one another in their respective x factor.
The dataframe
structure(list(Loci = structure(1:8, .Label = c("C485", "C487_PigTa",
"C536", "Carey", "Cool", "Coyote", "Deadpool", "Epstein"), class = "factor"),
All = structure(c(5L, 6L, 7L, 1L, 2L, 4L, 3L, 8L), .Label = c("0.0246",
"0.0352", "0.0563", "0.0646", "0.2349", "0.3242", "0.3278",
"0.6854"), class = "factor"), X1_only = structure(c(4L, 3L,
2L, 1L, 6L, 6L, 6L, 5L), .Label = c("0.0133", "0.7292", "0.8586",
"0.9377", "0.961", "1"), class = "factor"), X78_only = structure(c(7L,
6L, 4L, 5L, 8L, 3L, 1L, 2L), .Label = c("0.0018", "0.0175",
"0.4958", "0.6055", "0.7472", "0.7563", "0.825", "1"), class = "factor"),
X8_removed = structure(c(5L, 6L, 8L, 1L, 2L, 3L, 4L, 7L), .Label = c("0.0181",
"0.0348", "0.1482", "0.1706", "0.2217", "0.2602", "0.6748",
"0.7123"), class = "factor"), X8_only = structure(c(6L, 7L,
3L, 8L, 5L, 4L, 1L, 2L), .Label = c("0.1266", "0.1945", "0.4389",
"0.4496", "0.7078", "0.709", "0.8882", "1"), class = "factor"),
X7_removed = structure(c(6L, 4L, 5L, 2L, 1L, 3L, 7L, 8L), .Label = c("0.0159",
"0.02", "0.0541", "0.3232", "0.3972", "0.4226", "0.4919",
"0.5951"), class = "factor"), X7_only = structure(c(3L, 4L,
7L, 5L, 6L, 8L, 1L, 2L), .Label = c("0.0082", "0.1759", "0.4957",
"0.5248", "0.6665", "0.6789", "0.8372", "1"), class = "factor"),
X5_removed = structure(c(5L, 7L, 6L, 1L, 3L, 4L, 2L, 8L), .Label = c("0.0195",
"0.0316", "0.08", "0.1069", "0.1549", "0.395", "0.4405",
"0.6298"), class = "factor"), X5_only = structure(c(1L, 2L,
6L, 7L, 3L, 5L, 7L, 4L), .Label = c("0.0871", "0.2022", "0.3532",
"0.3677", "0.5292", "0.7602", "1"), class = "factor"), X4_removed = structure(c(8L,
4L, 7L, 2L, 3L, 5L, 1L, 6L), .Label = c("0.0188", "0.0194",
"0.0511", "0.1716", "0.1862", "0.6454", "0.661", "0.8003"
), class = "factor"), X4_only = structure(c(2L, 5L, 1L, 6L,
7L, 3L, 8L, 4L), .Label = c("0.0026", "0.0378", "0.2884",
"0.4386", "0.5116", "0.6549", "0.6928", "1"), class = "factor"),
X3_removed = structure(c(5L, 7L, 6L, 1L, 2L, 3L, 4L, 8L), .Label = c("0.0612",
"0.0627", "0.0808", "0.1636", "0.2728", "0.477", "0.5307",
"0.6506"), class = "factor"), X3_only = structure(c(8L, 1L,
7L, 2L, 4L, 6L, 3L, 5L), .Label = c("0.0225", "0.2111", "0.2471",
"0.5087", "0.6294", "0.768", "0.8263", "0.8951"), class = "factor"),
X2_removed = structure(c(4L, 5L, 6L, 3L, 7L, 2L, 1L, 8L), .Label = c("0.0526",
"0.0608", "0.0854", "0.2036", "0.3168", "0.3668", "0.413",
"0.7608"), class = "factor"), X2_only = structure(c(5L, 3L,
6L, 4L, 2L, 8L, 1L, 7L), .Label = c("-", "0.0014", "0.0949",
"0.1637", "0.1818", "0.5521", "0.8585", "1"), class = "factor"),
X1_removed = structure(c(5L, 7L, 3L, 6L, 1L, 4L, 2L, 8L), .Label = c("0.0258",
"0.031", "0.0496", "0.0676", "0.1053", "0.1439", "0.2823",
"0.5465"), class = "factor")), .Names = c("Loci", "All",
"X1_only", "X78_only", "X8_removed", "X8_only", "X7_removed",
"X7_only", "X5_removed", "X5_only", "X4_removed", "X4_only",
"X3_removed", "X3_only", "X2_removed", "X2_only", "X1_removed"
), row.names = 9:16, class = "data.frame")
I can't think of how to do this in base R, and after some careful study of other questions here, this is the best that I can come up with:
library(reshape)
library(ggplot2)
require(ggplot2)
split2_datam<-melt(split2_data,id="Loci")
p2<- ggplot(split2_datam, aes(x =Loci, y = value, color = variable, width=.15)) + geom_bar(position="dodge") + ylab("P-value")+ geom_hline(yintercept=0.05)+ opts(axis.text.x = theme_text(angle=90, size=8)) + scale_y_discrete(breaks=seq(0,1)) + scale_fill_grey()
p2
#when I add stat="identity", the y values don't change- they just shrink relative to the x-axis
p2<- ggplot(split2_datam, aes(x =Loci, y = value, color = variable, width=.15)) + geom_bar(position="dodge", stat="identity") + ylab("P-value")+ geom_hline(yintercept=0.05)+ opts(axis.text.x = theme_text(angle=90, size=8)) + scale_y_discrete(breaks=seq(0,1)) + scale_fill_grey()
p2
The plot:
You'll notice that the different variables are often much greater than 1. They should not be. Any idea what's causing this/how to fix?
Other things I don't yet know how to do/fix (perhaps this question should be cross-referenced?):
I don't know why the greyscale isn't working
I don't know how to make the legend scale correctly with the plot
I don't understand why my columns have an 'X' prepended to them (e.g. "X1_only" instead of "1_only")
Thank you so much in advance for any suggestions!
Your data have been read in as factors, probably because there are some "-" characters mixed in with your data.
You'll want to convert them to NA when you read in your data using na.strings = "-".

Resources