Select (multiple) integers with n occurrences per row - r

I have a data.frame where the entries are in this format: 1,2,3,10. That is, they are comma-separated integers that range from 0 to 20 and do not need to be consecutive. Each column is currently stored as a factor. I have four variables that contain these values, and I'd like to create a new variable that includes a given integer only if it occurs in at least three of the four variables; if no integer occurs three times, use 0.
M1 M2 M3 M4 M_NEW
1 1,2 0 1 1
3,4 3,4 1,2,3,4 4 3,4
I am unsure on how to deal with these comma separated integers. If they were single integers, I could do something like this:
# modified from https://stackoverflow.com/a/14114085/1670053
# over each row of data.frame (or matrix)
sapply(seq_len(nrow(df)), function(idx) {
  # Count how many times each entry occurs in row `idx` of df.
  counts <- table(t(df[idx, ]))
  # Use the maximum count only when some value occurs at least three times;
  # otherwise use 0, which no tabulated value can have, so nothing is selected.
  top_count <- if (max(counts) > 2) max(counts) else 0
  # Return every value whose count equals that maximum.
  as.numeric(names(counts[counts == top_count]))
})
Though with these multiple values separated by commas, I am unsure where to start.
# data and example output
df <- structure(list(M1 = structure(c(3L, 2L, 2L, 5L, 3L, 1L, 7L, 1L,
8L, 1L, 3L, 4L, 3L, 6L), .Label = c("0", "1", "1,2", "1,2,3",
"1,2,3,4", "1,2,3,4,5", "3,4,5,6,7", "6,7,8,9,10,11,12,13,14,15,16"
), class = "factor"), M2 = structure(c(5L, 2L, 2L, 4L, 4L, 1L,
11L, 8L, 7L, 9L, 3L, 6L, 3L, 10L), .Label = c("0", "1,2", "1,2,3",
"1,2,3,4", "1,2,3,4,5", "1,2,3,4,5,6,7", "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16",
"2", "2,3,4,5", "4,5,6", "4,5,6,7,8,9,10,11,12,13,14"), class = "factor"),
M3 = structure(c(4L, 1L, 1L, 8L, 3L, 1L, 6L, 1L, 7L, 3L,
2L, 5L, 9L, 3L), .Label = c("0", "1,2", "1,2,3,4", "1,2,3,4,5",
"1,2,3,4,5,6", "1,2,3,4,5,6,7,8", "1,2,3,4,5,6,7,8,9,10,11,12,13,14",
"3,4", "3,4,5"), class = "factor"), M4 = structure(c(5L,
1L, 2L, 8L, 2L, 1L, 6L, 3L, 4L, 1L, 3L, 3L, 7L, 9L), .Label = c("0",
"1", "1,2", "1,2,3,4,5,12,13,14,15,16,17", "1,2,3,4,5,6",
"1,2,3,4,5,6,7,8,9,10,11,12", "3,4", "4", "4,5"), class = "factor"),
M_NEW = structure(c(6L, 1L, 2L, 8L, 3L, 1L, 9L, 1L, 7L, 1L,
3L, 4L, 5L, 10L), .Label = c("0", "1", "1,2", "1,2,3", "1,2,3,",
"1,2,3,4,5", "1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16", "3,4",
"3,4,5,6,7,8", "4,5"), class = "factor")), .Names = c("M1",
"M2", "M3", "M4", "M_NEW"), class = "data.frame", row.names = c(NA,
-14L))

# Collect the values that occur in at least `n` of the comma-separated cells.
#
# x: character (or factor) vector, one element per variable, each holding
#    comma-separated integers such as "1,2,3,10".
# n: minimum number of cells a value must occur in to be kept (default 3).
#
# Returns a single comma-separated string of the kept values in ascending
# numeric order, or "0" when no value reaches `n` occurrences.
f <- function(x, n = 3) {
  cells <- strsplit(as.character(x), ",", fixed = TRUE)
  # Count each value at most once per cell, so a value repeated inside one
  # cell cannot reach the threshold on its own.
  tokens <- unlist(lapply(cells, function(cell) unique(trimws(cell))))
  tokens <- tokens[!is.na(tokens) & nzchar(tokens)]

  counts <- table(tokens)
  keep <- names(counts)[as.vector(counts) >= n]
  if (length(keep) == 0L) {
    return("0")
  }

  # table() orders its names as strings ("10" before "2"); reorder numerically
  # whenever every kept token is an integer.
  if (all(grepl("^-?[0-9]+$", keep))) {
    keep <- keep[order(as.numeric(keep))]
  }
  paste(keep, collapse = ",")
}
(df[, 5] <- apply(df[, 1:4], 1, f))
# [1] "1,2,3,4,5"
# [2] "0"
# [3] "1"
# [4] "3,4"
# [5] "1,2"
# [6] "0"
# [7] "3,4,5,6,7,8"
# [8] "0"
# [9] "1,10,11,12,13,14,15,16,2,3,4,5,6,7,8,9"
# [10] "0"
# [11] "1,2"
# [12] "1,2,3"
# [13] "3"
# [14] "4,5"

Related

How to convert all column data type to numeric and character dynamically?

I convert my columns data type manually:
data[,'particles'] <- as.numeric(as.character(data[,'particles']))
This is not ideal, as the data may evolve and I won't know which species are coming; for instance, they could be "nox", "no2", "co", "so2", "pm10", and more in the future.
Is there any way to convert them automatically?
My current dataset:
structure(list(particles = structure(c(1L, 3L, 5L, 5L, 5L, 5L,
5L, 5L, 5L, 5L, 5L, 6L, 2L, 2L, 2L, 3L, 3L, 3L, 1L, 1L, 4L, 4L,
4L, 3L, 3L, 3L, 3L, 5L, 6L, 5L, 3L), .Label = c("1", "11", "1.1",
"2", "2.1", "3.1"), class = "factor"), humidity = structure(c(4L,
7L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 6L, 1L, 1L, 1L,
5L, NA, NA, NA, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("0.1",
"1", "1.1", "1.3", "21", "2.1", "3"), class = "factor"), timestamp = c(1468833354929,
1468833365186, 1468833378458, 1468833538213, 1468833538416, 1468833538613,
1468833538810, 1468833538986, 1468833539172, 1468833539358, 1468833539539,
1468833554592, 1468833559059, 1468833562357, 1468833566225, 1468833573486,
1468840019118, 1468840024950, 1469029568849, 1469029584243, 1469029590530,
1469029622391, 1469029623598, 1469245154003, 1469245156533, 1469245156815,
1469245157123, 1469245162358, 1469245165911, 1469245170178, 1469245173788
), date = structure(c(1468833354.929, 1468833365.186, 1468833378.458,
1468833538.213, 1468833538.416, 1468833538.613, 1468833538.81,
1468833538.986, 1468833539.172, 1468833539.358, 1468833539.539,
1468833554.592, 1468833559.059, 1468833562.357, 1468833566.225,
1468833573.486, 1468840019.118, 1468840024.95, 1469029568.849,
1469029584.243, 1469029590.53, 1469029622.391, 1469029623.598,
1469245154.003, 1469245156.533, 1469245156.815, 1469245157.123,
1469245162.358, 1469245165.911, 1469245170.178, 1469245173.788
), class = c("POSIXct", "POSIXt"), tzone = "Asia/Singapore")), .Names = c("particles",
"humidity", "timestamp", "date"), row.names = c(NA, -31L), class = "data.frame")
It has particles, humidity, timestamp, date.
Another option using mutate_if() from dplyr which allows you to operate on columns for which a predicate returns TRUE
library(dplyr)
# Convert every factor column to numeric through its labels, not its integer
# codes. A formula lambda is used because funs() is deprecated in dplyr.
df %>%
  mutate_if(is.factor, ~ as.numeric(as.character(.)))
Note: This method will work for your follow up question as well
If you don't know which columns need to be converted beforehand, you can extract that info from your dataframe as follows:
vec <- sapply(dat, is.factor)
which gives:
> vec
particles humidity timestamp date
TRUE TRUE FALSE FALSE
You can then use this vector to do the conversion on the subset with lapply:
# notation option one:
dat[, vec] <- lapply(dat[, vec], function(x) as.numeric(as.character(x)))
# notation option two:
dat[vec] <- lapply(dat[vec], function(x) as.numeric(as.character(x)))
If you want to detect both factor and character columns, you can use:
sapply(dat, function(x) is.factor(x)|is.character(x))
We can use data.table
library(data.table)
setDT(df)[, lapply(.SD, function(x) if(is.factor(x)) as.numeric(as.character(x)) else x)]
The best option is I think apply
You can do
newD<-apply(data[,"names"], 2,function(x) as.numeric(as.character(x)))
where in "names" you put all the variables you want. Then apply with 2 as the second argument will apply function(x) to all the columns of the first argument (if you put 1, it's by rows). And you can save it as a new dataset or overwrite the old one with
data[,"names"]<-apply....
Use lapply:
cols <- c("particles", "nox", ...)
data[,cols] <- lapply(data[,cols], function(x) as.numeric(as.character(x)))

How to generate a sequence based on two columns in R?

Below you can recreate my data in R. I would like to generate a sequence of numbers based on two individual columns. In this example of real data my column names are :
df= or10x1BC
"Tank" "Core" "BCl" "BCu" "Mid" "TL" "SL"
I wish to use the value in each row from BCu and BCl to generate a sequence by 0.001. For example seq(BCu[1], BCl[1], 0.001) will generate a sequence based on the first row in each, I wish to have this work for each row down the list.
Ultimately this sequence will be used in my function to make an average of the sequence, i.e. mean(function(seq(Bcu[i], BCl[j], 0.001)) and be added to a new column or10x1BC["meanBVF"] = mean(function(seq(Bcu[i], BCl[j], 0.001)).
See data below:
structure(list(Tank = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = "1", class = "factor"), Core = structure(c(1L,
1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L), .Label = c("A", "B", "C"), class = "factor"),
BCl = structure(c(8L, 5L, 2L, 6L, 3L, 1L, 9L, 7L, 4L), .Label = c("17",
"18", "22", "22.3", "23", "26", "27.3", "28", "29"), class = "factor"),
BCu = structure(c(8L, 5L, 2L, 6L, 3L, 1L, 9L, 7L, 4L), .Label = c("12.5",
"13.5", "17", "17.8", "18", "22", "22.3", "23", "27.3"), class = "factor"),
Mid = structure(c(8L, 5L, 2L, 6L, 3L, 1L, 9L, 7L, 4L), .Label = c("14.75",
"15.75", "19.5", "20.05", "20.5", "24", "24.8", "25.5", "28.15"
), class = "factor"), TL = structure(c(2L, 2L, 2L, 1L, 1L,
1L, 3L, 3L, 3L), .Label = c("26", "28", "29"), class = "factor"),
SL = structure(c(4L, 4L, 3L, 2L, 4L, 3L, 1L, 4L, 3L), .Label = c("1.7",
"4", "4.5", "5"), class = "factor")), .Names = c("Tank",
"Core", "BCl", "BCu", "Mid", "TL", "SL"), row.names = c(NA, -9L
), class = "data.frame")
mapply is like apply, or lapply, but with multiple arguments:
First, as I mentioned in the comment, we need to convert your data to numeric. I did it like this, to convert all but the second column:
df[, -2] = lapply(df[, -2], as.character)
df[, -2] = lapply(df[, -2], as.numeric)
We can then use mapply like this to generate the sequences:
# Build one sequence per row, running from BCu to BCl in steps of 0.001.
# SIMPLIFY = FALSE always yields a list with one vector per row; by default
# mapply collapses the result into a matrix when all sequences have the same
# length.
seqs <- mapply(
  FUN = function(a, b) seq(from = a, to = b, by = 0.001),
  a = df$BCu,
  b = df$BCl,
  SIMPLIFY = FALSE
)
It seems messy to put that in the data frame, but you can if you'd like:
df$seqs = seqs
If it were me, I'd probably leave it as a list of vectors outside of the data frame.

Add relative frequency by factor to a data frame

I want to add a column to a data frame that looks like this with the relative frequency by factor (Var2)
X = structure(list(Var1 = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 11L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L
), .Label = c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
"10"), class = "factor"), Var2 = structure(c(1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L), .Label = c("No Treatment", "Any Treatment"), class = "factor"),
Freq = c(1L, 3L, 6L, 13L, 30L, 53L, 69L, 123L, 198L, 270L,
1324L, 1L, 0L, 4L, 10L, 16L, 33L, 44L, 75L, 113L, 159L, 630L
)), .Names = c("Var1", "Var2", "Freq"), row.names = c(NA,
-22L), class = "data.frame")
The solution that I have in mind is very complicated, and not very flexible. This is what I'm doing right now:
library(data.table)
DT =data.table(X)
# Relative frequency of each observation within its treatment group.
#
# freq:   numeric vector of frequencies.
# group:  factor or character vector of group labels, parallel to `freq`
#         (a single label is recycled across all of `freq`).
# total1: total used for elements whose group is "No Treatment".
# total2: total used for elements of any other group.
#
# Returns freq divided element-wise by the total of its own group.
myfun <- function(freq, group, total1, total2) {
  # Choose the denominator per element; testing only group[[1]] would divide
  # every element by the same total.
  is_untreated <- as.character(group) == "No Treatment"
  freq / ifelse(is_untreated, total1, total2)
}
DT[,relfreq:=myfun(Freq,Var2,sum(DT$Freq[DT$Var2=="No Treatment"]), sum(DT$Freq[DT$Var2=="Any Treatment"]))]
Can somebody show me a better solution that is more flexible and allows Var2 to take more than 2 values?
Thanks!
Here is a data table solution, since you started out that way.
DT[,relfreq:=Freq/sum(Freq),by=Var2]
This will be faster if your dataset is extremely large, mostly because data table adds the new column by reference, rather than copying the whole dataset.
You can get a vector of the sum by factor with ave and divide X$Freq by this vector:
X$relfreq <- X$Freq / ave(X$Freq, X$Var2, FUN=sum)
Or even:
X$relfreq <- ave(X$Freq, X$Var2, FUN=function(x) x/sum(x))
Note that your function is incorrect, and divides each Freq by 2090 in your example, rather than dividing by the sum of the Freq of each factor level.

How to "back" melt function from reshape2 package?

That's my data:
> head(data)
id C1 C2 C3 B1 B2 B3 Name
12 3 12 8 1 3 12 Agar
14 4 11 9 5 12 14 LB
18 7 17 6 7 14 16 YEF
20 9 15 4 3 11 17 KAN
so I used a melt function from reshape2 package to reorganize my data. Now it looks like that:
dt <- melt(data, measure.vars=2:7)
> head(dt)
n v variable value rt
1 id Name p C1 1
2 12 Agar p 3 2
3 14 LB p 4 3
4 18 YEF p 7 6
5 20 KAN p 9 3
6 id Name u C2 1
I did some calculations on my data and now there is an extra column. Let's call it "rt". I'd like to transform my data back to its previous "state" while keeping this extra column. Do you know any function that would be useful?
dput(dt)
structure(list(n = structure(c(5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L,
3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L, 4L, 5L, 1L, 2L, 3L,
4L, 5L, 1L, 2L, 3L, 4L), class = "factor", .Label = c("12", "14",
"18", "20", "id")), v = structure(c(4L, 1L, 3L, 5L, 2L, 4L, 1L,
3L, 5L, 2L, 4L, 1L, 3L, 5L, 2L, 4L, 1L, 3L, 5L, 2L, 4L, 1L, 3L,
5L, 2L, 4L, 1L, 3L, 5L, 2L), class = "factor", .Label = c("Agar",
"KAN", "LB", "Name", "YEF")), variable = structure(c(1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L,
4L, 4L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L), .Label = c("p",
"u", "k", "l", "t", "h"), class = "factor"), value = c("C1",
"3", "4", "7", "9", "C2", "12", "11", "17", "15", "C3", "8",
"9", "6", "4", "B1", "1", "5", "7", "3", "B2", "3", "12", "14",
"11", "B3", "12", "14", "16", "17")), .Names = c("n", "v", "variable",
"value"), row.names = c(NA, -30L), class = "data.frame")
In the "reshape2" universe, melt and *cast go hand-in-hand.
Here's an example of melting a data.frame and dcasting it back to its original form. You would need to take a similar approach with your data.
mydf <- data.frame(A = LETTERS[1:3], B = 1:3, C = 4:6)
mydf
# A B C
# 1 A 1 4
# 2 B 2 5
# 3 C 3 6
library(reshape2)
mDF <- melt(mydf, id.vars="A")
mDF
dcast(mDF, A ~ variable, value.var="value")
# A B C
# 1 A 1 4
# 2 B 2 5
# 3 C 3 6
In the dcast step, think of the items before the ~ as being the "id" variables, and those coming after as being the resulting column names. value.var should be the column from which the values will fill in the resulting "grid" created by the id variables and column names.

nrow() gives more rows than original in R

I have a file with 20 fields as headers in the first row. The remaining rows have an unequal number of fields; some of the rows have more columns than the header. When I tried to read it using read.delim(), it read the data without error, but the total row count is higher than in the original file.
Here are a few lines of the file:
Chromosome Position SNPid Reference Alternate QUAL Homozygosity Tool Depth MappingQuality EFFECT IMPACT FUNCTIONAL_CLASS CODON_CHANGE AMINO_ACID_CHANGE GENE_NAME GENE_BIOTYPE GENE_CODING TRANSCRIPT_ID EXON_ID
chr1 403111 . G A 24 het SAM 20 55 INTERGENIC MODIFIER _ _ _ _ _ _ _ _ _
chr1 602567 rs21953190 A G 3265.77 hom GATKSAM 91 58.46 SYNONYMOUS_CODING LOW SILENT gaT/gaC D1034 ADNP2 protein_coding CODING ENSCAFT00000000008 5 _
chr1 604894 rs21953191 A G 2869.77 hom GATKSAM 77 59.70 NON_SYNONYMOUS_CODING MODERATE MISSENSE Ttt/Ctt F259L ADNP2 protein_coding CODING ENSCAFT00000000008 5 _
chr1 758630 . T TC 1531.73 hom GATKSAM 38 46.20 INTRON MODIFIER _ _ _ PQLC1 protein_coding CODING ENSCAFT00000000011 2 _
chr1 800715 . C CT 514.73 hom GATKSAM 13 60.00 INTRON MODIFIER _ _ _ PQLC1 protein_coding CODING ENSCAFT00000000011 6 ,SPLICE_SITE_ACCEPTOR HIGH _ _ _ PQLC1 protein_coding CODING ENSCAFT00000000011 7 ,SPLICE_SITE_DONOR HIGH _ _ _ PQLC1 protein_coding CODING ENSCAFT00000000011 6 _
chr1 1104035 rs21966859 G A 3803.77 hom GATKSAM 97 57.97 INTRON MODIFIER _ _ _ NFATC1 protein_coding CODING ENSCAFT00000000013 2 ,INTRON MODIFIER _ _ _ NFATC1 protein_coding CODING ENSCAFT00000036234 2 _
chr1 1120994 . CGCG C 604.73 hom GATKSAM 21 56.55 INTERGENIC MODIFIER _ _ _ _ _ _ _ _ ,UPSTREAM MODIFIER _ _ _ NFATC1 protein_coding CODING ENSCAFT00000000013 _ ,UPSTREAM MODIFIER _ _ _ NFATC1 protein_coding CODING ENSCAFT00000036234 _ _
chr1 1136916 rs21935602 G A 3899.77 hom GATKSAM 101 59.17 DOWNSTREAM MODIFIER _ _ _ ATP9B protein_coding CODING ENSCAFT00000000014 _ ,DOWNSTREAM MODIFIER _ _ _ ATP9B protein_coding CODING ENSCAFT00000042968 _ ,UTR_3_PRIME MODIFIER _ _ _ ATP9B protein_coding CODING ENSCAFT00000046825 29 _
There are 9 rows in the file. But when it is read into R and the rows are counted, it shows 12.
# Read the tab-delimited file, using its first line as the column names.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
data <- read.delim("test.txt", header = TRUE, sep = "\t")
# Number of rows R parsed from the file.
nrow(data)
Could someone help, to read the data properly?
Below is the output from dput(data)
> dput(data)
structure(list(Chromosome = structure(c(3L, 3L, 3L, 3L, 3L, 1L,
3L, 2L, 3L, 2L, 3L, 2L), .Label = c("HIGH", "MODIFIER", "chr1"
), class = "factor"), Position = structure(c(4L, 5L, 6L, 7L,
8L, 9L, 1L, 9L, 2L, 9L, 3L, 9L), .Label = c("1104035", "1120994",
"1136916", "403111", "602567", "604894", "758630", "800715",
"_"), class = "factor"), SNPid = structure(c(1L, 4L, 5L, 1L,
1L, 2L, 6L, 2L, 1L, 2L, 3L, 2L), .Label = c(".", "_", "rs21935602",
"rs21953190", "rs21953191", "rs21966859"), class = "factor"),
Reference = structure(c(4L, 1L, 1L, 5L, 2L, 6L, 4L, 6L, 3L,
6L, 4L, 6L), .Label = c("A", "C", "CGCG", "G", "T", "_"), class = "factor"),
Alternate = structure(c(1L, 5L, 5L, 8L, 4L, 7L, 1L, 6L, 3L,
6L, 1L, 2L), .Label = c("A", "ATP9B", "C", "CT", "G", "NFATC1",
"PQLC1", "TC"), class = "factor"), QUAL = structure(c(2L,
4L, 3L, 1L, 7L, 9L, 5L, 9L, 8L, 9L, 6L, 9L), .Label = c("1531.73",
"24", "2869.77", "3265.77", "3803.77", "3899.77", "514.73",
"604.73", "protein_coding"), class = "factor"), Homozygosity = structure(c(2L,
3L, 3L, 3L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 1L), .Label = c("CODING",
"het", "hom"), class = "factor"), Tool = structure(c(6L,
5L, 5L, 5L, 5L, 1L, 5L, 3L, 5L, 2L, 5L, 4L), .Label = c("ENSCAFT00000000011",
"ENSCAFT00000000013", "ENSCAFT00000036234", "ENSCAFT00000042968",
"GATKSAM", "SAM"), class = "factor"), Depth = structure(c(4L,
9L, 8L, 6L, 2L, 7L, 10L, 3L, 5L, 11L, 1L, 11L), .Label = c("101",
"13", "2", "20", "21", "38", "7", "77", "91", "97", "_"), class = "factor"),
MappingQuality = structure(c(5L, 8L, 10L, 4L, 11L, 1L, 7L,
12L, 6L, 2L, 9L, 3L), .Label = c(",SPLICE_SITE_DONOR", ",UPSTREAM",
",UTR_3_PRIME", "46.20", "55", "56.55", "57.97", "58.46",
"59.17", "59.70", "60.00", "_"), class = "factor"), EFFECT = structure(c(4L,
8L, 7L, 5L, 5L, 3L, 5L, 1L, 4L, 6L, 2L, 6L), .Label = c("",
"DOWNSTREAM", "HIGH", "INTERGENIC", "INTRON", "MODIFIER",
"NON_SYNONYMOUS_CODING", "SYNONYMOUS_CODING"), class = "factor"),
IMPACT = structure(c(4L, 2L, 3L, 4L, 4L, 5L, 4L, 1L, 4L,
5L, 4L, 5L), .Label = c("", "LOW", "MODERATE", "MODIFIER",
"_"), class = "factor"), FUNCTIONAL_CLASS = structure(c(4L,
3L, 2L, 4L, 4L, 4L, 4L, 1L, 4L, 4L, 4L, 4L), .Label = c("",
"MISSENSE", "SILENT", "_"), class = "factor"), CODON_CHANGE = structure(c(3L,
4L, 2L, 3L, 3L, 3L, 3L, 1L, 3L, 3L, 3L, 3L), .Label = c("",
"Ttt/Ctt", "_", "gaT/gaC"), class = "factor"), AMINO_ACID_CHANGE = structure(c(7L,
3L, 4L, 7L, 7L, 6L, 7L, 1L, 7L, 5L, 7L, 2L), .Label = c("",
"ATP9B", "D1034", "F259L", "NFATC1", "PQLC1", "_"), class = "factor"),
GENE_NAME = structure(c(6L, 2L, 2L, 5L, 5L, 7L, 4L, 1L, 6L,
7L, 3L, 7L), .Label = c("", "ADNP2", "ATP9B", "NFATC1", "PQLC1",
"_", "protein_coding"), class = "factor"), GENE_BIOTYPE = structure(c(3L,
4L, 4L, 4L, 4L, 2L, 4L, 1L, 3L, 2L, 4L, 2L), .Label = c("",
"CODING", "_", "protein_coding"), class = "factor"), GENE_CODING = structure(c(6L,
2L, 2L, 2L, 2L, 3L, 2L, 1L, 6L, 4L, 2L, 5L), .Label = c("",
"CODING", "ENSCAFT00000000011", "ENSCAFT00000036234", "ENSCAFT00000046825",
"_"), class = "factor"), TRANSCRIPT_ID = structure(c(8L,
4L, 4L, 5L, 5L, 3L, 6L, 1L, 8L, 8L, 7L, 2L), .Label = c("",
"29", "6", "ENSCAFT00000000008", "ENSCAFT00000000011", "ENSCAFT00000000013",
"ENSCAFT00000000014", "_"), class = "factor"), EXON_ID = structure(c(5L,
3L, 3L, 2L, 4L, 5L, 2L, 1L, 5L, 5L, 5L, 5L), .Label = c("",
"2", "5", "6", "_"), class = "factor"), X = structure(c(6L,
6L, 6L, 6L, 4L, 1L, 3L, 1L, 5L, 1L, 2L, 1L), .Label = c("",
",DOWNSTREAM", ",INTRON", ",SPLICE_SITE_ACCEPTOR", ",UPSTREAM",
"_"), class = "factor")), .Names = c("Chromosome", "Position",
"SNPid", "Reference", "Alternate", "QUAL", "Homozygosity", "Tool",
"Depth", "MappingQuality", "EFFECT", "IMPACT", "FUNCTIONAL_CLASS",
"CODON_CHANGE", "AMINO_ACID_CHANGE", "GENE_NAME", "GENE_BIOTYPE",
"GENE_CODING", "TRANSCRIPT_ID", "EXON_ID", "X"), class = "data.frame", row.names = c(NA,
-12L))
R thinks you have 21 rather than 20 fields per line (maybe there are trailing tabs on each line?), and your lines 6-9 have additional fields:
count.fields("test.txt",sep="\t")
## [1] 21 21 21 21 21 41 31 41 41
This confuses the heck out of read.delim, which tries to guess what's going on from the first 5 lines (it shouldn't, but that's the way it is). You might think you could use fill=TRUE to get around this, but you can't.
I tried using colClasses along with fill=TRUE to specify the field types (I used colClasses=rep("character",41) but you can probably guess better than that), but it doesn't seem to work, probably because your header only has 21 columns.
The fread function in the data.table package can do a little better, but only if you tell it not to try to guess the format from lines after #5, and it discards the data in columns beyond 21.
library(data.table)
nrow(fread("test.txt",autostart=5)) ## 9
Hmm, even that doesn't quite work as expected (it doesn't pick up the header properly, even if I set header=TRUE, probably because column 21 doesn't have a header field). The bottom line is that you probably have to figure out what those extra fields are and do something more explicit with them (e.g. add header fields ...).
Basically, R expects your data to be pretty clean. It might be worth sending this example to the maintainer of the data.table package, who is trying to make fread be as robust as possible ... this would represent a challenge.
Looking at the data you can see that it is highly "mutated" with many fusion lines. These are in many cases signaled by the presence of commas. I think this data is in a different format than you expect. Your first element in the dput data was a factor with Chromosome values =c("HIGH", "MODIFIER", "chr1"). That's not a sensible result, pointing to a lack of understanding on your part about the organization of the original data. You should post the original text file somewhere that can be accessed over the Internet, so the original layout can be examined. In particular the tabs you think are the delimiters are either not there or are not being captured by the SO interface.
After being pointed to the data sample, which should have been put into the question body by you doing editing, try this to delete the comments that follow the commas:
# Read the raw lines so they can be cleaned before parsing.
datL <- readLines("~/Downloads/test.txt")
# Drop everything from the first comma to the end of each line. The pattern
# needs at least one character after the comma, so a bare trailing comma stays.
datLred <- gsub("[,].+$", "", datL)
# Parse the cleaned lines as delimited text with read.delim's default settings.
read.delim(text=datLred)
> str(read.delim(text=datLred) )
'data.frame': 8 obs. of 21 variables:
$ Chromosome : Factor w/ 1 level "chr1": 1 1 1 1 1 1 1 1
$ Position : int 403111 602567 604894 758630 800715 1104035 1120994 1136916
$ SNPid : Factor w/ 5 levels ".","rs21935602",..: 1 3 4 1 1 5 1 2
$ Reference : Factor w/ 5 levels "A","C","CGCG",..: 4 1 1 5 2 4 3 4
$ Alternate : Factor w/ 5 levels "A","C","CT","G",..: 1 4 4 5 3 1 2 1
snipped remain columns

Resources