I am having some trouble running MaxEnt in R. I keep getting two error messages:
1) Error in data.frame(..., check.names = FALSE) :
arguments imply differing number of rows: 1, 2, 0
and
2) Warning message:
In mean.default(sapply(e, function(x) { :
argument is not numeric or logical: returning NA
I'm somewhat new to R and am not sure what these error messages mean. The columns all have the same number of rows and there is no non-numeric data. Any help would be greatly appreciated. I have included the data and the R script. Thanks for your time and any help-
#MaxEnt (Maximum Entropy) Modeling is a species distribution model and machine-learning technique
#The package "dismo" calls in the java program.
#The package "maxnet" is an R-based Maxent implementation that uses glmnet with regularization to approximate the fit of maxent.jar
#The packages "raster" and "sp" are also required to run the model.
library(dismo)
library(maxnet)
library(raster)
library(sp)
# Load the data. The ID column becomes the row names, so (per the data listing
# in this post) column 1 is s3_Pa, columns 2-7 are the predictors
# ag, cent, jor, kliv, traw, silt, and the last column is Valid.
KCStatic <- read.csv(file = "D:/KCs3.csv", row.names = "ID")

# Partition into a training set (Valid == 0) and a validation set (Valid == 1).
KCwoT.Tng <- subset(KCStatic, Valid == 0)
KCwoT.Val <- subset(KCStatic, Valid == 1)

# Predictor values of the validation rows, split by the value in column 1
# (1 = presence rows, 0 = absence rows).
val_response <- KCwoT.Val[, 1]
pvtest <- KCwoT.Val[val_response == 1, 2:7]
avtest <- KCwoT.Val[val_response == 0, 2:7]

# MaxEnt model: predictors (columns 2-7) and response (column 1) of the
# training set.
KCwoT.ME.x <- KCwoT.Tng[, 2:7]
KCwoT.ME.p <- KCwoT.Tng[, 1]
KCwoT.ME <- maxent(KCwoT.ME.x, KCwoT.ME.p)
KCwoT.ME

# Predict on the held-out presence and absence rows, then evaluate the model
# from those two sets of predicted values.
KCwoT.ME.testp <- predict(KCwoT.ME, pvtest)
KCwoT.ME.testa <- predict(KCwoT.ME, avtest)
KCwoT.ME.eval <- evaluate(p = KCwoT.ME.testp, a = KCwoT.ME.testa)
KCwoT.ME.eval
# 10-fold cross-validation on the training data.
# The response column (s3_Pa) must be kept as column 1: the loop below reads
# presence/absence from column 1 and the predictors from the remaining columns.
# Selecting only the six predictors makes column 1 `ag`, so the `== 1` / `== 0`
# filters return no rows and the model fitting / evaluation steps fail.
xVal <- subset(KCwoT.Tng, select = c(s3_Pa, ag, cent, jor, kliv, traw, silt))
k <- 10
group <- kfold(xVal, k)

# One evaluation object per fold.
e <- vector("list", k)
for (i in seq_len(k)) {
  train <- xVal[group != i, ]
  test <- xVal[group == i, ]

  # Column 1 is the response; every other column is a predictor.
  trainx <- train[, -1]
  trainp <- train[, 1]
  me <- maxent(trainx, trainp)

  # Predictor values of the held-out presence (1) and absence (0) rows.
  testxp <- test[test[, 1] == 1, -1]
  testxa <- test[test[, 1] == 0, -1]
  testp <- predict(me, testxp)
  testa <- predict(me, testxa)
  e[[i]] <- evaluate(p = testp, a = testa)
}

# Summarise the per-fold AUC and correlation. vapply() fails loudly if a fold
# did not produce a single numeric value instead of silently returning a list.
fold_auc <- vapply(e, function(x) slot(x, "auc"), numeric(1))
fold_cor <- vapply(e, function(x) slot(x, "cor"), numeric(1))
mean(fold_auc)
median(fold_auc)
mean(fold_cor)
median(fold_cor)
# True positive rate (TPR), true negative rate (TNR) and correct classification
# rate (CCR) at the second threshold returned by threshold().
# Slots are accessed with `@`; a `#` here would turn the rest of the line into
# a comment and only print the evaluation object.
# `[[2]]` extracts the threshold as a plain number so match() compares numbers.
thr_idx <- match(threshold(KCwoT.ME.eval)[[2]], KCwoT.ME.eval@t)
KCwoT.ME.eval@TPR[thr_idx]
KCwoT.ME.eval@TNR[thr_idx]
KCwoT.ME.eval@CCR[thr_idx]
s3_Pa ag cent jor kliv traw silt ID Valid
0 2712.814007 5944.620219 7664 7545 0 0 3174 0
0 2732.815985 5646.817407 7527 7516 0 0 3221 0
0 5383.266705 5383.266705 7970 7970 0 0 3230 0
0 4857.46024 6259.344198 7726 5781 0 0 3300 0
0 8352.51324 8352.51324 7198 7198 0 0 3356 0
0 49378.96152 16984.2808 12172 7890 0 0 3415 0
0 4319.437464 4319.437464 6297 6297 0 0 3461 0
0 9444.516272 9444.516272 7394 7394 0 0 3552 0
0 3589.880714 3265.163078 6131 5188 0 0 3605 0
0 28749.74389 28749.74389 6466 6466 0 0 3653 0
0 6959.890193 4073.928736 5213 5412 0 0 3764 0
0 5118.247173 3272.811018 4705 4705 0 0 3829 0
0 2559.80507 3984.939677 4965 5422 0 0 3857 0
0 5189.140315 5189.140315 4864 4864 0 0 3903 0
0 2243.265175 2513.775258 5407 5285 0 0 3942 0
0 2798.840052 2798.840052 5284 5284 0 0 3943 0
0 3049.900227 3049.900227 5044 5044 0 0 4034 0
0 5314.032326 5314.032326 5336 5336 0 0 4049 0
0 2416.851993 2483.681392 4529 4204 0 0 4093 0
0 2527.316522 2527.316522 4898 4898 0 0 4199 0
0 2281.407824 2281.407824 4848 4848 0 0 4216 0
0 2257.802423 2257.802423 4873 4873 0 0 4285 0
0 2678.746278 2678.746278 5137 5137 0 0 4360 0
0 2319.915204 2319.915204 5138 5138 0 0 4362 0
0 3516.384174 3516.384174 4725 4725 0 0 4557 0
0 2218.583063 2218.583063 4464 4464 0 0 4583 0
0 7433.978369 6832.621571 3963 5527 0 0 4586 0
0 2604.437581 2604.437581 4565 4565 0 0 4630 0
0 2372.751422 3930.504765 4787 5560 0 0 4739 0
0 2516.984733 7087.776431 4219 5885 0 0 4818 0
0 2437.56414 2437.56414 4596 4596 0 0 4825 0
0 2440.659167 2440.659167 4556 4556 0 0 4933 0
0 2416.905821 2416.905821 4540 4540 0 0 4942 0
0 2428.521085 2428.521085 4867 4867 0 0 5121 0
0 2463.594566 2463.594566 5125 5125 0 0 5196 0
0 2487.539855 2487.539855 4803 4803 0 0 5249 0
0 3302.718352 3302.718352 4958 4958 0 0 5252 0
0 2605.906908 2605.906908 4846 4846 0 0 5332 0
0 2577.784698 2577.784698 5463 5463 0 0 5402 0
0 2764.191937 8861.087669 4848 6376 0 0 5494 0
0 2566.989482 2566.989482 4938 4938 0 0 5565 0
0 2592.787269 2592.787269 4889 4889 0 0 5626 0
0 2757.964558 2757.964558 5077 5077 0 0 5693 0
0 2620.543732 2620.543732 5216 5216 0 0 5769 0
0 5309.434576 5309.434576 5867 5867 0 0 5908 0
1 5921.125287 6881.922922 7736 7707 0 0 3217 0
1 2774.199514 21747.29759 7669 9197 0 0 3280 0
1 15495.78183 15495.78183 7826 7826 0 0 3307 0
1 2548.237657 4019.296229 7503 7421 0 0 3310 0
1 7666.402192 7666.402192 7501 7501 0 0 3342 0
1 4891.62472 4891.62472 7384 7384 0 0 3350 0
1 5042.36752 4343.180161 7456 7344 0 0 3373 0
1 5193.293844 5772.4049 7390 6359 0 0 3377 0
1 3534.172197 16711.2551 7446 7646 0 0 3423 0
1 14070.79994 14070.79994 7601 7601 0 0 3443 0
1 3255.725951 3255.725951 7345 7345 0 0 3450 0
1 4786.893258 4786.893258 6125 6125 0 0 3493 0
1 40210.85968 4484.216909 12105 5479 0 0 3517 0
1 4333.860544 4333.860544 7262 7262 0 0 3535 0
1 7795.332317 7795.332317 6679 6679 0 0 3542 0
1 3364.563525 3364.563525 6608 6608 0 0 3545 0
1 3303.389553 3303.389553 6879 6879 0 0 3547 0
1 3619.497747 3619.497747 6561 6561 0 0 3551 0
1 5450.874516 3356.834908 6633 5425 0 0 3570 0
1 2725.057799 2725.057799 6024 6024 0 0 3583 0
1 5691.763254 5691.763254 6377 6377 0 0 3602 0
1 3169.96849 3169.96849 5673 5673 0 0 3645 0
1 3275.840301 3250.607347 5876 5165 0 0 3660 0
1 2889.723967 2889.723967 6250 6250 0 0 3662 0
1 7669.776341 7669.776341 6345 6345 0 0 3686 0
1 3834.198391 2710.905485 5632 5238 0 0 3687 0
1 2460.824512 2626.740765 5489 5011 0 0 3714 0
1 2486.475314 5285.960072 5944 5571 0 0 3743 0
1 3044.274943 3780.675875 6001 5272 0 0 3779 0
1 2330.782119 2330.782119 5254 5254 0 0 3798 0
1 6918.421155 4441.631393 5837 5400 0 0 3807 0
1 2283.770553 2283.770553 5352 5352 0 0 3843 0
1 4017.235221 4017.235221 5268 5268 0 0 3897 0
1 7856.113523 7856.113523 5280 5280 0 0 3936 0
1 5723.619574 5723.619574 5414 5414 0 0 3985 0
1 3204.713159 2870.945921 5145 5082 0 0 3988 0
1 4486.528634 2810.032147 4970 4900 0 0 3992 0
1 4014.434161 2728.881238 5213 5069 0 0 4005 0
1 2752.918679 2752.918679 4704 4704 0 0 4007 0
1 2277.264207 2277.264207 4998 4998 0 0 4019 0
1 5711.280538 3392.648953 5123 4870 0 0 4020 0
1 3203.948015 3203.948015 4714 4714 0 0 4067 0
1 2767.113359 2767.113359 4886 4886 0 0 4091 0
1 2865.961261 2865.961261 4892 4892 0 0 4110 0
1 2911.739735 2911.739735 4780 4780 0 0 4116 0
1 2361.708077 2361.708077 4724 4724 0 0 4117 0
1 2286.082622 2360.683427 5355 5185 0 0 4118 0
1 2331.814226 2331.814226 5037 5037 0 0 4137 0
1 2308.986958 2308.986958 4682 4682 0 0 4140 0
1 2300.537289 2300.537289 4852 4852 0 0 4177 0
1 2321.675121 2321.675121 4638 4638 0 0 4238 0
1 2237.444686 2237.444686 5043 5043 0 0 4239 0
1 2390.690086 2390.690086 4395 4395 0 0 4241 0
1 2229.52996 2229.52996 5109 5109 0 0 4279 0
1 2520.728579 2520.728579 5119 5119 0 0 4283 0
1 2258.03747 2258.03747 5071 5071 0 0 4284 0
1 2278.607183 2278.607183 4785 4785 0 0 4298 0
1 2505.662096 2505.662096 4083 4083 0 0 4320 0
1 2225.789635 2225.789635 4880 4880 0 0 4331 0
1 2183.306088 2183.306088 4425 4425 0 0 4525 0
1 2263.787317 2263.787317 4964 4964 0 0 4540 0
1 2245.845341 2245.845341 4750 4750 0 0 4640 0
1 2283.423493 2283.423493 4806 4806 0 0 4662 0
1 2266.360563 2266.360563 4765 4765 0 0 4721 0
1 2260.095621 2260.095621 5038 5038 0 0 4732 0
1 2301.432888 2301.432888 4854 4854 0 0 4736 0
1 2329.630779 2329.630779 5708 5708 0 0 4762 0
1 2454.336191 3935.49151 4318 5562 0 0 4772 0
1 2361.297036 3624.034037 4331 5232 0 0 4779 0
1 2323.874056 2323.874056 4757 4757 0 0 4790 0
1 2382.420745 2382.420745 5349 5349 0 0 4798 0
1 2352.659926 2352.659926 4452 4452 0.0000846 0.020143027 4799 0
1 2409.321197 2409.321197 4475 4475 0 0 4815 0
1 2360.118466 2360.118466 4369 4369 0 0 4819 0
1 2339.601538 2339.601538 5518 5518 0 0 4861 0
1 2358.67411 2358.67411 4574 4574 0 0 4880 0
1 2386.410926 3718.035094 4393 5278 0 0 4894 0
1 2528.234053 2528.234053 4703 4703 0 0 4910 0
1 2291.851083 2291.851083 5101 5101 0 0 4925 0
1 2459.766511 2459.766511 4814 4814 0 0 4973 0
1 2490.775395 2490.775395 5044 5044 0 0 5025 0
1 2514.079723 5099.787095 4459 5380 0 0 5032 0
1 2427.873473 2427.873473 4754 4754 0 0 5037 0
1 2380.611838 2380.611838 5461 5461 0 0 5055 0
1 2511.565392 2511.565392 5059 5059 0 0 5056 0
1 3622.514274 3622.514274 4827 4827 0 0 5109 0
1 2475.631468 2475.631468 4908 4908 0 0 5118 0
1 2492.200769 2492.200769 4822 4822 0 0 5143 0
1 2509.438134 2509.438134 4628 4628 0 0 5185 0
1 2556.501335 2556.501335 4737 4737 0 0 5309 0
1 2548.650994 2548.650994 4802 4802 0 0 5316 0
1 2530.378885 2530.378885 4952 4952 0 0 5363 0
1 2528.110558 2551.811713 4629 4785 0 0 5392 0
1 2590.935464 2590.935464 4645 4645 0.000394667 0.093930809 5480 0
1 2631.168824 2958.393588 5187 5380 0 0 5521 0
1 2581.504472 2581.504472 5062 5062 0 0 5531 0
1 2575.585115 2575.585115 5185 5185 0 0 5538 0
1 2551.676567 2551.676567 4892 4892 0 0 5542 0
1 2569.698254 2569.698254 5053 5053 0 0 5557 0
1 2624.237765 2624.237765 4912 4912 0 0 5604 0
1 2614.385919 2614.385919 5301 5301 0 0 5640 0
1 2598.787723 2598.787723 5364 5364 0 0 5642 0
1 2578.060432 2578.060432 5090 5090 0 0 5656 0
1 4001.119207 5895.989889 4925 5693 0 0 5662 0
1 2623.749151 2623.749151 5440 5440 0 0 5673 0
1 2644.030557 2644.030557 5377 5377 0 0 5710 0
1 2669.872842 2669.872842 5177 5177 0 0 5734 0
1 3646.204271 3646.204271 5193 5193 0 0 5794 0
1 2618.429271 2618.429271 5035 5035 0 0 5815 0
1 2690.323195 2690.323195 4821 4821 0 0 5818 0
1 2633.516256 2633.516256 4956 4956 0 0 5883 0
1 2701.966232 2701.966232 5470 5470 0 0 5898 1
1 2946.4581 6141.938828 5043 5935 0 0 5919 1
1 2658.347761 2658.347761 5162 5162 0 0 5938 1
0 2424.833017 2424.833017 5726 5726 0 0 3737 1
0 2644.544075 2644.544075 4857 4857 0 0 3799 1
1 2410.138972 2280.102484 4816 4905 0 0 3974 1
1 2968.445375 2968.445375 4705 4705 0 0 4006 1
1 2267.857714 2267.857714 5088 5088 0 0 4330 1
1 2293.989337 2293.989337 5007 5007 0 0 4612 1
1 2308.364875 2281.548644 4560 4720 0 0 4922 1
1 2492.057156 2492.057156 4737 4737 0 0 5089 1
1 2566.653701 2566.653701 4989 4989 0 0 5478 1
Related
I keep getting this error: the table must the same classes in the same order
when implementing KNN and confusion matrix to get the accuracy
# Rows 1-10 are used for training and rows 11-20 for testing.
df_train <- df_trimmed_n[1:10,]
df_test <- df_trimmed_n[11:20,]
# Labels come from column 1 of df_trimmed for the same row ranges.
df_train_labels <- df_trimmed[1:10, 1]
df_test_labels <- df_trimmed[11:20, 1]
library(class)
library(caret)
# k is set to the number of training rows (10).
df_knn<-knn(df_train,df_test,cl=df_train_labels,k=10)
# NOTE(review): the printed table shows that the prediction levels and the test
# labels share no values, so its row and column classes differ -- this is what
# confusionMatrix() rejects. Predictions and reference presumably need to be
# factors with identical levels; confirm against the caret documentation.
confusionMatrix(table(df_knn,df_test_labels))
Error in confusionMatrix.table(table(df_knn, df_test_labels)) :
the table must the same classes in the same order
> print(df_knn)
[1] 28134 5138 4820 3846 1216 1885 1885 22021 5138 15294
Levels: 106 1216 1885 3846 4820 5138 15294 22021 22445 28134
> print(df_test_labels)
[1] 33262 6459 5067 7395 22720 1217 3739 84 16219 17819
> table(df_knn,df_test_labels)
df_test_labels
df_knn 84 1217 3739 5067 6459 7395 16219 17819 22720 33262
106 0 0 0 0 0 0 0 0 0 0
1216 0 0 0 0 0 0 0 0 1 0
1885 0 1 1 0 0 0 0 0 0 0
3846 0 0 0 0 0 1 0 0 0 0
4820 0 0 0 1 0 0 0 0 0 0
5138 0 0 0 0 1 0 1 0 0 0
15294 0 0 0 0 0 0 0 1 0 0
22021 1 0 0 0 0 0 0 0 0 0
22445 0 0 0 0 0 0 0 0 0 0
28134 0 0 0 0 0 0 0 0 0 1
Even though both the knn output and the test dataset have the same number of rows (10), I'm not sure what is wrong with the classes and their order?
My raw data contains numeric values, with the header lines repeated every 20 lines.
I wish to remove the repeated header lines with R. I know it's quite easy with sed command but I wish the R script to handle all steps of tidying data.
> raw <- read.delim("./vmstat_archiveadm_s.txt")
> head(raw)
kthr memory page disk faults cpu
r b w swap free re mf pi po fr de sr s2 s3 vc -- in sy cs us sy id
0 0 0 100097600 97779056 285 426 53 0 0 0 367 86 6 0 0 1206 7711 2630 1 0 99
0 0 0 96908192 94414488 7 31 0 0 0 0 0 120 0 0 0 2782 5775 5042 2 0 97
0 0 0 96889840 94397152 0 1 0 0 0 0 0 122 0 0 0 2737 5591 4958 2 0 97
kthr memory page disk faults cpu
r b w swap free re mf pi po fr de sr s2 s3 vc -- in sy cs us sy id
0 0 0 100065744 97745448 282 422 52 0 0 0 363 89 6 0 0 1233 7690 2665 1 0 99
0 0 0 96725312 94222040 7 31 0 0 0 0 0 604 69 0 0 5269 5703 7910 2 1 97
0 0 0 96668624 94170784 0 0 0 0 0 0 0 155 53 0 0 3047 5505 5317 2 0 97
0 0 0 96595104 94086816 0 0 0 0 0 0 0 174 0 0 0 2879 5567 5068 2 0 97
1 0 0 96521376 94025504 0 0 0 0 0 0 0 121 0 0 0 2812 5471 5105 2 0 97
0 0 0 96503256 93994896 0 0 0 0 0 0 0 121 0 0 0 2731 5621 4981 2 0 97
(...)
Try this :
where df is the dataframe
# Row positions to drop: every 21st row, starting at row 6 and stopping at 100.
x <- seq(from = 6, to = 100, by = 21)
df <- df[-x, ]
seq() will generate a sequence of numbers from 6 up to 100 at an interval of 21.
Therefore, in this case :
6 27 48 69 90
Remove them from the dataframe by
df[-x,]
EDIT:
To do this for the entire dataframe, replace 100 with number of rows. i.e
seq(6,nrow(df),21)
Instead of processing the output in R I will clean it at the generation level:
$ vmstat 1 | egrep -v '^ kthr|^ r'
0 0 0 154831904 153906536 215 471 0 0 0 0 526 33 32 0 0 1834 14171 5253 0 0 99
1 0 0 154805632 153354296 9 32 0 0 0 0 0 0 0 0 0 1463 610 739 0 0 100
1 0 0 154805632 153354696 0 4 0 0 0 0 0 0 0 0 0 1408 425 634 0 0 100
0 0 0 154805632 153354696 0 0 0 0 0 0 0 0 0 0 0 1341 381 658 0 0 100
0 0 0 154805632 153354696 0 0 0 0 0 0 0 0 0 0 0 1299 353 610 0 0 100
1 0 0 154805632 153354696 0 0 0 0 0 0 0 0 0 0 0 1319 375 638 0 0 100
0 0 0 154805632 153354640 0 0 0 0 0 0 0 0 0 0 0 1308 367 614 0 0 100
0 0 0 154805632 153354640 0 0 0 0 0 0 0 0 0 0 0 1336 395 650 0 0 100
1 0 0 154805632 153354640 0 0 0 0 0 0 0 44 44 0 0 1594 378 878 0 0 100
0 0 0 154805632 153354640 0 0 0 0 0 0 0 66 65 0 0 1763 382 1015 0 0 100
0 0 0 154805632 153354640 0 0 0 0 0 0 0 0 0 0 0 1312 411 645 0 0 100
0 0 0 154805632 153354640 0 0 0 0 0 0 0 0 0 0 0 1342 390 647 0 0 100
Please bear with an R newbie here. I'm trying to follow along with a tutorial published on the wonderful flowingdata.com site by using my own data to replace the .Rdata file included in the tutorial. The Rdata file, "unisexCnts.RData", contains unisex names and the number of times used for different years:
head(unisexCnts)
1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951
Addison 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Alexis 0 0 0 0 0 0 0 0 0 0 0 12 0 0 0 0 0 0 0 0 0 0
Ali 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Alva 0 0 312 273 274 263 0 273 0 255 235 195 222 0 195 0 193 225 204 196 177 156
Amari 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Angel 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973
Addison 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Alexis 0 0 0 0 0 0 0 0 0 0 0 0 190 0 0 325 0 0 0 0 0 0
Ali 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 243 219 214
Alva 177 132 159 178 145 138 131 119 119 119 127 97 107 97 83 76 83 90 84 81 58 68
Amari 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Angel 0 0 0 0 0 0 0 0 0 1264 0 0 0 0 0 0 0 1579 2145 2488 0 0
1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995
Addison 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 595 664
Alexis 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Ali 0 0 0 0 0 0 0 0 0 0 0 0 561 565 556 643 747 722 0 742 0 0
Alva 54 57 53 54 59 40 62 0 48 0 28 0 34 0 0 0 0 0 0 0 0 26
Amari 0 0 0 0 0 0 11 0 0 0 0 0 16 0 22 0 32 0 0 0 0 0
Angel 2561 2690 2779 0 0 3004 3108 3113 3187 2924 3100 3341 3229 3101 3532 3889 4066 4520 0 0 0 0
1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012
Addison 778 889 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Alexis 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Ali 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Alva 0 0 0 19 0 14 0 0 0 0 0 24 0 0 0 0 0
Amari 0 0 0 0 0 0 1181 1397 1333 1299 1265 1550 1780 0 0 0 0
Angel 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
When I run it through the str() function I get the following:
str(unisexCnts)
num [1:121, 1:83] 0 0 0 0 0 0 16 0 0 0 ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:121] "Addison" "Alexis" "Ali" "Alva" ...
..$ : chr [1:83] "1930" "1931" "1932" "1933" ...
My data is in a csv file ,called "boysnames.csv":
,2013,2012,2011,2010,2009,2008
Jack,764,831,840,935,1068,1151
James,746,773,796,746,711,737
Daniel,678,683,711,792,842,828
Conor,610,639,709,726,776,857
I am trying to overwrite the unisexCnts.RData with the contents of my boysnames.csv. So to restructure and get my csv ready to be saved, I did:
Step1.
unisexCnts<-data <- read.csv("boysnames.csv", stringsAsFactors=FALSE, header=TRUE, check.names = FALSE)
Step2.
unisexCnts<-as.matrix(unisexCnts)
Step3.
# save() must be told which object to write; with only `file` given, nothing is saved.
save(unisexCnts, file = "unisexCnts.RData") ##save as Rdata file, overwriting the original unisexCnts.RData in the dir
However I get the following after steps 1 & 2 which doesn't match the structure of the original, any ideas/pointers?
> str(unisexCnts)
chr [1:100, 1:7] "Jack" "James" "Daniel" "Conor" "Sean" "Adam" "Ryan" "Michael" "Harry" "Noah" ...
- attr(*, "dimnames")=List of 2
..$ : NULL
..$ : chr [1:7] "" "2013" "2012" "2011" ...
When you load a .csv file you can specify the column that should become the row names of the loaded data using the "row.names" argument.
I recreated your data quickly and uploaded it using the following code:
# row.names = 1 turns the first column (the names) into row names.
# TRUE/FALSE and the full argument name `header` are spelled out: T/F can be
# reassigned, and `head` only works through partial argument matching.
read.csv("test.csv", stringsAsFactors = FALSE, header = TRUE, row.names = 1)
This saves you having to do this work after uploading the data. This gives you the data structure you are looking for as well:
# Read the names into the row names (row.names = 1) so that only the numeric
# counts remain as data; as.matrix() then yields a numeric matrix.
# Adding check.names = FALSE would keep the year headers as "2013" instead of
# the "X2013" form shown in the output below.
unisexCnts <- read.csv("test.csv", stringsAsFactors = FALSE, header = TRUE, row.names = 1)
unisexCnts <- as.matrix(unisexCnts)
str(unisexCnts)
int [1:4, 1:6] 764 746 678 610 831 773 683 639 840 796 ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:4] "Jack" "James" "Dan" "Conor"
..$ : chr [1:6] "X2013" "X2012" "X2011" "X2010" ...
However I get the following after steps 1 & 2 which doesn't match the
structure of the original, any ideas/pointers?
In the original unisexCnts the names are specified as row names. That's why the first attribute is
..$ : chr [1:121] "Addison" "Alexis" "Ali" "Alva" ...
To replicate that in your example. You can set the names as rownames by specifying
rownames(unisexCnts) <- ListorOrVectorofNamesHere
This will make the output match.
The reason this line:
chr [1:100, 1:7] "Jack" "James" "Daniel" "Conor" "Sean" "Adam" "Ryan" "Michael" "Harry" "Noah" ...
doesn't match this line
num [1:121, 1:83] 0 0 0 0 0 0 16 0 0 0 ...
is the same. You have the names included in the actual matrix itself. In a matrix you can only have data of the same type. By including character data in the matrix (the names) you are converting the whole matrix itself into character/strings.
in summary
remove the name vector from the matrix and use it as row names and the str() of your two objects will match.
I have a dataset that looks like this..
MX000003035 LORETO 26.0170 111.3330 7.0 1938 2014
1941 1 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1941 2 28 0 0 0 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1941 3 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1941 4 30 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
where the data for each station begins with a description of the station - code, name, latitude etc.. The first column is the year, the second is the month of the year, the third the number of days and the following values are the daily precipitation values for that month.
There are 860 stations in this single dataset. How do I convert this into the following format in R?
Station Code Name Lat Long Year Month Precip
MX000003035 LORETO 26.017 111.333 1941 1 0
MX000003035 LORETO 26.017 111.333 1941 1 0
MX000003035 LORETO 26.017 111.333 1941 1 0
MX000003035 LORETO 26.017 111.333 1941 1 0
MX000003035 LORETO 26.017 111.333 1941 1 0
MX000003035 LORETO 26.017 111.333 1941 1 0
MX000003035 LORETO 26.017 111.333 1941 1 0
MX000003035 LORETO 26.017 111.333 1941 1 0
MX000003035 LORETO 26.017 111.333 1941 1 0
MX000003035 LORETO 26.017 111.333 1941 1 0
MX000003035 LORETO 26.017 111.333 1941 1 0
MX000003035 LORETO 26.017 111.333 1941 1 0
MX000003035 LORETO 26.017 111.333 1941 1 0
MX000003035 LORETO 26.017 111.333 1941 1 0
MX000003035 LORETO 26.017 111.333 1941 1 0
.. and so on
EDIT: Here is the link to the dataset
https://www.dropbox.com/s/o0yp1pe4rze8amd/gdcn_SWUS.txt
And here are some snippets...
1940 10 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1940 11 30 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1940 12 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1941 1 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1941 2 28 0 0 0 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1941 3 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1941 4 30 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
...
2014 9 30-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999
2014 10 31-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999
2014 11 30-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999
2014 12 31-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999
MX000003068 CIUDAD CONSTITUCION 24.9500 -111.7000 48.0 1957 2014
1957 1 31-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999
1957 2 28-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999
1957 3 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 190 80 0 0 0 0 0 0 0 0 0 0 0 0
1957 4 30 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1957 5 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1957 6 30 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1957 7 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1957 8 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 50 0 0 0 0 50 0 50 0 0 0 0 0 5 0 0 0
...
2014 9 30-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999
2014 10 31-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999
2014 11 30-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999
2014 12 31-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999
USC00040983 BORREGO DESERT PARK 33.2314 -116.4144 245.4 1942 2014
1942 1 31-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999
1942 2 28-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999
1942 3 31-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999
1942 4 30-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999
1942 5 31-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999
1942 6 30-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999-9999
Bring the data into R with readLines
# Read the file as a character vector with one element per line.
dat <- readLines(filename)
Then get the row numbers with station names and lat/long:
stations <- dat[ grep( "[[:alpha:]]{2}", dat) ]
Identify the row numbers of the station header lines:
breaks <- grep( "[[:alpha:]]{2}", dat)
breaks
#[1]  1  6 11
Make sequence of breaks:
breaks <- c(breaks, length(dat)+1 )
Then pull in the data between breaks and let R's recycling in data.frame() repeat the station data for every data row:
# For each station, parse the lines between its header and the next header
# and bind them to the station's header text, giving one data frame per station.
newdf <- lapply(
  seq_along(breaks[-1]),
  function(idx) {
    first_row <- breaks[idx] + 1
    last_row <- breaks[idx + 1] - 1
    # fill = TRUE pads shorter months with NA so every row has the same width.
    station_rows <- read.table(text = dat[first_row:last_row], fill = TRUE)
    # The single header string is repeated for every data row of the station.
    data.frame(stations[idx], station_rows)
  }
)
Then row-bind the rows back together:
newdf2 <- do.call(rbind, newdf)
Test data:
# Three stations with four monthly records each, read line by line.
# The connection is kept in a variable so it can be closed after reading.
con <- textConnection("MX000003035 LORETO 26.0170 111.3330 7.0 1938 2014
1941 1 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1941 2 28 0 0 0 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1941 3 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1941 4 30 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
MX000003036 Laredo 27.0170 112.3330 7.0 1938 2014
1941 1 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1941 2 28 0 0 0 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1941 3 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1941 4 30 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
MX000003037 Another 28.0170 113.3330 7.0 1938 2014
1941 1 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1941 2 28 0 0 0 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1941 3 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1941 4 30 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0")
dat <- readLines(con)
close(con)
Output (not quite done, but if you pass stations to read.table first and then do the lapply(cbind(..)) operation it should work fine):
## stations <- read.table(text=stations)
# Remove column 7 and add desired row names
> newdf ### the unfinished version
stations.idx. V1
1 MX000003035 LORETO 26.0170 111.3330 7.0 1938 2014 1941
2 MX000003035 LORETO 26.0170 111.3330 7.0 1938 2014 1941
3 MX000003035 LORETO 26.0170 111.3330 7.0 1938 2014 1941
4 MX000003035 LORETO 26.0170 111.3330 7.0 1938 2014 1941
5 MX000003036 Laredo 27.0170 112.3330 7.0 1938 2014 1941
6 MX000003036 Laredo 27.0170 112.3330 7.0 1938 2014 1941
7 MX000003036 Laredo 27.0170 112.3330 7.0 1938 2014 1941
8 MX000003036 Laredo 27.0170 112.3330 7.0 1938 2014 1941
9 MX000003037 Another 28.0170 113.3330 7.0 1938 2014 1941
10 MX000003037 Another 28.0170 113.3330 7.0 1938 2014 1941
11 MX000003037 Another 28.0170 113.3330 7.0 1938 2014 1941
12 MX000003037 Another 28.0170 113.3330 7.0 1938 2014 1941
V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21 V22 V23
1 1 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2 2 28 0 0 0 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3 3 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 4 30 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
5 1 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
6 2 28 0 0 0 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
7 3 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
8 4 30 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
9 1 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
10 2 28 0 0 0 10 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
11 3 31 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
12 4 30 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
V24 V25 V26 V27 V28 V29 V30 V31 V32 V33 V34
1 0 0 0 0 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0 0 NA NA NA
3 0 0 0 0 0 0 0 0 0 0 0
4 0 0 0 0 0 0 0 0 0 0 NA
5 0 0 0 0 0 0 0 0 0 0 0
6 0 0 0 0 0 0 0 0 NA NA NA
7 0 0 0 0 0 0 0 0 0 0 0
8 0 0 0 0 0 0 0 0 0 0 NA
9 0 0 0 0 0 0 0 0 0 0 0
10 0 0 0 0 0 0 0 0 NA NA NA
11 0 0 0 0 0 0 0 0 0 0 0
12 0 0 0 0 0 0 0 0 0 0 NA
20 Lines of the data I'm working on:
Zv9_NA110 6176 7276 5'to3'IntronExon 0 + 1100 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Zv9_NA110 10126 11226 5'to3'IntronExon 0 + 1100 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 9 9 15 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 13 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Zv9_NA110 11219 12319 5'to3'ExonIntron 0 + 1100 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Zv9_NA110 14887 15987 5'to3'IntronExon 0 + 1100 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9
Zv9_NA110 18923 20023 5'to3'IntronExon 0 + 1100 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Zv9_NA110 21069 22169 5'to3'ExonIntron 0 + 1100 0 135 115 65 54 45 36 27 16 9 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Zv9_NA113 1615 2715 5'to3'IntronExon 0 - 1100 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Zv9_NA113 2335 3435 5'to3'ExonIntron 0 - 1100 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Zv9_NA113 5398 6498 5'to3'IntronExon 0 - 1100 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Zv9_NA113 7173 8273 5'to3'ExonIntron 0 - 1100 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Zv9_NA118 11674 12774 5'to3'IntronExon 0 + 1100 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Zv9_NA118 12711 13811 5'to3'ExonIntron 0 + 1100 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Zv9_NA123 38151 39251 5'to3'ExonIntron 0 - 1100 0 1061 958 844 796 695 600 464 346 265 210 150 133 94 81 72 46 18 4 0 0 0 0 0 0 0 0 0 7 9 9 9 11 21 35 43 58 91 108 180 268 406 547 712 833 882 960 1094 1172 1245 1331 1432 1510 1604 1711 1810 1830 1837 1823 1781 1690 1638 1560 1489 1257 854 731 631 589 551 497 439 404 369 301 231 168 123 76 58 50 42 28 20 11 9 9 24 27 27 27 27 27 25 18 18 18 18 18 18 18 18 18 18 18 18 18 14 5 0 0
Zv9_NA124 2578 3678 5'to3'ExonIntron 0 + 1100 0 423 407 401 377 357 345 324 304 249 185 111 54 30 12 0 0 0 0 0 0 0 0 0 0 0 0 0 1 9 9 9 9 14 18 25 27 27 27 27 27 27 27 27 27 27 27 26 18 18 18 18 18 18 16 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 8 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Zv9_NA129 4939 6039 5'to3'IntronExon 0 + 1100 226 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 9 9 9 9 9 9 9 9 9 9 9 9 14 34 45 60 97 128 175 293 395 524 621 764 894 1036 1164 1334 1469 1639 1801 1885 1983
Zv9_NA132 12589 13689 5'to3'ExonIntron 0 - 1100 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Zv9_NA132 13634 14734 5'to3'IntronExon 0 - 1100 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Zv9_NA132 14481 15581 5'to3'ExonIntron 0 - 1100 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 5 9 9 9 9 9 9 9 9 9 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Zv9_NA132 19534 20634 5'to3'IntronExon 0 - 1100 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Zv9_NA132 28708 29808 5'to3'ExonIntron 0 - 1100 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 5 9 15 18 24 27 42 46 73 112 142 157 162 162 162 162 162 162 162 162 159 153 153 153 153 153 150 144 132 112 76 52 30 25 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
I get this into R as follows:
> dat <- read.table("dat.dat",header=F)
I need to get the averages for columns 9 through 118, grouped by column 4.
This works:
> all_means <- aggregate(cbind(V9,V10,V11)~V4,data=dat,FUN=mean)
V4 V9 V10 V11
1 5'to3'ExonIntron 0 0.00 0
2 5'to3'IntronExon 0 0.75 1
But there's no way I'm typing this out to V118.
I've tried this:
> aggregate(cbind(9:118)~V4,data=blah,FUN=mean)
But I get this error:
Error in model.frame.default(formula = cbind(9:118) ~ V4, data = blah) :
variable lengths differ (found for 'V4')
Is there something dumb I'm missing?
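Your attempt fails because cbind(9:118) builds a one-column matrix holding the numbers 9 to 118 (110 rows) rather than selecting columns 9 to 118 of the data, so its length does not match V4. You have a number of options.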
Create a formula using . (which stands for all the other columns in data) and pass only the relevant subset of the data:
aggregate( . ~ V4, data = dat[,c(4,9:118)], FUN = mean)
You could also create the vector of column names using paste0
nn <- paste0('V', 9:118)
and refer to the columns by name:
aggregate( . ~ V4, data = dat[,c('V4',nn)], FUN = mean)
There isn't much point in using cbind here, given that the formula approach works, but for completeness it can be done like this:
aggregate( do.call(cbind,lapply(nn, as.name)) ~ V4, data = dat, FUN = mean)
But this is messy, as it doesn't name the columns nicely (and it is hard to follow).
If speed is an issue in general (not necessary for this operation) and you want to use the data.table package, this is done as follows:
Safer solution
Thanks to mnel's comment, I would use this version, which selects the columns by name rather than by position:
library(data.table)
dat <- as.data.table(dat)
dat[,lapply(.SD,mean),by="V4",.SDcols=paste0("V", 9:118)]
Old solution
dat[,lapply(.SD,mean),by="V4",.SDcols=9:118]
You can use the data.frame method of aggregate, whose signature is:
## S3 method for class 'data.frame'
aggregate(x, by, FUN, ..., simplify = TRUE)
With your data, assuming it is in a data frame DF:
DF <- read.table(text = txt, header = FALSE, stringsAsFactors = FALSE)
result <- aggregate(DF[, 9:118], by = list(DF[, 4]), FUN = mean)
# Using pander to print result table nicely. It's not needed for aggregation :)
require(pander)
pandoc.table(result)
##
## ----------------------------------------------------
## Group.1 V9 V10 V11 V12 V13 V14
## ---------------- ----- ----- ----- ----- ----- -----
## 5'to3'ExonIntron 161.9 148 131 122.7 109.7 98.1
##
## 5'to3'IntronExon 0.0 0 0 0.0 0.0 0.0
## ----------------------------------------------------
##
## Table: Table continues below
##
##
## -----------------------------------------------
## V15 V16 V17 V18 V19 V20 V21 V22
## ----- ----- ----- ----- ----- ----- ----- -----
## 81.5 66.6 52.3 39.5 26.1 18.7 12.4 9.3
##
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
## -----------------------------------------------
##
## Table: Table continues below
##
##
## -----------------------------------------------
## V23 V24 V25 V26 V27 V28 V29 V30
## ----- ----- ----- ----- ----- ----- ----- -----
## 7.2 4.6 1.8 0.4 0 0 0 0.5
##
## 0.0 0.0 0.0 0.0 0 0 0 0.0
## -----------------------------------------------
##
## Table: Table continues below
##
##
## -----------------------------------------------
## V31 V32 V33 V34 V35 V36 V37 V38
## ----- ----- ----- ----- ----- ----- ----- -----
## 0.9 1.5 1.8 2.4 2.7 5 6.4 9.1
##
## 0.0 0.0 0.0 0.0 0.0 0 0.0 0.0
## -----------------------------------------------
##
## Table: Table continues below
##
##
## -----------------------------------------------
## V39 V40 V41 V42 V43 V44 V45 V46
## ----- ----- ----- ----- ----- ----- ----- -----
## 13 16.2 19.2 21.5 23 24.7 28 29.7
##
## 0 0.0 0.0 0.0 0 0.0 0 0.0
## -----------------------------------------------
##
## Table: Table continues below
##
##
## -----------------------------------------------
## V47 V48 V49 V50 V51 V52 V53 V54
## ----- ----- ----- ----- ----- ----- ----- -----
## 36.9 45.7 59.5 73.3 89.2 101.3 106.2 114
##
## 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0
## -----------------------------------------------
##
## Table: Table continues below
##
##
## -----------------------------------------------
## V55 V56 V57 V58 V59 V60 V61 V62
## ----- ----- ----- ----- ----- ----- ----- -----
## 127.3 134 140.7 148.1 156.2 160.4 167.4 175.7
##
## 0.0 0 0.0 0.0 0.0 0.0 0.0 0.0
## -----------------------------------------------
##
## Table: Table continues below
##
##
## -----------------------------------------------
## V63 V64 V65 V66 V67 V68 V69 V70
## ----- ----- ----- ----- ----- ----- ----- -----
## 183.9 183.1 183.7 182.3 178.1 169 163.8 156.7
##
## 0.0 0.0 0.0 0.0 0.0 0 0.0 0.0
## -----------------------------------------------
##
## Table: Table continues below
##
##
## -----------------------------------------------
## V71 V72 V73 V74 V75 V76 V77 V78
## ----- ----- ----- ----- ----- ----- ----- -----
## 149.8 126.6 86.3 74.0 64.0 59.8 56.0 50.6
##
## 0.7 0.9 0.9 1.5 1.8 1.8 1.8 1.8
## -----------------------------------------------
##
## Table: Table continues below
##
##
## -----------------------------------------------
## V79 V80 V81 V82 V83 V84 V85 V86
## ----- ----- ----- ----- ----- ----- ----- -----
## 45.6 42.2 38.7 31.9 24.9 18.6 14.1 9.4
##
## 1.8 1.8 1.8 1.8 1.8 1.8 2.2 2.7
## -----------------------------------------------
##
## Table: Table continues below
##
##
## -----------------------------------------------
## V87 V88 V89 V90 V91 V92 V93 V94
## ----- ----- ----- ----- ----- ----- ----- -----
## 7.6 6.2 5.1 3.7 2.9 2.5 2.7 2.7
##
## 2.7 2.7 2.7 2.7 2.7 2.7 2.2 0.9
## -----------------------------------------------
##
## Table: Table continues below
##
##
## --------------------------------------------------
## V95 V96 V97 V98 V99 V100 V101 V102
## ----- ----- ----- ----- ----- ------ ------ ------
## 4.2 4.5 4.5 4.5 4.5 4.5 4.3 2.5
##
## 0.9 0.9 0.9 1.4 4.1 5.4 6.9 10.6
## --------------------------------------------------
##
## Table: Table continues below
##
##
## -------------------------------------------------------
## V103 V104 V105 V106 V107 V108 V109 V110
## ------ ------ ------ ------ ------ ------ ------ ------
## 1.8 1.8 1.8 1.8 1.8 1.8 1.8 1.8
##
## 13.7 18.4 30.2 40.4 53.3 63.0 77.3 90.3
## -------------------------------------------------------
##
## Table: Table continues below
##
##
## -------------------------------------------------------
## V111 V112 V113 V114 V115 V116 V117 V118
## ------ ------ ------ ------ ------ ------ ------ ------
## 1.8 1.8 1.8 1.8 1.4 0.5 0.0 0.0
##
## 104.5 117.3 134.3 147.8 164.8 181.0 189.4 199.2
## -------------------------------------------------------
##