Principal component analysis (princomp, principal, etc.) on a 3D array - r

I have used PCA on 2D arrays before, and I use the first PC score vector that best best describes the variance of all the other columns in analyses. Below is a R example that shows the Comp.1 vector that would best describe the variance of the 2D array of interest.
data <- array(data=sample(12), c(4,3))
data
[,1] [,2] [,3]
[1,] 11 2 12
[2,] 4 3 10
[3,] 8 7 1
[4,] 6 9 5
output=princomp(data)
output$scores
Comp.1 Comp.2 Comp.3
[1,] 6.422813 2.865390 0.4025040
[2,] 3.251842 -3.617633 -0.9814571
[3,] -5.856500 1.848419 -1.3819379
[4,] -3.818155 -1.096176 1.9608909
My question is how can I do this same procedure on a 3D array? For example, if I have an array that the size is 4 x 5 x 3 how could I get the 4 x 5 2D array that is equivalent to the Comp.1 vector found above?
I have provided an R example below with code and outputs. When I look at the scores it only outputs one component (not 3 as expected), and the length is 60. Does that mean that the first 20 elements correspond to the first PC, the next 20 to the 2nd PC, and the last 20 to the 3rd PC? If so how does princomp arrange the entries, so I can get back to the original 4 x 5 2D array using the first 20 elements (1st PC)? Thank you for your assistance.
data=array(data=sample(48), c(4,5,3))
data
, , 1
[,1] [,2] [,3] [,4] [,5]
[1,] 47 21 45 41 34
[2,] 1 16 32 31 37
[3,] 39 8 35 10 6
[4,] 48 14 25 3 11
, , 2
[,1] [,2] [,3] [,4] [,5]
[1,] 12 43 15 36 23
[2,] 17 4 7 26 46
[3,] 2 13 33 20 40
[4,] 18 19 28 44 38
, , 3
[,1] [,2] [,3] [,4] [,5]
[1,] 42 24 47 21 45
[2,] 5 22 1 16 32
[3,] 30 29 39 8 35
[4,] 27 9 48 14 25
output=princomp(data)
output$scores
Comp.1
[1,] 21.8833333
[2,] -24.1166667
[3,] 13.8833333
[4,] 22.8833333
[5,] -4.1166667
[6,] -9.1166667
[7,] -17.1166667
[8,] -11.1166667
[9,] 19.8833333
[10,] 6.8833333
[11,] 9.8833333
[12,] -0.1166667
[13,] 15.8833333
[14,] 5.8833333
[15,] -15.1166667
[16,] -22.1166667
[17,] 8.8833333
[18,] 11.8833333
[19,] -19.1166667
[20,] -14.1166667
[21,] -13.1166667
[22,] -8.1166667
[23,] -23.1166667
[24,] -7.1166667
[25,] 17.8833333
[26,] -21.1166667
[27,] -12.1166667
[28,] -6.1166667
[29,] -10.1166667
[30,] -18.1166667
[31,] 7.8833333
[32,] 2.8833333
[33,] 10.8833333
[34,] 0.8833333
[35,] -5.1166667
[36,] 18.8833333
[37,] -2.1166667
[38,] 20.8833333
[39,] 14.8833333
[40,] 12.8833333
[41,] 16.8833333
[42,] -20.1166667
[43,] 4.8833333
[44,] 1.8833333
[45,] -1.1166667
[46,] -3.1166667
[47,] 3.8833333
[48,] -16.1166667
[49,] 21.8833333
[50,] -24.1166667
[51,] 13.8833333
[52,] 22.8833333
[53,] -4.1166667
[54,] -9.1166667
[55,] -17.1166667
[56,] -11.1166667
[57,] 19.8833333
[58,] 6.8833333
[59,] 9.8833333
[60,] -0.1166667

Related

List to data frame while keeping structure

str(coord_mat)
List of 1
$ :List of 1
..$ : num [1:17, 1:2] -122 -122 -122 -122 -122 ...
I have list of coordinate pairs in coord_mat that I would like to transform in to data frame (or matrix) of coordinate pairs in the same structure(first column being lon, the second being lat).
> coord_mat
[[1]]
[[1]][[1]]
[,1] [,2]
[1,] -122.3435 47.63787
[2,] -122.3435 47.63787
[3,] -122.3434 47.63787
[4,] -122.3434 47.63787
[5,] -122.3434 47.63787
[6,] -122.3434 47.63787
[7,] -122.3434 47.63787
[8,] -122.3434 47.63784
[9,] -122.3433 47.63777
[10,] -122.3430 47.63772
[11,] -122.3427 47.63778
[12,] -122.3425 47.63776
[13,] -122.3423 47.63749
[14,] -122.3421 47.63718
[15,] -122.3420 47.63700
[16,] -122.3419 47.63698
[17,] -122.3419 47.63698
How is this possible in R while keeping the same double column structure as the list?
I have tried matrix(unlist(coord_mat)), but this just produces a long vector of length 34 with the lon values first then the lat values. Is it because I am working with a list of lists?
> matrix(unlist(coord_mat))
[,1]
[1,] -122.34345
[2,] -122.34345
[3,] -122.34340
[4,] -122.34340
[5,] -122.34340
[6,] -122.34340
[7,] -122.34340
[8,] -122.34338
[9,] -122.34334
[10,] -122.34299
[11,] -122.34273
[12,] -122.34249
[13,] -122.34230
[14,] -122.34208
[15,] -122.34198
[16,] -122.34194
[17,] -122.34194
[18,] 47.63787
[19,] 47.63787
[20,] 47.63787
[21,] 47.63787
[22,] 47.63787
[23,] 47.63787
[24,] 47.63787
[25,] 47.63784
[26,] 47.63777
[27,] 47.63772
[28,] 47.63778
[29,] 47.63776
[30,] 47.63749
[31,] 47.63718
[32,] 47.63700
[33,] 47.63698
[34,] 47.63698
Here is the data:
dput(coord_mat)
list(list(structure(c(-122.34345, -122.34345, -122.343398333333,
-122.343398333333, -122.343398333333, -122.343398333333, -122.343398333333,
-122.343376666667, -122.34334, -122.342991666667, -122.342731666667,
-122.342491666667, -122.3423, -122.342081666667, -122.341983333333,
-122.341943333333, -122.341943333333, 47.6378716666667, 47.6378716666667,
47.6378683333333, 47.6378683333333, 47.6378683333333, 47.6378683333333,
47.6378683333333, 47.637835, 47.637775, 47.6377183333333, 47.63778,
47.63776, 47.6374916666667, 47.6371816666667, 47.6369966666667,
47.6369783333333, 47.6369783333333), .Dim = c(17L, 2L))))
res <- coord_mat[[c(1, 1)]]
# or
res <- matrix(unlist(coord_mat), ncol = 2)
colnames(res) <- c("lon", "lat")
res
lon lat
[1,] -122.3435 47.63787
[2,] -122.3435 47.63787
[3,] -122.3434 47.63787
[4,] -122.3434 47.63787
[5,] -122.3434 47.63787
[6,] -122.3434 47.63787
[7,] -122.3434 47.63787
[8,] -122.3434 47.63784
[9,] -122.3433 47.63777
[10,] -122.3430 47.63772
[11,] -122.3427 47.63778
[12,] -122.3425 47.63776
[13,] -122.3423 47.63749
[14,] -122.3421 47.63718
[15,] -122.3420 47.63700
[16,] -122.3419 47.63698
[17,] -122.3419 47.63698

error in storing unique values in 2d vector in r

> uc<-unique(r1$COMPANY)
> uc
[1] AZTEC CALIBER POINT COGNIZANT CYBAGE CYBAGE DLF
[7] GODREJ AND BOYCE LTD. HCL TECHNOLOGIES I-FLEX INFOCEPTS INFOSYS JATAAYU SOFTWARE (P) LTD.
[13] KANBAY KPIT L & T LTD. L & T INFOTECH MASTEK mBlazon SOLUTION PVT. LTD.
[19] MOTOR INDUSTRIES LTD NOVATECH PATNI COMPUTER SOFTWARE RF ARRAYS S.M. WIRELESS PVT. LTD. SATYAM COMPUTERS
[25] SATYAM COMPUTERS SATYAM COMPUTERS LTD. SHOBHA DEVELOPERS SOHAM's FOUNDTION ENGG. SYNTEL LTD. TCS
[31] TECH MAHINDRA LTD. ULTRA TECH CEMENT VRITTI SOLUTIONS ABO SOFTWARE ARTEFACT PROJECTS LTD. EATON
[37] FORCE MOTORS H.C.C. HEXAWARE TECHNOLOGIES HJB GROUP COMPANY, OMAN IBM DAKSH INDIAN MILITARY ACADEMY
[43] INDO RAMA SYNTHETICS INFOSPECTRUM PVT. LTD. JYOTI STRUCTUIRES LTD. KALPATARU KONE ELEVATORS L & T ( e- SOLUTIONS)
[49] LAMBENT MAHINDRA & MAHINDRA LTD. MAYTAS INFRA PVT. LTD. MOTOR INDUSTRIES LTD . ORIENT CEMENT PERSISTENT SYSTEMS PVT.LTD
[55] PREMIERE TECHNMOLOGY SCHNEIDER SETH CONSTRUCTION SIEMENS SIMPLEX SMS PARYAWARAN
[61] SOFT LINK INTERNATIONAL VARROC ENGINEERING, PUNE
> f<-length(uc)
> tt<-mat.or.vec(length(uc),3)
> for(n in 1:f)
+ {
+ tt[n,1]=uc[n]
+ tt[n,2]=5
+ tt[n,3]=9
+ }
> tt
[,1] [,2] [,3]
[1,] 3 5 9
[2,] 4 5 9
[3,] 5 5 9
[4,] 6 5 9
[5,] 7 5 9
[6,] 8 5 9
[7,] 11 5 9
[8,] 13 5 9
[9,] 16 5 9
[10,] 20 5 9
[11,] 22 5 9
[12,] 23 5 9
[13,] 26 5 9
[14,] 28 5 9
[15,] 29 5 9
[16,] 31 5 9
[17,] 34 5 9
[18,] 36 5 9
[19,] 37 5 9
[20,] 39 5 9
[21,] 41 5 9
[22,] 44 5 9
[23,] 45 5 9
[24,] 46 5 9
[25,] 47 5 9
[26,] 48 5 9
[27,] 51 5 9
[28,] 56 5 9
[29,] 57 5 9
[30,] 58 5 9
[31,] 59 5 9
[32,] 60 5 9
[33,] 62 5 9
[34,] 1 5 9
[35,] 2 5 9
[36,] 9 5 9
[37,] 10 5 9
[38,] 12 5 9
[39,] 14 5 9
[40,] 15 5 9
[41,] 17 5 9
[42,] 18 5 9
[43,] 19 5 9
[44,] 21 5 9
[45,] 24 5 9
[46,] 25 5 9
[47,] 27 5 9
[48,] 30 5 9
[49,] 32 5 9
[50,] 33 5 9
[51,] 35 5 9
[52,] 38 5 9
[53,] 40 5 9
[54,] 42 5 9
[55,] 43 5 9
[56,] 49 5 9
[57,] 50 5 9
[58,] 52 5 9
[59,] 53 5 9
[60,] 54 5 9
[61,] 55 5 9
[62,] 61 5 9
> strsplit(r1$COMPANY," ")
Error in strsplit(r1$COMPANY, " ") : non-character argument
I want t[n,1] to store all unique values of r1$COMPANY present in vector uc but it shows some random numbers.Please help to resolve the error.Also I would like to know how to obtain just first word from r1$COMPANY. I tried to split it on space character but it shows error.
As #Jason and #Metrics mentioned, it appears that uc is a factor rather than a character. To remedy this, define uc as follows:
uc <- with(r1, as.character(unique(COMPANY)))
The key here is as.character(), which will convert a factor variable to a character variable. Any string manipulation functions should now work on uc.
# I think `uc` is factor in your case (check using str(uc)). If it is character, it will give the solution as expected. Consider the following example:
uc<-names(mtcars)
#str(uc)
#chr [1:11] "mpg" "cyl" "disp" "hp" "drat" "wt" "qsec" "vs" "am" "gear" "carb"
f<-length(uc)
tt<-mat.or.vec(f,3)
for(n in 1:f) {
tt[n,1]<-uc[n]
tt[n,2]<-5
tt[n,3]<-9
}
> tt
tt
[,1] [,2] [,3]
[1,] "mpg" "5" "9"
[2,] "cyl" "5" "9"
[3,] "disp" "5" "9"
[4,] "hp" "5" "9"
[5,] "drat" "5" "9"
[6,] "wt" "5" "9"
[7,] "qsec" "5" "9"
[8,] "vs" "5" "9"
[9,] "am" "5" "9"
[10,] "gear" "5" "9"
[11,] "carb" "5" "9"

How to revert back scalling of data in R?

I had a data set with data like this:
value
[1,] 41601325
[2,] 54917632
[3,] 64616616
[4,] 90791277
[5,] 35335221
[6,] .
. .
. .
which I had to scale down to range [0,1] using
apply(data1, MARGIN = 2, FUN = function(X) (X - min(X))/diff(range(X)))
as I needed to fit the data in GP_fit() of GPfit package. The scaled down values became say:
value
[1,] .4535
[2,] .56355
[3,] .64616
[4,] .70791
[5,] .35563
[6,] .
. .
. .
After I applied GP_fit() on the scaled data and used predict() and as output I got the new values which again are in range[0,1] like:
value
[1,] .0135
[2,] .234355
[3,] .6716
[4,] .325079
[5,] .95563
[6,] .
. .
. .
but I want to take these back to the original range. How can I do that?
Basically I want to revert back/ return to original format for showing the output of predict()
NOTE: The original range is not fixed and can vary but normally the maximum value possible is about 20 million .
UPDATE: I tired to implement the code written by #JustinFletcher. My data was :
value
[1,] 54.2
[2,] 53.8
[3,] 53.9
[4,] 53.8
[5,] 54.9
[6,] 55.0
[7,] 38.5
[8,] 38.0
[9,] 38.1
[10,] 38.0
[11,] 38.8
[12,] 38.9
[13,] 24.3
[14,] 24.1
[15,] 24.3
[16,] 24.1
[17,] 24.4
[18,] 24.4
[19,] 57.3
[20,] 57.2
[21,] 57.6
[22,] 57.7
[23,] 58.1
[24,] 57.9
I wrote this to rescale it in range [0,1]:
data_new <- apply(data_test, MARGIN = 2, FUN = function(X) (X - min(X))/diff(range(X)))
and I got
value
[1,] 0.885294118
[2,] 0.873529412
[3,] 0.876470588
[4,] 0.873529412
[5,] 0.905882353
[6,] 0.908823529
[7,] 0.423529412
[8,] 0.408823529
[9,] 0.411764706
[10,] 0.408823529
[11,] 0.432352941
[12,] 0.435294118
[13,] 0.005882353
[14,] 0.000000000
[15,] 0.005882353
[16,] 0.000000000
[17,] 0.008823529
[18,] 0.008823529
[19,] 0.976470588
[20,] 0.973529412
[21,] 0.985294118
[22,] 0.988235294
[23,] 1.000000000
[24,] 0.994117647
then to revert it back to original scale I wrote this:
data_revert <- apply(data_new, MARGIN = 2, FUN = function(X, Y) (X + min(Y))*diff(range(Y)), Y=data_test)
and I got
value
[1,] 849.5
[2,] 849.1
[3,] 849.2
[4,] 849.1
[5,] 850.2
[6,] 850.3
[7,] 833.8
[8,] 833.3
[9,] 833.4
[10,] 833.3
[11,] 834.1
[12,] 834.2
[13,] 819.6
[14,] 819.4
[15,] 819.6
[16,] 819.4
[17,] 819.7
[18,] 819.7
[19,] 852.6
[20,] 852.5
[21,] 852.9
[22,] 853.0
[23,] 853.4
[24,] 853.2
This output is not correct.
This is simple algebra. To scale data, you calculate
n = (e - e_min)/(e_max - e_min)
Now you need back e, based on arbitrary e_min and e_max. It is trivial to show that
n(e_max - e_min) + e_min = e
Example:
e <- 1:10
n <- (e - min(e))/(max(e) - min(e))
new.e <- (n*(10-1)) + 1
> all(e == new.e)
[1] TRUE
You just need to apply the inverse of the function FUN to the output data. This requires the original data be passed to the function.
apply(dataOutput, MARGIN = 2, FUN = function(X, Y) (X + min(Y))*diff(range(Y)), Y=data1)
For a great description of the 'apply' function, see here.
P.S.: Romans response is exactly the same idea, I just implemented it with your variables, using apply, because I thought it was interesting.

Displaying 3D using Wireframe in R

I am trying to plot 3D graphs in R using Wireframe, but I have problem in displaying the graph correctly. The data is the following:
[1,] 1 1.000000 1
[2,] 2 1.709133 1
[3,] 4 3.278188 1
[4,] 8 5.082078 1
[5,] 16 5.753403 1
[6,] 32 5.778228 1
[7,] 64 5.783567 1
[8,] 1 1.000000 2
[9,] 2 1.709133 2
[10,] 4 3.278429 2
[11,] 8 5.081508 2
[12,] 16 5.751819 2
[13,] 32 5.777714 2
[14,] 64 5.783520 2
[15,] 1 1.000000 3
[16,] 2 1.709133 3
[17,] 4 3.278632 3
[18,] 8 5.079604 3
[19,] 16 5.753117 3
[20,] 32 5.777558 3
[21,] 64 5.783742 3
[22,] 1 1.000000 4
[23,] 2 1.709133 4
[24,] 4 3.278708 4
[25,] 8 5.080512 4
[26,] 16 5.753243 4
[27,] 32 5.778988 4
[28,] 64 5.782796 4
[29,] 1 1.000000 5
[30,] 2 1.709133 5
[31,] 4 3.278253 5
[32,] 8 5.082100 5
[33,] 16 5.752612 5
[34,] 32 5.778187 5
[35,] 64 5.783359 5
[36,] 1 1.000000 6
[37,] 2 1.709133 6
[38,] 4 3.278576 6
[39,] 8 5.078772 6
[40,] 16 5.753112 6
[41,] 32 5.777878 6
[42,] 64 5.784069 6
[43,] 1 1.000000 7
[44,] 2 1.709133 7
[45,] 4 3.277898 7
[46,] 8 5.081783 7
[47,] 16 5.753860 7
[48,] 32 5.777794 7
[49,] 64 5.784079 7
[50,] 1 1.000000 8
[51,] 2 1.709133 8
[52,] 4 3.278517 8
[53,] 8 5.080553 8
[54,] 16 5.750771 8
[55,] 32 5.779782 8
[56,] 64 5.783110 8
[57,] 1 1.000000 9
[58,] 2 1.709133 9
[59,] 4 3.278196 9
[60,] 8 5.080629 9
[61,] 16 5.753407 9
[62,] 32 5.777428 9
[63,] 64 5.784100 9
[64,] 1 1.000000 10
[65,] 2 1.709133 10
[66,] 4 3.278395 10
[67,] 8 5.081113 10
[68,] 16 5.752613 10
[69,] 32 5.777564 10
[70,] 64 5.783312 10
I tried the following code:
wireframe(temp, scales = list(arrows = FALSE,
x = list(labels = 2^seq(0, 6)),
y = list(labels = seq(1,7)),
z = list (labels =seq(1,7))), drape = TRUE, colorkey = TRUE,
screen = list(z = 30, x = -60),
xlab = "X", ylab = "Y", zlab = "Z")
I don't know why the graph is displayed like that:
If I tried to display the same code for the first 7 rows only the figure is like that:
Anyone knows what's the problem?.
I think you need to use the aspect argument here.
wireframe(as.matrix(dat),
scales = list(arrows = FALSE, x = list(labels = 2^seq(0, 6)),
y = list(labels = seq(1,7)), z = list(labels =seq(1,7))),
drape = TRUE, colorkey = TRUE, aspect = c(61/87, 0.4),
screen = list(z = 30, x = -60), xlab = "X", ylab = "Y", zlab = "Z")

r language getting a not so good neural network

i am trying to fit a neural net using nnet method. But for some reason i am getting fitted values =1 . any suggestion?
traininginput <- as.data.frame(runif(50, min=0, max=100))
trainingoutput <- sqrt(traininginput)
#Column bind the data into one variable
trainingdata <- cbind(traininginput,trainingoutput)
colnames(trainingdata) <- c("Input","Output")
nnet1=nnet(trainingdata$Input,trainingdata$Output,size=10,decay=.2,MaxNWts=100)
nnet1=nnet(trainingdata$Input,trainingdata$Output,size=10,decay=.2,MaxNWts=100)
# weights: 31
initial value 2398.911170
iter 10 value 1881.721970
iter 20 value 1879.273609
iter 30 value 1879.248746
final value 1879.248003
converged
nnet1$fitted.values
[,1]
[1,] 0.9995635611
[2,] 0.9995572993
[3,] 0.9994755520
[4,] 0.9995623911
[5,] 0.9946006508
[6,] 0.9995635779
[7,] 0.9995108061
[8,] 0.9995635025
[9,] 0.9995634973
[10,] 0.9993213029
[11,] 0.9994652808
[12,] 0.9979116411
[13,] 0.9993242430
[14,] 0.9995635764
[15,] 0.9995632595
[16,] 0.9995583546
[17,] 0.9992778848
[18,] 0.9995635364
[19,] 0.9939526885
[20,] 0.9995635788
[21,] 0.9993010783
[22,] 0.9995597586
[23,] 0.9995635748
[24,] 0.9995635799
[25,] 0.9995634813
[26,] 0.9992898229
[27,] 0.9959834977
[28,] 0.9991941868
[29,] 0.9995632224
[30,] 0.9995486025
[31,] 0.9995608608
[32,] 0.9995635755
[33,] 0.9995635740
[34,] 0.9995491922
[35,] 0.9995635661
[36,] 0.9995629924
[37,] 0.9995634948
[38,] 0.9994889240
[39,] 0.9995633558
[40,] 0.9995559725
[41,] 0.9995563696
[42,] 0.9995564636
[43,] 0.9995602699
[44,] 0.9995635768
[45,] 0.9995612253
[46,] 0.9895069469
[47,] 0.9995271104
[48,] 0.9995635564
[49,] 0.9995635797
[50,] 0.9995609961
Add the linout = TRUE argument to the nnet function:
nnet1=nnet(trainingdata$Input,trainingdata$Output,size=10,decay=.2,MaxNWts=100, linout = TRUE)
That should solve your problem! By default, the fitted values are logistic output units - see ?nnet.

Resources