How to make a random strata sample in R? - r

I have a data.frame called "per" that has three variables: nrodocumento, cod_jer (42 groups) and grupo_fict (8 groups). I would like to draw a random sample (as a data.frame) within each cod_jer and, inside it, within each grupo_fict.
> dput(head(per))
structure(list(nrodocumento = c(49574917L, 54692750L, 54731807L,
57364176L, 57364198L, 46867674L), cod_jer = c(1146L, 32L, 0L,
0L, 0L, 0L), grupo_fict = c(3L, 1L, 8L, 1L, 1L, 1L)), .Names =
c("nrodocumento",
"cod_jer", "grupo_fict"), row.names = c(NA, 6L), class = "data.frame")
> head(per,n=100)
nrodocumento cod_jer grupo_fict
1 49574917 1146 3
2 54692750 32 1
3 54731807 0 8
4 57364176 0 1
5 57364198 0 1
6 46867674 0 1
7 46867668 0 1
8 57364201 0 1
9 53767871 0 1
10 55339012 0 1
11 49204318 0 8
12 53743017 0 1
13 47622958 0 1
14 49019862 0 1
15 50167428 0 2
16 48783260 0 4
17 52020945 433 5
18 54486680 236 4
19 51402916 0 4
20 48543242 0 2
21 54671603 0 1
22 50644599 0 8
23 53293608 0 1
24 52742799 0 4
25 49815210 0 8
26 50967719 236 3
27 51938997 0 8
28 50057188 324 3
29 52754706 0 6
30 55322102 0 3
31 53040748 0 1
32 50321642 0 5
33 51621354 236 8
34 49611806 0 7
35 53347667 0 8
36 52462498 0 3
37 54158570 0 8
38 54034849 0 8
39 52507674 321 3
40 50218598 317 7
41 45078442 432 7
42 51491066 0 8
43 53278953 0 2
44 52661658 0 2
45 50092873 236 3
46 50308064 0 7
47 51941635 0 7
48 53527966 0 1
49 49614579 0 1
50 49450678 318 8
51 52953427 1146 7
52 52133221 0 8
53 53363128 0 7
54 52819643 0 1
55 47516589 0 1
56 52563137 0 3
57 49511296 0 7
58 54154013 0 2
59 50822420 1349 4
60 50822408 1349 4
61 50822414 1349 6
62 52339683 0 1
63 50026113 0 7
64 47328586 0 7
65 56041961 0 7
66 47756955 432 8
67 53158397 0 7
68 53151167 0 7
69 54710039 0 3
70 54408844 114 4
71 46286323 114 4
72 50310877 0 1
73 50929135 0 7
74 49817218 0 1
75 53604540 0 8
76 52812736 1147 1
77 53726314 1147 1
78 50835936 0 8
79 55429334 0 1
80 48421020 329 8
81 49800217 0 3
82 52818263 0 1
83 45884978 0 1
84 50203385 0 1
85 53433610 0 2
86 54515938 0 1
87 50263935 0 8
88 52439152 0 2
89 48424129 236 3
90 47031563 0 8
91 53577610 11 1
92 48759083 11 1
93 50344731 432 1
94 51164013 0 3
95 52026977 163 7
96 50965482 0 3
97 45947594 433 8
98 53357234 0 7
99 48367529 0 8
100 54286153 0 3
> table(per$cod_jer,per$grupo_fict)
1 2 3 4 5 6 7 8
0 3990 2296 1743 1453 356 250 2031 2051
11 149 85 29 34 14 6 34 25
13 2 4 1 0 0 0 1 1
14 3 1 0 0 0 0 0 1
32 37 12 13 10 3 1 23 13
101 19 12 6 5 3 0 6 12
102 2 0 0 0 0 0 0 0
103 11 10 3 3 0 1 3 0
104 17 8 1 7 2 1 7 9
105 11 12 3 3 3 0 6 10
106 147 57 30 29 8 1 43 42
107 33 37 5 9 3 2 8 9
108 6 10 2 3 0 2 3 4
109 44 37 11 9 6 2 14 14
111 112 81 26 28 8 3 22 18
112 21 8 4 8 2 0 3 2
113 94 61 14 16 4 1 17 24
114 60 52 10 14 9 5 8 20
115 72 24 21 13 5 1 11 16
125 5 4 1 0 1 0 0 1
138 15 5 2 2 1 0 2 0
163 50 35 26 26 7 12 43 41
234 51 43 31 32 10 7 49 53
236 78 29 46 35 7 7 39 37
317 44 28 21 13 7 2 28 21
318 20 27 5 10 4 3 12 14
319 45 21 25 19 1 2 26 21
321 6 4 9 3 0 3 8 1
322 43 30 24 16 5 3 16 34
323 30 14 25 15 3 4 24 22
324 59 29 31 27 8 5 28 27
325 15 12 6 5 1 2 8 11
326 18 12 17 13 4 2 20 15
327 45 28 23 26 7 6 25 40
328 52 49 33 32 5 9 31 35
329 42 36 26 20 2 3 23 30
431 6 2 4 1 2 0 2 6
432 39 18 27 24 5 1 28 34
433 139 92 90 89 18 13 61 66
1146 97 49 26 14 7 5 24 29
1147 56 33 26 25 9 0 19 20
1349 15 9 11 10 0 1 10 3
1544 62 33 20 32 4 3 25 43
1545 37 13 22 14 1 3 14 31
1848 16 27 11 15 3 0 10 12
On the other hand, I have a data.frame with vacancies, i.e. the size of the sample I need inside each group.
> dput(head(vacantes))
structure(list(cod_jer = c(101L, 316L, 325L, 1349L, 1544L, 102L
), vacantes = c(132, 180, 54, 63, 45, 0), vac1 = c(27, 36, 11,
13, 9, 0), vac2 = c(27, 36, 11, 13, 9, 0), vac3 = c(24, 33, 10,
12, 9, 0), vac4 = c(24, 33, 10, 12, 9, 0), vac5 = c(8, 11, 4,
4, 3, 0), vac6 = c(8, 11, 4, 4, 3, 0), vac7 = c(7, 10, 3, 3,
2, 0), vac8 = c(7, 10, 3, 3, 2, 0)), .Names = c("cod_jer", "vacantes",
"vac1", "vac2", "vac3", "vac4", "vac5", "vac6", "vac7", "vac8"
), row.names = c(NA, 6L), class = "data.frame")
> vacantes
cod_jer vacantes vac1 vac2 vac3 vac4 vac5 vac6 vac7 vac8
1 101 132 27 27 24 24 8 8 7 7
2 316 180 36 36 33 33 11 11 10 10
3 325 54 11 11 10 10 4 4 3 3
4 1349 63 13 13 12 12 4 4 3 3
5 1544 45 9 9 9 9 3 3 2 2
6 102 0 0 0 0 0 0 0 0 0
7 103 0 0 0 0 0 0 0 0 0
8 104 0 0 0 0 0 0 0 0 0
9 105 0 0 0 0 0 0 0 0 0
10 106 0 0 0 0 0 0 0 0 0
11 107 0 0 0 0 0 0 0 0 0
12 108 0 0 0 0 0 0 0 0 0
13 109 0 0 0 0 0 0 0 0 0
14 110 0 0 0 0 0 0 0 0 0
15 111 0 0 0 0 0 0 0 0 0
16 112 0 0 0 0 0 0 0 0 0
17 113 0 0 0 0 0 0 0 0 0
18 114 0 0 0 0 0 0 0 0 0
19 115 0 0 0 0 0 0 0 0 0
20 137 0 0 0 0 0 0 0 0 0
21 138 0 0 0 0 0 0 0 0 0
22 139 0 0 0 0 0 0 0 0 0
23 140 0 0 0 0 0 0 0 0 0
24 234 0 0 0 0 0 0 0 0 0
25 236 0 0 0 0 0 0 0 0 0
26 317 0 0 0 0 0 0 0 0 0
27 318 0 0 0 0 0 0 0 0 0
28 319 0 0 0 0 0 0 0 0 0
29 320 0 0 0 0 0 0 0 0 0
30 321 0 0 0 0 0 0 0 0 0
31 322 0 0 0 0 0 0 0 0 0
32 323 0 0 0 0 0 0 0 0 0
33 324 0 0 0 0 0 0 0 0 0
34 326 0 0 0 0 0 0 0 0 0
35 327 0 0 0 0 0 0 0 0 0
36 328 0 0 0 0 0 0 0 0 0
37 329 0 0 0 0 0 0 0 0 0
38 431 0 0 0 0 0 0 0 0 0
39 432 0 0 0 0 0 0 0 0 0
40 433 0 0 0 0 0 0 0 0 0
41 1146 0 0 0 0 0 0 0 0 0
42 1147 0 0 0 0 0 0 0 0 0
43 1545 0 0 0 0 0 0 0 0 0
44 1630 0 0 0 0 0 0 0 0 0
45 1848 0 0 0 0 0 0 0 0 0
I would like to draw a stratified sample within each combination of cod_jer and grupo_fict; where the number of vacancies is 0, the sample size should be 0.
I was trying this:
# Keep only the eight per-group size columns (vac1..vac8) of `vacantes`.
size=subset(vacantes,select=c(vac1,vac2,vac3,vac4,vac5,vac6,vac7,vac8))
# Flatten to a plain vector; as.vector() on a matrix unrolls it column by
# column, so all vac1 values come first, then all vac2 values, and so on.
size=as.matrix(size)
size=as.vector(size)
# NOTE(review): each pass tests size[i] but hands the whole `size` vector to
# strata(), and `s` is overwritten on every iteration, so only the result of
# the last iteration survives the loop.
for(i in 1:length(size)) {
if (size[i] > 0 ) {
# presumably sampling::strata(); "srswor" would then be simple random
# sampling without replacement -- TODO confirm which package is loaded.
s=strata(per,c("cod_jer","grupo_fict"),size=size,
method="srswor")
} else {
s="0"
}}
But I can't get it to work :(
Any suggestions?
Thanks!

Related

Calculate weighted mean from matrix in R

I have a matrix that looks like the following. For rows 1:23, I would like to calculate the weighted mean, where the data in rows 1:23 are the weights and row 24 is the data.
1 107 33 41 22 12 4 122 44 297 123 51 16 7 9 1 1 0
10 5 2 2 1 0 3 4 6 12 3 3 0 1 1 0 0 0
11 1 3 1 0 0 0 4 2 8 3 4 0 0 0 0 0 0
12 2 1 1 0 0 0 2 1 5 6 3 1 0 0 0 0 0
13 1 0 1 0 0 0 3 1 3 5 2 2 0 1 0 0 0
14 3 0 0 0 0 0 3 1 2 3 0 1 0 0 0 0 0
15 0 0 0 0 0 0 2 0 0 1 0 1 0 0 0 0 0
16 0 0 0 0 1 0 0 0 2 0 0 0 0 0 0 0 0
17 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
18 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
19 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
2 80 27 37 5 6 4 97 48 242 125 44 27 7 8 8 0 2
20 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
21 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
22 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
23 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
3 47 12 33 12 6 1 63 42 200 96 45 19 6 6 9 2 0
4 45 14 21 9 4 2 54 26 130 71 36 17 8 5 1 0 2
5 42 10 14 6 3 2 45 19 89 45 26 7 4 8 2 1 0
6 17 3 12 5 2 0 18 21 51 41 19 15 5 1 1 0 0
7 16 2 6 0 0 1 14 9 37 23 17 7 3 0 3 0 0
8 9 4 4 2 1 0 7 9 30 15 8 3 3 1 1 0 1
9 12 2 3 1 1 1 6 5 14 12 5 1 2 0 0 1 0
24 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
As an example using the top two rows, there would be an additional column at the end indicating the weighted mean.
1 107 33 41 22 12 4 122 44 297 123 51 16 7 9 1 1 0 6.391011
10 5 2 2 1 0 3 4 6 12 3 3 0 1 1 0 0 0 6.232558
I'm a little new to coding so I wasn't too sure how to do it - any advice would be appreciated!
You can do:
# For every row except the last, take the weighted mean of the last row's
# values, using that row's entries as the weights.
apply(df[-nrow(df), ], 1, function(row) weighted.mean(df[nrow(df), ], row))
I'm assuming your first column is some kind of index and is not used for the weighted mean (and that the data is stored in matr_dat):
# The last row holds the values to average; every other row supplies one set
# of weights. Column 1 is dropped from both.
local({
  last <- nrow(matr_dat)
  values <- matr_dat[last, -1]
  apply(matr_dat[-last, -1], 1, function(w) weighted.mean(values, w))
})
Using apply with the margin set to 1, the function given as the third argument is applied to each row of the data; to calculate the weighted mean, you can use weighted.mean, passing the values of the last row as the data and the values of the current row as the weights.

Could any one explain me about below error in the R germinationmetrics Package?

I would like to compute cumulative germination counts, compute germination indices, and plot FPHF curves.
My data structure is the following:
concentration temp rep Day01 Day02 Day03 Day04 Day05 Day06 Day07
1 0.0 10 1 0 0 0 0 0 0 0
2 0.5 10 1 0 0 0 0 6 6 6
3 0.3 10 1 0 0 0 0 8 8 8
4 0.1 10 1 0 0 0 0 6 6 6
5 0.0 10 2 0 0 0 0 0 0 0
6 0.5 10 2 0 0 0 0 9 9 9
7 0.3 10 2 0 0 0 0 8 8 8
8 0.1 10 2 0 0 0 0 6 6 6
9 0.0 10 3 0 0 0 0 0 0 0
10 0.5 10 3 0 0 0 0 5 5 5
11 0.3 10 3 0 0 0 0 8 8 8
12 0.1 10 3 0 0 0 0 2 2 2
13 0.0 20 1 0 0 0 0 0 7 7
14 0.5 20 1 0 0 0 0 17 17 17
15 0.3 20 1 0 0 0 0 21 21 21
16 0.1 20 1 0 0 0 0 20 20 20
17 0.0 20 2 0 0 0 0 0 7 10
18 0.5 20 2 0 0 0 0 13 13 13
19 0.3 20 2 0 0 0 0 18 18 18
20 0.1 20 2 0 0 0 0 22 22 22
21 0.0 20 3 0 0 0 0 0 14 14
22 0.5 20 3 0 0 0 0 15 15 15
23 0.3 20 3 0 0 0 0 15 15 15
24 0.1 20 3 0 0 0 0 14 14 14
25 0.0 30 1 0 0 0 0 0 0 0
26 0.5 30 1 0 0 0 0 0 0 0
27 0.3 30 1 0 0 0 0 0 0 0
28 0.1 30 1 0 0 0 0 0 0 0
29 0.0 30 2 0 0 0 0 0 0 0
30 0.5 30 2 0 0 0 0 0 0 0
31 0.3 30 2 0 0 0 0 0 0 0
32 0.1 30 2 0 0 0 0 0 0 0
33 0.0 30 3 0 0 0 0 0 0 0
34 0.5 30 3 0 0 0 0 0 0 0
35 0.3 30 3 0 0 0 0 0 0 0
36 0.1 30 3 0 0 0 0 0 0 0
Day08 Day09 Day10 Day11 Day12 Day13 Day14 Day15 Day16 Day17 Day18
1 0 0 1 1 1 1 1 1 1 1 1
2 18 18 18 18 20 20 20 20 20 20 20
3 18 18 18 18 20 20 20 20 20 20 20
4 16 16 16 16 18 18 18 19 19 19 19
5 0 0 1 1 1 1 1 1 1 1 1
6 22 22 22 22 23 23 23 23 23 23 23
7 22 22 22 22 23 23 23 23 23 23 23
8 18 18 18 18 19 19 19 19 19 19 19
9 0 0 2 2 2 4 4 4 4 4 4
10 20 20 20 20 21 21 21 21 21 21 21
11 17 17 17 17 20 20 20 20 20 20 20
12 22 22 22 22 23 23 23 23 23 23 23
13 7 7 7 7 7 7 7 7 7 7 7
14 23 23 23 23 23 23 23 23 23 23 23
15 24 24 24 24 24 24 24 24 24 24 24
16 24 24 24 24 24 24 24 24 24 24 24
17 10 10 10 10 10 10 10 10 10 10 10
18 25 25 25 25 25 25 25 25 25 25 25
19 23 23 23 23 23 23 23 23 23 23 23
20 23 23 23 23 23 23 23 23 23 23 23
21 14 14 14 14 14 14 14 14 14 14 14
22 23 23 23 23 23 23 23 23 23 23 23
23 21 21 21 21 21 21 21 21 21 21 21
24 20 20 20 20 20 20 20 20 20 20 20
25 0 0 0 0 0 0 0 0 0 0 0
26 0 0 0 0 0 0 0 0 0 0 0
27 0 0 0 0 0 0 0 0 0 0 0
28 0 0 0 0 0 0 0 0 0 0 0
29 0 0 0 0 0 0 0 0 0 0 0
30 0 0 0 0 0 0 0 0 0 0 0
31 0 0 0 0 0 0 0 0 0 0 0
32 0 0 0 0 0 0 0 0 0 0 0
33 0 0 0 0 0 0 0 0 0 0 0
34 0 0 0 0 0 0 0 0 0 0 0
35 0 0 0 0 0 0 0 0 0 0 0
36 0 0 0 0 0 0 0 0 0 0 0
Day19 Total.Seeds
1 1 25
2 20 25
3 20 25
4 19 25
5 1 25
6 23 25
7 23 25
8 19 25
9 4 25
10 21 25
11 20 25
12 23 25
13 7 25
14 23 25
15 24 25
16 24 25
17 10 25
18 25 25
19 23 25
20 23 25
21 14 25
22 23 25
23 21 25
24 20 25
25 0 25
26 0 25
27 0 25
28 0 25
29 0 25
30 0 25
31 0 25
32 0 25
33 0 25
34 0 25
35 0 25
36 0 25
I receive the following error:
data(gcdata1)
Warning message:
In data(gcdata1) : data set ‘gcdata1’ not found
I created the below variable for counts.per.intervals
counts.per.intervals <- c("Day01", "Day02", "Day03", "Day04", "Day05",
+ "Day06", "Day07", "Day08", "Day09", "Day10",
+ "Day11", "Day12", "Day13", "Day14", "Day15", "Day16", "Day17", "Day18", "Day19")
As the following variable for indices
indices<-germination.indices(gcdata1, total.seeds.col = "Total.Seeds",
counts.intervals.cols = counts.per.intervals,
intervals = 1:19, partial = FALSE, max.int = 5)
I received the below error:
Error in if (nearest[2] == nearest[1]) { :
missing value where TRUE/FALSE needed
In addition: There were 50 or more warnings (use warnings() to see the first 50)

Summing up different elements in a matrix in R

I'm trying to perform calculations on different elements in a matrix in R. My Matrix is 18x18 and I would like to get e.g. the mean of each 6x6 array (which makes 9 arrays in total). My desired arrays would be:
A1 <- df[1:6,1:6]
A2 <- df[1:6,7:12]
A3 <- df[1:6,13:18]
B1 <- df[7:12,1:6]
B2 <- df[7:12,7:12]
B3 <- df[7:12,13:18]
C1 <- df[13:18,1:6]
C2 <- df[13:18,7:12]
C3 <- df[13:18,13:18]
The matrix looks like this:
5 10 15 20 25 30 35 40 45 50 55 60 65 70 75 80 85 90
5 14 17 9 10 8 4 10 12 18 9 13 14 NA NA 19 15 10 10
10 30 32 23 27 17 28 25 12 28 29 28 26 19 25 34 24 11 17
15 16 16 16 9 17 27 17 16 30 13 18 13 15 13 19 8 7 9
20 15 12 18 18 18 6 4 6 9 11 10 10 13 11 8 10 15 15
25 7 13 21 7 3 5 2 5 5 4 3 2 3 5 2 1 5 6
30 5 9 1 7 7 4 4 12 8 9 2 0 5 2 1 0 2 6
35 3 0 2 0 0 4 4 7 4 4 5 2 0 0 1 0 0 0
40 0 4 0 0 0 1 3 9 10 10 1 0 0 0 1 0 1 0
45 0 0 0 0 0 3 10 9 17 9 1 0 0 0 0 0 0 0
50 0 0 2 0 0 0 2 8 20 0 0 0 0 0 1 0 0 0
55 0 0 0 0 0 0 7 3 21 0 0 0 0 0 0 0 0 0
60 0 0 0 0 3 4 10 2 2 0 0 1 0 0 0 0 0 0
65 0 0 0 0 0 4 8 4 8 11 0 0 0 0 0 0 0 0
70 0 0 0 0 0 6 2 5 14 0 0 0 0 0 0 0 0 0
75 0 0 0 0 0 4 0 5 9 0 0 0 0 0 0 0 0 0
80 0 0 0 0 0 4 4 0 4 2 0 0 0 0 0 0 0 0
85 0 0 0 0 0 0 0 4 1 1 0 0 0 0 0 0 0 0
90 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Is there a clean way to solve this issue with a loop?
Thanks a lot in advance,
Paul
Given your matrix, e.g.
# Example input: an 18 x 18 matrix (replace with your own data).
x <- matrix(1:(18 * 18), ncol = 18)
# Try, for example, square sub-matrices of side 6.
step <- 6
nx <- nrow(x)
ny <- ncol(x)
# Both dimensions must split evenly into step-sized blocks.
if ((nx %% step) != 0) stop("nx %% step should be 0")
if ((ny %% step) != 0) stop("ncol(x) %% step should be 0")
# First row / first column of every block.
row_starts <- seq(1, nx, by = step)
col_starts <- seq(1, ny, by = step)
# Row blocks are named with a letter, so there can be at most 26 of them.
if (length(row_starts) > length(LETTERS)) {
  stop("too many row blocks to name with LETTERS")
}
for (row_block in seq_along(row_starts)) {
  for (col_block in seq_along(col_starts)) {
    # Letter = row block, digit = column block, e.g. A2 is x[1:6, 7:12].
    name <- paste0(LETTERS[row_block], col_block)
    block_rows <- row_starts[row_block]:(row_starts[row_block] + step - 1)
    block_cols <- col_starts[col_block]:(col_starts[col_block] + step - 1)
    # Creates one variable per block in the current environment.
    # sum() returns NA for any block that contains an NA.
    assign(name, sum(x[block_rows, block_cols]))
  }
}
You'll get your results in A1, A2, A3...
This is the idea. Twist the code for non square matrices, different size of sub matrices, ...
Here's one way:
# Simulated 18 x 18 matrix of Poisson counts (seeded so the run is repeatable)
set.seed(47)
n <- 18
m <- matrix(rpois(n * n, lambda = 5), nrow = n)

# Top-left corner of every 6 x 6 block, one (row, col) pair per grid row
n_array <- 6
start_i <- seq(1, n, by = n_array)
arr_starts <- expand.grid(row = start_i, col = start_i)

# Sum each block; blocks come out in expand.grid order (row start varies fastest)
vapply(
  seq_len(nrow(arr_starts)),
  function(k) {
    span <- seq_len(n_array) - 1
    sum(m[arr_starts$row[k] + span, arr_starts$col[k] + span])
  },
  integer(1)
)
# [1] 158 188 176 201 188 201 197 206 204

How to show all threads on a specific CPU on Solaris?

Some process (or thread) is hammering CPU0, as you can see in the output of mpstat 30 2:
CPU minf mjf xcal intr ithr csw icsw migr smtx srw syscl usr sys wt idl
0 0 0 0 13 0 2 7 0 151 0 4250 99 1 0 0
1 114 0 2 197 84 5220 5 10 109 0 10518 30 2 0 67
2 79 0 1 184 83 5208 5 5 89 0 9788 30 2 0 68
3 67 0 1 181 84 5150 5 4 87 0 9510 30 2 0 69
4 53 0 3 171 72 12238 4 7 183 0 22214 3 3 0 94
5 43 0 3 135 7 218 2 6 16 0 162 0 1 0 99
6 110 0 2 172 79 4918 5 3 164 0 9553 34 2 0 64
7 120 0 1 180 80 4873 4 4 194 0 9494 32 2 0 66
8 53 0 1 23 2 28665 5 7 494 0 62023 12 9 0 79
9 43 0 0 34 2 21469 6 8 676 0 58090 10 13 0 77
10 59 0 1 210 2 33462 4 4 227 0 63500 7 16 0 78
11 93 0 2 16940 16627 1261 2 6 1027 0 2043 0 10 0 90
12 17 0 1 65 3 59 0 3 3 0 19 0 0 0 100
13 6 0 1 89 4 104 0 3 2 0 9 0 0 0 100
14 4 0 10 65 5 54 0 3 1 0 12 0 0 0 100
15 4 0 1 66 6 56 0 3 2 0 21 0 0 0 100
16 2 0 0 91 16 78 0 3 2 0 30 0 0 0 100
17 17 0 1 80 15 70 0 4 2 0 79 0 0 0 100
18 76 0 3 14946 14928 25 0 4 24 0 102 0 4 0 96
19 57 0 0 20 2 17 0 3 15 0 107 0 0 0 100
20 18 0 0 26 0 25 0 3 10 0 21 0 0 0 100
21 0 0 0 106 70 46 0 3 4 0 40 0 1 0 99
22 13 0 0 31 3 28 0 3 4 0 49 0 0 0 100
23 0 0 0 35 5 24 0 3 5 0 54 0 0 0 100
but with prstat -P0 I only see ndbmtd running at around 15% on CPU0
PID USERNAME SIZE RSS STATE PRI NICE TIME CPU PROCESS/NLWP
20028 root 77G 75G cpu0 40 0 8369:33:0 15% ndbmtd/44
660 root 6200K 3700K sleep 59 0 0:00:53 0.0% inetd/4
159 daemon 4540K 2408K sleep 59 0 0:00:09 0.0% kcfd/3
11 root 11M 10M sleep 59 0 0:00:58 0.0% svc.configd/15
Is there a way to show all processes and threads on CPU0?
To show all processes and threads (LWPs) on CPU0:
prstat -P0 -L

Importing DAT file into R but uneven columns

I have a DAT file I want to read into R, but when I import my data it keeps showing that I have 10 columns/variables (taken from the first line) when it is actually supposed to have 29 columns/variables. How do I fix this problem?
DAT file example on notepad:
smsa66 smsa76 nearc2 nearc4 nearc4a nearc4b ed76 ed66 age76 daded
nodaded momed nomomed momdad14 sinmom14 step14 south66 south76
lwage76 famed black wage76 enroll76 kww iqscore mar76 libcrd14
exp76 exp762
1 1 0 0 0 0 7
5 29 9.94 1 10.25 1 1
0 0 0 0 6.306275 9 1
548 0 15 . 1 0 16
256
1 1 0 0 0 0 12
11 27 8 0 8 0 1
0 0 0 0 6.175867 8 0
481 0 35 93 1 1 9
81
1 1 0 0 0 0 12
12 34 14 0 12 0 1
0 0 0 0 6.580639 2 0
721 0 42 103 1 1 16
256
1 1 1 1 1 0 11
11 27 11 0 12 0 1
0 0 0 0 5.521461 6 0
250 0 25 88 1 1 10
100
1 1 1 1 1 0 12
12 34 8 0 7 0 1
0 0 0 0 6.591674 8 0
729 0 34 108 1 0 16
256
1 1 1 1 1 0 12
11 26 9 0 12 0 1
0 0 0 0 6.214608 6 0
500 0 38 85 1 1 8
64
1 1 1 1 1 0 18
16 33 14 0 14 0 1
0 0 0 0 6.336826 1 0
565 0 41 119 1 1 9
81
1 1 1 1 1 0 14
13 29 14 0 14 0 1
0 0 0 0 6.410175 1 0
608 0 46 108 1 1 9
81
txt1<-" smsa66 smsa76 nearc2 nearc4 nearc4a nearc4b ed76 ed66 age76 daded
nodaded momed nomomed momdad14 sinmom14 step14 south66 south76
lwage76 famed black wage76 enroll76 kww iqscore mar76 libcrd14
exp76 exp762"
txt2 <-
" 1 1 0 0 0 0 7
5 29 9.94 1 10.25 1 1
0 0 0 0 6.306275 9 1
548 0 15 NA 1 0 16
256
1 1 0 0 0 0 12
11 27 8 0 8 0 1
0 0 0 0 6.175867 8 0
481 0 35 93 1 1 9
81
1 1 0 0 0 0 12
12 34 14 0 12 0 1
0 0 0 0 6.580639 2 0
721 0 42 103 1 1 16
256
1 1 1 1 1 0 11
11 27 11 0 12 0 1
0 0 0 0 5.521461 6 0
250 0 25 88 1 1 10
100
1 1 1 1 1 0 12
12 34 8 0 7 0 1
0 0 0 0 6.591674 8 0
729 0 34 108 1 0 16
256
1 1 1 1 1 0 12
11 26 9 0 12 0 1
0 0 0 0 6.214608 6 0
500 0 38 85 1 1 8
64
1 1 1 1 1 0 18
16 33 14 0 14 0 1
0 0 0 0 6.336826 1 0
565 0 41 119 1 1 9
81
1 1 1 1 1 0 14
13 29 14 0 14 0 1
0 0 0 0 6.410175 1 0
608 0 46 108 1 1 9
81"
Now the code:
# Read every whitespace-separated value as a number, ignoring line breaks.
# `what = numeric()` makes scan() parse doubles directly (the string
# "numeric" is a character template and would return text), and `na.strings`
# maps a "." missing-value marker, as well as "NA", to NA.
inp <- scan(text = txt2, what = numeric(), na.strings = c("NA", "."))
# Each record is 29 consecutive values; stop on a truncated record instead of
# letting matrix() silently recycle values.
stopifnot(length(inp) %% 29 == 0)
# Fill row by row so that each record becomes one row of 29 columns.
inmat <- matrix(inp, ncol = 29, byrow = TRUE)
dfrm <- as.data.frame(inmat)
scan(text=txt1, what="")
Read 29 items
[1] "smsa66" "smsa76" "nearc2" "nearc4" "nearc4a" "nearc4b" "ed76"
[8] "ed66" "age76" "daded" "nodaded" "momed" "nomomed" "momdad14"
[15] "sinmom14" "step14" "south66" "south76" "lwage76" "famed" "black"
[22] "wage76" "enroll76" "kww" "iqscore" "mar76" "libcrd14" "exp76"
[29] "exp762"
names(dfrm) <- scan(text=txt1, what="")
#Read 29 items
dfrm
#-----------------------
smsa66 smsa76 nearc2 nearc4 nearc4a nearc4b ed76 ed66 age76 daded nodaded momed nomomed
1 1 1 0 0 0 0 7 5 29 9.94 1 10.25 1
2 1 1 0 0 0 0 12 11 27 8 0 8 0
3 1 1 0 0 0 0 12 12
snipped remainder of output
Final result:
str(dfrm)
'data.frame': 8 obs. of 29 variables:
$ smsa66 : num 1 1 1 1 1 1 1 1
$ smsa76 : num 1 1 1 1 1 1 1 1
$ nearc2 : num 0 0 0 1 1 1 1 1
$ nearc4 : num 0 0 0 1 1 1 1 1
$ nearc4a : num 0 0 0 1 1 1 1 1
$ nearc4b : num 0 0 0 0 0 0 0 0
$ ed76 : num 7 12 12 11 12 12 18 14
$ ed66 : num 5 11 12 11 12 11 16 13
$ age76 : num 29 27 34 27 34 26 33 29
$ daded : num 9.94 8 14 11 8 9 14 14
$ nodaded : num 1 0 0 0 0 0 0 0
$ momed : num 10.2 8 12 12 7 ...
$ nomomed : num 1 0 0 0 0 0 0 0
$ momdad14: num 1 1 1 1 1 1 1 1
$ sinmom14: num 0 0 0 0 0 0 0 0
$ step14 : num 0 0 0 0 0 0 0 0
$ south66 : num 0 0 0 0 0 0 0 0
$ south76 : num 0 0 0 0 0 0 0 0
$ lwage76 : num 6.31 6.18 6.58 5.52 6.59 ...
$ famed : num 9 8 2 6 8 6 1 1
$ black : num 1 0 0 0 0 0 0 0
$ wage76 : num 548 481 721 250 729 500 565 608
$ enroll76: num 0 0 0 0 0 0 0 0
$ kww : num 15 35 42 25 34 38 41 46
$ iqscore : num NA 93 103 88 108 85 119 108
$ mar76 : num 1 1 1 1 1 1 1 1
$ libcrd14: num 0 1 1 1 0 1 1 1
$ exp76 : num 16 9 16 10 16 8 9 9
$ exp762 : num 256 81 256 100 256 64 81 81

Resources