I have data like this:
ID height S1 S2 S3
1 927 0.90695438 0.28872194 0.67114294
2 777 0.20981677 0.71783084 0.74498220
3 1659 0.35813799 0.92339744 0.44001698
4 174 0.44829914 0.67493949 0.11503942
5 1408 0.90642643 0.18593999 0.67564278
6 1454 0.38943930 0.34806716 0.73155952
7 2438 0.51745975 0.12351953 0.48398490
8 1114 0.12523909 0.10811622 0.17104804
9 1642 0.03014575 0.29795320 0.67584853
10 515 0.77180549 0.83819990 0.26298995
11 1877 0.32741508 0.99277109 0.34148083
12 2647 0.38947869 0.43713441 0.21024554
13 845 0.04105275 0.20256457 0.01631959
14 1198 0.36139663 0.96387150 0.37676288
15 2289 0.57097808 0.66038711 0.56230740
16 2009 0.68488024 0.29811683 0.67998461
17 618 0.97111675 0.11926219 0.74538877
18 1076 0.70195881 0.59975160 0.95007272
19 1082 0.01154550 0.12019055 0.16309071
20 2072 0.53553213 0.78843202 0.32475690
21 1610 0.83657146 0.36959607 0.13271604
22 2134 0.80686674 0.95632284 0.63729744
23 1617 0.08093264 0.91357666 0.33092961
24 2248 0.23890930 0.82333634 0.64907957
25 1263 0.96598986 0.31948216 0.30288836
26 518 0.03767233 0.87770033 0.07123327
27 2312 0.91640643 0.80035100 0.66239047
28 2646 0.72622658 0.61135664 0.75960356
29 1650 0.20077621 0.07242114 0.55336017
30 837 0.84020075 0.42158771 0.53927210
31 1467 0.39666235 0.34446560 0.84959232
32 2786 0.39270226 0.75173569 0.65322596
33 1049 0.47255689 0.21875132 0.95088576
34 2863 0.58365691 0.29213397 0.61722305
35 2087 0.35238717 0.35595337 0.49284063
36 2669 0.02847401 0.63196192 0.97600657
37 545 0.99508793 0.89253107 0.49034522
38 1890 0.95755846 0.74403278 0.65517230
39 2969 0.55165118 0.45722242 0.59880179
40 395 0.10195396 0.03609544 0.94756902
41 995 0.23791515 0.56851452 0.36801151
42 2596 0.86009766 0.43901589 0.87818701
43 2334 0.73826129 0.60048445 0.45487507
44 2483 0.49731226 0.95138276 0.49646702
45 1812 0.57992109 0.26943131 0.46061562
46 1476 0.01618339 0.65883839 0.61790820
47 2342 0.47212988 0.07647121 0.60414349
48 2653 0.04238973 0.07128521 0.78587960
49 627 0.46315442 0.37033152 0.55526847
50 925 0.62999477 0.29710220 0.76897834
51 995 0.67324929 0.55107827 0.40428567
52 600 0.08703467 0.36989059 0.51071981
53 711 0.14358380 0.84568953 0.52353644
54 828 0.90847850 0.62079070 0.99279921
55 1776 0.12253259 0.39914002 0.42964742
56 764 0.72886279 0.29966153 0.99601125
57 375 0.95037718 0.38111984 0.78660025
58 694 0.04335591 0.70113494 0.51591063
59 1795 0.01959930 0.94686529 0.50268797
60 638 0.19907246 0.77282832 0.91163748
61 1394 0.50508626 0.21955016 0.26441590
62 1943 0.92638876 0.71611036 0.17385687
63 2882 0.13840169 0.66421796 0.40033126
64 2031 0.16919458 0.70625020 0.53835738
65 1338 0.60662738 0.27962799 0.24496437
66 1077 0.81587669 0.71225050 0.37585096
67 1370 0.84338121 0.66094211 0.58025355
68 1339 0.78807719 0.04101269 0.20895531
69 739 0.01902087 0.06114149 0.80133001
70 2085 0.69808750 0.27976169 0.63880242
71 1240 0.81509312 0.30196772 0.73633076
72 987 0.56840006 0.95661083 0.43881241
73 1720 0.48006288 0.38981872 0.57981238
74 2901 0.16137012 0.37178879 0.25604401
75 1987 0.08925623 0.84314249 0.46371823
76 1876 0.16268237 0.84723500 0.16861486
77 2571 0.02672845 0.31933115 0.61389453
78 2325 0.70962948 0.13250605 0.95810262
79 2503 0.76101818 0.61710912 0.47819473
80 279 0.85747478 0.79130451 0.75115933
81 1381 0.43726582 0.33804871 0.02058322
82 1800 0.41713645 0.90544760 0.17096903
83 2760 0.58564949 0.19755671 0.63996650
84 2949 0.82496758 0.79408518 0.16497848
85 118 0.79313923 0.75460289 0.35472278
86 1736 0.32615257 0.91139485 0.18642647
87 2201 0.95793194 0.32268770 0.89765616
88 750 0.65301961 0.08616947 0.23778386
89 906 0.45867582 0.91120045 0.98494348
90 2202 0.60602188 0.95517383 0.02133074
I want to make a stacked barplot of this data using ggplot2.
In the above dataset, height should be on the y-axis, and S1, S2 and S3 should determine the colours within each sample's bar.
I have tried the base R function barplot (below), but it does not give me what I want. Any suggestions would be appreciated.
barplot(t(as.matrix(examp[, 3:5])), col = rainbow(3))
It's not clear to me exactly what you want to plot. You say you want height on the y-axis, but the examples you show are all 'filled to the top', implying the same height for each ID. It is also not clear what the numbers associated with each sample represent; I am guessing they are relative weightings for the bar heights.
Assuming you actually want a filled bar plot as in the examples, with the relative sizes of the bars dictated by the sample values, you can do:
library(tidyr)
library(dplyr)
library(ggplot2)

df %>%
  mutate(ID = reorder(ID, S3 / (S3 + S2 + S1))) %>%               # order bars by S3's share
  pivot_longer(3:5, names_to = "Sample", values_to = "Value") %>% # long format: one row per ID/sample
  ggplot(aes(ID, Value * height, fill = Sample)) +
  geom_col(position = "fill", color = NA) +                       # stacked bars, each filled to full height
  labs(y = "Height") +
  theme_classic() +
  scale_fill_manual(values = c("red", "green", "blue"))
Alternative
df %>%
  arrange(height) %>%                                             # sort rows by height
  group_by(height) %>%
  summarize(across(everything(), mean)) %>%                       # average any duplicated heights
  pivot_longer(3:5, names_to = "Sample", values_to = "Value") %>%
  ggplot(aes(height, Value, fill = Sample, colour = Sample)) +
  geom_smooth(method = "loess", formula = y ~ x, linetype = 2, alpha = 0.2) +
  theme_bw()
I have a list of indexes like this:
> mid_cp
[1] 3065 4871 13153 15587 18100 24010 26324 25648 38195 38196 39384 42237 45686 54217 55032 63684 62800 9134 35261 36449 36866 53968 16969
[24] 43529 46995 52351 4174 7011 18962 18151 18889 24036 32916 34061 34815 36866 51973 55802 53593 55421 56615 88 150 161 192 781
[47] 830 1300 1573 2396 2784 2547 3214 3135 3297 3301 4053 4249 4919 5856 6297 7328 7621 7708 8063 8219 8864 8887 9201
[70] 9214 9533 10334 10301 11235 10529 11356 10566 10872 12228 12250 12507 12048 12643 12913 13224 14297 16772 15363 18759 18979 16264 17363
[93] 20732 17971 22194 22422 19417 22903 22929 23087 19627 19961 23954 24297 25422 25423 25704 25765 25780 22769 22796 26871 27095 23789 24066
[116] 24069 27423 24366 24600 24871 25110 28374 26280 27873 29722 28839 29063 31031 31150 31546 32491 30356 33045 30863 33555 34201 34404 34684
[139] 35498 32912 33207 35874 33488 33716 36761 34543 36807 37000 35157 38195 38196 38458 36438 36619 39484 40109 37532 40143 40160 40458 41257
[162] 38434 38653 41866 41899 39429 42818 40001 43398 43441 40282 40566 43979 43996 40793 40806 40992 41065 41102 41330 41964 46322 43351 46670
and I have a table like this:
> head(movie.cp)
name id
252 $ (Dollars) (The Heist) 252
253 $5 a Day (Five Dollars a Day) 253
1 $9.99 1
254 $windle (Swindle) 254
255 "BBC2 Playhouse" Caught on a Train 255
256 "Independent Lens" Race to Execution 256
How do I get the mid_cp list to be a list of names using the movie.cp table?
P.S.: I am a complete newbie regarding R.
Are the numbers in mid_cp equivalent to movie.cp$id? If so, try:
mid_cp <- movie.cp$name[match(mid_cp, movie.cp$id)]
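To illustrate what match() does here, a toy sketch with a made-up three-row miniature of movie.cp (values invented for the example):
# hypothetical miniature lookup table standing in for movie.cp
lookup <- data.frame(name = c("$9.99", "$windle (Swindle)", "$ (Dollars) (The Heist)"),
                     id   = c(1, 254, 252))
ids <- c(252, 1, 254)                # a tiny stand-in for mid_cp
lookup$name[match(ids, lookup$id)]   # position of each id in lookup$id
# [1] "$ (Dollars) (The Heist)" "$9.99" "$windle (Swindle)"
match() returns, for every element of mid_cp, its position in movie.cp$id, so indexing movie.cp$name with that vector returns the names in the original order of mid_cp.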
I have an Excel sheet containing several series of data as column vectors, each of a different length. The sample data from the Excel sheet is shown below.
No 1 2 3 4 5 6
1 7.68565 7.431991 7.620156 7.34955 7.493848 7.244905
2 8.247334 7.895186 8.107751 7.629121 8.01165 7.898938
3 8.861417 8.411331 8.616113 7.960177 8.551065 8.432346
4 9.522981 8.945542 9.117843 8.263698 9.129371 9.118917
5 10.10206 9.465829 9.621576 8.515904 9.680468 9.695693
6 10.74194 10.05058 10.2111 8.824739 10.22375 10.48411
7 11.41614 10.59113 10.70612 9.12775 10.78299 11.1652
8 12.08601 11.12069 11.23061 9.445629 11.32874 11.8499
9 12.8509 11.68692 11.81479 9.762563 11.92125 12.77563
10 13.79793 12.31746 12.3436 10.12344 12.5586 14.05427
11 14.40335 12.85409 12.81579 10.4148 13.2323 14.74745
12 14.96397 13.44764 13.39124 10.76968 13.91571 15.48449
13 15.49457 13.5184 13.94058 11.05081 14.43318 16.12423
14 16.06153 13.99386 14.35261 11.38416 14.95082 16.84513
15 16.61133 14.4879 14.86438 11.71484 15.47574 17.42593
16 17.24876 14.95296 15.30651 12.06838 16.01853 18.05138
17 17.8686 15.48764 15.82241 12.41315 16.546 18.69939
18 18.49424 16.01478 16.33324 12.76782 17.07923 19.29467
19 19.0651 16.5115 16.8808 13.11234 17.62211 20.00391
20 19.73842 17.07482 17.40481 13.46479 18.14528 20.67474
21 20.47123 17.51353 17.88455 13.55012 18.69565 21.35446
22 21.16333 18.00172 18.38069 13.82592 19.23222 22.16516
23 21.83083 18.55357 18.79004 14.10343 19.93576 23.0249
24 22.50095 19.04932 19.25296 14.38997 20.6087 23.75609
25 23.27895 19.66359 19.68497 14.66933 21.19856 24.33014
26 23.86791 20.19746 20.25114 14.96252 21.7933 25.16132
27 24.42128 20.79322 20.8394 15.27082 22.4216 25.64038
28 25.02747 21.34963 21.36803 15.59645 22.95553 26.40612
29 25.64392 21.96625 21.92369 15.90159 23.62858 26.99359
30 26.15457 22.51419 22.49119 16.21841 24.27062 27.48933
31 26.78083 23.14052 23.09582 16.5353 24.75912 28.13525
32 27.39095 23.71215 23.71597 16.84909 25.34079 28.66253
33 28.04546 24.23099 24.22622 17.23782 25.90887 29.27824
34 28.68887 24.69722 24.76757 17.58071 26.51803 30.06892
35 29.45707 25.24266 25.30781 17.91193 27.12488 30.87034
36 30.03946 25.75705 25.86998 18.24291 27.73606 31.71053
37 30.71511 26.29254 26.34333 18.50986 28.30462 32.37958
38 31.42378 26.91853 26.69165 18.81327 28.91142 33.07085
39 32.50335 27.44403 27.12134 19.20657 29.51637 33.8685
40 33.12328 27.98299 27.578 19.55173 30.14371 34.5783
41 33.71293 28.42661 28.16382 19.818 30.7509 35.29098
42 34.22313 29.11766 28.58075 20.20322 31.50584 35.97233
43 34.84822 29.69339 29.14229 20.60828 32.14028 36.53085
44 35.51228 30.30699 29.71523 20.86474 32.72842 36.82623
45 36.11674 30.89355 30.28881 21.24548 33.02594 37.79391
46 36.80722 31.50952 30.94186 21.56593 33.17226 38.42553
47 37.60966 31.98561 31.63391 21.89768 33.34089 39.20039
48 38.25016 32.63639 32.19883 22.23119 33.67384 39.98531
49 38.95744 33.18134 32.72147 22.4859 34.27073 40.76857
50 39.66163 33.67109 33.14864 22.90394 34.86681 41.49251
51 40.37425 34.12463 33.60807 23.26918 35.59697 42.51444
52 41.23707 34.66628 34.09723 23.52158 36.24535 43.14603
53 41.82558 35.1961 34.57659 23.89679 36.90796 44.16233
54 42.55081 35.72951 35.03618 24.49229 37.65297 44.59068
55 43.39907 36.31952 35.46371 24.81181 38.33818 45.22966
56 44.05056 37.05194 35.98615 25.12065 38.85623 46.23367
57 44.78049 37.1323 36.51719 25.4582 39.54339 46.54872
58 45.43282 37.76535 37.09313 25.88998 40.23827 47.07784
59 46.18882 38.27575 37.17476 26.22639 40.92604 47.807
60 46.90982 38.88576 37.90604 26.56257 41.63398 48.4778
61 47.56264 39.64927 38.5283 26.8499 42.29979 49.21885
62 48.10035 40.19561 39.16806 27.1614 42.99679 50.18735
63 49.01068 40.89077 39.80176 27.43677 43.8278 51.9102
64 49.76271 41.6514 40.39578 27.89204 44.4915 52.78747
65 50.53434 42.09778 41.03402 28.18638 45.01828 53.46253
66 51.67479 42.83619 41.44307 28.49254 45.8151 54.44443
67 52.20818 43.35224 42.17046 28.87821 46.38069 55.20507
68 52.84818 43.94838 42.54818 29.18387 47.27983 55.71156
69 53.54274 44.61937 43.04368 29.58712 47.76875 56.11357
70 54.24117 45.2113 43.55424 29.97786 48.52082 56.56269
71 55.10781 45.87016 44.19418 30.30342 49.17041 57.04574
72 55.81844 46.58728 44.70245 30.92939 50.00576 57.61847
73 56.53417 47.17022 45.19135 64.12819 50.76387 58.46774
74 56.99077 47.80587 45.81162 64.46482 51.44632 59.35406
75 57.70125 48.4632 46.53608 64.47179 52.09271 60.34232
76 58.40646 49.11251 47.44626 65.28538 52.77505 60.76057
77 59.20803 49.70755 48.0586 65.42728 53.3777 61.86707
78 59.71753 50.13534 48.76304 65.97044 54.06384 63.14102
79 60.58331 50.72049 49.47997 66.51449 54.7547 64.43312
80 61.03398 51.41927 50.11546 67.02634 55.4798 65.58254
81 61.80681 51.97609 50.69514 67.59518 55.96139 66.72086
82 62.48501 52.59973 51.31683 68.12712 56.93643 67.53484
83 63.36452 53.36562 51.73617 68.64816 57.6551 68.07806
84 64.31261 53.98405 52.21327 69.24711 58.23373 68.63623
85 65.24776 54.51552 52.77048 70.48085 58.97933 69.02074
86 66.17772 55.20282 53.22162 70.64199 59.76285 69.38057
87 67.08787 55.91391 53.7916 71.38781 60.25809 70.01195
88 68.01987 56.61301 54.46721 71.58064 61.31948 70.5335
89 68.92189 57.28238 55.16064 71.99983 62.18978 71.61938
90 69.79762 57.88332 55.85772 72.89091 63.02894 72.77907
91 69.86632 58.52047 56.78106 73.05919 63.78964 74.13258
92 70.60662 59.12164 57.49112 73.58095 64.54343 75.77073
93 71.63203 59.77399 58.20212 74.1192 65.36834 76.57243
94 72.18227 60.47282 58.77127 74.6143 65.83804 77.84715
95 72.97624 60.7739 59.41283 75.4809 66.61507 78.78102
96 73.75372 61.22352 59.84708 75.66663 67.44336 79.33527
97 74.66983 61.87689 60.49374 76.09998 68.30974 79.86294
98 75.85329 62.58495 60.7886 76.67287 69.23421 80.51763
99 76.38837 63.32424 61.5629 77.20351 70.00735 80.91219
100 77.38139 64.07433 62.21648 77.95189 70.7836 81.57964
101 78.25631 64.82328 62.74316 78.21231 71.2177 82.16656
102 79.19827 65.50484 63.64724 78.89301 72.00792 83.12364
103 80.38764 66.23685 64.48991 79.32261 73.00548 84.00261
104 80.87278 66.95412 65.2793 79.95379 73.50331 85.22213
105 81.76581 67.70247 65.82581 80.52102 74.28909 86.6621
106 83.02712 68.55701 66.62666 81.06393 75.11777 88.11059
107 83.48909 69.23235 67.35486 81.7409 75.9652
108 84.82759 70.58522 68.15342 82.25188 76.8884
109 85.28537 71.04559 68.92251 82.98396 77.83717
110 86.70018 71.73407 69.51888 83.51862 78.45438
111 87.35397 72.45837 70.31539 83.69946 79.32315
112 88.69969 73.14394 70.9007 84.25947 80.39831
113 73.92206 71.50578 85.10349 81.20853
114 74.65082 72.20686 85.26869 81.95338
115 75.32388 72.81664 86.07426 82.36201
116 76.37313 73.52561 86.33713 83.16817
117 76.85229 74.32013 86.85325 83.96463
118 77.55033 75.04207 87.32344 84.8136
119 78.19957 75.90256 87.93314 85.7303
120 79.23823 76.41772 88.39268 86.46136
121 79.57755 77.11913 88.96714 87.30937
122 79.70834 78.01459 88.17579
123 80.44374 78.76607 89.00109
124 81.47443 79.56496
125 81.80569 79.69939
126 82.57823 80.52383
127 83.38485 81.27236
128 84.09743 81.94386
129 84.78618 83.01913
130 85.91491 83.52692
131 86.18631 84.52093
132 86.87262 85.26204
133 88.0145 85.93992
134 88.30018 86.70402
135 89.08487 87.58891
136 88.27903
From the above data, the values range from roughly 7.3 to roughly 89.08, top to bottom. However, in another sheet of the Excel file I have some data running from roughly 7.3 to roughly 89.09, bottom to top.
Now, I would like to take the longest column vector (in the sample data it is column vector 3, of size 136x1) and stretch the other column vectors (1, 2, 4, 5 and 6) to that size, such that the original values (magnitudes) remain the same while their positions (row indices) may shift. Between the original magnitudes I need to interpolate, so that all column vectors end up the same length (136x1).
I have some hundreds of column vectors like this.
The expected output is presented below for column 1 only, with reference to column 3.
No 1 3
1 7.68565 7.620156
2 8.247334 8.107751
3 8.861417 8.616113
4 9.522981 9.117843
5 **9.8125205** 9.621576
6 10.10206 10.2111
7 10.74194 10.70612
8 11.41614 11.23061
9 **11.751075** 11.81479
10 12.08601 12.3436
11 12.8509 12.81579
12 13.79793 13.39124
13 **14.10064** 13.94058
14 14.40335 14.35261
15 14.96397 14.86438
16 15.49457 15.30651
17 **15.77805** 15.82241
18 16.06153 16.33324
19 16.61133 16.8808
20 17.24876 17.40481
21 17.8686 17.88455
22 18.49424 18.38069
23 **18.77967** 18.79004
24 19.0651 19.25296
25 19.73842 19.68497
26 20.47123 20.25114
27 **20.81728** 20.8394
28 21.16333 21.36803
29 21.83083 21.92369
30 22.50095 22.49119
31 23.27895 23.09582
32 23.86791 23.71597
33 24.42128 24.22622
34 **24.724375** 24.76757
35 25.02747 25.30781
36 25.64392 25.86998
37 26.15457 26.34333
38 26.78083 26.69165
39 27.39095 27.12134
40 **27.718205** 27.578
41 28.04546 28.16382
42 28.68887 28.58075
43 29.45707 29.14229
44 **29.748265** 29.71523
45 30.03946 30.28881
46 30.71511 30.94186
47 31.42378 31.63391
48 32.50335 32.19883
49 **32.813315** 32.72147
50 33.12328 33.14864
51 33.71293 33.60807
52 34.22313 34.09723
53 34.84822 34.57659
54 **35.18025** 35.03618
55 35.51228 35.46371
56 **35.81451** 35.98615
57 36.11674 36.51719
58 36.80722 37.09313
59 37.60966 37.17476
60 **37.92991** 37.90604
61 38.25016 38.5283
62 38.95744 39.16806
63 39.66163 39.80176
64 40.37425 40.39578
65 41.23707 41.03402
66 41.82558 41.44307
67 42.55081 42.17046
68 **42.97494** 42.54818
69 43.39907 43.04368
70 **43.724815** 43.55424
71 44.05056 44.19418
72 44.78049 44.70245
73 45.43282 45.19135
74 **45.81082** 45.81162
75 46.18882 46.53608
76 46.90982 47.44626
77 47.56264 48.0586
78 48.10035 48.76304
79 49.01068 49.47997
80 49.76271 50.11546
81 50.53434 50.69514
82 51.67479 51.31683
83 **51.941485** 51.73617
84 52.20818 52.21327
85 52.84818 52.77048
86 53.54274 53.22162
87 **53.891955** 53.7916
88 54.24117 54.46721
89 55.10781 55.16064
90 55.81844 55.85772
91 56.53417 56.78106
92 56.99077 57.49112
93 57.70125 58.20212
94 58.40646 58.77127
95 59.20803 59.41283
96 59.71753 59.84708
97 60.58331 60.49374
98 61.03398 60.7886
99 61.80681 61.5629
100 62.48501 62.21648
101 **62.924765** 62.74316
102 63.36452 63.64724
103 64.31261 64.48991
104 65.24776 65.2793
105 **65.71274** 65.82581
106 66.17772 66.62666
107 67.08787 67.35486
108 68.01987 68.15342
109 68.92189 68.92251
110 69.79762 69.51888
111 69.86632 70.31539
112 70.60662 70.9007
113 71.63203 71.50578
114 72.18227 72.20686
115 72.97624 72.81664
116 73.75372 73.52561
117 74.66983 74.32013
118 75.85329 75.04207
119 76.38837 75.90256
120 **76.88488** 76.41772
121 77.38139 77.11913
122 78.25631 78.01459
123 **78.72729** 78.76607
124 79.19827 79.56496
125 **79.792955** 79.69939
126 80.38764 80.52383
127 80.87278 81.27236
128 81.76581 81.94386
129 83.02712 83.01913
130 83.48909 83.52692
131 84.82759 84.52093
132 85.28537 85.26204
133 85.992775 85.93992
134 86.70018 86.70402
135 87.35397 87.58891
136 88.69969 88.27903
The expected interpolated values in column 1 are marked with double asterisks. Here, the interpolation for the i-th cell is done by averaging the (i-1)-th and (i+1)-th cells (simple linear interpolation).
The main purpose of doing this is to perform clustering, since column/row vectors of unequal length cannot be used for clustering.
Is there any code to do that?
Or can we calculate distances using DTW (Dynamic Time Warping), or some other method that handles column vectors of unequal length (as in the example dataset), and then perform clustering?
I am completely new at this and here, so please have mercy.
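A minimal sketch of the interpolation route, assuming the sheet has already been read into a data frame df whose shorter columns are padded with NA (which is what readxl and similar readers produce). It uses base R's approx() to linearly resample every column onto the length of the longest one:
resample_to_longest <- function(df) {
  n <- max(colSums(!is.na(df)))          # length of the longest column (136 here)
  as.data.frame(lapply(df, function(col) {
    v <- col[!is.na(col)]                # drop the NA padding
    approx(seq_along(v), v, n = n)$y     # linear interpolation onto n equally spaced points
  }))
}
df_equal <- resample_to_longest(df)      # every column vector is now 136x1
Note that approx() resamples at equally spaced positions: it keeps the endpoints and interpolates linearly in between, so it will not reproduce the hand-picked insertion positions of the starred example exactly, but it does yield equal-length vectors suitable for clustering. For the DTW route, the dtw package can compute alignments and distances between series of unequal length directly, so a distance matrix for clustering can be built without any resampling.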
I want to open an ASCII data file in R.
After several different attempts, I have tried df <- read.csv("C:MyDirectory", header = FALSE, sep = "").
This has produced a table with several variables, but some rows clearly contain the wrong information, some cells are blank, and some contain NA values.
Any ideas what has gone wrong? I got the file from an official Spanish research institute:
http://www.cis.es/cis/opencm/ES/2_bancodatos/estudios/listaTematico.jsp?tema=1&todos=si
Then choose BARÓMETRO DE OCTUBRE 2017; to the right is a small link entitled "fichero de datos", which lets you download the file after providing some info. The file giving me trouble is DA3191. If anyone could go to the trouble of helping me with this, it would be awesome. Thank you.
Part 1
This looks like a fixed-width format, so you need read.fwf instead of read.csv and friends. I took a screenshot of an almost random place in that file: my hypothesis is that the 99s and 98s etc. are missing-data codes, so the first 99 (marked in yellow in my screenshot) would belong to the same column as the 4, 2, 0, etc., and the immediately following 99 (not marked) is in the same column as the 0, 5, 7, etc.
Part 2
Then look at the file ES3191 -- this looks like SPSS code (pardon my French!) containing the rules for reading in the data file. You can probably figure out the width of each column and what it contains from that file:
DATA LIST FILE= 'DA3191'
/ESTU 1-4 CUES 5-9 CCAA 10-11 PROV 12-13 MUN 14-16 TAMUNI 17 CAPITAL 18 DISTR 19-20 SECCION 21-23
ENTREV 24-27 P0 28 P0A 29-31 P1 32 P2 33 P3 34 P4 35 P5 36 P6 37 P701 38-39 P702 40-41 P703 42-43
P801 44-45 P802 46-47 P803 48-49 P901 50-51 P902 52-53 P903 54-55 P904 56-57 P905 58-59 P906 60-61
P907 62-63 P1001 64 P1002 65 P1003 66 P1101 67 P1102 68 P1103 69 P1104 70 P1201 71 P1202 72
P1203 73 P1204 74 P1205 75 P1206 76 P1207 77 P1208 78 P1209 79 P13 80-81 P13A 82-83 P1401 84-85
P1402 86-87 P1403 88-89 P1404 90-91 P1405 92-93 P1406 94-95 P1407 96-97 P1408 98-99 P1409 100-101
P1410 102-103 P1411 104-105 P1412 106-107 P1413 108-109 P1414 110-111 P1415 112-113 P1416 114-115
I'm not an SPSS expert, but I would guess that what it is trying to tell us is that
columns 1-4 contain the variable "ESTU"
columns 5-9 contain the variable "CUES"
etc
For read.fwf you have to calculate each variable's "width", i.e. 4 characters for ESTU (if my reading is right), 5 characters for CUES, and so on.
Part 3
Using the guesses above, I used the following code to read in your data, and it looks like it works:
# this is copy/pasted SPSS code from file "ES3191"
txt <- "ESTU 1-4 CUES 5-9 CCAA 10-11 PROV 12-13 MUN 14-16 TAMUNI 17 CAPITAL 18 DISTR 19-20 SECCION 21-23
ENTREV 24-27 P0 28 P0A 29-31 P1 32 P2 33 P3 34 P4 35 P5 36 P6 37 P701 38-39 P702 40-41 P703 42-43
P801 44-45 P802 46-47 P803 48-49 P901 50-51 P902 52-53 P903 54-55 P904 56-57 P905 58-59 P906 60-61
P907 62-63 P1001 64 P1002 65 P1003 66 P1101 67 P1102 68 P1103 69 P1104 70 P1201 71 P1202 72
P1203 73 P1204 74 P1205 75 P1206 76 P1207 77 P1208 78 P1209 79 P13 80-81 P13A 82-83 P1401 84-85
P1402 86-87 P1403 88-89 P1404 90-91 P1405 92-93 P1406 94-95 P1407 96-97 P1408 98-99 P1409 100-101
P1410 102-103 P1411 104-105 P1412 106-107 P1413 108-109 P1414 110-111 P1415 112-113 P1416 114-115
P1501 116-117 P1502 118-119 P1503 120-121 P1504 122-123 P1505 124-125 P1506 126-127 P1507 128-129
P1508 130-131 P1509 132-133 P1510 134-135 P1511 136-137 P1512 138-139 P1513 140-141 P1514 142-143
P1515 144-145 P1516 146-147 P16 148 P17 149 P1801 150-151 P1802 152-153 P1803 154-155 P1804 156-157
P1805 158-159 P1806 160-161 P1807 162-163 P1808 164-165 P1809 166-167 P1810 168-169 P1811 170-171
P1812 172-173 P1813 174-175 P19 176 P20 177 P21 178-179 P22 180-181 P23 182-183 P2401 184-185
P2402 186-187 P2403 188-189 P2404 190-191 P2405 192-193 P2406 194-195 P2407 196-197 P2408 198-199
P2409 200-201 P2410 202-203 P2411 204-205 P2412 206-207 P2413 208-209 P2414 210-211 P2415 212-213
P2416 214-215 P25 216 P26 217 P27 218 P27A 219-220 P28 221-222 P29 223 P30 224-225 P31 226 P31A 227-228
P32 229 P32A 230 P33 231 P34 232 P35 233 P35A 234 P36 235 P37 236 P37A 237 P37B 238 P38 239-241
P39 242 P39A 243 P40 244-246 P41 247-248 P42 249-250 P43 251 P43A 252 P43B 253 P44 254 P4501 255
P4502 256 P4503 257 P4504 258 P4601 259-261(A) P4602 262-264(A) P4603 265-267(A) P4604 268-270(A)
P4605 271-273(A) P4701 274-276(A) P4702 277-279(A) P4703 280-282(A) P4704 283-285(A) P4705 286-288(A)
P48 289 P49 290 P50 291 P51 292 I1 293-295 I2 296-298 I3 299-301 I4 302-304 I5 305-307 I6 308-310
I7 311-313 I8 314-316 I9 317-319 E101 320-321 E102 322-323 E103 324-325 E2 326 E3 327-329 E4 330
C1 331 C1A 332-333 C2 334 C2A 335 C2B 336-337 C3 338 C4 339-340 P21R 341-342 P22R 343-344 VOTOSIMG 345-346
P27AR 347-348 RECUERDO 349-350 ESTUDIOS 351 OCUMAR11 352-353 RAMA09 354 CONDICION11 355-356
ESTATUS 357 "
# making a 2-column matrix (name = left column, position = right column)
m <- matrix(scan(text=txt, what=""), ncol=2, byrow=TRUE)
m <- as.data.frame(m, stringsAsFactors=FALSE)
names(m) <- c("Var", "Pos")
pos <- sub("(A)", "", m$Pos, fixed = TRUE) # some entries contain '(A)' - no idea what it means so deleting it
pos <- strsplit(pos, "-")
starts <- as.numeric(sapply(pos, head, 1)) # first element = start position
ends <- as.numeric(sapply(pos, tail, 1))   # last element = end position
w <- ends - starts + 1                     # widths for read.fwf
MyData <- read.fwf("R/MD3191/DA3191", widths = w)
names(MyData) <- m$Var
head(MyData)
# ESTU CUES CCAA PROV MUN TAMUNI CAPITAL DISTR SECCION ENTREV P0 P0A P1 P2 P3 P4 P5 P6
# 1 3191 1 16 1 59 5 1 0 0 0 1 0 3 2 2 5 1 2
# 2 3191 2 16 1 59 5 1 0 0 0 1 0 4 2 3 5 2 3
# 3 3191 3 16 1 59 5 1 0 0 0 1 0 4 2 2 4 2 2
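If the 98/99 values really are missing-data codes, as hypothesised in Part 1, a crude follow-up sketch would recode them to NA after reading. This blanket recode assumes 98/99 never occur as genuine values; per-variable recoding based on the MISSING VALUES rules in ES3191 would be safer:
MyData[MyData == 98 | MyData == 99] <- NA  # assumption: 98/99 always mean 'missing'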
I need to calculate the mean value of each row (mean of interval). Here is a basic example (maybe someone has an even better idea how to do it):
M_31_mb <- 15:-15            # creating a small vector
M_31 <- cut(M_31_mb, 128)    # cutting the small vector into 128 groups
# M_1_mb <- 1500:-1500       # creating a bigger vector
# M_1 <- cut(M_1_mb, 128)    # cutting the bigger vector into 128 groups
I need to get the mean value of each row/group out of the 128 intervals created in M_1 (actually I do not even need those intervals, I just need their means), and I cannot figure out how to do it...
I had a look at the cut2 function from the Hmisc library, but unfortunately there is no option to set the number of intervals into which the vector is to be cut (there is, however, an option to get the mean value of the created intervals: levels.mean).
I would appreciate any help! Thanks!
Additional info:
The cut2 function works well for the bigger vector (M_1_mb); however, when my vector is small (M_31_mb), I get a warning:
Warning message:
In min(xx[xx > upper]) : no non-missing arguments to min; returning Inf
and only 31 groups are created:
M_31_mb <- 15:-15 # smaller vector
M_31 <- table(cut2(M_31_mb, g = 128, levels.mean = TRUE))
where g = the number of quantile groups.
Like this?
aggregate(M_1_mb, by = list(M_1), mean)
EDIT: Result
Group.1 x
1 (-1.5e+03,-1.48e+03] -1488.5
2 (-1.48e+03,-1.45e+03] -1465.0
3 (-1.45e+03,-1.43e+03] -1441.5
4 (-1.43e+03,-1.41e+03] -1418.0
5 (-1.41e+03,-1.38e+03] -1394.5
6 (-1.38e+03,-1.36e+03] -1371.0
7 (-1.36e+03,-1.34e+03] -1347.5
8 (-1.34e+03,-1.31e+03] -1324.0
9 (-1.31e+03,-1.29e+03] -1301.0
10 (-1.29e+03,-1.27e+03] -1277.5
11 (-1.27e+03,-1.24e+03] -1254.0
12 (-1.24e+03,-1.22e+03] -1230.5
13 (-1.22e+03,-1.2e+03] -1207.0
14 (-1.2e+03,-1.17e+03] -1183.5
15 (-1.17e+03,-1.15e+03] -1160.0
16 (-1.15e+03,-1.12e+03] -1136.5
17 (-1.12e+03,-1.1e+03] -1113.0
18 (-1.1e+03,-1.08e+03] -1090.0
19 (-1.08e+03,-1.05e+03] -1066.5
20 (-1.05e+03,-1.03e+03] -1043.0
21 (-1.03e+03,-1.01e+03] -1019.5
22 (-1.01e+03,-984] -996.0
23 (-984,-961] -972.5
24 (-961,-938] -949.0
25 (-938,-914] -926.0
26 (-914,-891] -902.5
27 (-891,-867] -879.0
28 (-867,-844] -855.5
29 (-844,-820] -832.0
30 (-820,-797] -808.5
31 (-797,-773] -785.0
32 (-773,-750] -761.5
33 (-750,-727] -738.0
34 (-727,-703] -715.0
35 (-703,-680] -691.5
36 (-680,-656] -668.0
37 (-656,-633] -644.5
38 (-633,-609] -621.0
39 (-609,-586] -597.5
40 (-586,-562] -574.0
41 (-562,-539] -551.0
42 (-539,-516] -527.5
43 (-516,-492] -504.0
44 (-492,-469] -480.5
45 (-469,-445] -457.0
46 (-445,-422] -433.5
47 (-422,-398] -410.0
48 (-398,-375] -386.5
49 (-375,-352] -363.0
50 (-352,-328] -340.0
51 (-328,-305] -316.5
52 (-305,-281] -293.0
53 (-281,-258] -269.5
54 (-258,-234] -246.0
55 (-234,-211] -222.5
56 (-211,-188] -199.0
57 (-188,-164] -176.0
58 (-164,-141] -152.5
59 (-141,-117] -129.0
60 (-117,-93.8] -105.5
61 (-93.8,-70.3] -82.0
62 (-70.3,-46.9] -58.5
63 (-46.9,-23.4] -35.0
64 (-23.4,0] -11.5
65 (0,23.4] 12.0
66 (23.4,46.9] 35.0
67 (46.9,70.3] 58.5
68 (70.3,93.8] 82.0
69 (93.8,117] 105.5
70 (117,141] 129.0
71 (141,164] 152.5
72 (164,188] 176.0
73 (188,211] 199.0
74 (211,234] 222.5
75 (234,258] 246.0
76 (258,281] 269.5
77 (281,305] 293.0
78 (305,328] 316.5
79 (328,352] 340.0
80 (352,375] 363.5
81 (375,398] 387.0
82 (398,422] 410.0
83 (422,445] 433.5
84 (445,469] 457.0
85 (469,492] 480.5
86 (492,516] 504.0
87 (516,539] 527.5
88 (539,562] 551.0
89 (562,586] 574.0
90 (586,609] 597.5
91 (609,633] 621.0
92 (633,656] 644.5
93 (656,680] 668.0
94 (680,703] 691.5
95 (703,727] 715.0
96 (727,750] 738.5
97 (750,773] 762.0
98 (773,797] 785.0
99 (797,820] 808.5
100 (820,844] 832.0
101 (844,867] 855.5
102 (867,891] 879.0
103 (891,914] 902.5
104 (914,938] 926.0
105 (938,961] 949.0
106 (961,984] 972.5
107 (984,1.01e+03] 996.0
108 (1.01e+03,1.03e+03] 1019.5
109 (1.03e+03,1.05e+03] 1043.0
110 (1.05e+03,1.08e+03] 1066.5
111 (1.08e+03,1.1e+03] 1090.0
112 (1.1e+03,1.12e+03] 1113.5
113 (1.12e+03,1.15e+03] 1137.0
114 (1.15e+03,1.17e+03] 1160.0
115 (1.17e+03,1.2e+03] 1183.5
116 (1.2e+03,1.22e+03] 1207.0
117 (1.22e+03,1.24e+03] 1230.5
118 (1.24e+03,1.27e+03] 1254.0
119 (1.27e+03,1.29e+03] 1277.5
120 (1.29e+03,1.31e+03] 1301.0
121 (1.31e+03,1.34e+03] 1324.0
122 (1.34e+03,1.36e+03] 1347.5
123 (1.36e+03,1.38e+03] 1371.0
124 (1.38e+03,1.41e+03] 1394.5
125 (1.41e+03,1.43e+03] 1418.0
126 (1.43e+03,1.45e+03] 1441.5
127 (1.45e+03,1.48e+03] 1465.0
128 (1.48e+03,1.5e+03] 1488.5
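A possible follow-up for the small vector, where most of the 128 intervals are empty: aggregate() drops empty groups, while tapply() keeps every factor level and returns NA for the intervals that caught no values (a sketch):
M_31_mb <- 15:-15
M_31 <- cut(M_31_mb, 128)
tapply(M_31_mb, M_31, mean)   # length 128, NA for empty intervals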