Create data frame from matrix elements - r

I Have a matrix 'Feats' which has 3 columns of data with feature titles:
Feats <- structure(list(6.25, 3.125, 21.875, NA, NA, NA, 0.0625, 2L, 1L,
7L, 22L, NA, -2.22250786972976, 1.29105036381309, 0.291962041644914,
-0.236742861516547, NA, NA, 206.568058210094, 26.5635498091798,
236.313419096143, NA, -177.80062957838, 206.568058210094,
6.6734180947409, -1.72176626557489, NA, 0.03, 0.1225, 0.37,
NA, 0.0281666666666667, 0.0338, 0.338, 0.338, 0.124, 19,
7.8291135544129, 6.09286912790999, 1.84893765231614, 0.567403653479842,
NA, NA, structure(2L, .Label = c("", "Resting"), class = "factor"),
12.5, 9.375, 25, NA, NA, NA, 0.13125, 4L, 3L, 8L, 17L, NA,
-4.61233109314598, 2.80969059774635, 0.310781140139641, 2.01618235362392,
NA, NA, 247.38710328687, 30.960434438506, 270.000001621512,
NA, -184.493243725839, 149.850165213139, 6.21562280279281,
18.9758339164604, NA, 0.06, 0.16390625, 0.34, NA, 0.0316923076923077,
0.0312857142857143, 0.412, 0.438, 0.062, 24, 10.757380314083,
4.99655985128538, 1.32689481272002, 0.497119334728677, NA,
NA, structure(2L, .Label = c("", "Resting"), class = "factor"),
28.125, 12.5, 18.75, NA, NA, NA, 0.1375, 9L, 4L, 6L, 13L,
NA, -8.28559386168768, 4.93024930500612, 0.000801170003772261,
3.3100091226664, NA, NA, 285.259587496729, 26.5665579000233,
246.377341685035, NA, -147.299446430003, 197.209972200245,
0.021364533433927, 40.7385738174326, NA, 0.06, 0.248125,
0.62, NA, 0.0587272727272727, 0.033375, 0.646, 0.534, 0.124,
18, 13.5914203301306, 4.06478678366543, 1.41204036306107,
0.497119356632411, NA, NA, structure(2L, .Label = c("", "Resting"
), class = "factor")), .Dim = c(44L, 3L), .Dimnames = list(
c("movPercentLeft", "movPercentRight", "movPercentForward",
"movPercentUTurn", "movPercentNonMov", "changeRateMovNonMov",
"changeRateBetweenAnyMov", "headingAccumLeft", "headingAccumRight",
"accumForward", "accumNonMoving", "accumUTurn", "rateOfChangeLeft",
"rateOfChangeRight", "rateOfChangeForward", "rateOfChangeNonMoving",
"rateOfChangeUTurn", "maxHeadingChangeLeft", "maxHeadingChangeRight",
"maxHeadingChangeForward", "maxHeadingChangeNonMoving", "maxHeadingChangeUTurn",
"meanHeadingChangePerLeft", "meanHeadingChangePerRight",
"meanHeadingChangePerForward", "meanHeadingChangePerNonMoving",
"meanHeadingChangePerUTurn", "minSpeed", "meanSpeed", "maxSpeed",
"minAccel", "meanAccelPos", "meanAccelNeg", "accumAccelPos",
"accumAccelNeg", "maxAccel", "changesPosNeg", "accumDistanceMov",
"accumDistanceNonMov", "maxDistanceMov", "maxDistanceNonMov",
"accumTimeMov", "accumTimeNonMov", "Class"), c("1", "2",
"3")))
I would like to create a single data frame 'Features' such that:
Features <- data.frame(Feats[ ,1], Feats [ ,2], Feats[ ,3])
This example has 3 feats columns but 'Features' could have many eventually. I am thinking something along the lines of a for loop to achieve this but is there something more elegant?

Related

How to produce histograms from large table set of mammals. new to coding

I am trying to create a histogram for the geographic range size of mammals.
I have data columns with geographic range size and different mammal names.
hist1 = hist(x = dat1$Range_Area_km2,
breaks = 100,
col = "blue",
border = "green",
main = "Geographic Range Size of Mammals", # add a title
xlab = "Km2", # change axis labels
ylab = "Frequency",
xlim = c(0, 65000000), # change the axis limits
ylim = c(0, 200))
I am expecting a histogram of geographic range size of all mammals.
structure(list(rownames = 1:20, MSW05_Binomial = c("Melomys_rubicola",
"Peromyscus_dickeyi", "Mogera_uchidai", "Pteropus_pelewensis",
"Mysateles_garridoi", "Neotoma_anthonyi", "Microtus_breweri",
"Hypsugo_lophurus", "Neotoma_martinensis", "Pteropus_howensis",
"Crocidura_dhofarensis", "Mesocapromys_angelcabrerai", "Ochotona_gaoligongensis",
"Myotis_abei", "Rhynchomys_isarogensis", "Calomyscus_tsolovi",
"Monodelphis_unistriata", "Geocapromys_ingrahami", "Hesperoptenus_gaskelli",
"Neotoma_bunkeri"), MSW05_Order = c("Rodentia", "Rodentia", "Soricomorpha",
"Chiroptera", "Rodentia", "Rodentia", "Rodentia", "Chiroptera",
"Rodentia", "Chiroptera", "Soricomorpha", "Rodentia", "Lagomorpha",
"Chiroptera", "Rodentia", "Rodentia", "Didelphimorphia", "Rodentia",
"Chiroptera", "Rodentia"), MSW05_Family = c("Muridae", "Cricetidae",
"Talpidae", "Pteropodidae", "Capromyidae", "Cricetidae", "Cricetidae",
"Vespertilionidae", "Cricetidae", "Pteropodidae", "Soricidae",
"Capromyidae", "Ochotonidae", "Vespertilionidae", "Muridae",
"Calomyscidae", "Didelphidae", "Capromyidae", "Vespertilionidae",
"Cricetidae"), AdultBodyMass_g = c(100, 28.6, NA, NA, NA, 195.4,
NA, NA, 240.16, 232.92, NA, NA, NA, NA, 122.29, NA, 55.3, 733.52,
NA, 375), LitterSize = c(NA, NA, NA, NA, NA, NA, 4.5, NA, NA,
0.98, NA, NA, NA, NA, NA, NA, NA, 1.04, NA, NA), Range_Area_km2 = c(0.000187,
0.00725, 0.03, 0.06, 0.49, 0.66, 1.85, 2.7, 2.8, 3.16, 3.52,
3.55, 4.89, 5.11, 5.28, 6.78, 6.86, 8.36, 8.89, 9.15), Precip_Mean_mm = c(NA,
NA, NA, NA, NA, NA, NA, 290, NA, NA, 7, 111, 195, 52, 331, 18,
107, NA, 237, NA), Temp_Mean_degC = c(NA, NA, NA, NA, NA, NA,
NA, 24.6, NA, NA, 21.1, 24.6, 11.3, -0.9, 25.2, 17.1, 17.9, NA,
20.6, NA), ActivityCycle = c(NA, 1L, 2L, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, 2L, NA, 1L, 1L, NA, NA), DietBreadth = c(NA,
NA, 1L, NA, 3L, NA, 1L, NA, NA, NA, NA, 3L, NA, NA, 1L, NA, 1L,
2L, NA, NA), HabitatBreadth = c(NA, NA, 1L, NA, 1L, NA, 2L, 1L,
NA, 1L, NA, 1L, 1L, 1L, NA, NA, 1L, 2L, 1L, NA), Terrestriality = c(NA,
NA, 1L, NA, 2L, NA, 1L, 2L, NA, 2L, NA, 2L, 1L, 2L, NA, NA, 1L,
2L, 2L, NA), TrophicLevel = c(NA, NA, 3L, NA, 2L, NA, 1L, NA,
NA, NA, NA, 2L, NA, NA, 3L, NA, 3L, 1L, NA, NA)), row.names = c(NA,
20L), class = "data.frame")
>

Deleting a line if multiple columns are NA - R solution

I want to delete rows only when selected columns are NA.
Data here:
dput(df)
structure(list(record_id = c("BIV-1601-1250-E1", "BIV-1601-1250-E1",
"BIV-1601-1250-E1", "BIV-1601-1250-E1", "BIV-1601-1250-E1", "BIV-1601-1719-E1",
"BIV-1601-1719-E1", "BIV-1601-1719-E1", "BIV-1601-1719-E1", "BIV-1601-1719-E1",
"BIV-1402-1368-E1", "BIV-1402-1368-E1", "BIV-1402-1368-E1", "BIV-1402-1368-E1",
"BIV-1402-1368-E1", "BIV-1101-1038-E1", "BIV-1101-1038-E1", "BIV-1101-1038-E1",
"BIV-1101-1038-E1", "BIV-1101-1038-E1", "BIV-1701-1145-E1", "BIV-1701-1145-E1",
"BIV-1701-1145-E1", "BIV-1701-1145-E1", "BIV-1701-1145-E1", "BIV-1102-2040-E1",
"BIV-1102-2040-E1", "BIV-1102-2040-E1", "BIV-1102-2040-E1", "BIV-1102-2040-E1"
), DATE = structure(c(NA, 17478, 17480, 17479, NA, 18295, NA,
18296, 18296, NA, NA, 17912, 17914, 17934, NA, 17221, 17221,
17223, 17224, NA, NA, 17820, 17822, 17823, NA, NA, 18359, 18361,
18361, NA), class = "Date"), haemoglobin = structure(c(NA, 101,
NA, NA, NA, 100, NA, NA, NA, NA, NA, 97.6, NA, NA, NA, NA, 109,
NA, NA, NA, NA, 120, NA, NA, NA, NA, 205, NA, NA, NA), label = "g/L", class = c("labelled",
"numeric")), WBC = structure(c(NA, NA, "5", NA, NA, NA, "27.6",
NA, NA, NA, NA, NA, "8.8", NA, NA, NA, NA, "10.3", NA, NA, NA,
NA, "23.5", NA, NA, NA, NA, "11.81", NA, NA), label = "10^9/L", class = c("labelled",
"character")), CRP = c(NA, NA, "9", NA, NA, NA, "499", NA, NA,
NA, NA, NA, "7", NA, NA, NA, "43", "54.4", NA, NA, NA, NA, "37",
NA, NA, NA, NA, "<4.0", NA, NA), admission_day = c(NA, 0L, 2L,
1L, NA, 1L, NA, 2L, 2L, NA, NA, 1L, 3L, 23L, NA, 0L, 0L, 2L,
3L, NA, NA, 0L, 2L, 3L, NA, NA, 0L, 2L, 2L, NA)), row.names = c(NA,
-30L), groups = structure(list(record_id = c("BIV-1101-1038-E1",
"BIV-1102-2040-E1", "BIV-1402-1368-E1", "BIV-1601-1250-E1", "BIV-1601-1719-E1",
"BIV-1701-1145-E1"), .rows = structure(list(16:20, 26:30, 11:15,
1:5, 6:10, 21:25), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, 6L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
I only want to drop the lines when the following columns DATE, haemoglobin, CRP, WBC, and admission_day all equal NA. My thoughts were something like this:
library(dplyr)
cols_to_drop <- c("DATE", "haemoglobin", "CRP", "WBC", "admission_day")
df <- df %>% mutate(case_when(is.na(cols_to_drop) ~ drop_na(DATE)))
Obviously (as usual for me) this doesn't work... II think it's something to do with needing to make case_when equal to a particular variable... but I want it to apply across the whole dataframe.
If someone can help, I'd be grateful!
You can use if_all/if_any -
library(dplyr)
cols_to_drop <- c("DATE", "haemoglobin", "CRP", "WBC", "admission_day")
df %>% filter(!if_all(cols_to_drop, is.na))
With if_any -
df %>% filter(if_any(cols_to_drop, Negate(is.na)))

Time-varying covariates: formatting for one categorical variable with 3 levels

(This is an edited version of a previously closed question)
I have a data.frame (condensed to testdata) of several demographic variables in addition to one variable with three levels for three possible comorbidities.
dput(testdata)
structure(list(age = c(31L, 48L, 19L, 23L, 24L, 24L, 40L, 22L,
25L, 20L, 39L, 26L, 28L, 27L, 25L), gender = structure(c(2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("F",
"M"), class = "factor"), race = structure(c(NA, NA, 1L, NA, 1L,
NA, NA, NA, 2L, 1L, NA, 3L, NA, 1L, NA), .Label = c("C", "M",
"N", "R", "Z"), class = "factor"), Time1 = c(NA, NA, NA, 319,
NA, 133, NA, 121, NA, NA, 30, NA, NA, NA, NA), Time2 = c(NA,
109, NA, NA, NA, NA, NA, NA, NA, NA, NA, 108, 52, NA, NA), Time3 = c(NA,
73, NA, NA, 4, NA, NA, 121, NA, NA, 2, NA, NA, NA, NA), OutcomeTime = c(4380,
199, 4380, 4380, 4380, 4380, 4380, 4380, 4380, 4380, 196, 4380,
4380, 4380, 4380), CoMo1 = c(NA, NA, NA, 1, NA, 1, NA, 1, NA,
NA, 1, NA, NA, NA, NA), CoMo2 = c(NA, 2, NA, NA, NA, NA, NA,
NA, NA, NA, NA, 2, 2, NA, NA), CoMo3 = c(NA, 3, NA, NA, 3, NA,
NA, 3, NA, NA, 3, NA, NA, NA, NA), Outcome = c(0, 1, 0, 0, 0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0), ID = 1:15), class = "data.frame", row.names =
c("1",
"4", "6", "7", "8", "11", "14", "18", "19", "26", "27", "28",
"30", "31", "38"))
Time1, Time2, Time3 are the times at which the ID had the comorbidity, CoMo1, CoMo2, CoMo3 respectively. The Outcome variable is whether death occurred or not. The OutcomeTime is when death occurred or if they patient was followed to the end of the study with no death.
In trying to set this up using tmerge(), I've had some difficulty with the commands (first time doing this!):
newdata<-tmerge(data1=testdata[,c(1:3,12)], data2=testdata, id=ID,
tstop=OutcomeTime, tstart=0)
newdata<-tmerge(newdata, testdata, id=ID, Comorbid=event(OutcomeTime))
newdata<-tmerge(newdata, testdata, id=ID, Comorbid=event(Time1))
newdata<-tmerge(newdata, testdata, id=ID, Comorbid=event(Time2))
newdata<-tmerge(newdata, testdata, id=ID, Comorbid=event(Time3))
newdata<-tmerge(newdata,newdata, ID, enum=cumevent(tstart))
I've tried several renditions with the above and I'm not getting quite what I want:
1) I would like a column specifying which comorbidity is occurring in the interval
2) I would like a column for the Outcome variable. As it is currently, it looks like maybe for IDs that have comorbities, it's adding another time interval for the Outcome but it's not being recorded anywhere. It really should be it's own variable.
3) Can someone explain the difference between when to use event, cumevent, tdc, and cumtdc?
Thank you!

R Problems with glm-model due to missing values

I have problems with putting my data into a glm model. I think the problem is because I have many missing values in my data (below). I tried this so far:
baseformula = as.formula(df)
glm(baseformula, data = df, family = poisson(link = "log"), na.action = na.exclude)
I am getting an Error:
Error in glm.fit(x = numeric(0), y = integer(0), weights = NULL, start
= NULL, : object 'fit' not found
Can somebody help me with this? When a variable is NA in my formula, I just want the glm to ignore the NAs and use these variables the same as variables without NA.
structure(list(V1 = c(0L, 1L, 3L, 0L, 0L, 0L, 2L, 0L, 1L, 1L,
0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 2L, 0L, 0L, 0L, 0L,
0L, 2L, 0L, 0L, 1L, 5L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 3L, 0L, 1L,
0L), V48 = c(97.33, 96.88, 85.33, 83.75, 75.58, 86.13, 83, 95.75,
88.46, 80.25, 75, 67.17, 69.33, 64.08, 70.75, 78.46, 85.58, 83.42,
96.17, 76.5, 76.42, 65.38, 69.79, 68.38, 84.67, 89.67, 91.29,
80.54, 64.63, 72.29, 76.54, 65.33, 96.92, 91.38, 88.92, 80.63,
85.5, 76.38, 76.21, 78.29, 89.29, 87.04, 78.67), V49 = c(-0.9,
-0.1, 0, 0.9, -0.2, -6.3, -4.9, -1.2, -0.3, -1.4, 7.3, 10.5,
10.8, 17.5, 10.8, 9.2, 7.3, 8.2, 10.2, 8.5, 10.4, 25.6, 26.7,
28, 20.1, 20.2, 15.7, 15.3, 21.6, 24.8, 22.4, 27.1, 14.3, 13.8,
17.1, 19.5, 22.9, 21.9, 17.2, 18.9, 16.3, 14.2, 18.5), V58 = c(0.16208333,
-0.02576069, -0.24859501, -0.39733779, -0.35568168, -0.13908246,
-0.11529523, -0.07094469, 0.07592036, 0.13803538, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), V59 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, 0.40727943, 0.44007391, 0.50582446, 0.59001139,
0.55057958, 0.53888617, 0.55019019, 0.42592698, 0.347516, 0.52019593,
0.69611622, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), V61 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, 0.04555282, 0.16109391, 0.13651381, -0.02339007,
-0.24799358, -0.14477839, -0.0845835, -0.13505766, -0.06910931,
0.05876354, 0.11372484, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA), V68 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 0.01575957,
-0.19924471, -0.39083879, -0.26620543, -0.10669409, -0.05650572,
0.06644096, 0.24769837, -0.11404654, -0.49358358, -0.27725445,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA), V71 = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, -0.1563703, -0.23797044, -0.37304736, -0.27425744,
-0.02347071, 0.36391633, 0.44316418, 0.21940339, 0.02321926,
-0.01531807, -0.05197635, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), V73 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -0.46298985,
-0.7644245, -0.82771396, -0.81243484, -0.75591058, -0.55440085,
-0.35516327, -0.05602486, -0.12290976, -0.14458255, -0.17033091
), V77 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, -0.04571093, 0.25592819, 0.35649173, 0.3507695, 0.30446594,
0.36505183, 0.54215354, 0.47808018, 0.40325075, 0.32091592, 0.09212919
)), .Names = c("V1", "V48", "V49", "V58", "V59", "V61", "V68",
"V71", "V73", "V77"), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 7L,
8L, 9L, 10L, 60L, 61L, 62L, 63L, 64L, 65L, 66L, 67L, 68L, 69L,
70L, 152L, 153L, 154L, 155L, 156L, 157L, 158L, 159L, 160L, 161L,
162L, 244L, 245L, 246L, 247L, 248L, 249L, 250L, 251L, 252L, 253L,
254L), class = "data.frame")

How to get conditional weighted means for several columns

For the following dataframe:
eu <- structure(list(land = structure(c(1L, 4L, 5L, 12L, 9L, 13L, 16L, 18L, 27L, 10L, 25L, 21L, 28L, 19L, 8L, 26L, 6L, 3L, 15L, 14L, 11L, 17L, 20L, 23L, 24L, 2L, 22L, 7L), .Label = c("Belgie", "Bulgarije", "Cyprus", "Denemarken", "Duitsland", "Estland", "Europese Unie", "Finland", "Frankrijk", "Griekenland", "Hongarije", "Ierland", "Italie", "Letland", "Litouwen", "Luxemburg", "Malta", "Nederland", "Oostenrijk", "Polen", "Portugal", "Roemenie", "Slovenie", "Slowakije", "Spanje", "Tsjechie", "Verenigd Koninkrijk", "Zweden"), class = "factor"), `1979` = c(91.36, 47.82, 65.73, 63.61, 60.71, 85.65, 88.91, 58.12, 32.35, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 61.99), `1981` = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 81.48, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), `1984` = c(92.09, 52.38, 56.76, 47.56, 56.72, 82.47, 88.79, 50.88, 32.57, 80.59, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 58.98), `1987` = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 68.52, 72.42, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), `1989` = c(90.73, 46.17, 62.28, 68.28, 48.8, 81.07, 87.39, 47.48, 36.37, 80.03, 54.71, 51.1, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 58.41), `1994` = c(90.66, 52.92, 60.02, 43.98, 52.71, 73.6, 88.55, 35.69, 36.43, 73.18, 59.14, 35.54, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 56.67), `1995` = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 41.63, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), `1996` = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 67.73, 57.6, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), `1999` = c(91.05, 50.46, 45.19, 50.21, 46.76, 69.76, 87.27, 30.02, 24, 70.25, 63.05, 39.93, 38.84, 49.4, 30.14, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 49.51), `2004` = c(90.81, 47.89, 43, 58.58, 42.76, 71.72, 91.35, 39.26, 38.52, 63.22, 45.14, 38.6, 37.85, 42.43, 39.43, 28.3, 26.83, 72.5, 48.38, 41.34, 38.5, 82.39, 20.87, 28.35, 16.97, NA, NA, 45.47), `2007` = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 29.22, 29.47, NA), `2009` = c(90.39, 59.54, 43.3, 58.64, 40.63, 65.05, 90.75, 36.75, 34.7, 52.61, 44.9, 36.78, 45.53, 45.97, 40.3, 28.2, 43.9, 59.4, 20.98, 53.7, 36.31, 78.79, 24.53, 28.33, 19.64, 38.99, 27.67, 43), inwoners = c(11161642, 5602628, 80523746, 4591087, 65578819, 59685227, 537039, 16779575, 63896071, 11062508, 46727890, 10487289, 9555893, 8451860, 5426674, 10516125, 1320174, 865878, 2971905, 2023825, 9908798, 421364, 38533299, 2058821, 5410836, 7284552, 20020074, 501403599), plicht = structure(c(1L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("ja", "nee"), class = "factor")), .Names = c("land", "1979", "1981", "1984", "1987", "1989", "1994", "1995", "1996", "1999", "2004", "2007", "2009", "inwoners", "plicht"), row.names = c(NA, -28L), class = "data.frame")
I need conditional column means. I can do that with:
verplicht <- c("Europese Unie (stemplicht)", colMeans(eu[eu$plicht=="ja",c(2:13)], na.rm=TRUE), NA)
vrij <- c("Europese Unie (geen stemplicht)", colMeans(eu[eu$plicht=="nee",c(2:13)], na.rm=TRUE), NA)
eu2 <- rbind(eu, verplicht, vrij)
However, I need weighted column means with country population (the inwoners column) as the weights. I tried to that with:
verplicht <- c("Europese Unie (stemplicht)", lapply(eu[eu$plicht=="ja",c(2:13)], weighted.mean(x, eu[eu$plicht=="ja",14], na.rm=TRUE)), NA)
but that resulted in the following error:
Error in weighted.mean.default(x, eu[eu$plicht == "ja", 14], na.rm = TRUE) :
'x' and 'w' must have the same length
I understand what the error-message is saying, but don't know how to solve this. Any suggestions?
The problem is with how you're using lapply. Here's the correct code:
lapply(eu[eu$plicht=='ja',2:13], weighted.mean, eu[eu$plicht=='ja','inwoners'], na.rm=TRUE)
lapply(eu[eu$plicht=='nee',2:13], weighted.mean, eu[eu$plicht=='nee','inwoners'], na.rm=TRUE)
Notice how weighted.mean is used as an argument, rather than inside an anonymous function with x as an argument. You could equivalently do:
lapply(eu[eu$plicht=='ja',2:13], function(x) weighted.mean(x, eu[eu$plicht=='ja','inwoners'], na.rm=TRUE))
lapply(eu[eu$plicht=='nee',2:13], function(x) weighted.mean(x, eu[eu$plicht=='nee','inwoners'], na.rm=TRUE))
But you're currently kind of mixing the two different ways of using lapply.
If inwoners is the population, then
> (weights <- with(eu, inwoners/sum(inwoners)))
# [1] 0.0111303968 0.0055869443 0.0802983327 0.0045782350 0.0653952416
# [6] 0.0595181478 0.0005355356 0.0167326033 0.0637172042 0.0110315403
# [11] 0.0465970828 0.0104579315 0.0095291428 0.0084282004 0.0054114829
# [16] 0.0104866868 0.0013164784 0.0008634541 0.0029635856 0.0020181596
# [21] 0.0098810599 0.0004201845 0.0384254312 0.0020530577 0.0053956892
# [26] 0.0072641601 0.0199640310 0.5000000000
and the weighted mean of the 2004 column, for example, is
> weighted.mean(eu$`2004`, w = weights, na.rm = TRUE)
# [1] 45.31782
To get the weighted mean of each of the year columns for when plicht == 'ja',
> s <- subset(eu, plicht == "ja")
> w2 <- weights[as.numeric(rownames(s))]
> newDF <- do.call(rbind, lapply(2:13, function(i){
data.frame(wtMean.ja = weighted.mean(s[,i], w = w2, na.rm = TRUE))
}))
> rownames(newDF) <- names(s)[2:13]
> newDF
# wtMean.ja
# 1979 86.56735
# 1981 81.48000
# 1984 83.56127
# 1987 68.52000
# 1989 72.30636
# 1994 69.86950
# 1995 NaN
# 1996 NaN
# 1999 69.28708
# 2004 63.17060
# 2007 NaN
# 2009 58.99465

Resources