Related
I have data of fish stomach contents (prey items).
In my original df, each fish (with a unique FID) had multiple rows(observations) - one row per unique prey taxon found. For example, if fish #10 had both daphnia and goby in its stomach, there were two rows for that same fish (one row with # of daphnia in that fish's stomach and one row for # of goby in that same stomach); if the fish only had daphnia in their stomach then they had one row; and so on.
I have converted my data from long to wide format to have one observation per row (one unique fish per row).
I am trying to calculate the proportion of empty stomachs by month (when totalnumPrey == 0).
Reproducible data (shortened; complete data has 488 observations):
structure(list(id = c("1001_28", "1001_29", "1001_30", "1001_31",
"1001_32", "1001_33", "1001_34", "1001_35", "1023_3", "614_1",
"614_3", "616_1", "616_3", "616_4", "616_5", "616_6", "824_23",
"824_24", "824_25", "824_26", "824_28", "824_29", "824_30", "824_31",
"824_32", "824_33", "824_35"), CRN = c(1001L, 1001L, 1001L, 1001L,
1001L, 1001L, 1001L, 1001L, 1023L, 614L, 614L, 616L, 616L, 616L,
616L, 616L, 824L, 824L, 824L, 824L, 824L, 824L, 824L, 824L, 824L,
824L, 824L), FID = c(28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L,
3L, 1L, 3L, 1L, 3L, 4L, 5L, 6L, 23L, 24L, 25L, 26L, 28L, 29L,
30L, 31L, 32L, 33L, 35L), ac = c(2L, 2L, 1L, 1L, 1L, 1L, 0L,
0L, 1L, 2L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 1L), mm = c(200L, 159L, 117L, 120L, 108L, 103L, 92L,
97L, 104L, 301L, 163L, 85L, 271L, 290L, 330L, 294L, 270L, 260L,
266L, 197L, 195L, 185L, 160L, 157L, 178L, 166L, 149L), gr = c(95,
44, 15.1, 16.1, 11, 10, 6.9, 7.9, 10.9, 418, 62, 6.8, 311, 453,
593, 395, 283, 275, 261, 96, 90, 90, 56, 50, 57, 62, 45.5), catch = c(2L,
2L, 4L, 4L, 4L, 4L, 2L, 2L, 1L, 3L, 3L, 1L, 5L, 5L, 5L, 5L, 15L,
15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 14L), Daphnia = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Byths = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
18L, 79L, 71L, 8L, 73L, 0L, 38L, 39L), Chiro.Pupae = c(0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 255L, 7L, 0L, 576L, 590L, 536L, 576L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Empty = c(0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Chiro.Larvae = c(0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 5L, 38L, 0L, 9L, 0L, 0L, 0L), Amphipod = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 6L, 0L, 0L, 0L, 4L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Isopod = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 5L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L), Chironomidae = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L), Hemimysis = c(0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L), Copepoda = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), Sphaeriidae = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), Chiro.Adult = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 74L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L), Trichopteran = c(0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L), UID.Fish = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), Chydoridae = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
200L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), Cyclopoid = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), Fish.Eggs = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), EggMass = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), Dreissena = c(0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 1L
), Goby = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Eurycercidae = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Hirudinea = c(0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), totalnumPrey = c(0, 0, 0,
0, 1, 0, 0, 0, 200, 262, 81, 0, 576, 595, 536, 582, 0, 0, 0,
19, 84, 110, 9, 82, 0, 38, 40), MONTH = c(11L, 11L, 11L, 11L,
11L, 11L, 11L, 11L, 11L, 6L, 6L, 6L, 6L, 6L, 6L, 6L, 8L, 8L,
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L), DAY = c(4L, 4L, 4L, 4L,
4L, 4L, 4L, 4L, 6L, 23L, 23L, 23L, 23L, 23L, 23L, 23L, 18L, 18L,
18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L), empty = c("Empty",
"Empty", "Empty", "Empty", "Not_empty", "Empty", "Empty", "Empty",
"Not_empty", "Not_empty", "Not_empty", "Empty", "Not_empty",
"Not_empty", "Not_empty", "Not_empty", "Empty", "Empty", "Empty",
"Not_empty", "Not_empty", "Not_empty", "Not_empty", "Not_empty",
"Empty", "Not_empty", "Not_empty")), row.names = c(NA, -27L), class = c("data.table",
"data.frame"))
I haven't been able to figure out a way to calculate proportion using counts instead of actual values (since I need to count the 0 values by group and not use the actual 0 value to calculate the proportion).
I have tried the following:
# Attempt: count rows per (empty, MONTH) group, then turn counts into proportions.
# NOTE: summarise() stores the count in a column named `totalnumPrey`, so no
# column called `n` exists when mutate() runs. As the error message shows,
# `n` is then a closure (a function), which sum() cannot handle.
example %>%
group_by(empty, MONTH) %>%
summarise(totalnumPrey = n()) %>%
mutate(prop = n / sum(n))
This gives the following error:
Error in `mutate()`:
! Problem while computing `prop = n/sum(n)`.
ℹ The error occurred in group 1: empty = "Empty".
Caused by error in `sum()`:
! invalid 'type' (closure) of argument
I also tried this:
# Within each level of `empty`, express every fish's totalnumPrey as a share
# of that level's summed totalnumPrey and attach the result as `perc`.
transform(
  example,
  perc = ave(totalnumPrey, empty, FUN = function(counts) counts / sum(counts))
)
but this doesn't give me what I need...
Also this:
# Share of all rows that fall in each month: tally the rows per MONTH, then
# divide each monthly count by the overall total.
example %>%
  group_by(MONTH) %>%
  tally() %>%
  mutate(freq = n / sum(n))
which gives me proportion by month, not what I need (i.e. for June it's doing 127/362 = 0.35)...
I have tried many other ways from examples I found in other SO posts but still can't get what I need.
Is there a way I can calculate the proportion of empty vs non-empty stomachs by month?
I also need to do this for each prey type/taxon. For example, proportion of individual fish that contain "Isopod" and so on for each unique taxon in my data. Presence/absence type of proportions.
I mainly want to do this by month first, but will eventually use other groupings.
When I had the data in long format, I was able to calculate proportion of each prey item within one fish stomach by using:
# Long-format data: within each `id`, express every `number` as a share of
# that id's summed `number` and attach the result as `perc`.
transform(
  a,
  perc = ave(number, id, FUN = function(counts) counts / sum(counts))
)
data not included here.. but 'number' here being the total count of each unique prey taxa/group per stomach/fish & 'id' unique identifier I created to distinguish between different fish (since there were multiple rows for same fish).
I am happy to clarify anything that is not clear or add additional data if needed.
I have searched online and in SO for a few days but still can't figure this out.
Thank you in advance.
I think this is what you need.
What we need to do is to count the number of times the column empty is equal to "Empty" per each group - so we can do this using sum(empty=="Empty") and then divide by the number of rows in that group n().
library(dplyr)
# For every MONTH, divide the number of rows flagged "Empty" (and the number
# not flagged "Empty") by the number of rows in that month.
summarise(
  group_by(dat, MONTH),
  prop_empty = sum(empty == "Empty") / n(),
  prop_not_empy = sum(empty != "Empty") / n()
)
# A tibble: 3 × 3
MONTH prop_empty prop_not_empy
<int> <dbl> <dbl>
1 6 0.143 0.857
2 8 0.364 0.636
3 11 0.778 0.222
I have to identify genes showing sex specific expression in 2 tissues: "pancreas" and "lung".
To do this, I first need to run a PCA to ascertain whether there is separation between tissues of different sexes (in particular, there are 3 individuals of sex 1 and 3 of sex 2 for each tissue).
I suppose that i should classify the genes in counts for sex by using the sex column in the Design list and after I should perform a PCA where different colors are assigned to sex 1 and sex 2 genes.
The problem is that even though I know what I should do to perform the PCA (if what I thought is right), I don't know how to write the code required to do it: how can I create a new data frame made up of only the genes in counts that correspond to the lung and pancreas rows in Design?
I thought of doing it this way in order to color the genes differently depending on sex (information shown in Design); if there is a simpler way, any suggestion is welcome.
dput(Design[1:10,]):
Design = structure(list(Individual = c("GTEX-Y5V6", "GTEX-1KXAM", "GTEX-18A67",
"GTEX-14BMU", "GTEX-13PVR", "GTEX-1211K", "GTEX-1KXAM", "GTEX-18A67",
"GTEX-14BMU", "GTEX-1211K"), sex = c(1L, 1L, 1L, 2L, 2L, 2L,
1L, 1L, 2L, 2L), age = c("60-69", "60-69", "50-59", "20-29",
"60-69", "60-69", "60-69", "50-59", "20-29", "60-69"), RNA.quality..max10. = c(7.1,
8.1, 7.2, 7.2, 7.3, 7, 7.2, 7.3, 7.4, 8.2), organ = c("Thyroid",
"Thyroid", "Thyroid", "Thyroid", "Thyroid", "Thyroid", "Stomach",
"Stomach", "Stomach", "Stomach"), tissue = c("Thyroid", "Thyroid",
"Thyroid", "Thyroid", "Thyroid", "Thyroid", "Stomach", "Stomach",
"Stomach", "Stomach")), row.names = c("GTEX-Y5V6-0526-SM-4VBRV",
"GTEX-1KXAM-1726-SM-D3LAE", "GTEX-18A67-0826-SM-7KFTI", "GTEX-14BMU-0226-SM-5S2QA",
"GTEX-13PVR-0626-SM-5S2RC", "GTEX-1211K-0726-SM-5FQUW", "GTEX-1KXAM-0926-SM-CXZKA",
"GTEX-18A67-2626-SM-718AD", "GTEX-14BMU-1126-SM-5RQJ8", "GTEX-1211K-1426-SM-5FQTF"
), class = "data.frame")
dput(counts[1:10,]):
structure(list(`GTEX-Y5V6-0526-SM-4VBRV` = c(0L, 1L, 2L, 1L,
0L, 0L, 0L, 0L, 0L, 214L), `GTEX-1KXAM-1726-SM-D3LAE` = c(0L,
0L, 2L, 0L, 0L, 0L, 0L, 0L, 0L, 205L), `GTEX-18A67-0826-SM-7KFTI` = c(0L,
0L, 2L, 0L, 0L, 0L, 0L, 0L, 0L, 164L), `GTEX-14BMU-0226-SM-5S2QA` = c(0L,
0L, 0L, 12L, 0L, 0L, 0L, 0L, 0L, 108L), `GTEX-13PVR-0626-SM-5S2RC` = c(0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 100L), `GTEX-1211K-0726-SM-5FQUW` = c(0L,
0L, 0L, 2L, 0L, 0L, 1L, 0L, 0L, 174L), `GTEX-1KXAM-0926-SM-CXZKA` = c(2L,
1L, 2L, 2L, 0L, 0L, 0L, 0L, 0L, 99L), `GTEX-18A67-2626-SM-718AD` = c(7L,
3L, 7L, 2L, 0L, 1L, 5L, 0L, 0L, 116L), `GTEX-14BMU-1126-SM-5RQJ8` = c(0L,
0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 44L), `GTEX-1211K-1426-SM-5FQTF` = c(4L,
0L, 5L, 2L, 0L, 0L, 0L, 0L, 0L, 143L), `GTEX-11TT1-0726-SM-5GU5A` = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 57L), `GTEX-1HCUA-1626-SM-A9SMG` = c(0L,
0L, 0L, 22L, 0L, 0L, 0L, 0L, 0L, 53L), `GTEX-1KXAM-0226-SM-EV7AP` = c(0L,
0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 75L), `GTEX-18A67-1726-SM-7KFT9` = c(0L,
0L, 2L, 1L, 0L, 0L, 0L, 0L, 0L, 73L), `GTEX-14BMU-0726-SM-73KXS` = c(0L,
0L, 0L, 40L, 0L, 0L, 0L, 0L, 0L, 74L), `GTEX-13PVR-0726-SM-5S2PX` = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 54L), `GTEX-1211K-1126-SM-5EGGB` = c(0L,
1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 25L), `GTEX-11TT1-0326-SM-5LUAY` = c(0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 54L), `GTEX-1KXAM-2426-SM-DIPFC` = c(1L,
0L, 2L, 1L, 0L, 0L, 2L, 0L, 0L, 29L), `GTEX-18A67-0326-SM-7LG5X` = c(0L,
0L, 5L, 4L, 0L, 0L, 2L, 0L, 1L, 91L), `GTEX-14BMU-2026-SM-5S2W6` = c(0L,
0L, 2L, 5L, 0L, 0L, 0L, 0L, 0L, 30L), `GTEX-13PVR-2526-SM-5RQIT` = c(0L,
0L, 2L, 1L, 0L, 0L, 0L, 0L, 0L, 14L), `GTEX-1211K-2126-SM-59HJZ` = c(1L,
0L, 2L, 0L, 0L, 0L, 1L, 0L, 0L, 51L), `GTEX-Y3I4-2326-SM-4TT81` = c(0L,
0L, 3L, 0L, 0L, 0L, 1L, 0L, 0L, 38L), `GTEX-1KXAM-0426-SM-DHXKG` = c(0L,
0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 105L), `GTEX-18A67-1126-SM-7KFSB` = c(1L,
0L, 0L, 4L, 0L, 0L, 1L, 0L, 0L, 76L), `GTEX-14BMU-0526-SM-73KW4` = c(0L,
0L, 0L, 11L, 0L, 0L, 0L, 0L, 0L, 53L), `GTEX-1211K-0826-SM-5FQUP` = c(1L,
0L, 0L, 2L, 0L, 0L, 1L, 0L, 0L, 104L), `GTEX-11TT1-1626-SM-5EQL7` = c(0L,
0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 113L), `GTEX-ZYFG-0226-SM-5GIDT` = c(1L,
0L, 2L, 2L, 0L, 0L, 2L, 0L, 0L, 54L), `GTEX-1KXAM-0826-SM-CXZK9` = c(0L,
0L, 0L, 5L, 0L, 0L, 2L, 0L, 0L, 97L), `GTEX-18A67-2426-SM-7LT95` = c(1L,
0L, 2L, 0L, 0L, 1L, 3L, 0L, 0L, 69L), `GTEX-14BMU-0926-SM-5S2QB` = c(0L,
0L, 0L, 3L, 0L, 0L, 0L, 0L, 0L, 29L), `GTEX-13PVR-1826-SM-5Q5CC` = c(1L,
0L, 0L, 3L, 0L, 1L, 2L, 0L, 0L, 32L), `GTEX-1211K-0926-SM-5FQTL` = c(0L,
0L, 0L, 3L, 0L, 0L, 1L, 0L, 0L, 99L), `GTEX-11TT1-0526-SM-5P9JO` = c(0L,
1L, 2L, 4L, 0L, 0L, 2L, 0L, 0L, 52L), `GTEX-1KXAM-0726-SM-E9U5I` = c(0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 45L), `GTEX-18A67-2526-SM-7LG5Z` = c(1L,
0L, 2L, 0L, 0L, 0L, 1L, 0L, 0L, 91L), `GTEX-14BMU-1026-SM-5RQJ5` = c(1L,
0L, 1L, 8L, 0L, 0L, 0L, 0L, 0L, 47L), `GTEX-13PVR-2026-SM-73KXT` = c(0L,
0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 27L), `GTEX-1211K-1326-SM-5FQV2` = c(0L,
0L, 3L, 0L, 0L, 0L, 1L, 1L, 0L, 57L), `GTEX-11TT1-0626-SM-5GU4X` = c(1L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 90L), `GTEX-ZYFG-1826-SM-5GZWX` = c(0L,
0L, 3L, 2L, 0L, 0L, 2L, 0L, 0L, 91L), `GTEX-1KXAM-1926-SM-D3LAG` = c(0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 103L), `GTEX-18A67-2226-SM-7LT9Z` = c(0L,
0L, 2L, 2L, 0L, 0L, 1L, 0L, 1L, 157L), `GTEX-13PVR-1726-SM-5Q5EC` = c(1L,
0L, 2L, 0L, 0L, 0L, 1L, 0L, 0L, 34L), `GTEX-1211K-1826-SM-5EGJ2` = c(0L,
0L, 1L, 3L, 0L, 0L, 0L, 0L, 0L, 49L), `GTEX-11TT1-0926-SM-5GU5M` = c(0L,
2L, 0L, 3L, 1L, 0L, 0L, 0L, 1L, 49L), `GTEX-1KXAM-1026-SM-CY8IA` = c(0L,
0L, 1L, 3L, 0L, 0L, 0L, 0L, 0L, 93L), `GTEX-14BMU-1626-SM-5TDE7` = c(0L,
1L, 3L, 13L, 0L, 0L, 1L, 0L, 0L, 84L), `GTEX-13PVR-2226-SM-7DHKP` = c(0L,
0L, 2L, 2L, 0L, 0L, 0L, 0L, 0L, 75L), `GTEX-1211K-1926-SM-5EQLB` = c(0L,
1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 114L), `GTEX-11TT1-2126-SM-5GU5Y` = c(2L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 49L), `GTEX-ZT9W-2026-SM-51MRA` = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 70L), `GTEX-1KXAM-2326-SM-CYPTD` = c(0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 20L), `GTEX-18A67-0226-SM-7LG67` = c(0L,
0L, 5L, 2L, 0L, 0L, 1L, 0L, 0L, 94L), `GTEX-14BMU-2126-SM-5S2TS` = c(0L,
0L, 0L, 3L, 0L, 0L, 0L, 0L, 0L, 50L), `GTEX-13PVR-2426-SM-5RQHN` = c(0L,
0L, 2L, 0L, 0L, 0L, 0L, 0L, 0L, 59L), `GTEX-1211K-2226-SM-5FQU6` = c(0L,
0L, 0L, 3L, 0L, 0L, 0L, 0L, 0L, 81L), `GTEX-11TT1-2426-SM-5EQMK` = c(0L,
1L, 0L, 2L, 0L, 0L, 0L, 0L, 0L, 60L)), row.names = c("ENSG00000243485",
"ENSG00000237613", "ENSG00000186092", "ENSG00000238009", "ENSG00000222623",
"ENSG00000241599", "ENSG00000236601", "ENSG00000235146", "ENSG00000223181",
"ENSG00000237491"), class = "data.frame")
I'm trying to compute the ranked abundances of a community data (site*species matrix) by using rankabundance(df) in the BiodiversityR package. But the following error keeps popping up whenever I try to run it.
Error in `[.data.frame`(pi, i) : undefined columns selected
Can someone please help with what this code means?
I've already specified the column names when sub-setting the data. And the data is also in the right format; I've tried running BCI (from vegan) for the same function and it runs perfectly fine. My data is the same format as BCI.
library(BiodiversityR)
# NOTE: alad2, as dumped below, has class c("tbl_df", "tbl", "data.frame"),
# i.e. it is a tibble. This is the call that raises the
# "undefined columns selected" error quoted above.
rankabundance(alad2, digits = 1)
This is the code that I'm running, and the data-frame is arranged in a site*species matrix, where sites are rows and species are columns.
Here is the dataframe, alad2:
structure(list(`Alysicarpous sp.1` = c(0L, 0L, 1L, 0L, 0L, 4L,
0L, 0L, 0L, 0L, 0L, 4L), `Alysicarpous sp.2` = c(0L, 0L, 1L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), `Bothriochloa pertusa` = c(0L,
0L, 4L, 0L, 12L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), `Butea monosperma ` = c(0L,
0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), `Chromolaena odorata` = c(0L,
0L, 0L, 1L, 3L, 0L, 0L, 5L, 17L, 4L, 0L, 0L), `Chrysopogon sp.*` = c(62L,
64L, 57L, 68L, 72L, 74L, 72L, 62L, 56L, 67L, 54L, 61L), `Desmodium triflorum` = c(0L,
2L, 7L, 12L, 6L, 12L, 0L, 10L, 13L, 0L, 14L, 8L), `Eragrostis tenuifolia` = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 2L, 0L, 0L, 0L, 0L), `Fimbristylis dichotoma` = c(32L,
38L, 41L, 26L, 38L, 38L, 41L, 20L, 28L, 41L, 31L, 32L), H80 = c(2L, 0L, 0L, 0L, 0L, 0L, 2L, 0L, 0L, 0L, 0L, 0L), `Hemigraphis sp.*` = c(0L,
0L, 0L, 0L, 0L, 0L, 3L, 0L, 0L, 3L, 0L, 0L), `Ischaemum sp.*` = c(18L,
0L, 18L, 18L, 0L, 18L, 33L, 26L, 12L, 16L, 24L, 23L), `Lantana camara` = c(0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L), `Leucas aspera` = c(0L,
0L, 0L, 0L, 2L, 2L, 0L, 0L, 1L, 0L, 0L, 0L), `Oldenlandia umbellata` = c(3L,
6L, 9L, 8L, 3L, 0L, 0L, 3L, 6L, 7L, 3L, 0L), `Phyllanthus virgatus` = c(0L, 2L, 9L, 13L, 6L, 7L, 9L, 0L, 0L, 6L, 11L, 8L), `Rungia pectinata` = c(0L,
0L, 0L, 2L, 3L, 3L, 0L, 0L, 0L, 0L, 0L, 0L), `Senagalia pennata` = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L), `Senna spectabilis ` = c(0L,
0L, 0L, 0L, 2L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), `Setaria flavida` = c(0L,
0L, 0L, 0L, 11L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), `Setaria pumila` = c(4L,
0L, 13L, 0L, 0L, 0L, 5L, 4L, 7L, 5L, 4L, 7L), `Themeda triandra` = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 3L, 0L, 0L, 0L)), row.names = c(NA,
-12L), class = c("tbl_df", "tbl", "data.frame"))
You do not have a data frame, but a tibble. Use alad2 <- as.data.frame(alad2) and your code will work.
I have a file with 30 columns. These include userid, itemid, moviename, rating, date and the rest are to classify genres a movie belongs to. The genre categories are column names with binary values in the rows. If a movie belongs to a genre, it has a 1 under the appropriate column and 0 otherwise. I want to calculate the average rating per genre and want to know if there is a shorter process available?
I have currently tried filtering the data by selecting each genre where the value is '1' and then calculating the average rating. But I have almost 24 genres, and doing it this way is inefficient, I think. Another way I have tried is to loop through the genre columns, again filtering each genre where the value is '1', but loops consume a lot of time, and when the data set is large (more than 100K rows), R can play up sometimes, as I have noticed.
I want to ask if there is another way which avoids a loop like melt,dcast or another method that can get the same job done?
I am providing the dput of my dataset.
dput(data)
structure(list(user_id = c(10L, 890L, 867L, 5L, 320L, 630L, 151L,
699L, 21L, 450L, 179L, 135L, 314L, 487L, 735L, 823L, 169L, 889L,
846L), item_id = c(447L, 660L, 191L, 441L, 1052L, 568L, 414L,
1061L, 872L, 33L, 302L, 581L, 568L, 280L, 181L, 503L, 498L, 207L,
497L), Movie_title = structure(c(6L, 11L, 2L, 3L, 9L, 17L, 15L,
10L, 14L, 8L, 13L, 12L, 17L, 18L, 16L, 5L, 1L, 7L, 4L), .Label = c("African Queen, The (1951)",
"Amadeus (1984)", "Amityville Horror, The (1979)", "Bringing Up Baby (1938)",
"Candidate, The (1972)", "Carrie (1976)", "Cyrano de Bergerac (1990)",
"Desperado (1995)", "Dracula: Dead and Loving It (1995)", "Evening Star, The (1996)",
"Fried Green Tomatoes (1991)", "Kalifornia (1993)", "L.A. Confidential (1997)",
"Love Jones (1997)", "My Favorite Year (1982)", "Return of the Jedi (1983)",
"Speed (1994)", "Up Close and Personal (1996)"), class = "factor"),
Rating = c(4L, 2L, 5L, 1L, 2L, 4L, 5L, 3L, 2L, 5L, 4L, 4L,
5L, 5L, 4L, 5L, 3L, 3L, 5L), Date = structure(c(7L, 15L,
12L, 4L, 1L, 2L, 9L, 8L, 19L, 14L, 18L, 10L, 6L, 16L, 5L,
11L, 17L, 13L, 3L), .Label = c("1/14/1998", "1/25/1998",
"1/5/1998", "10/1/1997", "10/13/1997", "10/26/1997", "10/27/1997",
"11/10/1997", "11/15/1997", "11/18/1997", "11/2/1997", "11/21/1997",
"11/22/1997", "12/18/1997", "12/24/1997", "12/30/1997", "3/31/1998",
"4/10/1998", "9/22/1997"), class = "factor"), unknown = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), Action = c(0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L,
1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 0L), Adventure = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
1L, 0L, 0L), Animation = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Children = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), Comedy = c(0L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L), Crime = c(0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), Documentary = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Drama = c(0L,
1L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L,
0L, 1L, 0L), Fantasy = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Film.Noir = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), Horror = c(1L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Musical = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), Mystery = c(0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Romance = c(0L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 0L,
1L, 1L, 0L), Sci.Fi = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L), Thriller = c(0L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L,
0L, 0L, 0L), War = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L), Western = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), Short = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), History = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), Biography = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), Sport = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), Family = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), .Names = c("user_id",
"item_id", "Movie_title", "Rating", "Date", "unknown", "Action",
"Adventure", "Animation", "Children", "Comedy", "Crime", "Documentary",
"Drama", "Fantasy", "Film.Noir", "Horror", "Musical", "Mystery",
"Romance", "Sci.Fi", "Thriller", "War", "Western", "Short", "History",
"Biography", "Sport", "Family"), class = "data.frame", row.names = c(NA,
-19L))
This is a good use case for dplyr and tidyr:
library(dplyr)
library(tidyr)
# Average rating per genre.
# 1. Reshape the genre indicator columns (unknown:Family) to long format,
#    giving one row per rating/genre pair.
# 2. Keep only the pairs where the movie belongs to the genre (value == 1).
# 3. Average the ratings within each genre.
# pivot_longer() is used because tidyr has superseded gather().
dat %>%
  pivot_longer(unknown:Family, names_to = "genre", values_to = "value") %>%
  filter(value == 1) %>%
  group_by(genre) %>%
  summarize(average = mean(Rating))
This code:
gathers each of the movie/genre pairs into a separate row (there will be multiple rows for each movie)
filters for only the cases when a movie belongs to a genre
groups by genre, and summarizes within each to find the average rating (you could perform other operations like the median or standard deviation as well)
The old-fashioned way also works:
# Base-R alternative: mean rating for each genre of interest.
# `mydata` is the ratings data frame; the code reads its `Rating` column and
# one indicator column per genre name listed in `genres`.
genres <- c("Action", "Adventure", "Animation")

# Preallocate a named result vector so each genre's mean is stored by name.
means <- numeric(length(genres))
names(means) <- genres

for (g in genres) {
  # Average the ratings of the rows whose indicator for genre `g` equals 1.
  # A genre with no matching rows yields NaN (mean of an empty vector).
  means[g] <- mean(mydata$Rating[mydata[[g]] == 1])
}

means
I am working on an assignment where I must reproduce these results:
From this paper. I'm meant to do it in Stata, but in an effort to save a few hundred bucks and use open source software and so on, I'm using R.
I've tried using the aer package and from this I am able to get the coefficients and the standard error, but I'm not sure how to get the Hausman test or Sargan test values, as well as the F stat for the second stage of the regression. Here's what I have
> library("AER")
> stage1 <- lm(ln_export_area ~ atlantic_distance_minimum +
+ indian_distance_minimum + saharan_distance_minimum + red_sea_distance_minimum,
+ data=nunn)
> reg1 <- ivreg(ln_maddison_pcgdp2000 ~ ln_export_area | atlantic_distance_minimum +
+ indian_distance_minimum + saharan_distance_minimum +
+ red_sea_distance_minimum, data=nunn)
> summary(stage1)
Call:
lm(formula = ln_export_area ~ atlantic_distance_minimum + indian_distance_minimum +
saharan_distance_minimum + red_sea_distance_minimum, data = nunn)
Residuals:
Min 1Q Median 3Q Max
-6.3574 -2.4772 0.2513 2.8323 5.9544
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 29.10969 6.95941 4.183 0.000125 ***
atlantic_distance_minimum -1.31399 0.35678 -3.683 0.000594 ***
indian_distance_minimum -1.09543 0.37978 -2.884 0.005901 **
saharan_distance_minimum -2.43487 0.82305 -2.958 0.004830 **
red_sea_distance_minimum -0.00186 0.71041 -0.003 0.997922
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3.445 on 47 degrees of freedom
Multiple R-squared: 0.2789, Adjusted R-squared: 0.2176
F-statistic: 4.545 on 4 and 47 DF, p-value: 0.003472
> summary(reg1)
Call:
ivreg(formula = ln_maddison_pcgdp2000 ~ ln_export_area | atlantic_distance_minimum +
indian_distance_minimum + saharan_distance_minimum + red_sea_distance_minimum,
data = nunn)
Residuals:
Min 1Q Median 3Q Max
-1.9254 -0.4602 0.1429 0.4917 1.4163
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 7.81135 0.20375 38.337 < 2e-16 ***
ln_export_area -0.20794 0.05301 -3.923 0.000267 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.7787 on 50 degrees of freedom
Multiple R-Squared: 0.1273, Adjusted R-squared: 0.1098
Wald test: 15.39 on 1 and 50 DF, p-value: 0.0002674
These should match with column 1. Does anyone know how
I can get the confidence intervals from the second stage regression?
I can get the F stat from the second stage regression?
I can conduct the Hausman test and the Sargan test on the first stage regression?
Here's the data in case you want to play around with it
> dput(nunn)
structure(list(isocode = structure(1:52, .Label = c("AGO", "BDI",
"BEN", "BFA", "BWA", "CAF", "CIV", "CMR", "COG", "COM", "CPV",
"DJI", "DZA", "EGY", "ETH", "GAB", "GHA", "GIN", "GMB", "GNB",
"GNQ", "KEN", "LBR", "LBY", "LSO", "MAR", "MDG", "MLI", "MOZ",
"MRT", "MUS", "MWI", "NAM", "NER", "NGA", "RWA", "SDN", "SEN",
"SLE", "SOM", "STP", "SWZ", "SYC", "TCD", "TGO", "TUN", "TZA",
"UGA", "ZAF", "ZAR", "ZMB", "ZWE"), class = "factor"), country = structure(c(2L,
6L, 3L, 5L, 4L, 9L, 23L, 7L, 12L, 11L, 8L, 14L, 1L, 15L, 17L,
18L, 20L, 21L, 19L, 22L, 16L, 24L, 26L, 27L, 25L, 33L, 28L, 30L,
34L, 31L, 32L, 29L, 35L, 36L, 37L, 38L, 45L, 40L, 42L, 43L, 39L,
46L, 41L, 10L, 48L, 49L, 47L, 50L, 44L, 13L, 51L, 52L), .Label = c("Algeria",
"Angola", "Benin", "Botswana", "Burkina Faso", "Burundi", "Cameroon",
"Cape Verde Islands", "Central African Republic", "Chad", "Comoros",
"Congo", "Democratic Republic of Congo", "Djibouti", "Egypt",
"Equatorial Guinea", "Ethiopia", "Gabon", "Gambia", "Ghana",
"Guinea", "Guinea-Bissau", "Ivory Coast", "Kenya", "Lesotho",
"Liberia", "Libya", "Madagascar", "Malawi", "Mali", "Mauritania",
"Mauritius", "Morocco", "Mozambique", "Namibia", "Niger", "Nigeria",
"Rwanda", "Sao Tome & Principe", "Senegal", "Seychelles", "Sierra Leone",
"Somalia", "South Africa", "Sudan", "Swaziland", "Tanzania",
"Togo", "Tunisia", "Uganda", "Zambia", "Zimbabwe"), class = "factor"),
ln_maddison_pcgdp2000 = c(6.670766, 6.35437, 7.187657, 6.74876,
8.377471, 6.472346, 7.189922, 7.01661, 7.702556, 6.364751,
7.482682, 7.005789, 7.934514, 7.979339, 6.436151, 8.265393,
7.154615, 6.349139, 6.796824, 6.523562, 8.981682, 6.927558,
6.741701, 7.750184, 7.405496, 7.885329, 6.559615, 6.73578,
7.266828, 6.924613, 9.273503, 6.520621, 8.24144, 6.22059,
7.052721, 6.721426, 6.898715, 7.267525, 5.937536, 6.760415,
7.111512, 7.865572, 8.75684, 6.049734, 6.35437, 8.420241,
6.261492, 6.669498, 8.32821, 5.384495, 6.50129, 7.154615),
ln_export_area = c(7.967494, 1.140843, 8.304137, 6.413822,
-2.302585, 1.171314, 5.096793, 4.944928, 5.623267, -2.302585,
-2.302585, -1.661718, 3.257355, 0.3999169, 7.078711, 4.62739,
8.818254, 7.26078, 7.561687, 8.518584, -0.9844123, 4.99911,
4.113622, 1.61487, -2.302585, -2.302585, 5.363239, 6.520308,
6.659775, 5.072949, -2.302585, 6.968824, -1.465302, 2.752292,
7.690816, -2.302585, 5.841245, 7.561687, 6.878126, 3.923764,
-2.302585, -2.302585, -2.302585, 6.023867, 8.536835, -2.302585,
6.338511, 2.959842, 0.5095113, 5.787438, 3.614361, 1.023552
), ln_export_pop = c(14.39925, 4.451658, 13.30897, 11.72429,
3.912023, 8.052058, 10.8437, 10.33106, 12.39107, 3.912023,
3.912023, 4.703024, 9.961392, 5.477251, 12.99278, 11.69496,
13.69867, 12.8232, 12.20487, 13.78166, 4.560218, 10.44663,
10.01004, 9.503454, 3.912023, 3.912023, 11.35552, 13.05647,
12.70125, 12.87464, 3.912023, 12.33135, 7.080479, 9.695232,
12.08837, 3.912023, 11.95632, 12.91682, 11.47905, 10.37327,
3.912023, 3.912023, 3.912023, 12.87262, 13.28513, 3.912023,
12.10194, 7.543646, 7.011269, 11.76894, 10.893, 7.925018),
colony0 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), colony1 = c(0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L,
1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L,
1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L,
1L, 1L, 1L, 0L, 1L, 1L), colony2 = c(0L, 0L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L,
0L, 0L), colony3 = c(1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), colony4 = c(0L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 0L), colony5 = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), colony6 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), colony7 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L), abs_latitude = c(8, 16, 6, 12, 24,
4, 6, 4, 4, 11, 15, 11, 36, 28, 12, 0.2, 7, 10, 13, 12, 3,
2, 6, 32, 29, 33, 19, 14, 23, 18, 20, 15, 24, 13, 9, 2, 12,
14, 8, 5, 0.2, 26, 4, 12, 6, 36, 4, 0.2, 30, 7, 20, 19),
longitude = c(17.54142, 29.88722, 2.34264, -1.74292, 23.82042,
20.48058, -5.55555, 12.74132, 15.2263, 43.49777, -24.04431,
42.57752, 2.63691, 29.87953, 39.61983, 11.79747, -1.20736,
-10.93922, -15.38402, -14.96533, 10.34204, 37.85755, -9.3071,
18.04934, 28.2439, -6.35425, 46.72618, -3.54519, 35.58901,
-10.34285, 57.79387, 34.30362, 17.21072, 9.3731, 8.10141,
29.91774, 30.04159, -14.46586, -11.79189, 45.93132, 6.68778,
31.49749, 55.37541, 18.66029, 0.97634, 9.56295, 34.81658,
32.38633, 25.14831, 23.65134, 27.81663, 29.87154), rain_min = c(0L,
5L, 13L, 0L, 0L, 5L, 41L, 23L, 0L, 69L, 0L, 0L, 0L, 0L, 5L,
3L, 15L, 3L, 0L, 0L, 5L, 15L, 31L, 0L, 8L, 0L, 8L, 0L, 13L,
0L, 0L, 0L, 0L, 0L, 25L, 7L, 0L, 0L, 3L, 0L, 0L, 20L, 69L,
0L, 15L, 3L, 0L, 46L, 8L, 3L, 0L, 0L), humid_max = c(78L,
82L, 78L, 67L, 74L, 72L, 82L, 75L, 71L, 78L, 73L, 74L, 66L,
41L, 73L, 79L, 77L, 87L, 78L, 74L, 95L, 62L, 95L, 72L, 42L,
72L, 71L, 73L, 67L, 69L, 74L, 66L, 35L, 68L, 80L, 83L, 41L,
74L, 82L, 80L, 79L, 81L, 78L, 72L, 77L, 64L, 56L, 72L, 67L,
73L, 71L, 57L), low_temp = c(14L, 17L, 18L, 9L, -4L, 14L,
15L, 14L, 12L, 19L, 13L, 17L, 0L, 1L, 0L, 17L, 15L, 17L,
7L, 13L, 17L, 5L, 13L, 1L, -9L, 0L, 1L, 8L, 7L, 7L, 10L,
-1L, -4L, 8L, 16L, 12L, 5L, 12L, 19L, 15L, 13L, -5L, 19L,
8L, 15L, -1L, 8L, 12L, -2L, 14L, 4L, 0L), ln_coastline_area = c(0.2468601,
-4.60517, 0.0684028, -4.60517, -4.60517, -4.60517, 0.4696153,
-0.1668627, -0.7049121, 5.054218, 5.478362, 2.65835, -0.7015861,
0.8960881, 4.529333, 1.194601, 0.813252, 0.3411072, 1.957224,
2.026835, 2.35459, -0.084053, 1.651772, 0.0056658, -4.60517,
1.414962, 2.107577, -4.60517, 1.124865, -0.3119217, 4.555573,
-4.60517, 0.5923609, -4.60517, -0.0799525, -4.60517, -1.079278,
0.9966474, 1.723961, 1.55798, 5.383156, -4.60517, 6.983902,
-4.60517, -0.0141846, 1.947651, 0.4072272, -4.60517, 0.8589395,
-4.151253, -4.60517, -4.60517), island_dum = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), islam = c(0, 1, 13, 42, 0, 8, 20, 22, 1,
99, 0, 94, 99, 88, 32, 0.8, 16, 80, 94, 34, 0.5, 6, 14, 97,
0, 99, 3, 89, 13, 99, 13, 16, 0, 90, 45, 9, 73, 91, 39, 100,
0, 0, 0, 43, 12, 99, 33, 7, 1, 1, 0.3, 0), legor_fr = c(1L,
1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L,
0L, 0L, 0L, 1L, 0L, 0L), legor_uk = c(0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L,
0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 1L,
0L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L,
1L, 1L), region_n = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), region_s = c(0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L,
1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L,
0L, 0L, 1L, 0L, 1L, 1L), region_w = c(0L, 0L, 1L, 1L, 0L,
0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 1L,
0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L), region_e = c(0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L,
0L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L,
1L, 0L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L), region_c = c(1L,
0L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 0L), ln_avg_gold_pop = c(-6.614335, -2.633923,
-13.81551, -2.164328, -3.70509, -3.316046, -2.972711, -4.296489,
-3.967561, -13.81551, -13.81551, -13.81551, -13.81551, -13.81551,
-3.805913, -1.654193, 0.6532509, -0.8973768, -13.81551, -13.81551,
-13.81551, -5.279897, -1.717547, -13.81551, -13.81551, -5.472521,
-7.501954, -0.8741745, -5.648631, -1.444776, -13.81551, -13.81551,
-0.6370343, -5.203685, -11.85755, -3.503525, -3.181087, -13.81551,
-3.553395, -13.81551, -13.81551, -13.81551, -13.81551, -13.81551,
-13.81551, -13.81551, -3.161444, -2.876623, 3.084304, -2.295242,
-3.235008, 0.6928481), ln_avg_oil_pop = c(0.643126, -9.21034,
-3.531555, -9.21034, -9.21034, -9.21034, -3.270892, -0.8711616,
1.000878, -9.21034, -9.21034, -9.21034, 0.9135318, -0.3610424,
-9.21034, 2.650107, -5.899707, -9.21034, -9.21034, -9.21034,
0.3627662, -9.21034, -9.21034, 3.235896, -9.21034, -7.77914,
-9.21034, -9.21034, -9.21034, -9.21034, -9.21034, -9.21034,
-9.21034, -9.21034, 0.1340378, -9.21034, -4.488462, -9.21034,
-9.21034, -9.21034, -9.21034, -9.21034, -9.21034, -9.21034,
-9.21034, -0.3781253, -9.21034, -9.21034, -5.725029, -3.441503,
-9.21034, -9.21034), ln_avg_all_diamonds_pop = c(-1.701396,
-6.907755, -6.907755, -6.907755, 2.186849, -1.849576, -4.228216,
-6.907755, -6.907755, -6.907755, -6.907755, -6.907755, -6.907755,
-6.907755, -6.907755, -2.165953, -2.239469, -3.673854, -6.907755,
-6.907755, -6.907755, -6.907755, -2.123542, -6.907755, -3.637529,
-6.907755, -6.907755, -6.907755, -6.907755, -6.907755, -6.907755,
-6.907755, 0.2363898, -6.907755, -6.907755, -6.907755, -6.907755,
-6.907755, -1.536141, -6.907755, -6.907755, -4.457984, -6.907755,
-6.907755, -6.907755, -6.907755, -4.186928, -6.907755, -1.201608,
-0.68398, -6.907755, -5.543311), ln_pop_dens_1400 = c(-0.024917,
3.036856, 1.214196, 0.9085654, -2.075029, -0.4739045, 0.4721229,
1.020704, -0.3609614, -2.302585, -2.302585, -0.1698741, -0.404099,
1.430923, 0.3556669, -0.6607257, 1.338615, 0.6566054, 1.575844,
0.9559578, 0.8622092, 0.8193114, 0.3226074, -1.258461, -0.1520654,
1.268198, -0.074497, -0.1801779, -0.020148, -1.445708, -2.302585,
0.9706659, -2.121291, -0.586962, 1.821479, 2.945036, 0.4084752,
0.8638997, 1.618101, 0.0606283, -2.302585, -0.6176535, -2.302585,
-0.4927752, 1.470734, 1.629374, 0.4596342, 1.723667, -0.9186776,
0.4253423, -1.048586, -0.2810889), atlantic_distance_minimum = c(5.66876,
10.62621, 5.120652, 4.774938, 5.686335, 5.642056, 4.185696,
5.642056, 5.527229, 10.13065, 3.646842, 14.40755, 6.559232,
16.39266, 12.58899, 5.531399, 4.772588, 3.719985, 3.888797,
3.795674, 5.577306, 11.08334, 3.776146, 8.422357, 7.202152,
5.793966, 9.686486, 3.897489, 9.264256, 4.42371, 10.3101,
9.266991, 5.682842, 5.158515, 5.224331, 10.7538, 15.25287,
3.897721, 3.705474, 12.05779, 5.196697, 8.290959, 11.45741,
5.581032, 4.92623, 7.479859, 10.59497, 10.99569, 6.765942,
5.712497, 9.027167, 9.027167), indian_distance_minimum = c(6.980571,
2.570375, 9.233961, 9.299419, 5.764575, 8.772295, 9.457085,
8.772295, 7.923528, 1.754229, 11.59978, 2.682206, 14.91231,
4.667312, 2.705884, 8.366795, 9.299526, 10.26924, 10.79257,
10.63111, 8.556146, 2.704583, 9.777017, 16.77543, 3.035,
13.67561, 0.9039161, 10.79005, 2.185373, 11.9143, 0.0319096,
2.183153, 5.792154, 9.223114, 9.150605, 2.622741, 3.527528,
10.79068, 10.18761, 2.358296, 8.474005, 2.622083, 1.742192,
8.875547, 9.258235, 15.83294, 2.558215, 2.699154, 3.457205,
7.643048, 2.388914, 2.388914), saharan_distance_minimum = c(4.925892,
3.718742, 2.834785, 2.763519, 5.856533, 2.840084, 3.353074,
3.002548, 3.697363, 4.845693, 3.481602, 2.350743, 0.9850905,
0.4303847, 2.543248, 3.70284, 3.174178, 3.245414, 3.171976,
3.284617, 3.462215, 3.358859, 3.594752, 0.6098508, 6.637325,
1.022596, 5.731615, 2.262917, 5.267768, 2.255257, 6.273852,
4.820801, 5.980785, 1.768215, 2.641684, 3.567813, 1.827123,
3.034838, 3.473508, 3.090304, 3.6702, 6.294675, 4.635344,
1.879364, 3.009106, 0.3097339, 4.05628, 3.203552, 6.583775,
3.747742, 4.848526, 5.453967), red_sea_distance_minimum = c(3.872354,
2.215324, 3.901736, 4.239375, 4.2996, 2.293167, 4.793966,
3.051031, 3.227007, 2.609506, 6.465437, 0.0643895, 3.654165,
1.112658, 0.5100758, 3.528861, 4.332308, 5.258811, 5.637868,
5.633392, 3.515037, 1.36133, 5.2275, 2.151154, 4.845831,
4.570611, 3.453547, 4.310751, 3.298301, 4.973302, 3.883714,
2.922141, 4.685066, 2.953876, 3.314152, 2.101732, 0.983083,
5.518319, 5.409636, 0.6954757, 3.932184, 4.422592, 2.252856,
2.026491, 4.084906, 3.20461, 2.18672, 1.649949, 4.89507,
2.686999, 3.253377, 3.695537), ethnic_fractionalization = c(0.7867,
0.2951, 0.7872, 0.7377, 0.4102, 0.8295, 0.8204, 0.8635, 0.8747,
0, 0.4174, 0.7962, 0.3394, 0.1836, 0.7235, 0.769, 0.6733,
0.7389, 0.7864, 0.8082, 0.3467, 0.8588, 0.9084, 0.792, 0.255,
0.4841, 0.8791, 0.6906, 0.6932, 0.615, 0.4634, 0.6744, 0.6329,
0.6518, 0.8505, 0.3238, 0.7147, 0.6939, 0.8191, 0.8117, NA,
0.0582, 0.2025, 0.862, 0.7099, 0.0394, 0.7353, 0.9302, 0.7517,
0.8747, 0.7808, 0.3874), state_dev = c(0.635, 0.995, 0.695,
0.338, 0.893, 0.144, 0.082, 0.316, 0.536, 1, NA, 0.133, 0.99,
0.99, 0.843, 0.011, 0.651, 0.406, 0.426, 0.214, 0.211, 0.172,
0, 0.94, 1, 0.81, 0.505, 0.115, 0.844, 0.858, NA, 0.861,
0.664, 0.582, 0.478, 0.982, 0.576, 0.694, 0.008, 0.034, NA,
1, NA, 0.384, 0.622, 0.98, 0.669, 0.634, NA, 0.649, 0.743,
0.965), land_area = c(1.25, 0.0278, 0.113, 0.274, 0.6, 0.623,
0.322, 0.475, 0.342, 0.00217, 0.00403, 0.022, 2.38, 1, 1.22,
0.268, 0.239, 0.246, 0.0113, 0.0361, 0.0281, 0.583, 0.111,
1.76, 0.0304, 0.447, 0.587, 1.24, 0.802, 1.03, 0.00186, 0.118,
0.824, 1.27, 0.924, 0.0263, 2.51, 0.196, 0.0717, 0.638, 0.00096,
0.0174, 0.000455, 1.28, 0.0568, 0.164, 0.945, 0.236, 1.22,
2.35, 0.753, 0.391), stage1 = c(2.01310178962912, 3.27246381305353,
5.35639168025068, 5.91184836065855, 1.05526739235966, 5.16708434318269,
5.07687379044942, 4.77009597973404, 4.15863470510818, 2.07294773749099,
3.12168352262372, 1.51620481461539, 1.7500720543329, 1.40710379910631,
3.41028983224398, 3.65368793863828, 4.91474434088887, 5.06042577393594,
4.44344092292367, 4.46841489797154, 3.97186132841912, 3.40263869963118,
4.67529406834981, -1.82253414812983, 0.151421774947407, 4.01732607566164,
1.4293632764156, 6.65069002995716, 1.71011443496853, 4.74511489084807,
0.244087584882236, 2.79795925529004, 0.726391664388915, 7.91724568619572,
5.78273806073103, 3.41515174873482, 0.752703025744719, 4.76792093974516,
4.61325345534811, 3.15668762557839, 4.05476681343383, 0.00815576315910405,
0.855615157261806, 7.47388924450179, 5.16048051732826, 1.17712597142371,
2.5050250342168, 3.90137319769041, 0.39239658701787, 4.10077923691789,
2.81956584642736, 1.34457256371601)), .Names = c("isocode",
"country", "ln_maddison_pcgdp2000", "ln_export_area", "ln_export_pop",
"colony0", "colony1", "colony2", "colony3", "colony4", "colony5",
"colony6", "colony7", "abs_latitude", "longitude", "rain_min",
"humid_max", "low_temp", "ln_coastline_area", "island_dum", "islam",
"legor_fr", "legor_uk", "region_n", "region_s", "region_w", "region_e",
"region_c", "ln_avg_gold_pop", "ln_avg_oil_pop", "ln_avg_all_diamonds_pop",
"ln_pop_dens_1400", "atlantic_distance_minimum", "indian_distance_minimum",
"saharan_distance_minimum", "red_sea_distance_minimum", "ethnic_fractionalization",
"state_dev", "land_area", "stage1"), row.names = c(NA, -52L), class = "data.frame")