Error with loops in R: Error "object not found" - r

I have panel data with years from 2005 to 2015 and sectors from 1 to 33 (excluding 2, 4 and 31). I would like to run some loops and save the output for each year-sector combination separately. This is my code:
for (i in 2005:2015){
ntm_data <-subset(ntm_data_wip, StartDate <=i & EndDate >i)
for(s in c(1, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 32, 33)){
ntm_data <-subset(ntm_data, ISIC4==s)
# Once the data is loaded, I exclude NTM codes which are missing.
# I only need the reporter, NTM code and product codes (HS 6-digit codes).
ntm_data <- ntm_data[!is.na(ntm_data$ntmcode)&ntm_data$ntmcode!="",]
ntm_data <- ntm_data[,c("reporter", "ntmcode", "hs6")]
# I group the data by reporter, NTM and product code (hs6) and count the number of combinations in a new variable called count.
ntm_data <- ntm_data %>% group_by(reporter, ntmcode, hs6) %>%
summarise(count = n())
head(ntm_data)
# I prepare the regulatory matrix by creating a list of countries for which I want the regulatory distance. The
# regulatory matrix shows the distance between two countries and has as column and row names the ISO3 codes of the countries.
# As specified above, I am interested in having the analysis for all available countries.
avail_iso3s <- unique(ntm_data$reporter)
# I create an empty regulatory distance matrix. For column size I use the length of avail_iso3s and add 1 for the reporter column.
# I populate the column names with reporter and the ISO3 codes with the option dimnames.
regulatory_distance_matrix <- data.frame(matrix(vector(),0,length(avail_iso3s)+1,
dimnames = list(c(), c("reporter", avail_iso3s )
)),
stringsAsFactors=F)
#' Now I can move on to calculating the regulatory distance formula in page 3 of "DEEP REGIONAL INTEGRATION AND NON-TARIFF MEASURES:A METHODOLOGY FOR DATA ANALYSIS (2015)" .
#' As N is a constant, I start with calculating it outside of the loop
N <- ntm_data %>% group_by(ntmcode, hs6) %>% count()
N <- nrow(N)
# I now fill in the regulatory distance matrix with values
for (g in 1:length(avail_iso3s)){
country_a <- ntm_data[ntm_data$reporter==avail_iso3s[g],c("ntmcode", "hs6")]
country_a$country_a <- 1
regulatory_distance_matrix[g,"reporter"] <- avail_iso3s[g]
for (k in 1:length(avail_iso3s)){
if (!is.na(regulatory_distance_matrix[k,avail_iso3s[g]])){next }
country_b <- ntm_data[ntm_data$reporter==avail_iso3s[k],c("ntmcode", "hs6")]
country_b$country_b <- 1
merged <- merge(country_a, country_b, by=c("ntmcode", "hs6"), all = TRUE)
merged[is.na(merged)] <- 0
merged$abs_diff <- abs(merged$country_a-merged$country_b)
rd <- sum(merged$abs_diff)/N
regulatory_distance_matrix[g,avail_iso3s[k]] <- rd
}
}
# Now I fill in the missing values and create a Stata dta.file.
for (g in 1:length(avail_iso3s)){
for (k in 1:length(avail_iso3s)){
if (is.na(regulatory_distance_matrix[k,avail_iso3s[g]])){
regulatory_distance_matrix[k,avail_iso3s[g]] <- regulatory_distance_matrix[g,avail_iso3s[k]]
}
}
}
regulatory_distance_matrix$year <-i
regulatory_distance_matrix$ISIC4 <-s
write.dta(regulatory_distance_matrix, paste0("C:/Users/Utente/Desktop/Master's thesis/Thesis analysis/- RD construction/Binary sectoral RD/regulatory_distance_matrix_",i,"_",s,".dta"))
}
}
However, after the first file (regulatory_distance_matrix_",i,"_",s,".dta") is correctly created, I get the following error during the creation of the second file:
Error in eval(e, x, parent.frame()) : oggetto "ISIC4" non trovato
Does someone how to fix this issue?
Thanks in advance!
EDIT:
> dput(head(ntm_data_wip))
structure(list(reporter = c("TUR", "ARG", "BRA", "CHN", "USA",
"EUN"), Reporter_ISO_N = c("792", "032", "076", "156", "842",
"918"), hs6 = c("910610", "851679", "040221", "620449", "021012",
"284990"), ntmcode = c("B31", "A11", "B33", "B83", "A83", "B33"
), partner = c("TON", "WLD", "WLD", "IRN", "VAT", "WLD"), Partner_ISO_N = c("776",
"000", "000", "364", "336", "000"), nbr = c(1L, 1L, 1L, 1L, 2L,
1L), Year = c(2016L, 2014L, 2013L, 2016L, 2017L, 2011L), NTMNomenclature = c("M4",
"M4", "M4", "M4", "M4", "M4"), NomenCode = c("H4", "H4", "H4",
"H4", "H4", "H3"), Dataset_id = c(161L, 174L, 174L, 131L, 179L,
111L), ntm_1_digit = c("B", "A", "B", "B", "A", "B"), StartDate = c(2015L,
2006L, 2008L, 2011L, 1992L, 2009L), EndDate = c(9999L, 9999L,
9999L, 9999L, 9999L, 2011L), new_ISIC4 = c("32", "28", "10",
"13", "10", "19"), ISIC4 = c(32L, 28L, 10L, 13L, 10L, 19L)), datalabel = "", time.stamp = "31 Jul 2021 11:34", formats = c("%9s",
"%9s", "%9s", "%9s", "%9s", "%9s", "%9.0g", "%12.0g", "%9s",
"%9s", "%12.0g", "%9s", "%10.0g", "%10.0g", "%9s", "%10.0g"), types = c(3L,
3L, 6L, 4L, 3L, 3L, 65530L, 65529L, 2L, 3L, 65529L, 1L, 65529L,
65529L, 2L, 65530L), val.labels = structure(c("", "", "", "",
"", "", "", "", "", "", "", "", "", "", "", ""), .Names = c("",
"", "", "", "", "", "", "", "", "", "", "", "", "", "", "")), var.labels = c("",
"", "", "", "", "", "Number of NTM, distinct codes", "", "",
"", "", "", "(min) StartDate", "(max) EndDate", "", ""), version = 118L, label.table = list(), expansion.fields = list(
c("ISIC4", "destring", "Characters removed were:"), c("ISIC4",
"destring_cmd", "destring new_ISIC4, gen(ISIC4)")), byteorder = "LSF", orig.dim = c(6953474L,
16L), row.names = c(NA, 6L), class = "data.frame")

This is too long for a comment.
The issue is that the inner loop of for (s in c(...)) {...} uses ntm_data which overwrites itself within the inner loop. So in the first pass, ntm_data$ICIS4 is a column. Plus, since we are subsetting, we don’t want subsequent iterations having been filtered based on the previous s.
Here's a snippet of the top of the beginning code with a new object at the start of the outer loop.:
library(dplyr)
for (i in 2005:2015){
## CHANGED - make a different object for the inner loop to subset from
ntm_data_years <-subset(ntm_data_wip, StartDate <=i & EndDate >i)
for(s in c(1, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 32, 33)){
ntm_data <-subset(ntm_data_years, ISIC4==s) ## CHANGED to subset(ntm_data_years, ...)
ntm_data <- ntm_data[!is.na(ntm_data$ntmcode)&ntm_data$ntmcode!="",]
ntm_data <- ntm_data[,c("reporter", "ntmcode", "hs6")] ## This is the line that removes ISIC4 from the data frame
...
}

Related

Using Map on Function if name of df lists match

I have given a
a namend vector col:
col <- c(id = "CLUSTER", x = "LONGNUM", y = "LATNUM", n = "Severely.stunted.child.under.5.years..Total", pos = "Severely.stunted.child.under.5.years.Yes")
#a List of Dataframes with the the Elements of col as Columns and namend after a specific study area. (see deput below) The List Results contains 19 different files (different years)
a list of shapefiles of with 6 Elements (corresponding countries):
study_area <- c("Ethiopia", "Liberia", "Malawi", "Rwanda", "Uganda", "Zimbabwe")
Countries <- lapply(study_area, function(x){gisco_get_countries(country= x, resolution = 60 )})
Countries <- lapply(Countries, function(x) {as_Spatial(x, cast = TRUE, IDs = c("CNTR_NAME", "ISO§_CODE", "CNRT_ID", "NAME_ENGL", "FID"))})
names(Countries) <- study_area
I would like to preform the function from the prevR Library :
s.prevR(Results[[1]], col, Countries[[1]])
But actually for every element in the lists where the names fit:
I tried something like: Map(function(x, y) { as.prevR(x, col, y)}, Results, Countries)
But there it does (obviously) not match by names of x and y
dput( dput(Results[[1]][1:5,1:24])
structure(list(CLUSTER = c("", "1", "10", "100", "101"), Severely.stunted.child.under.5.years.No = c(3438,
8, 7, 9, 6), Severely.stunted.child.under.5.years.Yes = c(1047,
4, NA, 7, 1), Severely.stunted.child.under.5.years..Total = c(4485,
12, 7, 16, 7), Stunted.child.under.5.years.No = c(2531, 2, 7,
7, 5), Stunted.child.under.5.years.Yes = c(1954, 10, NA, 9, 2
), Stunted.child.under.5.years..Total = c(4485, 12, 7, 16, 7),
Severely.wasted.child.under.5.years.No = c(4295, 11, 7, 16,
7), Severely.wasted.child.under.5.years.Yes = c(190, 1, NA,
NA, NA), Severely.wasted.child.under.5.years..Total = c(4485,
12, 7, 16, 7), Wasted.child.under.5.years.No = c(3957, 10,
7, 16, 6), Wasted.child.under.5.years.Yes = c(528, 2, NA,
NA, 1), Wasted.child.under.5.years..Total = c(4485, 12, 7,
16, 7), Severely.underweight.child.under.5.years.No = c(4028,
10, 7, 12, 7), Severely.underweight.child.under.5.years.Yes = c(457,
2, NA, 4, NA), Severely.underweight.child.under.5.years..Total = c(4485,
12, 7, 16, 7), Underweight.child.under.5.years.No = c(3185,
7, 7, 12, 5), Underweight.child.under.5.years.Yes = c(1300,
5, NA, 4, 2), Underweight.child.under.5.years..Total = c(4485,
12, 7, 16, 7), LATNUM = c(NA, 10.889096, 5.323272, 8.830199,
10.806748), LONGNUM = c(NA, 37.269565, 39.556812, 40.72964,
39.7703), SurveyId = c("ET2005DHS", "ET2005DHS", "ET2005DHS",
"ET2005DHS", "ET2005DHS"), DHSC = c("ET", "ET", "ET", "ET",
"ET"), Country = c("Ethiopia", "Ethiopia", "Ethiopia", "Ethiopia",
"Ethiopia")), row.names = c(NA, 5L), class = "data.frame")
and Countries
dput(Countries[[1]])
new("SpatialPolygonsDataFrame", data = structure(list(CNTR_NAME = "Federal Democratic Republic of Ethiopia",
ISO3_CODE = "ETH", CNTR_ID = "ET", NAME_ENGL = "Ethiopia",
FID = "ET"), class = "data.frame", row.names = 1L), polygons = list(
new("Polygons", Polygons = list(new("Polygon", labpt = c(39.6420582930584,
8.63562315843106), area = 93.13026982, hole = FALSE, ringDir = 1L,
coords = structure(c(41.6307, 42.4043, 41.816, 41.8348,
42.9681, 42.7628, 42.9804, 43.9589, 45.6126, 46.9411,
47.8524, 45.6126, 45.4747, 45.2923, 44.9162, 43.4741,
42.8138, 41.9101, 41.2328, 40.708, 39.9305, 39.5667,
38.9731, 38.1026, 36.9621, 35.9477, 35.8294, 35.3235,
35.0325, 34.9588, 34.5428, 33.7557, 33.0448, 33.2485,
33.8204, 34.0937, 34.1132, 34.4181, 34.8021, 35.2153,
35.6227, 36.1342, 36.5603, 37.2972, 37.5268, 37.9201,
38.5391, 39.0217, 40.0851, 40.8941, 41.6307, 13.3913,
12.4686, 11.6292, 11.0448, 10.9974, 10.7159, 10.0644,
9.0545, 8.4674, 8.0224, 7.9151, 5.5657, 5.4241, 5.2367,
4.9368, 4.7993, 4.301, 3.9823, 3.9616, 4.2326, 3.8858,
3.5224, 3.5158, 3.6459, 4.3833, 4.62, 5.2367, 5.413,
5.8494, 6.4537, 6.7418, 7.6074, 7.899, 8.381, 8.4168,
8.6026, 9.4986, 10.6735, 10.8052, 11.9187, 12.5064, 12.8315,
14.2577, 14.3876, 14.2588, 14.8128, 14.4413, 14.5899,
14.5456, 14.0891, 13.3913), dim = c(51L, 2L)))), plotOrder = 1L,
labpt = c(39.6420582930584, 8.63562315843106), ID = "1",
area = 93.13026982)), plotOrder = 1L, bbox = structure(c(33.0448,
3.5158, 47.8524, 14.8128), dim = c(2L, 2L), dimnames = list(c("x",
"y"), c("min", "max"))), proj4string = new("CRS", projargs = "+proj=longlat +datum=WGS84 +no_defs"))
If the Countries names are all in the Results names and if 'Results' have duplicates for names, then we can make the Countries to have the same length by replicating based on the names of the 'Results'
Map(function(x, y) { as.prevR(x, col, y)}, Results, Countries[names(Results)])

R: new column based whether categorical levels of another column are the same or different from each other

I am having a problem creating a new column in a data where the column content is defined by levels in a factor in a different column are the same or different, which is dependent on another 2 columns.
Basically, I have a bunch of cows with different ID's that can have different parities. The quarter is the udder quarter affected by the disease and I would like to create a new column with a result that is based on whether quarters are the same or different or occurring once. Any help would be appreciated. Code for abbreviated data frame below/ The new column is the one I would like to achieve.
AnimalID <- c(10,10,10,10,12,12,12,12,14)
Parity <- c(8,8,9,9,4,4,4,4,2)
Udder_quarter <- c("LH","LH","RH","RH","LH","RH","LF","RF","RF")
new_column <- c("same quarter","same quarter","different quarter","different quarter","different quarter","different quarter","different quarter","different quarter","one quarter")
quarters<- data.frame(AnimalID,Parity,Udder_quarter,new_column)
structure(list(HerdAnimalID = c(100165, 100165, 100327, 100327,
100450, 100450), Parity = c(6, 6, 5, 5, 3, 3), no_parities = c(1,
1, 1, 1, 1, 1), case = c("1pathogen_lact", "1pathogen_lact",
"1pathogen_lact", "1pathogen_lact", "1pathogen_lact", "1pathogen_lact"
), FARM = c(1, 1, 1, 1, 1, 1), `CASE NO` = c("101", "101", "638",
"638", "593", "593"), MASTDATE = structure(c(1085529600, 1087689600,
1097884800, 1101254400, 1106092800, 1106784000), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), QRT = c("LF", "LF", "RH", "LF", "LH",
"LH"), MastitisDiagnosis = c("Corynebacterium spp", "Corynebacterium spp",
"S. uberis", "S. uberis", "Bacillus spp", "Bacillus spp"), PrevCalvDate =
structure(c(1075334400,
1075334400, 1096156800, 1096156800, 1091145600, 1091145600), class =
c("POSIXct",
"POSIXt"), tzone = "UTC")), .Names = c("HerdAnimalID", "Parity",
"no_parities", "case", "FARM", "CASE NO", "MASTDATE", "QRT",
"MastitisDiagnosis", "PrevCalvDate"), row.names = c(NA, -6L), class =
c("tbl_df",
"tbl", "data.frame"))
library(dplyr)
quarters %>%
group_by(AnimalID) %>%
mutate(new_column = ifelse(n()==1, 'one quarter', NA)) %>%
group_by(Parity, add=T) %>%
mutate(new_column=ifelse(length(unique(Udder_quarter))==1 & is.na(new_column),
"same quarter",
ifelse(length(unique(Udder_quarter))>1,
"different quarter",
new_column))) %>%
data.frame()
Output is:
AnimalID Parity Udder_quarter new_column
1 10 8 LH same quarter
2 10 8 LH same quarter
3 10 9 RH same quarter
4 10 9 RH same quarter
5 12 4 LH different quarter
6 12 4 RH different quarter
7 12 4 LF different quarter
8 12 4 RF different quarter
9 14 2 RF one quarter
Sample data:
quarters <- structure(list(AnimalID = c(10, 10, 10, 10, 12, 12, 12, 12, 14
), Parity = c(8, 8, 9, 9, 4, 4, 4, 4, 2), Udder_quarter = structure(c(2L,
2L, 4L, 4L, 2L, 4L, 1L, 3L, 3L), .Label = c("LF", "LH", "RF",
"RH"), class = "factor")), .Names = c("AnimalID", "Parity", "Udder_quarter"
), row.names = c(NA, -9L), class = "data.frame")
I would use ave to do that:
f <- function(x) {
if (length(x)==1) return("one")
else if (all(x == x[1])) return("same")
else return("different")
}
ave(Udder_quarter, interaction(AnimalID, Parity), FUN=f)
# [1] "same" "same" "same" "same" "different"
# [6] "different" "different" "different" "one"

Why hover in plotly barchart does not work?

I've got data like this ...
# rok miesiac ile kwartal miesiac2 kwartal2 miesiac3 limit serwis typ ile2 ile_proc lp
# (dbl) (dbl) (dbl) (dbl) (chr) (fctr) (chr) (dbl) (chr) (chr) (dbl) (dbl) (dbl)
# 1 2017 1 31.5 1 1 Q1 2017 Styczeń 0 Sport wizyty 32.5 97 1
# 2 2017 2 1.0 1 2 Q1 2017 Luty 0 Sport wizyty 32.5 3 1
... and I try to draw this plot from plotly library ...
plot_ly(tab,
x = ~lp,
y = ~ile,
color = ~miesiac2,
type = "bar",
text = ~miesiac3,
hoverinfo = "text")
... and everything is ok but hover. It does not work and I have no idea why. What is curious when I have the same format of data but a bit 'longer', everything works.
I have no idea where the problem is. I hope you do!
Simple data:
structure(list(rok = c(2017, 2017), miesiac = c(1, 2), ile = c(31.5,
1), kwartal = c(1, 1), miesiac2 = c("1", "2"), kwartal2 = structure(c(1L,
1L), .Label = "Q1 2017", class = "factor"), miesiac3 = c("Styczeń",
"Luty"), limit = c(97, 97), serwis = c("Sport", "Sport"), typ = c("wizyty",
"wizyty"), ile2 = c(32.5, 32.5), ile_proc = c(97, 3), lp = c(1,
1)), class = "data.frame", .Names = c("rok", "miesiac", "ile",
"kwartal", "miesiac2", "kwartal2", "miesiac3", "limit", "serwis",
"typ", "ile2", "ile_proc", "lp"), row.names = c(NA, -2L))
'Longer' data:
structure(list(rok = c(2016, 2016, 2016, 2016, 2016, 2016, 2016,
2016, 2016, 2017, 2017), miesiac = c(4, 5, 6, 7, 8, 9, 10, 11,
12, 1, 2), ile = c(80.1, 87.5, 159, 104, 125.3, 74.2, 84.9, 74.4,
75.3, 81.8, 2.4), kwartal = c(2, 2, 2, 3, 3, 3, 4, 4, 4, 1, 1
), miesiac2 = c("1", "2", "3", "1", "2", "3", "1", "2", "3",
"1", "2"), kwartal2 = structure(c(1L, 1L, 1L, 2L, 2L, 2L, 3L,
3L, 3L, 4L, 4L), .Label = c("Q2 2016", "Q3 2016", "Q4 2016",
"Q1 2017"), class = "factor"), miesiac3 = c("Kwiecień", "Maj",
"Czerwiec", "Lipiec", "Sierpień", "Wrzesień", "Październik",
"Listopad", "Grudzień", "Styczeń", "Luty"), limit = c(308, 308,
308, 300, 300, 300, 245, 245, 245, 244, 244), serwis = c("Sport",
"Sport", "Sport", "Sport", "Sport", "Sport", "Sport", "Sport",
"Sport", "Sport", "Sport"), typ = c("odslony", "odslony", "odslony",
"odslony", "odslony", "odslony", "odslony", "odslony", "odslony",
"odslony", "odslony"), ile2 = c(326.6, 326.6, 326.6, 303.5, 303.5,
303.5, 234.6, 234.6, 234.6, 84.2, 84.2), ile_proc = c(25, 27,
49, 34, 41, 24, 36, 32, 32, 97, 3), lp = c(1, 1, 1, 2, 2, 2,
3, 3, 3, 4, 4)), class = "data.frame", .Names = c("rok", "miesiac",
"ile", "kwartal", "miesiac2", "kwartal2", "miesiac3", "limit",
"serwis", "typ", "ile2", "ile_proc", "lp"), row.names = c(NA,
-11L))
The plotting works for me, although I do get a warning
Warning message:
In RColorBrewer::brewer.pal(N, "Set2") :
minimal value for n is 3, returning requested palette with 3 different levels
This is just a warning and can in this case be ignored. For the curious, it originates from RColorBrewer and can be avoided by manually specifying the colors.
library(RColorBrewer)
# display.brewer.all() # see all the palettes
# generate colors beforehand - same warning. extract only first two
cols <- brewer.pal(n = 2, name = "Set2")[1:2]
plot_ly(xyshort,
x = ~lp,
y = ~ile,
color = ~miesiac2,
colors = cols, # explicitly name colors
type = "bar",
text = ~miesiac3,
hoverinfo = "text")
R 3.3.2 on Windows 7 and plotly_4.5.6.

do a loop with different versions of a dataset based on a variable id and save the result after each loop

I have a dataset with x countries over y years.
I would like to do a certain analysis (see indicated below, but this code is not the problem)
The problem: I would like to do this analysis of the code I already have, a number of times: each time with a different dataset that has another combination of the x countries and y years. To be clear: I would like to do the analysis for EACH possible combination of the x countries and the y years.
The code that I would like to execute for each version of the dataset (explanation dataset see further)
library(stats)
##### the analysis for one dataset ####
d=data.frame(outcome_spring=rep(1,999),outcome_summer=rep(1,999),
outcome_autumn=rep(1,999),outcome_winter=rep(1,999))
o <- lapply(1:999, function(i) {
Alldata_Rainfed<-subset(Alldata, rainfed <= i)
outcome_spring=sum(Alldata$spring)
outcome_summer=sum(Alldata$summer)
outcome_autumn=sum(Alldata$autumn)
outcome_winter=sum(Alldata$winter)
d[i, ] = c(outcome_spring, outcome_summer, outcome_autumn, outcome_winter)
} )
combination<-as.data.frame(do.call(rbind, o)) #the output I want is another dataset for each unique dataset
#### the end of the analysis for one dataset ####
Desired output
That means that as an output I need to have the same amounts of datasets (named "combination" in the example) as the number of combinations possible between x countries and y years.
As an example, imagine having the following dataset (real dataset has over 500000 observations, 15 countries, 9 years)
> dput(Alldata)
structure(list(country = c("belgium", "belgium", "belgium", "belgium",
"germany", "germany", "germany", "germany"), year = c(2004, 2005,
2005, 2013, 2005, 2009, 2013, 2013), spring = c(23, 24, 45, 23,
1, 34, 5, 23), summer = c(25, 43, 654, 565, 23, 1, 23, 435),
autumn = c(23, 12, 4, 12, 24, 64, 23, 12), winter = c(34,
45, 64, 13, 346, 74, 54, 45), irrigation = c(10, 30, 40,
300, 288, 500, 996, 235), id = c(1, 2, 2, 3, 4, 5, 6, 6)), datalabel = "", time.stamp = "14 Nov 2016 20:09", .Names = c("country",
"year", "spring", "summer", "autumn", "winter", "irrigation",
"id"), formats = c("%9s", "%9.0g", "%9.0g", "%9.0g", "%9.0g",
"%9.0g", "%9.0g", "%9.0g"), types = c(7L, 254L, 254L, 254L, 254L,
254L, 254L, 254L), val.labels = c("", "", "", "", "", "", "",
""), var.labels = c("", "", "", "", "", "", "", "group(country year)"
), row.names = c("1", "2", "3", "4", "5", "6", "7", "8"), version = 12L, class = "data.frame")
In the example above, I already made an id for combining country and year. That means I want to make datasets with all observations that have combinations of the following ids:
dataset 1_2_3_4_5: ids 1, 2, 3, 4, 5 (so this dataset only misses the observations with id = 6)
dataset 1_2_3_4_6: ids 1, 2, 3, 4, 6 (but not 5)
dataset 1_2: ids 1, 2 (but not all the rest)
dataset 3_4_5: ids 3, 4, 5 (but not all the rest)
....
etc etc... Note that I gave the name of the dataset the name of the ids that are included. Otherwise it will be hard for me to distinguish all the different datasets from each other. Other names are fine too, as long as I can distinguish between the datasets!
Thanks for your help!
EDIT: it might be possible that certain datasets give no results (because in the second loop irrigation is used too loop and certain combinations might not have irrigation) but then the output should just be a dataset with missing values
Not sure if this is the most efficient way of doing this, but I think it should work:
# create a df to store the results of all combinations
result=data.frame()
The next loops are based on the combn() function, which creates all possible combinations of a vector (here ID), using m number of elements.
for(i in 2:max(o$id)){
combis=combn(unique(o$id),i)
for(j in 1:ncol(combis)){
sub=o[o$id %in% combis[,j],]
out=sub[1,] # use your function
out$label=paste(combis[,j],collapse ='') #provide an id so you know for which combination this result is
result=rbind(result,out) # paste it to previous output
}
}

Passing current value of ddply split on to function

Here is some sample data for which I want to encode the gender of the names over time:
names_to_encode <- structure(list(names = structure(c(2L, 2L, 1L, 1L, 3L, 3L), .Label = c("jane", "john", "madison"), class = "factor"), year = c(1890, 1990, 1890, 1990, 1890, 2012)), .Names = c("names", "year"), row.names = c(NA, -6L), class = "data.frame")
Here is a minimal set of the Social Security data, limited to just those names from 1890 and 1990:
ssa_demo <- structure(list(name = c("jane", "jane", "john", "john", "madison", "madison"), year = c(1890L, 1990L, 1890L, 1990L, 1890L, 1990L), female = c(372, 771, 56, 81, 0, 1407), male = c(0, 8, 8502, 29066, 14, 145)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -6L), .Names = c("name", "year", "female", "male"))
I've defined a function which subsets the Social Security data given a year or range of years. In other words, it calculates whether a name was male or female over a given time period by figuring out the proportion of male and female births with that name. Here is the function along with a helper function:
require(plyr)
require(dplyr)
select_ssa <- function(years) {
# If we get only one year (1890) convert it to a range of years (1890-1890)
if (length(years) == 1) years <- c(years, years)
# Calculate the male and female proportions for the given range of years
ssa_select <- ssa_demo %.%
filter(year >= years[1], year <= years[2]) %.%
group_by(name) %.%
summarise(female = sum(female),
male = sum(male)) %.%
mutate(proportion_male = round((male / (male + female)), digits = 4),
proportion_female = round((female / (male + female)), digits = 4)) %.%
mutate(gender = sapply(proportion_female, male_or_female))
return(ssa_select)
}
# Helper function to determine whether a name is male or female in a given year
male_or_female <- function(proportion_female) {
if (proportion_female > 0.5) {
return("female")
} else if(proportion_female == 0.5000) {
return("either")
} else {
return("male")
}
}
Now what I want to do is use plyr, specifically ddply, to subset the data to be encoded by year, and merge each of those pieces with the value returned by the select_ssa function. This is the code I have.
ddply(names_to_encode, .(year), merge, y = select_ssa(year), by.x = "names", by.y = "name", all.x = TRUE)
When calling select_ssa(year), this command works just fine if I hard code a value like 1890 as the argument to the function. But when I try to pass it the current value for year that ddply is working with, I get an error message:
Error in filter_impl(.data, dots(...), environment()) :
(list) object cannot be coerced to type 'integer'
How can I pass the current value of year on to ddply?
I think you're making things too complicated by trying to do a join inside ddply. If I were to use dplyr I would probably do something more like this:
names_to_encode <- structure(list(name = structure(c(2L, 2L, 1L, 1L, 3L, 3L), .Label = c("jane", "john", "madison"), class = "factor"), year = c(1890, 1990, 1890, 1990, 1890, 2012)), .Names = c("name", "year"), row.names = c(NA, -6L), class = "data.frame")
ssa_demo <- structure(list(name = c("jane", "jane", "john", "john", "madison", "madison"), year = c(1890L, 1990L, 1890L, 1990L, 1890L, 1990L), female = c(372, 771, 56, 81, 0, 1407), male = c(0, 8, 8502, 29066, 14, 145)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -6L), .Names = c("name", "year", "female", "male"))
names_to_encode$name <- as.character(names_to_encode$name)
names_to_encode$year <- as.integer(names_to_encode$year)
tmp <- left_join(ssa_demo,names_to_encode) %.%
group_by(year,name) %.%
summarise(female = sum(female),
male = sum(male)) %.%
mutate(proportion_male = round((male / (male + female)), digits = 4),
proportion_female = round((female / (male + female)), digits = 4)) %.%
mutate(gender = ifelse(proportion_female == 0.5,"either",
ifelse(proportion_female > 0.5,"female","male")))
Note that 0.1.1 is still a little finicky about the types of join columns, so I had to convert them. I think I saw some activity on github that suggested that was either fixed in the dev version, or at least something they're working on.

Resources