Problem
I'm building some weather data and need to check and make sure that there are no outliers, values equal to -9999, and no missing days. If any of these conditions are found, I've written a function nearest() which will find the 5 closest stations and compute an inverse distance weighted value, then plug that back into where the condition was found. The problem is that the code works, but it will take a very long time to run. I have over 600 stations and each station takes about 1 hour to compute.
Question
Can this code be optimized to improve computation time? What is the best way to deal with nested for() loops being used this way?
Code
The following code is a very small portion of the data set used as a reproducible example. This obviously runs very fast, but when spread out over the entire data set will take a long time. Notice that in output, row 10 has an NA in the value. When the code is run, that value is replaced.
dput:
db_sid <- structure(list(id = "USC00030528", lat = 35.45, long = -92.4,
element = "TMAX", firstyear = 1892L, lastyear = 1952L, state = "arkansas"), .Names = c("id",
"lat", "long", "element", "firstyear", "lastyear", "state"), row.names = 5L, class = "data.frame")
output <- structure(list(id = c("USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632", "USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632"), element = c("TMAX", "TMIN", "TMAX",
"TMIN", "TMAX", "TMIN", "TMAX", "TMIN", "TMAX", "TMIN"), year = c(1900,
1900, 1900, 1900, 1900, 1900, 1900, 1900, 1900, 1900), month = c(1,
1, 2, 2, 3, 3, 4, 4, 5, 5), day = c(1, 1, 1, 1, 1, 1, 1, 1, 1,
1), date = structure(c(-25567, -25567, -25536, -25536, -25508,
-25508, -25477, -25477, -25447, -25447), class = "Date"), value = c(30.02,
10.94, 37.94, 10.94, NA, 28.04, 64.94, 41, 82.04, 51.08)), .Names = c("id",
"element", "year", "month", "day", "date", "value"), row.names = c(NA,
-10L), class = c("tbl_df", "data.frame"))
newdat <- structure(list(id = c("USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632", "USC00031632", "USC00031632", "USC00031632",
"USC00031632", "USC00031632"), element = structure(c(1L, 2L,
1L, 2L, 2L, 1L, 2L, 1L, 2L, 1L), .Label = c("TMAX", "TMIN"), class = "factor"),
year = c("1900", "1900", "1900", "1900", "1900", "1900",
"1900", "1900", "1900", "1900"), month = c("01", "01", "02",
"02", "03", "04", "04", "05", "05", "01"), day = c("01",
"01", "01", "01", "01", "01", "01", "01", "01", "02"), date = structure(c(-25567,
-25567, -25536, -25536, -25508, -25477, -25477, -25447, -25447,
-25566), class = "Date"), value = c(30.02, 10.94, 37.94,
10.94, 28.04, 64.94, 41, 82.04, 51.08, NA)), .Names = c("id",
"element", "year", "month", "day", "date", "value"), row.names = c(NA,
10L), class = "data.frame")
stack <- structure(list(id = c("USC00035754", "USC00236357", "USC00033466",
"USC00032930"), x = c(-92.0189, -95.1464, -93.0486, -94.4481),
y = c(34.2256, 39.9808, 34.5128, 36.4261), value = c(62.06,
44.96, 55.94, 57.92)), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"), .Names = c("id", "x", "y", "value"))
station <- structure(list(id = "USC00031632", lat = 36.4197, long = -90.5858,
value = 30.02), row.names = c(NA, -1L), class = c("tbl_df",
"data.frame"), .Names = c("id", "lat", "long", "value"))
nearest() function:
nearest <- function(id, yr, mnt, dy, ele, out, stack, station){
if (dim(stack)[1] >= 1){
ifelse(dim(stack)[1] == 1, v <- stack$value, v <- idw(stack$value, stack[,2:4], station[,2:3]))
} else {
ret <- filter(out, id == s_id & year == yr, month == mnt, element == ele, value != -9999)
v <- mean(ret$value)
}
return(v)
}
for() loops:
library(dplyr)
library(phylin)
library(lubridate)
for (i in unique(db_sid$id)){
# Check for outliers
for(j in which(output$value > 134 | output$value < -80 | output$value == -9999)){
output[j,7] <- nearest(id = j, yr = as.numeric(output[j,3]), mnt = as.numeric(output[j,4]), dy = as.numeric(output[j,5]),
ele = as.character(output[j,2]), out = output)
}
# Check for NA and replace
for (k in which(is.na(newdat$value))){
newdat[k,7] <- nearest(id = k, yr = as.numeric(newdat[k,3]), mnt = as.numeric(newdat[k,4]), dy = as.numeric(newdat[k,5]),
ele = as.character(newdat[k,2]), out = newdat, stack = stack, station = station)
}
}
I'm not sure I understand at all what you're trying to do. For example, the i from the outer for loop is never actually used. Here is some code that I think will be useful to you:
library(plyr)
library(dplyr)
output_summary =
output %>%
filter(value %>% between(-80, 134) ) %>%
group_by(date, element, id) %>%
summarize(mean_value = mean(value))
if (nrow(stack) == 1) fill_value = stack$value else
fill_value = idw(
stack$value,
stack %>% select(x, y, value),
station %>% select(lat, long) )
newdat_filled =
newdat %>%
mutate(filled_value =
value %>%
mapvalues(NA, fill_value) )
Related
I have a dataframe like below (df1). I am filtering the df1 based on user input and can summarize the data to reflect the total numbers. But based on user input I want to split the data to break it into multiple tables, group and summarize the data in a specific format.I'm not sure if i need to melt the data and then do pivot_wide.
df1<-structure(list(record_id = c(1, 1, 1, 1, 1, 1), Name = c("Anna",
"Anna", "Anna", "Anna", "Anna",
"Anna"), Country = c("USA", "USA",
"USA", "USA", "USA",
"USA"),
record_id.y = c("1", "2", "3", "4", "5",
"6"), emp_id = c("1837100", "203013",
"1820027", "1852508", "2123813",
"1887667"), rel = c("S", "M", "I",
"F", "I", "I"), Date = structure(c(17869,
17862, 17865, 17848, 17862, 17848), class = "Date"), date1 = structure(c(1639134523,
1638615986, 1638764440, 1638876083, 1644605968, 1638764441
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), date2 = structure(c(NA,
NA, NA, NA, 1638615988, NA), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), typec = c(1, 1, 1, 1, 1, 1
), typer = c(0, 0, 0, 0, 0, 0), typey = c(0,
0, 0, 0, 0, 0), is_present = c(NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_
)), row.names = c(NA, 6L), class = "data.frame")
My code
table1 <- reactive({
req(input$Name,
input$Country,
input$Status,
input$Type)
my_data %>%
select(Name,
Country,
typec,
typer,
typey,
date1,
date2,
Date,
rel) %>%
filter(Name %in% input$Name,
Date >= input$dates[1] &
Date <= input$dates[2]) %>%
filter(
(('Status1' %in% input$status) & !is.na(date1)) |
(('Status2' %in% input$status) & !is.na(date2))
) %>%
filter(
(('C' %in% input$type) & typec == '1') |
(('R' %in% input$type) & typer == '1') |
(('Y' %in% input$type) & typey == '1')
) %>%
filter(Rel %in% input$rel) %>%
split(input$Name) %>%
group_by(get(input$status,
input$rel,
input$type)) %>%
summarize(Total=n(), .groups = "drop")
})
output$table <- DT::renderDataTable({
datatable(table1())
})
Desired output format
Results for Anna
C R Y
Status1
Status2
Results for Vika
C R Y
Status1
Status2
I want to calculate the number of times two individuals share the same group number. I'm working with quite a large dataset (169 individuals and over a 1000 observations (rows) of them) and I'm looking for an efficient way to count the occurrence of them being in the same group. My (simplified) data looks like this:
ID
Group number
Date
Time
Aa
1
15-06-22
15:05:22
Bd
1
15-06-22
15:05:27
Cr
2
15-06-22
15:07:12
Bd
1
15-06-22
17:33:15
Aa
2
15-06-22
17:36:54
Cr
2
15-06-22
17:37:01
...
I would like my output data to look like this:
Aa-Bd
Aa-Cr
Bd-Cr
...
1
1
0
Or:
Occurrence
Dyad
1
Aa-Bd; Aa-Cr
0
Bd-Cr
Or even a matrix might work. I've been trying to replicate the solution posed for this problem: Count occurrences of a variable having two given values corresponding to one value of another variable
but for some reason my matrix remains empty, even though I know that certain individuals have been in groups with others.
Any help and suggestions would be extremely appreciated! I feel like the solution shouldn't be too complicated but for some reason I can't seem to figure it out.
Thanks in advance!
Edit: some example data from dput():
dput(c[1:5,])
structure(list(Date = structure(c(19129, 19129, 19129, 19129,
19129), class = "Date"), Time = c("11:05:58", "11:06:06", "11:06:16",
"11:06:33", "11:06:59"), Data = structure(c(1L, 1L, 1L, 1L, 1L
), .Label = "Crossing", class = "factor"), Group = structure(c(5L,
5L, 5L, 5L, 5L), .Label = c("Ankhase", "Baie Dankie", "Kubu",
"Lemon Tree", "Noha"), class = "factor"), IDIndividual1 = structure(c(158L,
158L, 34L, 153L, 14L), .Label = c("Aaa", "Aal", "Aan", "Aapi",
"Aar", "Aara", "Aare", "Aat", "Amst", "App", "Asis", "Awa", "Beir",
"Bela", "Bet", "Buk", "Daa", "Dais", "Dazz", "Deli", "Dewe",
"Dian", "Digb", "Dix", "Dok", "Dore", "Eina", "Eis", "Enge",
"Fle", "Flu", "Fur", "Gale", "Gaya", "Gese", "Gha", "Ghid", "Gib",
"Gil", "Ginq", "Gobe", "Godu", "Goe", "Gom", "Gran", "Gree",
"Gri", "Gris", "Griv", "Guat", "Gub", "Guba", "Gubh", "Guz",
"Haai", "Hee", "Heer", "Heli", "Hond", "Kom", "Lail", "Lewe",
"Lif", "Lill", "Lizz", "Mara", "Mas", "Miel", "Misk", "Moes",
"Mom", "Mui", "Naal", "Nak", "Ncok", "Nda", "Ndaw", "Ndl", "Ndon",
"Ndum", "Nge", "Nko", "Nkos", "Non", "Nooi", "Numb", "Nurk",
"Nuu", "Obse", "Oerw", "Oke", "Ome", "Oort", "Ouli", "Oup", "Palm",
"Pann", "Papp", "Pie", "Piep", "Pix", "Pom", "Popp", "Prai",
"Prat", "Pret", "Prim", "Puol", "Raba", "Rafa", "Ram", "Rat",
"Rede", "Ree", "Reen", "Regi", "Ren", "Reno", "Rid", "Rim", "Rioj",
"Riss", "Riva", "Rivi", "Roc", "Sari", "Sey", "Sho", "Sig", "Sirk",
"Sitr", "Skem", "Sla", "Spe", "Summary", "Syl", "Tam", "Ted",
"Tev", "Udup", "Uls", "Umb", "Unk", "UnkAM", "UnkBB", "UnkJ",
"UnkJF", "UnkJM", "Upps", "Utic", "Utr", "Vla", "Vul", "Xala",
"Xar", "Xeni", "Xia", "Xian", "Xih", "Xin", "Xinp", "Xop", "Yam",
"Yamu", "Yara", "Yaz", "Yelo", "Yodo", "Yuko"), class = "factor"),
Behaviour = structure(c(2L, 3L, 1L, 1L, 1L), .Label = c("Crossing",
"First Approacher", "First Crosser", "Last Crosser", "Summary"
), class = "factor"), CrossingType = c("Road - Ground Level",
"Road - Ground Level", "Road - Ground Level", "Road - Ground Level",
"Road - Ground Level"), GPSS = c(-27.9999, -27.9999, -27.9999,
-27.9999, -27.9999), GPSE = c(31.20376, 31.20376, 31.20376,
31.20376, 31.20376), Context = structure(c(1L, 1L, 1L, 1L,
1L), .Label = c("Crossing", "Feeding", "Moving", "Unknown"
), class = "factor"), Observers = structure(c(12L, 12L, 12L,
12L, 12L), .Label = c("Christelle", "Christelle; Giulia",
"Christelle; Maria", "Elif; Giulia", "Josefien; Zach; Flavia; Maria",
"Mathieu", "Mathieu; Giulia", "Mike; Mila", "Mila", "Mila; Christelle; Giulia",
"Mila; Elif", "Mila; Giulia", "Nokubonga; Mila", "Nokubonga; Tam; Flavia",
"Nokubonga; Tam; Flavia; Maria", "Nokubonga; Zach; Flavia; Maria",
"Tam; Flavia", "Tam; Zach; Flavia; Maria", "Zach", "Zach; Elif; Giulia",
"Zach; Flavia; Maria", "Zach; Giulia"), class = "factor"),
DeviceId = structure(c(10L, 10L, 10L, 10L, 10L), .Label = c("{129F4050-2294-0D43-890F-3B2DEF58FC1A}",
"{1A678F44-DB8C-1245-8DD7-9C2F92F086CA}", "{1B249FD2-AA95-5745-9A32-56CDD0587018}",
"{2C7026A6-6EDC-BA4F-84EC-3DDADFFD4FD7}", "{2E489E9F-00BE-E342-8CAE-941618B2F0E6}",
"{359CEB57-351F-F54F-B2BD-77A05FB6C349}", "{3727647C-B73A-184B-B187-D6BF75646B84}",
"{7A4E6639-7387-7648-88EC-7FD27A0F258A}", "{854B02F2-5979-174A-AAE8-398C21664824}",
"{89B5C791-1F71-0149-A2F7-F05E0197F501}", "{D92DF19A-9021-A740-AD99-DCCE1D88E064}"
), class = "factor"), Obs.nr = c(1, 1, 1, 1, 1), Gp.nr = c(1,
3, 3, 4, 5)), row.names = c(NA, -5L), groups = structure(list(
Obs.nr = 1, .rows = structure(list(1:5), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
In here Gp.nr is my group number, IDIndividual1 is my ID.
This is not efficient at all, but as a starting point you can use (GN denotes the group number)
my_ID <- unique(df$ID)
matrix <- matrix(nrow = length(my_ID),ncol = length(my_ID))
for (i in 1:length(my_ID)){
for (j in 1:length(my_ID)){
matrix[i,j] <- length(intersect(df$GN[df$ID == my_ID[i]],df$GN[df$ID == my_ID[j]]))}}
Check this out:
## Creating the Dataframe
df = data.frame(ID = c("Aa","Bd","Cc","Dd","Cr"),
GroupNumber=c(1,2,1,3,3))
## Loading the libraries
library(dplyr)
library(tidyverse)
library(stringr)
## Grouping to find out which observations share same group
df1 = df %>%
group_by(GroupNumber) %>%
summarise(ID_=paste(ID, collapse="-"),
CountbyID = n_distinct(ID_)) %>%
filter(str_detect(ID_, "-"))
## Creating all possible pair combinations and then joining and concatenating all rows
df2 = data.frame(t(combn(df$ID,2))) %>%
mutate(Comb = paste(X1,"-",X2, sep = "")) %>%
left_join(df1, by=c("Comb"="ID_")) %>%
select(Comb, CountbyID) %>%
replace(is.na(.), 0) %>%
group_by(CountbyID) %>%
summarise(ID=paste(Comb, collapse=";"))
Hope this helps!
UPDATE
The way the dataframe is setup, its causing issues to the "IDIndividual1" column. Based on the way it is setup, it has more factor levels than the unique data points. Therefore, I simply converted it to a character. Try the code below:
df = df[,c("IDIndividual1","Gp.nr")]
colnames(df) = c("ID","GroupNumber")
df$ID = as.character(df$ID) ## Converting factors to characters
## Loading the libraries
library(dplyr)
library(tidyverse)
library(stringr)
## Grouping to find out which observations share same group
df1 = df %>%
group_by(GroupNumber) %>%
summarise(ID_=paste(ID, collapse="-"),
CountbyID = n_distinct(ID_)) %>%
filter(str_detect(ID_, "-"))
## Creating all possible pair combinations and then joining and concatenating all rows
df2 = data.frame(t(combn(df$ID,2))) %>%
distinct() %>%
filter(X1 != X2) %>%
mutate(Comb = paste(X1,"-",X2, sep = "")) %>%
left_join(df1, by=c("Comb"="ID_")) %>%
select(Comb, CountbyID) %>%
replace(is.na(.), 0) %>%
group_by(CountbyID) %>%
summarise(ID=paste(Comb, collapse=";"))
I have two datasets:
df1
ID paddock cow ID
90/123 10 09/123
90/124 11 09/124
90/125 11 09/124
df2
ID paddock
09/123 20
09/124 21
I would like to match df1$cowID with df2$ID and return df2$paddock for whatever row matches. My current code is as follows:
dt <- ifelse(df1$cowID %in% df2$ID, df2$paddock[i], NA)
But I'm getting a return error. Could someone direct me in the right direction please? Thanks in advance!
You might consider joining the datasets.
dplyr::left_join(df1, df2, by = c('cow ID', 'ID')
You should probably use match :
df1$df2_paddock <- df2$paddock[match(df1$cow_ID, df2$ID)]
df1
# ID paddock cow_ID df2_paddock
#1 90/123 10 09/123 20
#2 90/124 11 09/124 21
data
df1 <- structure(list(ID = structure(1:2, .Label = c("90/123", "90/124"
), class = "factor"), paddock = 10:11, cow_ID = structure(1:2, .Label = c("09/123",
"09/124"), class = "factor")), class = "data.frame", row.names = c(NA, -2L))
df2 <- structure(list(ID = structure(1:2, .Label = c("09/123", "09/124"
), class = "factor"), paddock = 20:21), class = "data.frame",
row.names = c(NA, -2L))
You can do that by joining the two dataframes and getting the column that you want.
Using Base R
df1 <-
data.frame(
ID = c("90/123", "90/124"),
paddock = c(10, 11),
cow_ID = c("09/123", "09/124")
)
df2 <-
data.frame(
ID = c("90/123", "90/124"),
paddock = c(20, 21)
)
# Joining the two dataframes by ID then choosing coloum of interest
merge(df1, df2, by = c("ID"), suffixes = c(".x", ".y"))["paddock.y"]
# paddock.y
# 20
# 21
Using Dplyr
library(dplyr)
df1 <-
data.frame(
ID = c("90/123", "90/124"),
paddock = c(10, 11),
cow_ID = c("09/123", "09/124")
)
df2 <-
data.frame(
ID = c("90/123", "90/124"),
paddock = c(20, 21)
)
# Joining the two dataframes by ID then choosing coloum of interest
df1 %>%
inner_join(df2, by = c("ID"), suffixes = c(".x", ".y")) %>%
select(paddock.y) %>%
rename(paddock = paddock.y)
# paddock
# 20
# 21
If you would like to use ifelse(), maybe you can use the following code to make it
with(df2,ifelse(ID %in% df1$cow_ID,paddock,NA))
such that
> with(df2,ifelse(ID %in% df1$cow_ID,paddock,NA))
[1] 20 21
DATA
df1 <- structure(list(ID = structure(1:3, .Label = c("90/123", "90/124",
"90/125"), class = "factor"), paddock = c(10, 11, 11), cow_ID = structure(c(1L,
2L, 2L), .Label = c("09/123", "09/124"), class = "factor")), class = "data.frame", row.names = c(NA,
-3L))
df2 <- structure(list(ID = structure(1:2, .Label = c("09/123", "09/124"
), class = "factor"), paddock = c(20, 21)), class = "data.frame", row.names = c(NA,
-2L))
Problem
There is 1 main station (df) and 3 local stations (s) stacked in a single data.frame with values for three days. The idea is to take each day from the main station, find the relative anomaly of the three local stations, and smooth it using inverse distance weighting (IDW) from the phylin package. This is then applied to the value in the main station by multiplication.
Any suggestions on improving this code (e.g. data.table, dplyr, apply)? I still don't know how to approach this problem without the cumbersome for loop.
dput
s <- structure(list(id = c("USC00031152", "USC00034638", "USC00036352",
"USC00031152", "USC00034638", "USC00036352", "USC00031152", "USC00034638",
"USC00036352"), lat = c(33.59, 34.7392, 35.2833, 33.59, 34.7392,
35.2833, 33.59, 34.7392, 35.2833), long = c(-92.8236, -90.7664,
-93.1, -92.8236, -90.7664, -93.1, -92.8236, -90.7664, -93.1),
year = c(1900, 1900, 1900, 1900, 1900, 1900, 1900, 1900,
1900), month = c(1, 1, 1, 1, 1, 1, 1, 1, 1), day = c(1L,
1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L), value = c(63.3157576809045,
86.0490598902219, 76.506386949066, 71.3760752788486, 89.9119576975542,
76.3535163951321, 53.7259645981243, 61.7989638892985, 85.8911224149051
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-9L), .Names = c("id", "lat", "long", "year", "month", "day",
"value"))
df <- structure(list(id = c(12345, 12345, 12345), lat = c(100, 100,
100), long = c(50, 50, 50), year = c(1900, 1900, 1900), month = c(1,
1, 1), day = 1:3, value = c(54.8780020601509, 106.966029162171,
98.3198828955801)), row.names = c(NA, -3L), class = "data.frame", .Names = c("id",
"lat", "long", "year", "month", "day", "value"))
Code
library(phylin)
nearest <- function(i, loc){
# Stack 3 local stations
stack <- s[loc:(loc+2),]
# Get 1 main station
station <- df[i,]
# Check for NA and build relative anomaly (r)
stack <- stack[!is.na(stack$value),]
stack$r <- stack$value/station$value
# Use IDW and return v
v <- as.numeric(ifelse(dim(stack)[1] == 1,
stack$r,
idw(stack$r, stack[,c(2,3,8)], station[,2:3])))
return(v)
}
ncdc <- 1
for (i in 1:nrow(df)){
# Get relative anomaly from function
r <- nearest(i, ncdc)
# Get value from main station and apply anomaly
p <- df[i,7]
df[i,7] <- p*r
# Iterate to next 3 local stations
ncdc <- ncdc + 3
}
Suppose you let your nearest function unchanged.
Then you could get your new value column in df by
newvalue <- sapply(1:NROW(df), function (i) df[i,7] * nearest(i, 3*(i-1)+1))
df$value <- newvalue
Here is some sample data for which I want to encode the gender of the names over time:
names_to_encode <- structure(list(names = structure(c(2L, 2L, 1L, 1L, 3L, 3L), .Label = c("jane", "john", "madison"), class = "factor"), year = c(1890, 1990, 1890, 1990, 1890, 2012)), .Names = c("names", "year"), row.names = c(NA, -6L), class = "data.frame")
Here is a minimal set of the Social Security data, limited to just those names from 1890 and 1990:
ssa_demo <- structure(list(name = c("jane", "jane", "john", "john", "madison", "madison"), year = c(1890L, 1990L, 1890L, 1990L, 1890L, 1990L), female = c(372, 771, 56, 81, 0, 1407), male = c(0, 8, 8502, 29066, 14, 145)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -6L), .Names = c("name", "year", "female", "male"))
I've defined a function which subsets the Social Security data given a year or range of years. In other words, it calculates whether a name was male or female over a given time period by figuring out the proportion of male and female births with that name. Here is the function along with a helper function:
require(plyr)
require(dplyr)
select_ssa <- function(years) {
# If we get only one year (1890) convert it to a range of years (1890-1890)
if (length(years) == 1) years <- c(years, years)
# Calculate the male and female proportions for the given range of years
ssa_select <- ssa_demo %.%
filter(year >= years[1], year <= years[2]) %.%
group_by(name) %.%
summarise(female = sum(female),
male = sum(male)) %.%
mutate(proportion_male = round((male / (male + female)), digits = 4),
proportion_female = round((female / (male + female)), digits = 4)) %.%
mutate(gender = sapply(proportion_female, male_or_female))
return(ssa_select)
}
# Helper function to determine whether a name is male or female in a given year
male_or_female <- function(proportion_female) {
if (proportion_female > 0.5) {
return("female")
} else if(proportion_female == 0.5000) {
return("either")
} else {
return("male")
}
}
Now what I want to do is use plyr, specifically ddply, to subset the data to be encoded by year, and merge each of those pieces with the value returned by the select_ssa function. This is the code I have.
ddply(names_to_encode, .(year), merge, y = select_ssa(year), by.x = "names", by.y = "name", all.x = TRUE)
When calling select_ssa(year), this command works just fine if I hard code a value like 1890 as the argument to the function. But when I try to pass it the current value for year that ddply is working with, I get an error message:
Error in filter_impl(.data, dots(...), environment()) :
(list) object cannot be coerced to type 'integer'
How can I pass the current value of year on to ddply?
I think you're making things too complicated by trying to do a join inside ddply. If I were to use dplyr I would probably do something more like this:
names_to_encode <- structure(list(name = structure(c(2L, 2L, 1L, 1L, 3L, 3L), .Label = c("jane", "john", "madison"), class = "factor"), year = c(1890, 1990, 1890, 1990, 1890, 2012)), .Names = c("name", "year"), row.names = c(NA, -6L), class = "data.frame")
ssa_demo <- structure(list(name = c("jane", "jane", "john", "john", "madison", "madison"), year = c(1890L, 1990L, 1890L, 1990L, 1890L, 1990L), female = c(372, 771, 56, 81, 0, 1407), male = c(0, 8, 8502, 29066, 14, 145)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -6L), .Names = c("name", "year", "female", "male"))
names_to_encode$name <- as.character(names_to_encode$name)
names_to_encode$year <- as.integer(names_to_encode$year)
tmp <- left_join(ssa_demo,names_to_encode) %.%
group_by(year,name) %.%
summarise(female = sum(female),
male = sum(male)) %.%
mutate(proportion_male = round((male / (male + female)), digits = 4),
proportion_female = round((female / (male + female)), digits = 4)) %.%
mutate(gender = ifelse(proportion_female == 0.5,"either",
ifelse(proportion_female > 0.5,"female","male")))
Note that 0.1.1 is still a little finicky about the types of join columns, so I had to convert them. I think I saw some activity on github that suggested that was either fixed in the dev version, or at least something they're working on.