Combine ifelse two conditions and loop - r

I have a liste of dataframes (file1, file2, ..., file 72). For each dataframe I want to create one variable containing information from another dataframe based on two conditions.
The idea is simple:
condition 1: if file*$countryid equals source$country, and
condition 2: if file*$year is higher than source$starting but lower than source$ending, then if true I want to create a column file*$rank with the value in source$rank
I have been trying code lines like this but this code does not go through all lines in source:
file1$rank<-ifelse(file1$countryid=source$countryid & file1$year>source$starting & file1$year<source$ending,source$rank,NA)
In addition I would like to implement this within a loop to avoid iterating manually through all these dataframes:
dflist<-Filter(is.data.frame, mget(ls()))
dflist<-function(df,x){df$rank<-ifelse(df$countryid=source$countryid & df$year>source$starting & df$year<source$ending,source$rank,NA))
Here is an example of the data I have.
Thank you!
> dput(file1)
structure(list(id = c(1, 2, 3), countryid = c(10, 10, 13), year = c(1948,
1954, 1908)), row.names = c(NA, -3L), class = c("tbl_df", "tbl",
"data.frame"))
dput(file2)
structure(list(id = c(1, 2, 3), countryid = c(13, 10, 13), year = c(1907,
1908, 1907)), row.names = c(NA, -3L), class = c("tbl_df", "tbl",
"data.frame"))
> dput(source)
structure(list(country = c(13, 13, 13, 10, 10, 10), rank = c(1,
2, 3, 1, 2, 3), starting = c(1885, 1909, 1940, 1902, 1907, 1931
), ending = c(1908, 1939, 1960, 1906, 1930, 1960)), row.names = c(NA,
-6L), class = c("tbl_df", "tbl", "data.frame"))

We can use a non-equi join after getting all the file\\d+ datasets into a list
library(data.table)
out <- lapply(mget(ls(pattern = '^file\\d+$')), function(dat)
setDT(dat)[, year := as.integer(year)][as.data.table(source), rank := i.rank,
on = .(countryid = country, year > starting, year < ending)])
-output
out
#$file1
# id countryid year rank
#1: 1 10 1948 3
#2: 2 10 1954 3
#3: 3 13 1908 NA
#$file2
# id countryid year rank
#1: 1 13 1907 1
#2: 2 10 1908 2
#3: 3 13 1907 1
if it needs to update the original objects, use list2env
list2env(out, .GlobalEnv)

Related

Create new column with if else in R

I have a database like this:
structure(list(code = c(1, 2, 3, 4), age = c(25, 30, 45, 50),
car = c(0, 1, 0, 1)), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"))
I want to create a column "drivers under 40" with this conditions:
0 if Age<40 & car==0
1 if Age<40 & car==1
How do I create the third column with this conditions?
I tried using the code "if else" to create a variable but it doesn't work.
drivers <- ifelse((age <= 40) & (car==0), 0, ifelse((age<=40) & (car==1), 1))
Is maybe the code written wrong?
Is there another method to do it? I am afraid to mess up the parentheses so I'd prefer another method, if there is any faster
Here is a dplyr version with case_when
library(dplyr)
df %>%
mutate(drivers_under_40 = case_when(age <= 40 & car==0 ~ 0,
age <= 40 & car==1 ~ 1,
TRUE ~ NA_real_))
code age car drivers_under_40
<dbl> <dbl> <dbl> <dbl>
1 1 25 0 0
2 2 30 1 1
3 3 45 0 NA
4 4 50 1 NA
A base R option
df1$drivers_under_40 <- with(df1, (age <= 40 & car == 1)* NA^(age> 40))
df1$drivers_under_40
[1] 0 1 NA NA
Unless you work with dplyr you have to specify the data in your ifelse statement.
data$column for example. Also you have to assign a new column for the operation.
And the last else-statement is missing.
so your ifelse statement should look like this:
data = structure(list(code = c(1, 2, 3, 4), age = c(25, 30, 45, 50),
car = c(0, 1, 0, 1)), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"))
data$drivers <- ifelse((data$age <= 40) & (data$car==0), 0, ifelse((data$age<=40) & (data$car==1), 1, "here you have to fill another 'else' value"))

Performance indices for unequal datasets in R

I wanted to do the performance indices in R. My data looks like this (example):
enter image description here
I want to ignore the comparison of values in Time 2 and 4 in data frame 1 and then compare it with the available set of observed data. I know how to develop the equation for the performance indicators (R2, RMSE, IA, etc.), but I am not sure how to ignore the data in the simulated data frame when corresponding observed data is not available for comparison.
Perhaps just do a left join, and compare the columns directly?
library(dplyr)
left_join(d2,d1 %>% rename(simData=Data), by="Time")
Output:
Time Data simData
<dbl> <dbl> <dbl>
1 1 57 52
2 3 88 78
3 5 19 23
Input:
d1 = structure(list(Time = c(1, 2, 3, 4, 5), Data = c(52, 56, 78,
56, 23)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-5L))
d2 = structure(list(Time = c(1, 3, 5), Data = c(57, 88, 19)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -3L))

Using ddply in combo with weighted.mean in a for loop with dynamic variables

my dataset looks like this:
structure(list(GEOLEV2 = structure(c("768001001", "768001001",
"768001002", "768001002", "768001006", "768001006", "768001002",
"768001002", "768001002", "768001002", "768002016", "768002016"
), format.stata = "%9s"), DHSYEAR = structure(c(1988, 1988, 1988,
1988, 1998, 1998, 1998, 1998, 2013, 2013, 2013, 2013), format.stata = "%9.0g"),
v005 = structure(c(1e+06, 1e+06, 1e+06, 1e+06, 1815025, 1815025,
1517492, 1517492, 1350366, 1350366, 617033, 617033), format.stata = "%9.0g"),
age = structure(c(37, 22, 18, 46, 15, 29, 18, 42, 19, 15,
35, 16), format.stata = "%9.0g"), highest_year_edu = structure(c(2,
6, NA, NA, 5, NA, 2, 3, 2, NA, 5, 3), format.stata = "%9.0g")), row.names = c(NA,
-12L), class = c("tbl_df", "tbl", "data.frame"), label = "Written by R")
I want to collapse it on a df1$GEOLEV2/df1$DHSYEAR basis, with weighted.mean as the collapsing function. Each variable shall remain with the same name.
I chose the function ddply and when I try it on a single variable, it works:
ddply(df1, ~ df1$GEOLEV2+ df1$DHSYEAR, summarise, age = weighted.mean(age, v005, na.rm = TRUE))
However, when I build the loop, the function returns me an error. My trial was:
df1_collapsed <- ddply(df1, ~ df1$GEOLEV2+ df1$DHSYEAR, summarise, age = weighted.mean(age, v005, na.rm = TRUE))
for (i in names(df1[4,5)) {
variable <- ddply(df1, ~ df1$GEOLEV2+ df1$DHSYEAR, summarise, i = weighted.mean(i, v005, na.rm = TRUE))
df1_collapsed <- left_join(df1_collapsed, variable, by = c("df1$GEOLEV2", "df1$DHSYEAR"))
}
and the error is
Error in weighted.mean.default(i, v005, na.rm = TRUE) :
'x' and 'w' must have the same length
How can I build the for loop, embedding the variable name in the loop?
In general in R you don't need loops for grouping and summarising (which you would call collapsing in Stata). You can use dplyr for this type of operation:
df1 %>%
group_by(GEOLEV2, DHSYEAR) %>%
summarise(
across(age:highest_year_edu, ~ weighted.mean(.x, v005, na.rm = TRUE))
)
# A tibble: 6 x 4
# Groups: GEOLEV2 [4]
# GEOLEV2 DHSYEAR age highest_year_edu
# <chr> <dbl> <dbl> <dbl>
# 1 768001001 1988 29.5 4
# 2 768001002 1988 32 NaN
# 3 768001002 1998 30 2.5
# 4 768001002 2013 17 2
# 5 768001006 1998 22 5
# 6 768002016 2013 25.5 4

Summarize by year and create a new variable containing a vector / list of unique values for each row

I have a dataset with all natural disaster that occured over a certain time period. I would like to summarize them by year and state. When summarizing I would like to create a variable (= d_disasters) that shows me the unique types of natural disasters, e.g. for Texas, I would expect to only show Hurricane.
I am currently using dplyr:group_by and dplyr::summarize to summarize my data by year and by state & dplyr::mutate and dplyr:map_int to create new variables with the total number of natural disasters per year ($n_disasters using length) and the unique number of natural disasters ($n_distinct using n_distinct()).
Starting dataset:
structure(list(year = c(1998, 1998, 1998, 1998, 1998), country = c("US",
"US", "US", "US", "US"), state = c("Texas", "Texas", "California",
"New York", "New York"), deaths = c(12, 5, 9, 10, 18), injured = c(3,
1, 3, 5, 9), disastertype = c("Hurricane", "Hurricane", "Wild fire",
"Flood", "Epidemic")), class = "data.frame", row.names = c(NA,
-5L))
Result dataset:
structure(list(year = c(1998, 1998, 1998), state = c("California",
"New York", "Texas"), u_disastertype = c("Wild fire", "Flood, Epidemic",
"Hurricane"), disastertype = c("Wild fire", "Flood, Epidemic",
"Hurricane, Hurricane"), deaths = c(9, 28, 17), injured = c(3,
14, 4), n_distinct = c(1L, 2L, 1L), n_disasters = c(1L, 2L, 2L
)), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), row.names = c(NA,
-3L), groups = structure(list(year = 1998, .rows = structure(list(
1:3), ptype = integer(0), class = c("vctrs_list_of", "vctrs_vctr",
"list"))), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-1L), .drop = TRUE))
EDIT: Edited for clarification.
Try aggregate. This takes the output of 2 3 aggregates and puts them together.
list2 <- function(x){ c(unique(x),length(table(x))) }
lt <- list(year=dat$year, county=dat$country, state=dat$state )
data.frame( aggregate( dat[,c(4,5)], lt, sum ),
setNames( aggregate( dat$disastertype, lt, list2 )[,4, drop=F], colnames(dat)[6] ),
setNames( aggregate( dat$disastertype, lt, length )[,4, drop=F], "n_disasters") )
year county state deaths injured disastertype n_disasters
1 1998 US California 9 3 Wild fire, 1 1
2 1998 US New York 28 14 Flood, Epidemic, 2 2
3 1998 US Texas 17 4 Hurricane, 1 2
Not sure if you want to keep the n_... columns or not though...
EDIT: added "n_disasters"
EDIT2: added suggestion to include "distinct disasters"
The solution using dplyr with group_by and summarize. The key part is to run u_disastertype = toString(unique(disastertype)), before disastertype = paste(disastertype, collapse = ', '),
naturaldisaster2 <- naturaldisaster %>%
group_by(year, state) %>%
summarise(
u_disastertype = toString(unique(disastertype)),
disastertype = paste(disastertype, collapse = ', '),
deaths=sum(deaths),
injured=sum(injured)
)
The answer is based on this Stackoverflow answer to a similar question, where only one operation was run on the column whereas I am running two operations on the same column: https://stackoverflow.com/a/46367425/11045110

R: new column based whether categorical levels of another column are the same or different from each other

I am having a problem creating a new column in a data where the column content is defined by levels in a factor in a different column are the same or different, which is dependent on another 2 columns.
Basically, I have a bunch of cows with different ID's that can have different parities. The quarter is the udder quarter affected by the disease and I would like to create a new column with a result that is based on whether quarters are the same or different or occurring once. Any help would be appreciated. Code for abbreviated data frame below/ The new column is the one I would like to achieve.
AnimalID <- c(10,10,10,10,12,12,12,12,14)
Parity <- c(8,8,9,9,4,4,4,4,2)
Udder_quarter <- c("LH","LH","RH","RH","LH","RH","LF","RF","RF")
new_column <- c("same quarter","same quarter","different quarter","different quarter","different quarter","different quarter","different quarter","different quarter","one quarter")
quarters<- data.frame(AnimalID,Parity,Udder_quarter,new_column)
structure(list(HerdAnimalID = c(100165, 100165, 100327, 100327,
100450, 100450), Parity = c(6, 6, 5, 5, 3, 3), no_parities = c(1,
1, 1, 1, 1, 1), case = c("1pathogen_lact", "1pathogen_lact",
"1pathogen_lact", "1pathogen_lact", "1pathogen_lact", "1pathogen_lact"
), FARM = c(1, 1, 1, 1, 1, 1), `CASE NO` = c("101", "101", "638",
"638", "593", "593"), MASTDATE = structure(c(1085529600, 1087689600,
1097884800, 1101254400, 1106092800, 1106784000), class = c("POSIXct",
"POSIXt"), tzone = "UTC"), QRT = c("LF", "LF", "RH", "LF", "LH",
"LH"), MastitisDiagnosis = c("Corynebacterium spp", "Corynebacterium spp",
"S. uberis", "S. uberis", "Bacillus spp", "Bacillus spp"), PrevCalvDate =
structure(c(1075334400,
1075334400, 1096156800, 1096156800, 1091145600, 1091145600), class =
c("POSIXct",
"POSIXt"), tzone = "UTC")), .Names = c("HerdAnimalID", "Parity",
"no_parities", "case", "FARM", "CASE NO", "MASTDATE", "QRT",
"MastitisDiagnosis", "PrevCalvDate"), row.names = c(NA, -6L), class =
c("tbl_df",
"tbl", "data.frame"))
library(dplyr)
quarters %>%
group_by(AnimalID) %>%
mutate(new_column = ifelse(n()==1, 'one quarter', NA)) %>%
group_by(Parity, add=T) %>%
mutate(new_column=ifelse(length(unique(Udder_quarter))==1 & is.na(new_column),
"same quarter",
ifelse(length(unique(Udder_quarter))>1,
"different quarter",
new_column))) %>%
data.frame()
Output is:
AnimalID Parity Udder_quarter new_column
1 10 8 LH same quarter
2 10 8 LH same quarter
3 10 9 RH same quarter
4 10 9 RH same quarter
5 12 4 LH different quarter
6 12 4 RH different quarter
7 12 4 LF different quarter
8 12 4 RF different quarter
9 14 2 RF one quarter
Sample data:
quarters <- structure(list(AnimalID = c(10, 10, 10, 10, 12, 12, 12, 12, 14
), Parity = c(8, 8, 9, 9, 4, 4, 4, 4, 2), Udder_quarter = structure(c(2L,
2L, 4L, 4L, 2L, 4L, 1L, 3L, 3L), .Label = c("LF", "LH", "RF",
"RH"), class = "factor")), .Names = c("AnimalID", "Parity", "Udder_quarter"
), row.names = c(NA, -9L), class = "data.frame")
I would use ave to do that:
f <- function(x) {
if (length(x)==1) return("one")
else if (all(x == x[1])) return("same")
else return("different")
}
ave(Udder_quarter, interaction(AnimalID, Parity), FUN=f)
# [1] "same" "same" "same" "same" "different"
# [6] "different" "different" "different" "one"

Resources