I have a long-format balanced data frame (df1) that has 7 columns:
# dput() of a 15-row sample of df1: long format, one row per
# Product_ID x Control_Date (1961-1965); Var1/Var2 start as all-NA
# and are to be filled from df2.
df1 <- structure(list(Product_ID = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3,
3, 3, 3, 3), Product_Category = structure(c(1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L), .Label = c("A", "B"), class = "factor"),
Manufacture_Date = c(1950, 1950, 1950, 1950, 1950, 1960,
1960, 1960, 1960, 1960, 1940, 1940, 1940, 1940, 1940), Control_Date = c(1961L,
1962L, 1963L, 1964L, 1965L, 1961L, 1962L, 1963L, 1964L, 1965L,
1961L, 1962L, 1963L, 1964L, 1965L), Country_Code = structure(c(1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L), .Label = c("ABC",
"DEF", "GHI"), class = "factor"), Var1 = c(NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Var2 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA,
15L), class = "data.frame")
Each Product_ID in this data set is linked with a unique Product_Category and Country_Code and Manufacture_Date, and is followed over time (Control_Date). Product_Category has two possible values (A or B); Country_Code and Manufacture_Date have 190 and 90 unique values, respectively. There are 400,000 unique Product_ID's, that are followed over a period of 50 years (Control_Date from 1961 to 2010). This means that df1 has 20,000,000 rows. The last two columns of this data frame are NA at the beginning and have to be filled using the data available in another data frame (df2):
# dput() of a 6-row sample of df2: wide format, one row per product,
# with one measurement column per year (Year_1961..Year_1965 here).
df2 <- structure(list(Product_ID = 1:6, Product_Category = structure(c(1L,
2L, 1L, 1L, 1L, 2L), .Label = c("A", "B"), class = "factor"),
Manufacture_Date = c(1950, 1960, 1940, 1950, 1940, 2000),
Country_Code = structure(c(1L, 2L, 3L, 1L, 2L, 3L), .Label = c("ABC",
"DEF", "GHI"), class = "factor"), Year_1961 = c(5, NA, 10,
NA, 6, NA), Year_1962 = c(NA, NA, 4, 5, 3, NA), Year_1963 = c(8,
6, NA, 5, 6, NA), Year_1964 = c(NA, NA, 9, NA, 10, NA), Year_1965 = c(6,
NA, 7, 4, NA, NA)), row.names = c(NA, 6L), class = "data.frame")
This second data frame contains another type of information on the exact same 400,000 products, in wide-format. Each row represents a unique product (Product_ID) accompanied by its Product_Category, Manufacture_Date and Country_Code. There are 50 other columns (for each year from 1961 to 2010) that contain a measured value (or NA) for each product in each of those years.
Now what I would like to do is to fill in the Var1 & Var2 columns in the first data frame, by doing some calculation on the data available in the second data frame. More precisely, for each row in the first data frame (i.e. a product at Control_Date "t"), the last two columns are defined as follows:
Var1: total number of products in df2 with the same Product_Category, Manufacture_Date and Country_Code that have non-NA value in Year_t;
Var2: total number of products in df2 with different Product_Category but the same Manufacture_Date and Country_Code that have non-NA value in Year_t.
My initial solution with nested for-loops is as follows:
# Reference (slow) nested-loop implementation: for each product, look up its
# category/date/country triplet, find same-category and opposite-category
# products in df2, then count non-NA yearly measurements per Control_Date.
for (i in unique(df1$Product_ID)) {
  # All df1 rows for this product share one triplet, so compute the row
  # index once instead of subsetting df1 three separate times.
  rows_i <- which(df1$Product_ID == i)
  Category <- unique(df1[rows_i, "Product_Category"])
  Opposite_Category <- ifelse(Category == "A", "B", "A")
  Manufacture <- unique(df1[rows_i, "Manufacture_Date"])
  Country <- unique(df1[rows_i, "Country_Code"])
  # Products sharing the full triplet (same category)...
  ID_Similar_Product <- df2[which(df2$Product_Category == Category &
                                    df2$Manufacture_Date == Manufacture &
                                    df2$Country_Code == Country), "Product_ID"]
  # ...and those sharing only date/country (opposite category).
  ID_Quasi_Similar_Product <- df2[which(df2$Product_Category == Opposite_Category &
                                          df2$Manufacture_Date == Manufacture &
                                          df2$Country_Code == Country), "Product_ID"]
  # Hoist the df2 row subsets out of the year loop: they do not depend on j.
  similar_rows <- df2[df2$Product_ID %in% ID_Similar_Product, , drop = FALSE]
  quasi_rows <- df2[df2$Product_ID %in% ID_Quasi_Similar_Product, , drop = FALSE]
  for (j in unique(df1$Control_Date)) {
    year_col <- paste0("Year_", j)
    target <- rows_i[df1$Control_Date[rows_i] == j]
    # sum(!is.na(x)) is the idiomatic form of length(which(!is.na(x))).
    df1[target, "Var1"] <- sum(!is.na(similar_rows[[year_col]]))
    df1[target, "Var2"] <- sum(!is.na(quasi_rows[[year_col]]))
  }
}
The problem with this approach is that it takes a lot of time to be run. So I would like to know if anybody could suggest a vectorized version that would do the job in less time.
See if this does what you want. I'm using the data.table package since you have a rather large (20M) dataset.
library(data.table)
setDT(df1)
setDT(df2)
# Set keys on the "triplet" to speed up everything
setkey(df1, Product_Category, Manufacture_Date, Country_Code)
setkey(df2, Product_Category, Manufacture_Date, Country_Code)
# Omit the Var1 and Var2 from df1
df1[, c("Var1", "Var2") := NULL]
# Reshape df2 to long form
df2.long <- melt(df2, measure=patterns("^Year_"))
# Split "variable" at the "_" to extract 4-digit year into "Control_Date" and delete leftovers.
# (Control_Date ends up as character, e.g. "1961", while df1$Control_Date is integer.)
df2.long[, c("variable","Control_Date") := tstrsplit(variable, "_", fixed=TRUE)][
, variable := NULL]
# Group by triplet, Var1=count non-NA in value, join with...
# (Group by doublet, N=count non-NA), update Var2=N-Var1.
# NOTE(review): grouping only by the triplet sums non-NA counts over ALL
# years (e.g. Var1=6 for A/1950/ABC = 3 non-NA for product 1 + 3 for
# product 4 across 1961-1965); the question asks for counts per
# Control_Date, which would need Control_Date in `by` and in the final
# join (after aligning its type with df1$Control_Date) -- confirm intent.
df2_N <- df2.long[, .(Var1 = sum(!is.na(value))),
by=.(Product_Category, Manufacture_Date, Country_Code)][
df2.long[, .(N = sum(!is.na(value))),
by=.(Manufacture_Date, Country_Code)],
Var2 := N - Var1, on=c("Manufacture_Date", "Country_Code")]
# Update join: df1 with df2_N; triplets absent from df2_N stay NA (e.g. B/1960/DEF).
df1[df2_N, c("Var1","Var2") := .(i.Var1, i.Var2),
on = .(Product_Category, Manufacture_Date, Country_Code)]
df1
Product_ID Product_Category Manufacture_Date Control_Date Country_Code Var1 Var2
1: 3 A 1940 1961 GHI 4 0
2: 3 A 1940 1962 GHI 4 0
3: 3 A 1940 1963 GHI 4 0
4: 3 A 1940 1964 GHI 4 0
5: 3 A 1940 1965 GHI 4 0
6: 1 A 1950 1961 ABC 6 0
7: 1 A 1950 1962 ABC 6 0
8: 1 A 1950 1963 ABC 6 0
9: 1 A 1950 1964 ABC 6 0
10: 1 A 1950 1965 ABC 6 0
11: 2 B 1960 1961 DEF NA NA
12: 2 B 1960 1962 DEF NA NA
13: 2 B 1960 1963 DEF NA NA
14: 2 B 1960 1964 DEF NA NA
15: 2 B 1960 1965 DEF NA NA
df2
Product_ID Product_Category Manufacture_Date Country_Code Year_1961 Year_1962 Year_1963 Year_1964 Year_1965
1: 5 A 1940 DEF 6 3 6 10 NA
2: 3 A 1940 GHI 10 4 NA 9 7
3: 1 A 1950 ABC 5 NA 8 NA 6
4: 4 A 1950 ABC NA 5 5 NA 4
5: 2 B 1940 DEF NA NA 6 NA NA
6: 6 B 2000 GHI NA NA NA NA NA
Related
I have a dataframe that looks like this:
# A tibble: 9 x 5
# Groups: group [3]
group year value1 value2 value3
<int> <dbl> <int> <int> <int>
1 1 2000 NA 3 4
2 1 2001 8 3 4
3 1 2002 4 3 NA
4 2 2000 NA NA 1
5 2 2001 9 NA 1
6 2 2002 1 NA NA
7 3 2000 NA 5 NA
8 3 2001 9 5 NA
9 3 2002 NA 5 NA
I need a script that returns the years of the first and last non-na value for each column, irrespective of group. Ideally, the output would look like this. Beware the actual dataset is much larger.
start end
value 1 2001 2002
value 2 2000 2002
value 3 2000 2001
We can reshape into 'long' format and then do a group by the 'name' and summarise to get the min and max 'year'
library(dplyr)
library(tidyr)
library(tibble)
# Drop the grouping column, stack value1:value3 into (name, value) pairs
# while discarding NAs, then take min/max year per name; the names become
# rownames to match the requested output shape.
df1 %>%
select(-group) %>%
pivot_longer(cols = starts_with('value'), values_drop_na = TRUE) %>%
group_by(name) %>%
summarise(start = min(year), end = max(year)) %>%
column_to_rownames('name')
# start end
#value1 2001 2002
#value2 2000 2002
#value3 2000 2001
Or with melt from data.table
library(data.table)
# melt with na.rm = TRUE drops the NA measurements up front, so min/max of
# 'year' per melted column gives the first/last non-NA year directly.
melt(setDT(df1), id.var = c('year', 'group'), na.rm = TRUE)[,
.(start = min(year), end = max(year)), .(variable)]
Or we could also make use of summarise_at
# For each value column, take range() of the years where it is non-NA
# (a length-2 vector wrapped in a list), then unnest and reshape to long.
# NOTE(review): summarise_at() and the ~ lambda style are superseded in
# current dplyr in favour of across(); kept as originally answered.
df1 %>%
summarise_at(vars(starts_with('value')), ~
list(range(year[!is.na(.)]))) %>%
unnest(everything()) %>%
pivot_longer(everything())
data
# dput() of the example data: 3 groups x years 2000-2002, value columns
# containing NAs whose first/last non-NA years are being sought.
df1 <- structure(list(group = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L),
year = c(2000L, 2001L, 2002L, 2000L, 2001L, 2002L, 2000L,
2001L, 2002L), value1 = c(NA, 8L, 4L, NA, 9L, 1L, NA, 9L,
NA), value2 = c(3L, 3L, 3L, NA, NA, NA, 5L, 5L, 5L), value3 = c(4L,
4L, NA, 1L, 1L, NA, NA, NA, NA)), class = "data.frame",
row.names = c("1",
"2", "3", "4", "5", "6", "7", "8", "9"))
A base solution where we find the first and last non-NA year for the mentioned columns value1 to value3.
# Base solution: for each value column, the years of its first and last
# non-NA entry, one row per column with the column name as rowname.
# Fix: sapply() over functions returning 1-row data.frames followed by t()
# produces a matrix of lists, so data.frame(t(...)) yields list-columns;
# lapply() + do.call(rbind, ...) builds a proper data.frame with atomic
# integer columns instead.
do.call(rbind, lapply(paste0("value", 1:3), function(i) {
  val_i <- df1[, i]
  idx <- which(!is.na(val_i))
  data.frame(start = df1$year[min(idx)],
             end = df1$year[max(idx)],
             row.names = i)
}))
Another data.table option. Not sure if using names(.SD) is recommended but it should scale pretty well
library(data.table)
# One .SD pass: names(.SD) supplies the column labels while the two
# lapply()s compute, per value column, the earliest and latest year with a
# non-NA entry.
# NOTE(review): lapply() makes 'start'/'end' list-columns; wrap the lapply
# results in unlist() if atomic columns are needed downstream.
setDT(df1)[, .(val = names(.SD),
start = lapply(.SD, function(x) min(year[!is.na(x)])),
end = lapply(.SD, function(x) max(year[!is.na(x)]))), .SDcols = startsWith(names(df1), "value")]
val start end
1: value1 2001 2002
2: value2 2000 2002
3: value3 2000 2001
I am looking for a way to change my code so that it sorts the data into quintiles instead of the top 5 and bottom 5. My current code looks like this:
# Per-Date ranking: ranks 1-5 are flagged "5w" (worst five) and the five
# highest ranks "5b" (best five); all other rows get NA via the case_when
# fall-through.
CombData <- CombData %>%
group_by(Date) %>%
mutate(
R=min_rank(Value),
E_P = case_when(
R < 6 ~ "5w",
R > max(R, na.rm =TRUE) - 5 ~ "5b",
TRUE ~ NA_character_)
) %>%
ungroup() %>%
arrange(Date, E_P)
My dataset is quite large therefore I will just provide sample data. The data I use is more complex and the code should, therefore, allow for varying lengths of the column Date and also for multiple values that are missing (NAs):
# Sample data: 30 rows over two month-end dates, with repeated values and
# NAs in Value. (The original snippet was missing the closing parenthesis
# of the data.frame() call.)
df <- data.frame(Date = c(rep("2010-01-31", 16), rep("2010-02-28", 14)),
                 Value = c(rep(c(1, 2, 3, 4, 5, 6, 7, 8, 9, NA, NA, NA, NA, NA, 15), 2)))
Afterward, I would also like to test the minimum size of quintiles i.e. how many data points are minimum in each quintile in the entire dataset.
The expected output would look like this:
# dput() of the expected output: Date (as Date class), Value, per-Date rank
# R, and the S_P quintile flag ("Worst"/"Best"/NA).
structure(list(Date = structure(c(14640, 14640, 14640, 14640,
14640, 14640, 14640, 14640, 14640, 14640, 14640, 14640, 14640,
14640, 14640, 14640, 14668, 14668, 14668, 14668, 14668, 14668,
14668, 14668, 14668, 14668, 14668, 14668, 14668, 14668), class = "Date"),
Value = c(1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 15, NA, NA, NA, NA,
NA, 2, 3, 4, 5, 6, 7, 8, 9, 15, NA, NA, NA, NA, NA), R = c(1L,
1L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, NA, NA, NA, NA,
NA, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, NA, NA, NA, NA, NA
), S_P = c("Worst", "Worst", "Worst", NA, NA, NA, NA, "Best",
"Best", "Best", NA, NA, NA, NA, NA, NA, "Worst", "Worst", NA, NA,
NA, NA, NA, "Best", "Best", NA, NA, NA, NA, NA)), row.names = c(NA,
-30L), class = c("tbl_df", "tbl", "data.frame"))
Probably, you could use something like this with quantile :
library(dplyr)
# Flag values at or below the 20th percentile within each Date as 'Worst'
# and at or above the 80th as 'Best'; na.rm = TRUE lets quantile() tolerate
# the NA Values, and unmatched rows fall through to NA.
out <- CombData %>%
group_by(Date) %>%
mutate(S_P = case_when(Value <= quantile(Value, 0.2, na.rm = TRUE) ~ 'Worst',
Value >= quantile(Value, 0.8, na.rm = TRUE) ~ 'Best'))
You could change the value of quantile according to your preference.
To get minimum number of "Best" and "Worst" we can do :
# Count Best/Worst rows per Date, drop the NA flag group, then keep the
# smallest count per label (top_n with a negative n selects the bottom
# rows ranked by n) to get the minimum group size across all dates.
out %>%
count(Date, S_P) %>%
na.omit() %>%
ungroup() %>%
select(-Date) %>%
group_by(S_P) %>%
top_n(-1, n)
# S_P n
# <chr> <int>
#1 Best 2
#2 Worst 2
If I understand you correctly, you want to rank your column 'Value' and mark those with rank below the 20% quantile with "worst" and those above 80% with "best". After that you want a table.
You could use ave for both the ranking and the quantile identification. The quantile function yields three groups that you can identify with findInterval, code as a factor variable and label at will. I'm not sure, though, which ranks should be included in the quantiles, so I make the E_P coding in two separate columns for comparison purposes.
# Rank Value within each Date (NAs kept as NA), then code each rank into
# three bins (below 20%, middle, above 80%) with findInterval, and label
# the bins via a factor.
# NOTE(review): quantile(R, ...) inside the ave FUN uses the whole R
# column (all dates), not the per-group x -- the global check further
# down suggests this is intentional, but per-group cutoffs would use
# quantile(x, ...) instead; confirm which is wanted.
dat2 <- within(dat, {
R <- ave(Value, Date, FUN=function(x) rank(x, na.last="keep"))
E_P <- ave(R, Date, FUN=function(x) {
findInterval(x, quantile(R, c(.2, .8), na.rm=TRUE))
})
E_P.fac <- factor(E_P, labels=c("worst", NA, "best"))
})
dat2 <- dat2[order(dat2$Date, dat2$E_P), ] ## order by date and E_P
Yields:
dat2
# Date Value E_P.fac E_P R
# 1 2010-01-31 1 worst 0 1.5
# 16 2010-01-31 1 worst 0 1.5
# 2 2010-01-31 2 <NA> 1 3.0
# 3 2010-01-31 3 <NA> 1 4.0
# 4 2010-01-31 4 <NA> 1 5.0
# 5 2010-01-31 5 <NA> 1 6.0
# 6 2010-01-31 6 <NA> 1 7.0
# 7 2010-01-31 7 <NA> 1 8.0
# 8 2010-01-31 8 best 2 9.0
# 9 2010-01-31 9 best 2 10.0
# 15 2010-01-31 15 best 2 11.0
# 10 2010-01-31 NA <NA> NA NA
# 11 2010-01-31 NA <NA> NA NA
# 12 2010-01-31 NA <NA> NA NA
# 13 2010-01-31 NA <NA> NA NA
# 14 2010-01-31 NA <NA> NA NA
# 17 2010-02-28 2 worst 0 1.0
# 18 2010-02-28 3 worst 0 2.0
# 19 2010-02-28 4 <NA> 1 3.0
# 20 2010-02-28 5 <NA> 1 4.0
# 21 2010-02-28 6 <NA> 1 5.0
# 22 2010-02-28 7 <NA> 1 6.0
# 23 2010-02-28 8 <NA> 1 7.0
# 24 2010-02-28 9 <NA> 1 8.0
# 30 2010-02-28 15 best 2 9.0
# 25 2010-02-28 NA <NA> NA NA
# 26 2010-02-28 NA <NA> NA NA
# 27 2010-02-28 NA <NA> NA NA
# 28 2010-02-28 NA <NA> NA NA
# 29 2010-02-28 NA <NA> NA NA
When I check the quintiles of the Rank column, it appears to be right.
quantile(dat2$R, c(.2, .8), na.rm=TRUE)
# 20% 80%
# 2.8 8.2
After that you could just make a table to get the numbers of each category.
with(dat2, table(Date, E_P.fac))
# E_P.fac
# Date worst <NA> best
# 2010-01-31 2 6 3
# 2010-02-28 2 6 1
Data
# dput() of the sample data: two factor dates, 15 Values each including
# five NAs, mirroring the df constructed above.
dat <- structure(list(Date = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("2010-01-31", "2010-02-28"
), class = "factor"), Value = c(1, 2, 3, 4, 5, 6, 7, 8, 9, NA,
NA, NA, NA, NA, 15, 1, 2, 3, 4, 5, 6, 7, 8, 9, NA, NA, NA, NA,
NA, 15)), row.names = c(NA, -30L), class = "data.frame")
how can i check unchanged row dynamically from data frame.(by grouping ID)
my data frame.
ID NAME GENDER AGE
1 muthu male 20
1 MUTHU MALE 20
2 NA male 28
3 jake male 30
3 jake male 31
4 jhon male 21
4 \n\rjhon\n\r \n\male\n\r 21
5 NA NA NA
5 NA NA NA
expected result will be.
Unchanged ID
1
2
4
5
dput data ,
# dput() of the example data: per-ID rows whose NAME/GENDER differ only in
# case and embedded "\n"/"\r" escape sequences (IDs 1, 4) or are all-NA
# (ID 5); ID 3 genuinely changes AGE.
structure(list(ID = c(1, 1, 2, 3, 3, 4, 4, 5, 5), NAME = structure(c(4L,
5L, NA, 2L, 2L, 3L, 1L, NA, NA), .Label = c("\\n\\rjhon\\n\\r",
"jake", "jhon", "muthu", "MUTHU"), class = "factor"), GENDER = structure(c(2L,
3L, 2L, 2L, 2L, 2L, 1L, NA, NA), .Label = c("\\n\\male\\n\\r",
"male", "MALE"), class = "factor"), AGE = c(20, 20, 28, 30, 31,
21, 21, NA, NA)), .Names = c("ID", "NAME", "GENDER", "AGE"), row.names = c(NA,
-9L), class = "data.frame")
Here is an option using data.table. Convert the 'data.frame' to 'data.table' (setDT(df1)), grouped by 'ID', remove any \n or \r after converting it to 'lower', find the number of unique elements (uniqueN), check whether it is equal to 1, then Reduce it back to a single logical column using &, and subset the 'ID' based on the logical column ('V1')
library(data.table)
# Grouped by ID: lower-case each column, strip the literal "\n"/"\r"
# escape sequences with gsub, and test uniqueN() == 1 per column;
# Reduce(`&`) requires ALL columns to be constant within the ID (an
# all-NA column counts as one distinct value, so ID 5 passes).
setDT(df1)[, Reduce(`&`, lapply(.SD, function(x)
uniqueN(gsub("([\\]+)(n|r)|[\\]+", "", tolower(x)))==1)) , ID][(V1), .(ID)]
# ID
#1: 1
#2: 2
#3: 4
#4: 5
Here is a base R idea. We clean the names of \\n and \\r and convert them to lower case. After that, the unchanged rows are the ones with duplicates. The second condition is for the group to have only one entry, which we handle with ave.
# Base idea: after cleaning the escape sequences and lower-casing, an
# unchanged ID is one whose later row duplicates an earlier one; the
# second term (ave(..., FUN = length) == 1) additionally keeps IDs with a
# single row (e.g. ID 2).
# NOTE(review): the concatenated result (1 4 5 2) is not sorted by ID.
c(df$ID[duplicated(sapply(df, function(i) tolower(gsub('[\\]n|[\\r]', '', i))))],
df$ID[with(df, ave(ID, ID, FUN = length)) == 1])
#[1] 1 4 5 2
I'm sitting in front of a dataframe that looks like this:
country year Indicator a b c
48996 US 2003 var1 NA NA NA
16953 FR 1988 var2 NA 10664.920 NA
22973 FR 1943 var3 NA 5774.334 NA
8760 CN 1995 var4 8804.565 NA 12750.31
47795 US 2012 var5 NA NA NA
30033 GB 1969 var6 NA 29631.362 NA
25796 FR 1921 var7 NA 14004.520 NA
39534 NL 1941 var8 NA NA NA
42255 NZ 1969 var8 NA NA NA
7249 CN 1995 var9 50635.862 NA 75260.56
What I want to do is basically a long to wide transformation with Indicator as key variable. I would usually use spread() from the tidyr package. However, spread() unfortunately does not accept multiple value columns (in this case a, b and c) and it does not fully do what I want to achieve:
Make the entries of Indicator the new columns
Keep the Country / Year combinations as rows
Create a UNIQUE row for every old value from a, b and c
Create a Dummy Variable for every "old" value column name (i.e. a,
b, c)
So in the end, the Chinese observations of my example should become
country year var1 [...] var4 [...] var9 dummy.a dummy.b dummy.c
CN 1995 NA 8804.565 50635.862 1 0 0
CN 1995 NA 12750.31 75260.56 0 0 1
As my original dataframe is 58.162x119, I would appreciate something that does not include a lot of manual work :-)
I hope I was clear in what I want to achieve. Thanks for your help!
The above mentioned dataframe can be reproduced using the following code:
# dput() of the long-format data: one row per (country, year, Indicator)
# with the measured value in exactly one of columns a, b, c.
structure(list(country = c("US", "FR", "FR", "CN", "US", "GB",
"FR", "NL", "NZ", "CN"), year = c(2003L, 1988L, 1943L, 1995L,
2012L, 1969L, 1921L, 1941L, 1969L, 1995L), Indicator = structure(c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 8L, 9L), .Label = c("var1", "var2",
"var3", "var4", "var5", "var6", "var7", "var8", "var9", "var10",
"var11", "var12", "var13", "var14", "var15", "var16", "var17",
"var18"), class = "factor"), a = c(NA, NA, NA, 8804.56480733,
NA, NA, NA, NA, NA, 50635.8621327), b = c(NA, 10664.9199219,
5774.33398438, NA, NA, 29631.3618614, 14004.5195312, NA, NA,
NA), c = c(NA, NA, NA, 12750.3056855, NA, NA, NA, NA, NA, 75260.555946
)), .Names = c("country", "year", "Indicator", "a", "b", "c"), row.names = c(48996L,
16953L, 22973L, 8760L, 47795L, 30033L, 25796L, 39534L, 42255L,
7249L), class = "data.frame")
Here's my solution:
# Use library() instead of require(): require() returns FALSE rather than
# erroring when a package is missing, which silently defers the failure.
# filter()/mutate() below come from dplyr, which must also be loaded.
library(tidyr)
library(dplyr)
mydf <- structure(list(country = c("US", "FR", "FR", "CN", "US", "GB",
"FR", "NL", "NZ", "CN"), year = c(2003L, 1988L, 1943L, 1995L,
2012L, 1969L, 1921L, 1941L, 1969L, 1995L), Indicator = structure(c(1L,
2L, 3L, 4L, 5L, 6L, 7L, 8L, 8L, 9L), .Label = c("var1", "var2",
"var3", "var4", "var5", "var6", "var7", "var8", "var9", "var10",
"var11", "var12", "var13", "var14", "var15", "var16", "var17",
"var18"), class = "factor"), a = c(NA, NA, NA, 8804.56480733,
NA, NA, NA, NA, NA, 50635.8621327), b = c(NA, 10664.9199219,
5774.33398438, NA, NA, 29631.3618614, 14004.5195312, NA, NA,
NA), c = c(NA, NA, NA, 12750.3056855, NA, NA, NA, NA, NA, 75260.555946
)), .Names = c("country", "year", "Indicator", "a", "b", "c"), row.names = c(48996L,
16953L, 22973L, 8760L, 47795L, 30033L, 25796L, 39534L, 42255L,
7249L), class = "data.frame")
# Stack a/b/c into (newIndicator, values), drop NAs so each old cell gives
# one row, spread the Indicator levels into var* columns, then spread a
# constant 1 over the old column names to build the 0/1 dummy columns
# (fill = 0 elsewhere).
mydf %>%
  gather(key = newIndicator, value = values, a, b, c) %>%
  filter(!is.na(values)) %>%
  spread(key = Indicator, values) %>%
  mutate(indicatorValues = 1) %>%
  spread(newIndicator, indicatorValues, fill = 0)
The output
# country year var2 var3 var4 var6 var7 var9 a b c
# 1 CN 1995 NA NA 8804.565 NA NA 50635.86 1 0 0
# 2 CN 1995 NA NA 12750.306 NA NA 75260.56 0 0 1
# 3 FR 1921 NA NA NA NA 14004.52 NA 0 1 0
# 4 FR 1943 NA 5774.334 NA NA NA NA 0 1 0
# 5 FR 1988 10664.92 NA NA NA NA NA 0 1 0
# 6 GB 1969 NA NA NA 29631.36 NA NA 0 1 0
dt would be your original data. dt2 is the final output.
# Long form over a:c, spread Indicator into the var* columns, keep only
# rows where at least one var column is non-NA, then spread a 0/1 flag
# over the old column names and rename them dummy.*.
# NOTE(review): paste0("var", 1:9) hard-codes nine indicator columns;
# this needs adjusting for the full 18-level factor / wider real data.
dt2 <- dt %>%
gather(Parameter, Value, a:c) %>%
spread(Indicator, Value) %>%
mutate(Data = ifelse(rowSums(is.na(.[, paste0("var", 1:9)])) != 9, 1, 0)) %>%
filter(Data != 0) %>%
spread(Parameter, Data, fill = 0) %>%
rename(dummy.a = a, dummy.b = b, dummy.c = c)
I am trying to get the maximum value in the column event until an agreement (dummy) is reached; Events are nested in agreements, agreements are nested in dyad which run over year. Note that years are not always continuous, meaning there are breaks between the years (1986, 1987,2001,2002).
I am able to get the maximum values within the dyad with a ddply and max(event); but I struggle how to ‘assign’ the different events to the right agreement (until/after). I am basically lacking an 'identifier' which assigns each observation to an agreement.
The results which I am looking for are already in the column "result".
dyad year event agreement agreement.name result
1 1985 9
1 1986 4 1 agreement1 9
1 1987
1 2001 3
1 2002 1 agreement2 3
2 1999 1
2 2000 5
2 2001 1 agreement3 5
2 2002 2
2 2003
2 2004 1 agreement 4 2
Here is the data in a format which is hopefully easier to use:
# dput() of the example data: events nested in agreements nested in dyads;
# 'result' already contains the desired per-agreement max of 'event'.
df<-structure(list(dyad = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L), year = c(1985L, 1986L, 1987L, 2001L, 2002L, 1999L, 2000L,
2001L, 2002L, 2003L, 2004L), event = c(9L, 4L, NA, 3L, NA, 1L,
5L, NA, 2L, NA, NA), agreement = c(NA, 1L, NA, NA, 1L, NA, NA,
1L, NA, NA, 1L), agreement.name = c("", "agreement1", "", "",
"agreement2", "", "", "agreement3", "", "", "agreement 4"), result = c(NA,
9L, NA, NA, 3L, NA, NA, 5L, NA, NA, 2L)), .Names = c("dyad",
"year", "event", "agreement", "agreement.name", "result"), class = "data.frame", row.names = c(NA,
-11L))
Here is an option using data.table. Convert the 'data.frame' to 'data.table' (setDT(df)), create another grouping variable ('ind') based on the non-empty elements in 'agreement.name'. Grouped by both 'dyad' and 'ind' columns, we create a new column 'result' using ifelse to fill the rows that have 'agreement.name' is non-empty with the max of 'event'
library(data.table)
# 'ind' increments wherever agreement.name switches from non-empty back to
# empty, so each (dyad, ind) group runs up to and including an agreement
# row; within a group only the agreement row receives max(event), then the
# helper column is dropped.
setDT(df)[, ind:=cumsum(c(TRUE,diff(agreement.name=='')>0)),dyad][,
result:=ifelse(agreement.name!='', max(event, na.rm=TRUE), NA) ,
list(dyad, ind)][, ind:=NULL][]
# dyad year event agreement agreement.name result
# 1: 1 1985 9 NA NA
# 2: 1 1986 4 1 agreement1 9
# 3: 1 1987 NA NA NA
# 4: 1 2001 3 NA NA
# 5: 1 2002 NA 1 agreement2 3
# 6: 2 1999 1 NA NA
# 7: 2 2000 5 NA NA
# 8: 2 2001 NA 1 agreement3 5
# 9: 2 2002 2 NA NA
#10: 2 2003 NA NA NA
#11: 2 2004 NA 1 agreement 4 2
Or instead of ifelse, we can use numeric index
# Same grouping, built inline in `by`; indexing c(NA, max) with 1 or 2
# (via the logical + 1L) replaces the ifelse() call.
setDT(df)[, result:=c(NA, max(event, na.rm=TRUE))[(agreement.name!='')+1L] ,
list(ind= cumsum(c(TRUE,diff(agreement.name=='')>0)),dyad)][]
data
# dput() of the input data without the precomputed 'result' column.
df <- structure(list(dyad = c(1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L,
2L), year = c(1985L, 1986L, 1987L, 2001L, 2002L, 1999L, 2000L,
2001L, 2002L, 2003L, 2004L), event = c(9L, 4L, NA, 3L, NA, 1L,
5L, NA, 2L, NA, NA), agreement = c(NA, 1L, NA, NA, 1L, NA, NA,
1L, NA, NA, 1L), agreement.name = c("", "agreement1", "", "",
"agreement2", "", "", "agreement3", "", "", "agreement 4")),
.Names = c("dyad",
"year", "event", "agreement", "agreement.name"), row.names = c(NA,
-11L), class = "data.frame")