stratified sampling with group size below sample size in R - r

I have response data by market in the format:
head(df)
ID market q1 q2
470 France 1 3
625 Germany 0 2
155 Italy 1 6
648 Spain 0 5
862 France 1 7
699 Germany 0 8
460 Italy 1 6
333 Spain 1 5
776 Spain 1 4
and the following frequencies:
table(df$market)
France 140
Germany 300
Italy 50
Spain 75
I need to create a data frame with a sample of 100 responses per market (without replacement), keeping all responses for markets that have fewer than 100 of them.
so
table(df_new$market)
France 100
Germany 100
Italy 50
Spain 75
Thanks in advance!

The following looks valid:
# Reproducible toy data: 25 rows spread over four groups (c1).
set.seed(10)
DF <- data.frame(c1 = sample(LETTERS[1:4], 25, TRUE), c2 = runif(25))
freqs <- as.data.frame(table(DF$c1))
# Per-group sample size: the target (5), capped at the group's own size so
# that small groups are taken in full.
freqs$ss <- pmin(freqs$Freq, 5)
#> freqs
# Var1 Freq ss
#1 A 4 4
#2 B 11 5
#3 C 7 5
#4 D 3 3
# Draw `ss` rows without replacement from each group.
res <- mapply(function(x, y) {
  idx <- which(DF$c1 %in% x)
  # Index through sample.int(): sample(idx, y) would draw from 1:idx
  # whenever a group consists of exactly one row.
  DF[idx[sample.int(length(idx), y)], ]
}, x = freqs$Var1, y = freqs$ss, SIMPLIFY = FALSE)
do.call(rbind, res)
# c1 c2
#5 A 0.3558977
#17 A 0.2289039
#6 A 0.5355970
#13 A 0.9546536
#3 B 0.2395891
#25 B 0.8015470
#10 B 0.4226376
#15 B 0.5005032
#19 B 0.7289646
#11 C 0.7477465
#9 C 0.8998325
#12 C 0.8226526
#1 C 0.7066469
#4 C 0.7707715
#23 D 0.4861003
#20 D 0.2498805
#21 D 0.1611833

Related

Replacing a vector of numbers by specific characters in a data.frame

I have the following data:
# Example input: a single numeric column whose values are to be labelled.
df <- data.frame(a = c(5, 5, 8, 4, 2, 1, 9, 8, 7, 3, 6))
I want to add a new column that names the numbers based on the following:
# Lookup table from number (stored as a character name) to label.
# Numeric names must be quoted: `c(1 = "A")` is a syntax error.
names <- c("1" = "A", "2" = "B", "3" = "C", "4" = "D", "5" = "E",
           "6" = "F", "7" = "G", "8" = "H", "9" = "I", "10" = "J", "11" = "K")
The desired output is:
a b
1 5 E
2 5 E
3 8 H
4 4 D
5 2 B
6 1 A
7 9 I
8 8 H
9 7 G
10 3 C
11 6 F
If you are continuing from your previous question where you want to divide the data by range, you can use labels option in cut.
Using the previous example -
range <- c(0, seq(19, max(DF$AGE) + 10, 10))
labs <- paste(range[-length(range)] + 1, range[-1], sep = '-')
labs
#[1] "1-19" "20-29" "30-39" "40-49" "50-59" "60-69"
transform(DF, GROUP = cut(AGE, c(0, seq(19, max(AGE) + 10, 10)), labels = labs))
# NAME AGE GROUP
#1 Gait 33 30-39
#2 Roc 43 40-49
#3 Bo 37 30-39
#4 Hernd 45 40-49
#5 Bet 44 40-49
#6 Oln 35 30-39
#7 Gai 22 20-29
#8 Rock 30 30-39
#9 Mil 38 30-39
#10 Arli 23 20-29
#11 Re 45 40-49
#12 Fred 43 40-49
#13 Ro 67 60-69
#14 Rock 43 40-49
#15 Wheat 28 20-29
#16 Germa 47 40-49
#17 Rock 16 1-19
#18 Nort 29 20-29
#19 Arli 22 20-29
#20 Rockv 31 30-39

How to add a ranking column for this dataset?

My data is as follows:
df <- data.frame(
comp_name = c("A","B","C","D","E","F","G","H","J","K","L","M"),
country = c("US", "UK", "France", "Germany", "US", "UK", "France", "Germany", "US", "UK", "France", "Germany"),
profit = c(100,125,150,165,150,110,110,125,130,250,95,100)
)
df:
comp_name country profit
1 A US 100
2 B UK 125
3 C France 150
4 D Germany 165
5 E US 150
6 F UK 110
7 G France 110
8 H Germany 125
9 J US 130
10 K UK 250
11 L France 95
12 M Germany 100
I would like to add a rank column to this data frame which ranks companies by profit by country, like this:
comp_name country profit rank
1 A US 100 3
2 B UK 125 2
3 C France 150 1
4 D Germany 165 1
5 E US 150 1
6 F UK 110 3
7 G France 110 2
8 H Germany 125 2
9 J US 130 2
10 K UK 250 1
11 L France 95 3
12 M Germany 100 3
I'm relatively new to R and don't know where to start with this. Any help would be greatly appreciated. Thanks!
Does this work:
library(dplyr)
df %>% group_by(country) %>% mutate(rank = rank(desc(profit)))
# A tibble: 12 x 4
# Groups: country [4]
comp_name country profit rank
<chr> <chr> <dbl> <dbl>
1 A US 100 3
2 B UK 125 2
3 C France 150 1
4 D Germany 165 1
5 E US 150 1
6 F UK 110 3
7 G France 110 2
8 H Germany 125 2
9 J US 130 2
10 K UK 250 1
11 L France 95 3
12 M Germany 100 3
An option with data.table
library(data.table)
setDT(df)[, Rank := frank(-profit), country]
A base R option using rank + ave
transform(
df,
Rank = ave(-profit, country, FUN = rank)
)
gives
comp_name country profit Rank
1 A US 100 3
2 B UK 125 2
3 C France 150 1
4 D Germany 165 1
5 E US 150 1
6 F UK 110 3
7 G France 110 2
8 H Germany 125 2
9 J US 130 2
10 K UK 250 1
11 L France 95 3
12 M Germany 100 3
# Rank companies by profit (highest first) within each country.
# `.keep = TRUE` passes the grouping column to the callback; by default
# group_map() drops it, so the bound result would have no `country` column.
df %>%
  dplyr::group_by(country) %>%
  dplyr::group_map(function(x, y) {
    x %>% dplyr::mutate(rank = rank(-profit))
  }, .keep = TRUE) %>%
  dplyr::bind_rows()
Karthik S provided a cleaner answer.
Apparently, group_map here is redundant

Fill in values between start and end value in R

W (blue line below) in my data.frame represents where the water level in the river intersects the elevation profile.
In my data.frame, for each group in ID, I need to fill in values between the start and end value (W)
My data
> head(df, 23)
ID elevation code
1 1 150 <NA>
2 1 140 <NA>
3 1 130 W
4 1 120 <NA>
5 1 110 <NA>
6 1 120 <NA>
7 1 130 W
8 1 140 <NA>
9 1 150 <NA>
10 2 90 <NA>
11 2 80 <NA>
12 2 70 <NA>
13 2 66 W
14 2 60 <NA>
15 2 50 <NA>
16 2 66 W
17 2 70 <NA>
18 2 72 <NA>
19 2 68 W
20 2 65 <NA>
21 2 60 <NA>
22 2 68 W
23 2 70 <NA>
I want the final result to look like below
ID elevation code
1 1 150 <NA>
2 1 140 <NA>
3 1 130 W
4 1 120 W
5 1 110 W
6 1 120 W
7 1 130 W
8 1 140 <NA>
9 1 150 <NA>
10 2 90 <NA>
11 2 80 <NA>
12 2 70 <NA>
13 2 66 W
14 2 60 W
15 2 50 W
16 2 66 W
17 2 70 <NA>
18 2 72 <NA>
19 2 68 W
20 2 65 W
21 2 60 W
22 2 68 W
23 2 70 <NA>
I tried many things but my trials were not successful. Your help will be appreciated.
DATA
> dput(df)
structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), elevation = c(150L,
140L, 130L, 120L, 110L, 120L, 130L, 140L, 150L, 90L, 80L, 70L,
66L, 60L, 50L, 66L, 70L, 72L, 68L, 65L, 60L, 68L, 70L), code = c(NA,
NA, "W", NA, NA, NA, "W", NA, NA, NA, NA, NA, "W", NA, NA, "W",
NA, NA, "W", NA, NA, "W", NA)), class = "data.frame", row.names = c(NA,
-23L))
You could do:
df %>%
group_by(ID)%>%
mutate(code = coalesce(code, c(NA, "W")[cumsum(!is.na(code)) %% 2 + 1]))
ID elevation code
1 1 150 <NA>
2 1 140 <NA>
3 1 130 W
4 1 120 W
5 1 110 W
6 1 120 W
7 1 130 W
8 1 140 <NA>
9 1 150 <NA>
10 2 90 <NA>
11 2 80 <NA>
12 2 70 <NA>
13 2 66 W
14 2 60 W
15 2 50 W
16 2 66 W
17 2 70 <NA>
18 2 72 <NA>
19 2 68 W
20 2 65 W
21 2 60 W
22 2 68 W
23 2 70 <NA>
We can try replace + cumsum
df %>%
group_by(ID) %>%
mutate(code = replace(code, cumsum(!is.na(code)) %% 2 == 1, "W")) %>%
ungroup()
which gives
# A tibble: 23 x 3
ID elevation code
<int> <int> <chr>
1 1 150 NA
2 1 140 NA
3 1 130 W
4 1 120 W
5 1 110 W
6 1 120 W
7 1 130 W
8 1 140 NA
9 1 150 NA
10 2 90 NA
# ... with 13 more rows
You can create a helper function that creates a sequence between each start and end and assigns 'W' to it.
# Fill every position between consecutive start/end "W" markers with "W".
#
# code: vector in which "W" marks alternating start and end positions
#       (1st = start, 2nd = end, 3rd = start, ...); other entries are NA.
# Returns `code` with each start..end run set to "W". An unmatched trailing
# "W" (odd number of markers) is left as a lone marker instead of being
# paired with a recycled end position.
assign_w <- function(code) {
  inds <- which(code == "W")
  # Only complete (start, end) pairs are expanded.
  pair_id <- seq_len(length(inds) %/% 2L)
  starts <- inds[2L * pair_id - 1L]
  ends <- inds[2L * pair_id]
  code[unlist(Map(seq, starts, ends))] <- "W"
  code
}
and apply it for each ID :
library(dplyr)
df %>%
group_by(ID) %>%
mutate(result = assign_w(code)) %>%
ungroup
# ID elevation code result
#1 1 150 <NA> <NA>
#2 1 140 <NA> <NA>
#3 1 130 W W
#4 1 120 <NA> W
#5 1 110 <NA> W
#6 1 120 <NA> W
#7 1 130 W W
#8 1 140 <NA> <NA>
#9 1 150 <NA> <NA>
#10 2 90 <NA> <NA>
#11 2 80 <NA> <NA>
#12 2 70 <NA> <NA>
#13 2 66 W W
#14 2 60 <NA> W
#15 2 50 <NA> W
#16 2 66 W W
#17 2 70 <NA> <NA>
#18 2 72 <NA> <NA>
#19 2 68 W W
#20 2 65 <NA> W
#21 2 60 <NA> W
#22 2 68 W W
#23 2 70 <NA> <NA>
library(dplyr)
df %>%
group_by(ID) %>%
mutate(water_flag = (1 * !is.na(code)) * if_else(elevation < lag(elevation, default = 0), 1, -1),
water = if_else(cumsum(water_flag) == 1, "W", NA_character_))
First I tried to use fill but had no success. Then I learned about the benefit of R's recycling property from the question "Rename first and second occurrence of the same specific value in a column iteratively" (thanks to Ronak!).
# prepare data with renaming `start` and `stop` sequence
df$code[is.na(df$code)] <- "NA"
df$code[df$code == 'W'] <- c('start', 'end')
df$code[df$code=="NA"]<-NA
# Now with different names of start and stop sequence I was able to implement `cumsum`
library(tidyverse)
df <- df %>%
group_by(grp = cumsum(!is.na(code))) %>%
dplyr::mutate(code = replace(code, first(code) == 'start', 'W'),
code = replace(code, code=='end', 'W')) %>%
ungroup() %>%
select(-grp)
Output:
# A tibble: 23 x 3
ID elevation code
<int> <int> <chr>
1 1 150 NA
2 1 140 NA
3 1 130 W
4 1 120 W
5 1 110 W
6 1 120 W
7 1 130 W
8 1 140 NA
9 1 150 NA
10 2 90 NA
11 2 80 NA
12 2 70 NA
13 2 66 W
14 2 60 W
15 2 50 W
16 2 66 W
17 2 70 NA
18 2 72 NA
19 2 68 W
20 2 65 W
21 2 60 W
22 2 68 W
23 2 70 NA
This answer is similar to @Onyambu's: create an 'index' (ind) that increases by one each time a non-NA is encountered in the 'code' column. If the index value is divisible by 2 (i.e. it is an even number), insert NA into the new column. If the index is not divisible by 2, add a "W" into the new column. Then, if there is a "W" in the 'code' or 'ind' columns, replace the NA in the 'code' column with "W" and drop the 'ind' column from the dataframe.
df %>%
mutate(ind = ifelse(cumsum(!is.na(code)) %% 2 == 0, NA, "W")) %>%
mutate(code = ifelse(ind == "W" | code == "W", "W", NA)) %>%
select(-c(ind))
#> ID elevation code
#>1 1 150 <NA>
#>2 1 140 <NA>
#>3 1 130 W
#>4 1 120 W
#>5 1 110 W
#>6 1 120 W
#>7 1 130 W
#>8 1 140 <NA>
#>9 1 150 <NA>
#>10 2 90 <NA>
#>11 2 80 <NA>
#>12 2 70 <NA>
#>13 2 66 W
#>14 2 60 W
#>15 2 50 W
#>16 2 66 W
#>17 2 70 <NA>
#>18 2 72 <NA>
#>19 2 68 W
#>20 2 65 W
#>21 2 60 W
#>22 2 68 W
#>23 2 70 <NA>
Although the question has been marked as solved (answer accepted), for future reference there is a function fill_run in the runner package which does exactly this.
fill_run replaces NA values if they were surrounded by pair of identical values. Since our additional requirement is to look at elevation too we can do something like this
df %>% group_by(ID) %>%
mutate(code = runner::fill_run(ifelse(!is.na(code), paste(elevation,code), code), only_within = T))
# A tibble: 23 x 3
# Groups: ID [2]
ID elevation code
<int> <int> <chr>
1 1 150 NA
2 1 140 NA
3 1 130 130 W
4 1 120 130 W
5 1 110 130 W
6 1 120 130 W
7 1 130 130 W
8 1 140 NA
9 1 150 NA
10 2 90 NA
# ... with 13 more rows
Needless to say, you can again mutate non-NA values from code to W very easily, if required.

Merge Dyad_Year with Country_Year data

I have two data-frames, one dyad-year and the other country-year.
Xccode1 ccode2 ccdistance countryname_1 countryname_2 majorpower_1
majorpower_2 milex_1 milper_1
1 1 2 20 0 United States of America Canada 1
0 143981000 2050
2 2 2 31 957 United States of America Bahamas 1
0 143981000 2050
3 3 2 40 1129 United States of America Cuba 1
0 143981000 2050
4 4 2 41 1437 United States of America Haiti 1
Country-Year:
ccode1 year Fac1_A Fac2_A Fac3_A
<int> <int> <dbl> <dbl> <dbl>
1 2 1980 -0.661 4.66 15.5
2 2 1981 -0.661 4.66 15.5
3 2 1982 -0.661 5.11 15.5
4 2 1983 -0.661 5.21 15.5
5 2 1984 -0.661 5.66 15.5
6 2 1985 -0.661 5.21 15.5
7 2 1986 -0.661 5.21 15.5
8 2 1987 -0.661 5.21 15.5
9 2 1988 -0.661 5.21 15.5
10 2 1989 -0.661 5.00 15.5
I'd like to merge these two data frames so that each country in the dyad has a FacX value; however, my attempts at doing this have either given me an error or lots of NAs. I first attempted to use a simple ifelse:
Demo_Dyad$Fac1_A_NR <- ifelse(Demo_Dyad$ccode1 == Cntry_yr$ccode1 &
Demo_Dyad$year == Cntry_yr$year,
Cntry_yr$Fac1_A, NA)
However, that results in each country in the Dyad_Year only having the value once. So e.g. USA <--> Haiti 1981 might have value X, but USA <--> Cuba 1981 will be NA.
I then attempted to do it by grouping in dplyr:
Demo_Dyad %>%
group_by(ccode1, year) %>%
mutate(Fac1_A_NR <- ifelse(ccode1 == Cntry_yr$ccode1 &
year == Cntry_yr$year, Cntry_yr$Fac1_A, NA))
But get the error: Error in `$<-.data.frame`(`*tmp*`, Fac1_A_NR, value = c(-0.660552389122193, :
replacement has 4942 rows, data has 217149
If anyone can see what is wrong with my code I would greatly appreciate it.
If the whole task is to merge two dataframes based on a column or columns they have in common, then use merge. For example:
DATA:
set.seed(111)
# Two toy tables that share a key stored under different column names
# (`Xccode` in df_a, `ccode` in df_b).
df_a <- data.frame(
  Xccode = 1:10,
  v1a = rnorm(10),
  v2a = sample(LETTERS[1:5], 10, replace = TRUE)
)
df_b <- data.frame(
  ccode = 1:10,
  v1b = rnorm(10, 5),
  v2b = sample(LETTERS[4:7], 10, replace = TRUE)
)
SOLUTION:
Assuming that the columns the two dataframes have in common are Xccode and ccode, respectively, you can use merge and specify the two columns as the ones to merge by:
df_ab <- merge(df_a, df_b, by.x = "Xccode", by.y = "ccode")
df_ab
Xccode v1a v2a v1b v2b
1 1 0.2352207 B 3.806391 E
2 2 -0.3307359 A 5.364187 E
3 3 -0.3116238 C 5.361662 E
4 4 -2.3023457 A 5.346964 G
5 5 -0.1708760 C 5.189737 D
6 6 0.1402782 E 4.840423 D
7 7 -1.4974267 A 5.326549 F
8 8 -1.0101884 A 5.598254 D
9 9 -0.9484756 A 3.158466 F
10 10 -0.4939622 C 7.718056 G

R fill new column based on interval from another dataset (lookup)

Lets say I have this dataset:
df1 = data.frame(groupID = c(rep("a", 6), rep("b", 6), rep("c", 6)),
testid = c(111, 222, 333, 444, 555, 666, 777, 888, 999, 1010, 1111, 1212, 1313, 1414, 1515, 1616, 1717, 1818))
df1
groupID testid
1 a 111
2 a 222
3 a 333
4 a 444
5 a 555
6 a 666
7 b 777
8 b 888
9 b 999
10 b 1010
11 b 1111
12 b 1212
13 c 1313
14 c 1414
15 c 1515
16 c 1616
17 c 1717
18 c 1818
And I have this 2nd dataset:
df2 = data.frame(groupID = c("a", "a", "a", "a", "b", "b", "b", "c", "c", "c"),
testid = c(222, 333, 555, 666, 777, 999, 1010, 1313, 1616, 1818),
bd = c(1, 1, 2, 2, 0, 1, 1, 1, 1, 2))
df2
groupID testid bd
1 a 222 1
2 a 333 1
3 a 555 2
4 a 666 2
5 b 777 0
6 b 999 1
7 b 1010 1
8 c 1313 1
9 c 1616 1
10 c 1818 2
I want to use the intervals in the 2nd dataset to fill in a new variable in the 1st dataset, autofilling the values between two occurrences of a bd and leaving NAs everywhere else, by group.
Desired output:
groupID testid new_bd
1 a 111 NA
2 a 222 1
3 a 333 1
4 a 444 NA
5 a 555 2
6 a 666 2
7 b 777 0
8 b 888 NA
9 b 999 1
10 b 1010 1
11 b 1111 NA
12 b 1212 NA
13 c 1313 1
14 c 1414 1
15 c 1515 1
16 c 1616 1
17 c 1717 NA
18 c 1818 2
Ideally would like dplyr/tidyr solution but open to any approaches.
similar but these fill all values:
R: Filling timeseries values but only within last 12 months
R autofill blanks in variable until next value
I would start by modifying df2 to start and end of range. And you can loop or do anything else after.
grps <- df2 %>% group_by(groupID, bd) %>% summarize(start = min(testid), end = max(testid))
grps
groupID bd start end
<fct> <dbl> <dbl> <dbl>
1 a 1 222 333
2 a 2 555 666
3 b 0 777 777
4 b 1 999 1010
5 c 1 1313 1616
6 c 2 1818 1818
# Fill df1$bd from the per-(groupID, bd) ranges in grps; rows that fall in no
# range stay NA.
df1$bd <- NA
for (i in seq_len(nrow(grps))) {
  # Restrict each range to its own group so that overlapping testid ranges in
  # different groups cannot overwrite one another. as.character() keeps the
  # comparison valid when groupID is a factor with differing level sets.
  hit <- which(
    as.character(df1$groupID) == as.character(grps$groupID[i]) &
      df1$testid >= grps$start[i] &
      df1$testid <= grps$end[i]
  )
  df1$bd[hit] <- grps$bd[i]
}
df1
groupID testid bd
1 a 111 NA
2 a 222 1
3 a 333 1
4 a 444 NA
5 a 555 2
6 a 666 2
7 b 777 0
8 b 888 NA
9 b 999 1
10 b 1010 1
11 b 1111 NA
12 b 1212 NA
13 c 1313 1
14 c 1414 1
15 c 1515 1
16 c 1616 1
17 c 1717 NA
18 c 1818 2
Maybe I have overlooked a simpler method but here is what I came up with using dplyr, we first create a left_join between df1 and df2 and fill bd column. We then group_by group_ID and bd and get first and last index of non-NA value in each group and replace values to NA which are less than minimum index and greater than maximum index.
library(dplyr)
# Attach df2's bd to df1 on an exact (groupID, testid) match; rows of df1
# without a match get NA.
left_join(df1, df2, by = c("groupID", "testid")) %>%
# Keep a copy of the matched-only values before the gaps are filled.
mutate(bd1 = bd) %>%
# Fill the NA gaps in bd (tidyr::fill's default direction is downward).
tidyr::fill(bd) %>%
group_by(groupID, bd) %>%
# Within each (groupID, filled bd) group, find the first and last rows that
# had a real match; for a group whose bd is all NA the span is the whole group.
mutate(minRow = if (all(is.na(bd))) 1 else first(which(!is.na(bd1))),
maxRow = if (all(is.na(bd))) n() else last(which(!is.na(bd1))),
# Reset to NA any filled-in value that lies outside the [minRow, maxRow] span.
new_bd = replace(bd, is.na(bd1) & (row_number() < minRow |
row_number() > maxRow), NA)) %>%
ungroup() %>%
# Return df1's original columns plus the new variable.
select(names(df1), new_bd)
# groupID testid new_bd
# <fct> <dbl> <dbl>
# 1 a 111 NA
# 2 a 222 1
# 3 a 333 1
# 4 a 444 NA
# 5 a 555 2
# 6 a 666 2
# 7 b 777 0
# 8 b 888 NA
# 9 b 999 1
#10 b 1010 1
#11 b 1111 NA
#12 b 1212 NA
#13 c 1313 1
#14 c 1414 1
#15 c 1515 1
#16 c 1616 1
#17 c 1717 NA
#18 c 1818 2
Here is a solution that works on my test data example above but won't run on my large dataset, where I run into Error: cannot allocate vector of size 45.5 Gb. I believe it is related to the problem outlined here: "The same size explosion can happen if you have lots of the same level in both with otherwise different rows". In my actual dataset I'm looking at date variables; I didn't think this would affect the problem, but maybe it does. I'm not sure if there is a workaround using fuzzyjoin, as it works on a subset of the data.
library(tidyverse)
library(fuzzyjoin)
library(tidylog)
grps <- df2 %>% group_by(groupID, bd) %>% summarize(start = min(testid), end = max(testid))
grps
df1 %>%
fuzzy_left_join(grps,
by = c("groupID" = "groupID",
"testid" = "start",
"testid" = "end"),
match_fun = list(`==`, `>=`, `<=`)) %>%
select(groupID = groupID.x, testid, bd, start, end)
select: dropped 2 variables (groupID.x, groupID.y)
groupID testid bd start end
1 a 111 NA NA NA
2 a 222 1 222 333
3 a 333 1 222 333
4 a 444 NA NA NA
5 a 555 2 555 666
6 a 666 2 555 666
7 b 777 0 777 777
8 b 888 NA NA NA
9 b 999 1 999 1010
10 b 1010 1 999 1010
11 b 1111 NA NA NA
12 b 1212 NA NA NA
13 c 1313 1 1313 1616
14 c 1414 1 1313 1616
15 c 1515 1 1313 1616
16 c 1616 1 1313 1616
17 c 1717 NA NA NA
18 c 1818 2 1818 1818
data.table solution:
library(data.table)
> new <- setDT(grps)[setDT(df1),
+ .(groupID, testid, x.start, x.end, x.bd),
+ on = .(groupID, start <= testid, end >= testid)]
> new
groupID testid x.start x.end x.bd
1: a 111 NA NA NA
2: a 222 222 333 1
3: a 333 222 333 1
4: a 444 NA NA NA
5: a 555 555 666 2
6: a 666 555 666 2
7: b 777 777 777 0
8: b 888 NA NA NA
9: b 999 999 1010 1
10: b 1010 999 1010 1
11: b 1111 NA NA NA
12: b 1212 NA NA NA
13: c 1313 1313 1616 1
14: c 1414 1313 1616 1
15: c 1515 1313 1616 1
16: c 1616 1313 1616 1
17: c 1717 NA NA NA
18: c 1818 1818 1818 2
I think it may be done in fuzzyjoin using internal_join but I'm not sure?: https://github.com/dgrtwo/fuzzyjoin/issues/50

Resources