I would really appreciate your help in using R for data wrangling. I have a dataset in which I want to split one column (variable) into two whenever applicable, conditioned on other variables. For example, as per the sample below, the data represent reaction time measures (RT1 and RT2) for some words (item) that appear at different times of reading (block). I want to see if the RT1 and RT2 values in blocks 3, 4, and 5 are correlated with the RT1 and RT2 values of the same item at block 1. The target items that appeared in block 1 and re-appeared in subsequent blocks are coded as 'EI' in the column 'condition', whereas items coded as 'E' or 'I' appeared only once.
structure(list(RECORDING_SESSION_LABEL = c(26, 26, 26, 26, 26,
26, 26, 26), RT1 = c(5171, 3857, 3447, 314, 460, 731, 957, 1253
), RT2 = c(357, 328, 122, 39, 86, 132, 173, 215), item = c("foreign",
"detailed", "large", "foreign", "foreign", "large", "large",
"disputable"), block = c(1, 1, 1, 3, 4, 3, 4, 3), condition = c("EI",
"E", "EI", "EI", "EI", "EI", "EI", "I")), row.names = c(NA, -8L
), class = c("tbl_df", "tbl", "data.frame"))
Where a sample of the data would look like this:
> d1
# A tibble: 8 x 6
RECORDING_SESSION_LABEL RT1 RT2 item block condition
<dbl> <dbl> <dbl> <chr> <dbl> <chr>
1 26 5171 357 foreign 1 EI
2 26 3857 328 detailed 1 E
3 26 3447 122 large 1 EI
4 26 314 39 foreign 3 EI
5 26 460 86 foreign 4 EI
6 26 731 132 large 3 EI
7 26 957 173 large 4 EI
8 26 1253 215 disputable 3 I
In order to present the data in a format that R would understand, the target data frame I want to achieve would be similar to the one below (where the highlighted columns should be added). Blank rows in these columns represent items which do not appear repeatedly (condition is not coded as 'EI'); they are therefore irrelevant and should be coded as 'NA'.
structure(list(RECORDING_SESSION_LABEL = c(26, 26, 26, 26, 26,
26, 26, 26), `RT 1` = c(5171, 3857, 3447, 314, 460, 731, 957,
1253), RT2 = c(357, 328, 122, 39, 86, 132, 173, 215), item = c("foreign",
"detailed", "large", "foreign", "foreign", "large", "large",
"disputable"), block = c(1, 1, 1, 3, 4, 3, 4, 3), condition = c("EI",
"E", "EI", "EI", "EI", "EI", "EI", "I"), `RT 1_at_block1` = c(NA,
NA, NA, 5171, 5171, 3447, 3447, NA), RT2_at_block1 = c(NA, NA,
NA, 357, 357, 122, 122, NA)), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
And a sample of the data format targeted would look like this:
> d2
# A tibble: 8 x 8
RECORDING_SESSI~ `RT 1` RT2 item block condition `RT 1_at_block1`
<dbl> <dbl> <dbl> <chr> <dbl> <chr> <dbl>
1 26 5171 357 fore~ 1 EI NA
2 26 3857 328 deta~ 1 E NA
3 26 3447 122 large 1 EI NA
4 26 314 39 fore~ 3 EI 5171
5 26 460 86 fore~ 4 EI 5171
6 26 731 132 large 3 EI 3447
7 26 957 173 large 4 EI 3447
8 26 1253 215 disp~ 3 I NA
# ... with 1 more variable: RT2_at_block1 <dbl>
> head(d2)
# A tibble: 6 x 8
RECORDING_SESSION_LABEL `RT 1` RT2 item block condition `RT 1_at_block1` RT2_at_block1
<dbl> <dbl> <dbl> <chr> <dbl> <chr> <dbl> <dbl>
1 26 5171 357 foreign 1 EI NA NA
2 26 3857 328 detailed 1 E NA NA
3 26 3447 122 large 1 EI NA NA
4 26 314 39 foreign 3 EI 5171 357
5 26 460 86 foreign 4 EI 5171 357
6 26 731 132 large 3 EI 3447 122
Thanks in advance for any help.
A possible solution using dplyr:
# Sample data: reaction times (RT1, RT2) per item and block for one session.
d1 <- structure(list(RECORDING_SESSION_LABEL = c(26, 26, 26, 26, 26, 26, 26, 26),
RT1 = c(5171, 3857, 3447, 314, 460, 731, 957, 1253),
RT2 = c(357, 328, 122, 39, 86, 132, 173, 215),
item = c("foreign", "detailed", "large", "foreign", "foreign", "large", "large", "disputable"),
block = c(1, 1, 1, 3, 4, 3, 4, 3), condition = c("EI", "E", "EI", "EI", "EI", "EI", "EI", "I")),
row.names = c(NA, -8L), class = c("tbl_df", "tbl", "data.frame"))

# Attach each item's block-1 reaction times to every row of the same
# session/item. Both lookup columns come from a single join (instead of two
# joins on the same keys), and the keys are spelled out so dplyr does not have
# to guess them and print a "Joining, by = ..." message.
d2 <- d1 %>%
  left_join(
    d1 %>%
      filter(block == 1) %>%
      select(RECORDING_SESSION_LABEL, item,
             RT1_at_block1 = RT1, RT2_at_block1 = RT2),
    by = c("RECORDING_SESSION_LABEL", "item")
  )
After that, d2 looks like this:
RECORDING_SESSION_LABEL RT1 RT2 item block condition RT1_at_block1 RT2_at_block1
<dbl> <dbl> <dbl> <chr> <dbl> <chr> <dbl> <dbl>
1 26 5171 357 foreign 1 EI 5171 357
2 26 3857 328 detailed 1 E 3857 328
3 26 3447 122 large 1 EI 3447 122
4 26 314 39 foreign 3 EI 5171 357
5 26 460 86 foreign 4 EI 5171 357
6 26 731 132 large 3 EI 3447 122
Edit: Adding a mutate if you want to set the values for block 1 to NA:
# Same lookup as before (one join on explicit keys), then blank out the
# block-1 rows themselves so only re-appearances carry a block-1 reference.
d2 <- d1 %>%
  left_join(
    d1 %>%
      filter(block == 1) %>%
      select(RECORDING_SESSION_LABEL, item,
             RT1_at_block1 = RT1, RT2_at_block1 = RT2),
    by = c("RECORDING_SESSION_LABEL", "item")
  ) %>%
  # NA_real_ keeps the columns numeric even if every row is in block 1.
  mutate(RT1_at_block1 = if_else(block == 1, NA_real_, RT1_at_block1),
         RT2_at_block1 = if_else(block == 1, NA_real_, RT2_at_block1))
I am trying to find a function which is similar to VLOOKUP in Excel but which returns the maximum value together with the other values in the same row.
The data frame looks like this:
The data frames I am dealing with are given below:
structure(list(Item = c("ABA", "ABB", "ABC", "ABD", "ABE", "ABF"
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
structure(list(Item = c("ABA", "ABB", "ABC", "ABD", "ABE", "ABF",
"ABA", "ABB", "ABC", "ABD", "ABE", "ABF", "ABA", "ABB", "ABC",
"ABD", "ABE", "ABF"), Max1 = c(12, 68, 27, 17, 74, 76, 78, 93,
94, 98, 46, 90, 5, 58, 67, 64, 34, 97), Additional1 = c(40, 66,
100, 33, 66, 19, 8, 70, 21, 93, 48, 34, 44, 89, 74, 20, 0, 47
), Additional2 = c(39, 31, 85, 58, 0, 2, 57, 28, 31, 32, 15,
22, 93, 41, 57, 81, 95, 46)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -18L))
The Expected output for this is given below:
You are looking for slice_max:
# For each Item keep the row holding the largest Max1, along with the other
# columns of that row. slice_max() keeps ties by default.
Book4 %>%
  group_by(Item) %>%
  slice_max(Max1, n = 1) %>%
  ungroup()
# Item Max1 Additional1 Additional2
# 1 ABA 78 8 57
# 2 ABB 93 70 28
# 3 ABC 94 21 31
# 4 ABD 98 93 32
# 5 ABE 74 66 0
# 6 ABF 97 47 46
Using base R
subset(Book4, Max1 == ave(Max1, Item, FUN = max))
# A tibble: 6 × 4
Item Max1 Additional1 Additional2
<chr> <dbl> <dbl> <dbl>
1 ABE 74 66 0
2 ABA 78 8 57
3 ABB 93 70 28
4 ABC 94 21 31
5 ABD 98 93 32
6 ABF 97 47 46
An alternative base solution that is more resilient to floating-point precision problems (c.f., Why are these numbers not equal?, https://cran.r-project.org/doc/FAQ/R-FAQ.html#Why-doesn_0027t-R-think-these-numbers-are-equal_003f). It also allows two behavior options if there are duplicate max-values:
if you want all of them, use ties.method = "min";
if you want the first (or just one) of them, then ties.method = "first".
Book4[ave(Book4$Max1, Book4$Item, FUN = function(z) rank(-z, ties.method = "first")) == 1,]
# # A tibble: 6 x 4
# Item Max1 Additional1 Additional2
# <chr> <dbl> <dbl> <dbl>
# 1 ABE 74 66 0
# 2 ABA 78 8 57
# 3 ABB 93 70 28
# 4 ABC 94 21 31
# 5 ABD 98 93 32
# 6 ABF 97 47 46
Using R base aggregate + max + merge
> merge(Book4, aggregate(Max1~Item, data = Book4, max), by = c("Item", "Max1"))
Item Max1 Additional1 Additional2
1 ABA 78 8 57
2 ABB 93 70 28
3 ABC 94 21 31
4 ABD 98 93 32
5 ABE 74 66 0
6 ABF 97 47 46
I need your kind help tidying a data frame. A sample of the data is provided below:
> dput(data_1)
structure(list(subject = c("E1", "E1", "E1", "E1", "E1", "E1",
"E1", "E1"), block = c(3, 3, 4, 4, 5, 5, 6, 6), condition = c("EI",
"I", "EI", "I", "EI", "I", "EI", "I"), prev_total_RT = c("963",
"NA", "963", "NA", "963", "NA", "963", "NA"), total_RT = c(271,
1042, 409, 406, 544, 490, 645, 465), Item_number = c(17, 46,
17, 46, 17, 46, 17, 46)), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
> data_1
# A tibble: 8 x 6
subject block condition prev_total_RT total_RT Item_number
<chr> <dbl> <chr> <chr> <dbl> <dbl>
1 E1 3 EI 963 271 17
2 E1 3 I NA 1042 46
3 E1 4 EI 963 409 17
4 E1 4 I NA 406 46
5 E1 5 EI 963 544 17
6 E1 5 I NA 490 46
7 E1 6 EI 963 645 17
8 E1 6 I NA 465 46
While values of "prev_total_RT" for the condition "EI" are provided, they are not provided for the condition "I". I need code that would generate the values of "prev_total_RT" for the condition "I".
The values of "prev_total_RT" for the condition "I" should be the sum of "total_RT" for condition "I" in "block" = 3, 4, and 5. This should be conditioned by each "subject" and "Item_number". For example, for the subject "E1" and Item_number "46" in condition "I", the value of "prev_total_RT" should be the sum of "total_RT" values in "block" 3, 4, 5 : 1042 + 406 + 490 = 1938.
The desired output is provided below:
> dput(data_2)
structure(list(subject = c("E1", "E1", "E1", "E1", "E1", "E1",
"E1", "E1"), block = c(3, 3, 4, 4, 5, 5, 6, 6), condition = c("EI",
"I", "EI", "I", "EI", "I", "EI", "I"), prev_total_RT = c(963,
1938, 963, 1938, 963, 1938, 963, 1938), total_RT = c(271, 1042,
409, 406, 544, 490, 645, 465), Item_number = c(17, 46, 17, 46,
17, 46, 17, 46)), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
> data_2
# A tibble: 8 x 6
subject block condition prev_total_RT total_RT Item_number
<chr> <dbl> <chr> <dbl> <dbl> <dbl>
1 E1 3 EI 963 271 17
2 E1 3 I 1938 1042 46
3 E1 4 EI 963 409 17
4 E1 4 I 1938 406 46
5 E1 5 EI 963 544 17
6 E1 5 I 1938 490 46
7 E1 6 EI 963 645 17
8 E1 6 I 1938 465 46
Any help with this would be greatly appreciated.
A straight forward method,
# Fill prev_total_RT for condition "I" with the per-subject/per-item sum of
# total_RT over blocks 3, 4 and 5; rows of other conditions keep their value.
# prev_total_RT arrives as character with literal "NA" strings, so it is
# converted to numeric here (the "NA" strings become real NA first, which
# avoids a coercion warning) and the result matches the desired <dbl> column.
df %>%
  group_by(subject, Item_number) %>%
  mutate(
    prev_total_RT = ifelse(
      condition == "I",
      sum(total_RT[block %in% c(3, 4, 5)]),
      as.numeric(na_if(prev_total_RT, "NA"))
    )
  ) %>%
  ungroup()
#  subject block condition prev_total_RT total_RT Item_number
#  <chr>   <dbl> <chr>             <dbl>    <dbl>       <dbl>
#1 E1          3 EI                  963      271          17
#2 E1          3 I                  1938     1042          46
#3 E1          4 EI                  963      409          17
#4 E1          4 I                  1938      406          46
#5 E1          5 EI                  963      544          17
#6 E1          5 I                  1938      490          46
#7 E1          6 EI                  963      645          17
#8 E1          6 I                  1938      465          46
I know the sum of points for each person.
I need to know: what is the minimum number of points that a person could have. And what is the maximum number of points that a person could have.
What I have tried:
min_and_max <- dataset %>%
group_by(person) %>%
dplyr::filter(min(sum(points, na.rm = T))) %>%
distinct(person) %>%
My dataset:
id person points
201 rt99 NA
201 rt99 3
201 rt99 2
202 kt 4
202 kt NA
202 kt NA
203 rr 4
203 rr NA
203 rr NA
204 jk 2
204 jk 2
204 jk NA
322 knm3 5
322 knm3 NA
322 knm3 3
343 kll2 2
343 kll2 1
343 kll2 5
344 kll NA
344 kll 7
344 kll 1
I would suggest this dplyr approach. You have to summarize data like this:
# Per person: total points plus the smallest and largest single score,
# ignoring missing values.
df %>%
  group_by(id, person) %>%
  summarise(Total = sum(points, na.rm = TRUE),
            min = min(points, na.rm = TRUE),
            max = max(points, na.rm = TRUE))
# A tibble: 7 x 5
# Groups: id [7]
id person Total min max
<int> <chr> <int> <int> <int>
1 201 rt99 5 2 3
2 202 kt 4 4 4
3 203 rr 4 4 4
4 204 jk 4 2 2
5 322 knm3 8 3 5
6 343 kll2 8 1 5
7 344 kll 8 1 7
Here is the data.table solution -
# Add per-person minimum and maximum points by reference in one grouped pass.
# Assumes `dataset` is already a data.table (e.g. data.table::setDT(dataset)).
dataset[, `:=`(
  min_points = min(points, na.rm = TRUE),
  max_points = max(points, na.rm = TRUE)
), by = person]
Since I don't have your data, I cannot test this code, but it should work fine.
The summarize() verb is what you want for this. You don't even need to filter out the NA values first since both min() and max() can have na.rm = TRUE.
min_and_max <- dataset %>%
group_by(person) %>%
summarize(min = min(points, na.rm = TRUE),
max = max(points, na.rm = TRUE))
# A tibble: 7 x 3
person min max
<chr> <dbl> <dbl>
1 jk 2 2
2 kll 1 7
3 kll2 1 5
4 knm3 3 5
5 kt 4 4
6 rr 4 4
7 rt99 2 3
structure(list(id = c(201, 201, 201, 202, 202, 202, 203, 203,
203, 204, 204, 204, 322, 322, 322, 343, 343, 343, 344, 344, 344
), person = c("rt99", "rt99", "rt99", "kt", "kt", "kt", "rr",
"rr", "rr", "jk", "jk", "jk", "knm3", "knm3", "knm3", "kll2",
"kll2", "kll2", "kll", "kll", "kll"), points = c(NA, 3, 2, 4,
NA, NA, 4, NA, NA, 2, 2, NA, 5, NA, 3, 2, 1, 5, NA, 7, 1)), class = "data.frame", row.names = c(NA,
-21L), spec = structure(list(cols = list(id = structure(list(), class = c("collector_double",
"collector")), person = structure(list(), class = c("collector_character",
"collector")), points = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
I have a dataframe "data" with the following structure:
structure(list(age = c(45, 4, 32, 45), sex = c(1, 0, 1, 0), height = c(165,
178, 145, 132), weight = c(65, 73, 60, 45)), row.names = c(NA,
-4L), class = c("tbl_df", "tbl", "data.frame"))
And I would like to add to this data.frame two new variables (var1, var2), which should be calculated with the two following formulas:
var1 = age*height + (4 if sex==1 OR 2 if sex==0)
var2 = height*weight + (1 if age>40 or 2 if age=<40)
I have a problem both in adding the two variables to the data frame and in applying a function (I tried to build a function, but it seems that it can be applied only to a single value and not to all values from all rows).
Can anyone help me, please?
akrun's suggestion of using Boolean arithmetic is a good one, but you could also simply write a Boolean version of your own expression, substituting multiplication for the if statements (with mild editing of the "=<" to "<=").
# Example input: four people with age, sex (0/1), height and weight.
data <- structure(
  list(
    age = c(45, 4, 32, 45),
    sex = c(1, 0, 1, 0),
    height = c(165, 178, 145, 132),
    weight = c(65, 73, 60, 45)
  ),
  row.names = c(NA, -4L),
  class = c("tbl_df", "tbl", "data.frame")
)

# Each logical comparison counts as 0/1, so multiplying it by the bonus adds
# that bonus only on the rows where the condition holds.
data <- within(data, {
  var1 <- age * height + (sex == 1) * 4 + (sex == 0) * 2
  var2 <- height * weight + (age > 40) * 1 + (age <= 40) * 2
})
> data
age sex height weight var2 var1
1 45 1 165 65 10726 7429
2 4 0 178 73 12996 714
3 32 1 145 60 8702 4644
4 45 0 132 45 5941 5942
Since the two sets of conditions are each disjoint, the "non-qualifying" choice terms will each be 0.
The function ifelse() is vectorized, so it will apply the conditions to each element in the vector.
# Example input: four people with age, sex (0/1), height and weight.
df <- structure(list(age = c(45, 4, 32, 45), sex = c(1, 0, 1, 0), height = c(165,
178, 145, 132), weight = c(65, 73, 60, 45)), row.names = c(NA,
-4L), class = c("tbl_df", "tbl", "data.frame"))
# var1 = age * height, plus 4 when sex == 1 and 2 otherwise.
df$var1 <- df$age * df$height + ifelse(df$sex == 1, 4, 2)
# var2 = height * weight, plus 1 when age > 40 and 2 otherwise. Both branches
# must use weight * height; using age * height in the "otherwise" branch
# yields 714 and 4642 for rows 2 and 3 instead of the correct 12996 and 8702.
df$var2 <- df$weight * df$height + ifelse(df$age > 40, 1, 2)
final output
> df
# A tibble: 4 x 6
age sex height weight var1 var2
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 45 1 165 65 7429 10726
2 4 0 178 73 714 714
3 32 1 145 60 4644 4642
4 45 0 132 45 5942 5941
I prefer the case_when() tool from the dplyr package.
Your original data is:
# Example input: four people with age, sex (0/1), height and weight.
# The list must be wrapped in structure() for the row.names/class attributes
# to be applied.
data <-
  structure(
    list(age = c(45, 4, 32, 45),
         sex = c(1, 0, 1, 0),
         height = c(165, 178, 145, 132),
         weight = c(65, 73, 60, 45)),
    row.names = c(NA, -4L),
    class = c("tbl_df", "tbl", "data.frame"))
The new variables are created by:
# Add var1 and var2; each case_when() picks the bonus that matches the row.
data <- data %>%
  mutate(
    var1 = case_when(sex == 1 ~ age * height + 4,
                     sex == 0 ~ age * height + 2),
    var2 = case_when(age > 40 ~ height * weight + 1,
                     age <= 40 ~ height * weight + 2)
  )
The outcome is:
# A tibble: 4 x 6
age sex height weight var1 var2
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 45 1 165 65 7429 10726
2 4 0 178 73 714 12996
3 32 1 145 60 4644 8702
4 45 0 132 45 5942 5941
We convert the logical/binary to numeric index by adding 1 to it and use that to change the values to 2, 4, or just 1, 2 and use that in the calculation
data %>%
mutate(var1 = (age * height) + c(2, 4)[sex + 1],
var2 = (height * weight) + (age <= 40)+1)
# A tibble: 4 x 6
# age sex height weight var1 var2
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 45 1 165 65 7429 10726
#2 4 0 178 73 714 12996
#3 32 1 145 60 4644 8702
#4 45 0 132 45 5942 5941
I am trying to calculate the number of weeks the inventory on hand will last given the sales projections for a dataset with tens of millions of rows. I have listed the expected output in the last column of the data structure given below. I also attached the implementation of this in Excel.
Weeksofsupply = Number of weeks the current inventory on hand will last.
example - in the attached image (SKU_CD 222, STORE_CD 33), the inventory on hand is 19, the sales values are
WK1 + WK2 = 15, Wk1 + Wk2 + Wk3 = 24, Which is greater than 19,
So we are picking 2, which is the count of weeks the current inventory will last.
Expected output in the last column
Data = structure(list(
SKU_CD = c(111, 111, 111, 111, 111, 111, 111,111, 111, 111, 111, 111, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222, 222),
STORE_CD = c(22, 22, 22, 22, 22, 22, 22,22, 22, 22, 22, 22, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33),
FWK_CD = c(201627, 201628, 201629, 201630, 201631, 201632,201633, 201634, 201635, 201636, 201637, 201638, 201627, 201628, 201629, 201630, 201631, 201632, 201633, 201634, 201635, 201636, 201637, 201638),
SALES = c(5, 2, 2, 2, 1, 3, 2, 2, 3, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 7, 5),
INVENTORY = c(29, 27, 25, 23, 22, 19, 17, 15, 12, 10, 25, 1, 19, 17, 15, 13, 12,9, 7, 5, 2, 0, 25, 18),
WeeksofSupply = c("11", "10", "9", "8", "8", "6", "5", "4", "3", "2", "Inventory More", "Inventory Less", "2", "2", "1", "1", "1", "Inventory Less", "Inventory Less", "Inventory Less", "Inventory Less", "Inventory Less", "Inventory More", "Inventory More")),
class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -24L),
.Names = c("SKU_CD", "STORE_CD", "FWK_CD", "SALES", "INVENTORY", "WeeksofSupply"))
Current Excel Code: (Here the weeks are shown in columns, but it should be rows like shown in the expected output.)
exceeds forecast"))
I would appreciate any input to implement this efficiently in R. Many Thanks for your time!
For your revised data in long format, you can do the following...
library(dplyr) #for the grouping functionality
#define a function to calculate weeks Supply from Sales and Inventory
#' Weeks of supply for each week of one SKU/store series
#'
#' For week i, finds how many weeks (fractional, by linear interpolation on
#' the cumulative forward sales) the inventory on hand at week i will last.
#'
#' @param sales Weekly sales, in chronological order.
#' @param inv Inventory on hand for the same weeks as `sales`.
#' @return A numeric vector the same length as `sales`. A value below 1 means
#'   inventory covers less than that week's sales; -1 means inventory exceeds
#'   all remaining forecast sales ("inventory more").
weekSup <- function(sales, inv) {
  sales <- unlist(sales)
  inv <- unlist(inv)
  n <- length(sales)
  weeksup <- rep(NA_real_, n)
  # seq_len() so that an empty series yields an empty result instead of
  # looping over c(1, 0).
  for (i in seq_len(n)) {
    if (i == n || inv[i] < sales[i]) {
      # Last week, or inventory does not cover even this week: report the
      # fraction of the week covered, or NA when the last week's inventory
      # exceeds its sales.
      weeksup[i] <- ifelse(inv[i] > sales[i], NA, inv[i] / sales[i])
    } else {
      # Interpolate the week count at which cumulative sales reach inv[i];
      # approxfun() returns NA when inv[i] exceeds total remaining sales.
      weeksup[i] <- approxfun(cumsum(sales[i:n]), 1:(n - i + 1))(inv[i])
    }
  }
  # 'Inventory more' is coded as -1 (a number) to avoid the whole column being
  # forced to a character vector.
  weeksup <- replace(weeksup, is.na(weeksup), -1)
  weeksup # for whole weeks, return floor(weeksup) instead
}
Data2 <- Data %>% group_by(SKU_CD,STORE_CD) %>% mutate(weekSup=weekSup(SALES,INVENTORY))
<dbl> <dbl> <dbl> <dbl> <dbl> <chr> <dbl>
1 111 22 201627 5 29 11 11.3333333
2 111 22 201628 2 27 10 10.8333333
3 111 22 201629 2 25 9 9.8333333
4 111 22 201630 2 23 8 8.8333333
5 111 22 201631 1 22 8 8.0000000
6 111 22 201632 3 19 6 6.6666667
7 111 22 201633 2 17 5 5.8333333
8 111 22 201634 2 15 4 4.8333333
9 111 22 201635 3 12 3 3.6666667
10 111 22 201636 2 10 2 2.8333333
11 111 22 201637 3 25 Inventory More -1.0000000
12 111 22 201638 6 1 Inventory Less 0.1666667
13 222 33 201627 7 19 2 2.4444444
14 222 33 201628 8 17 2 2.0000000
15 222 33 201629 9 15 1 1.6000000
16 222 33 201630 10 13 1 1.2727273
17 222 33 201631 11 12 1 1.0833333
18 222 33 201632 12 9 Inventory Less 0.7500000
19 222 33 201633 13 7 Inventory Less 0.5384615
20 222 33 201634 14 5 Inventory Less 0.3571429
Here is one way to do it, using the linear interpolation method approxfun...
# For each row, interpolate the (fractional) week at which cumulative weekly
# sales reach the inventory on hand; approxfun() yields NA when inventory
# exceeds the whole forecast.
# NOTE(review): the interpolation call was missing from the snippet and has
# been reconstructed from the approxfun description and the 6.67 shown in the
# output below; confirm against the original answer.
week_cols <- setdiff(names(data), c("Inventory", "WeeksSupply"))
data$WeeksSupply <- vapply(seq_len(nrow(data)), function(i) {
  cum_sales <- cumsum(unlist(data[i, week_cols], use.names = FALSE))
  approxfun(cum_sales, seq_along(cum_sales))(data$Inventory[i])
}, numeric(1))
# Replacing NA with a label turns the whole column into character.
data$WeeksSupply <- replace(data$WeeksSupply, is.na(data$WeeksSupply),
                            "Inventory Exceeds Forecast")
# A tibble: 2 x 12
Inventory Wk1 Wk2 Wk3 Wk4 Wk5 Wk6 Wk7 Wk8 Wk9 Wk10 WeeksSupply
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
1 200 20 15 25 40 35 45 30 50 45 55 6.66666666666667
2 2000 20 15 25 40 35 45 30 50 45 55 Inventory Exceeds Forecast