Sample data:
# Sample data: three subjects (ID), each measured on several days.
library(dplyr)
df <- data.frame(
ID = c(1,1,1,1,2,2,2,3,3,3),
day = c(3,8,14,29,4,6,8,1,4,9),
value = c(75, 101, 115, 120, 110, 106, 122, 100, 128, 140))
The idea behind the question:
Select the smallest day for each ID subject and multiply the value feature by 1.3 (ID 1 - day 3 - value 75, ID 2 - day 4 - value 110, ID 3 - day 1 - value 100).
Then compare that newly created value with other values that have the same ID, but have different day number.
For example:
The smallest day number for ID 1 is 3. Then multiply the value of that row by 1.3 (75 * 1.3 = 97.5). Compare the newly created value (97.5) with the values ((101, 115, 120)) that have the same ID of 1. Then answer TRUE or FALSE depending on whether the new value is greater than the values ((101, 115, 120)).
Repeat that as well for ID 2 and 3.
# Reproducible example data (same as in the question).
library(dplyr)
df <- data.frame(
ID = c(1,1,1,1,2,2,2,3,3,3),
day = c(3,8,14,29,4,6,8,1,4,9),
value = c(75, 101, 115, 120, 110, 106, 122, 100, 128, 140))
# Per ID: take the value on the earliest day, scale it by 1.3 (column v),
# then flag every row whose value exceeds that threshold.
df %>%
  group_by(ID) %>%
  mutate(
    v = 1.3 * value[day == min(day)],
    flag = v < value
  ) %>%
  ungroup()
# # A tibble: 10 x 5
# ID day value v flag
# <dbl> <dbl> <dbl> <dbl> <lgl>
# 1 1 3 75 97.5 FALSE
# 2 1 8 101 97.5 TRUE
# 3 1 14 115 97.5 TRUE
# 4 1 29 120 97.5 TRUE
# 5 2 4 110 143 FALSE
# 6 2 6 106 143 FALSE
# 7 2 8 122 143 FALSE
# 8 3 1 100 130 FALSE
# 9 3 4 128 130 FALSE
#10 3 9 140 130 TRUE
If you want to flag IDs with at least one TRUE flag you can create flag2 like this:
# Same per-ID threshold as above, plus flag2: 1 when any row in the ID
# group is flagged, 0 otherwise (max() over a logical yields an integer).
df %>%
  group_by(ID) %>%
  mutate(
    v = 1.3 * value[day == min(day)],
    flag = v < value,
    flag2 = max(flag)
  ) %>%
  ungroup()
# # A tibble: 10 x 6
# ID day value v flag flag2
# <dbl> <dbl> <dbl> <dbl> <lgl> <int>
# 1 1 3 75 97.5 FALSE 1
# 2 1 8 101 97.5 TRUE 1
# 3 1 14 115 97.5 TRUE 1
# 4 1 29 120 97.5 TRUE 1
# 5 2 4 110 143 FALSE 0
# 6 2 6 106 143 FALSE 0
# 7 2 8 122 143 FALSE 0
# 8 3 1 100 130 FALSE 1
# 9 3 4 128 130 FALSE 1
#10 3 9 140 130 TRUE 1
Or extract the IDs as a vector:
# Build the flagged data set, then pull the distinct IDs with at least
# one TRUE flag. (Uses the conventional left assignment `<-` instead of
# the right-assign `->`, and filters on the logical column directly
# rather than comparing it to TRUE.)
df2 <- df %>%
  group_by(ID) %>%
  mutate(v = value[day == min(day)] * 1.3,
         flag = value > v) %>%
  ungroup()

df2 %>%
  filter(flag) %>%
  distinct(ID) %>%
  pull(ID)
#[1] 1 3
Related
I have some values in
df:
# A tibble: 7 × 1
var1
<dbl>
1 0
2 10
3 20
4 210
5 230
6 266
7 267
that I would like to compare to a second dataframe called
value_lookup
# A tibble: 4 × 2
var1 value
<dbl> <dbl>
1 0 0
2 200 10
3 230 20
4 260 30
In particular I would like to make a join based on >= meaning that a value that is greater than or equal to the number in var1 gets a value of x. E.g. take the number 210 of the original dataframe. Since it is >= 200 and < 230 it would get a value of 10.
Here is the expected output:
var1 value
1 0 0
2 10 0
3 20 0
4 210 10
5 230 20
6 266 30
7 267 30
I thought it should be doable using {fuzzyjoin} but I cannot get it done.
# Lookup table: each value applies from its var1 breakpoint upward.
value_lookup <- tibble(var1 = c(0, 200,230,260),
value = c(0,10,20,30))
df <- tibble(var1 = c(0,10,20,210,230,266,267))
library(fuzzyjoin)
# NOTE(review): with only `>=` as the match function, each df row
# matches EVERY lookup row whose var1 it is >= to, so rows fan out
# instead of matching the single enclosing interval — presumably why
# this attempt "cannot get it done". An upper-bound condition (or
# findInterval / a rolling join, shown below) is also needed; verify.
fuzzyjoin::fuzzy_left_join(
x = df,
y = value_lookup ,
by = "var1",
match_fun = list(`>=`)
)
An option is also findInterval:
# findInterval() returns, for each df$var1, the index of the last
# breakpoint that is <= it; that index picks the matching lookup value.
df$value <- with(value_lookup, value[findInterval(df$var1, var1)])
Output:
var1 value
1 0 0
2 10 0
3 20 0
4 210 10
5 230 20
6 266 30
7 267 30
As you're mentioning joins, you could also do a rolling join via data.table with the argument roll = T which would look for same or closest value preceding var1 in your df:
library(data.table)
# Rolling join: for each df$var1 take the lookup row with the largest
# var1 at or preceding it. Use TRUE, not the reassignable shorthand T.
setDT(value_lookup)[setDT(df), on = 'var1', roll = TRUE]
You can use cut:
# cut() bins var1 into [0,200), [200,230), [230,260), [260,Inf); the
# resulting factor's integer codes line up with the lookup rows, so the
# factor can index value_lookup$value directly.
# (Uses the full spelling right = FALSE instead of the reassignable F.)
df$value <- value_lookup$value[cut(df$var1,
                                   c(value_lookup$var1, Inf),
                                   right = FALSE)]
# # A tibble: 7 x 2
# var1 value
# <dbl> <dbl>
# 1 0 0
# 2 10 0
# 3 20 0
# 4 210 10
# 5 230 20
# 6 266 30
# 7 267 30
I have the following data frame in R. For this experiment I was testing the survival of cells at several times with 2 treatments, and 2 replicates for each treatment. I want to calculate the percentage of cells alive at each time for each treatment/replicate.
For example, for Treat 1 Rep 1 it would be 500/500, 470/500, 100/500, 20/500, for Treat 2 Rep 1 it would be 430/430, 420/430, 300/430, 100/430
Thanks!
# Cell-survival data: 2 treatments x 2 replicates, counts at 4 times.
x <- data.frame("treatment"= c(1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2),
"rep"=c(1,1,1,1,2,2,2,2,1,1,1,1,2,2,2,2),
"Time" = c(0, 30, 60, 180, 0, 30, 60, 180, 0, 30, 60, 180,0, 30, 60, 180 ),
"cells_alive" = c(500, 470, 100, 20, 476, 310, 99, 2, 430, 420, 300, 100, 489, 451, 289, 4))
We can group by 'treatment', 'rep', calculate the 'prop'ortion by dividing the 'cells_alive' with the value of 'cells_alive' that correspond to 'Time' as 0
library(dplyr)
# Survival proportion: divide each count by the group's baseline count,
# i.e. the cells_alive value recorded at Time == 0 within treatment/rep.
x1 <- x %>%
  group_by(treatment, rep) %>%
  mutate(prop = cells_alive / cells_alive[Time == 0])
Output:
x1
# A tibble: 16 x 5
# Groups: treatment, rep [4]
# treatment rep Time cells_alive prop
# <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 1 0 500 1
# 2 1 1 30 470 0.94
# 3 1 1 60 100 0.2
# 4 1 1 180 20 0.04
# 5 1 2 0 476 1
# 6 1 2 30 310 0.651
# 7 1 2 60 99 0.208
# 8 1 2 180 2 0.00420
# 9 2 1 0 430 1
#10 2 1 30 420 0.977
#11 2 1 60 300 0.698
#12 2 1 180 100 0.233
#13 2 2 0 489 1
#14 2 2 30 451 0.922
#15 2 2 60 289 0.591
#16 2 2 180 4 0.00818
Or with match
# Same idea with match(): match(0, Time) gives the position of the
# first Time == 0 row within each group.
x %>%
group_by(treatment, rep) %>%
mutate(prop = cells_alive/cells_alive[match(0, Time)])
if the 'Time' is already ordered
# Or, relying on rows already being sorted by Time within each group,
# take the group's first count as the baseline.
x %>%
group_by(treatment, rep) %>%
mutate(prop = cells_alive/first(cells_alive))
Having a rough time approaching this problem with a large dataset. Essentially there are multiple rows for the same item. However, only one of the items contains the required value. I need to copy that value to all matching items.
Eg. below, I need item 100 to have a cost of 1203 for every row.
# Example data: one non-zero cost per item; the other rows hold 0.
# (Uses `<-` for assignment rather than `=`.)
df <- data.frame("item" = c(100, 100, 100, 105, 105, 102, 102, 102),
                 "cost" = c(1203, 0, 0, 66, 0, 1200, 0, 0))
> df
item cost
1 100 1203
2 100 0
3 100 0
4 105 66
5 105 0
6 102 1200
7 102 0
8 102 0
Like so:
# Desired result: every row of an item carries that item's cost.
# (Uses `<-` for assignment rather than `=`.)
df_wanted <- data.frame("item" = c(100, 100, 100, 105, 105, 102, 102, 102),
                        "cost" = c(1203, 1203, 1203, 66, 66, 1200, 1200, 1200))
> df_wanted
item cost
1 100 1203
2 100 1203
3 100 1203
4 105 66
5 105 66
6 102 1200
7 102 1200
8 102 1200
Below is my attempt at I think an inefficient method:
# Asker's attempt: forward-fill zero costs from the previous row.
# NOTE(review): 1:length(df$cost) should be seq_len(nrow(df)); this also
# errors if row 1 has cost 0 (df$cost[0] is a length-zero replacement)
# and can leak a cost across item boundaries when an item's first row
# is 0 — confirm against the grouped solutions below.
for (row in 1:length(df$cost)){
if (df$cost[row] == 0){
df$cost[row] = df$cost[row-1]
}
}
Here is one option: after grouping by 'item', subset 'cost' to its non-zero entries and select the first element.
library(dplyr)
# Per item, replace every cost with the first non-zero cost in the
# group. (Fixed: the original was missing the closing "]" in
# cost[cost != 0], which is a syntax error.)
df %>%
  group_by(item) %>%
  mutate(cost = first(cost[cost != 0]))
# A tibble: 8 x 2
# Groups: item [3]
# item cost
# <dbl> <dbl>
#1 100 1203
#2 100 1203
#3 100 1203
#4 105 66
#5 105 66
#6 102 1200
#7 102 1200
#8 102 1200
Looks like you want to group by item and then replace 0 in cost with the last non-zero value. In each group, cummax(which(cost != 0)) will give the index of the last non-zero value.
library(dplyr)
# Per item, index cost by cummax(which(cost != 0)): which() lists the
# non-zero positions and cummax() makes that sequence non-decreasing,
# so each selected element is the most recent non-zero cost.
# NOTE(review): this produces one value per NON-ZERO entry and relies
# on mutate() recycling it to the group size (true here, where each
# item has a single non-zero); a group whose non-zero count does not
# recycle into its row count would error — verify on real data.
df %>%
group_by(item) %>%
mutate(cost = cost[cummax(which(cost != 0))]) %>%
ungroup()
## A tibble: 8 x 2
# item cost
# <dbl> <dbl>
#1 100 1203
#2 100 1203
#3 100 1203
#4 105 66
#5 105 66
#6 102 1200
#7 102 1200
#8 102 1200
Base R equivalent is
# Base R equivalent: ave() applies the same fill function per item.
transform(df, cost = ave(cost, item, FUN = function(x) x[cummax(which(x != 0))]))
What I ended up going with after revisiting this problem as a left_join(). Which makes more sense to me intuitively though it may not be the best solution.
The original DF below.
# Revised example: the non-zero cost is not always the group's first
# row, and item is a factor. (Uses `<-` for assignment rather than `=`.)
df <- tibble("item" = as.factor(c(100, 100, 100, 105, 105, 102, 102, 102)),
             "cost" = c(1203, 0, 0, 66, 0, 0, 1200, 0))
> df
# A tibble: 8 x 2
item cost
<fct> <dbl>
1 100 1203
2 100 0
3 100 0
4 105 66
5 105 0
6 102 0
7 102 1200
8 102 0
Create an 'index' of item-value pairs
# One row per item: sort so the largest cost comes first, keep row 1.
df_index <- df %>%
  group_by(item) %>%
  arrange(desc(cost)) %>%
  slice(1)
> df_index
# A tibble: 3 x 2
# Groups: item [3]
item cost
<fct> <dbl>
1 100 1203
2 102 1200
3 105 66
Finally, join the dataframes by item to fill in the empty row values.
# Attach the per-item cost back onto every row of the original data.
df_joined <- left_join(df, df_index, by = "item")
> df_joined
# A tibble: 8 x 3
item cost.x cost.y
<fct> <dbl> <dbl>
1 100 1203 1203
2 100 0 1203
3 100 0 1203
4 105 66 66
5 105 0 66
6 102 0 1200
7 102 1200 1200
8 102 0 1200
I am giving a data set called ChickWeight. This has the weights of chicks over a time period. I need to introduce a new variable that measures the current weight difference compared to day 0. The data set is in library(datasets) so you should have it.
library(dplyr)
# Asker's attempt: keep chicks measured through day 21, then subtract
# each group's first weight.
# NOTE(review): first(weight) here is already evaluated per Chick
# because of group_by(), so it should subtract each chick's own day-0
# weight; the reported "always subtracts 42" behavior presumably came
# from running mutate() without the grouping — verify.
weightgain <- ChickWeight %>%
group_by(Chick) %>%
filter(any(Time == 21)) %>%
mutate(weightgain = weight - first(weight))
I have this code, but this code just subtracts each weight by 42 which is the weight at time 0 for chick 1. I need each chick to be subtracted by its own weight at time 0 so that the weightgain column is correct.
We could do
library(dplyr)
# Baseline per chick: the weight recorded at Time == 0; subtract it from
# every weight in that chick's group.
ChickWeight %>%
group_by(Chick) %>%
mutate(weightgain = weight - weight[Time == 0])
#Or mutate(weightgain = weight - first(weight))
# A tibble: 578 x 5
# Groups: Chick [50]
# weight Time Chick Diet weightgain
# <dbl> <dbl> <ord> <fct> <dbl>
# 1 42 0 1 1 0
# 2 51 2 1 1 9
# 3 59 4 1 1 17
# 4 64 6 1 1 22
# 5 76 8 1 1 34
# 6 93 10 1 1 51
# 7 106 12 1 1 64
# 8 125 14 1 1 83
# 9 149 16 1 1 107
#10 171 18 1 1 129
# … with 568 more rows
Or using base R ave
# Base R: ave() applies the function per Chick, subtracting each
# group's first (day-0) weight from every element.
ave(ChickWeight$weight, ChickWeight$Chick, FUN = function(w) w - w[1])
Question:
I am using dplyr to do data analysis in R, and I come across the following problem.
My data frame is like this:
item day val
1 A 1 90
2 A 2 100
3 A 3 110
4 A 5 80
5 A 8 70
6 B 1 75
7 B 3 65
The data frame is already arranged in item, day. Now I want to mutate a new column, with each row being the smallest value of the same group AND having the day to be within the next 2 days.
For the example above, I want the resulting data frame to be:
item day val output
1 A 1 90 100 # the smaller of 100 and 110
2 A 2 100 110 # the only value within 2 days
3 A 3 110 80 # the only value within 2 days
4 A 5 80 NA # there is no data within 2 days
5 A 8 70 NA # there is no data within 2 days
6 B 1 75 65 # the only value within 2 days
7 B 3 65 NA # there is no data within 2 days
I understand that I will probably use group_by and mutate, but how to write the inside function in order to achieve my desired result?
Any help is greatly appreciated. Let me know if you need me to clarify anything. Thank you!
Try this:
# For each row, look at the next one and two observations within the
# item (data assumed sorted by item then day), keep their values only
# when they fall within the next 2 days, then take the row-wise minimum.
df %>%
# arrange(item, day) %>% # if not already arranged
# take note of the next two values & corresponding difference in days
group_by(item) %>%
mutate(val.1 = lead(val),
day.1 = lead(day) - day,
val.2 = lead(val, 2),
day.2 = lead(day, 2) - day) %>%
ungroup() %>%
# if the value is associated with a day more than 2 days away, change it to NA
mutate(val.1 = ifelse(day.1 %in% c(1, 2), val.1, NA),
val.2 = ifelse(day.2 %in% c(1, 2), val.2, NA)) %>%
# calculate output normally
# grouping by (item, day) makes each group a single row, so min() acts
# row-wise; an all-NA pair with na.rm = TRUE yields Inf (cleaned below)
group_by(item, day) %>%
mutate(output = min(val.1, val.2, na.rm = TRUE)) %>%
ungroup() %>%
# arrange results
select(item, day, val, output) %>%
# map the Inf produced by empty minima back to NA
mutate(output = ifelse(output == Inf, NA, output)) %>%
arrange(item, day)
# A tibble: 7 x 4
item day val output
<fctr> <int> <int> <dbl>
1 A 1 90 100
2 A 2 100 110
3 A 3 110 80.0
4 A 5 80 NA
5 A 8 70 NA
6 B 1 75 65.0
7 B 3 65 NA
Data:
# Reproducible input for the example above.
df <- read.table(text = " item day val
1 A 1 90
2 A 2 100
3 A 3 110
4 A 5 80
5 A 8 70
6 B 1 75
7 B 3 65", header = TRUE)
We can use complete from the tidyr package to complete the dataset by day, and then use lead from dplyr and rollapply from zoo to find the minimum of the next two days.
library(dplyr)
library(tidyr)
library(zoo)
# Fill in the missing days so "next two rows" equals "next two days",
# take a left-aligned rolling minimum over the led values, then drop
# the padding rows again.
DF2 <- DF %>%
group_by(item) %>%
complete(day = full_seq(day, period = 1)) %>%
# lead(val) shifts so each window covers the NEXT two days; an all-NA
# window with na.rm = TRUE yields Inf, which is mapped to NA below
mutate(output = rollapply(lead(val), width = 2, FUN = min, na.rm = TRUE,
fill = NA, align = "left")) %>%
drop_na(val) %>%
ungroup() %>%
mutate(output = ifelse(output == Inf, NA, output))
DF2
# # A tibble: 7 x 4
# item day val output
# <chr> <dbl> <int> <dbl>
# 1 A 1.00 90 100
# 2 A 2.00 100 110
# 3 A 3.00 110 80.0
# 4 A 5.00 80 NA
# 5 A 8.00 70 NA
# 6 B 1.00 75 65.0
# 7 B 3.00 65 NA
DATA
# Reproducible input (character item column, no factor conversion).
DF <- read.table(text = "item day val
1 A 1 90
2 A 2 100
3 A 3 110
4 A 5 80
5 A 8 70
6 B 1 75
7 B 3 65",
header = TRUE, stringsAsFactors = FALSE)
We'll create a dataset with modified day, so we can left join it on the original dataset, keeping only minimum value.
# Shift two copies of the data back by 1 and 2 days and join them onto
# the original rows (default join keys: item and day), so each row
# picks up the values recorded on day+1 and day+2; then keep the
# minimum candidate per original (item, day, val) row.
df %>%
left_join(
bind_rows(mutate(.,day=day-1),mutate(.,day=day-2)) %>% rename(output=val)) %>%
# NOTE(review): summarize_at() is superseded; in current dplyr this
# would be summarise(output = min(output)).
group_by(item,day,val) %>%
summarize_at("output",min) %>%
ungroup
# # A tibble: 7 x 4
# item day val output
# <fctr> <dbl> <int> <dbl>
# 1 A 1 90 100
# 2 A 2 100 110
# 3 A 3 110 80
# 4 A 5 80 NA
# 5 A 8 70 NA
# 6 B 1 75 65
# 7 B 3 65 NA
data
# Reproducible input for the example above.
df <- read.table(text = " item day val
1 A 1 90
2 A 2 100
3 A 3 110
4 A 5 80
5 A 8 70
6 B 1 75
7 B 3 65", header = TRUE)