Forming a new variable based on previous row value - r

My data is of the following form
structure(list(Flag = c(1, 0, 0, 1, 0, 0, 1, 0), variable = c(3,
8, 6, 7, 1, 4, 3, 6), sale = c(26, 27, 61, 38, 79, 87, 81, 13
)), .Names = c("Flag", "variable", "sale"), row.names = c(NA,
-8L), class = "data.frame")
And I want to create output as follows
structure(list(Flag = c(1, 0, 0, 1, 0, 0, 1, 0), variable = c(3,
8, 6, 7, 1, 4, 3, 6), sale = c(26, 27, 61, 38, 79, 87, 81, 13
), begin = c(3, -23, -50, 7, -31, -70, 3, -78), end = c(-23,
-50, -111, -31, -70, -151, -78, -91)), .Names = c("Flag", "variable",
"sale", "begin", "end"), row.names = c(NA, -8L), class = "data.frame")
where the ne column begin and end are based on the following algorathim
if flag=1 then
begin=variable;
end=variable-sale;
----------
else
begin=lag(end) ( i.e the previous value of end variable)
end= lag(end)-sale
What I want is when flag is 1 the value of "begin" is equalt to "variable" value and "end" value is "variable-sale" value.
Where as for the others the value of begin is the previous row "end" value and "end" value is (begin-sales) value
Can anyone help me how to write achieve this in R?

I think the example output you provide is incorrect, but I would try the following:
beginEnd <- by(indf, cumsum(indf$Flag), FUN = function(x) {
out <- Reduce("-", c(x[, "variable"][1], x[, "sale"]), accumulate = TRUE)
cbind(begin = head(out, -1),
end = tail(out, -1))
})
cbind(indf, do.call(rbind, beginEnd))
# Flag variable sale begin end
# 1 1 3 26 3 -23
# 2 0 8 27 -23 -50
# 3 0 6 61 -50 -111
# 4 1 7 38 7 -31
# 5 0 1 79 -31 -110
# 6 0 4 87 -110 -197
# 7 1 3 81 3 -78
# 8 0 6 13 -78 -91

Related

Mix "color_bar" and "style" in formattable package

I'm using formattable package and I want to personalize my table but I can't in the way I want.
Here is my table
structure(list(PJ = c(4, 4, 4, 4, 4, 4), V = c(4, 2, 2, 2, 1,
1), E = c(0, 0, 0, 0, 0, 0), D = c(0, 2, 2, 2, 3, 3), GF = c(182,
91, 92, 185, 126, 119), GC = c(84, 143, 144, 115, 141, 168),
Dif = c(98, -52, -52, 70, -15, -49), Pts = c(12, 6, 6, 6,
3, 3)), class = "data.frame", row.names = c("Player1", "Player2",
"Player3", "Player4", "Player5", "Player6"))
It looks like this:
PJ V E D GF GC Dif Pts
Player1 4 4 0 0 182 84 98 12
Player2 4 2 0 2 91 143 -52 6
Player3 4 2 0 2 92 144 -52 6
Player4 4 2 0 2 185 115 70 6
Player5 4 1 0 3 126 141 -15 3
Player6 4 1 0 3 119 168 -49 3
If I want the column GF in bold, I use
formattable(TAB.df, list(
GF = formatter("span",style = style("font.weight"="bold"))
))
If I want a color_bar I run this code:
formattable(TAB.df, list(
GF = color_bar("lightgreen")
))
Nevertheless, I don't know how to combine them and get the "color_bar" with "bold" numbers.

Return new dataframe with columns created inside a function in R with user-given names

Please see my code below:
# functions to get percentile threshold, and assign new values to outliers
get_low_perc <- function(var_name) {
return(quantile(var_name, c(0.01)))
}
get_hi_perc <- function(var_name) {
return(quantile(var_name, c(0.99)))
}
round_up <- function(target_var, flag_var, floor) {
target_var <- as.numeric(ifelse(flag_var == 1, floor, target_var))
return(as.integer(target_var))
}
round_down <- function(target_var, flag_var, ceiling) {
target_var <- as.numeric(ifelse(flag_var == 1, ceiling, target_var))
return(as.integer(target_var))
}
# try putting it all together
no_way <- function(df, df_col_name, df_col_flagH, df_col_flagL) {
lo_perc <- get_low_perc(df_col_name)
hi_perc <- get_hi_perc(df_col_name)
df$df_col_flagH <- as.factor(ifelse(df_col_name < lo_perc, 1, 0))
df$df_col_flagL <- as.factor(ifelse(df_col_name > hi_perc, 1, 0))
df_col_name <- round_up(df_col_name, df_col_flagL, lo_perc)
df_col_name <- round_down(df_col_name, df_col_flagH, hi_perc)
# names(df)[names(df)=='df_col_flagH'] <-
# boxplot(df_col_name)
return(df)
}
I have created 5 custom functions; the first two respectively get the 1th percentile and the 99th percentile of a given variable. The last two round the values in these variables up or down depending on how far away they are from the 1st percentile and the 99th percentile values. The last function is trying to put all these functions together to essentially output a new dataframe containing the same columns in the original df, the updated column, and two new columns indicating values that were flagged as below the 1st percentile and above the 99th percentile. I have produced a mock dataframe below, since I can't seem to pass some of my data here.
df2 = data.frame(col = c(1, 3, 4, 5, 8, 7, 67, 744, 876, 8, 8, 54, 9),
col1 = c(9, 6, 8, 3, 4, 5, 8, 7, 67, 744, 87, 33, 77),
col2 = c(8, 2, 8, 4, 87, 66, 54, 99, 77, 77, 88, 67, 102))
Ideally, after I call the function using the command "no_way(df2, df2$col1, df2$new_col1, df2$new_col2)", I want an output dataframe looking like:
df2 = data.frame(col = c(1, 3, 4, 5, 8, 7, 67, 744, 876, 8, 8, 54, 9),
col1 = c(9, 6, 8, 3, 4, 5, 8, 7, 67, 744, 87, 33, 77), # updated with appropriate values
col2 = c(8, 2, 8, 4, 87, 66, 54, 99, 77, 77, 88, 67, 102),
new_col1 = c(0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0),
new_col2 = c(0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0))
^ Where new_col1 and new_col2 are column names given by the user when calling the function. I am currently getting the dataframe as expected, but the new columns created have kept the function parameters' names, as in:
df2 = data.frame(col = c(1, 3, 4, 5, 8, 7, 67, 744, 876, 8, 8, 54, 9),
col1 = c(9, 6, 8, 3, 4, 5, 8, 7, 67, 744, 87, 33, 77), # updated with appropriate values
col2 = c(8, 2, 8, 4, 87, 66, 54, 99, 77, 77, 88, 67, 102),
df_col_flagH = c(0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0),
df_col_flagL = c(0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0))
I would not mind changing the name of the columns afterwards, but I will be using this function of 17 columns therefore that wouldn't be optimal. Please help.
You should pass new column names as string.
Also ifelse(condition, 1, 0) can be simplified to as.integer(condition).
no_way <- function(df, df_col_name, df_col_flagH, df_col_flagL) {
lo_perc <- get_low_perc(df[[df_col_name]])
hi_perc <- get_hi_perc(df[[df_col_name]])
df[[df_col_flagH]] <- as.factor(as.integer(df[[df_col_name]] < lo_perc))
df[[df_col_flagL]] <- as.factor(as.integer(df[[df_col_name]] > hi_perc))
df[[df_col_name]] <- round_up(df[[df_col_name]], df_col_flagL, lo_perc)
df[[df_col_name]] <- round_down(df[[df_col_name]], df_col_flagH, hi_perc)
return(df)
}
df2 <- no_way(df2, "col1", "new_col1", "new_col2")
df2
# col col1 col2 new_col1 new_col2
#1 1 9 8 0 0
#2 3 9 2 0 0
#3 4 9 8 0 0
#4 5 9 4 1 0
#5 8 9 87 0 0
#6 7 9 66 0 0
#7 67 9 54 0 0
#8 744 9 99 0 0
#9 876 9 77 0 0
#10 8 9 77 0 1
#11 8 9 88 0 0
#12 54 9 67 0 0
#13 9 9 102 0 0

if previous value in one column and next value in another column meet a condition, add 1 into another column using r

I have data like this
structure(list(id = c(1, 1, 2, 2, 2), time = c(1834, 4809, 18,
333, 387), nh_source = c(0, 0, 1, 0, 0), admi_source = c(19,
19, 85, 19, 88), disdest = c(85, 29, 56, 85, 39)), class = "data.frame", row.names = c(NA,
-5L))
and I want to group the ids and check if the previous value in column disdest is 56 or 85 and the next value in column admisorc is 19, then add 1 to column nh_source column.I want the df to look like this
structure(list(id2 = c(1, 1, 2, 2, 2), time = c(1834, 4809, 18,
333, 387), nh_source2 = c(0, 1, 1, 1, 0), admi_source = c(19,
19, 85, 19, 88), disdest = c(85, 29, 56, 85, 39)), class = "data.frame", row.names = c(NA,
-5L))
Create the logical condition with lag after grouping by 'id' and add it to the 'nh_source' (TRUE -> 1 and FALSE -> 0)
library(dplyr)
df1 %>%
group_by(id) %>%
mutate(nh_source = nh_source +
(admi_source == 19 & lag(disdest) %in% c(56, 85))) %>%
ungroup
-output
# A tibble: 5 x 5
id time nh_source admi_source disdest
1 1 1834 0 19 85
2 1 4809 1 19 29
3 2 18 1 85 56
4 2 333 1 19 85
5 2 387 0 88 39

Creating a column based on filtering two data frames of different lengths using R

I got two data sets of different lengths. I want to create a new column in the dataset which got more rows based on filtering a specific column from the shorter df. I am getting a waring " Longer object length is not a multiple of shorter object length". And the result is also not correct. I tried to created a smaller example datasets and tried the same code and its working with correct results. I am not sure why on my original data the results are not correct and I am getting the warning.
The example datasets are
structure(list(id = 1:10, activity = c(0, 0, 0, 0, 1, 0, 0, 1,
0, 0), code = c(2, 5, 11, 15, 3, 18, 21, 3, 27, 55)), class = "data.frame", row.names = c(NA,
-10L))
the second df
structure(list(id2 = 1:20, code2 = c(2, 5, 11, 15, 9, 18, 21,
3, 27, 55, 2, 5, 11, 15, 3, 18, 21, 3, 27, 55), d_Activity = c(0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0)), class = "data.frame", row.names = c(NA,
-20L))
I tried this on both my original datasets where I get the warning and these dummy dfs where no warning and correct results.
data2 <- data2 %>%
mutate(d_Activity = ifelse(code2 %in% data1$code & activity == 1, 1,0))
Actually, you are doing it wrong way. Let me explain-
In sample data it is working because larger df have rows (20) which is multiple of rows in smaller df (10).
So in you syntax what you are doing is, to check one complete vector with another complete vector (column of another df), because R normally works in vectorised way of operations.
the correct way of matching one to many is through purrr::map where each individual value in first argument (code2 here) operates with another vector i.e. df1$code which is not in argument of map.
df1 <- structure(list(id = 1:10, activity = c(0, 0, 0, 0, 1, 0, 0, 1,
0, 0), code = c(2, 5, 11, 15, 3, 18, 21, 3, 27, 55)), class = "data.frame", row.names = c(NA,
-10L))
df2 <- structure(list(id2 = 1:20, code2 = c(2, 5, 11, 15, 9, 18, 21,
3, 27, 55, 2, 5, 11, 15, 3, 18, 21, 3, 27, 55), d_Activity = c(0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0)), class = "data.frame", row.names = c(NA,
-20L))
library(tidyverse)
df2 %>%
mutate(d_Activity = map(code2, ~ +(.x %in% df1$code[df1$activity == 1])))
#> id2 code2 d_Activity
#> 1 1 2 0
#> 2 2 5 0
#> 3 3 11 0
#> 4 4 15 0
#> 5 5 9 0
#> 6 6 18 0
#> 7 7 21 0
#> 8 8 3 1
#> 9 9 27 0
#> 10 10 55 0
#> 11 11 2 0
#> 12 12 5 0
#> 13 13 11 0
#> 14 14 15 0
#> 15 15 3 1
#> 16 16 18 0
#> 17 17 21 0
#> 18 18 3 1
#> 19 19 27 0
#> 20 20 55 0
Created on 2021-06-17 by the reprex package (v2.0.0)

Find maximum in a group, subset by a subset from a different dataframe, to select other value's

I have two data.frames df1 with raw data. df2 has information on where to look in df1.
df1 has groups, defined by "id". In those groups, a subset is defined by df2$value_a1 and df2$value_a2, which represent the range of rows to look in the group. In that subsetgroup I want to find the maximum value_a, to select value_b.
code for df1 and df2
df1 <- data.frame("id" = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3), "value_a" = c(0, 10, 21, 30, 43, 53, 69, 81, 93, 5, 16, 27, 33, 45, 61, 75, 90, 2, 11, 16, 24, 31, 40, 47, 60, 75, 88), "value_b" = c(100, 101, 100, 95, 90, 104, 88, 84, 75, 110, 105, 106, 104, 95, 109, 96, 89, 104, 104, 104, 103, 106, 103, 101, 99, 98, 97), "value_c" = c(0, -1, -2, -2, -2, -2, -1, -1, 0, 0, 0, 0, 1, 1, 2, 2, 1, -1, 0, 0, 1, 1, 2, 2, 1, 1, 0), "value_d" = c(1:27))
df2 <- data.frame("id" = c(1, 2, 3), "value_a1" = c(21, 33, 16), "value_a2" = c(69, 75, 60))
This is df1
id value_a value_b value_c value_d
1 1 0 100 0 1
2 1 10 101 -1 2
3 1 21 100 -2 3
4 1 30 95 -2 4
5 1 43 90 -2 5
6 1 53 104 -2 6
7 1 69 88 -1 7
8 1 81 84 -1 8
9 1 93 75 0 9
10 2 5 110 0 10
11 2 16 105 0 11
12 2 27 106 0 12
13 2 33 104 1 13
14 2 45 95 1 14
15 2 61 109 2 15
16 2 75 96 2 16
17 2 90 89 1 17
18 3 2 104 -1 18
19 3 11 104 0 19
20 3 16 104 0 20
21 3 24 103 1 21
22 3 31 106 1 22
23 3 40 103 2 23
24 3 47 101 2 24
25 3 60 99 1 25
26 3 75 98 1 26
27 3 88 97 0 27
This is df2
id value_a1 value_a2
1 1 21 69
2 2 33 75
3 3 16 60
My result would be df3, which would look like this
id value_a value_c
1 1 53 -2
2 2 61 2
3 3 31 1
I wrote this code to show my line of thinking.
df3 <- df1 %>%
group_by(id) %>%
filter(value_a >= df2$value_a1 & value_a <= df2$value_a2) %>%
filter(value_a == max(value_a)) %>%
pull(value_b)
This however generates a value with three entry's:
[1] 88 95 99
These are not the maximum value_b's...
Perhaps by() would work, but this gets stuck on using a function on two different df's.
It feels like I'm almost there, but still far away...
You can try this. I hope this helps.
df1 %>% left_join(df2) %>% mutate(val=ifelse(value_a>value_a1 & value_a<value_a2,value_b,NA)) %>%
group_by(id) %>% summarise(val=max(val,na.rm=T))
# A tibble: 3 x 2
id val
<dbl> <dbl>
1 1 104
2 2 109
3 3 106
Why don't you try a merge?
Then with data.table syntax:
library(data.table)
df3 <- merge(df1, df2, by = "id", all.x = TRUE)
max_values <- df3[value_a > value_a1 & value_a < value_a2, max(value_b), by = "id"]
max_values
# id V1
# 1: 1 104
# 2: 2 109
# 3: 3 106
I would do this using data.table package since is just what I'm used to
library(data.table)
dt.1 <- data.table("id" = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3), "value_a" = c(0, 10, 21, 30, 43, 53, 69, 81, 93, 5, 16, 27, 33, 45, 61, 75, 90, 2, 11, 16, 24, 31, 40, 47, 60, 75, 88), "value_b" = c(100, 101, 100, 95, 90, 104, 88, 84, 75, 110, 105, 106, 104, 95, 109, 96, 89, 104, 104, 104, 103, 106, 103, 101, 99, 98, 97), "value_c" = c(0, -1, -2, -2, -2, -2, -1, -1, 0, 0, 0, 0, 1, 1, 2, 2, 1, -1, 0, 0, 1, 1, 2, 2, 1, 1, 0), "value_d" = c(1:27))
dt.2 <- data.table("id" = c(1, 2, 3), "value_a1" = c(21, 33, 16), "value_a2" = c(69, 75, 60))
dt.3 <- dt.1[id %in% dt.2[,id],max(value_b), by="id"]
setnames(dt.3, "V1", "max_value_b")
dt.3
To get corresponding line where b is the max values there are several ways, here's one where I only modified a line from the previous code
dt.1[id %in% dt.2[,id],.SD[which.max(value_b), .(value_a, value_b, value_c, value_d)], by="id"]
.SD means the sub-table you already selected with by so for each id selects the local max b and then returns a table which.max() selects the row, and finally .() is an alias for list, so lists the columns you wish from that table.
Perhaps a more readable approach is to first select the desired rows
max.b.rows <- dt.1[id %in% dt.2[,id], which.max(value_b), by="id"][,V1]
dt.3 <- dt.1[max.b.rows,]
BTW, the id %in% dt.2[,id] part is just there to make sure you only select maxima for those ids in table 2
Best

Resources