How can I do the following:
replace all values < 6 with NA,
if there is only one NA in the row, replace the first instance of the minimum value with -99?
Some data that includes an ID variable and a total column:
library(tidyverse)
df <- data.frame(id = c(1,2,3,4,5), a = c(10,12,4,17,3), b = c(9,12,3,20,6), c = c(2,2,10,10,10), d = c(12,16,12,10,12))
df$total <- apply(df[,c(2:5)], 1, sum)
Giving
id a b c d total
1 10 9 2 12 33
2 12 12 2 16 42
3 4 3 10 12 29
4 17 20 10 10 57
5 3 6 10 12 31
My desired output is
id a b c d total
1 10 -99 NA 12 33
2 -99 12 NA 16 42
3 NA NA 10 12 29
4 17 20 10 10 57
5 NA -99 10 12 31
My attempt
df_mod <- df %>%
# Make <6 NA
mutate(
across(
.cols = 'a':'total',
~case_when(
.x < 6 ~ as.numeric(NA),
TRUE ~ .x
)
)
) %>%
# Add a count of NAs
rowwise() %>%
mutate(Count_NA = sum(is.na(cur_data()))) %>%
ungroup()
# Transpose and get row minimum
df_mod2 <- t(df_mod[,-c(1,ncol(df_mod))]) %>%
apply(., 2, function(a){
min <- min(a, na.rm = TRUE)
}
) %>%
cbind(df_mod, .) %>%
rename(., min = .) %>%
tibble(.)
# If count_NA = 1 replace the first instance of min
df_mod2 %>%
rowwise() %>%
mutate(
across(
.cols = 'a':'total',
~case_when(
Count_NA == 1 & .x == min ~ replace(.x, first(match(min, .x)), -99),
TRUE ~ .x)
)
) %>%
select(-'Count_NA', -'min')
Which gives the following
id a b c d total
1 10 -99 NA 12 33
2 -99 -99 NA 16 42
3 NA NA 10 12 29
4 17 20 10 10 57
5 NA -99 10 12 31
Thanks
If you're willing to pivot rather than work rowwise, then this solution will work.
library(dplyr)
df %>%
pivot_longer(names_to = 'col',
values_to = 'val',
-c(id, total)) %>%
group_by(id) %>%
mutate(val2 = rank(val, ties.method = 'first'),
val = ifelse(val < 6, NA , val),
val = ifelse(sum(is.na(val)) == 1 & val2 == 2, -99, val)) %>%
select(-val2) %>%
pivot_wider(names_from = col,
values_from = val) %>%
relocate(total, .after = "d")
Here's the result:
# A tibble: 5 × 6
# Groups: id [5]
id a b c d total
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 10 -99 NA 12 33
2 2 -99 12 NA 16 42
3 3 NA NA 10 12 29
4 4 17 20 10 10 57
5 5 NA -99 10 12 31
It is not clear what you mean by '2nd' minimum value because you replace minimum value. You can use data.table:
library(data.table)
setDT(df)[
,
(cols) := transpose(
lapply(
transpose(lapply(.SD, function(x) fifelse(x < 6, NA_real_, x))),
function(x) if(sum(is.na(x)) == 1) replace(x, which.min(x), -99) else x
)
),
.SDcols = setdiff(names(df), "id")
]
Related
I want to create a variable that counts down a given number of steps before certain value and counts up a given number steps after the value.
In the example below, I want to have a counters before and after the grp == "b". When grp == "b" the value should be 0, and before and after the counters it should be 100.
To do this I can use case_when function as follows:
library(dplyr)
n<-10;k<-3
test_df<-data.frame(id=1:(k*n),grp=rep(letters[1:k],each=n))
before=5;after=3;index=2
test_df2<-test_df %>%
mutate(before_after=
case_when(
grp == letters[index] ~ 0
,dplyr::lag(grp,1) == letters[index] ~ 1
,dplyr::lag(grp,2) == letters[index] ~ 2
,dplyr::lag(grp,3) == letters[index] ~ 3
,dplyr::lead(grp,1) == letters[index] ~ -1
,dplyr::lead(grp,2) == letters[index] ~ -2
,dplyr::lead(grp,3) == letters[index] ~ -3
,dplyr::lead(grp,4) == letters[index] ~ -4
,dplyr::lead(grp,5) == letters[index] ~ -5
,TRUE~100
)
)
The problem is the variables "before" and "after" are not static! So I should make a function to do it as follows:
b_a<-function(before=15,after=6,others=100,grp_f=letters[2]){
}
How to make a function to do it?
Edit:
I created a function but if there us a better and easy solution please feel free to add an answer:
b_a<-unction(test_df,before=15,after=6,others=100,grp_f=letters[2]){
ltr<-grp_f
test_df2<- test_df %>%
group_by(grp) %>%
dplyr::mutate( fr=ifelse(grp==ltr,dplyr::first(id),0 )) %>%
ungroup() %>%
dplyr::mutate( fr=max(fr)) %>%
mutate(fr=-(fr-id)) %>%
mutate(fr=ifelse(fr<0,fr,0 ))
test_df2<- test_df2 %>%
group_by(grp) %>%
dplyr::mutate( lst=ifelse(grp==ltr,dplyr::last(id),0 )) %>%
ungroup() %>%
dplyr::mutate( lst=max(lst)) %>%
mutate(lst=-(lst-id)) %>%
mutate(lst=ifelse(lst>0,lst,0 ))
test_df2<- test_df2 %>% ungroup() %>%
mutate(before_after=lst+fr)
test_df2 %>% mutate(ifelse(before_after >= -before &
before_after <=after,before_after,others))
test_df2
}
This may help:
test_df3<-test_df %>%
group_by(grp) %>%
mutate(is_group_b=ifelse(grp==letters[index],1,0)) %>%
ungroup() %>%
mutate(nr=row_number()) %>%
mutate(nr=nr*is_group_b) %>%
mutate(max_nr=max(nr))%>%
mutate(nr=ifelse(nr==0,NA,nr)) %>%
mutate(min_nr=min(nr,na.rm = TRUE)) %>%
mutate(nr=ifelse(is.na(nr),0,nr)) %>%
mutate(vl=(row_number()-min_nr)) %>%
mutate(vl=vl*(1-is_group_b)) %>%
mutate(vl=if_else(vl>0,row_number()-max_nr,vl))
Try this:
library(dplyr)
test_df %>%
group_by(grp) %>%
mutate(value = ifelse(grp == letters[index-1] & row_number() <= 5, 100, (before:1)*-1),
value1 = ifelse(grp == letters[index+1] & row_number() > 3, 100, 1:after)) %>%
mutate(value = case_when(grp == letters[index] ~ 0,
grp == letters[index-1] ~ value,
grp == letters[index+1] ~ value1), .keep="unused") %>%
print(n=50)
id grp value
<int> <chr> <dbl>
1 1 a 100
2 2 a 100
3 3 a 100
4 4 a 100
5 5 a 100
6 6 a -5
7 7 a -4
8 8 a -3
9 9 a -2
10 10 a -1
11 11 b 0
12 12 b 0
13 13 b 0
14 14 b 0
15 15 b 0
16 16 b 0
17 17 b 0
18 18 b 0
19 19 b 0
20 20 b 0
21 21 c 1
22 22 c 2
23 23 c 3
24 24 c 100
25 25 c 100
26 26 c 100
27 27 c 100
28 28 c 100
29 29 c 100
30 30 c 100
Set result column to 100. Get indices of the focal group. Create replacement values (countdown - zeros - countup) of given lengths. Replace result at relevant indices.
d = data.frame(id = 1:12, grp = rep(letters[1:3], each = 4))
before = 2
after = 3
g = "b"
d$x = 100
i = which(d$grp == g)
v = c(-before:-1, rep(0, length(i)), seq(after))
d$x = replace(d$x, i[1] - before + 0:(length(v) - 1), v)
d
# id grp x
# 1 1 a 100
# 2 2 a 100
# 3 3 a -2
# 4 4 a -1
# 5 5 b 0
# 6 6 b 0
# 7 7 b 0
# 8 8 b 0
# 9 9 c 1
# 10 10 c 2
# 11 11 c 3
# 12 12 c 100
Depending on the data, you may want to add a check so that indices of before- and after-sequences are kept within the bounds of the data, e.g. using %in%:
d = data.frame(id = 1:8, grp = rep(letters[1:3], c(1, 5, 2)))
before = 2
after = 3
g = "b"
d$x = 100
i = which(d$grp == g)
v = c(-before:-1, rep(0, length(i)), seq(after))
i2 = i[1] - before + 0:(length(v) - 1)
ok = i2 %in% seq_len(nrow(d)) # <~ check if indices are within range of data
d$x = replace(d$x, i2[ok], v[ok])
d
# id grp x
# 1 1 a -1 # leading countdown truncated
# 2 2 b 0
# 3 3 b 0
# 4 4 b 0
# 5 5 b 0
# 6 6 b 0
# 7 7 c 1
# 8 8 c 2 # trailing counter truncated
I have a dataframe with group, value and columns based in rollaply mean of the last n values, just like that:
library(dplyr); library(zoo)
df = data.frame( group = c(rep(1,5), rep(2,5)),
value = c(23,14,53,12,56,32,65,76,36,74)) %>%
group_by(group) %>%
mutate(
roll1 = rollapplyr(value, 1, mean, fill = NA, na.rm = T, partial = F),
roll2 = rollapplyr(value, 2, mean, fill = NA, na.rm = T, partial = F),
roll3 = rollapplyr(value, 3, mean, fill = NA, na.rm = T, partial = F)
)
df
group value roll1 roll2 roll3
1 1 23 23 NA NA
2 1 14 14 18.5 NA
3 1 53 53 33.5 30
4 1 12 12 32.5 26.3
5 1 56 56 34 40.3
6 2 32 32 NA NA
7 2 65 65 48.5 NA
8 2 76 76 70.5 57.7
9 2 36 36 56 59
10 2 74 74 55 62
The 'rolln' column represents the average of the last n values.
Then I would like to summarize in a new dataframe which group of values provided the highest average. Remembering that the roll3 column, for example, has a set of 3 values.
I tried to use which.max function, but without success. The position of NA's in the final data.frame isn't important
Thanks in advance
I'd love to see a more concise solution, but this seems to work:
library(tidyverse)
df %>%
pivot_longer(starts_with("roll"), values_to = "avg") %>%
filter(!is.na(avg)) %>%
group_by(group, name) %>%
filter(slider::slide_dbl(avg, max, .after = 2) == max(avg)) %>% # EDIT #2
#filter(avg == max(avg) |
# lead(avg, default = 0) == max(avg) |
# lead(avg, 2, default = 0) == max(avg)) %>%
mutate(items = n() + 1 - parse_number(name)) %>% # EDIT
slice(items:n()) %>%
mutate(row = row_number()) %>%
select(-avg, -items) %>%
pivot_wider(names_from = name, values_from = value)
Result
group row roll1 roll2 roll3
<dbl> <int> <dbl> <dbl> <dbl>
1 1 1 56 12 53
2 1 2 NA 56 12
3 1 3 NA NA 56
4 2 1 76 65 76
5 2 2 NA 76 36
6 2 3 NA NA 74
I have a simple data frame consisting of (account) IDs and default rates for five years. Many default rates are missing. The data can be generated as follows:
ID = rep(1:50, each = 5)
def= rnorm(n=250, mean=0.5, sd=0.2)
ind= which(def %in% sample(def, 100))
def[ind] = NA
df = data.frame(ID = ID, Def = def)
And looks like this:
> head(df, 20)
ID Def
1 1 0.39506938
2 1 NA
3 1 0.42946603
4 1 NA
5 1 NA
6 2 0.45125199
7 2 0.40519126
8 2 NA
9 2 0.65082718
10 2 NA
11 3 NA
12 3 0.46132736
13 3 0.06324983
14 3 0.72630862
15 3 0.63996092
16 4 0.72093890
17 4 NA
18 4 NA
19 4 0.61471461
20 4 0.51788498
How can show the ID numbers for which at least 4 of the 5 default rates are not NAs?
You may try:
library(dplyr)
df %>%
group_by(ID) %>%
dplyr::summarize(p = sum(!is.na(Def))/n()) %>%
filter(p >= 0.8) %>% # or > 0.8?
pull(ID)
[1] 2 8 9 11 13 17 20 23 25 27 28 29 33 38 44 45 47 49
If you need to keep the data frame you can do:
library(dplyr)
df |>
group_by(ID) |>
mutate(m = sum(is.na(Def))) |>
filter(m <= 1) |>
select(ID, Def)
EDIT:
Can be further simplified
df |>
group_by(ID) |>
filter(sum(is.na(Def)) <= 1)
Here is an alternative:
library(dplyr)
df %>%
group_by(ID) %>%
mutate(x = 5 - sum(is.na(Def))) %>%
slice(1) %>%
filter(x >=4) %>%
pull(ID)
I have a very simple case where I want to combine several data frames into one based on a common id elements of a particular data frame.
Example:
id <- c(1, 2, 3)
x <- c(10, 12, 14)
data1 <- data.frame(id, x)
id <- c(2, 3)
x <- c(20, 22)
data2 <- data.frame(id, x)
id <- c(1, 3)
x <- c(30, 32)
data3 <- data.frame(id, x)
Which gives us,
$data1
id x
1 1 10
2 2 12
3 3 14
$data2
id x
1 2 20
2 3 22
$data3
id x
1 1 30
2 3 32
Now, I want to combine all three data frames based on the id's of the data3. The expected output should look like
> comb
id x
1 1 10
2 1 NA
3 1 30
4 3 14
5 3 22
6 3 32
I am trying the following, but not getting the expected output.
library(dplyr)
library(tidyr)
combined <- bind_rows(data1, data2, data3, .id = "id") %>% arrange(id)
Any idea how to get the expected output?
Does this work:
library(dplyr)
library(tidyr)
data1 %>% full_join(data2, by = 'id') %>% full_join(data3, by = 'id') %>% arrange(id) %>% right_join(data3, by = 'id') %>%
pivot_longer(cols = -id) %>% select(-name) %>% distinct()
# A tibble: 6 x 2
id value
<dbl> <dbl>
1 1 10
2 1 NA
3 1 30
4 3 14
5 3 22
6 3 32
Combine the 3 dataframes in one list and use filter to select only the id's in 3rd dataframe.
library(dplyr)
library(tidyr)
bind_rows(data1, data2, data3, .id = "new_id") %>%
filter(id %in% id[new_id == 3]) %>%
complete(new_id, id)
# new_id id x
# <chr> <dbl> <dbl>
#1 1 1 10
#2 1 3 14
#3 2 1 NA
#4 2 3 22
#5 3 1 30
#6 3 3 32
A pure base R solution can also make it
lst <- list(data1, data2, data3)
reshape(
subset(
reshape(
do.call(rbind, Map(cbind, lst, grp = seq_along(lst))),
idvar = "id",
timevar = "grp",
direction = "wide"
),
id %in% lst[[3]]$id
),
idvar = "id",
varying = -1,
direction = "long"
)[c("id", "x")]
which gives
id x
1.1 1 10
3.1 3 14
1.2 1 NA
3.2 3 22
1.3 1 30
3.3 3 32
>
Using base R
do.call(rbind, unname(lapply(mget(ls(pattern = "^data\\d+$")), \(x) {
x1 <- subset(x, id %in% data3$id)
v1 <- setdiff(data3$id, x1$id)
if(length(v1) > 0) rbind(x1, cbind(id = v1, x = NA)) else x1
})))
-output
id x
1 1 10
3 3 14
2 3 22
11 1 NA
12 1 30
21 3 32
bind_rows(data1, data2, data3, .id = 'grp')%>%
complete(id, grp)%>%
select(-grp) %>%
filter(id%in%data3$id)
# A tibble: 6 x 2
id x
<dbl> <dbl>
1 1 10
2 1 NA
3 1 30
4 3 14
5 3 22
6 3 32
I am trying to figure out how to sum values belonging to category a and b by factor file, but also keep the original data.
library(dplyr)
df <- data.frame(ID = 1:20, values = runif(20), category = rep(letters[1:5], 4), file = as.factor(sort(rep(1:5, 4))))
ID values category file
1 1 0.65699229 a 1
2 2 0.70506478 b 1
3 3 0.45774178 c 1
4 4 0.71911225 d 1
5 5 0.93467225 e 1
6 6 0.25542882 a 2
7 7 0.46229282 b 2
8 8 0.94001452 c 2
9 9 0.97822643 d 2
10 10 0.11748736 e 2
11 11 0.47499708 a 3
12 12 0.56033275 b 3
13 13 0.90403139 c 3
14 14 0.13871017 d 3
15 15 0.98889173 e 3
16 16 0.94666823 a 4
17 17 0.08243756 b 4
18 18 0.51421178 c 4
19 19 0.39020347 d 4
20 20 0.90573813 e 4
so that
df[1,2] will be added to df[2,2] to category 'ab' for file 1
df[6,2] will be added to df[7,2] to category 'ab' for file 2
etc.
So far I have this:
df %>%
filter(category %in% c('a' , 'b')) %>%
group_by(file) %>%
summarise(values = sum(values))
Problem
I would like to change the category of the summed values to "ab" and append it to the original data frame in the same pipeline.
Desired output:
ID values category file
1 1 0.65699229 a 1
2 2 0.70506478 b 1
3 3 0.45774178 c 1
4 4 0.71911225 d 1
5 5 0.93467225 e 1
6 6 0.25542882 a 2
7 7 0.46229282 b 2
8 8 0.94001452 c 2
9 9 0.97822643 d 2
10 10 0.11748736 e 2
11 11 0.47499708 a 3
12 12 0.56033275 b 3
13 13 0.90403139 c 3
14 14 0.13871017 d 3
15 15 0.98889173 e 3
16 16 0.94666823 a 4
17 17 0.08243756 b 4
18 18 0.51421178 c 4
19 19 0.39020347 d 4
20 20 0.90573813 e 4
21 21 1.25486225 ab 1
22 22 1.87216325 ab 2
23 23 1.36548126 ab 3
This will get you the result
df %>% bind_rows(
df %>%
filter(category %in% c('a' , 'b')) %>%
group_by(file) %>%
mutate(values = sum(values), category = paste0(category,collapse='')) %>%
filter(row_number() == 1 & n() > 1)
) %>% mutate(ID = row_number())
BTW the code pro produce the dataframe in the example is this one:
df <- data.frame(ID = 1:20, values = runif(20), category = rep(letters[1:5], 4), file = as.factor(sort(rep(1:4, 5))))
now lets say you want to sum multiple columns, you need to provide the list in a vector:
cols = c("values") # columns to be sum
df %>% bind_rows(
df %>%
filter(category %in% c('a' , 'b')) %>%
group_by(file) %>%
mutate_at(vars(cols), sum) %>%
mutate(category = paste0(category,collapse='')) %>%
filter(row_number() == 1 & n() > 1)
) %>% mutate(ID = row_number())
library(dplyr)
df1 %>%
filter(category %in% c('a' , 'b')) %>%
group_by(file) %>%
filter(n_distinct(category) > 1) %>%
summarise(values = sum(values)) %>%
mutate(category="ab",
ID=max(df1$ID)+1:n()) %>%
bind_rows(df1, .)
#> Warning in bind_rows_(x, .id): binding factor and character vector,
#> coercing into character vector
#> Warning in bind_rows_(x, .id): binding character and factor vector,
#> coercing into character vector
#> ID values category file
#> 1 1 0.62585921 a 1
#> 2 2 0.61865851 b 1
#> 3 3 0.05274456 c 1
#> 4 4 0.68156961 d 1
.
.
.
#> 19 19 0.43239411 d 5
#> 20 20 0.85886314 e 5
#> 21 21 1.24451773 ab 1
#> 22 22 0.99001810 ab 2
#> 23 23 1.25331943 ab 3
This data.table approach uses a self-join to get all of the possible two-character combinations.
library(data.table)
setDT(df)
df_self_join <- df[df, on = .(file), allow.cartesian = T
][category != i.category,
.(category = paste0(i.category, category), values = values + i.values, file)
][order(category), .(ID = .I + nrow(df), values, category, file)]
rbindlist(list(df, df_self_join))
ID values category file
1: 1 0.76984382 a 1
2: 2 0.54311583 b 1
3: 3 0.23462016 c 1
4: 4 0.60179043 d 1
...
20: 20 0.03534223 e 5
21: 21 1.31295965 ab 1
22: 22 0.51666175 ab 2
23: 23 1.02305754 ab 3
24: 24 1.00446399 ac 1
25: 25 0.96910373 ac 2
26: 26 0.87795389 ac 4
#total of 80 rows
Here is pretty close dplyr translation:
library(dplyr)
tib <- as_tibble(df)
inner_join(tib, tib, by = 'file')%>%
filter(ID.x != ID.y)%>%
transmute(category = paste0(category.x, category.y)
, values = values.x + values.y
, file)%>%
arrange(category)%>%
bind_rows(tib, .)%>%
mutate(ID = row_number())%>%
filter(category == 'ab') #filter added to show the "ab" files
# A tibble: 3 x 4
ID values category file
<int> <dbl> <chr> <fct>
1 21 1.31 ab 1
2 22 0.517 ab 2
3 23 1.02 ab 3