Related
I have a dataframe as follows:
library(dplyr)
df <- data.frame(A=1:20,
B=c(2,1.8,1.6,1.8,4,6,8,10,12,10,8,6,13,14,15,16,16.5,15,14,13))
mutate(df, C = B - lag(B))
A B C
1 2.0 NA
2 1.8 -0.2
3 1.6 -0.2
4 1.8 0.2
5 4.0 2.2
6 6.0 2.0
7 8.0 2.0
8 10.0 2.0
9 12.0 2.0
10 10.0 -2.0
11 8.0 -2.0
12 6.0 -2.0
13 13.0 7.0
14 14.0 1.0
15 15.0 1.0
16 16.0 1.0
17 16.5 -0.5
18 15.0 -1.0
19 14.0 -1.0
20 13.0 -1.0
And I want to extract out the sequences of minus values where there are 3 or more consecutively together and put in a separate column. So for example put the values from (col C)row 10, 11, 12 in a new column and row 17, 18,19,20 in another new column. This dataframe is huge so I dont know how many new columns I will have. Any help would be appreciated. Thanks
Here is an option with rleid to create a run-length-id grouping based on the sign of the column 'C' i.e. those adjacent elements with same sign will have same grouping 'id' and it gets incremented when there is a difference in sign. Then, we create the columns based on the count (n()) value to be particular number i.e. 3 or 4
library(dplyr)
library(data.table)
df %>%
mutate(C = B - lag(B)) %>%
group_by(grp = rleid(sign(C))) %>%
mutate(newC3 = if(n() ==3 && all(C < 0)) C else NA,
newC4 = if(n() == 4 && all(C < 0) C else NA)
To make this automated, an option is pivot_wider to reshape from 'long' to 'wide' format after creating the grouping id with rleid and replaceing the values that are not negative to NA. In this way, we get only the blocks of negative values to be in a separate column
library(tidyr)
library(stringr)
df %>%
mutate(C = B - lag(B)) %>%
mutate(grp = str_c('C', rleid(sign(C))),
C1 = case_when(C >=0 ~ NA_real_, TRUE ~ C)) %>%
pivot_wider(names_from = grp, values_from = C1)%>%
select(where(~ sum(!is.na(.)) > 0))
-output
# A tibble: 20 x 6
# A B C C2 C4 C7
# <int> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 2 NA NA NA NA
# 2 2 1.8 -0.200 -0.200 NA NA
# 3 3 1.6 -0.200 -0.200 NA NA
# 4 4 1.8 0.200 NA NA NA
# 5 5 4 2.2 NA NA NA
# 6 6 6 2 NA NA NA
# 7 7 8 2 NA NA NA
# 8 8 10 2 NA NA NA
# 9 9 12 2 NA NA NA
#10 10 10 -2 NA -2 NA
#11 11 8 -2 NA -2 NA
#12 12 6 -2 NA -2 NA
#13 13 13 7 NA NA NA
#14 14 14 1 NA NA NA
#15 15 15 1 NA NA NA
#16 16 16 1 NA NA NA
#17 17 16 0 NA NA NA
#18 18 15 -1 NA NA -1
#19 19 14 -1 NA NA -1
#20 20 13 -1 NA NA -1
NOTE: The column names 'C2', 'C4', 'C7' are based on the ids created with rleid. If we wanted to rename, then it can be done with rename_with or rename_at
...
%>%
rename_at(vars(matches('^C\\d+')), ~ str_c('C', seq_along(.)))
I have the following data frame,
Input
For all observations where Month > tenor, the last value of the rate column should be retained for each account for the remaining months. Eg:- Customer 1 has tenor = 5, so for all months greater than 5, the last rate value is retained.
I am using the following code
df$rate <- ifelse(df$Month > df$tenor,tail(df$rate, n=1),df$rate)
But here, the last value is NA so it does not work
Expected output is
Output
this will work, but please have a reproducible example. Others want to help you, not do your homework.
df <- data.frame(
customer = c(1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2),
Month = c(1,2,3,4,5,6,7,8,9,10,1,2,3,4,5,6,7,8,9,10),
tenor = c(5,5,5,5,5,5,5,5,5,5,3,3,3,3,3,3,3,3,3,3),
rate = c(0.2,0.3,0.4,0.5,0.6,NA,NA,NA,NA,NA,0.1,0.2,0.3,NA,NA,NA,NA,NA,NA,NA)
)
fn <- function(cus, mon, ten, rat){
if (mon > ten & is.na(rat)){
return(dplyr::filter(df, customer == cus, Month == ten, tenor == ten)$rate)
}
return(rat)
}
df2 <- mutate(df,
newrate = Vectorize(fn)(customer, Month, tenor, rate)
)
One option is:
library(dplyr)
library(tidyr)
df %>%
group_by(cus_no) %>%
fill(rate, .direction = "down") %>%
ungroup()
# A tibble: 20 x 4
customer Month tenor rate
<dbl> <dbl> <dbl> <dbl>
1 1 1 5 0.2
2 1 2 5 0.3
3 1 3 5 0.4
4 1 4 5 0.5
5 1 5 5 0.6
6 1 6 5 0.6
7 1 7 5 0.6
8 1 8 5 0.6
9 1 9 5 0.6
10 1 10 5 0.6
11 2 1 3 0.1
12 2 2 3 0.2
13 2 3 3 0.3
14 2 4 3 0.3
15 2 5 3 0.3
16 2 6 3 0.3
17 2 7 3 0.3
18 2 8 3 0.3
19 2 9 3 0.3
20 2 10 3 0.3
I can't replicate your data frame so this is a guess right now.
I think dplyr should be the solution:-
library(dplyr)
df%>%
group_by(Month)%>%
replace_na(last(rate))
should work
I Have used the code below to generate a dataframe df1
df1<-data.frame("ID"=c("A", "A", "A", "A", "A", "B", "B", "B", 'B', "B"),
"X_Fr"=c(NA, NA, NA, NA,NA,1,2,3,4,5), "X_Ax"=c(NA, NA, NA, NA, NA,
.2,.3,.4,.2, .3),
"Y_Fr"=c(1,2,3,4,5,1,2,3,4,5),
"Y_Ax"=c(.1,.2,.3,.4,.1,.3,.4,.5,.2,.3),"Z_Fr"=c(1,2,NA, NA, 3,
1,3,4,5,10),
"Z_Ax"=c(.1,.2,NA,NA,.5, .1,.2,.4,.3,.5) )
ID X_Fr X_Ax Y_Fr Y_Ax Z_Fr Z_Ax
1 A NA NA 1 0.1 1 0.1
2 A NA NA 2 0.2 2 0.2
3 A NA NA 3 0.3 NA NA
4 A NA NA 4 0.4 NA NA
5 A NA NA 5 0.1 3 0.5
6 B 1 0.2 1 0.3 1 0.1
7 B 2 0.3 2 0.4 3 0.2
8 B 3 0.4 3 0.5 4 0.4
9 B 4 0.2 4 0.2 5 0.3
10 B 5 0.3 5 0.3 10 0.5
I would like to expand the dataframe to give the following data frame as output
ID X_Fr Y_Fr Z_Fr X_Ax Y_Ax Z_Ax
1 A 1 1 1 NA 0.1 0.1
2 A 2 2 2 NA 0.2 0.2
3 A 3 3 3 NA 0.3 0.5
4 A 4 4 4 NA 0.4 NA
5 A 5 5 5 NA 0.1 NA
6 B 1 1 1 0.2 0.3 0.1
7 B 2 2 2 0.3 0.4 NA
8 B 3 3 3 0.4 0.5 0.2
9 B 4 4 4 0.2 0.2 0.4
10 B 5 5 5 0.3 0.3 0.3
11 B 6 6 6 NA NA NA
12 B 7 7 7 NA NA NA
13 B 8 8 8 NA NA NA
14 B 9 9 9 NA NA NA
15 B 10 10 10 NA NA 0.5
I have tried the following code to obtain the above dataframe
library(tidyr)
library(dplyr)
df2<-df1 %>% complete(ID, nesting(X_Fr=full_seq(na.omit(c(X_Fr, Y_Fr,
Z_Fr)),1), Y_Fr=full_seq(na.omit(c(X_Fr, Y_Fr, Z_Fr)),1),
Z_Fr=full_seq(na.omit(c(X_Fr, Y_Fr, Z_Fr)),1)))
I am unable to obtain this result. I request someone to take a look.
I think it's two steps process so I create two new dataframes processed them and joined them later
library(dplyr)
library(tidyr)
df1x<-df1 %>% select(ID,matches('^X|^Y')) #select ID and any cloumn start with X or Y
df1y<-df1 %>% select(ID,matches('^Z'))
df1y %>% group_by(ID) %>% #group by ID column
arrange(Z_Fr, .by_group=TRUE) %>% #arrange Z_Fr column in ascending order so we can use row_number later
mutate(Z_Fr=coalesce(Z_Fr,as.numeric(row_number()))) %>% #Use row_number to fill NA's in Z_Fr.
#See ?dplyr::row_number() for more details
ungroup() %>% #Before using complete we need to ungroup
complete(ID, nesting(Z_Fr=full_seq(Z_Fr,1))) %>%
left_join(df1x, by=c('ID','Z_Fr'='Y_Fr')) #left join using "on" ID and Z_Fr from df1y and Y_Fr from df1x
I am trying to recode NA values to 0 in a subset of columns using the following dataset:
set.seed(1)
df <- data.frame(
id = c(1:10),
trials = sample(1:3, 10, replace = T),
t1 = c(sample(c(1:9, NA), 10)),
t2 = c(sample(c(1:7, rep(NA, 3)), 10)),
t3 = c(sample(c(1:5, rep(NA, 5)), 10))
)
Each row has a certain number of trials associated with it (between 1-3), specified by the trials column. columns t1-t3 represent scores for each trial.
The number of trials indicates the subset of columns in which NAs should be recoded to 0: NAs that are within the number of trials represent missing data, and should be recoded as 0, while NAs outside the number of trials are not meaningful, and should remain NAs. So, for a row where trials == 3, an NA in column t3 would be recoded as 0, but in a row where trials == 2, an NA in t3 would remain an NA.
So, I tried using this function:
replace0 <- function(x, num.sun) {
x[which(is.na(x[1:(num.sun + 2)]))] <- 0
return(x)
}
This works well for single vectors. When I try applying the same function to a data frame with apply(), though:
apply(df, 1, replace0, num.sun = df$trials)
I get a warning saying:
In 1:(num.sun + 2) :
numerical expression has 10 elements: only the first used
The result is that instead of having the value of num.sun change every row according to the value in trials, apply() simply uses the first value in the trials column for every single row. How could I apply the function so that the num.sun argument changes according to the value of df$trials?
Thanks!
Edit: as some have commented, the original example data had some non-NA scores that didn't make sense according to the trials column. Here's a corrected dataset:
df <- data.frame(
id = c(1:5),
trials = c(rep(1, 2), rep(2, 1), rep(3, 2)),
t1 = c(NA, 7, NA, 6, NA),
t2 = c(NA, NA, 3, 7, 12),
t3 = c(NA, NA, NA, 4, NA)
)
Another approach:
# create an index of the NA values
w <- which(is.na(df), arr.ind = TRUE)
# create an index with the max column by row where an NA is allowed to be replaced by a zero
m <- matrix(c(1:nrow(df), (df$trials + 2)), ncol = 2)
# subset 'w' such that only the NA's which fall in the scope of 'm' remain
i <- w[w[,2] <= m[,2][match(w[,1], m[,1])],]
# use 'i' to replace the allowed NA's with a zero
df[i] <- 0
which gives:
> df
id trials t1 t2 t3
1 1 1 3 NA 5
2 2 2 2 2 NA
3 3 2 6 6 4
4 4 3 0 1 2
5 5 1 5 NA NA
6 6 3 7 0 0
7 7 3 8 7 0
8 8 2 4 5 1
9 9 2 1 3 NA
10 10 1 9 4 3
You could easily wrap this in a function:
replace.NA.with.0 <- function(df) {
w <- which(is.na(df), arr.ind = TRUE)
m <- matrix(c(1:nrow(df), (df$trials + 2)), ncol = 2)
i <- w[w[,2] <= m[,2][match(w[,1], m[,1])],]
df[i] <- 0
return(df)
}
Now, using replace.NA.with.0(df) will produce the above result.
As noted by others, some rows (1, 3 & 10) have more values than trails. You could tackle that problem by rewriting the above function to:
replace.with.NA.or.0 <- function(df) {
w <- which(is.na(df), arr.ind = TRUE)
df[w] <- 0
v <- tapply(m[,2], m[,1], FUN = function(x) tail(x:5,-1))
ina <- matrix(as.integer(unlist(stack(v)[2:1])), ncol = 2)
df[ina] <- NA
return(df)
}
Now, using replace.with.NA.or.0(df) produces the following result:
id trials t1 t2 t3
1 1 1 3 NA NA
2 2 2 2 2 NA
3 3 2 6 6 NA
4 4 3 0 1 2
5 5 1 5 NA NA
6 6 3 7 0 0
7 7 3 8 7 0
8 8 2 4 5 NA
9 9 2 1 3 NA
10 10 1 9 NA NA
Here I just rewrite your function using double subsetting x[paste0('t',x['trials'])], which overcome the problem in the other two solutions with row 6
replace0 <- function(x){
#browser()
x_na <- x[paste0('t',x['trials'])]
if(is.na(x_na)){x[paste0('t',x['trials'])] <- 0}
return(x)
}
t(apply(df, 1, replace0))
id trials t1 t2 t3
[1,] 1 1 3 NA 5
[2,] 2 2 2 2 NA
[3,] 3 2 6 6 4
[4,] 4 3 NA 1 2
[5,] 5 1 5 NA NA
[6,] 6 3 7 NA 0
[7,] 7 3 8 7 0
[8,] 8 2 4 5 1
[9,] 9 2 1 3 NA
[10,] 10 1 9 4 3
Here is a way to do it:
x <- is.na(df)
df[x & t(apply(x, 1, cumsum)) > 3 - df$trials] <- 0
The output looks like this:
> df
id trials t1 t2 t3
1 1 1 3 NA 5
2 2 2 2 2 NA
3 3 2 6 6 4
4 4 3 0 1 2
5 5 1 5 NA NA
6 6 3 7 0 0
7 7 3 8 7 0
8 8 2 4 5 1
9 9 2 1 3 NA
10 10 1 9 4 3
> x <- is.na(df)
> df[x & t(apply(x, 1, cumsum)) > 3 - df$trials] <- 0
> df
id trials t1 t2 t3
1 1 1 3 NA 5
2 2 2 2 2 NA
3 3 2 6 6 4
4 4 3 0 1 2
5 5 1 5 NA NA
6 6 3 7 0 0
7 7 3 8 7 0
8 8 2 4 5 1
9 9 2 1 3 NA
10 10 1 9 4 3
Note: row 1/3/10, is problematic since there are more non-NA values than the trials.
Here's a tidyverse way, note that it doesn't give the same output as other solutions.
Your example data shows results for trials that "didn't happen", I assumed your real data doesn't.
library(tidyverse)
df %>%
nest(matches("^t\\d")) %>%
mutate(data = map2(data,trials,~mutate_all(.,replace_na,0) %>% select(.,1:.y))) %>%
unnest
# id trials t1 t2 t3
# 1 1 1 3 NA NA
# 2 2 2 2 2 NA
# 3 3 2 6 6 NA
# 4 4 3 0 1 2
# 5 5 1 5 NA NA
# 6 6 3 7 0 0
# 7 7 3 8 7 0
# 8 8 2 4 5 NA
# 9 9 2 1 3 NA
# 10 10 1 9 NA NA
Using the more commonly used gather strategy this would be:
df %>%
gather(k,v,matches("^t\\d")) %>%
arrange(id) %>%
group_by(id) %>%
slice(1:first(trials)) %>%
mutate_at("v",~replace(.,is.na(.),0)) %>%
spread(k,v)
# # A tibble: 10 x 5
# # Groups: id [10]
# id trials t1 t2 t3
# <int> <int> <dbl> <dbl> <dbl>
# 1 1 1 3 NA NA
# 2 2 2 2 2 NA
# 3 3 2 6 6 NA
# 4 4 3 0 1 2
# 5 5 1 5 NA NA
# 6 6 3 7 0 0
# 7 7 3 8 7 0
# 8 8 2 4 5 NA
# 9 9 2 1 3 NA
# 10 10 1 9 NA NA
The Data looks like this
library(igraph)
From <- c(1,2,3,4,5,6,7,8)
To <- c(NA,1,2,3,2,NA,6,7)
Value<- c(1,0,0.5,0.5,0,-1,-1,-0.5)
Data <- data.frame(From,To, Value)
Network <- graph.data.frame(Data[,c("From","To")])
Network<- Network - "NA"
plot(Network)
Now i would like to know the AverageValue of the Partial Graph they are in and at it to the initial Dataframe.
At the end it should look like this:
From <- c(1,2,3,4,5,6,7,8)
To <- c(NA,1,2,3,2,NA,6,7)
Value<- c(1,0,0.5,0.5,0,-1,-1,-0.5)
AverageTreeValue<- c(0.4,0.4,0.4,0.4,0.4,-0.833,-0.833,-0.833)
FinalData <- data.frame(From,To, Value, AverageTreeValue)
You can use the clusters function to compute connected components in your graph, aggregate to compute the mean value for each of these clusters, and merge to combine the two together:
Data$group <- clusters(Network)$membership
(FinalData <- merge(Data, aggregate(Value~group, Data, mean), by="group"))
# group From To Value.x Value.y
# 1 1 1 NA 1.0 0.4000000
# 2 1 2 1 0.0 0.4000000
# 3 1 3 2 0.5 0.4000000
# 4 1 4 3 0.5 0.4000000
# 5 1 5 2 0.0 0.4000000
# 6 2 6 NA -1.0 -0.8333333
# 7 2 7 6 -1.0 -0.8333333
# 8 2 8 7 -0.5 -0.8333333
Alternately, you could use match to perform the merge and get some more control over the names of the generated column:
groups <- clusters(Network)$membership
means <- aggregate(Value~group, data.frame(Value=Data$Value, group=groups), mean)
Data$AverageTreeValue <- means$Value[match(groups, means$group)]
Data
# From To Value AverageTreeValue
# 1 1 NA 1.0 0.4000000
# 2 2 1 0.0 0.4000000
# 3 3 2 0.5 0.4000000
# 4 4 3 0.5 0.4000000
# 5 5 2 0.0 0.4000000
# 6 6 NA -1.0 -0.8333333
# 7 7 6 -1.0 -0.8333333
# 8 8 7 -0.5 -0.8333333