Conditional filling NA rows with comparing non-NA labeled rows - r

I want to fill NA rows based on checking the differences between the closest non-NA labeled rows.
For instance
data <- data.frame(sd_value=c(34,33,34,37,36,45),
value=c(383,428,437,455,508,509),
label=c(c("bad",rep(NA,4),"unable")))
> data
sd_value value label
1 34 383 bad
2 33 428 <NA>
3 34 437 <NA>
4 37 455 <NA>
5 36 508 <NA>
6 45 509 unable
I want to evaluate how to change NA rows with checking the difference between sd_value and value those close to bad and unablerows.
if we want to get differences between the rows we can do;
library(dplyr)
data%>%
mutate(diff_val=c(0,diff(value)), diff_sd_val=c(0,diff(sd_value)))
sd_value value label diff_val diff_sd_val
1 34 383 bad 0 0
2 33 428 <NA> 45 -1
3 34 437 <NA> 9 1
4 37 455 <NA> 18 3
5 36 508 <NA> 53 -1
6 45 509 unable 1 9
The condition how I want to label the NA rows is
if the diff_val<50 and diff_sd_val<9 label them with the last non-NA label else use the first non-NA label after the last NA row.
So that the expected output would be
sd_value value label diff_val diff_sd_val
1 34 383 bad 0 0
2 33 428 bad 45 -1
3 34 437 bad 9 1
4 37 455 bad 18 3
5 36 508 unable 53 -1
6 45 509 unable 1 9
The possible solution I cooked up so far:
custom_labelling <- function(x,y,label){
diff_sd_val<-c(NA,diff(x))
diff_val<-c(NA,diff(y))
label <- NA
for (i in 1:length(label)){
if(is.na(label[i])&diff_sd_val<9&diff_val<50){
label[i] <- label
}
else {
label <- label[i]
}
}
return(label)
}
which gives
data%>%
mutate(diff_val=c(0,diff(value)), diff_sd_val=c(0,diff(sd_value)))%>%
mutate(custom_label=custom_labelling(sd_value,value,label))
Error in mutate_impl(.data, dots) :
Evaluation error: missing value where TRUE/FALSE needed.
In addition: Warning message:
In if (is.na(label[i]) & diff_sd_val < 9 & diff_val < 50) { :
the condition has length > 1 and only the first element will be used

One option is to find NA and non-NA index and based on the condition select the closest label to it.
library(dplyr)
#Create a new dataframe with diff_val and diff_sd_val
data1 <- data%>% mutate(diff_val=c(0,diff(value)), diff_sd_val=c(0,diff(sd_value)))
#Get the NA indices
NA_inds <- which(is.na(data1$label))
#Get the non-NA indices
non_NA_inds <- setdiff(1:nrow(data1), NA_inds)
#For every NA index
for (i in NA_inds) {
#Check the condition
if(data1$diff_sd_val[i] < 9 & data1$diff_val[i] < 50)
#Get the last non-NA label
data1$label[i] <- data1$label[non_NA_inds[which.max(i > non_NA_inds)]]
else
#Get the first non-NA label after last NA value
data1$label[i] <- data1$label[non_NA_inds[i < non_NA_inds]]
}
data1
# sd_value value label diff_val diff_sd_val
#1 34 383 bad 0 0
#2 33 428 bad 45 -1
#3 34 437 bad 9 1
#4 37 455 bad 18 3
#5 36 508 unable 53 -1
#6 45 509 unable 1 9
You can remove diff_val and diff_sd_val columns later if not needed.
We can also create a function
custom_label <- function(label, diff_val, diff_sd_val) {
NA_inds <- which(is.na(label))
non_NA_inds <- setdiff(1:length(label), NA_inds)
new_label = label
for (i in NA_inds) {
if(diff_sd_val[i] < 9 & diff_val[i] < 50)
new_label[i] <- label[non_NA_inds[which.max(i > non_NA_inds)]]
else
new_label[i] <- label[non_NA_inds[i < non_NA_inds]]
}
return(new_label)
}
and then apply it
data%>%
mutate(diff_val = c(0, diff(value)),
diff_sd_val = c(0, diff(sd_value)),
new_label = custom_label(label, diff_val, diff_sd_val))
# sd_value value label diff_val diff_sd_val new_label
#1 34 383 bad 0 0 bad
#2 33 428 <NA> 45 -1 bad
#3 34 437 <NA> 9 1 bad
#4 37 455 <NA> 18 3 bad
#5 36 508 <NA> 53 -1 unable
#6 45 509 unable 1 9 unable
If we want to apply it by group we can add a group_by statement and it should work.
data%>%
group_by(group) %>%
mutate(diff_val = c(0, diff(value)),
diff_sd_val = c(0, diff(sd_value)),
new_label = custom_label(label, diff_val, diff_sd_val))

Related

Why does the frequency reduce if I use ifelse function in R?Is there a way to create categories from the combination of 2 variables/columns?

when I do
table(df$strategy.x)
0 1 2 3
70 514 223 209
table(df$strategy.y)
0 1 2 3
729 24 7 4
I want to create a variable with both of these combined. I tried this
df <- df %>%
mutate(nstrategy1 = ifelse(strategy.x==1| strategy.y==1 , 1, 0))
table(df$nstrategy1)
0 1
399 519
I am supposed to get 514 + 24 = 538 but I got 519 instead
df <- df %>% mutate(nstrategy2 = ifelse(strategy.x==2| strategy.y==2 , 1, 0))
table(df$nstrategy2)
0 1
578 228
Similarly, I am supposed to get 223 + 7 = 230, but I got 228 instead
Is there a good way to merge both strategy.x and strategy.y and end up with a table like the following with 4 categories?
0 1 2 3
799 538 230 213
table(mtcars$am) # 13 1's
table(mtcars$vs) # 14 1's
mtcars$ones = ifelse(mtcars$am == 1 | mtcars$vs == 1, 1, 0)
table(mtcars$ones) # 20 1's < 13 + 14 = 27
Why is it showing only 20 1's instead of 27? It's because there are 7 + 6 + 7 = 20 cars with either one or two 1's in am and vs. There are 13 with am==1 (6+7), and 14 with vs==1 (7+7). Seven cars are in the bottom left because they have 1's in both dimensions, which you are expecting/seeking to count twice.
table(mtcars$am, mtcars$vs)
# 0 1
# 0 12 7
# 1 6 7
The simplest way to get the sum of the two results would be by adding the two table objects:
table(mtcars$am) + table(mtcars$vs)
# 0 1
# 37 27

Sorting elements by column in R

I have a simple code for matrix
ind1=which(macierz==1,arr.ind = TRUE)
fragment of theresult is
> ind1
row col
TCGA.CH.5737.01 53 1
TCGA.CH.5791.01 66 1
P03.1334.Tumor 322 1
P04.1790.Tumor 327 1
CPCG0340.F1 425 1
TCGA.CH.5737.01 53 2
TCGA.CH.5791.01 66 2
P03.1334.Tumor 322 2
P04.1790.Tumor 327 2
CPCG0340.F1 425 2
I would like to sort it by first column alphabetical. How can I do this in R?
It looks as if ind1 is a matrix and the first column is the rownames, so you probably need something like ind1 <- ind1[order(rownames(ind1)),]
You need (assuming your first column is called "label" and those are not rownames)
ind1[order(ind1$label),]
order() return a list of row indexes after sorting alphabetically the data frame. Just to make the example reproducible I created your data frame so
ind1 <- data.frame ( label = c("TCGA.CH.5737.01", "TCGA.CH.5791.01",
"P03.1334.Tumor","P04.1790.Tumor", "CPCG0340.F1" , "TCGA.CH.5737.01",
"TCGA.CH.5791.01","P03.1334.Tumor", "P04.1790.Tumor", "CPCG0340.F1"),
row = c(53,66,322,327,425,53,66,322,327,425), col =
c(1,1,1,1,1,2,2,2,2,2),
stringsAsFactors = FALSE)
and the output is
> ind1[order(ind1$label),]
label row col
5 CPCG0340.F1 425 1
10 CPCG0340.F1 425 2
3 P03.1334.Tumor 322 1
8 P03.1334.Tumor 322 2
4 P04.1790.Tumor 327 1
9 P04.1790.Tumor 327 2
1 TCGA.CH.5737.01 53 1
6 TCGA.CH.5737.01 53 2
2 TCGA.CH.5791.01 66 1
7 TCGA.CH.5791.01 66 2
Hope that helps.
Regards, Umberto

R: sum rows from column A until conditioned value in column B

I'm pretty new to R and can't seem to figure out how to deal with what seems to be a relatively simple problem. I want to sum the rows of the column 'DURATION' per 'TRIAL_INDEX', but then only those first rows where the values of 'X_POSITION" are increasing. I only want to sum the first round within a trial where X increases.
The first rows of a simplified dataframe:
TRIAL_INDEX DURATION X_POSITION
1 1 204 314.5
2 1 172 471.6
3 1 186 570.4
4 1 670 539.5
5 1 186 503.6
6 2 134 306.8
7 2 182 503.3
8 2 806 555.7
9 2 323 490.0
So, for TRIAL_INDEX 1, only the first three values of DURATION should be added (204+172+186), as this is where X has the highest value so far (going through the dataframe row by row).
The desired output should look something like:
TRIAL_INDEX DURATION X_POSITION FIRST_PASS_TIME
1 1 204 314.5 562
2 1 172 471.6 562
3 1 186 570.4 562
4 1 670 539.5 562
5 1 186 503.6 562
6 2 134 306.8 1122
7 2 182 503.3 1122
8 2 806 555.7 1122
9 2 323 490.0 1122
I tried to use dplyr, to generate a new dataframe that can be merged with my original dataframe.
However, the code doesn't work, and also I'm not sure on how to make sure it's only adding the first rows per trial that have increasing values for X_POSITION.
FirstPassRT = dat %>%
group_by(TRIAL_INDEX) %>%
filter(dplyr::lag(dat$X_POSITION,1) > dat$X_POSITION) %>%
summarise(FIRST_PASS_TIME=sum(DURATION))
Any help and suggestions are greatly appreciated!
library(data.table)
dt = as.data.table(df) # or setDT to convert in place
# find the rows that will be used for summing DURATION
idx = dt[, .I[1]:.I[min(.N, which(diff(X_POSITION) < 0), na.rm = T)], by = TRIAL_INDEX]$V1
# sum the DURATION for those rows
dt[idx, time := sum(DURATION), by = TRIAL_INDEX][, time := time[1], by = TRIAL_INDEX]
dt
# TRIAL_INDEX DURATION X_POSITION time
#1: 1 204 314.5 562
#2: 1 172 471.6 562
#3: 1 186 570.4 562
#4: 1 670 539.5 562
#5: 1 186 503.6 562
#6: 2 134 306.8 1122
#7: 2 182 503.3 1122
#8: 2 806 555.7 1122
#9: 2 323 490.0 1122
Here is something you can try with dplyr package:
library(dplyr);
dat %>% group_by(TRIAL_INDEX) %>%
mutate(IncLogic = X_POSITION > lag(X_POSITION, default = 0)) %>%
mutate(FIRST_PASS_TIME = sum(DURATION[IncLogic])) %>%
select(-IncLogic)
Source: local data frame [9 x 4]
Groups: TRIAL_INDEX [2]
TRIAL_INDEX DURATION X_POSITION FIRST_PASS_TIME
(int) (int) (dbl) (int)
1 1 204 314.5 562
2 1 172 471.6 562
3 1 186 570.4 562
4 1 670 539.5 562
5 1 186 503.6 562
6 2 134 306.8 1122
7 2 182 503.3 1122
8 2 806 555.7 1122
9 2 323 490.0 1122
If you want to summarize it down to one row per trial you can use summarize like this:
library(dplyr)
df <- data_frame(TRIAL_INDEX = c(1,1,1,1,1,2,2,2,2),
DURATION = c(204,172,186,670, 186,134,182,806, 323),
X_POSITION = c(314.5, 471.6, 570.4, 539.5, 503.6, 306.8, 503.3, 555.7, 490.0))
res <- df %>%
group_by(TRIAL_INDEX) %>%
mutate(x.increasing = ifelse(X_POSITION > lag(X_POSITION), TRUE, FALSE),
x.increasing = ifelse(is.na(x.increasing), TRUE, x.increasing)) %>%
filter(x.increasing == TRUE) %>%
summarize(FIRST_PASS_TIME = sum(X_POSITION))
res
#Source: local data frame [2 x 2]
#
# TRIAL_INDEX FIRST_PASS_TIME
# (dbl) (dbl)
#1 1 1356.5
#2 2 1365.8

Filter rows based on values of multiple columns in R

Here is the data set, say name is DS.
Abc Def Ghi
1 41 190 67
2 36 118 72
3 12 149 74
4 18 313 62
5 NA NA 56
6 28 NA 66
7 23 299 65
8 19 99 59
9 8 19 61
10 NA 194 69
How to get a new dataset DSS where value of column Abc is greater than 25, and value of column Def is greater than 100.It should also ignore any row if value of atleast one column in NA.
I have tried few options but wasn't successful. Your help is appreciated.
There are multiple ways of doing it. I have given 5 methods, and the first 4 methods are faster than the subset function.
R Code:
# Method 1:
DS_Filtered <- na.omit(DS[(DS$Abc > 20 & DS$Def > 100), ])
# Method 2: which function also ignores NA
DS_Filtered <- DS[ which( DS$Abc > 20 & DS$Def > 100) , ]
# Method 3:
DS_Filtered <- na.omit(DS[(DS$Abc > 20) & (DS$Def >100), ])
# Method 4: using dplyr package
DS_Filtered <- filter(DS, DS$Abc > 20, DS$Def >100)
DS_Filtered <- DS %>% filter(DS$Abc > 20 & DS$Def >100)
# Method 5: Subset function by default ignores NA
DS_Filtered <- subset(DS, DS$Abc >20 & DS$Def > 100)

Custom sorting of a dataframe in R

I have a binomail dataset that looks like this:
df <- data.frame(replicate(4,sample(1:200,1000,rep=TRUE)))
addme <- data.frame(replicate(1,sample(0:1,1000,rep=TRUE)))
df <- cbind(df,addme)
df <-df[order(df$replicate.1..sample.0.1..1000..rep...TRUE..),]
The data is currently soreted in a way to show the instances belonging to 0 group then the ones belonging to the 1 group. Is there a way I can sort the data in a 0-1-0-1-0... fashion? I mean to show a row that belongs to the 0 group, the row after belonging to the 1 group then the zero group and so on...
All I can think about is complex functions. I hope there's a simple way around it.
Thank you,
Here's an attempt, which will add any extra 1's at the end:
First make some example data:
set.seed(2)
df <- data.frame(replicate(4,sample(1:200,10,rep=TRUE)),
addme=sample(0:1,10,rep=TRUE))
Then order:
with(df, df[unique(as.vector(rbind(which(addme==0),which(addme==1)))),])
# X1 X2 X3 X4 addme
#2 141 48 78 33 0
#1 37 111 133 3 1
#3 115 153 168 163 0
#5 189 82 70 103 1
#4 34 37 31 174 0
#6 189 171 98 126 1
#8 167 46 72 57 0
#7 26 196 30 169 1
#9 94 89 193 134 1
#10 110 15 27 31 1
#Warning message:
#In rbind(which(addme == 0), which(addme == 1)) :
# number of columns of result is not a multiple of vector length (arg 1)
Here's another way using dplyr, which would make it suitable for within-group ordering. It's also probably pretty quick. If there's unbalanced numbers of 0's and 1's, it will leave them at the end.
library(dplyr)
df %>%
arrange(addme) %>%
mutate(n0 = sum(addme == 0),
orderme = seq_along(addme) - (n0 * addme) + (0.5 * addme)) %>%
arrange(orderme) %>%
select(-n0, -orderme)

Resources