So I am using R and trying to change values in a data frame in one column by comparing two columns together. I have something like
Median MyPrice
10 0
20 18
20 20
30 35
15 NA
And I would like to say something like
if(MyPrice == 0 & MyPrice < Median){MyPrice <- 1
}else if (MyPrice == Median){MyPrice <- 2
}else if (MyPrice > Median){MyPrice <- 3
}else {MyPrice <- 4}
To come up with
Median MyPrice
10 1
20 1
20 2
30 3
15 4
But there is always an error. I have also tried something like
for(i in MyPrice){if(MyPrice == 0 & MyPrice < Median){MyPrice <- 1
}else if (MyPrice == Median){MyPrice <- 2
}else if (MyPrice > Median){MyPrice <- 3
}else {MyPrice <- 4}
}
The for loop runs but it changes all values in MyPrice to 4. I also tried the ifelse() function but it seemed to have an issue taking that many arguments at once.
I would also not be opposed to a new column being added to the end of the data frame if a solution like that is easier.
You don't necessarily have to use a for loop. Start by setting every comparison to 4.
> x$Comp=4
> x$Comp[x$Median>x$MyPrice]=1 #if Median is higher, comparison = 1
> x$Comp[x$Median==x$MyPrice]=2 #if Median is equal to MyPrice, comparison = 2
> x$Comp[x$Median<x$MyPrice]=3 #if Median is lower, comparison = 3
> x
Median MyPrice Comp
1 10 0 1
2 20 18 1
3 20 20 2
4 30 35 3
5 15 NA 4
Given your first argument that if MyPrice == 0 & MyPrice < Median, your 2nd row where Median: 20 and MyPrice: 18 should also be 4. Here is a working nested ifelse statement with an NA handler after.
df <- as.data.frame(matrix(c(10,0,20,18,20,20,30,35,15,NA), byrow = T, ncol = 2))
colnames(df) <- c("Median","MyPrice")
df$NewPrice <- ifelse(df$MyPrice == 0 & df$MyPrice < df$Median, 1,
ifelse(df$MyPrice == df$Median, 2,
ifelse(df$MyPrice > df$Median, 3, 4)))
df$NewPrice[is.na(df$MyPrice)] <- 4
df
# Median MyPrice NewPrice
#1 10 0 1
#2 20 18 4
#3 20 20 2
#4 30 35 3
#5 15 NA 4
What about setting a new variable with all values in 4 and then, replace those cases where your conditions apply?
Simple, straight forward and easy to read :-)
#(Following the example from #Evans Friedland)
df <- as.data.frame(matrix(c(10,0,20,18,20,20,30,35,15,NA), byrow = T, ncol = 2))
colnames(df) <- c("Median","MyPrice")
df <- mutate(df, myNewPrice = 4) #set my new price to 4, then edit by following your conditions
df$myNewPrice<- replace (df$myNewPrice, df$MyPrice == 0 & df$MyPrice < df$Median, 1)
df$myNewPrice<- replace (df$myNewPrice, df$MyPrice == df$Median , 2)
df$myNewPrice<- replace (df$myNewPrice, df$MyPrice > df$Median , 3)
df$myNewPrice <- as.numeric (df$myNewPrice) #might, might not be needed.
Related
I have a dataframe like this:
dat<- data.frame (
'Ones'=c(0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0),
'Thats'=c(0,5,3,6,8,4,5,6,8,3,1,3,4,5,6,7,4,3,4,5))
I have to create a function (gap1) that detects each 1 in Ones and than sums n-1, n and n+1 in Thats, with n being in the same row as 1.
For example in this dataset I have two 1.
dat<- data.frame (
'Ones'=c(0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0),
'Thats'=c(0,5,3,6,8,4,5,6,8,3,1,3,4,5,6,7,4,3,4,5))
dat
This should be the output:
Ones Thats gap1
1 4 17 #(8+4+5)
1 1 7 #(3+1+3)
I would like to extend this gap at will, for example:
Ones Thats gap1 gap2 gap3 ...
1 4 17 29 #(6+8+4+5+6)
1 1 7 9 #(8+3+1+3+4)
There is another problem I have to consider:
Suppose we have this data frame:
dat<- data.frame (
'Ones'=c(1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0),
'Thats'=c(0,5,3,6,8,4,5,6,8,3,1,NA,4,5,6,7,4,3,4,5))
In case there is a 1 at the beginning (or at the end), or if there is an NA, the function should use available data.
In this case, for example:
Ones Thats gap1 gap2
1 0 5 (0+5) 8 #(0+5+3)
1 4 17 (8+4+5) 29 #(6+8+4+5+6)
1 1 4 (3+1+NA) 16 #(8+3+1+NA+4)
Do you have any advice?
Using tidyverse / collapse
For arbitrary number of lead and lags the collapse package offers a nice function flag, which has further arguments to specify columns (cols), or grouping variables g.
library(dplyr)
f <- function(df, n){
df %>%
collapse::flag(-n:n) %>%
transmute(Ones, Thats, gap = rowSums(., na.rm = T) - 1) %>%
filter(Ones == 1)
}
x <- data.frame (
'Ones'=c(1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0),
'Thats'=c(0,5,3,6,8,4,5,6,8,3,1,NA,4,5,6,7,4,3,4,5))
# we can now specify how many lags to count:
f(x, 1)
Ones Thats gap
1 1 0 5
2 1 4 17
3 1 1 4
f(x, 2)
Ones Thats gap
1 1 0 8
2 1 4 29
3 1 1 16
Or if you want to specify the number of gaps to compute, we can simplify the function to
f <- function(df, n){
df %>%
collapse::flag(-n:n) %>%
rowSums(na.rm = T) - 1
}
x %>%
mutate(gap1 = f(., 1),
gap2 = f(., 2)) %>%
filter(Ones == 1)
Ones Thats gap1 gap2
1 1 0 5 8
2 1 4 17 29
3 1 1 4 16
Base R
If you like terse functions:
f <- Vectorize(\(df, n) rowSums(collapse::flag(df, -n:n), na.rm = T) - 1, "n")
x[paste0("gap", 1:2)] <- f(x, 1:2) ; subset(x, Ones == 1)
Ones Thats gap1 gap2
1 1 0 5 8
6 1 4 17 29
11 1 1 4 16
With BaseR,
myfun <- function(data,gap=1) {
points <- which(data["Ones"]==1)
sapply(points, function(x) {
bottom <- ifelse(x-gap<=0,1,x -gap)
top <- ifelse(x+ gap > nrow(data),nrow(data),x +gap)
sum(data[bottom:top,"Thats"], na.rm=T)
})
}
#> myfun(dat,1)
#[1] 5 17 4
#> myfun(dat,2)
#[1] 8 29 16
Another base R solution
f <- function(dat, width = 1)
{
dat$gaps <- sapply(seq(nrow(dat)), function(x) {
if(dat$Ones[x] == 0) return(0)
i <- x + seq(2 * width + 1) - (width + 1)
i <- i[i > 0]
i <- i[i < nrow(dat)]
sum(dat$Thats[i])
})
dat[dat$Ones == 1,]
}
f(dat, 1)
#> Ones Thats gaps
#> 6 1 4 17
#> 11 1 1 7
f(dat, 2)
#> Ones Thats gaps
#> 6 1 4 29
#> 11 1 1 19
I'm looking for a way to identify a growing season which consists of a number of days greater than say 60 between the last frost day of spring and the first frost day in the fall. A general version of this problem is this. If I have a vector of numbers like testVec, I want the item numbers of the beginning and end range of values where the number of items is 5 or greater and all of them are greater than 0.
testVec <- c(1,3,4,0, 1, -5, 6, 0, 1,3,4,6,7,5,9, 0)
In this example, the relevant range is 1,3,4,6,7,5,9 which is testVec[9] to testVec[15]
One option could be:
testVec[with(rle(testVec > 0), rep(lengths * values >= 5, lengths))]
[1] 1 3 4 6 7 5 9
Here, the idea is to, first, create runs of values that are smaller or equal to zero and bigger than zero. Second, it checks whether the runs of values bigger than zero are of length 5 or more. Finally, it subsets the original vector for the runs of values bigger than zero with length 5 or more.
1) rleid This also handles any number of sequences including zero. rleid(ok) is a vector the same length as ok such that the first run of identical elements is replaced with 1, the second run with 2 and so on. The result is a list of vectors where each vector has its positions in the original input as its names.
library(data.table)
getSeq <- function(x) {
names(x) <- seq_along(x)
ok <- x > 0
s <- split(x[ok], rleid(ok)[ok])
unname(s)[lengths(s) >= 5]
}
getSeq(testVec)
## [[1]]
## 9 10 11 12 13 14 15
## 1 3 4 6 7 5 9
getSeq(numeric(16))
## list()
getSeq(c(testVec, 10 * testVec))
## [[1]]
## 9 10 11 12 13 14 15
## 1 3 4 6 7 5 9
##
## [[2]]
## 25 26 27 28 29 30 31
## 10 30 40 60 70 50 90
If a data frame were desired then following gives the values and which sequence the row came from. The row names indicate the positions in the original input.
gs <- getSeq(c(testVec, 10 * testVec))
names(gs) <- seq_along(gs)
if (length(gs)) stack(gs) else gs
## values ind
## 9 1 1
## 10 3 1
## 11 4 1
## 12 6 1
## 13 7 1
## 14 5 1
## 15 9 1
## 25 10 2
## 26 30 2
## 27 40 2
## 28 60 2
## 29 70 2
## 30 50 2
## 31 90 2
2) gregexpr Replace each element that is > 0 with 1 and each other element with 0 pasting the 0's and 1's into a single character string. Then use gregexpr to look for sequences of 1's at least 5 long and for the ith such nonoverlapping sequence return the first positions, g, and lengths, attr(g, "match.length"). Define a function vals which extracts the values at the required positions from testVec of the ith such nonoverlapping sequence returning a list such that the ith component of the list is the ith such sequence. The names in the output vector are its positions in the input.
getSeq2 <- function(x) {
g <- gregexpr("1{5,}", paste(+(x > 0), collapse = ""))[[1]]
vals <- function(i) {
ix <- seq(g[i], length = attr(g, "match.length")[i])
setNames(x[ix], ix)
}
if (length(g) == 1 && g == -1) list() else lapply(seq_along(g), vals)
}
getSeq2(testVec)
## [[1]]
## 9 10 11 12 13 14 15
## 1 3 4 6 7 5 9
The above handles any number of sequences including 0 but if we knew there were exactly one sequence (which is the case for the example in the question) then it could be simplified to the following where the return value is just that vector:
g <- gregexpr("1{5,}", paste(+(testVec > 0), collapse = ""))[[1]]
ix <- seq(g, length = attr(g, "match.length"))
setNames(testVec[ix], ix)
## 9 10 11 12 13 14 15
## 1 3 4 6 7 5 9
You could "fix" #tmfmnk's solution like this:
f1 <- function(x, threshold, n) {
range(which(with(rle(x > threshold), rep(lengths * values >= n, lengths))))
}
x <- c(1, 3, 4, 0, 1, -5, 6, 0, 1,3,4,6,7,5,9, 0)
f1(x, 0, 5)
#[1] 9 15
But that does not work well when there are multiple runs
xx <- c(x, x)
f1(xx, 0, 5)
#[1] 9 31
Here is another, not so concise approach that returns the start and end of the longest run (the first one if there are ties).
f2 <- function(x, threshold, n) {
y <- x > threshold
y[is.na(y)] <- FALSE
a <- ave(y, cumsum(!y), FUN=cumsum)
m <- max(a)
if (m < n) return (c(NA, NA))
i <- which(a == m)[1]
c(i-m+1, i)
}
f2(x, 0, 5)
#[1] 9 15
f2(xx, 0, 5)
#[1] 9 15
or with rle
f3 <- function(x, threshold, n) {
y <- x > threshold
r <- rle(y)
m <- max(r$lengths)
if (m < n) return (c(NA, NA))
i <- sum(r$lengths[1:which.max(r$lengths)[1]])
c(i-max(r$lengths)+1, i)
}
f3(x, 0, 5)
#[1] 9 15
f3(xx, 0, 5)
#[1] 9 15
If you wanted the first run that is at least n, that is you do not want a next run, even if it is longer, you could do
f4 <- function(x, threshold, n) {
y <- with(rle(x > threshold), rep(lengths * values >= n, lengths))
i <- which(y)[1]
j <- i + which(!y[-c(1:i)])[1] - 1
c(i, j)
}
I want to subtract 1 from the values of column A if column B is <= 20.
A = c(1,2,3,4,5)
B = c(10,20,30,40,50)
df = data.frame(A,B)
output
A B
1 0 10
2 1 20
3 3 30
4 4 40
5 5 50
My data is very huge so I prefer not to use a loop. Is there any computationally efficient method in R?
You can do
df$A[df$B <= 20] <- df$A[df$B <= 20] - 1
# A B
#1 0 10
#2 1 20
#3 3 30
#4 4 40
#5 5 50
We can break this down step-by-step to understand how this works.
First we check which numbers in B is less than equal to 20 which gives us a logical vector
df$B <= 20
#[1] TRUE TRUE FALSE FALSE FALSE
Using that logical vector we can select the numbers in A
df$A[df$B <= 20]
#[1] 1 2
Subtract 1 from those numbers
df$A[df$B <= 20] - 1
#[1] 0 1
and replace these values for the same indices in A.
With dplyr we can also use case_when
library(dplyr)
df %>%
mutate(A = case_when(B <= 20 ~ A - 1,
TRUE ~ A))
Another possibility:
df$A <- ifelse(df$B < 21, df$A - 1, df$A)
And here is a data.table solution:
library(data.table)
setDT(df)
df[B <= 20, A := A - 1]
I am practicing a simple R loop. From a vector "m" with values 1 to 20, i want to create a loop that save a selected value in a object"a" and the remaining values in object "b".
This is what i did:
a=NULL
b=NULL
m <- c(1:20)
for (i in m)
if (i == 4){
a[[i]] <- i
} else {
b[[i]] <- i
}
This is the output:
> a
[1] NA NA NA 4
> b
[1] 1 2 3 NA 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
My question is: how can i improve my loop code so the output does not show NAs, and without using function "na.omit"?
Thanks
a=NULL
b=NULL
m <- c(1:20)
for (i in m){
if (i == 4){
a <- i
} else {
b <- append(b, i)
}
}
This will put a single value (in this case 4) in object a, and will consecutively add the other values to b.
Result:
> a
[1] 4
> b
[1] 1 2 3 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
Another way to solve it is with vector operations. We doesn't need to do a loop to solve some problems about classification.
In your case, we can use:
m = c(1:20)
subset_with = m[m == 4] # It returns the values with the maching (m == 4)
subset_without = m[m != 4] # It returns the values with the maching (m != 4)
I hope this helps you.
I have a data set and would like to do two things:
Set certain row values in Col A to 0 based on values in Col B
Create a new column with values of either 0 or 1 based on the edited values in Col A
My current approach is shown below - the issue is I occasionally get an error:
Error in `[<-.data.frame`(`*tmp*`, "OCS_dose", value = 0) :
replacement has 1 row, data has 0
As the numbers that I am generating are randomly selected and on certain trials there are no rows to update in Col A based on the numbers in Col B.
Here is an example of my code that causes the error:
pbo_IFNlow_data[pbo_IFNlow_data$OCS_status == 0,]['OCS_dose'] <- 0
OCS_status is either a 0 or 1 that is generated using:
pbo_OCS_status_low <- sample(c(0,1), replace = TRUE,
size = pbo_n_IFNlow, prob=c(1-.863, 0.863))
Therefore on occasion, I have no 0's... In my mind R should then just not try to update anything.
Is there a better way to do what I am trying to do?
Here is a more complete segment of my code:
pbo_OCS_status_low <- sample(c(0,1), replace = TRUE, size = pbo_n_IFNlow, prob=c(1-.863, 0.863)) #on OCS = 1
#OCS dose
pbo_OCS_dose_low <- rtruncnorm(pbo_n_IFNlow, a=0, b=Inf, mean=12.8, sd=8.1)
#IFN boolean flag
pbo_IFN_low <- rep(0, pbo_n_IFNlow)
#SLEDAI score
pbo_SLEDAI_low <- rtruncnorm(pbo_n_IFNlow, a=0, b=Inf, mean=11.1, sd=4.4)
#Response criteria met for SRI score reduction
pbo_SRI_low <- sample(c(0,1), replace = TRUE, size = pbo_n_IFNlow, prob=c(1-0.423, 0.423))
pbo_IFNlow_data <- cbind(IFN_status=pbo_IFN_low,
OCS_status=pbo_OCS_status_low,
OCS_dose=pbo_OCS_dose_low,
SLEDAI=pbo_SLEDAI_low,
SRI_response=pbo_SRI_low)
pbo_IFNlow_data <- data.frame(pbo_IFNlow_data)
#set those off OCS to 0
pbo_IFNlow_data[pbo_IFNlow_data$OCS_status == 0,]['OCS_dose'] <- 0
#stratifcation factor for OCS dosage
pbo_IFNlow_data$OCS_lessthan10 <- "temp"
pbo_IFNlow_data[pbo_IFNlow_data$OCS_dose < 10, ]['OCS_lessthan10'] <- 1
pbo_IFNlow_data[pbo_IFNlow_data$OCS_dose >= 10, ]['OCS_lessthan10'] <- 0
#stratification factor for SLE score
pbo_IFNlow_data$SLE_lessthan10 <- "temp"
pbo_IFNlow_data[pbo_IFNlow_data$SLEDAI < 10, ]['SLE_lessthan10'] <- 1
pbo_IFNlow_data[pbo_IFNlow_data$SLEDAI >= 10, ]['SLE_lessthan10'] <- 0
It would be easier if we can have a minimal reproducible example. If I understand your question correctly, you may want to try ifelse statement in R?
df <- data.frame(colA = seq(1, 10), colB = seq(11, 20))
# Set certain row values in Col A to 0 based on values in Col B
df$colA <- ifelse(df$colB > 15, 0, df$colB)
# Create a new column with values of either 0
# or 1 based on the edited values in Col A
df$colC <- ifelse(df$colA == 0, 1, 0)
print(df)
## colA colB colC
## 1 11 11 0
## 2 12 12 0
## 3 13 13 0
## 4 14 14 0
## 5 15 15 0
## 6 0 16 1
## 7 0 17 1
## 8 0 18 1
## 9 0 19 1
## 10 0 20 1