ERROR: syntax error at or near "settlement_date" - postgresql-9.1

ERROR: syntax error at or near "settlement_date"
LINE 4: if settlement_date > '2015-01-01'
^
********** Error **********
ERROR: syntax error at or near "settlement_date"
SQL state: 42601
Character: 50
update "Recon".ship_error
set
if settlement_date > '2015-01-01'
then
shipping_fee = case
when shipping_zone = 'NA' and order_status !='return_completed' then 0
when shipping_zone = 'NATIONAL' and total_weight <= 0.5 and order_status !='return_completed' then -55
when shipping_zone = 'LOCAL' and total_weight <= 0.5 and order_status !='return_completed' then -29.4
when shipping_zone = 'ZONAL' and total_weight <= 0.5 and order_status !='return_completed' then -55
when shipping_zone = 'NA' and order_status !='return_completed' then 0
when shipping_zone = 'NATIONAL' and total_weight >= 0.5 and total_weight <= 1 and order_status !='return_completed' then -55
when shipping_zone = 'LOCAL' and total_weight >= 0.5 and total_weight <=1 order_status !='return_completed' then -29.4
when shipping_zone = 'ZONAL' and total_weight >= 0.5 and total_weight <=1 order_status !='return_completed' then -55
end
end
if settlement_date <= '2015-01-01'
then
when shipping_zone = 'NA' and order_status !='return_completed' then 0
when shipping_zone = 'NATIONAL' and total_weight <= 0.5 and order_status !='return_completed' then -43.4
when shipping_zone = 'LOCAL' and total_weight <= 0.5 and order_status !='return_completed' then -24.3
when shipping_zone = 'ZONAL' and total_weight <= 0.5 and order_status !='return_completed' then -43.4
when shipping_zone = 'NA' and order_status !='return_completed' then 0
when shipping_zone = 'NATIONAL' and total_weight >= 0.5 and total_weight <= 1 and order_status !='return_completed' then -86.8
when shipping_zone = 'LOCAL' and total_weight >= 0.5 and total_weight <=1 order_status !='return_completed' then -58.3
when shipping_zone = 'ZONAL' and total_weight >= 0.5 and total_weight <=1 order_status !='return_completed' then -86.8
end
end
from "Recon".ship_error;
Or I also tried this code:
update "Recon".ship_error
set shipping_fee = case
when settlement_date > '2015-01-01'
then
--shipping_fee = case
when shipping_zone = 'NA' and order_status !='return_completed' then 0
when shipping_zone = 'NATIONAL' and total_weight <= 0.5 and order_status !='return_completed' then -55
when shipping_zone = 'LOCAL' and total_weight <= 0.5 and order_status !='return_completed' then -29.4
when shipping_zone = 'ZONAL' and total_weight <= 0.5 and order_status !='return_completed' then -55
when shipping_zone = 'NA' and order_status !='return_completed' then 0
when shipping_zone = 'NATIONAL' and total_weight >= 0.5 and total_weight <= 1 and order_status !='return_completed' then -55
when shipping_zone = 'LOCAL' and total_weight >= 0.5 and total_weight <=1 order_status !='return_completed' then -29.4
when shipping_zone = 'ZONAL' and total_weight >= 0.5 and total_weight <=1 order_status !='return_completed' then -55
end
end
when settlement_date <= '2015-01-01'
then
when shipping_zone = 'NA' and order_status !='return_completed' then 0
when shipping_zone = 'NATIONAL' and total_weight <= 0.5 and order_status !='return_completed' then -43.4
when shipping_zone = 'LOCAL' and total_weight <= 0.5 and order_status !='return_completed' then -24.3
when shipping_zone = 'ZONAL' and total_weight <= 0.5 and order_status !='return_completed' then -43.4
when shipping_zone = 'NA' and order_status !='return_completed' then 0
when shipping_zone = 'NATIONAL' and total_weight >= 0.5 and total_weight <= 1 and order_status !='return_completed' then -86.8
when shipping_zone = 'LOCAL' and total_weight >= 0.5 and total_weight <=1 order_status !='return_completed' then -58.3
when shipping_zone = 'ZONAL' and total_weight >= 0.5 and total_weight <=1 order_status !='return_completed' then -86.8
end
end
from "Recon".ship_error;

update "Recon".ship_error
set shipping_fee = case
--shipping_fee = case
when shipping_zone = 'NA' and order_status !='return_completed' then 0
when shipping_zone = 'NATIONAL' and total_weight <= 0.5 and settlement_date > '2015-01-01' and order_status !='return_completed' then -55
when shipping_zone = 'LOCAL' and total_weight <= 0.5 and settlement_date > '2015-01-01' and order_status !='return_completed' then -29.4
when shipping_zone = 'ZONAL' and total_weight <= 0.5 and settlement_date > '2015-01-01' and order_status !='return_completed' then -55
when shipping_zone = 'NA' and order_status !='return_completed' then 0
when shipping_zone = 'NATIONAL' and total_weight >= 0.5 and total_weight <= 1 and settlement_date > '2015-01-01' and order_status !='return_completed' then -55
when shipping_zone = 'LOCAL' and total_weight >= 0.5 and total_weight <=1 and settlement_date > '2015-01-01' and order_status !='return_completed' then -29.4
when shipping_zone = 'ZONAL' and total_weight >= 0.5 and total_weight <=1 and settlement_date > '2015-01-01' and order_status !='return_completed' then -55
when shipping_zone = 'NA' and order_status !='return_completed' then 0
when shipping_zone = 'NATIONAL' and total_weight <= 0.5 and settlement_date <= '2015-01-01' and order_status !='return_completed' then -43.4
when shipping_zone = 'LOCAL' and total_weight <= 0.5 and settlement_date <= '2015-01-01' and order_status !='return_completed' then -29.4
when shipping_zone = 'ZONAL' and total_weight <= 0.5 and settlement_date <= '2015-01-01' and order_status !='return_completed' then -43.4
when shipping_zone = 'NA' and order_status !='return_completed' then 0
when shipping_zone = 'NATIONAL' and total_weight >= 0.5 and total_weight <= 1 and settlement_date <= '2015-01-01' and order_status !='return_completed' then -86.8
when shipping_zone = 'LOCAL' and total_weight >= 0.5 and total_weight <=1 and settlement_date <= '2015-01-01' and order_status !='return_completed' then -58.8
when shipping_zone = 'ZONAL' and total_weight >= 0.5 and total_weight <=1 and settlement_date <= '2015-01-01' and order_status !='return_completed' then -86.8
end ;

You had a few errors in your query (a missing `AND` in several places, an extra `END`, etc.). Here is the corrected query:
-- Recalculate shipping_fee for orders that were not returned.
-- The rate depends on shipping_zone, on the total_weight tier (up to 0.5,
-- and above 0.5 up to 1) and on whether settlement_date falls after
-- 2015-01-01 or on/before it.
--
-- Rows that match no rule (returned orders, total_weight above 1 or NULL,
-- unknown zone, NULL settlement_date) keep their current shipping_fee.
-- Without the WHERE clause and the ELSE branches they would be overwritten
-- with NULL, because a CASE with no matching branch yields NULL.
UPDATE "Recon".ship_error
SET shipping_fee = CASE
        WHEN settlement_date > '2015-01-01' THEN
            CASE
                WHEN shipping_zone = 'NA' THEN 0
                -- first tier: total_weight up to 0.5
                WHEN shipping_zone = 'NATIONAL' AND total_weight <= 0.5 THEN -55
                WHEN shipping_zone = 'LOCAL'    AND total_weight <= 0.5 THEN -29.4
                WHEN shipping_zone = 'ZONAL'    AND total_weight <= 0.5 THEN -55
                -- second tier: total_weight above 0.5 and up to 1
                WHEN shipping_zone = 'NATIONAL' AND total_weight > 0.5 AND total_weight <= 1 THEN -55
                WHEN shipping_zone = 'LOCAL'    AND total_weight > 0.5 AND total_weight <= 1 THEN -29.4
                WHEN shipping_zone = 'ZONAL'    AND total_weight > 0.5 AND total_weight <= 1 THEN -55
                ELSE shipping_fee
            END
        WHEN settlement_date <= '2015-01-01' THEN
            CASE
                WHEN shipping_zone = 'NA' THEN 0
                -- first tier: total_weight up to 0.5
                WHEN shipping_zone = 'NATIONAL' AND total_weight <= 0.5 THEN -43.4
                WHEN shipping_zone = 'LOCAL'    AND total_weight <= 0.5 THEN -24.3
                WHEN shipping_zone = 'ZONAL'    AND total_weight <= 0.5 THEN -43.4
                -- second tier: total_weight above 0.5 and up to 1
                WHEN shipping_zone = 'NATIONAL' AND total_weight > 0.5 AND total_weight <= 1 THEN -86.8
                WHEN shipping_zone = 'LOCAL'    AND total_weight > 0.5 AND total_weight <= 1 THEN -58.3
                WHEN shipping_zone = 'ZONAL'    AND total_weight > 0.5 AND total_weight <= 1 THEN -86.8
                ELSE shipping_fee
            END
        ELSE shipping_fee
    END
WHERE order_status != 'return_completed';

Related

Error adding multiple columns to R dataframe

I have a dataframe dept_sales
Store 2010-02-19 2010-02-26 2010-03-05 2010-03-19 2010-05-14 2010-12-10
1 2 0.78 7.02 0.78 0.78 0 0.00
2 4 0.00 0.00 0.00 0.00 0 1.56
3 18 0.00 0.00 0.00 0.00 28 0.0
I am trying to add multiple columns to this dataframe all with value 0. I did this
dept_sales[, dropped_columns] = 0
where dropped_columns is just a list of dates:
[1] "2010-02-05" "2010-02-12" "2010-03-12" "2010-03-26" "2010-04-02" "2010-04-09" "2010-04-16" "2010-04-23" "2010-04-30" "2010-05-07" "2010-05-21"
[12] "2010-05-28" "2010-06-04" "2010-06-11" "2010-06-18" "2010-06-25" "2010-07-02" "2010-07-09" "2010-07-16" "2010-07-23" "2010-07-30" "2010-08-06"
[23] "2010-08-13" "2010-08-20" "2010-08-27" "2010-09-03" "2010-09-10" "2010-09-17" "2010-09-24" "2010-10-01" "2010-10-08" "2010-10-15" "2010-10-22"
[34] "2010-10-29" "2010-11-05" "2010-11-12" "2010-11-19" "2010-11-26" "2010-12-03" "2010-12-17" "2010-12-24" "2010-12-31" "2011-01-07" "2011-01-14"
[45] "2011-01-21" "2011-01-28" "2011-02-04" "2011-02-11" "2011-02-18" "2011-02-25"
which I get error
Error in `[<-.data.frame`(`*tmp*`, , dropped_columns, value = 0) :
new columns would leave holes after existing columns
We can mimic that error by going a bit extreme:
dept_sales[, 10000] <- 0
# Error in `[<-.data.frame`(`*tmp*`, , 10000, value = 0) :
# new columns would leave holes after existing columns
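It appears that your dropped_columns may be real Dates instead of strings. A Date is stored as a number of days (for example 14645 for 2010-02-05), so `[<-` treats the values as column positions far beyond the last existing column rather than as column names. Convert to strings first.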
dept_sales[,dropped_columns] <- 0
# Error in `[<-.data.frame`(`*tmp*`, , dropped_columns, value = 0) :
# new columns would leave holes after existing columns
dept_sales[,as.character(dropped_columns)] <- 0
dept_sales[,1:16] # just a subset of columns for demonstration here
# Store 2010-02-19 2010-02-26 2010-03-05 2010-03-19 2010-05-14 2010-12-10 2010-02-05 2010-02-12 2010-03-12 2010-03-26 2010-04-02 2010-04-09 2010-04-16 2010-04-23 2010-04-30
# 1 2 0.78 7.02 0.78 0.78 0 0.00 0 0 0 0 0 0 0 0 0
# 2 4 0.00 0.00 0.00 0.00 0 1.56 0 0 0 0 0 0 0 0 0
# 3 18 0.00 0.00 0.00 0.00 28 0.00 0 0 0 0 0 0 0 0 0
Data
dept_sales <- structure(list(Store = c(2L, 4L, 18L), "2010-02-19" = c(0.78, 0, 0), "2010-02-26" = c(7.02, 0, 0), "2010-03-05" = c(0.78, 0, 0), "2010-03-19" = c(0.78, 0, 0), "2010-05-14" = c(0L, 0L, 28L), "2010-12-10" = c(0, 1.56, 0)), class = "data.frame", row.names = c("1", "2", "3"))
dropped_columns <- structure(c(14645, 14652, 14680, 14694, 14701, 14708, 14715, 14722, 14729, 14736, 14750, 14757, 14764, 14771, 14778, 14785, 14792, 14799, 14806, 14813, 14820, 14827, 14834, 14841, 14848, 14855, 14862, 14869, 14876, 14883, 14890, 14897, 14904, 14911, 14918, 14925, 14932, 14939, 14946, 14960, 14967, 14974, 14981, 14988, 14995, 15002, 15009, 15016, 15023, 15030), class = "Date")

Summarize counts based on multiple conditions

I am trying to get a summary of my data based on combinations of two variables.
The following code used to work on the data:
df <- data_frame(fc = runif(1000, -5, 5),
padj = runif(1000, 0, 1))
df %>%
summarise(
dn_red = count(fc < -1.5, padj <= 0.1),
dn_pink = count(fc < -1.5, padj >= 0.1),
dn_blue = count(fc>-1.5 & fc< 0, padj <= 0.1),
dn_grey = count(fc>-1.5 & fc< 0, padj >= 0.1),
up_red = count(fc > 1.5, padj <= 0.1),
up_pink = count(fc > 1.5, padj >= 0.1),
up_blue = count(fc < 1.5 & fc > 0, padj <= 0.1),
up_grey = count(fc < 1.5 & fc > 0, padj >= 0.1)
)
Running it after a couple of months since writing it throws the following error:
Error: Problem with `summarise()` input `dn_red`.
x no applicable method for 'count' applied to an object of class "logical"
ℹ Input `dn_red` is `count(fc < -1.5, padj <= 0.1)`.
I can see that count outputs a tibble with logical vectors corresponding to the conditions. What I am trying to get out of it is a summary of the counts, where both the conditions are TRUE. The code above used to do just that...
You perhaps want sum instead of count!
set.seed(1)
df <- data.frame(fc = runif(1000, -5, 5),
padj = runif(1000, 0, 1))
df %>%
summarise(
dn_red = sum(fc < -1.5, padj <= 0.1),
dn_pink = sum(fc < -1.5, padj >= 0.1),
dn_blue = sum(fc>-1.5 & fc< 0, padj <= 0.1),
dn_grey = sum(fc>-1.5 & fc< 0, padj >= 0.1),
up_red = sum(fc > 1.5, padj <= 0.1),
up_pink = sum(fc > 1.5, padj >= 0.1),
up_blue = sum(fc < 1.5 & fc > 0, padj <= 0.1),
up_grey = sum(fc < 1.5 & fc > 0, padj >= 0.1)
)
dn_red dn_pink dn_blue dn_grey up_red up_pink up_blue up_grey
1 494 1250 269 1025 458 1214 267 1023
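But this creates overlaps: `sum(a, b)` adds the number of TRUE values in `a` to the number of TRUE values in `b` instead of counting the rows where both hold. So you need to replace the `,` within the logical conditions with either `&` or `|`, as the case may be: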
df %>%
summarise(
dn_red = sum(fc < -1.5 & padj <= 0.1),
dn_pink = sum(fc < -1.5 & padj >= 0.1),
dn_blue = sum(fc>-1.5 & fc< 0 & padj <= 0.1),
dn_grey = sum(fc>-1.5 & fc< 0 & padj >= 0.1),
up_red = sum(fc > 1.5 & padj <= 0.1),
up_pink = sum(fc > 1.5 & padj >= 0.1),
up_blue = sum(fc < 1.5 & fc > 0 & padj <= 0.1),
up_grey = sum(fc < 1.5 & fc > 0 & padj >= 0.1)
)
dn_red dn_pink dn_blue dn_grey up_red up_pink up_blue up_grey
1 44 328 20 127 40 296 18 127
If this is what you expected, then it is better to assign each of the 1000 data points to exactly one of the eight colour groups and count the groups. Use this code instead:
# Label every row with the first colour group whose condition it meets
# (rows meeting none become "others"), then count the rows per label.
df %>%
  mutate(
    new = case_when(
      fc < -1.5 & padj <= 0.1 ~ "dn_red",
      fc < -1.5 & padj >= 0.1 ~ "dn_pink",
      fc > -1.5 & fc < 0 & padj <= 0.1 ~ "dn_blue",
      fc > -1.5 & fc < 0 & padj >= 0.1 ~ "dn_grey",
      fc > 1.5 & padj <= 0.1 ~ "up_red",
      fc > 1.5 & padj >= 0.1 ~ "up_pink",
      fc < 1.5 & fc > 0 & padj <= 0.1 ~ "up_blue",
      fc < 1.5 & fc > 0 & padj >= 0.1 ~ "up_grey",
      TRUE ~ "others"
    )
  ) %>%
  count(new)
new n
1 dn_blue 20
2 dn_grey 127
3 dn_pink 328
4 dn_red 44
5 up_blue 18
6 up_grey 127
7 up_pink 296
8 up_red 40
or better use janitor to have a frequency count
# Label every row with the first colour group whose condition it meets
# (rows meeting none become "others"), then build a frequency table with
# a totals row.
df %>%
  mutate(
    new = case_when(
      fc < -1.5 & padj <= 0.1 ~ "dn_red",
      fc < -1.5 & padj >= 0.1 ~ "dn_pink",
      fc > -1.5 & fc < 0 & padj <= 0.1 ~ "dn_blue",
      fc > -1.5 & fc < 0 & padj >= 0.1 ~ "dn_grey",
      fc > 1.5 & padj <= 0.1 ~ "up_red",
      fc > 1.5 & padj >= 0.1 ~ "up_pink",
      fc < 1.5 & fc > 0 & padj <= 0.1 ~ "up_blue",
      fc < 1.5 & fc > 0 & padj >= 0.1 ~ "up_grey",
      TRUE ~ "others"
    )
  ) %>%
  janitor::tabyl(new) %>%
  janitor::adorn_totals()
new n percent
dn_blue 20 0.020
dn_grey 127 0.127
dn_pink 328 0.328
dn_red 44 0.044
up_blue 18 0.018
up_grey 127 0.127
up_pink 296 0.296
up_red 40 0.040
Total 1000 1.000

Select all values of a variables for which there is data for every year

Say I have some data with 2 numeric variables ranging from 0 to 1 (it1, it2), a name variable, which has the name of the subject the numeric variable belongs to and then some date for every measure, ranging from year 2014 to 2017. Now, what I want to do is create a data set that only contains measures of people that have values for every year of my measure, and then in the future maybe specify that I only want measures for people with data ranging from 2015 to 2017. Does anybody have any hint on what package or code could help me with my problem? Thanks in advance.
date <- c("2015-11-26", "2015-12-30","2016-11-13", "2014-09-22", "2014-01-13", "2014-07-26", "2016-11-26", "2016-04-04", "2017-04-09", "2017-02-23", "2015-03-22")
names <- c("Max", "Allen", "Allen", "Bob", "Max", "Sarah", "Max", "Sarah", "Max", "Sarah", "Sarah")
it1 <- c(0.6, 0.3, 0.1, 0.2, 0.3, 0.8, 0.8, 0.5, 0.5, 0.3, 0.7)
it2 <- c(0.5, 0.8, 0.1, 0.4, 0.4, 0.4, 0.5, 0.8, 0.6, 0.5, 0.4)
date <- as.Date(date, format = "%Y-%m-%d")
myframe <- data.frame(date, names, it1, it2)
Desired output:
date <- c("2015-11-26", "2014-01-13", "2014-07-26", "2016-11-26", "2016-04-04", "2017-04-09", "2017-02-23", "2015-03-22")
names <- c("Max", "Max", "Sarah", "Max", "Sarah", "Max", "Sarah", "Sarah")
it1 <- c(0.6, 0.3, 0.8, 0.8, 0.5, 0.5, 0.3, 0.7)
it2 <- c(0.5, 0.4, 0.4, 0.5, 0.8, 0.6, 0.5, 0.4)
date <- as.Date(date, format = "%Y-%m-%d")
myframe <- data.frame(date, names, it1, it2)
Create a table of year vs. name, then select the rows for those names that appear in every year. No packages are used.
# Cross-tabulate calendar year against name; a name is kept when it has at
# least one observation in every year that occurs in the data.
obs_year <- as.POSIXlt(myframe$date)$year + 1900
year_by_name <- table(obs_year, myframe$names)
in_every_year <- colSums(sign(year_by_name)) == nrow(year_by_name)
subset(myframe, names %in% colnames(year_by_name)[in_every_year])
giving:
date names it1 it2
1 2015-11-26 Max 0.6 0.5
5 2014-01-13 Max 0.3 0.4
6 2014-07-26 Sarah 0.8 0.4
7 2016-11-26 Max 0.8 0.5
8 2016-04-04 Sarah 0.5 0.8
9 2017-04-09 Max 0.5 0.6
10 2017-02-23 Sarah 0.3 0.5
11 2015-03-22 Sarah 0.7 0.4
library(lubridate)
# Restrict to the year window first, then build the keep-mask from that same
# subset. Indexing the unfiltered myframe with a mask computed on the subset
# would misalign (and silently recycle) as soon as the window drops a row,
# e.g. when it is narrowed to 2015-2017.
obs_year <- year(myframe$date)
in_window <- myframe[obs_year >= 2014 & obs_year <= 2017, ]
window_years <- year(in_window$date)
# For each name: 1 if every year present in the window occurs among that
# name's observations, otherwise 0.
has_every_year <- ave(
  window_years, in_window$names,
  FUN = function(x) all(window_years %in% x)
) == 1
in_window[has_every_year, ]
# date names it1 it2
#1 2015-11-26 Max 0.6 0.5
#5 2014-01-13 Max 0.3 0.4
#6 2014-07-26 Sarah 0.8 0.4
#7 2016-11-26 Max 0.8 0.5
#8 2016-04-04 Sarah 0.5 0.8
#9 2017-04-09 Max 0.5 0.6
#10 2017-02-23 Sarah 0.3 0.5
#11 2015-03-22 Sarah 0.7 0.4

How to plot 3D in R with multi-conditions

I have data set with 3 features as below:
V1 V2 V3
0.268 0.917 0.191
0.975 0.467 0.447
0.345 0.898 0.984
0.901 0.043 0.456
0.243 0.453 0.964
0.001 0.464 0.953
0.998 0.976 0.978
0.954 0.932 0.923
How to plot this data in 3D graphic based on the following conditions giving different colour for each condition.
(v1>=0.90 && v3>=0.90 && v3>=0.90) || (v1>=0.90 && v3< 0.50 && v3< 0.50) || (v1 < 0.50 && v3>=0.90 && v3< 0.50)|| (v1< 0.50 && v3< 0.50 && v3>=0.90)
I assumed the second statement in each condition is referring to V2, which makes more sense. To color the points according to which condition is met first you need to create a column with that value:
# Example data: eight points with three features each.
df <- data.frame(
  V1 = c(0.268, 0.975, 0.345, 0.901, 0.243, 0.001, 0.998, 0.954),
  V2 = c(0.917, 0.467, 0.898, 0.043, 0.453, 0.464, 0.976, 0.932),
  V3 = c(0.191, 0.447, 0.984, 0.456, 0.964, 0.953, 0.978, 0.923)
)

# Label each point with the first condition it satisfies ("1" to "4");
# points matching none of them fall into group "5".
df <- df %>%
  mutate(
    group = case_when(
      V1 >= 0.9 & V2 >= 0.9 & V3 >= 0.9 ~ "1",
      V1 >= 0.9 & V2 < 0.5 & V3 < 0.5 ~ "2",
      V1 < 0.5 & V2 >= 0.9 & V3 < 0.5 ~ "3",
      V1 < 0.5 & V2 < 0.5 & V3 >= 0.9 ~ "4",
      # TRUE rather than T: T is an ordinary variable that can be reassigned
      TRUE ~ "5"
    )
  )
Then we can use the plotly or scatterplot3d packages to build the graph:
scatterplot3d(x=df$V1,y=df$V2,z=df$V3,color=df$group)
plot_ly(x=df$V1,y=df$V2,z=df$V3,color = df$group)
You can start by creating a logical vector using the vectorized operators `&` and `|`:
# Create the logical vector: TRUE for rows that match any of the four
# patterns. The middle test of each pattern is applied to v2; testing v3
# twice makes patterns such as `v3 >= 0.90 & v3 < 0.50` impossible to
# satisfy, so they would never select a row.
ind <- (mat$v1 >= 0.90 & mat$v2 >= 0.90 & mat$v3 >= 0.90) |
  (mat$v1 >= 0.90 & mat$v2 < 0.50 & mat$v3 < 0.50) |
  (mat$v1 < 0.50 & mat$v2 >= 0.90 & mat$v3 < 0.50) |
  (mat$v1 < 0.50 & mat$v2 < 0.50 & mat$v3 >= 0.90)
And now one can plot it e.g. using the plotly
# plot
plotly::plot_ly(x = mat$v1[ind], y = mat$v2[ind], z = mat$v3[ind])
With the data
mat = structure(list(v1 = c(0.268, 0.975, 0.345, 0.901, 0.243, 0.001,
0.998, 0.954), v2 = c(0.917, 0.467, 0.898, 0.043, 0.453, 0.464,
0.976, 0.932), v3 = c(0.191, 0.447, 0.984, 0.456, 0.964, 0.953,
0.978, 0.923)), class = "data.frame", row.names = c(NA, -8L))

How to give value to new column based on the if condition from existing column values?

I need to achieve the following condition,
if column Avg_sales_greaterthan_7 == 'YES'
{
column Avg_sales_after_outliner_rejection == column Avg_cache_out
}
else if column Avg_sales_greaterthan_7 == 'NO'
{
column Avg_sales_after_outliner_rejection == column Avg_sales_for_3mon
}
Data set used: df_sales3
|Location_code| Avg_cache | Avg_sales_for_3mon | Avg_sales_greaterthan_7|Avg_cache_out|Avg_sales_after_outliner_rejection|
+-------------+------------------+---------------------+------------------------+-------------+----------------------------------+
| 1003| 752.0| 8.17| YES| 5.15| 5.15|
| 1010| 1906.0| 13.33| NO | 20.72| 13.33|
| 1014| 7965.0| 86.58| YES| 80.32| 80.32|
| 1031|3199.6400000000003| 34.78| YES| 30.88| 30.88|
| 1040|1690.5069999999998| 18.38| YES| 14.21| 14.21|
| 1047| 1000.0| 10.87| NO | 8.73| 10.87|
| 1061| 1133.0| 12.32| NO | 8.61| 12.32|
I used this sparkR code to achieve this condition:
df_1 <- filter(df_sales_3, df_sales_3$Avg_sales_greater_than_7 == "YES")
df_1$Avg_sales_after_outliner_rejection <- df_1$Avg_cache_out
df_2 <- filter(df_sales_3, df_sales_3$Avg_sales_greater_than_7 == "NO")
df_2$Avg_sales_after_outliner_rejection <- df_2$Avg_sales_for_3mon
df_sales_3 <- unionAll(df_1, df_2)
Is there a more efficient way to write this code, for example using functions?
You can use raw SQL and CASE WHEN expression:
df <- createDataFrame(sqlContext,
data.frame(foo=c(TRUE, FALSE, TRUE), x=c(1, 0, 3), y=c(-1, -3, -5)))
registerTempTable(df, "df")
head(sql(sqlContext, "SELECT *, CASE WHEN foo THEN x ELSE y END as bar FROM df"))
## foo x y bar
## 1 TRUE 1 -1 1
## 2 FALSE 0 -3 -3
## 3 TRUE 3 -5 3
Using when / otherwise functions like this:
otherwise(when(df$foo == TRUE, df$x), df$y)
should work as well, but it looks like this is broken in 1.5.
Using sqldf you could do this
library(sqldf)
sqldf("select * , case when col4 == 'YES' then col5 else col3 end new from data")
Using apply
# Row-wise choice between col5 and col3 depending on col4.
# apply() converts the data frame to a matrix first; because col4 is not
# numeric, each row reaches the function as a character vector, so the
# selected value is a string and as.numeric() turns it back into a number.
data$new = as.numeric(apply(data, 1,
function(x) if(x['col4'] == "YES") x['col5'] else x['col3']))
#> data
# col1 col2 col3 col4 col5 col6 new
#1 1003 752.000 8.17 YES 5.15 5.15 5.15
#2 1010 1906.000 13.33 NO 20.72 13.33 13.33
#3 1014 7965.000 86.58 YES 80.32 80.32 80.32
#4 1031 3199.640 34.78 YES 30.88 30.88 30.88
#5 1040 1690.507 18.38 YES 14.21 14.21 14.21
#6 1047 1000.000 10.87 NO 8.73 10.87 10.87
#7 1061 1133.000 12.32 NO 8.61 12.32 12.32
Using data.table you could do this
library(data.table)
# Add `new` by reference: col5 where col4 is "YES", col3 otherwise.
# fifelse() is vectorised, so the whole column is computed in one pass
# instead of evaluating a scalar `if` once per row via `by = 1:nrow(data)`.
# NOTE(review): fifelse() needs a reasonably recent data.table; confirm the
# installed version provides it.
setDT(data)[, new := fifelse(col4 == "YES", col5, col3)]
#> data
# col1 col2 col3 col4 col5 col6 new
#1: 1003 752.000 8.17 YES 5.15 5.15 5.15
#2: 1010 1906.000 13.33 NO 20.72 13.33 13.33
#3: 1014 7965.000 86.58 YES 80.32 80.32 80.32
#4: 1031 3199.640 34.78 YES 30.88 30.88 30.88
#5: 1040 1690.507 18.38 YES 14.21 14.21 14.21
#6: 1047 1000.000 10.87 NO 8.73 10.87 10.87
#7: 1061 1133.000 12.32 NO 8.61 12.32 12.32
sample data
data = structure(list(col1 = c(1003L, 1010L, 1014L, 1031L, 1040L, 1047L,
1061L), col2 = c(752, 1906, 7965, 3199.64, 1690.507, 1000, 1133
), col3 = c(8.17, 13.33, 86.58, 34.78, 18.38, 10.87, 12.32),
col4 = structure(c(2L, 1L, 2L, 2L, 2L, 1L, 1L), .Label = c("NO",
"YES"), class = "factor"), col5 = c(5.15, 20.72, 80.32, 30.88,
14.21, 8.73, 8.61), col6 = c(5.15, 13.33, 80.32, 30.88, 14.21,
10.87, 12.32)), .Names = c("col1", "col2", "col3", "col4",
"col5", "col6"), class = "data.frame", row.names = c(NA, -7L))
We can try this
# Rows flagged "YES" take their value from cache_out; all remaining rows
# take it from for_3mon.
temp <- df_sales3$greaterthan_7 == "YES"
df_sales3$after_outliner_rejection[temp] <-
  df_sales3$cache_out[temp]
df_sales3$after_outliner_rejection[!temp] <-
  df_sales3$for_3mon[!temp]
Note that I've modified the column names for the sake of clarity.

Resources