Improving a query that uses many COUNT(CASE WHEN ...) expressions - MariaDB

COUNT(CASE WHEN a = 'apple' THEN 1 END) AS '1',
COUNT(CASE WHEN a = 'orange' THEN 1 END) AS '2',
COUNT(CASE WHEN a = 'mango' THEN 1 END) AS '3',
COUNT(CASE WHEN a = 'melon' THEN 1 END) AS '4',
COUNT(CASE WHEN a = 'grape' THEN 1 END) AS '5',
COUNT(CASE WHEN a = 'lemon' THEN 1 END) AS '6',
COUNT(CASE WHEN a = 'watermelon' THEN 1 END) AS '7',
I count each category separately with a COUNT(CASE WHEN ...) expression instead of grouping, and the query performs so poorly that I don't know how to solve it.
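One common rewrite is to let GROUP BY do the counting in a single pass instead of evaluating one CASE per category. A minimal sketch, assuming a hypothetical table named fruit:

SELECT a, COUNT(*) AS cnt
FROM fruit   -- hypothetical table name
GROUP BY a;

This returns one row per category instead of one wide row, and an index on column a can often let MariaDB satisfy the query from the index alone.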

How can I check whether a group contains the correct number of observations in R?

I have a data set with monthly results for each site. I need to delete any sites that don't have at least one sample from each season.
An example of the data is below:
df <- data.frame(
  site   = c('D', 'D', 'D', 'D', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'A', 'B', 'B', 'B'),
  result = c('1', '2', '1.5', '3', '1.8', '7', '3.2', '4', '1', '1.1', '3', '3.3', '2', '5', '4'),
  season = c('w', 'sp', 'su', 'a', 'sp', 'sp', 'sp', 'su', 'a', 'a', 'w', 'w', 'sp', 'w', 's')
)
In this case, all the data for sites D and A would be retained, as they have at least one sample per season, but all the data for site B would be deleted.
I am struggling with the logic of how to do this and would appreciate some pointers, please. I am working in R. I think I need to group_by site, but I don't know what to do next.
library(dplyr)

df %>%
  group_by(site) %>%                        # for every site
  filter(length(unique(season)) == 4) %>%   # keep only sites with all four seasons
  ungroup()
Output:
# A tibble: 12 x 3
   site  result season
   <chr> <chr>  <chr>
 1 D     1      w
 2 D     2      sp
 3 D     1.5    su
 4 D     3      a
 5 A     1.8    sp
 6 A     7      sp
 7 A     3.2    sp
 8 A     4      su
 9 A     1      a
10 A     1.1    a
11 A     3      w
12 A     3.3    w
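As a side note, dplyr also ships n_distinct(), which expresses the same test a little more idiomatically; this is equivalent to the filter above:

df %>%
  group_by(site) %>%
  filter(n_distinct(season) == 4) %>%   # n_distinct(x) is length(unique(x))
  ungroup()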

Mix values from 2 dataframes in a mutate function with conditions

I have two data frames with these forms:
DF1 <- data.frame(
  idCarte = c('a', 'a', 'b', 'b', 'b'),
  idPack  = c('1', '2', '2', '3', '3'),
  timeIn  = c('10:00:02', '12:00:50', '11:40:00', '12:10:35', '15:15:00'),
  timeOut = c('12:00:00', '14:00:00', '11:50:00', '15:00:00', '16:00:00')
)
DF1
idCarte idPack timeIn   timeOut
a       1      10:00:02 12:00:00
a       2      12:00:50 14:00:00
b       2      11:40:00 11:50:00
b       3      12:10:35 15:00:00
b       3      15:15:00 16:00:00
DF2 <- data.frame(
  idCarte     = c('a', 'a', 'b', 'b', 'b'),
  idPack      = c('1', '2', '3', '3', '2'),
  timeBetween = c('11:00:02', '13:00:50', '14:10:35', '15:20:00', '18:00:00')
)
DF2
idCarte idPack timeBetween
a       1      11:00:02
a       2      13:00:50
b       3      14:10:35
b       3      15:20:00
b       2      18:00:00
And I want to get this result:
idCarte idPack timeIn   timeOut  timeBetween
a       1      10:00:02 12:00:00 11:00:02
a       2      12:00:50 14:00:00 13:00:50
b       2      11:40:00 11:50:00 NA
b       3      12:10:35 15:00:00 14:10:35
b       3      15:15:00 16:00:00 15:20:00
I can do it with a for loop like this, but it's really slow:
for (i in 1:nrow(DF1)) {
  timeBetweenLocal <- DF2 %>%
    filter(
      idCarte == DF1[i, "idCarte"] &
      idPack == DF1[i, "idPack"] &
      timeBetween >= DF1[i, "timeIn"] &
      timeBetween <= DF1[i, "timeOut"]
    )
  if (nrow(timeBetweenLocal) > 0) {
    DF1[i, "timeBetween"] <- timeBetweenLocal[1, "timeBetween"]
  } else {
    DF1[i, "timeBetween"] <- NA
  }
}
I want to do it in a vectorised way with dplyr::mutate to make it faster, but it seems a bit tricky.
DF1 %>%
  mutate(
    timeBetween = ifelse(
      nrow(DF2 %>%
        dplyr::filter(
          idCarte == .$idCarte &
          idPack == .$idPack &
          timeBetween >= .$timeIn &
          timeBetween <= .$timeOut
        )
      ) > 0,
      DF2 %>%
        dplyr::filter(
          idCarte == .$idCarte &
          idPack == .$idPack &
          timeBetween >= .$timeIn &
          timeBetween <= .$timeOut
        ),
      NA
    )
  )
# Error : Result must have length 4, not 0
My problem is that I need to test the matching time, because there are multiple (idCarte, idPack) pairs.
Does anyone have an idea how to vectorise this algorithm?
Thanks
Here is a solution with left_join and case_when. A left_join can produce duplicated rows; you can use na.omit or filter(!duplicated(...)) if you want to remove some of the duplicates.
library(lubridate)
library(dplyr)

# Your data
DF1 <- data.frame(stringsAsFactors = FALSE,
  idCarte = c('a', 'a', 'b', 'b', 'b'),
  idPack  = c('1', '2', '2', '3', '3'),
  timeIn  = c('10:00:02', '12:00:50', '11:40:00', '12:10:35', '15:15:00'),
  timeOut = c('12:00:00', '14:00:00', '11:50:00', '15:00:00', '16:00:00')
)
DF2 <- data.frame(stringsAsFactors = FALSE,
  idCarte     = c('a', 'a', 'b', 'b', 'b'),
  idPack      = c('1', '2', '3', '3', '2'),
  timeBetween = c('11:00:02', '13:00:50', '14:10:35', '15:20:00', '18:00:00')
)

# Solution with left_join; this can lead to duplicated rows
df <- left_join(x = DF1, y = DF2, by = c("idCarte" = "idCarte", "idPack" = "idPack")) %>%
  mutate(timeBetween = case_when(
    hms(timeBetween) >= hms(timeIn) & hms(timeBetween) <= hms(timeOut) ~ timeBetween,
    TRUE ~ NA_character_
  ))
# The output
#
# idCarte idPack timeIn timeOut timeBetween
# 1 a 1 10:00:02 12:00:00 11:00:02
# 2 a 2 12:00:50 14:00:00 13:00:50
# 3 b 2 11:40:00 11:50:00 <NA>
# 4 b 3 12:10:35 15:00:00 14:10:35
# 5 b 3 12:10:35 15:00:00 <NA>
# 6 b 3 15:15:00 16:00:00 <NA>
# 7 b 3 15:15:00 16:00:00 15:20:00
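A sketch of the deduplication step alluded to above; the convention of keeping the non-NA match for each original DF1 row is my own, not part of the original answer:

df %>%
  arrange(idCarte, idPack, timeIn, is.na(timeBetween)) %>%  # non-NA matches sort first
  filter(!duplicated(paste(idCarte, idPack, timeIn)))       # keep one row per DF1 row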
Here is the dplyr solution as mentioned in the comments:
library(dplyr)
library(lubridate)

DF1 %>%
  left_join(DF2) %>%
  mutate(timeIn      = as_datetime(hms(timeIn)),
         timeOut     = as_datetime(hms(timeOut)),
         timeBetween = as_datetime(hms(timeBetween))) %>%
  filter(timeBetween > timeIn & timeBetween < timeOut | is.na(timeBetween))
#Joining, by = c("idCarte", "idPack")
# idCarte idPack timeIn timeOut timeBetween
#1 a 1 1970-01-01 10:00:02 1970-01-01 12:00:00 1970-01-01 11:00:02
#2 a 2 1970-01-01 12:00:50 1970-01-01 14:00:00 1970-01-01 13:00:50
#3 b 2 1970-01-01 11:40:00 1970-01-01 11:50:00 <NA>
#4 b 3 1970-01-01 12:10:35 1970-01-01 15:00:00 1970-01-01 14:10:35
To check for matches in the first two columns, we may use outer. For multiple matches, we want to check whether the time falls between timeIn and timeOut; therefore it's advantageous to convert the times into POSIXct format.
DF1[3:4] <- lapply(DF1[3:4], as.POSIXct, format="%H:%M:%S")
DF2[3] <- as.POSIXct(DF2[[3]], format="%H:%M:%S")
For the outer call, we first define a convenience function that pastes the key columns together.
rp <- function(x) Reduce(paste, x)
Now, using outer, we create a list w of indices indicating which rows of the first two columns of both data frames match.
w <- apply(outer(rp(DF1[1:2]), rp(DF2[1:2]), `==`), 1, which)
Look at the lapply(...) in the following line: for each list entry of w we either return NA if it is empty, or choose the entry that falls within the time frame given by DF1. Empty results are again turned into NA. The do.call("c", ...) concatenates the resulting list into a vector that we can cbind to DF1.
res <- cbind(DF1, timeBetween = do.call("c", lapply(seq(w), function(i) {
  r <- DF2[w[[i]], 3]
  if (length(r) == 0) r <- NA
  else r <- r[r > DF1[i, 3] & r < DF1[i, 4]]
  if (length(r) == 0) r <- NA
  return(r)
})))
Optionally, we can strip off the dates at the end.
res[3:5] <- lapply(res[3:5], strftime, format="%H:%M:%S")
Result
res
# idCarte idPack timeIn timeOut timeBetween
# 1 a 1 10:00:02 12:00:00 11:00:02
# 2 a 2 12:00:50 14:00:00 13:00:50
# 3 b 2 11:40:00 11:50:00 <NA>
# 4 b 3 12:10:35 15:00:00 14:10:35
# 5 b 3 15:15:00 16:00:00 15:20:00
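For completeness, this kind of interval matching can also be expressed directly as a non-equi join. A minimal sketch with data.table, assuming that package is available and reusing the POSIXct columns created above:

library(data.table)

dt1 <- as.data.table(DF1)   # timeIn / timeOut already POSIXct
dt2 <- as.data.table(DF2)   # timeBetween already POSIXct

# Left non-equi join: for each DF1 row, find the DF2 times falling
# inside [timeIn, timeOut]; rows without a match get NA.
res <- dt2[dt1,
           on = .(idCarte, idPack, timeBetween >= timeIn, timeBetween <= timeOut),
           .(idCarte, idPack, timeIn = i.timeIn, timeOut = i.timeOut,
             timeBetween = x.timeBetween)]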

Converting single-column data into multiple columns in PL/SQL

Item    LOCATION
R11565  D11
R11565  D12
R11565  D14
R11565  D15
I want the output as:
Item    Location1 Location2 Location3 Location4
R11565  D11       D12       D14       D15
You could use pivoting logic here with the help of ROW_NUMBER():
WITH cte AS (
    SELECT Item, LOCATION,
           ROW_NUMBER() OVER (PARTITION BY Item ORDER BY LOCATION) rn
    FROM yourTable
)
SELECT
    Item,
    MAX(CASE WHEN rn = 1 THEN LOCATION END) AS Location1,
    MAX(CASE WHEN rn = 2 THEN LOCATION END) AS Location2,
    MAX(CASE WHEN rn = 3 THEN LOCATION END) AS Location3,
    MAX(CASE WHEN rn = 4 THEN LOCATION END) AS Location4
FROM cte
GROUP BY Item;
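Since the question mentions PL/SQL, the target is presumably Oracle; on Oracle 11g and later, the built-in PIVOT clause expresses the same reshaping. A sketch against the same hypothetical yourTable:

SELECT *
FROM (
    SELECT Item, LOCATION,
           ROW_NUMBER() OVER (PARTITION BY Item ORDER BY LOCATION) rn
    FROM yourTable
)
PIVOT (
    MAX(LOCATION)
    FOR rn IN (1 AS Location1, 2 AS Location2, 3 AS Location3, 4 AS Location4)
);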

R - How to one-hot encode a single column while keeping the other columns intact?

I have a data frame like this:
group student exam_passed subject
A     01      Y           Math
A     01      N           Science
A     01      Y           Japanese
A     02      N           Math
A     02      Y           Science
B     01      Y           Japanese
C     02      N           Math
What I would like to achieve is the result below:
group student exam_passed subject_Math subject_Science subject_Japanese
A     01      Y           1            0               0
A     01      N           0            1               0
A     01      Y           0            0               1
A     02      N           1            0               0
A     02      Y           0            1               0
B     01      Y           0            0               1
C     02      N           1            0               0
Here is the test data frame:
df <- data.frame(
  group     = c('A', 'A', 'A', 'A', 'A', 'B', 'C'),
  student   = c('01', '01', '01', '02', '02', '01', '02'),
  exam_pass = c('Y', 'N', 'Y', 'N', 'Y', 'Y', 'N'),
  subject   = c('Math', 'Science', 'Japanese', 'Math', 'Science', 'Japanese', 'Math')
)
I have tried a for loop; however, the original data is too large for that, and
mltools::one_hot(df, col = 'subject')
doesn't work either because of this error:
Error in `[.data.frame`(dt, , cols, with = FALSE) :
unused argument (with = FALSE)
Could anyone help me with this? Thanks!
require(tidyr)
require(dplyr)
df %>% mutate(value = 1) %>% spread(subject, value, fill = 0)
group student exam_pass Japanese Math Science
1 A 01 N 0 0 1
2 A 01 Y 1 1 0
3 A 02 N 0 1 0
4 A 02 Y 0 0 1
5 B 01 Y 1 0 0
6 C 02 N 0 1 0
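Note that spread() has since been superseded; in tidyr 1.0+ the equivalent call, sketched here with a names_prefix to produce the subject_ column names the question asks for, would be:

library(dplyr)
library(tidyr)

df %>%
  mutate(value = 1) %>%
  pivot_wider(names_from = subject, values_from = value,
              values_fill = 0, names_prefix = "subject_")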
Another option:
library(dplyr)

df %>%
  mutate(subject_Math     = ifelse(subject == 'Math', 1, 0),
         subject_Science  = ifelse(subject == 'Science', 1, 0),
         subject_Japanese = ifelse(subject == 'Japanese', 1, 0))
You can do this with the arcanely-named contrasts function.
Relevant section of the docs:
if contrasts = FALSE an identity matrix is returned.
So here's a basic implementation:
encode_onehot <- function(x, colname_prefix = "", colname_suffix = "") {
  if (!is.factor(x)) {
    x <- as.factor(x)
  }
  # identity matrix with one row/column per factor level
  encoding_matrix <- contrasts(x, contrasts = FALSE)
  # pick the row matching each observation's level (note the row index and comma)
  encoded_data <- encoding_matrix[as.integer(x), , drop = FALSE]
  colnames(encoded_data) <- paste0(colname_prefix, colnames(encoded_data), colname_suffix)
  encoded_data
}
df <- cbind(df, encode_onehot(df$subject, "subject_"))
This is fairly generic, has no dependencies on other libraries, and should be reasonably fast except on very large datasets.
Here is a more generic solution using the data.table library and caret:
library(caret)
library(data.table)

dt <- data.table(
  group     = c('A', 'A', 'A', 'A', 'A', 'B', 'C'),
  student   = c('01', '01', '01', '02', '02', '01', '02'),
  exam_pass = c('Y', 'N', 'Y', 'N', 'Y', 'Y', 'N'),
  subject   = c('Math', 'Science', 'Japanese', 'Math', 'Science', 'Japanese', 'Math')
)
vars <- 'subject'
separator <- '_'
bin_vars <- predict(dummyVars(as.formula(paste0("~", paste0(vars, collapse = "+"))),
                              data = dt, na.action = na.pass),
                    newdata = dt)
colnames(bin_vars) <- paste0(gsub(vars, paste0(vars, separator), colnames(bin_vars)))
dt[, (vars) := NULL]   # parentheses so data.table removes the columns named in vars
dt <- cbind(dt, bin_vars)
You can take advantage of R coercing logical values to integers.
Something like this:
new.data <- cbind(
  old.data,
  math = as.integer(old.data$subject == "Math")
)
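A base R alternative worth knowing is model.matrix(), which builds all the indicator columns in one call; note the names come out as subjectMath etc. rather than subject_Math:

# "- 1" removes the intercept so every level gets its own 0/1 column
df <- cbind(df, model.matrix(~ subject - 1, data = df))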

Shifting values in a column onto the next Date (in R)

I have a data frame like:
df <- data.frame(
  id      = c('1', '2', '3', '4', '5', '6', '7', '8', '9', '10'),
  Date    = c("01-Feb-17", "05-Feb-17", "01-May-17", "03-May-17", "24-May-17",
              "05-Oct-17", "20-Oct-17", "25-Oct-17", "01-Dec-17", "12-Dec-17"),
  Name    = c("John", "Jack", "Jack", "John", "John", "Jack", "John", "Jack", "John", "Jack"),
  Workout = c('150', '130', '140', '160', '150', '130', '140', '160', '150', '130')
)
Now I want to shift the values in the Workout column, for every Name, onto the next Date.
For example, 150 moves from 01-Feb-17 (John) to 03-May-17 (John), etc. The same applies to "Jack".
df <- data.frame(
  id      = c('1', '2', '3', '4', '5', '6', '7', '8', '9', '10'),
  Date    = c("01-Feb-17", "05-Feb-17", "01-May-17", "03-May-17", "24-May-17",
              "05-Oct-2017", "20-Oct-17", "25-Oct-17", "01-Dec-2017", "12-Dec-2017"),
  Name    = c("John", "Jack", "Jack", "John", "John", "Jack", "John", "Jack", "John", "Jack"),
  Workout = c('150', '130', '140', '160', '150', '130', '140', '160', '150', '130')
)
library(dplyr)

df %>%
  group_by(Name) %>%                    # for every name
  mutate(Workout = lag(Workout)) %>%    # replace each value with the previous one
  ungroup()                             # forget the grouping
# # A tibble: 10 x 4
# id Date Name Workout
# <fct> <fct> <fct> <fct>
# 1 1 01-Feb-17 John NA
# 2 2 05-Feb-17 Jack NA
# 3 3 01-May-17 Jack 130
# 4 4 03-May-17 John 150
# 5 5 24-May-17 John 160
# 6 6 05-Oct-2017 Jack 140
# 7 7 20-Oct-17 John 150
# 8 8 25-Oct-17 Jack 130
# 9 9 01-Dec-2017 John 140
#10 10 12-Dec-2017 Jack 160
I assume your dataset is ordered by Date, as in your example. If not, you can order it using the arrange function.
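A sketch of that ordering step, assuming the Date strings are day-month-year (lubridate's dmy() parses both the -17 and -2017 forms):

library(dplyr)
library(lubridate)

df %>%
  arrange(dmy(Date)) %>%                # parse "01-Feb-17" style strings, sort chronologically
  group_by(Name) %>%
  mutate(Workout = lag(Workout)) %>%
  ungroup()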
