R Function Query - r

Write a function called "query". It takes five inputs
"age_low": default value = 1
"age_high": default value = 100
"keyword": default value = ""
"rating_low": default value = 1
"rating_high": default value = 5
and returns a data frame which contains the rows of df where the reviewer's age is between "age_low" and "age_high" (inclusive),the review text contains "keyword" (case insensitive), and the rating is between "rating_low" and "rating_high" (inclusive).
Save the output of query(age_low=20, age_high=50) in df1
Save the output of query(rating_high=3) in df2
Save the output of query(age_low=20, age_high=50, keyword="price", rating_low=4, rating_high=5) in df3
query <- function(age_low=1, age_high=100, keyword="", rating_low=1, rating_high=5) {
return(df$Age>= age_low and df$Age<=age_high,
tolower(df$`Review Text` == tolower(keyword)),
df$Rating>=rating_low and df$Rating<=rating_high)
}
df1 <- query(age_low=20, age_high=50)
df2 <- query(rating_high=3)
df3 <- query(age_low=20, age_high=50, keyword="price", rating_low=4, rating_high=5)

Except for the criteria, you should also put df as an input for your function as a good habit.
query <- function(df=df, age_low=1, age_high=100, keyword="", rating_low=1, rating_high=5) {
# first filter
out_df <- df[df$Age>=age_low & df$Age<=age_high, ]
# second filter
out_df <- out_df[tolower(out_df$`Review Text`) == tolower(keyword), ]
# third filter
out_df <- out_df[out_df$Rating>=rating_low & out_df$Rating<=rating_high, ]
return(out_df)
}
df1 <- query(age_low=20, age_high=50)
df2 <- query(rating_high=3)
df3 <- query(age_low=20, age_high=50, keyword="price", rating_low=4, rating_high=5)

Related

Matching logical subscripts to the size of the indexed input

I have created two data frames that I then turn into lists (e.g., list1 and list2). I removed one element from list2 to better represent my example data set.
library(dplyr)
intervals <- rep_len(c("01-01-2022", "01-11-2022", "01-31-2022"), 100)
ID <- rep(c("A","B", "C"), 100)
df <- data.frame(ID = as.factor(ID),
intervals = as.factor(intervals))
list1 <- df %>%
group_by(ID, intervals) %>%
group_split()
intervals <- rep_len(c("01-01-2022", "01-11-2022", "01-31-2022"), 25)
ID <- rep(c("A","B"), 25)
df2 <- data.frame(ID = as.factor(ID),
intervals = as.factor(intervals))
list2 <- df2 %>%
group_by(ID, intervals) %>%
group_split()
list2 <- list2[-6]
For each of these list I have added an attribute, and I have included a function to check the added attribute more readily (check).
# Convenience function to grab the attributes for you
check <- function(list, attribute_name) {
return(attr(list, attribute_name))
}
# Add an attribute to hold the attributes of each list element
attr(list1, "match") <- data.frame(id = sapply(list1, function(x) paste(x$ID[1])),
interval_start_date = sapply(list1, function(x) paste(x$intervals[1]))
)
# Check the attributes
check(list1, "match")
# Add an attribute "tab" to hold the attributes of each list element
attr(list2, "match") <- data.frame(id = sapply(list2, function(x) paste(x$ID[1])),
interval_start_date = sapply(list2, function(x) paste(x$intervals[1]))
)
# Check the attributes
check(list2, "match")
I have created an index for the two list, and the objective here is to remove any list components that don't have the same ID and the same intervals. The goal is to have only the matching IDs with the same intervals.
# Creates an index for the two list based on the attributes,
dat2 <- check(list1, "match")
dat1 <- check(list2, "match")
# Removes rows where the id isn't present in both data frames, and creates a
# index where both the interval and id are the same.
if (!length(unique(dat2$id)) == length(unique(dat1$id))){
dat3 <- dat2[dat2$id %in% dat1$id, ]
dat4 <- dat1[dat1$id %in% dat2$id, ]
i1 <- paste(dat3[["id"]], format(as.Date(dat3[["interval_"]]),
"%Y-%d")) %in%
paste(dat4[["id"]], format(as.Date(dat4[["interval_"]]),
"%Y-%d"))
}
Now here is where I begin to get an error:
# Error occurs because the lengths of `i1` is not the same as `list2`
out <- list1[i1]
I know that this is occuring because list1 does not have the same length as i1. I'm wondering if there is a way to appending logical values to i1 to get it the same length as list1, but in a way that it doesn't remove values from list1 that we actually do want to keep. Any thoughts?
Here is my expected output for list1, where I hope it ends up with only the same IDs and intervals as list2.
# Expected output
expected_list1 <- list(list1[1], list1[2],list1[3], list1[4], list1[5])
This answer is close to what I would like, but it has an additional element. I think ultimately the attribute table should be similiar to that of dat4.
test <- list1[dat2$id %in% dat1$id][i1]
# Add an attribute "tab" to hold the attributes of each list element
attr(test, "match") <- data.frame(id = sapply(test, function(x) paste(x$ID[1])),
interval_start_date = sapply(test, function(x) paste(x$intervals[1]))
)
# Check the attributes
check(test, "match")
There was a mismatch in the column name i.e. it is not interval_, but interval_start_date in dat1 and dat2. [[ will look for exact match whereas $ can match partial names as well
if (!length(unique(dat2$id)) == length(unique(dat1$id))){
ids_common <- intersect(dat2$id, dat1$id)
inds1 <- dat2$id %in% ids_common
inds2 <- dat1$id %in% ids_common
i1 <- paste(dat2[["id"]], format(as.Date(dat2[["interval_start_date"]]),
"%Y-%d")) %in%
paste(dat1[["id"]], format(as.Date(dat1[["interval_start_date"]]),
"%Y-%d"))
out <- list1[i1 & inds1]
}
-checking
> length(out)
[1] 5
> i1
[1] TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE

Find Date in Data Frame in R

I'm trying to use a list of dates to find the same date in a data frame and using ifelse loop to add a new column. Here is the code:
library(lubridate)
df <- tibble(DateCol = seq(ymd("2021-01-01"),ymd("2021-05-31"),"days"))
df2 <- c("2021-02-04","2021-02-07","2021-02-17","2021-02-25")
for (i in 1:length(df$DateCol)) {
if (df$DateCol[i] == df2) {
df$ValueCol[i] <- "1"
} else {
df$ValueCol[i] <- "0"
return(i)
}
}
The final df I get only has the first date of df2 is 1 in df$ValueCol. Not sure how to make this loop work, seems like there are some mistakes in the df$DateCol[i] == df2 part.
You can do this without a loop.
df$ValueCol <- as.integer(df$DateCol %in% as.Date(df2))
df
as.integer is faster way than using ifelse with 1 and 0 as output.
df$ValueCol <- ifelse(df$DateCol %in% as.Date(df2), 1, 0)
Your loop can be corrected by using %in% instead of ==, as we use %in% to compare more than 1 value.
library(tibble)
library(lubridate)
df <- tibble(DateCol = seq(ymd("2021-01-01"),ymd("2021-05-31"),"days"))
df2 <- as.Date(c("2021-02-04","2021-02-07","2021-02-17","2021-02-25"))
df$ValueCol <- NA
for (i in 1:length(df$DateCol)) {
if (df$DateCol[i] %in% df2) {
df$ValueCol[i] <- 1
} else {
df$ValueCol[i] <- 0
}
}
Or you can try using dplyr
library(dplyr)
library(magrittr)
df <- tibble(DateCol = seq(ymd("2021-01-01"),ymd("2021-05-31"),"days"))
df2 <- ymd("2021-02-04","2021-02-07","2021-02-17","2021-02-25")
df3 <- df %>% mutate(ValueCol = if_else(DateCol %in% df2, 1, 0))

selecting list of columns from dataframe to convert to subset

I am trying to create a function , for that at the input i am giving a list of modifying columns .
for eg: sample data is
dataa<-data.frame(
aa = c("q","r","y","v","g","y","d","s","n","k","y","d","s","t","n","u","l","h","x","c","q","r","y","v","g","y","d","s","n","k","y","d","s","t","n","u","l","h","x","c"),
col1=c(1,2,3,2,1,2,3,4,4,4,5,3,4,2,1,2,5,3,2,1,2,4,2,1,3,2,1,2,3,1,2,2,4,4,4,1,2,5,3,5),
col2=c(2,1,1,7,4,1,2,7,5,7,2,6,2,2,6,3,4,3,2,5,7,5,6,4,4,6,5,6,4,1,7,3,2,7,7,2,3,7,2,4)
)
my requirement is like , i can create any more than one cuts like below, or can be a list of cuts
may be i am trying to recode my dataset
dataa$col3 <- ifelse(dataa$aa == "y",1,0)
dataa$col4 <- ifelse(dataa$col2 == 7,1,0)
so now in my function requirement i want a subset of selected variables for calculation.
for eg:
#i am applying my function like this
dat1 = dataa
var1 = "col1" # variable for which calculation will be done
grouping_var = list(dataa$col3,dataa$col4)
total_var= TRUE
#fun_1 <- function(dat1,var1,grouping_var,total_var){
total_col <- ifelse(total_var== TRUE,1,0)
var1 <- rlang::parse_expr(var1)
var2 <- dat1[unlist(grouping_var)] # i am trying to create a subset dataframe of selected grouping_var
#var2 <- data.frame(sapply(grouping_var,c)) # i have tried this too
dat1 <- dat1 %>% select(!!var1,!!var2)
# so after this line i would have a subset to calculations accordingly
var_lab(dat1[[1]]) <- ""
var_lab(dat1[[2]]) <- ""
tab1 <- expss::cro_cpct(total(),dat1[[1]],dat1[[2]])
tab1 <- as.data.frame(tab1)
#}
It is not possible to select columns based on the values of the column.
An easy way would be to pass them as character vector and select. Try :
dat1 = dataa
var1 = "col1"
grouping_var = c('col3', 'col4') #Passing columns as character vector
total_var= TRUE
#fun_1 <- function(dat1,var1,grouping_var,total_var){
total_col <- as.integer(total_var)
#total_col <- ifelse(total_var== TRUE,1,0)
dat1 <- dat1[grouping_var]
expss::var_lab(dat1[[1]]) <- ""
expss::var_lab(dat1[[2]]) <- ""
tab1 <- expss::cro_cpct(expss::total(),dat1[[1]],dat1[[2]])
tab1 <- as.data.frame(tab1)
#}

extrapolate data within the foreach loop

Take the following example, I create a data.frame df1.
For each iteration, we mix up the order of df1 and rename it df2
We then apply conditions to df2, that are:
when df2[1,1] == 1, then we want to subset df2 so that we remove the case of df2$B==125, and if df2[1,1] != 1, then make no action
as a second step when df2[1,1] == 3, then we want to subset df2 so that we remove the case of df2$B==108, and if df2[1,1] != 1, then make no action
But I dont know how to code this step. Can someone fill in the gaps below.
When we run the code, the output should sucessfully return values between 9 and 10.
require(doParallel)
set.seed(123)
A <- 1:10
B <- c(106,144,131,107,125,108,105,119,112,127)
df1 <- data.frame(A,B)
m <- 100
Sample = foreach(i=c(1:m)) %do%{
#shuffle order of data
df2 <- df1[sample(1:nrow(df1)), ]
df2
# when df2[1,1] == 1 then remove df2$B==125, otherwise leave df2 as is
# or
# when df2[1,1] == 3 then remove df2$B==108, otherwise leave df2 as is
length(df2$A)
}
Try this code:
require(doParallel)
set.seed(123)
A <- 1:10
B <- c(106,144,131,107,125,108,105,119,112,127)
df1 <- data.frame(A,B)
m <- 100
Sample = foreach(i=c(1:m)) %do%{
#shuffle order of data
df2 <- df1[sample(1:nrow(df1)), ]
df2
if (df2[1,1]==1) df2 <- df2[-which(df2$B==125),]
if (df2[1,1]==3) df2 <- df2[-which(df2$B==108),]
print(nrow(df2))
}

Predominant calculation for Character fields

I'm trying to loop through my column names where type = character and return one Data frame which contains all the predominant values of each character column, grouped by an ID field.
Is there a way to replicate the following code in some kind of loop?:
DF_Characters <- DF_Characters[,sapply(dfr,is.character)]
##Predominance Column1##
Predom <- select(DF_Characters, Group_ID, Column_1)
Predom <- group_by(Predom,Group_ID, Column_1)
Predom <- summarise(Predom,
CountPredom = n()
)
Predom <- arrange(Predom,Group_ID, desc(CountPredom) )
Predom <- data.table(Predom, key="Group_ID")
Predominant_Column_1 <- Predom[,head(.SD,1),by=Group_ID]
##Predominant Column_2##
Predom <- select(DF_Characters, Group_ID, Column_2)
Predom <- group_by(Predom,Group_ID, Column_2)
Predom <- summarise(Predom,
CountPredom = n()
)
Predom <- arrange(Predom,Group_ID, desc(CountPredom) )
Predom <- data.table(Predom, key="Group_ID")
Predominant_Column_2 <- Predom[,head(.SD,1),by=Group_ID]
##Merge final table##
Merged <- merge(Predominant_Column_1 ,Predominant_Column_2 ,by="Group_ID")
Also to clarify my question I added a dummy table:
DF_Character_table
Result shoul look like this
Result Table
So for Group 1 Petre was the predominant name in Column 1 and Car was the predominant mode of travel. Column 1 and Column 2 predominance should be calculated respectively.
Thank you
This is probably not the best solution but it works.
##########Predominant Calculations
#Character fields
DF_Characters <- as.data.frame(dfr)
DF_Characters <- DF_Characters[,sapply(dfr,is.character)]
# Field names without the Group by id
CharactersToMerge <- c(names(DF_Characters))
#Add Groupby ID to Character fields
Character_Field_List <- c("Groupby_ID", names(DF_Characters))
DF_Characters <- subset(dfr,select = Character_Field_List)
#Column Names to loop through
DF_FieldsToMerge <- subset(dfr,select = CharactersToMerge)
# Predominant Table
fin_table <- DF_Characters %>% group_by(Groupby_ID) %>%
tally(sort = TRUE) #Count observations
# Loop and merge tables to Predominant Table
for(i in names(DF_FieldsToMerge)){
temp_table <- DF_Characters %>% group_by_("Groupby_ID", i ) %>%
tally(sort = TRUE)
temp_table <- temp_table[,head(.SD,1),by=Groupby_ID] #Remove ties
temp_table <- subset(temp_table,select = c("Groupby_ID", i)) #remove counts
fin_table <- merge(fin_table, temp_table, by="Groupby_ID")
}

Resources