Predominant calculation for Character fields

Predominant calculation for Character fields - r

I'm trying to loop through my column names where type = character and return one Data frame which contains all the predominant values of each character column, grouped by an ID field.
Is there a way to replicate the following code in some kind of loop?:
# Keep only the columns flagged by sapply(dfr, is.character).
# NOTE(review): the mask is computed on `dfr` but applied to `DF_Characters`;
# presumably both have the same columns -- confirm.
DF_Characters <- DF_Characters[,sapply(dfr,is.character)]
##Predominance Column1##
# Keep the grouping key and the column being analysed.
Predom <- select(DF_Characters, Group_ID, Column_1)
# One group per (Group_ID, Column_1 value) pair ...
Predom <- group_by(Predom,Group_ID, Column_1)
# ... and count the rows in each of those groups.
Predom <- summarise(Predom,
CountPredom = n()
)
# Within each Group_ID, put the most frequent value first.
Predom <- arrange(Predom,Group_ID, desc(CountPredom) )
# Convert to a data.table keyed on Group_ID so the `[, j, by]` form can be used.
Predom <- data.table(Predom, key="Group_ID")
# Keep the first row per Group_ID (the highest count after the sort above).
Predominant_Column_1 <- Predom[,head(.SD,1),by=Group_ID]
##Predominant Column_2##
# Same steps as for Column_1, applied to Column_2.
Predom <- select(DF_Characters, Group_ID, Column_2)
Predom <- group_by(Predom,Group_ID, Column_2)
Predom <- summarise(Predom,
CountPredom = n()
)
Predom <- arrange(Predom,Group_ID, desc(CountPredom) )
Predom <- data.table(Predom, key="Group_ID")
Predominant_Column_2 <- Predom[,head(.SD,1),by=Group_ID]
##Merge final table##
# Join the two per-column results on Group_ID.
Merged <- merge(Predominant_Column_1 ,Predominant_Column_2 ,by="Group_ID")
Also to clarify my question I added a dummy table:
DF_Character_table
The result should look like this:
Result Table
So for Group 1 Petre was the predominant name in Column 1 and Car was the predominant mode of travel. Column 1 and Column 2 predominance should be calculated respectively.
Thank you

This is probably not the best solution but it works.
########## Predominant Calculations
# For every character column of `dfr`, find the most frequent value within
# each Groupby_ID and collect the results in one table (`fin_table`).

# Field names without the Group by id.
# names() + vapply() keeps working when only one column is character (where
# `df[, mask]` would collapse to a bare vector), and setdiff() stops a
# character Groupby_ID from being listed twice.
CharactersToMerge <- setdiff(
  names(dfr)[vapply(dfr, is.character, logical(1))],
  "Groupby_ID"
)
# Add Groupby ID to Character fields
Character_Field_List <- c("Groupby_ID", CharactersToMerge)
DF_Characters <- subset(dfr, select = Character_Field_List)
# Column Names to loop through
DF_FieldsToMerge <- subset(dfr, select = CharactersToMerge)
# Predominant Table: start with the number of observations per group
fin_table <- DF_Characters %>%
  group_by(Groupby_ID) %>%
  tally(sort = TRUE)
# Loop and merge tables to Predominant Table
for (i in names(DF_FieldsToMerge)) {
  temp_table <- DF_Characters %>%
    # group on the key plus the current column, given by name
    group_by(across(all_of(c("Groupby_ID", i)))) %>%
    # tally() drops the last grouping level, so the result is still grouped
    # by Groupby_ID and sorted by descending count
    tally(sort = TRUE) %>%
    # keep the top row per group; ties are resolved by taking the first one
    slice(1) %>%
    ungroup() %>%
    # remove counts
    select(all_of(c("Groupby_ID", i)))
  fin_table <- merge(fin_table, temp_table, by = "Groupby_ID")
}

Related

R Function Query

Write a function called "query". It takes five inputs
"age_low": default value = 1
"age_high": default value = 100
"keyword": default value = ""
"rating_low": default value = 1
"rating_high": default value = 5
and returns a data frame which contains the rows of df where the reviewer's age is between "age_low" and "age_high" (inclusive),the review text contains "keyword" (case insensitive), and the rating is between "rating_low" and "rating_high" (inclusive).
Save the output of query(age_low=20, age_high=50) in df1
Save the output of query(rating_high=3) in df2
Save the output of query(age_low=20, age_high=50, keyword="price", rating_low=4, rating_high=5) in df3
# Attempted filter of the global `df` by age range, keyword and rating range.
# NOTE(review): `and` is not an R operator (element-wise AND is `&`), and
# return() accepts a single value, so this version does not run as written.
# NOTE(review): on the second line tolower() wraps the whole `==` comparison,
# so the review text itself is never lower-cased.
query <- function(age_low=1, age_high=100, keyword="", rating_low=1, rating_high=5) {
return(df$Age>= age_low and df$Age<=age_high,
tolower(df$`Review Text` == tolower(keyword)),
df$Rating>=rating_low and df$Rating<=rating_high)
}
df1 <- query(age_low=20, age_high=50)
df2 <- query(rating_high=3)
df3 <- query(age_low=20, age_high=50, keyword="price", rating_low=4, rating_high=5)

In addition to the filter criteria, you should also pass df as an argument to your function as a good habit.
#' Filter reviews by reviewer age, keyword and rating
#'
#' @param df Data frame with columns `Age`, `Rating` and `Review Text`.
#'   When `NULL` (the default), the data frame named `df` visible from the
#'   caller is used, so `query(age_low = 20)` keeps working.
#' @param age_low,age_high Inclusive bounds on `Age`.
#' @param keyword Text that `Review Text` must contain (case insensitive,
#'   matched literally). The empty string disables the keyword filter.
#' @param rating_low,rating_high Inclusive bounds on `Rating`.
#' @return The rows of `df` that satisfy all three criteria.
query <- function(df = NULL, age_low = 1, age_high = 100, keyword = "",
                  rating_low = 1, rating_high = 5) {
  if (is.null(df)) {
    # A `df = df` default refers to itself and fails with "promise already
    # under evaluation". mode = "list" skips the function stats::df().
    df <- get("df", envir = parent.frame(), mode = "list")
  }
  keep <- df$Age >= age_low & df$Age <= age_high &
    df$Rating >= rating_low & df$Rating <= rating_high
  if (nzchar(keyword)) {
    # "contains", not "equals": literal, case-insensitive substring match
    keep <- keep &
      grepl(tolower(keyword), tolower(df$`Review Text`), fixed = TRUE)
  }
  # which() drops NA comparisons instead of producing all-NA rows
  df[which(keep), , drop = FALSE]
}
df1 <- query(age_low=20, age_high=50)
df2 <- query(rating_high=3)
df3 <- query(age_low=20, age_high=50, keyword="price", rating_low=4, rating_high=5)

selecting list of columns from dataframe to convert to subset

I am trying to create a function that takes a list of columns to modify as one of its inputs.
for eg: sample data is
dataa<-data.frame(
aa = c("q","r","y","v","g","y","d","s","n","k","y","d","s","t","n","u","l","h","x","c","q","r","y","v","g","y","d","s","n","k","y","d","s","t","n","u","l","h","x","c"),
col1=c(1,2,3,2,1,2,3,4,4,4,5,3,4,2,1,2,5,3,2,1,2,4,2,1,3,2,1,2,3,1,2,2,4,4,4,1,2,5,3,5),
col2=c(2,1,1,7,4,1,2,7,5,7,2,6,2,2,6,3,4,3,2,5,7,5,6,4,4,6,5,6,4,1,7,3,2,7,7,2,3,7,2,4)
)
My requirement is that I can create one or more cuts like the ones below, or pass in a list of cuts;
essentially, I am trying to recode my dataset:
dataa$col3 <- ifelse(dataa$aa == "y",1,0)
dataa$col4 <- ifelse(dataa$col2 == 7,1,0)
So now, in my function, I want a subset of the selected variables for the calculation.
for eg:
#i am applying my function like this
dat1 = dataa
var1 = "col1" # variable for which calculation will be done
grouping_var = list(dataa$col3,dataa$col4)
total_var= TRUE
#fun_1 <- function(dat1,var1,grouping_var,total_var){
total_col <- ifelse(total_var== TRUE,1,0)
var1 <- rlang::parse_expr(var1)
var2 <- dat1[unlist(grouping_var)] # i am trying to create a subset dataframe of selected grouping_var
#var2 <- data.frame(sapply(grouping_var,c)) # i have tried this too
dat1 <- dat1 %>% select(!!var1,!!var2)
# so after this line i would have a subset to calculations accordingly
var_lab(dat1[[1]]) <- ""
var_lab(dat1[[2]]) <- ""
tab1 <- expss::cro_cpct(total(),dat1[[1]],dat1[[2]])
tab1 <- as.data.frame(tab1)
#}

It is not possible to select columns based on the values of the column.
An easy way would be to pass them as character vector and select. Try :
# Inputs that would become the arguments of fun_1()
dat1 <- dataa
var1 <- "col1"
# Grouping columns are passed by name, as a character vector
grouping_var <- c("col3", "col4")
total_var <- TRUE
# fun_1 <- function(dat1, var1, grouping_var, total_var) {
# TRUE/FALSE -> 1/0
total_col <- as.integer(total_var)
# previously: total_col <- ifelse(total_var == TRUE, 1, 0)
# Keep only the grouping columns
dat1 <- dat1[grouping_var]
# Blank out the variable labels of both columns
expss::var_lab(dat1[[1]]) <- ""
expss::var_lab(dat1[[2]]) <- ""
# Cross-tabulation with column percentages, returned as a plain data frame
tab1 <- as.data.frame(
  expss::cro_cpct(expss::total(), dat1[[1]], dat1[[2]])
)
# }

For Loop to Reorder Columns in R

I am trying to re-order columns in R using a for loop since the column range needs to be dynamic. Does anyone know what is missing from my code?
Group <- c("A","B","C","D")
Attrib1 <- c("x","y","x","z")
Attrib2 <- c("q","w","u","i")
Day1A <- c(5,4,6,3)
Day2A <- c(6,5,7,4)
Day3A <- c(9,8,10,7)
Day1B <- c(4,3,5,2)
Day2B <- c(3,2,4,1)
Day3B <- c(2,1,3,0)
df <- data.frame(Group, Attrib1,Attrib2,Day1A,Day2A,Day3A,Day1B,Day2B,Day3B)
day_count <- 3
# Walk over the column indices starting at 4.
for(i in 4:ncol(df)) {
# Stop once i reaches day_count + 3.
if (i == day_count+3) break
# Selects columns i and day_count + i, but the result is neither assigned
# nor printed, so `df` itself is left unchanged by this loop.
df[c(i,day_count+i)]
}
Here is my desired result:
df <- data.frame(Group, Attrib1,Attrib2,Day1A,Day1B,Day2A,Day2B,Day3A,Day3B)

So, in theory you can just do sort(colnames(df)[4:ncol(df)]) to get that, but it gets tricky when you have say Day1A..Day10A..Day20A
Below is a quick workaround, to get the numbers and alphabets:
# Names of every column from the 4th onwards
COLS <- names(df)[seq(4, ncol(df))]
# Numeric part of each name: strip everything that is not a digit
day_no <- as.numeric(gsub("[^0-9]", "", COLS))
# Trailing letter: strip the "Day<number>" prefix
day_letter <- gsub("Day[0-9]*", "", COLS)
# Sort by day number first, then by letter
o <- order(day_no, day_letter)
To get your final dataframe:
df[,c(colnames(df)[1:3],COLS[o])]

An option with select
library(dplyr)
library(stringr)
# Day columns are everything after the three leading attribute columns
day_cols <- names(df)[-(1:3)]
# Order by the numeric day (so Day2 sorts before Day10, which a character
# sort of the digit strings would get wrong), then by name so that the A/B
# suffixes of the same day stay in alphabetical order
day_order <- order(as.numeric(str_remove_all(day_cols, "\\D+")), day_cols)
df %>%
  select(Group, starts_with("Attrib"), all_of(day_cols[day_order]))

Filter dataframe by vector of column names and constant column names

This is surely easy but for the life of me I can't find the right syntax.
I want to keep all "ID_" columns, regardless of the number of columns and the numbers attached, and keep other columns by constant name.
Something like the below command that doesn't work (on the recreated data, every time):
###Does not work, but shows what I am trying to do
testdf1 <- df1[,c(paste(idvec, collapse="','"),"ConstantNames_YESwant")]
Recreated data:
# Randomly pick one of two example layouts: four ID_ columns or two.
rand <- sample(1:2, 1)
# Input data for the chosen layout (switch() selects by position)
df1 <- switch(
  rand,
  data.frame(
    ID_0 = 0,
    ID_1 = 1,
    ID_2 = 11,
    ID_3 = 111,
    LotsOfColumnsWithVariousNames_NOwant = "unwanted_data",
    ConstantNames_YESwant = "wanted_data",
    stringsAsFactors = FALSE
  ),
  data.frame(
    ID_0 = 0,
    ID_1 = 1,
    LotsOfColumnsWithVariousNames_NOwant = "unwanted_data",
    ConstantNames_YESwant = "wanted_data",
    stringsAsFactors = FALSE
  )
)
# Expected result: the same data without the unwanted column
desired.df1 <- df1[names(df1) != "LotsOfColumnsWithVariousNames_NOwant"]

Is this what you want?
library(tidyverse)
# matches() takes a regular expression, not a glob: "ID_*" would mean "ID
# followed by zero or more underscores, anywhere in the name" and could pick
# up unrelated columns. Anchor it to the start and require the underscore.
df1 %>%
  select(matches("^ID_"), ConstantNames_YESwant)
# Equivalent, using a literal prefix
df1 %>%
  select(starts_with("ID_"), ConstantNames_YESwant)
# ID_0 ID_1 ConstantNames_YESwant
# 1 0 1 wanted_data

In base R , you could do
# Get all the ID columns: names that start with "ID_" (the pattern is
# anchored so a column that merely contains "ID" is not selected)
idvec <- grep("^ID_", colnames(df1), value = TRUE)
# Select ID columns and the constant names you want.
df1[c(idvec, "ConstantNames_YESwant")]
# ID_0 ID_1 ConstantNames_YESwant
#1 0 1 wanted_data

Passing vector with multiple values into R function to generate data frame

I have a table, called table_wo_nas, with multiple columns, one of which is titled ID. For each value of ID there are many rows. I want to write a function that for input x will output a data frame containing the number of rows for each ID, with column headers ID and nobs respectively as below for x <- c(2,4,8).
## id nobs
## 1 2 1041
## 2 4 474
## 3 8 192
This is what I have. It works when x is a single value (ex. 3), but not when it contains multiple values, for example 1:10 or c(2,5,7). I receive the warning "In ID[counter] <- x : number of items to replace is not a multiple of replacement length". I've just started learning R and have been struggling with this for a week and have searched manuals, this site, Google, everything. Can someone help please?
counter <- 1 ## position to fill in ID / nobs on this iteration
ID <- vector("numeric") ## collects one entry per iteration
nobs <- vector("numeric") ## collects the row counts
for (i in x) {
r <- subset(table_wo_nas, ID %in% x) ## rows whose ID is in x (the whole vector; the loop variable i is not used)
ID[counter] <- x ## assigns the whole vector x into a single slot
nobs[counter] <- nrow(r) ## number of rows in the subset
counter <- counter + 1 } ## advance to the next slot
result <- data.frame(ID, nobs) ## combine both vectors into a data frame

In base R,
# To make a named vector of row counts per ID, either:
tmp <- vapply(split(table_wo_nas, table_wo_nas$ID), nrow, integer(1))
# OR just:
tmp <- table(table_wo_nas$ID)
# AND
# arrange into data.frame. as.vector() strips the "table" class; passing the
# table itself would expand into two columns (nobs.Var1 and nobs.Freq)
# instead of a single count column.
nobs_df <- data.frame(ID = names(tmp), nobs = as.vector(tmp))
Alternately, coerce the table into a data.frame directly, and rename:
nobs_df <- data.frame(table(table_wo_nas$ID))
names(nobs_df) <- c('ID', 'nobs')
If you only want certain rows, subset:
# Select by ID value; nobs_df[c(2, 4, 8), ] would pick rows 2, 4 and 8 by
# position, which only matches the IDs when they happen to be 1, 2, 3, ...
nobs_df[nobs_df$ID %in% c(2, 4, 8), ]
There are many, many more options; these are just a few.
With dplyr,
library(dplyr)
table_wo_nas %>% group_by(ID) %>% summarise(nobs = n())
If you only want certain IDs, add on a filter:
table_wo_nas %>% group_by(ID) %>% summarise(nobs = n()) %>% filter(ID %in% c(2, 4, 8))

Seems pretty straightforward if you just use table again:
# Row counts per ID
tbl <- table(table_wo_nas[["ID"]])
# as.vector() keeps `nobs` a single plain column; a table passed directly to
# data.frame() would expand into nobs.Var1 / nobs.Freq
data.frame(IDs = names(tbl), nobs = as.vector(tbl))
Could also get a quick answer although with different column names using:
as.data.frame(table( table_wo_nas[ , 'ID'] ))

Try this.
x=c(2,4,8)
count_of_id=0
#df is your data frame table_wo_nas
#' Count the rows of a data frame for each requested id
#'
#' @param x Vector of id values to look up.
#' @param data Data frame with an `id` column. Defaults to the object named
#'   `df` visible where this function is defined, as in the original version.
#' @return A matrix with one row per element of `x` and the columns `id`
#'   and `count_of_id`; zero rows when `x` is empty.
count_of <- function(x, data = df) {
  # One count per requested id; NA ids in the data are never counted.
  counts <- vapply(
    x,
    function(value) sum(data$id == value, na.rm = TRUE),
    integer(1)
  )
  # Pair each requested id with its count (the ids come from `x`; there is
  # no separate `id` object to bind).
  df_1 <- cbind(id = x, count_of_id = counts)
  df_1
}

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Predominant calculation for Character fields - r

Related

R Function Query

selecting list of columns from dataframe to convert to subset

For Loop to Reorder Columns in R

Filter dataframe by vector of column names and constant column names

Passing vector with multiple values into R function to generate data frame

Categories

Resources