Selecting a list of columns from a dataframe to create a subset - r

I am trying to create a function whose input includes a list of columns to work with.
For example, the sample data is:
dataa<-data.frame(
aa = c("q","r","y","v","g","y","d","s","n","k","y","d","s","t","n","u","l","h","x","c","q","r","y","v","g","y","d","s","n","k","y","d","s","t","n","u","l","h","x","c"),
col1=c(1,2,3,2,1,2,3,4,4,4,5,3,4,2,1,2,5,3,2,1,2,4,2,1,3,2,1,2,3,1,2,2,4,4,4,1,2,5,3,5),
col2=c(2,1,1,7,4,1,2,7,5,7,2,6,2,2,6,3,4,3,2,5,7,5,6,4,4,6,5,6,4,1,7,3,2,7,7,2,3,7,2,4)
)
My requirement is that I can create one or more cuts like the ones below (they could also be passed as a list of cuts); essentially I am recoding my dataset:
dataa$col3 <- ifelse(dataa$aa == "y",1,0)
dataa$col4 <- ifelse(dataa$col2 == 7,1,0)
So now, inside my function, I want a subset of the selected variables to run the calculation on.
For example:
# I am applying my function like this
dat1 = dataa
var1 = "col1" # variable for which calculation will be done
grouping_var = list(dataa$col3,dataa$col4)
total_var= TRUE
#fun_1 <- function(dat1,var1,grouping_var,total_var){
total_col <- ifelse(total_var== TRUE,1,0)
var1 <- rlang::parse_expr(var1)
var2 <- dat1[unlist(grouping_var)] # trying to create a subset dataframe of the selected grouping_var
#var2 <- data.frame(sapply(grouping_var,c)) # I have tried this too
dat1 <- dat1 %>% select(!!var1,!!var2)
# after this line I should have the subset to run the calculations on
var_lab(dat1[[1]]) <- ""
var_lab(dat1[[2]]) <- ""
tab1 <- expss::cro_cpct(total(),dat1[[1]],dat1[[2]])
tab1 <- as.data.frame(tab1)
#}

It is not possible to select columns based on the values of the columns. An easier way would be to pass the column names as a character vector and select with them. Try:
dat1 = dataa
var1 = "col1"
grouping_var = c('col3', 'col4') #Passing columns as character vector
total_var= TRUE
#fun_1 <- function(dat1,var1,grouping_var,total_var){
total_col <- as.integer(total_var)
#total_col <- ifelse(total_var== TRUE,1,0)
dat1 <- dat1[grouping_var]
expss::var_lab(dat1[[1]]) <- ""
expss::var_lab(dat1[[2]]) <- ""
tab1 <- expss::cro_cpct(expss::total(),dat1[[1]],dat1[[2]])
tab1 <- as.data.frame(tab1)
#}
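For completeness, here is a minimal sketch of how this could be wrapped into the function described in the question. I am assuming that var1 should also be kept in the subset (as in the question's original select), and, as in the code above, total_var is accepted but not yet used:
fun_1 <- function(dat1, var1, grouping_var, total_var = TRUE){
  # keep the calculation variable plus the grouping columns, all passed by name
  dat1 <- dat1[c(var1, grouping_var)]
  expss::var_lab(dat1[[1]]) <- ""
  expss::var_lab(dat1[[2]]) <- ""
  tab1 <- expss::cro_cpct(expss::total(), dat1[[1]], dat1[[2]])
  as.data.frame(tab1)
}
fun_1(dataa, "col1", c("col3", "col4"))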

Related

Matching logical subscripts to the size of the indexed input

I have created two data frames that I then turn into lists (e.g., list1 and list2). I removed one element from list2 to better represent my example data set.
library(dplyr)
intervals <- rep_len(c("01-01-2022", "01-11-2022", "01-31-2022"), 100)
ID <- rep(c("A","B", "C"), 100)
df <- data.frame(ID = as.factor(ID),
intervals = as.factor(intervals))
list1 <- df %>%
group_by(ID, intervals) %>%
group_split()
intervals <- rep_len(c("01-01-2022", "01-11-2022", "01-31-2022"), 25)
ID <- rep(c("A","B"), 25)
df2 <- data.frame(ID = as.factor(ID),
intervals = as.factor(intervals))
list2 <- df2 %>%
group_by(ID, intervals) %>%
group_split()
list2 <- list2[-6]
For each of these lists I have added an attribute, and I have included a function (check) to inspect the added attribute more readily.
# Convenience function to grab the attributes for you
check <- function(list, attribute_name) {
return(attr(list, attribute_name))
}
# Add an attribute to hold the attributes of each list element
attr(list1, "match") <- data.frame(id = sapply(list1, function(x) paste(x$ID[1])),
interval_start_date = sapply(list1, function(x) paste(x$intervals[1]))
)
# Check the attributes
check(list1, "match")
# Add an attribute "match" to hold the attributes of each list element
attr(list2, "match") <- data.frame(id = sapply(list2, function(x) paste(x$ID[1])),
interval_start_date = sapply(list2, function(x) paste(x$intervals[1]))
)
# Check the attributes
check(list2, "match")
I have created an index for the two lists; the objective is to remove any list components that don't share both the same ID and the same intervals, so that only matching IDs with the same intervals remain.
# Creates an index for the two lists based on the attributes
dat2 <- check(list1, "match")
dat1 <- check(list2, "match")
# Removes rows where the id isn't present in both data frames, and creates an
# index where both the interval and id are the same.
if (!length(unique(dat2$id)) == length(unique(dat1$id))){
dat3 <- dat2[dat2$id %in% dat1$id, ]
dat4 <- dat1[dat1$id %in% dat2$id, ]
i1 <- paste(dat3[["id"]], format(as.Date(dat3[["interval_"]]),
"%Y-%d")) %in%
paste(dat4[["id"]], format(as.Date(dat4[["interval_"]]),
"%Y-%d"))
}
Now here is where I begin to get an error:
# Error occurs because the length of `i1` is not the same as the length of `list1`
out <- list1[i1]
I know that this is occurring because list1 does not have the same length as i1. I'm wondering if there is a way to append logical values to i1 to make it the same length as list1, but in a way that doesn't remove values from list1 that we actually want to keep. Any thoughts?
Here is my expected output for list1, where I hope it ends up with only the same IDs and intervals as list2.
# Expected output
expected_list1 <- list(list1[1], list1[2],list1[3], list1[4], list1[5])
This attempt is close to what I would like, but it has an additional element. I think ultimately the attribute table should be similar to that of dat4.
test <- list1[dat2$id %in% dat1$id][i1]
# Add an attribute "match" to hold the attributes of each list element
attr(test, "match") <- data.frame(id = sapply(test, function(x) paste(x$ID[1])),
interval_start_date = sapply(test, function(x) paste(x$intervals[1]))
)
# Check the attributes
check(test, "match")
There was a mismatch in the column name, i.e. it is not interval_ but interval_start_date in dat1 and dat2. [[ looks for an exact match, whereas $ can match partial names as well.
if (!length(unique(dat2$id)) == length(unique(dat1$id))){
ids_common <- intersect(dat2$id, dat1$id)
inds1 <- dat2$id %in% ids_common
inds2 <- dat1$id %in% ids_common
i1 <- paste(dat2[["id"]], format(as.Date(dat2[["interval_start_date"]]),
"%Y-%d")) %in%
paste(dat1[["id"]], format(as.Date(dat1[["interval_start_date"]]),
"%Y-%d"))
out <- list1[i1 & inds1]
}
Checking:
> length(out)
[1] 5
> i1
[1] TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
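To illustrate the difference between [[ and $ mentioned above, a small sketch:
dat <- data.frame(interval_start_date = "01-01-2022")
dat$interval_        # $ falls back to partial matching and returns the column
dat[["interval_"]]   # [[ requires an exact name and returns NULL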

Horizontal table for many variables with the same categories

I'm looking for an easy way to make a table in R that shows each variable of the dataframe as a row and each category of those variables as a column. Each cell should display the frequency of that category, with the sum as the last column. The point is to easily display the distribution of different variables that share the same categories. I have included a picture to show what I'm looking for.
I have managed to produce some code that achieves what I want, but it takes a lot of time to do this for each variable I want to include in the table.
mydata <- as.data.frame((table(mydat$var)))
mydata <- as.data.frame(t(mydata))
mydata <- lapply(mydata, as.numeric)
mydata <- as.data.frame(mydata)
mydata$sum <- mydata$`category 1` + mydata$`category 2` + mydata$`category 3` # column names containing spaces need backticks
mydata[-c(1), ]
The result looks like this (see picture).
To add more variables I imagine I could use rbind(), but is there an easier way to achieve something similar?
Here is a reproducible example using the mtcars dataset.
data("mtcars")
tdata <- as.data.frame(table(mtcars$cyl))
tdata1 <- as.data.frame(t(tdata))
tdata2 <- lapply(tdata1, as.numeric)
tdata3 <- as.data.frame(tdata2)
tdata3$sum <- (tdata3$V1 + tdata3$V2 + tdata3$V3)
tdata3 <- tdata3[-c(1),]
tdata3
Assuming you have a data.frame where each variable has the same categories (as in your example):
df <- data.frame(Var1 = c(rep("Cat1", 30),
rep("Cat2", 10),
rep("Cat3", 20) ),
Var2 = c(rep("Cat1", 10),
rep("Cat2", 20),
rep("Cat3", 30) ),
Var3 = c(rep("Cat1", 5),
rep("Cat2", 25),
rep("Cat3", 30) ) )
You could use lapply() to apply the table() function to every column in your data.frame:
tab <- lapply(colnames(df), function(x) table(df[, x]))
As lapply() outputs a list, use do.call to bind them, and rowSums() to create the sum column:
tab <- data.frame(do.call(rbind, t(tab)))
tab$Sum <- rowSums(tab)
# add variable labels as rows
rownames(tab) <- colnames(df)
The output will look like this:
     Cat1 Cat2 Cat3 Sum
Var1   30   10   20  60
Var2   10   20   30  60
Var3    5   25   30  60
And, you could throw all this in a function:
my_tab_fun <- function(df) {
tab <- lapply(colnames(df),
function(x) table(df[, x]))
tab <- data.frame(
do.call(rbind, t(tab)))
tab$Sum <- rowSums(tab)
rownames(tab) <- colnames(df)
return(tab)
}
my_tab_fun(df)
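As an aside, if every column really does share exactly the same set of categories, a similar result can be obtained in one step with sapply() (a sketch using the df above):
tab2 <- as.data.frame(t(sapply(df, table)))  # one row per variable, one column per category
tab2$Sum <- rowSums(tab2)
tab2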

Filter dataframe by vector of column names and constant column names

This is surely easy but for the life of me I can't find the right syntax.
I want to keep all "ID_" columns, regardless of the number of columns and the numbers attached, and keep other columns by constant name.
Something like the command below, which doesn't work (on the recreated data, every time):
###Does not work, but shows what I am trying to do
testdf1 <- df1[,c(paste(idvec, collapse="','"),"ConstantNames_YESwant")]
Recreated data:
rand <- sample(1:2, 1)
if(rand==1){
df1 <- data.frame(
ID_0=0,
ID_1=1,
ID_2=11,
ID_3=111,
LotsOfColumnsWithVariousNames_NOwant="unwanted_data",
ConstantNames_YESwant="wanted_data",
stringsAsFactors = FALSE
)
desired.df1 <- data.frame(
ID_0=0,
ID_1=1,
ID_2=11,
ID_3=111,
ConstantNames_YESwant="wanted_data",
stringsAsFactors = FALSE
)
}
if(rand==2){
df1 <- data.frame(
ID_0=0,
ID_1=1,
LotsOfColumnsWithVariousNames_NOwant="unwanted_data",
ConstantNames_YESwant="wanted_data",
stringsAsFactors = FALSE
)
desired.df1 <- data.frame(
ID_0=0,
ID_1=1,
ConstantNames_YESwant="wanted_data",
stringsAsFactors = FALSE
)
}
Is this what you want?
library(tidyverse)
df1 %>%
select(matches("ID_*"), ConstantNames_YESwant)
df1 %>%
select(starts_with("ID"), ConstantNames_YESwant)
# ID_0 ID_1 ConstantNames_YESwant
# 1 0 1 wanted_data
In base R, you could do:
#Get all the ID columns
idvec <- grep("ID", colnames(df1), value = TRUE)
#Select ID columns and the constant names you want.
df1[c(idvec, "ConstantNames_YESwant")]
# ID_0 ID_1 ConstantNames_YESwant
#1 0 1 wanted_data
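One caveat (my note): both matches("ID_*") and grep("ID", ...) will also pick up any other column whose name merely contains "ID". If that could happen in your data, anchoring the pattern is safer:
#Only columns whose names start with "ID_"
idvec <- grep("^ID_", colnames(df1), value = TRUE)
df1[c(idvec, "ConstantNames_YESwant")]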

In R, how can I copy rows from one dataframe to another when the df being copied to has 2 additional columns?

I have a tab delimited text file with 12 columns that I am uploading to my program. I go on to create another dataframe with a structure similar to the one uploaded and add 2 more columns to it.
excelfile = read.delim(ExcelPath)
matchedPictures<- excelfile[0,]
matchedPictures$beforeName <- character()
matchedPictures$afterName <- character()
Now I have a function in which I do the following:
1. Based on a condition, I obtain the row number pictureMatchNum of the row I need to copy from excelfile to matchedPictures.
2. I should then copy the row from excelfile to matchedPictures. I have tried a couple of different ways so far:
a.
rowNumber = nrow(matchedPictures) + 1
matchedPictures[rowNumber,1:12] <<- excelfile[pictureMatchNum,1:12]
b.
matchedPictures[rowNumber,1:12] <<- rbind(matchedPictures, excelfile[pictureWordMatches,1:12], make.row.names = FALSE)
2a. doesn't seem to work because it copies the indices from excelfile and uses them as row names in matchedPictures - which is why I decided to go with rbind.
2b. doesn't seem to work because rbind needs the columns to be identical, and matchedPictures has 2 extra columns.
EDIT START - Including reproducible example.
Here is some reproducible code (with fewer columns and fake data); note that words, fruit, str_detect() and regex() come from the stringr package:
library(stringr)
excelfile <- data.frame(x = letters, y = words[length(letters)], z = fruit[length(letters)])
matchedPictures <- excelfile[0,]
matchedPictures$beforeName <- character()
matchedPictures$afterName <- character()
pictureMatchNum1 = match(1, str_detect("A", regex(excelfile$x, ignore_case = TRUE)))
rowNumber1 = nrow(matchedPictures) + 1
pictureMatchNum2 = match(1, str_detect("D", regex(excelfile$x, ignore_case = TRUE)))
rowNumber2 = nrow(matchedPictures) + 1
The 2 options I tried are
2a.
matchedPictures[rowNumber1,1:3] <<- excelfile[pictureMatchNum1,1:3]
matchedPictures[rowNumber1,"beforeName"] <<- "xxx"
matchedPictures[rowNumber1,"afterName"] <<- "yyy"
matchedPictures[rowNumber2,1:3] <<- excelfile[pictureMatchNum2,1:3]
matchedPictures[rowNumber2,"beforeName"] <<- "uuu"
matchedPictures[rowNumber2,"afterName"] <<- "www"
OR
2b.
matchedPictures[rowNumber1,1:3] <<- rbind(matchedPictures, excelfile[pictureMatchNum1,1:3], make.row.names = FALSE)
matchedPictures[rowNumber1,"beforeName"] <<- "xxx"
matchedPictures[rowNumber1,"afterName"] <<- "yyy"
matchedPictures[rowNumber2,1:3] <<- rbind(matchedPictures, excelfile[pictureMatchNum2,1:3], make.row.names = FALSE)
matchedPictures[rowNumber2,"beforeName"] <<- "uuu"
matchedPictures[rowNumber2,"afterName"] <<- "www"
EDIT END
Additionally, I have seen the suggestion in many places that, rather than using empty dataframes, one should build vectors, append data to them, and then combine them into a dataframe. Is this suggestion valid when I have so many columns and would need 14 separate vectors, each copied individually?
What can I do to make this work?
You could
1. first determine the row indices of excelfile that match your criteria,
2. extract these rows,
3. then generate the data to fill your columns beforeName and afterName,
4. then append these columns to your new data frame.
Example:
library(stringr)  # provides words, fruit, str_detect() and regex()
excelfile <- data.frame(x = letters, y = words[length(letters)],
z = fruit[length(letters)])
## Vector of patterns:
patternVec <- c("A", "D", "M")
## Look for appropriate rows in file 'excelfile':
indexVec <- vapply(patternVec,
function(myPattern) which(str_detect(myPattern,
regex(excelfile$x, ignore_case = TRUE))), integer(1))
## Extract these rows:
matchedPictures <- excelfile[indexVec,]
## Somehow generate the data for columns 'beforeName' and 'afterName':
## I do not know how this information is generated so I just insert
## some dummy code here:
beforeNameVec <- c("xxx", "uuu", "mmm")
afterNameVec <- c("yyy", "www", "nnn")
## Then assign these variables:
matchedPictures$beforeName <- beforeNameVec
matchedPictures$afterName <- afterNameVec
matchedPictures
# x y z beforeName afterName
# a air dragonfruit xxx yyy
# d air dragonfruit uuu www
# m air dragonfruit mmm nnn
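One caveat with the vapply() call above (my note): integer(1) assumes every pattern matches exactly one row of excelfile$x. If a pattern could match zero or several rows, a more forgiving variant would be:
indexVec <- unlist(lapply(patternVec, function(myPattern)
  which(str_detect(myPattern, regex(excelfile$x, ignore_case = TRUE)))))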
You can make this much simpler by using dplyr
library(dplyr)
library(stringr)
excelfile <- data.frame(x = letters, y = words[length(letters)], z= fruit[length(letters)],
stringsAsFactors = FALSE ) #add stringsAsFactors to have character columns
pictureMatch <- excelfile %>%
#create a match column
mutate(match = ifelse(str_detect(x,"a") | str_detect(x,'d'),1,0)) %>%
#filter to only the rows that match your condition
filter(match ==1)
pictureMatch <- pictureMatch[['x']] #convert to a vector
matchedPictures <- excelfile %>%
filter(x %in% pictureMatch) %>% #grab the rows that match your condition
mutate(beforeName = c('xxx','uuu'), #add your names
afterName = c('yyy','www'))
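As a further simplification (my suggestion, not part of the original answer), the intermediate match column and the pictureMatch vector could be collapsed into a single filter():
matchedPictures <- excelfile %>%
  filter(str_detect(x, "a|d")) %>%       # keep rows matching either pattern
  mutate(beforeName = c("xxx", "uuu"),   # add your names
         afterName = c("yyy", "www"))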

Predominant calculation for Character fields

I'm trying to loop through my column names where the type is character and return one data frame which contains the predominant (most frequent) values of all character columns, grouped by an ID field.
Is there a way to replicate the following code in some kind of loop?
DF_Characters <- DF_Characters[,sapply(dfr,is.character)]
##Predominance Column1##
Predom <- select(DF_Characters, Group_ID, Column_1)
Predom <- group_by(Predom,Group_ID, Column_1)
Predom <- summarise(Predom,
CountPredom = n()
)
Predom <- arrange(Predom,Group_ID, desc(CountPredom) )
Predom <- data.table(Predom, key="Group_ID")
Predominant_Column_1 <- Predom[,head(.SD,1),by=Group_ID]
##Predominant Column_2##
Predom <- select(DF_Characters, Group_ID, Column_2)
Predom <- group_by(Predom,Group_ID, Column_2)
Predom <- summarise(Predom,
CountPredom = n()
)
Predom <- arrange(Predom,Group_ID, desc(CountPredom) )
Predom <- data.table(Predom, key="Group_ID")
Predominant_Column_2 <- Predom[,head(.SD,1),by=Group_ID]
##Merge final table##
Merged <- merge(Predominant_Column_1 ,Predominant_Column_2 ,by="Group_ID")
Also, to clarify my question, I added a dummy table as a picture (DF_Character_table), and the result should look like the second picture (Result Table).
So for Group 1, Petre was the predominant name in Column 1 and Car was the predominant mode of travel. Column 1 and Column 2 predominance should be calculated respectively.
Thank you
This is probably not the best solution but it works.
##########Predominant Calculations
library(dplyr)
library(data.table)
#Character fields
DF_Characters <- as.data.frame(dfr)
DF_Characters <- DF_Characters[,sapply(dfr,is.character)]
# Field names without the Group by id
CharactersToMerge <- c(names(DF_Characters))
#Add Groupby ID to Character fields
Character_Field_List <- c("Groupby_ID", names(DF_Characters))
DF_Characters <- subset(dfr,select = Character_Field_List)
#Column Names to loop through
DF_FieldsToMerge <- subset(dfr,select = CharactersToMerge)
# Predominant Table
fin_table <- DF_Characters %>% group_by(Groupby_ID) %>%
tally(sort = TRUE) #Count observations
# Loop and merge tables to Predominant Table
for(i in names(DF_FieldsToMerge)){
  temp_table <- DF_Characters %>% group_by_("Groupby_ID", i ) %>%
    tally(sort = TRUE)
  temp_table <- as.data.table(temp_table) #convert to data.table so that .SD and by= work below
  temp_table <- temp_table[,head(.SD,1),by=Groupby_ID] #Keep the top count per group (removes ties)
  temp_table <- subset(temp_table,select = c("Groupby_ID", i)) #remove counts
  fin_table <- merge(fin_table, temp_table, by="Groupby_ID")
}
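For reference, here is a more compact dplyr-only sketch (my addition, assuming the data is in dfr and the grouping column is Groupby_ID as in the answer above) that returns the most frequent value of every character column per group:
library(dplyr)
fin_table2 <- dfr %>%
  group_by(Groupby_ID) %>%
  summarise(across(where(is.character),
                   ~ names(which.max(table(.x)))))  # most frequent value per column, per group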
