Related
Intro: Working in R, I often need to reorganize information from lists of data.frames to create a summary table. In this example, I start with a single data.frame, and I show my function that converts key information from the data.frame into a single row. Bearing in mind that my desired output requires the sorting of a mixture of numeric and character data, I can’t help wondering if there is an easier technique to do this kind of thing.
My question: Can anyone provide advice, or better yet a solution, for a simpler technique to convert data.frames like these into rows, while respecting the specific sorting of the data?
#sample data
input_df <- data.frame(M1 = c("Age", "Weight", "Speed", "Range"),
dogs = c(100, 120, 85, 105),
cats = c(115, 89, 80, 111),
birds = c(100, 90, 100, 104))
# desired summary row
desired_row <- data.frame(Model = "M1",
dogs = "Weight (120)",
cats = "Age (115), Range (111)",
birds = "Range (104)",
stringsAsFactors = F)
desired_row$Model <- factor(desired_row$Model)
# my function
makeRow <- function(dat1) {
# get model name
mod <- data.frame(Model = names(dat1[1]))
# make list of variables with model varible
d1 <- setNames(lapply(names(dat1)[-1], function(x) cbind(dat1[1],
dat1[x])), names(dat1)[-1])
# create a sorted named vector, largest-to-smallest
sorted_named_vec <- function(x) {
sort(setNames(x[[2]], x[[1]]), decreasing = T)
}
d2 <- lapply(d1, sorted_named_vec)
# implement a criterion to report only top indexes
keep_tops <- function(x) {
ifelse(x == max(x) | x >= 110 | (x > 102) & ((x -
100)/(max(x) - 100) > 0.33), x, "")
}
d3 <- lapply(d2, keep_tops)
# remove blank character elements
remove_blank_elements <- function(x) {
x[nchar(x) > 0]
}
d4 <- lapply(d3, remove_blank_elements)
# collapse variable name with top values and add parenthesis
collapse_to_string <- function(x) {
paste0(names(x), " (", x, "),", collapse = " ")
}
d5 <- lapply(d4, collapse_to_string)
# remove the last comma
remove_last_comma <- function(x) {
gsub("\\,$", "", x)
}
d6 <- lapply(d5, remove_last_comma)
# consturct a row from the list
row <- cbind(mod, as.data.frame(d6, stringsAsFactors = F))
row
}
# call
row_output <- makeRow(dat1 = input_df)
row_output
# check output to desired
identical(desired_row, row_output)
not sure if more efficient, but slightly less code and more direct approach imo.
makeRow <- function(dat1) {
#make data frame for row with model name
d0 <- data.frame(mod = names(dat1)[1]) #col name changed later
# implement a criterion to report only top indexes -> now return if true or false
keep_tops <- function(x) {
x == max(x) | x >= 110 | (x > 102) & ((x - 100)/(max(x) - 100) > 0.33)
}
vals =c() #empty -> for values of each cols
# make list of variables with model variables(dat1 cols)
#use the columns of the df directly
for(col in 2:ncol(dat1)){
#make temp df with each and evaluate what row to keep in the same line
df = dat1[keep_tops(dat1[,col])==1,c(1,col)]
df[,2] = paste0("(",df[,2],")") #add the () around the numbers
val = apply(as.data.frame(apply(df, 1, paste0, collapse=" ")), 2, paste0, collapse=", ") #collapse rows, then cols
vals = c(vals, val) #add this variable values to the values' list
}
# bind the first col made earlier with these values
row <- cbind(d0, as.data.frame(t(vals), stringsAsFactors = F))
colnames(row) = colnames(dat1) #rename the columns to match
row
}
# call
row_output <- makeRow(dat1 = input_df)
# check output to desired
identical(desired_row$birds, row_output$birds)
with your 'input_df', identical() was TRUE.
Here is some example data:
set.seed(1234) # Make the results reproducible
count <- 100
cs1 <- round(rchisq(count, 1), 2)
cs2 <- round(rchisq(count, 2), 2)
c(rep("Present", 30), rep("Absent", 30), rep("NA", 40)) -> temp
temp[temp == "NA"] <- NA
as.factor(temp) -> temp
temp1 <- round(rnorm(count, 3), 2)
temp1[7] <- NA
temp2 <- round(rnorm(count, 7), 2)
temp2[54] <- NA
c(rep("Yes", 30), rep("No", 30), rep("Maybe", 30), rep("NA", 10)) -> temp3
temp3[temp3 == "NA"] <- NA
as.factor(temp3) -> temp3
c(rep("Group A", 55), rep("Group B", 45)) -> temp4
as.factor(temp4) -> temp4
mydata <- data.frame(cs1, cs2, temp, temp1, temp2, temp3, temp4)
mydata$cs2[56:100] <- NA ; mydata
I know I can compute summary statistics for each variable stratified by temp4 like so:
by(mydata, mydata$temp4, summary)
However, I would also like to compute either a t.test or a chisq.test for each variable stratified by temp4. I've tried simply modifying the above code to do that but it always gives me an error. It seems the error stems from the fact that some of the variables in the data frame are numeric (and thus, would need a t.test) while others are factors (and thus, would need a chisq.test).
Is there a simple way to tell R to check the variable to see what kind it is, and then run the appropriate test, all at once? And to still print out all of the results even if it encounters an error?
I am not worried about the appropriateness of doing this (e.g., I am aware of the risks of multiple testing, etc) but rather just need to know how to do it. Thanks!
You can use lapply to loop through the variables and decide inside the anonymous function which test to conduct.
When an error occurs, it's caught by tryCatch and instead of a test result the final list will have the error message as a member.
tests_list <- lapply(mydata[-ncol(mydata)], function(x){
tryCatch({
if(is.numeric(x)){
if(length(levels(mydata$temp4)) == 2){
t.test(x ~ temp4, data = mydata)
}else{
aov(x ~ temp4, data = mydata)
}
}else{
tbl <- table(x, mydata$temp4)
chisq.test(tbl)
}
}, error = function(e) e)
})
err <- sapply(tests_list, inherits, "error")
tests_list$cs1
tests_list$temp3
tests_list[[err]]
Yes, you can loop through designated columns, keeping temp4 as the factor, and check class of each column (named x within the anonymous function). You can use sapply or apply(X, MARGIN = 2, FUN ...). Note that I'm explicitly subsetting mydata because I find it more explicit and readable.
sapply(mydata[, c("cs1", "cs2", "temp", "temp1", "temp2", "temp3")], FUN = function(x, group) {
if (class(x) == "numeric") {
# perform t-test, e.g. t.test(x ~ group)
return(result_of_t_test)
}
if (class(x) == "factor") {
# perform chi-square test
return(result_of_chisq_test)
}
}, group = mydata$temp4)
I'm using for loop to find all specific strings (df2$x2) in another dataframe (df1$x1) and what my purpose is create new column the df1$test and write the df$x2 value.
For example:
df1 <- data.frame(x1 = c("TE-T6-3 XYZ12X","TE-D31L-2 QWE12X","TE-H6-1 ABC12X","TE-D31L-2 QWE12X","EC20 QWX12X"),
Y = c(2017,2017,2018,2018,2017),
Sales = c(25,50,30,40,90))
df1$x1 <- as.character(as.factor(df1$x1))
df2 <- data.frame(x2 = c("TE-T6-5","TE-D31L-2","TE-H6-15","EC500","EC20","TE-D31L-2"),
Y = c(2018,2017,2018,2017,2018,2018),
P = c(100,300,200,50,150,300))
df2$x2 <- as.character(as.factor(df2$x2))
for(i in 1:nrow(df2)){
f <- df2[i,1]
df1$test <- ifelse(grepl(f, df1$x1),f,"not found")
}
What should I do after the end of loop? I know that problem is y is refreshing every time. I tried "if" statement to create new data frame and save outputs but it didn't work. It's writing only one specific string.
Thank you in advance.
Expected output:
df1 <- data.frame(x1 = c("TE-T6-3 XYZ12X","TE-D31L-2 QWE12X","TE-H6-1 ABC12X","TE-D31L-2 QWE12X","EC20 QWX12X"),
output = c("not found","TE-D31L-2","not found","TE-D31L-2","EC20"))
Do you want to have one new column for each string? if that is what you need, your code should be:
df1 <- data.frame(x1 = c("TE-T6-3 XYZ12X","TE-D31L-2 QWE12X","TE-H6-1 ABC12X","TE-D31L-2 QWE12X","EC20 QWX12X"),
Y = c(2017,2017,2018,2018,2017),
Sales = c(25,50,30,40,90))
df1$x1 <- as.character(as.factor(df1$x1))
df2 <- data.frame(x2 = c("TE-T6-5","TE-D31L-2","TE-H6-15","EC500","EC20","TE-D31L-2"),
Y = c(2018,2017,2018,2017,2018,2018),
P = c(100,300,200,50,150,300))
df2$x2 <- as.character(as.factor(df2$x2))
for(i in 1:nrow(df2)){
f <- df2[i,1]
df1$test <- ""
df1$test<-ifelse(grepl(f, df1$x1),T,F)
colnames(df1) <- c(colnames(df1[1:length(df1[1,])-1]),f)
}
it creates a new column with a temp name and then rename it with the string evaluated. Also i change "not found" for F, but you can use whatever you want.
[EDIT:]
If you want that expected output, you can use this code:
df1 <- data.frame(x1 = c("TE-T6-3 XYZ12X","TE-D31L-2 QWE12X","TE-H6-1 ABC12X","TE-D31L-2 QWE12X","EC20 QWX12X"),
Y = c(2017,2017,2018,2018,2017),
Sales = c(25,50,30,40,90))
df1$x1 <- as.character(as.factor(df1$x1))
df2 <- data.frame(x2 = c("TE-T6-5","TE-D31L-2","TE-H6-15","EC500","EC20","TE-D31L-2"),
Y = c(2018,2017,2018,2017,2018,2018),
P = c(100,300,200,50,150,300))
df2$x2 <- as.character(as.factor(df2$x2))
df1$output <- "not found"
for(i in 1:nrow(df2)){
f <- df2[i,1]
df1$output[grepl(f, df1$x1)]<-f
}
Very similar of what you have done, but it was needed to index which rows you have to write.
This only works when the data only can have one match, it is a little more complicated if you can have more than one match for row. But i think that's not your problem.
You simply need to split the df1$x1 strings on space and merge (or match since you are only interested in one variable)on df2$x2, i.e.
v1 <- sub('\\s+.*', '', df1$x1)
v1[match(v1, df2$x2)]
#[1] NA "TE-D31L-2" NA "TE-D31L-2" "EC20"
I have some data, that looks like the following:
"Name","Length","Startpos","Endpos","ID","Start","End","Rev","Match"
"Name_1",140,0,138,"1729",11,112,0,1
"Name_2",132,0,103,"16383",23,232,0,1
"Name_3",102,0,100,"1729",22,226,1,1
"Name_4",112,0,130,"16383",99,992,1,1
"Name_5",132,0,79,"1729",81,820,1,1
"Name_6",112,0,163,"16383",81,820,0,1
"Name_7",123,0,164,"1729",54,542,1,1
"Name_8",123,0,65,"16383",28,289,0,1
I have used the order function to order according to first "ID then "Start".
"Name","Length","Startpos","Endpos","ID","Start","End","Rev","Match"
"Name_1",140,0,138,"1729",11,112,0,1
"Name_3",102,0,100,"1729",22,226,1,1
"Name_7",123,0,164,"1729",54,542,1,1
"Name_5",132,0,79,"1729",81,820,1,1
"Name_2",132,0,103,"16383",23,232,0,1
"Name_8",123,0,65,"16383",28,289,0,1
…
Now I need to do two things:
First I need to create a table that includes pairwise couples out of each ID group. For a group in one ID containing the names (1,2,3,4,5), I need to create the pairs (12,23,34,45). So for the above example, the pairs would be (Name_1+Name_3, Name_3+Name_7, Name_7+Name_5).
My output for the above example, would look like this:
"Start_Name_X","Start_Name_Y","Length_Name_X","Length_Name_Y","Name_Name_X","Name_Name_Y","ID","New column"
11, 22, 140, 102, "Name_1", Name_3", 1729,,
22, 54, 102, 123, "Name_3", Name_7, 1729,,
54, 81, 123, 132, "Name_7", Name_5, 1729,,
23, 28, 132, 123, "Name_2", "Name_8", 16383,,
…
So I need to create pairs through ascending "Start", but within each "ID".
I am thinking it should be done with a for loop, but I am a newbie, so pulling the data to a new table with the for loop confuses me in itself, and especially the constraint of doing it within each unique "ID", I have no idea how to do.
I have experimented with splitting the data into groups according to ID using split, but it doesn't really get me further with creating the new data table.
I have created the ned data-table with the following code:
column_names = data.frame(Start_Name_X ="Start_Name_x",
Start_Name_Y="Start_Name_Y", Length_Name_X ="Length_Name_X",
Length_Name_Y="Length_Name_Y", Name_X="Name_X", Name_Y="Name_Y", ID="ID",
New_Column="New_Column")
write.table(column_names, file = "datatabel.csv", row.names=FALSE, append =
FALSE, col.names = FALSE, sep=",", quote=TRUE)
And this is the table, I would like to write to.
Is a for loop the write way to handle this, and if so, can you give me a few clues on how to start?
It can be done with only one loop:
df <- read.table(sep = ",", header = TRUE, stringsAsFactors = FALSE,
text = "\"Name\",\"Length\",\"Startpos\",\"Endpos\",\"ID\",\"Start\",\"End\",\"Rev\",\"Match\"\n\"Name_1\",140,0,138,\"1729\",11,112,0,1\n\"Name_2\",132,0,103,\"16383\",23,232,0,1\n\"Name_3\",102,0,100,\"1729\",22,226,1,1\n\"Name_4\",112,0,130,\"16383\",99,992,1,1\n\"Name_5\",132,0,79,\"1729\",81,820,1,1\n\"Name_6\",112,0,163,\"16383\",81,820,0,1\n\"Name_7\",123,0,164,\"1729\",54,542,1,1\n\"Name_8\",123,0,65,\"16383\",28,289,0,1",
)
df <- df[order(df$ID, df$Start), ]
inds <- c("Name", "Start", "Length")
indsSorted <- c("Start_Name_X","Start_Name_Y","Length_Name_X","Length_Name_Y","Name_Name_X","Name_Name_Y","ID","New_Column")
out <- data.frame(matrix(nrow = 0, ncol = 8))
colnames(out) <- c("Start_Name_X","Start_Name_Y","Length_Name_X","Length_Name_Y","Name_Name_X","Name_Name_Y","ID","New_Column")
for (i in unique(df$ID)){
dfID <- subset(df, ID == i)
dfHead <- head(dfID, n = nrow(dfID) - 1)[, inds]
colnames(dfHead) <- paste0(colnames(dfHead), "_Name_X")
dfTail <- tail(dfID, n = nrow(dfID) - 1)[, inds]
colnames(dfTail) <- paste0(colnames(dfTail), "_Name_Y")
out <- rbind(out, cbind(dfHead, dfTail, ID = i, New_Column = '', stringsAsFactors = FALSE)[, indsSorted])
}
out
This will probably be horribly slow if the input is large. It can be optimized, but I didn't bother since using data.table is probably much quicker.
dt <- data.table(df, key = "ID,Start")
fn <- function(dtIn, id){
dtHead <- head(dtIn, n = nrow(dtIn) - 1)
setnames(dtHead, paste0(colnames(dtHead), "_Name_X"))
dtTail <- tail(dtIn, n = nrow(dtIn) - 1)
setnames(dtTail, paste0(colnames(dtTail), "_Name_Y"))
cbind(dtHead, dtTail, ID = id, New_Column = '')
}
out2 <- dt[, fn(.SD, ID), by = ID, .SDcols = c("Name", "Start", "Length")]
out2 <- as.data.frame(out2[, indsSorted, with = FALSE])
Rownames are different but otherwise the results are identical. The function used can probably be optimized as well.
rownames(out) <- NULL
rownames(out2) <- NULL
identical(out, out2)
I'm trying to interpret + as an All Pro, * as a Pro Bowl, and then the absence of either of those two to default to the name.
My issue seems to be with the "else" portion of my ifelse. Instead of inserting the string it inserts a number.
# libraries
library(plyr)
library(dplyr)
library(XML)
library(stringr)
# file names
model_no <- "pfr_003"
model_name <- "fantasy_football_"
image_name <- paste(model_name, model_no, ".RData", sep="")
# parameters
first_season <- 2011
last_season <- 2013
# seasons
num_seasons <- as.numeric(last_season - first_season + 1)
seasons <- seq(first_season, last_season, by=1)
pfr <- data.frame()
for (i in 1:num_seasons) {
url <- paste("http://www.pro-football-reference.com/years/", seasons[i],"/fantasy.htm", sep = "")
df <- readHTMLTable(url,which=1, header=FALSE)
df$season = seasons[i]
df <- df[c(2, 3, 4, 5, 6, 20, 25)]
pfr <- rbind(pfr, df)
rm(df)
print(seasons[i])
}
names(pfr) <- c("NameInfo", "Team", "Age", "G", "GS", "Pos", "Year")
pfr <- pfr[pfr$Team != "Tm", ]
pfr <- pfr[pfr$Name != "Passing", ]
pfr$AllPro <- ifelse(is.na(str_locate(string=pfr$NameInfo, '[+]')[,1]) == TRUE, 0, 1)
pfr$ProBowl <- ifelse(is.na(str_locate(string=pfr$NameInfo, '[*]')[,1]) == TRUE, 0, 1)
# Everything above is cool
# This ifelse works just fine
pfr$test1 <- ifelse(pfr$AllPro == 1, "AP", ifelse(pfr$ProBowl == 1, "PB", "None"))
# but when I want to strip the + an * from the NameInfo field I come across an issue
# it works fine for anyone that is AP or PB, but instead of the "else" portion
# defaulting to the NameInfo field it inserts a (seemingly random) number
pfr$test2 <- ifelse(pfr$AllPro == 1, str_sub(pfr$Name, end=str_locate(string=pfr$Name, '[+]')[,1]-2),
ifelse(pfr$ProBowl == 1, str_sub(pfr$Name, end=str_locate(string=pfr$Name, '[*]')[,1]-1),
pfr$NameInfo))
Any help would be greatly appreciated.
Thanks.
Not a random number, but a factor level. Your pfr$NameInfo is a factor. Change the last else to as.character(pfr$NameInfo) if you want a string.