write.fwf column names don't line up with values

The following code produces a table whose column names don't line up with its values:
library( gdata )
test0 <- matrix(5:28, nrow = 4)
row.names(test0) <- paste("r", 1:4, sep = "")
colnames(test0) <- paste("c", 1:6, sep = "")
test0[3, 2] <- 1234567890
test0[ , 3] <- 0.19412341293479123840214
test0 <- format(test0, digits = 5, trim = TRUE, width = 10, scientific = TRUE)
write.fwf(test0, file = paste("test", ".txt", sep = ""), width = 11, rownames = TRUE, colnames = TRUE, quote = FALSE)
How can I make the column names line up with each column's values (so that the table is readable by GAMS)?

It is curious that the column names are not treated the same way as the values. A workaround is to add the column names as the first row of the table and then write the table without column names, i.e.
test1 <- rbind(colnames(test0), test0)
write.fwf(test1, file = paste("test", ".txt", sep = ""),
          width = 11,
          rownames = TRUE,
          colnames = FALSE,  # don't print the column names
          quote = FALSE)
The header row is then padded to the same width as every other row, so the names line up with the values.
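Alternatively, if write.fwf keeps misplacing the header, you can pad every field yourself with formatC() and write the lines directly, without write.fwf (a sketch; the width of 11 matches the call above, and the leading blank field leaves room for the row names):
w <- 11
pad <- function(x) formatC(x, width = w, flag = "-")   # left-justify each field to width w
header <- paste0(pad(""), paste(pad(colnames(test0)), collapse = ""))
rows <- paste0(pad(rownames(test0)),
               apply(test0, 1, function(r) paste(pad(r), collapse = "")))
writeLines(c(header, rows), "test.txt")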

I created a small script for saving a dataframe to fwf format with the column headers aligned with the values; see the gist for the latest version of the code.
suppressPackageStartupMessages({
  library(gdata)
  library(stringr)
})
#' Automatically generate a .fwf (fixed-width) file in R
#' @description This function writes a fixed-width file and
#' aligns the column headers with the data.
#' @param df data frame to write
#' @param filename output file name
#' @param nbspaces number of spaces used as column separator
#' @param replace_na replacement string for empty/NA values
#' @param rowname if set to a column name, the row names are converted to a column of that name
#' @examples write_fwf(mtcars, "/tmp/mtcars.fwf", rowname = "carname")
#'
#' # colnames: carname,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
#' # cols: 22,7,6,8,6,7,8,8,5,5,7,7
#' carname mpg cyl disp hp drat wt qsec vs am gear carb
#' Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
#' Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
#' Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
#' Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
write_fwf <- function(df, filename, rowname = FALSE, nbspaces = 3, replace_na = "NA") {
  # Convert row names to a regular column if requested
  if (!isFALSE(rowname)) {
    df <- tibble::rownames_to_column(df, rowname)
  }
  # Convert all columns to character
  tmpdf <- as.data.frame(df)
  tmpdf[] <- lapply(df, as.character)
  # Compute the final width of each column
  nasize <- nchar(replace_na)
  maxwidthname <- nchar(colnames(tmpdf))
  maxwidthvalue <- sapply(tmpdf, function(x) max(nchar(x)))
  maxcols <- pmax(maxwidthname, maxwidthvalue, nasize)
  delta <- maxcols - maxwidthname
  # Build the header: left-justify names of character columns, right-justify the rest
  header <- c()
  for (idx in seq(ncol(df))) {
    if (is.character(df[[idx]])) {
      header <- append(header, paste0(colnames(df)[idx], strrep(" ", max(delta[idx], 0))))
    } else {
      header <- append(header, paste0(strrep(" ", max(delta[idx], 0)), colnames(df)[idx]))
    }
  }
  # Open the file and write the metadata lines plus the header
  file <- file(filename, "w")
  writeLines(paste("# colnames:", paste(colnames(df), collapse = ",")), file)
  writeLines(paste("# cols:", paste(unlist(maxcols + nbspaces), collapse = ",")), file)
  writeLines(header, file, sep = strrep(" ", nbspaces))
  writeLines("", file, sep = "\n")
  close(file)
  # Export the data below the header
  write.fwf(
    df,
    file = filename,
    append = TRUE,
    width = maxcols,
    colnames = FALSE,
    na = replace_na,
    sep = strrep(" ", nbspaces),
    justify = "left"
  )
}
#' Automatically read a .fwf (fixed-width) file in R
#' @description This function reads a fixed-width file and detects the column layout automatically
#' @param filename input file name
#' @param maxsearchlines number of lines to search for the column metadata description
#' @examples read_fwf("/tmp/mtcars.fwf")
read_fwf <- function(filename, maxsearchlines = 100) {
  # Locate the column metadata written by write_fwf()
  file <- file(filename, "r")
  on.exit(close(file))
  lines <- readLines(file, n = maxsearchlines)
  idxname <- str_which(lines, "# colnames: ")
  colnames <- str_replace(lines[idxname], "# colnames: ", "")
  colnames <- unlist(str_split(colnames, ","))
  idxcols <- str_which(lines, "# cols: ")
  colwidths <- str_replace(lines[idxcols], "# cols: ", "")
  colwidths <- strtoi(unlist(str_split(colwidths, ",")))
  # Skip the metadata and header lines, then read the fixed-width data
  return(read.fwf(file = filename, skip = idxcols + 1, col.names = colnames,
                  widths = colwidths, strip.white = TRUE))
}
Sample usage:
write_fwf(mtcars, "/tmp/mtcars.fwf", rowname = "carname")
The result
# colnames: carname,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
# cols: 22,7,6,8,6,7,8,8,5,5,7,7
carname mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
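The file can then be read back with the read_fwf() helper defined above (assuming the same path); the column names and widths are recovered from the # colnames: and # cols: metadata lines:
carsback <- read_fwf("/tmp/mtcars.fwf")   # carsback is just an illustrative name
head(carsback, 3)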

Related

Prevent change to dataframe format in R

I have a dataframe that must have a specific layout. Is there a way for me to make R reject any command I attempt that would change the number or names of the columns?
It is easy to check the format of the data table manually, but I have found no way to make R do it for me automatically every time I execute a piece of code.
regards
This doesn’t offer the level of foolproof safety I think you’re looking for (hard to know without more details), but you could define a function operator that yields modified functions that error if changes to columns are detected:
same_cols <- function(fn) {
  function(.data, ...) {
    out <- fn(.data, ...)
    stopifnot(identical(sort(names(.data)), sort(names(out))))
    out
  }
}
For example, you could create modified versions of dplyr functions:
library(dplyr)
my_mutate <- same_cols(mutate)
my_summarize <- same_cols(summarize)
which work as usual if columns are preserved:
mtcars %>%
  my_mutate(mpg = mpg / 2) %>%
  head()
# mpg cyl disp hp drat wt qsec vs am gear carb
# Mazda RX4 10.50 6 160 110 3.90 2.620 16.46 0 1 4 4
# Mazda RX4 Wag 10.50 6 160 110 3.90 2.875 17.02 0 1 4 4
# Datsun 710 11.40 4 108 93 3.85 2.320 18.61 1 1 4 1
# Hornet 4 Drive 10.70 6 258 110 3.08 3.215 19.44 1 0 3 1
# Hornet Sportabout 9.35 8 360 175 3.15 3.440 17.02 0 0 3 2
# Valiant 9.05 6 225 105 2.76 3.460 20.22 1 0 3 1
mtcars %>%
  my_summarize(across(everything(), mean))
# mpg cyl disp hp drat wt qsec vs am
# 1 20.09062 6.1875 230.7219 146.6875 3.596563 3.21725 17.84875 0.4375 0.40625
# gear carb
# 1 3.6875 2.8125
But throw errors if changes to columns are made:
mtcars %>%
  my_mutate(mpg2 = mpg / 2)
# Error in my_mutate(., mpg2 = mpg/2) :
# identical(sort(names(.data)), sort(names(out))) is not TRUE
mtcars %>%
  my_summarize(mpg = mean(mpg))
# Error in my_summarize(., mpg = mean(mpg)) :
# identical(sort(names(.data)), sort(names(out))) is not TRUE
You mention that the names and columns need to stay the same; be aware that with data.table, names are also updated by reference. See the example below.
library(data.table)
foo <- data.table(
  x = letters[1:5],
  y = LETTERS[1:5]
)
colnames <- names(foo)
colnames
# [1] "x" "y"
setnames(foo, colnames, c("a", "b"))
foo[, z := "oops"]
colnames
# [1] "a" "b" "z"
identical(colnames, names(foo))
# [1] TRUE
To check that both the columns and the names are unaltered (and in the same order), take a copy of the names right away; after each piece of code runs, compare the current names with the copy.
foo <- data.table(
  x = letters[1:5],
  y = LETTERS[1:5]
)
colnames <- copy(names(foo))
setnames(foo, colnames, c("a", "b"))
foo[, z := "oops"]
identical(colnames, names(foo))
# [1] FALSE
colnames
# [1] "x" "y"
names(foo)
# [1] "a" "b" "z"

Loop within Loop in R

I am trying to figure out how to vary two indices at once in the same loop. I want to create a matrix in which one column is filled with the mean of a variable for each year.
Here's the code I am using to do it right now:
matplot2 = as.data.frame(matrix(NA, nrow=16, ncol=4))
matplot2[1,1] = mean(matplot[matplot$Year==2003, 'TotalTime'])
matplot2[2,1] = mean(matplot[matplot$Year==2004, 'TotalTime'])
matplot2[3,1] = mean(matplot[matplot$Year==2005, 'TotalTime'])
matplot2[4,1] = mean(matplot[matplot$Year==2006, 'TotalTime'])
matplot2[5,1] = mean(matplot[matplot$Year==2007, 'TotalTime'])
matplot2[6,1] = mean(matplot[matplot$Year==2008, 'TotalTime'])
matplot2[7,1] = mean(matplot[matplot$Year==2009, 'TotalTime'])
matplot2[8,1] = mean(matplot[matplot$Year==2010, 'TotalTime'])
matplot2[9,1] = mean(matplot[matplot$Year==2011, 'TotalTime'])
matplot2[10,1] = mean(matplot[matplot$Year==2012, 'TotalTime'])
matplot2[11,1] = mean(matplot[matplot$Year==2013, 'TotalTime'])
matplot2[12,1] = mean(matplot[matplot$Year==2014, 'TotalTime'])
matplot2[13,1] = mean(matplot[matplot$Year==2015, 'TotalTime'])
matplot2[14,1] = mean(matplot[matplot$Year==2016, 'TotalTime'])
matplot2[15,1] = mean(matplot[matplot$Year==2017, 'TotalTime'])
matplot2[16,1] = mean(matplot[matplot$Year==2018, 'TotalTime'])
If it were just the year changing, I would write the loop like this:
for (i in 2003:2018) {
  matplot2[1,1] = mean(matplot[matplot$Year==i, 'TotalTime'])
}
But, I need the row number in the matrix I'm printing the results into to change as well. How can I write a loop where I am printing the results of all these means into one column of a matrix?
In other words, I need to be able to have it loop matplot2[j,1] in addition to the matplot$Year==i.
Any suggestions would be greatly appreciated!
Your literal calculations of the mean(TotalTime) can all be reduced to a single command (with no for loop required):
matplot2 <- aggregate(TotalTime ~ Year, data = matplot, FUN = mean)
That should return a two-column frame with the unique values of Year in the first column, and the respective means in the second column.
Demonstrated with data I have:
head(mtcars)
# mpg cyl disp hp drat wt qsec vs am gear carb
# Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
# Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
# Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
# Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
# Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
# Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
res <- aggregate(disp ~ cyl, data = mtcars, FUN = mean)
res
# cyl disp
# 1 4 105.1364
# 2 6 183.3143
# 3 8 353.1000
This and more can be seen in summarize by group (of which this question is essentially a dupe, even if you didn't know to ask it that way).
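If you prefer dplyr, the equivalent group-wise summary is (a sketch, assuming the same matplot column names as in your question):
library(dplyr)
matplot2 <- matplot %>%
  group_by(Year) %>%
  summarize(TotalTime = mean(TotalTime))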
R is vectorised, but a single assignment with i <- 1:16 does not work here: matplot$Year == (2002 + i) just recycles the vector of years along the Year column instead of giving one mean per year. Looping over the years with sapply() and assigning the whole column at once does work:
years <- 2003:2018
matplot2[, 1] <- sapply(years, function(y) mean(matplot[matplot$Year == y, 'TotalTime']))
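If you would rather keep the explicit loop you described, index the target row and the year together (again assuming your matplot/matplot2 objects):
years <- 2003:2018
for (j in seq_along(years)) {
  matplot2[j, 1] <- mean(matplot[matplot$Year == years[j], 'TotalTime'])
}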

Why does na.rm=TRUE not work for weighted SD in R?

I have a dataframe of 10 columns with house prices that, in some cases, include NAs.
I want to create a new column with the weighted SD per row, but for the rows that contain NAs I get the following error:
Error in e2[[j]] : subscript out of bounds
What I use per row (and works for rows without NAs):
weighted.sd(my.df[40,2:10], c(9,9,9,9,9,9,9,9,9), na.rm = TRUE)
Example
library(radiant.data)
data("mtcars")
mtcars[mtcars == 0] <- NA
weighted.sd(mtcars[18,1:11], c(11,11,11,11,11,11,11,11,11,11,11), na.rm = TRUE)#works
weighted.sd(mtcars[5,1:11], c(11,11,11,11,11,11,11,11,11,11,11), na.rm = TRUE)#issue here
What is the problem here and how can I create a new column with the weighted SD per row?
The problem appears to be that weighted.sd() will not operate as you are expecting across rows of a data frame.
Running weighted.sd we can see the code:
weighted.sd <- function (x, wt, na.rm = TRUE)
{
  if (na.rm) {
    x <- na.omit(x)
    wt <- na.omit(wt)
  }
  wt <- wt/sum(wt)
  wm <- weighted.mean(x, wt)
  sqrt(sum(wt * (x - wm)^2))
}
In your example you are not feeding in a vector for x, but a single row of a data frame. na.omit(x) then removes the entire row because it contains NA values, rather than removing individual elements of a vector.
You can try converting the row to a vector with as.numeric(), but that still fails with this function because of how the NAs are removed from wt.
It seems like something like this is probably what you want. Of course, you have to be careful that you are feeding in valid columns for x.
weighted.sd2 <- function (x, wt, na.rm = TRUE) {
  x <- as.numeric(x)
  if (na.rm) {
    is_na <- is.na(x)
    x <- x[!is_na]
    wt <- wt[!is_na]
  }
  wt <- wt/sum(wt)
  wm <- weighted.mean(x, wt)
  sqrt(sum(wt * (x - wm)^2))
}
weighted.sd2(mtcars[18,1:11], c(11,11,11,11,11,11,11,11,11,11,11), na.rm = TRUE)#works
# [1] 26.76086
weighted.sd2(mtcars[5,1:11], c(11,11,11,11,11,11,11,11,11,11,11), na.rm = TRUE)#issue here
# [1] 116.545
To apply this to all columns, you can use apply().
mtcars$weighted.sd <- apply(mtcars[,1:11], 1, weighted.sd2, wt = rep(11, 11))
mpg cyl disp hp drat wt qsec vs am gear carb weighted.sd
Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 NA 1 4 4 52.61200
Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 NA 1 4 4 52.58011
Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 37.06108
Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 NA 3 1 78.36300
Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 NA NA 3 2 116.54503
...
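If your own data frame mixes numeric and non-numeric columns, one way to guard the apply() call is to keep only the numeric columns first (a sketch using the my.df name and the weight of 9 from the question; num_cols is just an illustrative name):
num_cols <- sapply(my.df, is.numeric)                 # which columns are numeric
my.df$weighted.sd <- apply(my.df[, num_cols], 1,
                           weighted.sd2, wt = rep(9, sum(num_cols)))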
If you Ctrl+click on the weighted.sd function you can see the source code:
function (x, wt, na.rm = TRUE)
{
  if (na.rm) {
    x <- na.omit(x)
    wt <- na.omit(wt)
  }
  wt <- wt/sum(wt)
  wm <- weighted.mean(x, wt)
  sqrt(sum(wt * (x - wm)^2))
}
When you run it, the value vector has its NAs removed and is shortened, but the weight vector keeps its original length, which causes the error.
A solution would be:
weighted.sd(mtcars[5, !is.na(mtcars[5, 1:11])],
            c(11,11,11,11,11,11,11,11,11,11,11)[!is.na(mtcars[5, 1:11])], na.rm = TRUE)
It's not elegant... But it does the job!
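The same idea reads a little more cleanly if the NA mask is computed once (a sketch; keep is just an illustrative name):
keep <- !is.na(unlist(mtcars[5, 1:11]))   # TRUE for the non-missing values in that row
weighted.sd(unlist(mtcars[5, 1:11])[keep], rep(11, 11)[keep])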

Subset a dataframe using colnames from another dataframe

Question:
I have a problem where I want to subset a given dataframe column-wise, with the column names stored in another dataframe.
Example using mtcars dataset:
options(stringsAsFactors = FALSE)
col_names <- c("hp,disp", "disp,hp,mpg")
df_col_names <- as.data.frame(col_names)
vec <- df_col_names[1,] # first row contains "hp" and "disp"
mtcars_new <- mtcars[, c("hp", "disp")] ## assuming that vec gives colnames
I even tried inserting double quotes around each of the words using the following:
Attempted solution:
options(stringsAsFactors = FALSE)
col_names <- c("hp,disp", "disp,hp,mpg")
df_col_names <- as.data.frame(col_names)
df_col_names$col_names <- gsub("(\\w+)", '"\\1"', df_col_names$col_names)
vec <- df_col_names[1,]
vec2 <- gsub("(\\w+)", '"\\1"', vec)
mtcars_new <- mtcars[,vec2] ## this should be same as mtcars[, c("hp", "disp")]
Expected Solution
mtcars_new <- mtcars[,vec2] is equal to mtcars_new <- mtcars[, c("hp", "disp")]
Here's another way to do this:
library(stringr)
col_names <- c("hp,disp", "disp,hp,mpg")
vec2 <- unlist(str_split(col_names[[1]], ','))
mtcars_new <- mtcars[, vec2]
This picks the first element of the col_names vector, splits it on the separator, unlists the result (because str_split() returns a list), and then uses the resulting vector of names to subset the mtcars data frame.
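A base-R equivalent that avoids the stringr dependency is (a sketch):
vec2 <- strsplit(col_names[[1]], ",")[[1]]
mtcars_new <- mtcars[, vec2]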
Do you need this?
lapply(strsplit(as.character(df_col_names$col_names), ","), function(x) mtcars[x])
#[[1]]
# hp disp
#Mazda RX4 110 160.0
#Mazda RX4 Wag 110 160.0
#Datsun 710 93 108.0
#Hornet 4 Drive 110 258.0
#Hornet Sportabout 175 360.0
#.....
#[[2]]
# disp hp mpg
#Mazda RX4 160.0 110 21.0
#Mazda RX4 Wag 160.0 110 21.0
#Datsun 710 108.0 93 22.8
#Hornet 4 Drive 258.0 110 21.4
#Hornet Sportabout 360.0 175 18.7
#....
Here, we split the column names on the comma (",") and then subset the dataframe inside lapply. This returns a list of dataframes whose length equals the number of rows in df_col_names.
If you want to subset only the first row, you could do
mtcars[strsplit(as.character(df_col_names$col_names[1]), ",")[[1]]]

Finding duplicate columns in a data.table

I have a pretty big data.table (500 x 2000), and I need to find out if any of the columns are duplicates, i.e., have the same values for all rows. Is there a way to efficiently do this within the data.table structure?
I have tried a naive two loop approach with all(col1 == col2) for each pair of columns, but it takes too long. I have also tried converting it to a data.frame and using the above approach, and it still takes quite a long time.
My current solution is to convert the data.table to a matrix and use the apply() function as:
similarity.matrix <- apply(m, 2, function(x) colSums(x == m)) / nrow(m)
However, the approach forces the modes of all elements to be the same, and I'd rather not have that happen. What other options do I have?
Here is a sample construction for the data.table:
m = matrix(sample(1:10, size=1000000, replace=TRUE), nrow=500, ncol=2000)
DF = as.data.frame(m)
DT = as.data.table(m)
Following the suggestion of @Haboryme*, you can do this using duplicated() to find any duplicated vectors. duplicated() normally works row-wise, but you can transpose the data with t() just for finding the duplicated columns.
DF <- DF[ , which( !duplicated( t( DF ) ) ) ]
With a data.table, you may need to add with = FALSE (I think this depends on the version of data.table you're using).
DT <- DT[ , which( !duplicated( t( DT ) ) ), with = FALSE ]
*@Haboryme, if you were going to turn your comment into an answer, please do and I'll remove this one.
Here's a different approach, where you hash each column first and then call duplicated.
library(digest)
dups <- duplicated(sapply(DF, digest))
DF <- DF[,which(!dups)]
Depending on your data this might be a faster way.
I am using mtcars for a reproducible result:
library(data.table)
library(digest)
# Create data
data <- as.data.table(mtcars)
data[, car.name := rownames(mtcars)]
data[, car.name.dup := car.name] # create a duplicated column
data[, car.name.not.dup := car.name] # create a second duplicated column...
data[1, car.name.not.dup := "Moon walker"] # ... but change one value so that it is no longer a duplicated column
data now contains:
> head(data)
mpg cyl disp hp drat wt qsec vs am gear carb car.name car.name.dup car.name.not.dup
1: 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 Mazda RX4 Mazda RX4 Moon walker
2: 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 Mazda RX4 Wag Mazda RX4 Wag Mazda RX4 Wag
3: 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 Datsun 710 Datsun 710 Datsun 710
4: 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 Hornet 4 Drive Hornet 4 Drive Hornet 4 Drive
5: 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 Hornet Sportabout Hornet Sportabout Hornet Sportabout
6: 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 Valiant Valiant Valiant
Now find the duplicated columns:
# create a vector with the checksum for each column (the column names are kept as element names)
col.checksums <- sapply(data, function(x) digest(x, "md5"), USE.NAMES = TRUE)
# make a data table with one row per column name and hash value
dup.cols <- data.table(col.name = names(col.checksums), hash.value = col.checksums)
# self join on the hash values and filter out all column name pairs that were joined to themselves
dup.cols[dup.cols, on = "hash.value"][col.name != i.col.name]
Results in:
col.name hash.value i.col.name
1: car.name.dup 58fed3da6bbae3976b5a0fd97840591d car.name
2: car.name 58fed3da6bbae3976b5a0fd97840591d car.name.dup
Note: The result still contains both directions (col1 == col2 and col2 == col1) and should be deduplicated ;-)
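One way to do that deduplication is to keep only one ordering of each pair, for example (a sketch reusing the dup.cols table from above):
dup.cols[dup.cols, on = "hash.value"][col.name < i.col.name]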
