Export R data.frame to SPSS - r

There is a package foreign with a function write.foreign() that can write a SPS and CSV file. The SPS file than can read the CSV fiel into SPSS including labels. Fine so far, but there are some issues with that function:
Newer SPSS versions may show an error that you have too few format definitions in DATA LIST
If there are "labels" for numeric variables stored via attr(), these are lost.
Even if the SPSS vesion supports strings up to 32767, the function write.foreign() stops if there are more than 255 in any variable.
Theres a star (*) if any character variables are used, but newer SPSS versions cannot handle that.
The CSV file is comma-separated and does (can) not use quotes, therefore no commas are allowed in strings (character)
Non-ASCII caracters (e.g. umlauts) will crash the import
Should you have a character that contains any NA value, you'll see...
... an error message like this:
Error in if (any(lengths > 255L)) stop("Cannot handle character variables longer than 255") :
missing value where TRUE/FALSE needed
I spent a lot of time with that and then found a good posting (http://r.789695.n4.nabble.com/SPSS-export-in-R-package-foreign-td921491.html) to start on and make it better. Here's my result, I'd like to share with you.

To export an R data.frame to SPSS, use write_sav from the haven package:
library(haven)
write_sav(mtcars, "mtcars.sav")

This function is a replacement for foreign:write.foreign to handle the issues stated above.
Note: To avoid issues with SPSS finding the CSV file, please specify the full path (!) at least for datafile (also if using the original foreign:write.foreign()).
Note: This script will replace a tabulator (TAB) and other spacing (incl. CR+LF) in strings by a blank without warning. One may consider using GET DATA instead of the troublesome DATA LIST to solve that limitation.
Note: There may be a warning In FUN(X[[i]], ...) : probable complete loss of accuracy in modulus - this refers to counting the decimals and can be ignored.
Note: POSIXlt and POSIXct variables are not yet handled by the script properly.
writeForeignMySPSS = function (df, datafile, codefile, varnames = NULL, len = 32767) {
adQuote <- function (x) paste("\"", x, "\"", sep = "")
# Last variable must not be empty for DATA LIST
if (any(is.na(df[[length(df)]]))) {
df$END_CASE = 0
}
# http://stackoverflow.com/questions/5173692/how-to-return-number-of-decimal-places-in-r
decimalplaces <- function(x) {
y = x[!is.na(x)]
if (length(y) == 0) {
return(0)
}
if (any((y %% 1) != 0)) {
info = strsplit(sub('0+$', '', as.character(y)), ".", fixed=TRUE)
info = info[sapply(info, FUN=length) == 2]
if (length(info) >= 2) {
dec = nchar(unlist(info))[seq(2, length(info), 2)]
} else {
return(0)
}
return(max(dec, na.rm=T))
} else {
return(0)
}
}
dfn <- lapply(df, function(x) if (is.factor(x))
as.numeric(x)
else x)
# Boolean variables (dummy coding)
bv = sapply(dfn, is.logical)
for (v in which(bv)) {
dfn[[v]] = ifelse(dfn[[v]], 1, 0)
}
varlabels <- names(df)
# Use comments where applicable
for (i in 1:length(df)) {
cm = comment(df[[i]])
if (is.character(cm) && (length(cm) > 0)) {
varlabels[i] = comment(df[[i]])
}
}
if (is.null(varnames)) {
varnames <- abbreviate(names(df), 8L)
if (any(sapply(varnames, nchar) > 8L))
stop("I cannot abbreviate the variable names to eight or fewer letters")
if (any(varnames != varlabels))
warning("some variable names were abbreviated")
}
varnames <- gsub("[^[:alnum:]_\\$##]", "\\.", varnames)
dl.varnames <- varnames
chv = sapply(df, is.character)
if (any(chv)) {
for (v in which(chv)) {
dfn[[v]] = gsub("\\s", " ", dfn[[v]])
}
lengths <- sapply(df[chv], function(v) max(nchar(v), na.rm=T))
if (any(lengths > len)) {
warning(paste("Clipped strings in", names(df[chv]), "to", len, "characters"))
for (v in which(chv)) {
df[[v]] = substr(df[[v]], start=1, stop=len)
}
}
lengths[is.infinite(lengths)] = 0
lengths[lengths < 1] = 1
lengths <- paste("(A", lengths, ")", sep = "")
# star <- ifelse(c(FALSE, diff(which(chv) > 1)), " *",
dl.varnames[chv] <- paste(dl.varnames[chv], lengths)
}
# decimals and bools
nmv = sapply(df, is.numeric)
dbv = sapply(df, is.numeric)
nv = (nmv | dbv)
decimals = sapply(df[nv], FUN=decimalplaces)
dl.varnames[nv] = paste(dl.varnames[nv], " (F", decimals+8, ".", decimals, ")", sep="")
if (length(bv) > 0) {
dl.varnames[bv] = paste(dl.varnames[bv], "(F1.0)")
}
rmv = !(chv | nv | bv)
if (length(rmv) > 0) {
dl.varnames[rmv] = paste(dl.varnames[rmv], "(F8.0)")
}
# Breaks in output
brv = seq(1, length(dl.varnames), 10)
dl.varnames[brv] = paste(dl.varnames[brv], "\n", sep=" ")
cat("SET LOCALE = ENGLISH.\n", file = codefile)
cat("DATA LIST FILE=", adQuote(datafile), " free (TAB)\n", file = codefile, append = TRUE)
cat("/", dl.varnames, " .\n\n", file = codefile, append = TRUE)
cat("VARIABLE LABELS\n", file = codefile, append = TRUE)
cat(paste(varnames, adQuote(varlabels), "\n"), ".\n", file = codefile,
append = TRUE)
factors <- sapply(df, is.factor)
if (any(factors)) {
cat("\nVALUE LABELS\n", file = codefile, append = TRUE)
for (v in which(factors)) {
cat("/\n", file = codefile, append = TRUE)
cat(varnames[v], " \n", file = codefile, append = TRUE)
levs <- levels(df[[v]])
cat(paste(1:length(levs), adQuote(levs), "\n", sep = " "),
file = codefile, append = TRUE)
}
cat(".\n", file = codefile, append = TRUE)
}
# Labels stored in attr()
attribs <- !unlist(lapply(sapply(df, FUN=attr, which="1"), FUN=is.null))
if (any(attribs)) {
cat("\nVALUE LABELS\n", file = codefile, append = TRUE)
for (v in which(attribs)) {
cat("/\n", file = codefile, append = TRUE)
cat(varnames[v], " \n", file = codefile, append = TRUE)
# Check labeled values
tc = list()
for (tcv in dimnames(table(df[[v]]))[[1]]) {
if (!is.null(tcl <- attr(df[[v]], tcv))) {
tc[tcv] = tcl
}
}
cat(paste(names(tc), tc, "\n", sep = " "),
file = codefile, append = TRUE)
}
cat(".\n", file = codefile, append = TRUE)
}
ordinal <- sapply(df, is.ordered)
if (any(ordinal)) {
tmp = varnames[ordinal]
brv = seq(1, length(tmp), 10)
tmp[brv] = paste(tmp[brv], "\n")
cat(paste("\nVARIABLE LEVEL", paste(tmp, collapse=" "), "(ORDINAL).\n"),
file = codefile, append = TRUE)
}
num <- sapply(df, is.numeric)
if (any(num)) {
tmp = varnames[num]
brv = seq(1, length(tmp), 10)
tmp[brv] = paste(tmp[brv], "\n")
cat(paste("\nVARIABLE LEVEL", paste(tmp, collapse=" "), "(SCALE).\n"),
file = codefile, append = TRUE)
}
cat("\nEXECUTE.\n", file = codefile, append = TRUE)
write.table(dfn, file = datafile, row = FALSE, col = FALSE,
sep = "\t", quote = F, na = "", eol = "\n", fileEncoding="UTF-8")
}
On the long term, the changes might be considered to be merged into the foreignpackage. Unfortunately, the bug reporting system for the r-project is currently limited to previously registered developers.

The SPSS extension command STATS GET R can read a data frame directly into an SPSS dataset from a saved R workspace. If this extension command is not already installed (it will show up on the File menu), it can be installed from the Utilities menu (Statistics 22-23) or the Extensions menu (Statistics 24+).

What I've found is that:
'foreign' can't handle values with commas in them (basically, it fails what Excel can do with .csv files).
'haven' demands SPSS-compatible names in the r data set.
What I did for the latter is rename them as haven found them.
It seemed to work in blocks:
'Initial 1' through 'Initial 4'
'Final 1' through 'Final 4'
'relapse' and 'Relapse'.

Related

How to save a value that's generated and used in an r function to keep it in the environment?

I have a function (Save.R) that creates a few variables and saves them in a table for further use.
I also have a matrix in my main code that I want to replace some of its cells with a FileName that is generated in the function.
Question: how do I keep FileName and save it to my environment?
*I'm new to R please explain in simple words.
I have tried to input my matrix as an input to Save.R and replace cells as it generates the FileName(s) but it does not work.
for (i in 1:435){
X = subset(NGAW2_Flatfile_Vertical_5percentdamping, grepl(Uniques[i,1],
NGAW2_Flatfile_Vertical_5percentdamping$`Station ID No.`))
if (nrow(X)==1){
# Match[count,] = subset(NGAW2_Flatfile_Vertical_5percentdamping, grepl(Uniques[i,1], NGAW2_Flatfile_Vertical_5percentdamping$`Station ID No.`))
Match[count,] = X[1,]
H1 = substring(X[1,113], 10,15)
H2 = substring(X[1,114], 10,15)
V = substring(X[1,115], 10,15)
St.ID = substring(X[1,9], 1, 7)
Save(H1, H2, V, Match)
count=count+1
}
}
Save <- function(H1, H2, V){
H1 = paste(H1, ".DAT", sep = "")
data = read.delim(H1, sep = "", header = FALSE)
When1 = substring(data[2,1],2,11)
FileName1 = paste("20", When1, "_", St.ID, "_", "H1", sep = "" )
}

adding to lists together using cbind

This program works because I made the varibles inisde lapply global by using the <<- operator. However, it does not work with the real files in the real program. These are .tsv files whith named columns. The answer I get when I run the real program is: Error: (converted from warning) Error in : (converted from warning) Error in : arguments imply differing number of rows: 3455, 4319. What might be causing this?
lc <- list("test.txt", "test.txt", "test.txt", "test.txt")
lc1 <- list("test.txt", "test.txt", "test.txt")
lc2 <- list("test.txt", "test.txt")
#list of lists. The lists contain file names
lc <- list(lc, lc1, lc2)
#new names for the three lists in the list of lists
new_dataFns <- list("name1", "name2", "name3")
file_paths <- NULL
new_path <- NULL
#add the file names to the path and read and merge the contents of each list in the list of lists
lapply(
lc,
function(lc) {
filenames <- file.path(getwd(), lc)
dataList <<- lapply(filenames, function (lc) read.table(file=lc, header=TRUE))
dataList <<- lapply(dataList, function(dataList) {merge(as.data.frame(dataList),as.data.frame(dataList))})
}
)
#add the new name of the file to the path total will be 3 paths/fille_newname.tsv.
lapply(new_dataFns, function(new_dataFns) {new_path <<- file.path(getwd(), new_dataFns)})
print(new_path)
print(dataList)
finalFiles <- merge(as.data.frame(dataList), as.data.frame(new_path))
print(finalFiles)
I found a solution to the problem by writing a different type of code. Please see below. The input to the function is provided by the app input widgets
glyCount1 <- function(answer = NULL, fileChoice = NULL, combination = NULL, enteredValue = NULL, nameList) {
lc = nameList
new_dataFns <- gsub(" ", "", nameList)
first_path <- NULL
new_path <- NULL
old_path <- NULL
file_content <- NULL
for(i in 1:length(lc)){
for(j in 1:length(lc[[i]])){
if(!is.null(lc[[i]])){
first_path[[j]]<- paste(getwd(), "/", lc[[i]][j], sep = "")
tryCatch(file_content[[j]] <- read.csv(file = first_path[[i]], header = TRUE, sep = ","), error = function(e) NULL)
old_path[[j]] <- paste(getwd(), "/", i, ".csv", sep = "")
write.table(file_content[[j]], file = old_path[[j]], append = TRUE, col.names = FALSE)
}
}
}
}

Removing new lines in R

I am trying to bring multiple rows into one cell in my CSV file. I first began with converting my text file into a CSV file, however the final column needs to have all the contents in one cell, and it's currently being split into multiple. The CSV File currently looks like the first picture, and needs to look like the second picture. Picture1Picture2
I have the following code:
mydata = read.table ("rolled_swiftmessage_test.txt", sep="|", allowEscapes
= TRUE, fill = FALSE)
write.table(mydata, file="rolled_swiftmessage_test.csv",sep=",",col.names=
FALSE,row.names= FALSE)
Currently it produces Picture_1, and I need it to produce picture_2. How do I fix it? Thanks!
After corresponding with the OP and seeing the kind of data that she has, this is my updated answer:
mydata <- read.table ("Test_TextFile.txt", sep="|", allowEscapes = TRUE, fill = FALSE, stringsAsFactors = F)
# Remove rows full of dashes
for(row in 1:nrow(mydata)) {
if(grepl('^\\-+$', mydata$V1[row])) mydata <- mydata[-row,]
}
empty_rows <- which(grepl('^\\s*$', mydata$V1))
rows_to_squeeze <- split(empty_rows, cumsum(c(1, diff(empty_rows) != 1)))
for(i in length(rows_to_squeeze):1) {
mydata$V12[rows_to_squeeze[[i]][1] - 1] <- paste(mydata$V12[seq(rows_to_squeeze[[i]][1] - 1, rows_to_squeeze[[i]][length(rows_to_squeeze[[i]])])], collapse = ' ')
mydata <- mydata[-seq(rows_to_squeeze[[i]][1], rows_to_squeeze[[i]][length(rows_to_squeeze[[i]])]),]
}
write.table(mydata, file="rolled_swiftmessage_test.csv", sep=",", col.names = FALSE, row.names = FALSE)
Original answer
Here you have my attempt at this. It's not pretty, but I think it works. Basically, I read the file as lines of text, not a table, I operate on the lines to join those that belong on the same 'message' cell, and then I put them in a nice data frame that can be saved as a csv file. Let me know if you need any other tweaks:
install.packages('stringr') ## if not installed yet
library(stringr) ## in order to use str_detect and str_split below
mydata <- readLines("rolled_swiftmessage_test.txt")
new_mydata = vector('character')
current <- 1
while(!is.na(mydata[current])) {
if(str_detect(mydata[current], '\\{')) {
i <- 1
while(!str_detect(mydata[current + i], '\\}')) {
mydata[current] <- paste(mydata[current], mydata[current + i], collapse = ' ')
i = i + 1
}
mydata[current] <- paste(mydata[current], mydata[current + i], collapse = ' ')
mydata[current] <- gsub('\\| \\| \\| \\|', '', mydata[current])
new_mydata = c(new_mydata, mydata[current])
current = current + i + 1
} else {
new_mydata = c(new_mydata, mydata[current])
current = current + 1
}
}
new_mydata <- sapply(new_mydata, function(x) str_split(x, '\\|'))
new_mydata <- as.data.frame(t(as.data.frame(new_mydata)))
write.table(new_mydata, file="rolled_swiftmessage_test.csv", sep=",", col.names = FALSE, row.names = FALSE)
The resulting image after opening the csv file (notice that I added the same row to the original text file three times just so that I would have more lines for testing):

Stream processing large csv file in R

I need to make a couple of relatively simple changes to a very large csv file (c.8.5GB). I tried initially using various reader functions: read.csv, readr::read.csv, data.table::fread. However: they all run out of memory.
I'm thinking I need to use a stream processing approach instead; read a chunk, update it, write it, repeat. I found this answer which is on the right lines; however I don't how to terminate the loop (I'm relatively new to R).
So I have 2 questions:
What's the right way to make the while loop work?
Is there a better way (for some definition of 'better')? e.g. is there some way to do this using dplyr & pipes?
Current code as follows:
src_fname <- "testdata/model_input.csv"
tgt_fname <- "testdata/model_output.csv"
#Changes needed in file: rebase identifiers, set another col to constant value
rebase_data <- function(data, offset) {
data$'Unique Member ID' <- data$'Unique Member ID' - offset
data$'Client Name' <- "TestClient2"
return(data)
}
CHUNK_SIZE <- 1000
src_conn = file(src_fname, "r")
data <- read.csv(src_conn, nrows = CHUNK_SIZE, check.names=FALSE)
cols <- colnames(data)
offset <- data$'Unique Member ID'[1] - 1
data <- rebase_data(data, offset)
#1st time through, write the headers
tgt_conn = file(tgt_fname, "w")
write.csv(data,tgt_conn, row.names=FALSE)
#loop over remaining data
end = FALSE
while(end == FALSE) {
data <- read.csv(src_conn, nrows = CHUNK_SIZE, check.names=FALSE, col.names = cols)
data <- rebase_data(data, offset)
#write.csv doesn't support col.names=FALSE; so use write.table which does
write.table(data, tgt_conn, row.names=FALSE, col.names=FALSE, sep=",")
# ??? How to test for EOF and set end = TRUE if so ???
# This doesn't work, presumably because nrow() != CHUNK_SIZE on final loop?
if (nrow(data) < CHUNK_SIZE) {
end <- TRUE
}
}
close(src_conn)
close(tgt_conn)
Thanks for any pointers.
Sorry to poke a 2-year-old thread, but now with readr::read_csv_chunked (auto-loaded along with dplyr when loading tidyverse), we could also do like:
require(tidyverse)
## For non-exploratory code, as #antoine-sac suggested, use:
# require(readr) # for function `read_csv_chunked` and `read_csv`
# require(dplyr) # for the pipe `%>%` thus less parentheses
src_fname = "testdata/model_input.csv"
tgt_fname = "testdata/model_output.csv"
CHUNK_SIZE = 1000
offset = read_csv(src_fname, n_max=1)$comm_code %>% as.numeric() - 1
rebase.chunk = function(df, pos) {
df$comm_code = df$comm_code %>% as.numeric() - offset
df$'Client Name' = "TestClient2"
is.append = ifelse(pos > 1, T, F)
df %>% write_csv(
tgt_fname,
append=is.append
)
}
read_csv_chunked(
src_fname,
callback=SideEffectChunkCallback$new(rebase.chunk),
chunk_size = chunck.size,
progress = T # optional, show progress bar
)
Here the tricky part is to set is.append based on parameter pos, which indicates the start row number of the data frame df within original file. Within readr::write_csv, when append=F the header (columns name) will be written to file, otherwise not.
Try this out:
library("chunked")
read_chunkwise(src_fname, chunk_size=CHUNK_SIZE) %>%
rebase_data(offset) %>%
write_chunkwise(tgt_fname)
You may need to fiddle a bit with the colnames to get exactly what you want.
(Disclaimer: haven't tried the code)
Note that there is no vignette with the package but the standard usage is described on github: https://github.com/edwindj/chunked/
OK I found a solution, as follows:
# src_fname <- "testdata/model_input.csv"
# tgt_fname <- "testdata/model_output.csv"
CHUNK_SIZE <- 20000
#Changes needed in file: rebase identifiers, set another col to constant value
rebase_data <- function(data, offset) {
data$'Unique Member ID' <- data$'Unique Member ID' - offset
data$'Client Name' <- "TestClient2"
return(data)
}
#--------------------------------------------------------
# Get the structure first to speed things up
#--------------------------------------------------------
structure <- read.csv(src_fname, nrows = 2, check.names = FALSE)
cols <- colnames(structure)
offset <- structure$'Unique Member ID'[1] - 1
#Open the input & output files for reading & writing
src_conn = file(src_fname, "r")
tgt_conn = file(tgt_fname, "w")
lines_read <- 0
end <- FALSE
read_header <- TRUE
write_header <- TRUE
while(end == FALSE) {
data <- read.csv(src_conn, nrows = CHUNK_SIZE, check.names=FALSE, col.names = cols, header = read_header)
if (nrow(data) > 0) {
lines_read <- lines_read + nrow(data)
print(paste0("lines read this chunk: ", nrow(data), ", lines read so far: ", lines_read))
data <- rebase_data(data, offset)
#write.csv doesn't support col.names=FALSE; so use write.table which does
write.table(data, tgt_conn, row.names=FALSE, col.names=write_header, sep = ",")
}
if (nrow(data) < CHUNK_SIZE) {
end <- TRUE
}
read_header <- FALSE
write_header <- FALSE
}
close(src_conn)
close(tgt_conn)

try to create new variable using loop in R,but failed

I am a new user to R.I have already imported all data from all my txt file using the code down below,but i want to create a new variable when importing data,the variable is called case.The value of case for the first row is 1 and for the rest is 0.
And when i try to run the code,the console did not say anytime wrong ,the data has been imported, but the new variable wasn't created.I don't know why.
for(i in Filenames){
perpos <- which(strsplit(i, "")[[1]]==".")
data=assign(
gsub(" ","",substr(i, 1, perpos-1)),
read.table(paste(filepath,i,sep=""),fill=TRUE,header=TRUE,quote ="",row.names = NULL,sep="\t")
)
strsplit(i, "")
filename = strsplit(as.character(i),"\\.txt")
data$case = ifelse(data$NAME=="filename",1,0)
}
Thanks guys! I used #joosts's code and made some ajustment. The code down below works just fine.
fn <- paste(filepath,Filenames,sep="")
mylist <- lapply(fn, read.table,fill = TRUE, header = TRUE, quote = "",row.names = NULL, sep = "\t",stringsAsFactors=FALSE)
for(i in 1:length(Filenames)){
mylist[[i]]<- cbind(mylist[[i]], case = 0)
if(nrow(mylist[[i]])>0) {
mylist[[i]]$case[1] <- 1
}
mylist[[i]]<- cbind(mylist[[i]], ID = i)
}
do.call(rbind, mylist)
I am assuming you want to read in multiple text files, with each file containing the same columns (in the same order). In order to combine multiple dataframes (the things that result from calling read.data()), you should call the function rbind().
And I assume your code to get a filename without the extension is slightly overcomplex...
for(file in filenames) {
sanitized_filename <- gsub(" ", "", strsplit(file, "\\.")[[1]][1])
file.frame <- read.table(paste(filepath, file, sep=""), fill = TRUE, header = TRUE, quote = "", row.names = NULL, sep = "\t")
file.frame <- cbind(file.frame, name = I(sanitized_filename), case = 0)
if(nrow(file.frame)>0) {
file.frame$case[1] <- 1
}
data <- ifelse(exists("data"), rbind(data, file.frame), file.frame)
}

Resources