Assign value to data frame in R to all elements conditionally - r

I am trying to assign a value to all cells in a data frame that have a specific value,
using this code:
train_data <- read.csv("train_set.csv",header=TRUE)
train_data[train_data == "<NA>"] <- 0
But it does not work, I still see the values unchanged. How can I change values? Data in CSV is as below
spec1 spec2
NA NA
NA NA
NA NA
NA NA
NA NA
NA NA
NA NA
NA NA
SP-0013 SP-0063
SP-0013 SP-0063
NA NA
NA NA
NA NA
NA NA
NA NA
NA NA

As @akrun and others have mentioned, we may need an actual copy of your data, but give this a shot:
train_data <- read.csv("train_set.csv", header=TRUE, na.strings = c("NA", "<NA>"), stringsAsFactors=FALSE)
train_data[is.na(train_data)] <- 0

Related

R na.approx error: need at least two non-NA values to interpolate

Sample Data
1/1/2000 NA NA NA 29.71 NA
1/2/2000 NA NA NA NA NA
1/3/2000 NA NA NA NA NA
1/4/2000 NA NA NA 29.25 NA
1/5/2000 NA NA NA 30.28 NA
1/6/2000 NA NA NA 27.66 NA
1/7/2000 NA NA NA 27.22 NA
1/8/2000 NA NA NA 27.27 NA
1/9/2000 170 4.1 NA 5.24 NA
1/10/2000 NA NA NA NA NA
1/11/2000 NA NA NA 27.65 NA
1/12/2000 NA NA NA 28.28 100.57
1/13/2000 NA NA NA 27.52 NA
I'm trying to interpolate a lot of NA values.
I have unique dates (the key), but most of the [other] data columns begin/end with NULL/NA values (combined_data_z[,a]). I want to interpolate the empty values in these [other] columns against the date, but I get this error when attempting it:
Error in approx(x[!na], y[!na], xout, ...) : need at least two
non-NA values to interpolate
library(zoo)
#start with 2 because 1st column is date
a=2
for (i in parsedList)
{
dates <- combined_data_z[,1]
test1 <- combined_data_z[,a]
test1_z <- zoo(test1)
test1_z_approx <- na.fill(na.approx(test1_z, x=dates, rule=2, na.rm = FALSE), "extend")
#print(test1_z_approx)
a=a+1
}
update: apparently it has something to do with the for loop, when I removed it and tested using print statements and built up from there, I found that it works when not enclosed in brackets (but I need the loop).
dates <- combined_data_z[,1]
test1 <- combined_data_z[,4]
test1_z <- zoo(test1)
test1_z_approx <- na.fill(na.approx(test1_z, x=dates, rule=2, na.rm = FALSE), "extend")
print(test1_z_approx)
For the following dataset you provided in comments this works:
library(zoo)
combined_data_z <- read.csv(file="http://thistleknot.sytes.net/wordpress/wp-content/uploads/2018/04/output_NoNA.csv")
test1_z_approx <- matrix(NA, ncol=ncol(combined_data_z)-2, nrow = nrow(combined_data_z))
for (i in 3:ncol(combined_data_z))
{
dates <- combined_data_z[,1]
test1 <- combined_data_z[,i]
test1_z <- zoo(test1)
test1_z_approx[,i-2] <-as.matrix( na.fill(na.approx(test1_z, x=dates, rule=2, na.rm = FALSE), "extend"))[,1]
}
If your dataset starts with the "date" column , then the code will look like:
head(combined_data_z)
# date CPIAUCSL UNRATE MEHOINUSA672N INTDSRUSM193N CIVPART
# 1 1/1/2000 169.3 4 58544 5 67.3
# 2 1/2/2000 NA NA NA NA NA
# 3 1/3/2000 NA NA NA NA NA
# 4 1/4/2000 NA NA NA NA NA
# 5 1/5/2000 NA NA NA NA NA
# 6 1/6/2000 NA NA NA NA NA
test1_z_approx <- matrix(NA, ncol=ncol(combined_data_z)-1, nrow = nrow(combined_data_z))
for (i in 2:ncol(combined_data_z))
{
dates <- combined_data_z[,1]
test1 <- combined_data_z[,i]
test1_z <- zoo(test1)
test1_z_approx[,i-1] <-as.matrix( na.fill(na.approx(test1_z, x=dates, rule=2, na.rm = FALSE), "extend"))[,1]
}
head(test1_z_approx)
# [,1] [,2] [,3] [,4] [,5]
#[1,] 169.3000 4.000000 58544 5.000000 67.30000
#[2,] 224.0420 4.033100 59039 2.844406 64.07145
#[3,] 196.4639 3.959895 59039 4.579983 65.57215
#[4,] 188.9426 3.939930 59039 5.053322 65.98144
#[5,] 186.4355 3.933275 59039 5.211101 66.11786
#[6,] 183.9284 3.926620 59039 5.368881 66.25429
Thanks go to Katia for the assist (specifically, my x's and y's needed to be in separate data frames)
combined_data_z <- df3
#https://stackoverflow.com/a/50173660/1731972
#file begins with numeric iterations
#ncol(combined_data_z)
dates <- combined_data_z[1]
print(dates)
#important to start at 2!, otherwise na.approx will not work!
#either copy from 2: on or copy whole and drop first column (date)
#test1 <- combined_data_z[c(2:length(parsedList)+1)]
#drop date
test1 <- combined_data_z
test1[1] <- NULL
print(test1)
#wtf, had to add data.frame today!
test1_z <- zoo(data.frame(test1))
date_z <- zoo(data.frame(dates))
print(test1_z)
#colnames(test1_z)
print(dates)
test1_z_approx <- na.fill(na.approx(test1_z, dates$date, rule=2, na.rm = FALSE), "extend")
print(test1_z_approx)
#new <- NULL
print(new)
new <- c(data.frame(dates),data.frame(test1_z_approx))
print(new)
write.csv(new, file = "output_test.csv")

Creating multiple columns in data table with a for loop

My data table looks like:
head(data)
Date AI AGI ADI ASI ARI ERI NVRI SRI FRI IRI
1: 1991-09-06 NA 2094.19 NA NA NA NA NA NA NA NA
2: 1991-09-13 NA 2204.94 NA NA NA NA NA NA NA NA
3: 1991-09-20 NA 2339.10 NA NA NA NA NA NA NA NA
4: 1991-09-27 NA 2387.81 NA NA NA NA NA NA NA NA
5: 1991-10-04 NA 2459.94 NA NA NA NA NA NA NA NA
6: 1991-10-11 NA 2571.07 NA NA NA NA NA NA NA NA
Don't worry about the NAs. What I want to do is make a "percentage change" column for each of the columns apart from date.
What I've done so far is:
names_no_date <- unique(names(data))[!unique(names(data)) %in% "Date"]
for (i in names_no_date){
data_ch <- data[, paste0(i, "ch") := i/shift(i, n = 1, type = "lag")-1]}
I get the error:
Error in i/shift(i, n = 1, type = "lag") :
non-numeric argument to binary operator
I'm wondering how I get around this error?
i is a string, so you are trying to divide a string in i/shift(i, n = 1, type = "lag"):
> "AI"/NA
Error in "AI"/NA : non-numeric argument to binary operator
Instead, do
for (i in names_no_date){
data[, paste0(i, "ch") := get(i)/shift(get(i), n = 1, type = "lag")-1]
}
Also see Referring to data.table columns by names saved in variables.
Edit: @Frank writes in the comments that a more concise way to produce OP's output is
data[, paste0(names_no_date, "_pch") := .SD/shift(.SD) - 1, .SDcols=names_no_date]

Opening csv of specific sequences: NAs come out of nowhere?

I feel like this is a relatively straightforward question, and I feel I'm close but I'm not passing edge-case testing. I have a directory of CSVs and instead of reading all of them, I only want some of them. The files are in a format like 001.csv, 002.csv,...,099.csv, 100.csv, 101.csv, etc which should help to explain my if() logic in the loop. For example, to get all files, I'd do something like:
id = 1:1000
setwd("D:/")
filenames = as.character(NULL)
for (i in id){
if(i < 10){
i <- paste("00",i,sep="")
}
else if(i < 100){
i <- paste("0",i,sep="")
}
filenames[[i]] <- paste(i,".csv", sep="")
}
y <- do.call("rbind", lapply(filenames, read.csv, header = TRUE))
The above code works fine for id=1:1000, for id=1:10, id=20:70 but as soon as I pass it id=99:100 or any sequence involving numbers starting at over 100, it introduces a lot of NAs.
Example output below for id=98:99
> filenames
098 099
"098.csv" "099.csv"
Example output below for id=99:100
> filenames
099
"099.csv" NA NA NA NA NA NA NA NA
NA NA NA NA NA NA NA NA NA
NA NA NA NA NA NA NA NA NA
NA NA NA NA NA NA NA NA NA
NA NA NA NA NA NA NA NA NA
NA NA NA NA NA NA NA NA NA
NA NA NA NA NA NA NA NA NA
NA NA NA NA NA NA NA NA NA
NA NA NA NA NA NA NA NA NA
NA NA NA NA NA NA NA NA NA
NA NA NA NA NA NA NA NA NA
"100.csv"
I feel like I'm missing some catch statement in my if() logic. Any insight would be greatly appreciated! :)
You can avoid the loop for creating the filenames
filenames <- sprintf('%03d.csv', 1:1000)
y <- do.call(rbind, lapply(filenames, read.csv, header = TRUE))
@akrun has given you a much better way of solving your task. But in terms of the actual issue with your code, the problem is that for i < 100 you subset by a character vector (implicitly converted using paste) while for i >= 100 you subset by an integer. When you use id = 99:100 this translates to:
filenames <- character(0)
filenames["099"] <- "099.csv" # length(filenames) == 1L
filenames[100] <- "100.csv" # length(filenames) == 100L, with all(is.na(filenames[2:99]))
Assigning to a named member of a vector that doesn't yet exist will create a new member at position length(vector) + 1 whereas assigning to a numbered position that is > length(vector) will also fill in every intervening position with NA.
Another approach, although less efficient than @akrun's solution, is with the following function:
#' Read zero-padded CSV files and stack them into one data frame
#'
#' For every element of `id`, reads the file named "<id>.csv" (id left-padded
#' with zeros to three digits, e.g. 7 -> "007.csv") from the current working
#' directory and row-binds the results in the order given by `id`.
#'
#' @param id Whole-number vector of file ids to read.
#' @return A data frame with the rows of all files; an empty data frame when
#'   `id` has length zero.
merged <- function(id = 1:332) {
  # sprintf() is vectorised, so all file names are built in one call.
  files <- sprintf('%03d.csv', id)
  frames <- lapply(files, read.csv)
  # Bind once instead of growing the frame inside a loop; an empty `id`
  # yields an empty data frame rather than iterating over 1:0.
  df <- if (length(frames) > 0L) do.call(rbind, frames) else data.frame()
  df
}
Now, you can merge the files with:
dat <- merged(99:100)
Furthermore, you can assign column names by inserting the following line in the function just before the last line with df:
colnames(df) <- c(..specify the colnames in here..)

How can I add two columns in R

I cannot seem to add two columns in R.
when I try
dat$V1 + dat$V2
I get
[1] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
Warning message:
In Ops.factor(dat$V1, dat$V2) : + not meaningful for factors
lots of other questions suggest to do as I have done, however as you can see this does not work for me. what is the problem?
Try converting your factor columns to numeric (assuming V1 and V2 are the first two columns):
dat[,1:2] <- lapply(dat[,1:2], function(x) as.numeric(as.character(x)))
dat$V1 +dat$V2
For example:
dat <- data.frame(V1= factor(1:5), V2= factor(6:10))
dat$V1+dat$V2
#[1] NA NA NA NA NA
#Warning message:
#In Ops.factor(dat$V1, dat$V2) : + not meaningful for factors
dat[,1:2] <- lapply(dat[,1:2], function(x) as.numeric(as.character(x)))
dat$V1 +dat$V2
#[1] 7 9 11 13 15

NAs in correlation in R

I have two dataframes with a single row and would like to find the correlation using cor() function in R.
### data A
structure(list(`244901_at` = 5.9926850249, `244902_at` = 6.3553842023,
`244903_at` = 8.8921318402, `244904_at` = 6.4579518676, `244905_at` = 4.7964593532,
`244906_at` = 8.3237756365, `244907_at` = 4.3723366423, `244908_at` = 4.7352416175,
`244909_at` = 4.5714368032, `244910_s_at` = 4.1291856864), .Names = c("244901_at",
"244902_at", "244903_at", "244904_at", "244905_at", "244906_at",
"244907_at", "244908_at", "244909_at", "244910_s_at"), class = "data.frame", row.names = c(NA, -1L))
data B
structure(list(`244901_at` = 4.750238726, `244902_at` = 5.0413815841,
`244903_at` = 4.9859823666, `244904_at` = 6.1587895393, `244905_at` = 4.8531009472,
`244906_at` = 5.6846558629, `244907_at` = 4.584193219, `244908_at` = 4.5031021576,
`244909_at` = 4.4333119965, `244910_s_at` = 4.1019972842), .Names = c("244901_at",
"244902_at", "244903_at", "244904_at", "244905_at", "244906_at",
"244907_at", "244908_at", "244909_at", "244910_s_at"), class = "data.frame", row.names = c(NA, -1L))
when I calculate the correlation it gives me NA.
cor(data A, data B)
244901_at 244902_at 244903_at 244904_at 244905_at 244906_at 244907_at 244908_at
244901_at NA NA NA NA NA NA NA NA
244902_at NA NA NA NA NA NA NA NA
244903_at NA NA NA NA NA NA NA NA
244904_at NA NA NA NA NA NA NA NA
244905_at NA NA NA NA NA NA NA NA
244906_at NA NA NA NA NA NA NA NA
244907_at NA NA NA NA NA NA NA NA
244908_at NA NA NA NA NA NA NA NA
244909_at NA NA NA NA NA NA NA NA
244910_s_at NA NA NA NA NA NA NA NA
244909_at
244901_at NA
244902_at NA
244903_at NA
244904_at NA
244905_at NA
244906_at NA
244907_at NA
244908_at NA
244909_at NA
244910_s_at NA
If your data are in data frames, then the function cor() will calculate the correlation between the columns of your two data frames. In your case you get all NA because there is only one row in each data frame.
You have to transpose your data frames so that this one row becomes one column, and then you can calculate the correlation coefficient. To transpose you can use the function t().
cor(t(df.A),t(df.B))

Resources