R - Matching columns in a dataframe

I have an NDVI timeseries dataset where the first column is dates and the next three are NDVI data for three different IDs (59231, 158157, 282302):
Date X59231 X158157 X282302
1 13149 NA 0.398 NA
2 13157 0.344 0.267 0.327
3 13165 NA 0.431 NA
. ..... ..... ..... .....
Here's the dput:
structure(list(Date = c(13149L, 13157L, 13165L, 13173L, 13181L,
13189L, 13197L, 13205L, 13213L, 13221L, 13229L, 13237L, 13245L,
13253L, 13261L, 13269L, 13277L, 13285L, 13293L, 13301L, 13309L,
13317L, 13325L, 13333L, 13341L, 13349L, 13357L, 13365L, 13373L,
13381L, 13389L, 13397L, 13405L, 13413L, 13421L, 13429L, 13437L,
13445L, 13453L, 13461L, 13469L, 13477L, 13485L, 13493L, 13501L,
13509L), X59231 = c(NA, 0.344, NA, 0.398, NA, 0.587, NA, NA,
0.451, 0.597, 0.593, 0.556, 0.559, 0.375, 0.374, 0.386, 0.425,
0.383, 0.349, 0.315, 0.282, 0.323, 0.315, 0.359, 0.292, 0.271,
0.297, 0.307, 0.322, 0.344, 0.297, 0.285, 0.273, 0.282, 0.281,
0.304, 0.314, NA, 0.391, 0.601, 0.65, NA, 0.653, 0.666, 0.519,
0.625), X158157 = c(0.398, 0.267, 0.431, NA, 0.36, 0.434, 0.434,
0.465, 0.447, 0.521, 0.539, 0.563, 0.595, 0.541, 0.553, 0.381,
0.533, 0.505, 0.551, NA, 0.546, 0.535, 0.523, 0.501, 0.508, 0.51,
0.506, 0.51, 0.514, 0.526, 0.555, 0.545, 0.53, 0.539, 0.531,
0.53, NA, 0.585, 0.597, 0.32, 0.569, 0.601, NA, NA, 0.52, 0.532
), X282302 = c(NA, 0.327, NA, 0.282, 0.26, 0.293, 0.25, 0.288,
0.336, 0.299, 0.29, 0.28, NA, 0.305, 0.319, NA, 0.255, 0.292,
0.294, NA, NA, 0.367, 0.331, 0.344, 0.283, 0.284, 0.291, 0.273,
0.239, 0.285, 0.249, 0.285, 0.247, 0.288, 0.276, NA, 0.317, 0.375,
0.38, 0.417, 0.374, 0.491, NA, NA, NA, 0.471)), class = "data.frame", row.names = c(NA,
-46L))
I run the following code to smooth each timeseries (to remove noise) and find the multiple maximums and minimums of each ID's NDVI timeseries.
rm(list = ls())

# Read in csv data
df = read.csv("Data.csv", header = TRUE)
date_col = df[, 1]
num_cols = length(df[1, ])  # number of columns
num_Dcols = num_cols - 1    # number of data columns, i.e. excluding the date (first) column

# Function to append columns to a data frame, padding shorter columns with NA
cbind.fill <- function(...) {
  nm <- list(...)
  nm <- lapply(nm, as.matrix)
  n <- max(sapply(nm, nrow))
  do.call(cbind, lapply(nm, function(x)
    rbind(x, matrix(, n - nrow(x), ncol(x)))))
}

# Function to find all the (local) maximums of a series
ts_max <- function(signal) {
  points_max = which(diff(sign(diff(signal))) == -2) + 1
  return(points_max)
}

# Function to find all the (local) minimums of a series
ts_min <- function(signal) {
  points_min = which(diff(sign(diff(-signal))) == -2) + 1
  return(points_min)
}

# Create an empty data frame and an empty vector for column names
finalDF = data.frame(matrix(ncol = 0, nrow = 0))
CNames = c()

for (i in 1:num_Dcols) {
  df_sub = df[, c(1, i + 1)]  # data frame of the date column and the (i+1)-th column
  df_removeNA = na.omit(df_sub)

  # Append the date column to the final data frame
  df_date = df_removeNA[, 1]
  finalDF = cbind.fill(finalDF, df_date)

  # Append the NDVI timeseries column to the final data frame
  df_data = df_removeNA[, 2]
  finalDF = cbind.fill(finalDF, df_data)

  # Smooth the timeseries and extract the trend component
  stl_1 = stl(ts(df_data, frequency = 4), "periodic")
  trend_1 = as.numeric(stl_1$time.series[, 2])

  # Find maximums and minimums of the smoothed timeseries
  max_1 = ts_max(trend_1)
  min_1 = ts_min(trend_1)

  # Append maximums and minimums to the final data frame
  finalDF = cbind.fill(finalDF, df_data[max_1])
  finalDF = cbind.fill(finalDF, df_data[min_1])

  # Append column names to the column names vector
  CNames = c(CNames, toString(colnames(df_sub[1])))
  CNames = c(CNames, toString(colnames(df_sub[2])))
  CNames = c(CNames, paste0(toString(colnames(df_sub[2])), "_Max"))
  CNames = c(CNames, paste0(toString(colnames(df_sub[2])), "_Min"))

  # Plot the smoothed series with its maximums and minimums
  plot(df_date, trend_1, type = 'l')
  abline(v = df_date[max_1], col = "red")
  abline(v = df_date[min_1], col = "blue")
}

# Rename final data frame's column names
colnames(finalDF) = CNames

# Export final data frame to CSV
write.csv(finalDF, file = "finalDF_smooth.csv")
#Rename final data frame's column names
colnames(finalDF) = CNames
#Export final data frame to CSV
write.csv(finalDF, file = "finalDF_smooth.csv")
Here's a plot of all the maximums and minimums for the first column of NDVI timeseries data (image omitted).
What I'm trying to figure out is how to add two new columns next to each ID column in the original (or a new) data frame, to store the maximums and minimums. Each maximum and minimum needs to be placed in the cell that matches its corresponding date. In other words, I need two duplicates of each ID column, inserted next to it, with all values replaced by NA except the maximums and minimums calculated in the smoothing code above. For example, this is what I need the final dataframe to look like:
Date 59231 59231_Max 59231_Min 158157 158157_Max 158157_Min 282302 282302_Max 282302_Min
13149 NA NA NA 0.398 NA NA NA NA NA
13157 0.344 NA NA 0.267 NA NA 0.327 NA NA
13165 NA NA NA 0.431 NA NA NA NA NA
13173 0.398 NA NA NA NA NA 0.282 NA NA
13181 NA NA NA 0.360 NA NA 0.260 NA NA
13189 0.587 NA NA 0.434 NA NA 0.293 NA 0.293
13197 NA NA NA 0.434 NA NA 0.25 NA NA
13205 NA NA NA 0.465 NA NA 0.288 NA NA
13213 0.451 NA NA 0.447 NA NA 0.336 NA NA
13221 0.597 NA NA 0.521 NA NA 0.299 0.299 NA
... ... .. .. ... .. .. ... ... ..
This is what it looks like right now.
Date 59231 59231_Max 59231_Min Date 158157 158157_Max 158157_Min Date 282302 282302_Max 282302_Min
13157 0.344 0.593 0.386 13149 0.398 0.595 0.533 13157 0.327 0.299 0.293
13173 0.398 0.425 0.282 13157 0.267 0.546 0.508 13173 0.282 0.331 0.255
13189 0.587 0.315 0.297 13165 0.431 0.545 0.539 13181 0.260 NA 0.285
13213 0.451 0.322 0.273 13181 0.360 0.530 0.320 13189 0.293 NA NA
13221 0.597 0.653 NA 13189 0.434 NA NA 13197 0.250 NA NA
13229 0.593 NA NA 13197 0.434 NA NA 13205 0.288 NA NA
13237 0.556 NA NA 13205 0.465 NA NA 13213 0.336 NA NA
13245 0.559 NA NA 13213 0.447 NA NA 13221 0.299 NA NA
13253 0.375 NA NA 13221 0.521 NA NA 13229 0.290 NA NA
13261 0.374 NA NA 13229 0.539 NA NA 13237 0.280 NA NA
..... ... .. .. ..... ..... .. .. ..... ..... ... ..
Note: I had to omit NAs during each loop iteration, so the code produces a CSV file with a separate, subsetted date column for each ID. I would love to have just one date column, like the ideal table above.
In my code I started creating a new data frame and appending columns after each loop iteration, but I can't figure out how to match the maximums and minimums up with the right cells. Right now all the maximums and minimums are stacked at the top of their columns. Any ideas? Thanks.

How about this? It adds the min and max columns.
df
df$max <- apply(df[2:4], 1, max, na.rm = TRUE)
df$min <- apply(df[2:4], 1, min, na.rm = TRUE)
head(df)
Which produces:
ID X59231 X158157 X282302 max min
1 13149 NA 0.398 NA 0.398 0.398
2 13157 0.344 0.267 0.327 0.344 0.267
3 13165 NA 0.431 NA 0.431 0.431
4 13173 0.398 NA 0.282 0.398 0.282
5 13181 NA 0.360 0.260 0.360 0.260
6 13189 0.587 0.434 0.293 0.587 0.293
I have added this based on the clarification that you have provided. You can ignore the bit above:
This will produce what you want. I have only done it for the first column, but you can just change the variables to get the other columns.
library(dplyr)

df2 <- as_tibble(df)
df2 <- df2 %>%
  mutate(X59231_min = min(X59231, na.rm = TRUE)) %>%
  mutate(X59231_min = ifelse(X59231 == X59231_min, X59231_min, NA)) %>%
  mutate(X59231_max = max(X59231, na.rm = TRUE)) %>%
  mutate(X59231_max = ifelse(X59231 == X59231_max, X59231_max, NA))
So:
df2 %>% filter(!is.na(X59231_min))
gives us:
# A tibble: 1 x 6
ID X59231 X158157 X282302 X59231_min X59231_max
<int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 13349 0.271 0.51 0.284 0.271 NA
And:
df2 %>% filter(!is.na(X59231_max))
Shows:
# A tibble: 1 x 6
ID X59231 X158157 X282302 X59231_min X59231_max
<int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 13493 0.666 NA NA NA 0.666
You should be able to do it for the other columns.
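For the date-aligned table in the question (multiple smoothed maximums/minimums, one shared Date column), a different route is to record the dates of the extrema and match them back against the full Date column. A minimal sketch, reusing the ts_max/ts_min helpers from the question and assuming df is the data frame from the dput above:

ts_max <- function(signal) which(diff(sign(diff(signal))) == -2) + 1
ts_min <- function(signal) which(diff(sign(diff(-signal))) == -2) + 1

finalDF <- df["Date"]
for (col in names(df)[-1]) {
  sub <- na.omit(df[, c("Date", col)])
  trend <- as.numeric(stl(ts(sub[[col]], frequency = 4), "periodic")$time.series[, 2])

  # Dates at which the smoothed series has a local maximum / minimum
  max_dates <- sub$Date[ts_max(trend)]
  min_dates <- sub$Date[ts_min(trend)]

  # Full-length columns: NA except where the row's date is an extremum
  finalDF[[col]] <- df[[col]]
  finalDF[[paste0(col, "_Max")]] <- ifelse(df$Date %in% max_dates, df[[col]], NA)
  finalDF[[paste0(col, "_Min")]] <- ifelse(df$Date %in% min_dates, df[[col]], NA)
}

Because the NA-free subset is used only for the stl() step, the single Date column of df is preserved, and each _Max/_Min column is NA everywhere except at its matched dates.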

Related

Divide matrix values by category means in R

I have a matrix (A) with 211 rows and 6 columns (one per time period) and another matrix (B) with 211 rows and 2 columns, the second of which contains categorical information (1-9).
My aim is to create a new matrix (C) where each value of A is divided by the mean of its category (taken per column of A, with categories from B). I managed to compute the means for each category per column with the aggregate function. These are stored in a separate data frame, column_means, with each time wave in a separate column; the group information is in column_means[,1].
I don't understand how to proceed from here and am looking for an elegant solution I can transfer to future projects (and possibly use to improve my existing code). My guess is that the solution is hidden somewhere in dplyr and is rather simple once you know it.
Thank you for any suggestions.
Data example:
##each column here represents a wave:
initialmatrix <- structure(c(0.882647671948723, 0.847932241438909, 0.753052308699317,
0.754977233408875, NA, 0.886095543329695, 0.849625252682829,
0.78893884364632, 0.77111113840682, NA, 0.887255207679895, 0.851503493865384,
0.812107856411831, 0.793982699495818, NA, 0.885212452552841,
0.854894065774315, 0.815265718290737, 0.806766276556325, NA,
0.882027335190646, 0.85386634818439, 0.818052477777012, 0.815997781565393,
NA, 0.88245957310107, 0.855819521951304, 0.830425687228663, 0.820857689847061,
NA), .Dim = 5:6, .Dimnames = list(NULL, c("V1", "V2", "V3", "V4",
"V5", "V6")))
##the first column is unique ID, the 2nd the category:
categories <- structure(c(1L, 2L, 3L, 4L, 5L, 2L, 1L, 2L, 2L, 4L), .Dim = c(5L,
2L), .Dimnames = list(NULL, c("V1", "V2")))
##the first column represents the category, column 1-6 the mean per category for each corresponding wave in "initialmatrix"
column.means <- structure(list(Group.1 = 1:5, x = c(0.805689153058216, 0.815006230419524,
0.832326976776262, 0.794835253329865, 0.773041961434791), asset_means_2...2. = c(0.80050960343197,
0.81923553710203, 0.833814773618545, 0.797834687980729, 0.780028077018158
), asset_means_3...2. = c(0.805053341257357, 0.828691564900149,
0.833953165695685, 0.799381078569563, 0.785813047374534), asset_means_4...2. = c(0.806116664276125,
0.832439754757116, 0.835982197159582, 0.801702200401293, 0.788814840753852
), asset_means_5...2. = c(0.807668548993891, 0.83801834926905,
0.836036508152776, 0.803433961863399, 0.79014026195926), asset_means_6...2. = c(0.808800359101212,
0.840923947682599, 0.839660313992458, 0.804901773257962, 0.793165113115977
)), row.names = c(NA, 5L), class = "data.frame")
Is this what you are trying to do?
options(digits=3)
divisor <- column.means[categories[, 2], -1]
divisor
# x asset_means_2...2. asset_means_3...2. asset_means_4...2. asset_means_5...2. asset_means_6...2.
# 2 0.815 0.819 0.829 0.832 0.838 0.841
# 1 0.806 0.801 0.805 0.806 0.808 0.809
# 2.1 0.815 0.819 0.829 0.832 0.838 0.841
# 2.2 0.815 0.819 0.829 0.832 0.838 0.841
# 4 0.795 0.798 0.799 0.802 0.803 0.805
initialmatrix/divisor
# x asset_means_2...2. asset_means_3...2. asset_means_4...2. asset_means_5...2. asset_means_6...2.
# 2 1.083 1.082 1.071 1.063 1.053 1.049
# 1 1.052 1.061 1.058 1.061 1.057 1.058
# 2.1 0.924 0.963 0.980 0.979 0.976 0.988
# 2.2 0.926 0.941 0.958 0.969 0.974 0.976
# 4 NA NA NA NA NA NA
This looks like a job for Superma ... no wait ... map2.
library(dplyr)
library(purrr)

as_tibble(initialmatrix) %>%
  mutate(category = as.double(as_tibble(categories)$V2),
         across(starts_with('V'),
                ~ unlist(map2(., category, ~ .x / mean(c(.x, .y)))))) %>%
  select(-category)
# V1 V2 V3 V4 V5 V6
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 0.612 0.614 0.615 0.614 0.612 0.612
# 2 0.918 0.919 0.920 0.922 0.921 0.922
# 3 0.547 0.566 0.578 0.579 0.581 0.587
# 4 0.548 0.557 0.568 0.575 0.580 0.582
# 5 NA NA NA NA NA NA
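Since the question guessed at dplyr: here is a hedged sketch of the divide-by-category-mean idea in dplyr terms, assuming the objects from the dput above and that column.means$Group.1 covers every category occurring in categories[, 2]:

library(dplyr)

# Attach each row's category, then join in that category's per-wave means
wide <- as.data.frame(initialmatrix) %>%
  mutate(Group.1 = categories[, 2]) %>%
  left_join(column.means, by = "Group.1")

# Element-wise division: wave columns divided by the matching mean columns
C <- as.matrix(wide[, colnames(initialmatrix)]) /
  as.matrix(wide[, names(column.means)[-1]])

This reproduces the base-R divisor approach above, just phrased as a join, so it does not rely on the rows of column.means being sorted by category.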

Modify columns' values depending on other columns

My dataset contains NDVI values and NDVI quality-descriptor values (PixelQa) for different areas on different dates. I basically want to erase (set to NA) the NDVI values that are related to a bad quality descriptor (PixelQa). The numeric suffix of the column names links the two: PixelQa_1 relates to NDVI_1, and so on.
Therefore, to "clean" my data I have to check each PixelQa value in order to decide whether to change its related NDVI value. There are three possible situations:
PixelQa is NA -> then the NDVI should also be NA.
PixelQa is 66±0.5 OR 130±0.5 -> then the NDVI keeps its value.
PixelQa is anything else -> then the NDVI value is set to NA (this is bad-quality data which needs to be ignored).
My dataset could be:
DataNDVI_split <- data.frame(
  "21feb1987_NDVI"    = c(0.123, NA, 0.192, 0.234, NA),
  "21feb1987_PixelQa" = c(66.30, NA, 66.00, 79.87, NA),
  "18jul1987_NDVI"    = c(0.223, NA, 0.230, 0.334, NA),
  "18jul1987_PixelQa" = c(66.30, NA, 66.00, 79.87, NA),
  stringsAsFactors = FALSE)
DataNDVI_split
X21feb1987_NDVI X21feb1987_PixelQa X18jul1987_NDVI X18jul1987_PixelQa
1 0.123 66.30 0.223 66.30
2 NA NA NA NA
3 0.192 66.00 0.230 66.00
4 0.234 79.87 0.334 79.87
5 NA NA NA NA
And "clean" it should look like:
X21feb1987_NDVI X21feb1987_PixelQa X18jul1987_NDVI X18jul1987_PixelQa
1 0.123 66.30 0.223 66.30
2 NA NA NA NA
3 0.192 66.00 0.230 66.00
4 NA 79.87 NA 79.87
5 NA NA NA NA
Here's a tentative solution.
First, I'd split up the data into two separate dataframes (DataNDVI being a long-format version of the data with a Data label column, not shown in the excerpt above), thus:
df_ndvi <- DataNDVI[grepl("NDVI", DataNDVI$Data), ]
df_ndvi
Data X21feb1987 X18jul1987
1 NDVI1 0.123 0.223
2 NDVI2 NA NA
3 NDVI3 0.192 0.230
4 NDVI4 0.234 0.334
5 NDVI5 NA NA
df_pixel <- DataNDVI[!grepl("NDVI", DataNDVI$Data), ]
df_pixel
Data X21feb1987 X18jul1987
6 PixelQa1 66.30 66.00
7 PixelQa2 NA NA
8 PixelQa3 66.00 124.23
9 PixelQa4 79.87 86.00
10 PixelQa5 NA NA
To perform the desired changes, there are many possible ways. One is a for loop over the data columns of df_ndvi (all except the first!), with an ifelse that keeps an NDVI value only when its PixelQa value lies within 66±0.5 or 130±0.5, and sets it to NA otherwise (an NA PixelQa also yields NA):
for (i in 2:3) {
  qa <- df_pixel[, i]
  df_ndvi[, i] <- ifelse(!is.na(qa) & (abs(qa - 66) <= 0.5 | abs(qa - 130) <= 0.5),
                         df_ndvi[, i], NA)
}
This results in these corrections in df_ndvi:
df_ndvi
Data X21feb1987 X18jul1987
1 NDVI1 0.123 0.223
2 NDVI2 NA NA
3 NDVI3 0.192 NA
4 NDVI4 NA NA
5 NDVI5 NA NA
EDIT:
If you prefer to split-up the data in this way:
DataNDVI_split <- data.frame(
  "21feb1987_NDVI"    = c(0.123, NA, 0.192, 0.234, NA),
  "21feb1987_PixelQa" = c(66.30, NA, 66.00, 79.87, NA),
  "18jul1987_NDVI"    = c(0.223, NA, 0.230, 0.334, NA),
  "18jul1987_PixelQa" = c(66.30, NA, 66.00, 79.87, NA),
  stringsAsFactors = FALSE)
DataNDVI_split
X21feb1987_NDVI X21feb1987_PixelQa X18jul1987_NDVI X18jul1987_PixelQa
1 0.123 66.30 0.223 66.30
2 NA NA NA NA
3 0.192 66.00 0.230 66.00
4 0.234 79.87 0.334 79.87
5 NA NA NA NA
then the for loop could be adapted thus (each PixelQa column sits immediately to the right of its NDVI column):
for (i in c(1, 3)) {
  qa <- DataNDVI_split[, i + 1]
  DataNDVI_split[, i] <- ifelse(!is.na(qa) & (abs(qa - 66) <= 0.5 | abs(qa - 130) <= 0.5),
                                DataNDVI_split[, i], NA)
}
The result is this:
DataNDVI_split
X21feb1987_NDVI X21feb1987_PixelQa X18jul1987_NDVI X18jul1987_PixelQa
1 0.123 66.30 0.223 66.30
2 NA NA NA NA
3 0.192 66.00 0.230 66.00
4 NA 79.87 NA 79.87
5 NA NA NA NA
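For an arbitrary number of NDVI/PixelQa pairs, a hedged generalization (assuming, as in DataNDVI_split, that each PixelQa column sits immediately to the right of its NDVI column) could locate the pairs by name and apply all three rules at once:

ndvi_cols <- grep("_NDVI", names(DataNDVI_split))
for (i in ndvi_cols) {
  qa <- DataNDVI_split[, i + 1]
  keep <- !is.na(qa) & (abs(qa - 66) <= 0.5 | abs(qa - 130) <= 0.5)
  DataNDVI_split[!keep, i] <- NA  # NA QA and bad QA both blank the NDVI value
}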

Push up and tighten Dataframe. General solution

I want to push up (metaphorically) the data frame in order to get rid of the gaps (NA values).
My Data:
> dput(df1)
structure(list(ID = c("CN1-1", "CN1-1", "CN1-1", "CN1-10", "CN1-10",
"CN1-10", "CN1-11", "CN1-11", "CN1-11", "CN1-12", "CN1-12", "CN1-12",
"CN1-13", "CN1-13", "CN1-13"), v1 = c(0.37673, NA, NA, 1.019972,
NA, NA, 0.515152, NA, NA, 0.375139, NA, NA, 0.508125, NA, NA),
v2 = c(NA, 0.732, NA, NA, 0, NA, NA, 0.748, NA, NA, 0.466,
NA, NA, 0.57, NA), v2 = c(NA, NA, 0.357, NA, NA, 0.816, NA,
NA, 0.519, NA, NA, 0.206, NA, NA, 0.464)), .Names = c("ID",
"v1", "v2", "v2"), row.names = c(NA, 15L), class = "data.frame")
Looks like:
ID v1 v2 v2
1 CN1-1 0.376730 NA NA
2 CN1-1 NA 0.732 NA
3 CN1-1 NA NA 0.357
4 CN1-10 1.019972 NA NA
5 CN1-10 NA 0.000 NA
6 CN1-10 NA NA 0.816
7 CN1-11 0.515152 NA NA
8 CN1-11 NA 0.748 NA
9 CN1-11 NA NA 0.519
10 CN1-12 0.375139 NA NA
11 CN1-12 NA 0.466 NA
12 CN1-12 NA NA 0.206
13 CN1-13 0.508125 NA NA
14 CN1-13 NA 0.570 NA
15 CN1-13 NA NA 0.464
Please note: I'm not sure the pattern is consistent over all rows. It could also happen that one or more variables appear two or more times per ID group.
Desired output:
ID v1 v2 v2
1 CN1-1 0.376730 0.732 0.357
2 CN1-10 1.019972 0.000 0.816
...
My idea was to melt then get rid of all NA values and then dcast. Any better approach?
EDIT:
Duplicated entries could look like this:
16 CN1-x 0.508125 NA NA
17 CN1-x NA 0.570 NA
18 CN1-x NA NA 0.464
19 CN1-x NA NA 0.134
do.call(rbind,
        lapply(split(df1, df1$ID), function(a)
          data.frame(ID = a$ID[1], lapply(a[-1], sum, na.rm = TRUE))))
# ID v1 v2 v2.1
#CN1-1 CN1-1 0.376730 0.732 0.357
#CN1-10 CN1-10 1.019972 0.000 0.816
#CN1-11 CN1-11 0.515152 0.748 0.519
#CN1-12 CN1-12 0.375139 0.466 0.206
#CN1-13 CN1-13 0.508125 0.570 0.464
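A hedged dplyr alternative, assuming "push up" means taking the first non-NA value per ID and column (which also covers the duplicated case in the EDIT by keeping the first occurrence instead of summing):

library(dplyr)

as_tibble(df1, .name_repair = "unique") %>%  # df1 has two columns named v2
  group_by(ID) %>%
  summarise(across(everything(), ~ .x[!is.na(.x)][1]))

Unlike the sum() approach, a group whose column is entirely NA stays NA here instead of becoming 0.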

ggplot with missing values

I have a dataset like this
data <- data.frame(Time=as.Date(c("2007-01-31", "2007-02-28", "2007-03-31", "2007-04-30",
"2007-05-31", "2007-06-30", "2007-07-31", "2007-08-31", "2007-09-30", "2007-10-31",
"2007-11-30", "2007-12-31"), format="%Y-%m-%d"),
a=c(NA, NA, 1.201, NA, NA, 1.206, NA, NA, 1.212, NA, NA, 1.237),
b=c(1.187, 1.201, 1.206, 1.219, 1.211, 1.187, 1.202, 1.213, 1.209, 1.208,
1.219, 1.237),
c=c(1.198, 1.201, 1.203, 1.206, 1.207,1.207, 1.208, 1.21, 1.214, 1.22, 1.228, 1.236))
> data
Time a b c
1 2007-01-31 NA 1.187 1.198
2 2007-02-28 NA 1.201 1.201
3 2007-03-31 1.201 1.206 1.203
4 2007-04-30 NA 1.219 1.206
5 2007-05-31 NA 1.211 1.207
6 2007-06-30 1.206 1.187 1.207
7 2007-07-31 NA 1.202 1.208
8 2007-08-31 NA 1.213 1.210
9 2007-09-30 1.207 1.209 1.214
10 2007-10-31 NA 1.208 1.220
11 2007-11-30 NA 1.219 1.228
12 2007-12-31 1.219 1.237 1.236
I want a ggplot with points for a and lines for b & c. I came up with
ggplot(data, aes(x = Time)) +
  geom_point(aes(y = a, colour = "a"), size = 3) +
  geom_line(aes(y = b, colour = "b")) +
  geom_line(aes(y = c, colour = "c"))
The problem is that I have many more variables like b & c that I want to plot. Is there a more efficient way to do this?
One way of doing it is with the tidyr package (and some dplyr):
library(dplyr)
library(tidyr)

data2 <- gather(data, grp, value, a:c)

ggplot(data2, aes(x = Time, y = value, color = grp, group = grp)) +
  geom_point(data = filter(data2, grp == 'a'), size = 3) +
  geom_line()
The resulting plot (image omitted) shows points for a and lines for each group.
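gather() is superseded in current tidyr; an equivalent reshape with pivot_longer (the ggplot call stays the same) would be:

library(tidyr)
data2 <- pivot_longer(data, a:c, names_to = "grp", values_to = "value")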

Strip leading zero from numeric vector without changing class

I have the following data, which is a few Major League Baseball statistics.
Year AVG SLG TB OBP IsoPow RC
1 1986 0.223 0.300 172 0.330 0.194 64.1
2 1987 0.261 0.356 271 0.329 0.230 92.8
3 1988 0.283 0.357 264 0.368 0.208 100.0
4 1989 0.248 0.328 247 0.351 0.178 91.9
5 1990 0.301 0.374 293 0.406 0.264 128.0
6 1991 0.292 0.367 262 0.410 0.222 118.2
Usually, percentage-type MLB statistics are displayed as a decimal, but with the leading zero removed. I'd like to do the same, but also preserve the class of the variable, which in this case is numeric.
For example, with bonds$AVG I'd like the result to be a numeric vector that looks exactly like
[1] .223 .261 .283 .248 .301 .292
Using sub, the vector goes from numeric to character, then back to its original numeric state after wrapping it with as.numeric.
> sub(0, "", bonds$AVG)
# [1] ".223" ".261" ".283" ".248" ".301" ".292"
> as.numeric(sub(0, "", bonds$AVG))
# [1] 0.223 0.261 0.283 0.248 0.301 0.292
Is this possible in R?
bonds <-
structure(list(Year = c(1986, 1987, 1988, 1989, 1990, 1991),
AVG = c(0.223, 0.261, 0.283, 0.248, 0.301, 0.292), SLG = c(0.3,
0.356, 0.357, 0.328, 0.374, 0.367), TB = c(172, 271, 264,
247, 293, 262), OBP = c(0.33, 0.329, 0.368, 0.351, 0.406,
0.41), IsoPow = c(0.194, 0.23, 0.208, 0.178, 0.264, 0.222
), RC = c(64.1, 92.8, 100, 91.9, 128, 118.2)), .Names = c("Year",
"AVG", "SLG", "TB", "OBP", "IsoPow", "RC"), row.names = c(NA,
6L), class = "data.frame")
Perhaps you could generalize the following by modifying print.data.frame?
f1 <- function(x) noquote(sub(0, "", x))
f1(bonds$AVG)
[1] .223 .261 .283 .248 .301 .292
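Strictly speaking, a plain numeric vector will always print its leading zero, so one hedged way to get persistent .223-style printing while keeping the data numeric underneath is a small S3 class with its own format and print methods (the class name avg here is made up for illustration):

as.avg <- function(x) structure(x, class = c("avg", class(x)))

format.avg <- function(x, ...) sub("^0\\.", ".", formatC(unclass(x), format = "f", digits = 3))
print.avg  <- function(x, ...) print(noquote(format(x)), ...)

bonds$AVG <- as.avg(bonds$AVG)
bonds$AVG        # prints .223 .261 ... without quotes
mean(bonds$AVG)  # arithmetic still works on the underlying numeric

Because print.data.frame formats each column with the format() generic, the AVG column should also print without leading zeros inside the data frame.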
