Related
I have a table test whose NA values I would like to approximate based on linear interpolation between values that do exist.
For example, the second row plotted looks like this:
v1 <- unlist(test[2,])
plot(v1[!is.na(v1)], names(v1)[!is.na(v1)], type="l", add = TRUE)
How would one go about interpolating/approximating the NA values along the x-axis in this case? Any suggestions in base R or dplyr would be helpful
test
variable 26500 30000 30100 30700 31600 33700 33800 33900 34000 34600 34800 35100 35200 35300
1 -20 NA 0 NA NA 10 20 NA NA NA 30 NA NA NA NA
2 -10 NA 0 NA NA NA 10 NA NA NA 20 NA NA NA 30
3 0 0 NA NA NA NA NA 10 NA NA NA 20 NA NA NA
4 24 NA NA NA 0 NA NA NA NA 10 NA NA NA 20 NA
5 40 NA NA 0 NA NA NA NA 10 NA NA NA 20 NA NA
6 55 NA NA 0 NA NA NA NA 10 NA NA NA 20 NA NA
35400 35600 35800 35900 36200 36300 36400 36700 36900 37000 37200 37800 37900 38000 38200
1 40 NA NA NA 50 NA NA NA NA NA 60 NA NA NA 70
2 NA NA NA 40 NA NA NA 50 NA NA NA 60 NA NA NA
3 NA 30 NA NA 40 NA NA NA 50 NA NA NA 60 NA NA
4 NA NA 30 NA NA 40 NA NA NA 50 NA NA NA 60 NA
5 NA NA 30 NA NA 40 NA NA NA 50 NA NA NA NA 60
6 NA NA NA 30 NA NA 40 NA NA 50 NA NA NA NA 60
38800 39000 39100 39200 39700 39800 39900 40000 40200 40600 40700 40800 41700 41800
1 NA NA NA 80 NA NA NA NA 90 NA NA NA 100 NA
2 70 NA NA NA 80 NA NA NA NA 90 NA NA 100 NA
3 70 NA NA NA NA 80 NA NA NA NA 90 NA 100 NA
4 NA 70 NA NA NA NA NA 80 NA NA NA 90 100 NA
5 NA NA 70 NA NA NA NA 80 NA NA NA 90 NA 100
6 NA 70 NA NA NA NA 80 NA NA NA NA 90 100 NA
Here is the sample data:
dput(test)
structure(list(variable = c(-20, -10, 0, 24, 40, 55), `26500` = c(NA,
NA, 0L, NA, NA, NA), `30000` = c(0L, 0L, NA, NA, NA, NA), `30100` = c(NA,
NA, NA, NA, 0L, 0L), `30700` = c(NA, NA, NA, 0L, NA, NA), `31600` = c(10L,
NA, NA, NA, NA, NA), `33700` = c(20L, 10L, NA, NA, NA, NA), `33800` = c(NA,
NA, 10L, NA, NA, NA), `33900` = c(NA, NA, NA, NA, 10L, 10L),
`34000` = c(NA, NA, NA, 10L, NA, NA), `34600` = c(30L, 20L,
NA, NA, NA, NA), `34800` = c(NA, NA, 20L, NA, NA, NA), `35100` = c(NA,
NA, NA, NA, 20L, 20L), `35200` = c(NA, NA, NA, 20L, NA, NA
), `35300` = c(NA, 30L, NA, NA, NA, NA), `35400` = c(40L,
NA, NA, NA, NA, NA), `35600` = c(NA, NA, 30L, NA, NA, NA),
`35800` = c(NA, NA, NA, 30L, 30L, NA), `35900` = c(NA, 40L,
NA, NA, NA, 30L), `36200` = c(50L, NA, 40L, NA, NA, NA),
`36300` = c(NA, NA, NA, 40L, 40L, NA), `36400` = c(NA, NA,
NA, NA, NA, 40L), `36700` = c(NA, 50L, NA, NA, NA, NA), `36900` = c(NA,
NA, 50L, NA, NA, NA), `37000` = c(NA, NA, NA, 50L, 50L, 50L
), `37200` = c(60L, NA, NA, NA, NA, NA), `37800` = c(NA,
60L, NA, NA, NA, NA), `37900` = c(NA, NA, 60L, NA, NA, NA
), `38000` = c(NA, NA, NA, 60L, NA, NA), `38200` = c(70L,
NA, NA, NA, 60L, 60L), `38800` = c(NA, 70L, 70L, NA, NA,
NA), `39000` = c(NA, NA, NA, 70L, NA, 70L), `39100` = c(NA,
NA, NA, NA, 70L, NA), `39200` = c(80L, NA, NA, NA, NA, NA
), `39700` = c(NA, 80L, NA, NA, NA, NA), `39800` = c(NA,
NA, 80L, NA, NA, NA), `39900` = c(NA, NA, NA, NA, NA, 80L
), `40000` = c(NA, NA, NA, 80L, 80L, NA), `40200` = c(90L,
NA, NA, NA, NA, NA), `40600` = c(NA, 90L, NA, NA, NA, NA),
`40700` = c(NA, NA, 90L, NA, NA, NA), `40800` = c(NA, NA,
NA, 90L, 90L, 90L), `41700` = c(100L, 100L, 100L, 100L, NA,
100L), `41800` = c(NA, NA, NA, NA, 100L, NA)), row.names = c(NA,
-6L), class = "data.frame")
We could use na.interp from forecast
# na.interp() is provided by the forecast package
library(forecast)
# Run na.interp() on each row of the value columns (everything except the
# first column). apply() over rows returns one result per column, so t()
# restores the original row/column orientation before assigning back.
test[-1] <- t(apply(test[-1], 1, na.interp))
Or with na.approx
# na.approx() is provided by zoo, which has to be attached for this call
library(zoo)
# Linearly interpolate the NAs of each row of the value columns (everything
# except the first column). na.rm = FALSE keeps leading/trailing NAs in place
# so every row keeps its full length; t() restores the row/column orientation
# that apply() over rows flips.
test[-1] <- t(apply(test[-1], 1, na.approx, na.rm = FALSE))
then do the plotting
# Row 2 without the first column, flattened to a named vector
# (names are the column headers)
v1 <- unlist(test[2, -1])
# Line plot with the row's values on x and the column names on y
plot(v1, names(v1), type = 'l')
If you want to switch easily between different interpolation methods (or time series imputation methods in general) you can also use the imputeTS package.
For the requested solution this would be:
# na_interpolation() is provided by the imputeTS package
library("imputeTS")
# Linear interpolation of each row of the value columns (everything except
# the first column); t() restores the orientation that apply() over rows flips.
test[-1] <- t(apply(test[-1], 1, na_interpolation, option = "linear"))
Switching to Stineman interpolation would look like this:
test[-1] <- t(apply(test[-1], 1, na_interpolation, option = "stine"))
Another option could be spline interpolation:
test[-1] <- t(apply(test[-1], 1, na_interpolation, option = "spline"))
Other imputation methods such as na_ma (moving average imputation) or na_kalman (Kalman smoothing on structural time series models) would also be possible if you replace na_interpolation with the specific function (see also the package README on GitHub for an overview of the imputation functions).
The way I have extracted my results somehow kept them as diagonal elements in a data frame. I would like to reduce the data down, keeping the row names and col names. I.e. merge the row names and col names.
1750:10-K:2006 1800:10-K:2006 1923:10-K:2006 2488:10-K:2006
1750:10-K:2005 0.9291217 NA NA NA
1800:10-K:2005 NA 0.9690067 NA NA
1923:10-K:2005 NA NA 0.8584429 NA
2488:10-K:2005 NA NA NA 0.956372
2969:10-K:2005 NA NA NA NA
3133:10-K:2005 NA NA NA NA
3197:10-K:2005 NA NA NA NA
3333:10-K:2005 NA NA NA NA
3370:10-K:2005 NA NA NA NA
3673:10-K:2005 NA NA NA NA
2969:10-K:2006 3133:10-K:2006 3197:10-K:2006 3333:10-K:2006
1750:10-K:2005 NA NA NA NA
1800:10-K:2005 NA NA NA NA
1923:10-K:2005 NA NA NA NA
2488:10-K:2005 NA NA NA NA
2969:10-K:2005 0.861327 NA NA NA
3133:10-K:2005 NA 0.9375159 NA NA
3197:10-K:2005 NA NA 0.9633629 NA
3333:10-K:2005 NA NA NA 0.9752259
3370:10-K:2005 NA NA NA NA
3673:10-K:2005 NA NA NA NA
3370:10-K:2006 3673:10-K:2006
1750:10-K:2005 NA NA
1800:10-K:2005 NA NA
1923:10-K:2005 NA NA
2488:10-K:2005 NA NA
2969:10-K:2005 NA NA
3133:10-K:2005 NA NA
3197:10-K:2005 NA NA
3333:10-K:2005 NA NA
3370:10-K:2005 0.941602 NA
3673:10-K:2005 NA 0.9745789
Expected output:
1750:10-K:2005_1750:10-K:2006 0.9291217
1800:10-K:2005_1800:10-K:2006 0.9690067
1923:10-K:2005_1923:10-K:2006 0.8584429
2488:10-K:2005_2488:10-K:2006 0.956372
Data:
structure(list(`1750:10-K:2006` = c(0.929121725727165, NA, NA,
NA, NA, NA, NA, NA, NA, NA), `1800:10-K:2006` = c(NA, 0.96900670959669,
NA, NA, NA, NA, NA, NA, NA, NA), `1923:10-K:2006` = c(NA, NA,
0.858442889654398, NA, NA, NA, NA, NA, NA, NA), `2488:10-K:2006` = c(NA,
NA, NA, 0.956371967288172, NA, NA, NA, NA, NA, NA), `2969:10-K:2006` = c(NA,
NA, NA, NA, 0.861326963904054, NA, NA, NA, NA, NA), `3133:10-K:2006` = c(NA,
NA, NA, NA, NA, 0.93751593784196, NA, NA, NA, NA), `3197:10-K:2006` = c(NA,
NA, NA, NA, NA, NA, 0.963362873672737, NA, NA, NA), `3333:10-K:2006` = c(NA,
NA, NA, NA, NA, NA, NA, 0.975225879729218, NA, NA), `3370:10-K:2006` = c(NA,
NA, NA, NA, NA, NA, NA, NA, 0.941602039119482, NA), `3673:10-K:2006` = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, 0.974578948898938)), row.names = c("1750:10-K:2005",
"1800:10-K:2005", "1923:10-K:2005", "2488:10-K:2005", "2969:10-K:2005",
"3133:10-K:2005", "3197:10-K:2005", "3333:10-K:2005", "3370:10-K:2005",
"3673:10-K:2005"), class = "data.frame")
You can try diag but you have to convert to matrix first, i.e.
# diag() needs a matrix, so convert the data frame first; each diagonal value
# is then paired with the name of the row it came from.
data.frame(v1 = rownames(df), v2 = diag(as.matrix(df)))
# v1 v2
#1 1750:10-K:2005 0.9291217
#2 1800:10-K:2005 0.9690067
#3 1923:10-K:2005 0.8584429
#4 2488:10-K:2005 0.9563720
#5 2969:10-K:2005 0.8613270
#6 3133:10-K:2005 0.9375159
#7 3197:10-K:2005 0.9633629
#8 3333:10-K:2005 0.9752259
#9 3370:10-K:2005 0.9416020
#10 3673:10-K:2005 0.9745789
Here is a solution with dplyr:
library(dplyr)
library(tibble) # rownames_to_column()
library(tidyr)  # gather(), drop_na()

# Reshape the diagonal-only data frame into one row per non-missing value,
# labelled "<row name>_<column name>".
df %>%
  rownames_to_column() %>%
  gather(KPI, Value, -rowname) %>%
  # paste(), not paste0(): paste0() has no `sep` argument, so sep = "_" would
  # simply be appended as one more string, giving "<row><col>_".
  mutate(KPI = paste(rowname, KPI, sep = "_")) %>%
  drop_na() %>%
  select(-rowname)
I am trying to plot date-wise multivariate data along with an independent variable on the top axis.
To do so - I merged my multi-variate (response variables) dataframe with the single input (independent variable) into a single data frame.
The resulting dataframe now has several NA values in rows and columns (for both data sets).
My question:
Why am I losing the width / dodge with my current code?
Does this have to do with NA values in the factored variable in my data?
How do you work with NA values in a factored variable ?
Half my dataset is a completely different variable and needs only 1 column. The only reason I merged them was because I wanted to bring all the data on the same plot (plan was to use grob after this, but I got stuck here)
Before this, I was using this code to plot the geom_bar with the dataframe just with the response variables and it worked.
The dataframe name is final, Factor variable is TYPE which has Open, Shrub and Lowland as categories and NA for the dates which only has the independent variable (in this case Rain)
# Dodged bar chart of MeanTWC per Date, filled by TYPE, with error bars.

# Fix the level order of TYPE; values not among these levels stay/become NA
final$TYPE<-factor(final$TYPE, levels = c("Open", "Shrub","Lowland"))
# Error-bar aesthetics taken directly from columns of `final`
limits <- aes(ymax = final$Max, ymin = final$Min, ysd= final$SD)
# One fill colour per TYPE level, in level order
rhg_cols <- c("brown","forestgreen", "cyan4")
# Base plot: first bar layer, manual fill colours, x breaks every 30 days
# NOTE(review): date_format() is presumably scales::date_format - confirm that
# scales is attached.
p <- ggplot(final, aes(Date, MeanTWC, fill=TYPE), na.rm=F )+
geom_bar(stat="identity", position = "dodge")+
scale_fill_manual(values = rhg_cols)+
scale_x_date(breaks = seq(as.Date("2016-08-15"), as.Date("2017-10-15"), by="30 days"),labels=date_format("%b-%Y"))
# Axis titles
p<-p + labs(x="DATE", y ="Total Water in mm")
# NOTE(review): this adds a second bar layer on top of the one created above -
# confirm that drawing the bars twice is intended.
p<-p + geom_bar(stat = "Identity",
position = "dodge")+
geom_errorbar(limits, position = "dodge", size =0.2)+
ggtitle("Total Water Storage-60cm")+
# NOTE(review): the sample data contains MeanTWC/Max values above 100, which
# fall outside these y limits - confirm the range.
scale_y_continuous(limits = c(0,100))
# Theme: black-and-white base, rotated x labels, long-dashed grid lines
p<-p+theme_bw() +theme(axis.text.x = element_text(angle = 270, vjust = 1,
size =15),axis.text.y = element_text(vjust = 1, hjust = 1, size =20),
panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_line(linetype="longdash"),
panel.grid.major.y = element_line(linetype = "longdash"))
print(p)
Sample Data:
Date TYPE MeanTWC Max Min Rain
1 2016-08-13 <NA> NA NA NA 27.686
2 2016-08-14 <NA> NA NA NA 79.248
3 2016-08-15 <NA> NA NA NA 9.398
4 2016-08-16 <NA> NA NA NA 9.906
5 2016-08-17 <NA> NA NA NA 26.670
6 2016-08-21 <NA> NA NA NA 52.324
7 2016-08-27 <NA> NA NA NA 13.200
8 2016-08-28 <NA> NA NA NA 0.200
9 2016-08-29 <NA> NA NA NA 3.000
10 2016-08-30 <NA> NA NA NA 0.400
11 2016-09-02 <NA> NA NA NA 5.400
12 2016-09-04 <NA> NA NA NA 22.200
13 2016-09-05 <NA> NA NA NA 0.400
14 2016-09-06 <NA> NA NA NA 0.400
15 2016-09-11 <NA> NA NA NA 0.200
16 2016-09-19 Open 82.40583 94.13074 71.95022 NA
17 2016-09-19 Shrub 75.25720 81.09062 66.31633 NA
18 2016-09-19 Lowland 79.78265 91.46637 71.42791 NA
19 2016-09-24 <NA> NA NA NA 1.200
20 2016-09-28 Open 107.00762 128.82301 87.78908 NA
21 2016-09-28 Shrub 102.29717 114.59530 93.02085 NA
22 2016-09-28 Lowland 100.62097 108.65464 93.06479 NA
23 2016-10-04 Open 94.35146 119.11809 80.80844 NA
24 2016-10-04 Shrub 89.78960 106.59891 77.91514 NA
25 2016-10-04 Lowland 87.66499 98.93036 77.44905 NA
26 2016-10-07 <NA> NA NA NA 15.200
27 2016-10-24 Open 77.75282 90.99799 60.89542 NA
28 2016-10-24 Shrub 73.13549 84.68082 64.38086 NA
29 2016-10-24 Lowland 77.54505 89.20983 68.77503 NA
30 2016-11-04 Open 75.79262 84.63392 61.17391 NA
structure(list(Date = structure(c(17026, 17027, 17028, 17029,
17030, 17034, 17040, 17041, 17042, 17043, 17046, 17048, 17049,
17050, 17055, 17063, 17063, 17063, 17068, 17072, 17072, 17072,
17078, 17078, 17078, 17081, 17098, 17098, 17098, 17109), class = "Date"),
TYPE = structure(c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, 1L, 2L, 3L, NA, 1L, 2L, 3L, 1L, 2L, 3L,
NA, 1L, 2L, 3L, 1L), .Label = c("Open", "Shrub", "Lowland"
), class = "factor"), MeanTWC = c(NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 82.4058263935714, 75.2571964744444,
79.782649985, NA, 107.0076241875, 102.297170442857, 100.620970785,
94.3514631776471, 89.7895999577778, 87.664985085, NA, 77.75281636125,
73.135492118, 77.54505326, 75.792624628125), Max = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 94.13073642,
81.09062269, 91.46637475, NA, 128.8230145, 114.5952995, 108.6546353,
119.1180866, 106.5989092, 98.93036216, NA, 90.99798892, 84.68081807,
89.20983383, 84.63391564), Min = c(NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, 71.95021894, 66.31632641,
71.42791015, NA, 87.78907749, 93.02084587, 93.06478569, 80.8084363,
77.91514274, 77.44904985, NA, 60.89542395, 64.38086067, 68.77503196,
61.17390712), SD = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, 6.52668534466645, 5.31370742998586,
8.40565594980702, NA, 10.3287869191442, 8.45785409063748,
6.49446280465913, 9.73718805734734, 10.5575933779477, 9.35169762923353,
NA, 8.27219492616507, 6.75450870627616, 8.51146778459709,
6.75447037137946), N = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, 14, 9, 4, NA, 12, 7, 4, 17, 9,
4, NA, 16, 10, 4, 16), SE = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, 1.74433003078718, 1.77123580999529,
4.20282797490351, NA, 2.9816639540851, 3.19676836415673,
3.24723140232956, 2.36161499158505, 3.51919779264923, 4.67584881461676,
NA, 2.06804873154127, 2.13596319872699, 4.25573389229854,
1.68861759284486), Rain = c(27.686, 79.248, 9.398, 9.906,
26.67, 52.324, 13.2, 0.2, 3, 0.4, 5.4, 22.2, 0.4, 0.4, 0.2,
NA, NA, NA, 1.2, NA, NA, NA, NA, NA, NA, 15.2, NA, NA, NA,
NA)), .Names = c("Date", "TYPE", "MeanTWC", "Max", "Min",
"SD", "N", "SE", "Rain"), row.names = c(NA, 30L), class = "data.frame")
So, my challenge has been to convert a raw scale csv to a scored csv. Within numerous columns, the file has cells filled with "Strongly Agree" to "Strongly Disagree", 6 levels. These factors need to be converted to integers 5 to 0 respectively.
I have tried unsuccessfully to use sapply and convert the table to a string. Sapply works on the vector, but it destroys the table structure.
Method 1:
dat$Col<-sapply(dat$Col,switch,'Strongly Disagree'=0,'Disagree'=1,'Slightly Disagree'=2,'Slightly Agree'=3,'Agree'=4, 'Strongly Agree'=5)
My second approach is to convert the csv into a string. When I examined the dput output, I saw the area I wanted to target that started with a .Label="","Strongly Agree"... Mistake. My changes did not result in a useful outcome.
My third approach came from the internet gods of destruction who seemed to express that gsub() might handle the string approach as well. Nope, again the underlying table structure was destroyed.
Method #3: Convert into a string and pattern match
dat <- textConnection("control/Surveys/StudyDat_1.csv")
#Score Scales
##"Strongly Agree"= 5
##"Agree"= 4
##"Strongly Disagree" = 0
#levels(dat$Col) <- gsub("Strongly Agree", "5", levels(dat$Col))
df<- gsub("Strongly Agree", "5",dat)
dat<-read.csv(textConnection(df),header=TRUE)
In the end, I am wanting to replace ALL "Strongly Agree" to 5 across numerous columns without the consequence of destroying the retrievability of the data.
Maybe I used the wrong search string and you know the resource I need to address this problem. I would rather avoid ALL character vector approaches, as these would require labeling each column if you provide a code response. It will need to go across ALL COLUMNS.
Thanks
Data Sample Problem
structure(list(last_updated = structure(c(3L, 1L, 7L, 2L, 10L, 6L, 8L, 9L, 7L, 5L, 4L), .Label = c("2016-05-13T12:53:56.704184Z",
"2016-05-13T12:54:09.273359Z", "2016-05-13T12:54:22.757251Z",
"2016-05-14T12:44:13.474992Z", "2016-05-14T12:44:31.736469Z",
"2016-05-16T16:45:10.623410Z", "2016-05-16T16:46:17.881402Z",
"2016-05-16T16:46:55.122257Z", "2016-05-16T16:47:14.160793Z",
"2016-05-24T02:26:04.770799Z"), class = "factor"), feedback = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), A = structure(c(NA,
NA, 2L, NA, 1L, NA, NA, NA, 2L, NA, NA), .Label = c("", "Slightly Disagree"
), class = "factor"), B = structure(c(NA, NA, 2L, NA, 1L, NA,
NA, NA, 3L, NA, NA), .Label = c("", "Disagree", "Strongly Agree"
), class = "factor"), C = structure(c(NA, NA, 2L, NA, 1L, NA,
NA, NA, 3L, NA, NA), .Label = c("", "Agree", "Disagree"), class = "factor"),
D = structure(c(NA, NA, 2L, NA, 1L, NA, NA, NA, 2L, NA, NA
), .Label = c("", "Agree"), class = "factor"), E = structure(c(NA,
NA, 2L, NA, 1L, NA, NA, NA, 3L, NA, NA), .Label = c("", "Agree",
"Strongly Disagree"), class = "factor")), .Names = c("last_updated",
"feedback", "A", "B", "C", "D", "E"), class = "data.frame", row.names = c(NA,
-11L))
Data Sample Solution
df<-dget(structure(list(last_updated = structure(c(3L, 1L, 7L, 2L, 10L, 6L,8L, 9L, 7L, 5L, 4L), .Label = c("2016-05-13T12:53:56.704184Z",
"2016-05-13T12:54:09.273359Z", "2016-05-13T12:54:22.757251Z",
"2016-05-14T12:44:13.474992Z", "2016-05-14T12:44:31.736469Z",
"2016-05-16T16:45:10.623410Z", "2016-05-16T16:46:17.881402Z",
"2016-05-16T16:46:55.122257Z", "2016-05-16T16:47:14.160793Z",
"2016-05-24T02:26:04.770799Z"), class = "factor"), feedback = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), A = c(NA, NA, 2L, NA,
NA, NA, NA, NA, 2L, NA, NA), B = c(NA, NA, 1L, NA, NA, NA, NA,
NA, 5L, NA, NA), C = c(NA, NA, 4L, NA, NA, NA, NA, NA, 1L, NA,
NA), D = c(NA, NA, 4L, NA, NA, NA, NA, NA, 4L, NA, NA), E = c(NA,
NA, 4L, NA, NA, NA, NA, NA, 0L, NA, NA)), .Names = c("last_updated",
"feedback", "A", "B", "C", "D", "E"), class = "data.frame", row.names = c(NA,-11L)))
we can use factor with levels specified
# Scale labels in ascending score order: position 1 maps to 0, position 6 to 5
nm1 <- c('Strongly Disagree', 'Disagree',
         'Slightly Disagree', 'Slightly Agree', 'Agree', 'Strongly Agree')
# Recode a single column: each value listed in `nm1` gets the label 0..5,
# anything not listed (e.g. "") becomes NA. The result is still a factor.
# (The original call had an extra closing parenthesis and did not parse.)
factor(dat$col, levels = nm1,
       labels = 0:5)
If there are multiple factor columns with the same levels, identify the factor columns ('i1'), loop through it with lapply and specify the levels and labels.
# Logical index: TRUE for every column of `dat` that is a factor
i1 <- sapply(dat, is.factor)
# Rebuild each factor column with the levels in `nm1` relabelled as 0..5;
# values that are not in `nm1` become NA
dat[i1] <- lapply(dat[i1], factor, levels = nm1, labels= 0:5)
Update
Using the OP's dput output
# Recode every column except the first two (last_updated, feedback):
# levels in `nm1` are relabelled 0..5, anything else becomes NA
dat[-(1:2)] <- lapply(dat[-(1:2)], factor, levels = nm1, labels = 0:5)
# Print the recoded data frame
dat
# last_updated feedback A B C D E
#1 2016-05-13T12:54:22.757251Z NA <NA> <NA> <NA> <NA> <NA>
#2 2016-05-13T12:53:56.704184Z NA <NA> <NA> <NA> <NA> <NA>
#3 2016-05-16T16:46:17.881402Z NA 2 1 4 4 4
#4 2016-05-13T12:54:09.273359Z NA <NA> <NA> <NA> <NA> <NA>
#5 2016-05-24T02:26:04.770799Z NA <NA> <NA> <NA> <NA> <NA>
#6 2016-05-16T16:45:10.623410Z NA <NA> <NA> <NA> <NA> <NA>
#7 2016-05-16T16:46:55.122257Z NA <NA> <NA> <NA> <NA> <NA>
#8 2016-05-16T16:47:14.160793Z NA <NA> <NA> <NA> <NA> <NA>
#9 2016-05-16T16:46:17.881402Z NA 2 5 1 4 0
#10 2016-05-14T12:44:31.736469Z NA <NA> <NA> <NA> <NA> <NA>
#11 2016-05-14T12:44:13.474992Z NA <NA> <NA> <NA> <NA> <NA>
Another option is set from data.table
# set() is provided by data.table
library(data.table)
# For every column except the first two, replace the whole column (i = NULL
# selects all rows) with a factor whose `nm1` levels are relabelled 0..5
for(j in names(dat)[-(1:2)]){
set(dat, i = NULL, j= j, value = factor(dat[[j]], levels = nm1, labels = 0:5))
}
I would just match each target column vector into a precomputed character vector to get an integer index. You can subtract 1 afterward to change the range from 1:6 to 0:5.
## define desired value order, ascending
## scale labels from lowest score (0) to highest score (5)
o <- c('Strongly Disagree', 'Disagree', 'Slightly Disagree',
       'Slightly Agree', 'Agree', 'Strongly Agree')
## recode every column after the first two: look up each value's position in
## `o` (NA when absent) and shift the 1:6 index down to 0:5
for (cn in names(df)[-(1:2)]) {
  df[[cn]] <- match(as.character(df[[cn]]), o) - 1L
}
df
## last_updated feedback A B C D E
## 1 2016-05-13T12:54:22.757251Z NA NA NA NA NA NA
## 2 2016-05-13T12:53:56.704184Z NA NA NA NA NA NA
## 3 2016-05-16T16:46:17.881402Z NA 2 1 4 4 4
## 4 2016-05-13T12:54:09.273359Z NA NA NA NA NA NA
## 5 2016-05-24T02:26:04.770799Z NA NA NA NA NA NA
## 6 2016-05-16T16:45:10.623410Z NA NA NA NA NA NA
## 7 2016-05-16T16:46:55.122257Z NA NA NA NA NA NA
## 8 2016-05-16T16:47:14.160793Z NA NA NA NA NA NA
## 9 2016-05-16T16:46:17.881402Z NA 2 5 1 4 0
## 10 2016-05-14T12:44:31.736469Z NA NA NA NA NA NA
## 11 2016-05-14T12:44:13.474992Z NA NA NA NA NA NA
Previous answers might meet your needs, but note that changing the labels of a factor isn't the same as changing a factor to an integer variable. One possibility would be to use ifelse (I've made a new data frame as the one you posted didn't actually have variables with these levels in it):
# Scale labels, lowest to highest
lev <- c('Strongly disagree', 'Disagree', 'Slightly disagree', 'Slightly agree', 'Agree', 'Strongly agree')
# 55 random responses, arranged as 11 rows x 5 columns
dta <- sample(lev, 55, replace = TRUE)
# Since R 4.0.0 data.frame() keeps strings as character by default; ask for
# factors explicitly so the example really contains factor columns.
dta <- data.frame(matrix(dta, nrow = 11), stringsAsFactors = TRUE)
names(dta) <- LETTERS[1:5]
#' Convert a Likert-scale factor to its numeric score
#'
#' @param f A vector. Only factors are converted.
#' @return For a factor, a numeric vector of the same length in which
#'   'Strongly disagree' scores 0 up to 'Strongly agree' scoring 5; missing
#'   values and labels outside the scale give `NA`. Any non-factor input is
#'   returned unchanged.
f_to_int <- function(f) {
  if (!is.factor(f)) {
    return(f)
  }
  scale_labels <- c('Strongly disagree', 'Disagree', 'Slightly disagree',
                    'Slightly agree', 'Agree', 'Strongly agree')
  # match() gives the 1-based position on the scale (NA when the label is
  # not on it); subtracting 1 shifts the range from 1:6 to 0:5.
  match(as.character(f), scale_labels) - 1
}
# Apply f_to_int() to every column of `dta`; sapply() simplifies the
# per-column results into a matrix
dta2 <- sapply(dta, f_to_int)
Note that this returns a matrix, but it is easily converted to a data frame if necessary.
I have a matrix and my objective is to find the maximum of each column and then to divide that number by the sum of all values in the row which contains the max of that column. In other words
max(y) / sum of values in the row where y is the max
How would I apply this formula to every column in R?
> the_matrix
Source: local data frame [20 x 10]
type 100 100F 100I 100X 101 102 1028P 103 103D
(fctr) (int) (int) (int) (int) (int) (int) (int) (int) (int)
1 0 NA NA NA NA NA NA NA NA NA
2 0A 2 NA NA NA NA NA NA NA NA
3 0B NA NA NA NA NA NA NA NA NA
4 0C NA NA NA NA NA NA NA NA NA
5 0E NA NA NA NA NA NA NA NA NA
6 0G NA NA NA NA NA NA NA NA NA
7 0O NA NA NA NA NA NA NA NA NA
8 0Z NA NA NA NA NA NA NA NA NA
9 1 2 NA NA NA NA NA NA NA NA
10 1A 3968 NA 214 26 4 289 8 56030 7484
11 1B 172 NA 107 NA NA 2 NA 372 3829
12 1C 584 NA 19 NA NA 1 NA 72951 363
13 1D 27 NA NA NA NA NA NA 365 22
14 1E 27944 16 68 NA NA NA 1 62 12
15 1F 1 NA 1 NA NA 1 NA 368 27
16 1G 4 NA NA NA NA NA NA 7 NA
17 1H 65 NA 6 21 1 6 3 714 59
18 1M NA NA NA NA NA NA NA 1 NA
19 1N NA NA NA NA NA NA NA NA NA
20 1Q NA NA NA NA NA NA NA NA NA
> dput(the_matrix)
structure(list(type = structure(1:20, .Label = c("0", "0A", "0B",
"0C", "0E", "0G", "0O", "0Z", "1", "1A", "1B", "1C", "1D", "1E",
"1F", "1G", "1H", "1M", "1N", "1Q", "1S", "1X", "1Z", "2", "2A",
"2B", "2C", "2D", "2E", "2F", "2G", "2H", "2I", "2J", "2M", "2S",
"2T", "2X", "2Z", "3", "3B", "3C", "3E", "4B", "5H", "8Z", "0H",
"1I", "1R", "2N", "3H", "5D", "0D", "1K", "1P", "1T", "1U", "1V",
"1W", "1Y", "2U", "3A", "4A", "5C", "7H", "9", "0F", "0T", "1J",
"2L", "0W", "2Q", "3G"), class = "factor"), `100` = c(NA, 2L,
NA, NA, NA, NA, NA, NA, 2L, 3968L, 172L, 584L, 27L, 27944L, 1L,
4L, 65L, NA, NA, NA), `100F` = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, 16L, NA, NA, NA, NA, NA, NA), `100I` = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, 214L, 107L, 19L, NA, 68L, 1L,
NA, 6L, NA, NA, NA), `100X` = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, 26L, NA, NA, NA, NA, NA, NA, 21L, NA, NA, NA), `101` = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, 4L, NA, NA, NA, NA, NA, NA, 1L,
NA, NA, NA), `102` = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 289L,
2L, 1L, NA, NA, 1L, NA, 6L, NA, NA, NA), `1028P` = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, 8L, NA, NA, NA, 1L, NA, NA, 3L, NA,
NA, NA), `103` = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 56030L,
372L, 72951L, 365L, 62L, 368L, 7L, 714L, 1L, NA, NA), `103D` = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, 7484L, 3829L, 363L, 22L, 12L,
27L, NA, 59L, NA, NA, NA)), .Names = c("type", "100", "100F",
"100I", "100X", "101", "102", "1028P", "103", "103D"), class = c("tbl_df",
"data.frame"), row.names = c(NA, -20L))
Going step-by-step:
# let's not call a data frame a matrix
# let's not call a data frame a matrix: drop the first (`type`) column and
# convert the remaining count columns to a real matrix
real_matrix <- as.matrix(the_matrix[, -1])
# max of each column, ignoring NAs (TRUE spelled out: `T` can be reassigned)
col_max <- apply(real_matrix, 2, max, na.rm = TRUE)
# which row contains the max (which.max() ignores NAs)
col_which_max <- apply(real_matrix, 2, which.max)
# row totals, ignoring NAs
row_total <- rowSums(real_matrix, na.rm = TRUE)
# col max divided by row total for corresponding row
col_max / row_total[col_which_max]
Rounded to 3 decimals, this yields the following:
100 100F 100I 100X 101 102 1028P 103 103D
0.994 0.001 0.003 0.000 0.000 0.004 0.000 0.987 0.110