I've got two data.frames:
First:
> dput(head(tbl_mz))
structure(list(m.z = c(258.1686969, 258.168752, 587.8313625,
587.8425292, 523.2863282, 523.2859396), Measured.mass = c(514.3228408,
514.3229511, 1173.648172, 802.4706732, 1272.645144, 1044.557326
)), .Names = c("m.z", "Measured.mass"), row.names = c(NA, 6L), class = "data.frame")
Second:
> dput(head(tbl_exl))
structure(list(V1 = c(802.4706732, 1272.649209, 1272.646875,
1272.646599, 1272.646521, 1272.645144), V2 = c(NA, NA, NA, NA,
NA, NA), V3 = c(NA, NA, NA, NA, NA, NA), V4 = c(NA, NA, NA, NA,
NA, NA), V5 = c(NA, NA, NA, NA, NA, NA), V6 = structure(c(2L,
2L, 2L, 2L, 2L, 2L), .Label = c("", "Positive"), class = "factor"),
V7 = c(28.7, 29.4, 29.4, 23.8, 28.6, 23.3), V8 = c(30.7,
31.4, 31.4, 25.8, 30.6, 25.3), X = c(NA, NA, NA, NA, NA,
NA), X.1 = c(NA, NA, NA, NA, NA, NA), X.2 = c(NA, NA, NA,
NA, NA, NA)), .Names = c("V1", "V2", "V3", "V4", "V5", "V6",
"V7", "V8", "X", "X.1", "X.2"), row.names = c(NA, 6L), class = "data.frame")
I would like to replace some values from tbl_exl, column V1 by values from the other table tbl_mz. The values from column V1 (tbl_exl) can be found in the column Measured.mass (tbl_mz) and they should be replaced by the values from the next column m.z in tbl_mz data frame.
In other words the values in the V1 should be replaced by the m.z values.
The problem is that not all values from V1 can be found in the other data frame. Those which can't be found can be deleted or just left like they are.
The output, which I want to get:
> dput(head(tbl_exl_modified))
structure(list(V1 = c(587.8425292, 1272.649209, 1272.646875,
1272.646599, 1272.646521, 523.2863282), V2 = c(NA, NA, NA, NA,
NA, NA), V3 = c(NA, NA, NA, NA, NA, NA), V4 = c(NA, NA, NA, NA,
NA, NA), V5 = c(NA, NA, NA, NA, NA, NA), V6 = structure(c(2L,
2L, 2L, 2L, 2L, 2L), .Label = c("", "Positive"), class = "factor"),
V7 = c(28.7, 29.4, 29.4, 23.8, 28.6, 23.3), V8 = c(30.7,
31.4, 31.4, 25.8, 30.6, 25.3), X = c(NA, NA, NA, NA, NA,
NA), X.1 = c(NA, NA, NA, NA, NA, NA), X.2 = c(NA, NA, NA,
NA, NA, NA)), .Names = c("V1", "V2", "V3", "V4", "V5", "V6",
"V7", "V8", "X", "X.1", "X.2"), row.names = c(NA, 6L), class = "data.frame")
You could try match. Create numeric indexes based on the match between the columns ("Measured.mass", "V1") of the two datasets. Remove the NA values ("indx1", "indxN1") and replace the "V1" values to "m.z" based on these index.
# Look up every V1 value in tbl_mz$Measured.mass. `indxN` is aligned with the
# rows of tbl_exl and holds the matching row of tbl_mz (NA when there is none).
indxN <- match(tbl_exl$V1, tbl_mz$Measured.mass)
# Rows of tbl_exl that have a match, and the tbl_mz rows they matched. Both
# vectors are derived from the same lookup, so they stay paired element by
# element whatever the row order of the two tables is.
indx1 <- which(!is.na(indxN))
indxN1 <- indxN[indx1]
# Overwrite only the matched V1 entries; unmatched ones are left as they are.
tbl_exl$V1[indx1] <- tbl_mz$m.z[indxN1]
identical(tbl_exl, tbl_exl_modified)
#[1] TRUE
Or use left_join from dplyr
library(dplyr)
# Bring in m.z for every V1 that equals a Measured.mass (NA where there is no
# match), take m.z when it is present and fall back to the original V1
# otherwise, then drop the helper column again.
tbl_exl1 <- left_join(tbl_exl, tbl_mz, by = c("V1" = "Measured.mass")) %>%
  mutate(V1 = coalesce(m.z, V1)) %>%
  select(-m.z)
tbl_exl1
# V1 V2 V3 V4 V5 V6 V7 V8 X X.1 X.2
#1 587.8425 NA NA NA NA Positive 28.7 30.7 NA NA NA
#2 1272.6492 NA NA NA NA Positive 29.4 31.4 NA NA NA
#3 1272.6469 NA NA NA NA Positive 29.4 31.4 NA NA NA
#4 1272.6466 NA NA NA NA Positive 23.8 25.8 NA NA NA
#5 1272.6465 NA NA NA NA Positive 28.6 30.6 NA NA NA
#6 523.2863 NA NA NA NA Positive 23.3 25.3 NA NA NA
Here's a solution using data.tables binary join
library(data.table)
setnames(setDT(tbl_exl), 1, "Measured.mass") # Changing the first column name for the join to work
setkey(tbl_exl, Measured.mass) # Keying tbl_exl by `Measured.mass`
setkey(setDT(tbl_mz), Measured.mass) # Converting tbl_mz to a data.table and keying it by `Measured.mass`
# Rows of tbl_exl without a match in tbl_mz keep their current value.
tbl_exl[tbl_mz, Measured.mass := i.m.z][] # Joining and retrieving only matched values from `i.m.z`
# Measured.mass V2 V3 V4 V5 V6 V7 V8 X X.1 X.2
# 1: 587.8425 NA NA NA NA Positive 28.7 30.7 NA NA NA
# 2: 523.2863 NA NA NA NA Positive 23.3 25.3 NA NA NA
# 3: 1272.6465 NA NA NA NA Positive 28.6 30.6 NA NA NA
# 4: 1272.6466 NA NA NA NA Positive 23.8 25.8 NA NA NA
# 5: 1272.6469 NA NA NA NA Positive 29.4 31.4 NA NA NA
# 6: 1272.6492 NA NA NA NA Positive 29.4 31.4 NA NA NA
Related
I have a table test whose NA values I would like to approximate based on linear interpolation between values that do exist.
For example, the second row plotted looks like this:
v1 <- unlist(test[2,])
plot(v1[!is.na(v1)], names(v1)[!is.na(v1)], type="l", add = TRUE)
How would one go about interpolating/approximating the NA values along the x-axis in this case? Any suggestions in base R or dplyr would be helpful
test
variable 26500 30000 30100 30700 31600 33700 33800 33900 34000 34600 34800 35100 35200 35300
1 -20 NA 0 NA NA 10 20 NA NA NA 30 NA NA NA NA
2 -10 NA 0 NA NA NA 10 NA NA NA 20 NA NA NA 30
3 0 0 NA NA NA NA NA 10 NA NA NA 20 NA NA NA
4 24 NA NA NA 0 NA NA NA NA 10 NA NA NA 20 NA
5 40 NA NA 0 NA NA NA NA 10 NA NA NA 20 NA NA
6 55 NA NA 0 NA NA NA NA 10 NA NA NA 20 NA NA
35400 35600 35800 35900 36200 36300 36400 36700 36900 37000 37200 37800 37900 38000 38200
1 40 NA NA NA 50 NA NA NA NA NA 60 NA NA NA 70
2 NA NA NA 40 NA NA NA 50 NA NA NA 60 NA NA NA
3 NA 30 NA NA 40 NA NA NA 50 NA NA NA 60 NA NA
4 NA NA 30 NA NA 40 NA NA NA 50 NA NA NA 60 NA
5 NA NA 30 NA NA 40 NA NA NA 50 NA NA NA NA 60
6 NA NA NA 30 NA NA 40 NA NA 50 NA NA NA NA 60
38800 39000 39100 39200 39700 39800 39900 40000 40200 40600 40700 40800 41700 41800
1 NA NA NA 80 NA NA NA NA 90 NA NA NA 100 NA
2 70 NA NA NA 80 NA NA NA NA 90 NA NA 100 NA
3 70 NA NA NA NA 80 NA NA NA NA 90 NA 100 NA
4 NA 70 NA NA NA NA NA 80 NA NA NA 90 100 NA
5 NA NA 70 NA NA NA NA 80 NA NA NA 90 NA 100
6 NA 70 NA NA NA NA 80 NA NA NA NA 90 100 NA
Here is the sample data:
dput(test)
structure(list(variable = c(-20, -10, 0, 24, 40, 55), `26500` = c(NA,
NA, 0L, NA, NA, NA), `30000` = c(0L, 0L, NA, NA, NA, NA), `30100` = c(NA,
NA, NA, NA, 0L, 0L), `30700` = c(NA, NA, NA, 0L, NA, NA), `31600` = c(10L,
NA, NA, NA, NA, NA), `33700` = c(20L, 10L, NA, NA, NA, NA), `33800` = c(NA,
NA, 10L, NA, NA, NA), `33900` = c(NA, NA, NA, NA, 10L, 10L),
`34000` = c(NA, NA, NA, 10L, NA, NA), `34600` = c(30L, 20L,
NA, NA, NA, NA), `34800` = c(NA, NA, 20L, NA, NA, NA), `35100` = c(NA,
NA, NA, NA, 20L, 20L), `35200` = c(NA, NA, NA, 20L, NA, NA
), `35300` = c(NA, 30L, NA, NA, NA, NA), `35400` = c(40L,
NA, NA, NA, NA, NA), `35600` = c(NA, NA, 30L, NA, NA, NA),
`35800` = c(NA, NA, NA, 30L, 30L, NA), `35900` = c(NA, 40L,
NA, NA, NA, 30L), `36200` = c(50L, NA, 40L, NA, NA, NA),
`36300` = c(NA, NA, NA, 40L, 40L, NA), `36400` = c(NA, NA,
NA, NA, NA, 40L), `36700` = c(NA, 50L, NA, NA, NA, NA), `36900` = c(NA,
NA, 50L, NA, NA, NA), `37000` = c(NA, NA, NA, 50L, 50L, 50L
), `37200` = c(60L, NA, NA, NA, NA, NA), `37800` = c(NA,
60L, NA, NA, NA, NA), `37900` = c(NA, NA, 60L, NA, NA, NA
), `38000` = c(NA, NA, NA, 60L, NA, NA), `38200` = c(70L,
NA, NA, NA, 60L, 60L), `38800` = c(NA, 70L, 70L, NA, NA,
NA), `39000` = c(NA, NA, NA, 70L, NA, 70L), `39100` = c(NA,
NA, NA, NA, 70L, NA), `39200` = c(80L, NA, NA, NA, NA, NA
), `39700` = c(NA, 80L, NA, NA, NA, NA), `39800` = c(NA,
NA, 80L, NA, NA, NA), `39900` = c(NA, NA, NA, NA, NA, 80L
), `40000` = c(NA, NA, NA, 80L, 80L, NA), `40200` = c(90L,
NA, NA, NA, NA, NA), `40600` = c(NA, 90L, NA, NA, NA, NA),
`40700` = c(NA, NA, 90L, NA, NA, NA), `40800` = c(NA, NA,
NA, 90L, 90L, 90L), `41700` = c(100L, 100L, 100L, 100L, NA,
100L), `41800` = c(NA, NA, NA, NA, 100L, NA)), row.names = c(NA,
-6L), class = "data.frame")
We could use na.interp from forecast
library(forecast)
test[-1] <- t(apply(test[-1], 1, na.interp))
Or with na.approx
# na.approx() is provided by zoo, so attach it before use.
library(zoo)
# Interpolate each row across its columns; na.rm = FALSE keeps leading and
# trailing NAs in place so every row keeps its full length.
test[-1] <- t(apply(test[-1], 1, na.approx, na.rm = FALSE))
then do the plotting
v1 <- unlist(test[2, -1])
plot(v1, names(v1), type = 'l')
If you want to switch easily between different interpolation methods (or time series imputation methods in general) you can also use the imputeTS package.
For the requested solution this would be:
library("imputeTS")
test[-1] <- t(apply(test[-1], 1, na_interpolation, option = "linear"))
Switching to Stineman interpolation would look like this:
test[-1] <- t(apply(test[-1], 1, na_interpolation, option = "stine"))
Another option could be spline interpolation:
test[-1] <- t(apply(test[-1], 1, na_interpolation, option = "spline"))
Other imputation methods like na_ma (moving average imputation) or na_kalman (Kalman smoothing on structural time series models) would also be possible if you replace na_interpolation with the specific function (see also the GitHub package Readme for an overview of the imputation functions).
I would like to copy the last two columns from each month to the beginning of the next month. I did it as follows (below), but the data contains NA and when I change it to character, the program breaks down. How do I copy columns to keep their type?
My code:
library(readxl)
library(tibble)
df<- read_excel("C:/Users/Rezerwa/Documents/Database.xlsx")
df=add_column(df, Feb1 = as.character(do.call(paste0, df["January...4"])), .after = "January...5")
df=add_column(df, Feb2 = as.numeric(do.call(paste0, df["January...5"])), .after = "Feb1")
My data:
df
# A tibble: 10 x 13
Product January...2 January...3 January...4 January...5 February...6 February...7 February...8 February...9 March...10 March...11 March...12 March...13
<chr> <lgl> <lgl> <chr> <dbl> <chr> <dbl> <chr> <dbl> <chr> <dbl> <chr> <dbl>
1 a NA NA 754.00 4 754.00 4 754.00 4 754.00 4 754.00 4
2 b NA NA 706.00 3 706.00 3 706.00 3 706.00 3 706.00 3
3 c NA NA 517.00 3 517.00 3 517.00 3 517.00 3 517.00 3
4 d NA NA 1466.00 9 1466.00 9 1466.00 9 1466.00 9 1466.00 9
5 e NA NA 543.00 8 543.00 8 543.00 8 543.00 8 543.00 8
6 f NA NA NA NA NA NA NA NA NA NA NA NA
7 g NA NA NA NA NA NA NA NA NA NA NA NA
8 h NA NA NA NA NA NA NA NA NA NA NA NA
9 i NA NA 1466.00 8 NA NA NA NA NA NA NA NA
10 j NA NA NA NA 543.00 3 NA NA NA NA NA NA
My error:
> df=add_column(df, Feb1 = as.character(do.call(paste0, df["January...4"])), .after = "January...5")
> df=add_column(df, Feb2 = as.numeric(do.call(paste0, df["January...5"])), .after = "Feb1")
Warning message:
In eval_tidy(xs[[i]], unique_output) : NAs introduced by coercion
Using base R we can split the columns based on the prefix of their names, select last two columns from each group and cbind to original df.
df1 <- cbind(df, do.call(cbind, lapply(split.default(df[-1],
sub("\\..*", "", names(df)[-1])), function(x) {n <- ncol(x);x[, c(n-1, n)]})))
To get data in order, we can do
cbind(df1[1], df1[-1][order(match(sub("\\..*", "", names(df1)[-1]), month.name))])
data
df <- structure(list(Product = structure(1:10, .Label = c("a", "b",
"c", "d", "e", "f", "g", "h", "i", "j"), class = "factor"), January...2 = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA), January...3 = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA), January...4 = c(754, 706, 517,
1466, 543, NA, NA, NA, 1466, NA), January...5 = c(4L, 3L, 3L,
9L, 8L, NA, NA, NA, 8L, NA), February...6 = c(754, 706, 517,
1466, 543, NA, NA, NA, NA, 543), February...7 = c(4L, 3L, 3L,
9L, 8L, NA, NA, NA, NA, 3L), February...8 = c(754, 706, 517,
1466, 543, NA, NA, NA, NA, NA), February...9 = c(4L, 3L, 3L,
9L, 8L, NA, NA, NA, NA, NA), March...10 = c(754, 706, 517, 1466,
543, NA, NA, NA, NA, NA), March...11 = c(4L, 3L, 3L, 9L, 8L,
NA, NA, NA, NA, NA), March...12 = c(754, 706, 517, 1466, 543,
NA, NA, NA, NA, NA), March...13 = c(4L, 3L, 3L, 9L, 8L, NA, NA,
NA, NA, NA)), class = "data.frame", row.names = c("1", "2", "3",
"4", "5", "6", "7", "8", "9", "10"))
So, my challenge has been to convert a raw scale csv to a scored csv. Within numerous columns, the file has cells filled with "Strongly Agree" to "Strongly Disagree", 6 levels. These factors need to be converted in integers 5 to 0 respectively.
I have tried unsuccessfully to use sapply and convert the table to a string. Sapply works on the vector, but it destroys the table structure.
Method 1:
dat$Col<-sapply(dat$Col,switch,'Strongly Disagree'=0,'Disagree'=1,'Slightly Disagree'=2,'Slightly Agree'=3,'Agree'=4, 'Strongly Agree'=5)
My second approach is to convert the csv into a string. When I examined the dput output, I saw the area I wanted to target that started with a .Label="","Strongly Agree"... Mistake. My changes did not result in a useful outcome.
My third approach came from the internet gods of destruction who seemed to express that gsub() might handle the string approach as well. Nope, again the underlying table structure was destroyed.
Method #3: Convert into a string and pattern match
dat <- textConnection("control/Surveys/StudyDat_1.csv")
#Score Scales
##"Strongly Agree"= 5
##"Agree"= 4
##"Strongly Disagree" = 0
#levels(dat$Col) <- gsub("Strongly Agree", "5", levels(dat$Col))
df<- gsub("Strongly Agree", "5",dat)
dat<-read.csv(textConnection(df),header=TRUE)
In the end, I am wanting to replace ALL "Strongly Agree" to 5 across numerous columns without the consequence of destroying the retrievability of the data.
Maybe I used the wrong search string and you know the resource I need to address this problem. I would rather avoid ALL character vector approaches as that this would require labeling each column if you provide a code response. It will need to go across ALL COLUMNS.
Thanks
Data Sample Problem
structure(list(last_updated = structure(c(3L, 1L, 7L, 2L, 10L, 6L, 8L, 9L, 7L, 5L, 4L), .Label = c("2016-05-13T12:53:56.704184Z",
"2016-05-13T12:54:09.273359Z", "2016-05-13T12:54:22.757251Z",
"2016-05-14T12:44:13.474992Z", "2016-05-14T12:44:31.736469Z",
"2016-05-16T16:45:10.623410Z", "2016-05-16T16:46:17.881402Z",
"2016-05-16T16:46:55.122257Z", "2016-05-16T16:47:14.160793Z",
"2016-05-24T02:26:04.770799Z"), class = "factor"), feedback = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), A = structure(c(NA,
NA, 2L, NA, 1L, NA, NA, NA, 2L, NA, NA), .Label = c("", "Slightly Disagree"
), class = "factor"), B = structure(c(NA, NA, 2L, NA, 1L, NA,
NA, NA, 3L, NA, NA), .Label = c("", "Disagree", "Strongly Agree"
), class = "factor"), C = structure(c(NA, NA, 2L, NA, 1L, NA,
NA, NA, 3L, NA, NA), .Label = c("", "Agree", "Disagree"), class = "factor"),
D = structure(c(NA, NA, 2L, NA, 1L, NA, NA, NA, 2L, NA, NA
), .Label = c("", "Agree"), class = "factor"), E = structure(c(NA,
NA, 2L, NA, 1L, NA, NA, NA, 3L, NA, NA), .Label = c("", "Agree",
"Strongly Disagree"), class = "factor")), .Names = c("last_updated",
"feedback", "A", "B", "C", "D", "E"), class = "data.frame", row.names = c(NA,
-11L))
Data Sample Solution
df<-dget(structure(list(last_updated = structure(c(3L, 1L, 7L, 2L, 10L, 6L,8L, 9L, 7L, 5L, 4L), .Label = c("2016-05-13T12:53:56.704184Z",
"2016-05-13T12:54:09.273359Z", "2016-05-13T12:54:22.757251Z",
"2016-05-14T12:44:13.474992Z", "2016-05-14T12:44:31.736469Z",
"2016-05-16T16:45:10.623410Z", "2016-05-16T16:46:17.881402Z",
"2016-05-16T16:46:55.122257Z", "2016-05-16T16:47:14.160793Z",
"2016-05-24T02:26:04.770799Z"), class = "factor"), feedback = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), A = c(NA, NA, 2L, NA,
NA, NA, NA, NA, 2L, NA, NA), B = c(NA, NA, 1L, NA, NA, NA, NA,
NA, 5L, NA, NA), C = c(NA, NA, 4L, NA, NA, NA, NA, NA, 1L, NA,
NA), D = c(NA, NA, 4L, NA, NA, NA, NA, NA, 4L, NA, NA), E = c(NA,
NA, 4L, NA, NA, NA, NA, NA, 0L, NA, NA)), .Names = c("last_updated",
"feedback", "A", "B", "C", "D", "E"), class = "data.frame", row.names = c(NA,-11L)))
we can use factor with levels specified
# Response labels in ascending score order: position 1 maps to 0, position 6
# maps to 5.
nm1 <- c('Strongly Disagree', 'Disagree',
         'Slightly Disagree','Slightly Agree','Agree', 'Strongly Agree')
# Re-level the column so each response is labelled with its score; values not
# listed in `nm1` (such as the empty string) become NA.
factor(dat$col, levels = nm1,
       labels = 0:5)
If there are multiple factor columns with the same levels, identify the factor columns ('i1'), loop through it with lapply and specify the levels and labels.
# Flag the factor columns. vapply() always returns a logical vector, whereas
# sapply() returns an empty list for a zero-column data frame.
i1 <- vapply(dat, is.factor, logical(1))
# Re-level every flagged column with the shared response labels in `nm1`.
dat[i1] <- lapply(dat[i1], factor, levels = nm1, labels = 0:5)
Update
Using the OP's dput output
dat[-(1:2)] <- lapply(dat[-(1:2)], factor, levels = nm1, labels = 0:5)
dat
# last_updated feedback A B C D E
#1 2016-05-13T12:54:22.757251Z NA <NA> <NA> <NA> <NA> <NA>
#2 2016-05-13T12:53:56.704184Z NA <NA> <NA> <NA> <NA> <NA>
#3 2016-05-16T16:46:17.881402Z NA 2 1 4 4 4
#4 2016-05-13T12:54:09.273359Z NA <NA> <NA> <NA> <NA> <NA>
#5 2016-05-24T02:26:04.770799Z NA <NA> <NA> <NA> <NA> <NA>
#6 2016-05-16T16:45:10.623410Z NA <NA> <NA> <NA> <NA> <NA>
#7 2016-05-16T16:46:55.122257Z NA <NA> <NA> <NA> <NA> <NA>
#8 2016-05-16T16:47:14.160793Z NA <NA> <NA> <NA> <NA> <NA>
#9 2016-05-16T16:46:17.881402Z NA 2 5 1 4 0
#10 2016-05-14T12:44:31.736469Z NA <NA> <NA> <NA> <NA> <NA>
#11 2016-05-14T12:44:13.474992Z NA <NA> <NA> <NA> <NA> <NA>
Another option is set from data.table
library(data.table)
for(j in names(dat)[-(1:2)]){
set(dat, i = NULL, j= j, value = factor(dat[[j]], levels = nm1, labels = 0:5))
}
I would just match each target column vector into a precomputed character vector to get an integer index. You can subtract 1 afterward to change the range from 1:6 to 0:5.
## define desired value order, ascending
o <- c(
'Strongly Disagree',
'Disagree',
'Slightly Disagree',
'Slightly Agree',
'Agree',
'Strongly Agree'
);
## convert target columns
for (cn in names(df)[-(1:2)]) df[[cn]] <- match(as.character(df[[cn]]),o)-1L;
df;
## last_updated feedback A B C D E
## 1 2016-05-13T12:54:22.757251Z NA NA NA NA NA NA
## 2 2016-05-13T12:53:56.704184Z NA NA NA NA NA NA
## 3 2016-05-16T16:46:17.881402Z NA 2 1 4 4 4
## 4 2016-05-13T12:54:09.273359Z NA NA NA NA NA NA
## 5 2016-05-24T02:26:04.770799Z NA NA NA NA NA NA
## 6 2016-05-16T16:45:10.623410Z NA NA NA NA NA NA
## 7 2016-05-16T16:46:55.122257Z NA NA NA NA NA NA
## 8 2016-05-16T16:47:14.160793Z NA NA NA NA NA NA
## 9 2016-05-16T16:46:17.881402Z NA 2 5 1 4 0
## 10 2016-05-14T12:44:31.736469Z NA NA NA NA NA NA
## 11 2016-05-14T12:44:13.474992Z NA NA NA NA NA NA
Previous answers might meet your needs, but note that changing the labels of a factor isn't the same as changing a factor to an integer variable. One possibility would be to use ifelse (I've made a new data frame as the one you posted didn't actually have variables with these levels in it):
# Build an 11 x 5 demo data frame of random survey responses.
lev <- c('Strongly disagree', 'Disagree', 'Slightly disagree', 'Slightly agree', 'Agree', 'Strongly agree')
dta <- sample(lev, 55, replace = TRUE)
# Since R 4.0.0 data.frame() keeps strings as character by default; ask for
# factors explicitly so the columns really are factors, as the example needs.
dta <- data.frame(matrix(dta, nrow = 11), stringsAsFactors = TRUE)
names(dta) <- LETTERS[1:5]
#' Convert a Likert-scale factor to its numeric score.
#'
#' @param f A vector. Factors are scored; anything else is returned untouched.
#' @return For a factor, a numeric vector of the same length with
#'   'Strongly disagree' = 0 up to 'Strongly agree' = 5. Missing values and
#'   responses that are not on the scale give NA. For any other input, `f`
#'   itself.
f_to_int <- function(f) {
  if (!is.factor(f)) {
    return(f)
  }
  scale_levels <- c(
    'Strongly disagree', 'Disagree', 'Slightly disagree',
    'Slightly agree', 'Agree', 'Strongly agree'
  )
  # match() gives the 1-based position of each response on the scale; shift it
  # to 0..5. Off-scale responses become NA rather than an internal factor
  # code, which could be mistaken for a valid score.
  match(as.character(f), scale_levels) - 1
}
dta2 <- sapply(dta, f_to_int)
Note that this returns a matrix, but it is easily converted to a data frame if necessary.
I have a matrix and my objective is to find the maximum of each column and then to divide that number by the sum of all values in the row which contains the max of that column. In other words
max(y) / sum of values in the row where y is the max
How would I apply this formula to every column in R?
> the_matrix
Source: local data frame [20 x 10]
type 100 100F 100I 100X 101 102 1028P 103 103D
(fctr) (int) (int) (int) (int) (int) (int) (int) (int) (int)
1 0 NA NA NA NA NA NA NA NA NA
2 0A 2 NA NA NA NA NA NA NA NA
3 0B NA NA NA NA NA NA NA NA NA
4 0C NA NA NA NA NA NA NA NA NA
5 0E NA NA NA NA NA NA NA NA NA
6 0G NA NA NA NA NA NA NA NA NA
7 0O NA NA NA NA NA NA NA NA NA
8 0Z NA NA NA NA NA NA NA NA NA
9 1 2 NA NA NA NA NA NA NA NA
10 1A 3968 NA 214 26 4 289 8 56030 7484
11 1B 172 NA 107 NA NA 2 NA 372 3829
12 1C 584 NA 19 NA NA 1 NA 72951 363
13 1D 27 NA NA NA NA NA NA 365 22
14 1E 27944 16 68 NA NA NA 1 62 12
15 1F 1 NA 1 NA NA 1 NA 368 27
16 1G 4 NA NA NA NA NA NA 7 NA
17 1H 65 NA 6 21 1 6 3 714 59
18 1M NA NA NA NA NA NA NA 1 NA
19 1N NA NA NA NA NA NA NA NA NA
20 1Q NA NA NA NA NA NA NA NA NA
> dput(the_matrix)
structure(list(type = structure(1:20, .Label = c("0", "0A", "0B",
"0C", "0E", "0G", "0O", "0Z", "1", "1A", "1B", "1C", "1D", "1E",
"1F", "1G", "1H", "1M", "1N", "1Q", "1S", "1X", "1Z", "2", "2A",
"2B", "2C", "2D", "2E", "2F", "2G", "2H", "2I", "2J", "2M", "2S",
"2T", "2X", "2Z", "3", "3B", "3C", "3E", "4B", "5H", "8Z", "0H",
"1I", "1R", "2N", "3H", "5D", "0D", "1K", "1P", "1T", "1U", "1V",
"1W", "1Y", "2U", "3A", "4A", "5C", "7H", "9", "0F", "0T", "1J",
"2L", "0W", "2Q", "3G"), class = "factor"), `100` = c(NA, 2L,
NA, NA, NA, NA, NA, NA, 2L, 3968L, 172L, 584L, 27L, 27944L, 1L,
4L, 65L, NA, NA, NA), `100F` = c(NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, 16L, NA, NA, NA, NA, NA, NA), `100I` = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, 214L, 107L, 19L, NA, 68L, 1L,
NA, 6L, NA, NA, NA), `100X` = c(NA, NA, NA, NA, NA, NA, NA, NA,
NA, 26L, NA, NA, NA, NA, NA, NA, 21L, NA, NA, NA), `101` = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, 4L, NA, NA, NA, NA, NA, NA, 1L,
NA, NA, NA), `102` = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 289L,
2L, 1L, NA, NA, 1L, NA, 6L, NA, NA, NA), `1028P` = c(NA, NA,
NA, NA, NA, NA, NA, NA, NA, 8L, NA, NA, NA, 1L, NA, NA, 3L, NA,
NA, NA), `103` = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, 56030L,
372L, 72951L, 365L, 62L, 368L, 7L, 714L, 1L, NA, NA), `103D` = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, 7484L, 3829L, 363L, 22L, 12L,
27L, NA, 59L, NA, NA, NA)), .Names = c("type", "100", "100F",
"100I", "100X", "101", "102", "1028P", "103", "103D"), class = c("tbl_df",
"data.frame"), row.names = c(NA, -20L))
Going step-by-step:
# let's not call a data frame a matrix: drop the label column and convert
real_matrix <- as.matrix(the_matrix[, -1])
# max of each column, ignoring missing cells
col_max <- apply(real_matrix, 2, max, na.rm = TRUE)
# which row contains the max (which.max skips NA)
col_which_max <- apply(real_matrix, 2, which.max)
# row totals, ignoring missing cells
row_total <- rowSums(real_matrix, na.rm = TRUE)
# col max divided by row total for corresponding row
col_max / row_total[col_which_max]
Rounded to 3 decimals, this yields the following:
100 100F 100I 100X 101 102 1028P 103 103D
0.994 0.001 0.003 0.000 0.000 0.004 0.000 0.987 0.110
I have a data.frame called RawHM and want, for each row, to evaluate sets of columns defined by the entries in the list AllList, in order to see if there is enough non-NA observations (not less than 2) to keep the column set of entries for that row. If not, the column set entries should be substituted with NA's.
AllList:
> dput(AllList)
structure(list(EGI = c("OO", "PP", "QQ"), Ref = c("RR", "SS",
"TT")), .Names = c("EGI", "Ref"))
RawHM:
> dput(head(RawHM,10))
structure(list(OO = c(2.26128283268031, NA, NA, NA, 3.1189673217816,
2.68131772865193, 1.50542478607416, NA, NA, NA), PP = c(NA, 2.86537733048028,
2.02969026818987, NA, 2.54112005565494, 3.01623803266379, 1.73909499803785,
2.49712237003491, NA, 1.67635525591635), QQ = c(NA, NA, 1.91968060122123,
NA, NA, 2.63463138625395, NA, NA, NA, NA), RR = c(NA, NA, NA,
NA, NA, 1.01488582084669, 1.01944283768403, NA, 1.06329113924051,
NA), SS = c(0.950310559006211, 0.924124326404927, 1.07886334610473,
0.951793999929161, 0.847931452310888, 0.879173290937997, 0.882126364182319,
NA, NA, 0.713085668766746), TT = c(NA, NA, 1.09812749411644,
NA, 0.9994646420402, 1.21090641120118, 1.25090285854196, NA,
NA, NA)), .Names = c("OO", "PP", "QQ", "RR", "SS", "TT"), row.names = c(1L,
2L, 15L, 16L, 23L, 24L, 25L, 30L, 36L, 40L), class = "data.frame")
I have tried by making a function:
func<-function(x)unlist(lapply(AllList,function(y)if(length(na.omit(x[unlist(y)]))<2){rep(NA,length(unlist(y)))} else{x[unlist(y)]}))
And then:
output<-t(apply(RawHM,1,func))
Which works in principle but doesn't preserve the colnames, which I want to be the same as in the RawHM dataframe. I would prefer to avoid renaming the columns afterwards.
> dput(head(output,10))
structure(c(NA, NA, NA, NA, 3.1189673217816, 2.68131772865193,
1.50542478607416, NA, NA, NA, NA, NA, 2.02969026818987, NA, 2.54112005565494,
3.01623803266379, 1.73909499803785, NA, NA, NA, NA, NA, 1.91968060122123,
NA, NA, 2.63463138625395, NA, NA, NA, NA, NA, NA, NA, NA, NA,
1.01488582084669, 1.01944283768403, NA, NA, NA, NA, NA, 1.07886334610473,
NA, 0.847931452310888, 0.879173290937997, 0.882126364182319,
NA, NA, NA, NA, NA, 1.09812749411644, NA, 0.9994646420402, 1.21090641120118,
1.25090285854196, NA, NA, NA), .Dim = c(10L, 6L), .Dimnames = list(
c("1", "2", "15", "16", "23", "24", "25", "30", "36", "40"
), NULL))
Any help would be very welcome :-)
Regards
Mads
func is a very strange function... funky even!
When you use apply your data gets converted to a matrix from a data.frame. Your function seems to operate differently if it is a data.frame rather than a matrix:
func(RawHM[1,])
EGI.OO EGI.PP EGI.QQ Ref.RR Ref.SS Ref.TT
2.2612828 NA NA NA 0.9503106 NA
func(as.matrix(RawHM)[1,])
EGI1 EGI2 EGI3 Ref1 Ref2 Ref3
NA NA NA NA NA NA
Note that you get different results, and different names!
In any case, the names issue arises from the fact that when you produce the NAs, they carry no names, so the results give inconsistent output for apply. To fix this, here is a modification:
# For one row `x`, walk over the column sets in the global `AllList`: a set
# with fewer than two non-NA entries is replaced by NAs named after its
# columns, any other set is returned as is. The pieces are flattened into one
# vector.
func2 <- function(x) {
  keep_or_blank <- function(y) {
    cols <- unlist(y)
    picked <- x[cols]
    if (length(na.omit(picked)) < 2) {
      # sapply() over the column names yields NAs that carry those names
      sapply(y, function(z) NA)
    } else {
      picked
    }
  }
  unlist(lapply(AllList, keep_or_blank))
}
t(apply(RawHM,1,func2))
EGI.OO EGI.PP EGI.QQ Ref.RR Ref.SS Ref.TT
1 NA NA NA NA NA NA
2 NA NA NA NA NA NA
15 NA 2.029690 1.919681 NA 1.0788633 1.0981275
16 NA NA NA NA NA NA
23 3.118967 2.541120 NA NA 0.8479315 0.9994646
24 2.681318 3.016238 2.634631 1.014886 0.8791733 1.2109064
25 1.505425 1.739095 NA 1.019443 0.8821264 1.2509029
30 NA NA NA NA NA NA
36 NA NA NA NA NA NA
40 NA NA NA NA NA NA