Create a single output matrix using apply function - r

Dear programming gods,
I would like to perform a series of Chi-square tests in R (one test for each column of my species Presence/Absence data.frame) using a function that can yield a single matrix (or data.frame, ideally) which lists as output the species (column name), Chi-square test statistic, df, and p.value.
My species data snippet (actual dimensions = 50x131):
Species<-structure(list(Acesac = c(0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 1L
), Allpet = c(0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L), Ambser = c(0L,
0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L), Anoatt = c(0L, 0L, 0L, 1L, 0L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 1L, 1L, 1L), Aritri = c(0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L,
0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L
)), .Names = c("Acesac", "Allpet", "Ambser", "Anoatt", "Aritri"
), row.names = c("BS1", "BS10", "BS2", "BS3", "BS4", "BS5", "BS6",
"BS7", "BS8", "BS9", "LC1", "LC10", "LC2", "LC3", "LC4", "LC5",
"LC6", "LC7", "LC8", "LC9", "TR1", "TR10", "TR2", "TR3", "TR4"
), class = "data.frame")
My environmental data snippet:
Env<-structure(list(Rock = structure(1:25, .Label = c("BS1", "BS10",
"BS2", "BS3", "BS4", "BS5", "BS6", "BS7", "BS8", "BS9", "LC1",
"LC10", "LC2", "LC3", "LC4", "LC5", "LC6", "LC7", "LC8", "LC9",
"TR1", "TR10", "TR2", "TR3", "TR4", "TR5", "TR6", "TR7", "TR8",
"TR9", "WD1", "WD10", "WD2", "WD3", "WD4", "WD5", "WD6", "WD7",
"WD8", "WD9", "WW1", "WW10", "WW2", "WW3", "WW4", "WW5", "WW6",
"WW7", "WW8", "WW9"), class = "factor"), Climbed = structure(c(1L,
2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L,
1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L), .Label = c("climbed", "unclimbed"
), class = "factor")), .Names = c("Rock", "Climbed"), row.names = c(NA,
25L), class = "data.frame")
The following apply function code performs a chi-sq test on each species (column) by first creating a contingency table with the number of occurrences of a given species on climbed vs. unclimbed rocks (Env$Climbed).
apply(Species, 2, function(x) {
Table<-table(Env$Climbed, x)
Test<-chisq.test(Table, corr = TRUE)
out <- data.frame("Chi.Square" = round(Test$statistic,3)
, "df" = Test$parameter
, "p.value" = round(Test$p.value, 3)
)
})
This yields a separate data.frame for each species (column). I would like to yield one data.frame, which includes also the column name of each species. Something like this:
mydf<-data.frame("spp"= colnames(Species[1:25,]), "Chi.sq"=c(1:25), "df"=
c(1:25),"p.value"= c(1:25))
Should this be done with ddply or adply? Or just a loop? (I tried, but failed). I reviewed a posting on a similar topic ([Chi Square Analysis using for loop in R), but could not make it work for my purposes.
Thank you for your time and expertise!
TC

Don't use apply on data.frames. It internally coerces to a matrix, which can have unintended consequences for some data structures (i.e. factors). It is also not efficient (memorywise).
If you want to apply a function by column, use lapply (as a data.frame is a list)
You can use plyr::ldply do automagically return a data.frame not a list.
# rewrite the function so `Env$Climbed` is not hard coded....
my_fun <- function(x,y) {
Table<-table(y, x)
Test<-chisq.test(Table, corr = TRUE)
out <- data.frame("Chi.Square" = round(Test$statistic,3)
, "df" = Test$parameter
, "p.value" = round(Test$p.value, 3)
)
}
library(plyr)
results <- ldply(Species,my_fun, y = Env$Climbed)
results
# .id Chi.Square df p.value
# 1 Acesac 0.000 1 1.000
# 2 Allpet 0.000 1 1.000
# 3 Ambser 0.000 1 1.000
# 4 Anoatt 0.338 1 0.561
# 5 Aritri 0.085 1 0.770

If you save the result of your apply as
kk <- apply(Species, 2, function(x) {...})
Then you can finish the transformation with
do.call(rbind, Map(function(x,y) cbind(x, species=y), kk, names(kk)))
Here we just append the name of the species to each data.frame and combine all the rows with rbind.

You can also try
kk <- apply(Species,2,....)
library(plyr)
ldply(kk,.id='spp')
spp Chi.Square df p.value
1 Acesac 0.000 1 1.000
2 Allpet 0.000 1 1.000
3 Ambser 0.000 1 1.000
4 Anoatt 0.338 1 0.561
5 Aritri 0.085 1 0.770
Upd:
library(plyr)
library(reshape2)
ddply(setNames(melt(Species), c("spp", "value")), .(spp), function(x) {
Test <- chisq.test(table(Env$Climbed, x$value), corr = TRUE)
data.frame(Chi.Square = round(Test$statistic, 3), df = Test$parameter, p.value = round(Test$p.value,
3))
})

Related

R two regressions from one table

I am trying to plot two different regression lines (with the formula: salary = beta0 + beta1D3 + beta2spending + beta3*(spending*D3) + w) into one scatter plot by deviding the data I have into two subsets as seen in the following code:
salary = data$salary
spending = data$spending
D1 = data$North
D2 = data$South
D3 = data$West
subsetWest = subset(data, D3 == 1)
subsetRest = subset(data, D3 == 0)
abab = lm(salary ~ 1 + spending + 1*spending, data=subsetWest) #red line
caca = lm(salary ~ 0 + spending + 0*spending, data=subsetRest) #blue line
plot(spending,salary)
points(subsetWest$spending, subsetWest$salary, pch=25, col = "red")
points(subsetRest$spending, subsetRest$salary, pch=10, col = "blue")
abline(abab, col = "red")
abline(caca, col = "blue")
This is a sample of my data table:
And this is the plot I get when running the code:
[enter image description here][2] [2]: https://i.stack.imgur.com/It8ai.png
My problem is that the intercept for my second regression is wrong, in fact I do not even get an intercept when looking at the summary, unlike with the first regression.
Does anybody see where my problem is or does anybody know an alternative way of plotting the two regression lines?
Help would be much appreciated. Thank you very much!
This is the whole table:
structure(list(salary = c(39166L, 40526L, 40650L, 53600L, 58940L,
53220L, 61356L, 54340L, 51706L, 49000L, 48548L, 54340L, 60336L,
53050L, 54720L, 43380L, 43948L, 41632L, 36190L, 41878L, 45288L,
49248L, 54372L, 67980L, 46764L, 41254L, 45590L, 43140L, 44160L,
44500L, 41880L, 43600L, 45868L, 36886L, 39076L, 40920L, 42838L,
50320L, 44964L, 41938L, 54448L, 51784L, 45288L, 49280L, 44682L,
51220L, 52030L, 51576L, 58264L, 51690L), spending = c(6692L,
6228L, 7108L, 9284L, 9338L, 9776L, 11420L, 11072L, 8336L, 7094L,
6318L, 7242L, 7564L, 8494L, 7964L, 7136L, 6310L, 6118L, 5934L,
6570L, 7828L, 9034L, 8698L, 10040L, 7188L, 5642L, 6732L, 5840L,
5960L, 7462L, 5706L, 5066L, 5458L, 4610L, 5284L, 6248L, 5504L,
6858L, 7894L, 5018L, 10880L, 8084L, 6804L, 5658L, 4594L, 5864L,
7410L, 8246L, 7216L, 7532L), North = c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), South = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L), West = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L)), class = "data.frame", row.names = c(NA,
-50L))
My problem is that the intercept for my second regression is wrong, in fact I do not even get an intercept when looking at the summary, unlike with the first regression.
That is because your second model specifies no intercept, since you use ... ~ 0 + ...
Also, your first model doesn't make sense because it includes spending twice. The second entry for spending will be ignored by lm

How to write a function to count the number of observations based on specific conditions in R?

I have a data frame of 1401 observations of 16 variables. For each column (except the first one), I have either 1 (if a condition is met) or 0 (if a condition is not met). Overall, the idea is to count how many observations meet certain conditions successively. We can think about it as a decision tree: in the first branch you can have either 1 (condition is met) or 0 (condition is not met), in the second branch starting from the 0 of the first branch, you can also have 1 or 0, etc... In my data frame, branches are columns. I want to investigate the impact of looking at the different branches (columns) in various orders.
My idea is to count the number of "1" in column Cn if I know that there was a "0" in column Cn-1.
dput(droplevels(head(data,20)))
structure(list(Substance = structure(c(13L, 9L, 10L, 12L, 1L,
19L, 16L, 17L, 5L, 2L, 14L, 7L, 4L, 6L, 20L, 18L, 15L, 3L, 11L,
8L), .Label = c("104653-34-1", "107-02-8", "111-30-8", "12057-74-8",
"122454-29-9", "14915-37-8", "20859-73-8", "27083-27-8", "28772-56-7",
"3691-35-8", "55965-84-9", "56073-07-5", "56073-10-0", "5836-29-3",
"71751-41-2", "74-90-8", "81-81-2", "86347-14-0", "90035-08-8",
"91465-08-6"), class = "factor"), colA = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
colB = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L), colC = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), colD = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L), colE = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L,
1L, 1L), colF = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L), colG = c(0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L,
1L), colH = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), colI = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L
), colK = c(1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 0L,
0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L), colJ = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L,
0L, 0L), colL = c(1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L,
0L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 1L), colM = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_), colN = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), colO = c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L)), .Names = c("Substance",
"Oral", "Dermal", "Inhalation", "SC", "SED", "RS", "SS", "M",
"C", "R", "STOT.SE", "STOT.RE", "AT", "Eco.Acute", "Eco.Chronic"
), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 9L, 10L, 12L, 13L,
14L, 17L, 18L, 19L, 20L, 21L, 22L, 28L, 34L), class = "data.frame")
#I define the order in which I look at the columns
orderA <- colnames(data)[2:16]
#no-yes function counts chemicals which meet condition Cn when condition Cn-1 is not met
count_no_yes <- function(data, cols) {
data <- data[, cols]
sum(apply(data, 1, function(x) all(x == 1)))
}
endpoints <- 0:15
#scenario A with order A of the columns
counts <- sapply(1:15, function(i) count_no_yes(data, orderA[1:i]))
counts <- c(nrow(data), counts)
scenarioA <- data.frame(endpoint=endpoints, hits=counts, scenario="scenarioA")
My problem is that I don't know how to include the information from the previous observation in my code. The current is not working. I get the following error: Error in apply(data, 1, function(x) all(x == 1)):dim(X) must have a positive length.
The idea is then to plot the number of observations that meet the conditions for each branch of the tree (column).
#scenario B with a different order of the columns
orderB <- colnames(data)[c(9, 10, 11, 5, 6, 8, 3, 2, 4, 13, 12, 7, 14, 15, 16)]
counts <- sapply(1:15, function(i) count_yes_yes(data, orderB[1:i]))
counts <- c(nrow(data), counts)
scenarioB <- data.frame(endpoint=endpoints, hits=counts, scenario="scenarioB")
#combine the different scenarios and plot
scenarios <- rbind(scenarioA, scenarioB)
library(ggplot2)
ggplot(scenarios, aes(x=endpoint, y=hits, color=scenario, group=scenario)) +
geom_point() +
geom_line()
Could it be this?
we tidy the data with tidy::gather then dplyr::group_by(par) and count the number of times a 0 is followed by a 1.
my.fun <- function(x) {
#Values
v <-rle(x)[[2]]
#Consecutive lenght
l <- rle(x)[[1]]
tmp <- data.frame(v = v, l=l)
tmp <-
tmp %>%
# for each column find a substance with
# 1 which came after a substance with value 0
# and check that 1 is followed by a zero
mutate(flag = ifelse(v==1 & lag(v)==0 & lead(v) == 0, 1, 0))
#return the sum of the `flag`value
sum(tmp$flag, na.rm = TRUE)
}
df %>%
tidyr::gather("par", "value", everything(), -Substance) %>%
group_by(par) %>%
summarise(c = my.fun(value))
# A tibble: 15 x 2
par c
<chr> <dbl>
1 AT 0
2 C 0
3 Dermal 0
4 Eco.Acute 1
5 Eco.Chronic 0
6 Inhalation 0
7 M 0
8 Oral 0
9 R 4
10 RS 1
11 SC 2
12 SED 1
13 SS 0
14 STOT.RE 4
15 STOT.SE 3
the rle function is a real gem for analyzing consecutiveness in a vector.
The my.fun can probably be adjusted to your exact needs.

failed without any error message while replacing part of names of columns with other names in dataframe

I have a dataframe which has column names as shown in the dput data below.
structure(list(mosales = c(1L, 1L, 1L, 12L, 1L), sale123 = c(14.86,
8.97, 6.44, 463.61, 15.94), totsales = c(1L, 1L, 1L,
30L, 1L), totqty = c(1L, 1L, 1L, 34L, 2L), unqsales = c(1L,
1L, 1L, 6L, 2L), x1_rank_1 = c(1L, 1L, 1L, 0L, 1L), x1_rank_4 = c(0L,
0L, 0L, 1L, 0L), x1_rank_3 = c(0L, 0L, 0L, 0L, 0L), x1_rank_2 = c(0L,
0L, 0L, 0L, 0L), x2_rank_2 = c(1L, 1L, 0L, 0L, 1L), x2_rank_1 = c(0L,
0L, 1L, 0L, 0L), x2_rank_5 = c(0L, 0L, 0L, 1L, 0L), x2_rank_4 = c(0L,
0L, 0L, 0L, 0L), x2_rank_3 = c(0L, 0L, 0L, 0L, 0L), x3_rank_1 = c(1L,
1L, 1L, 0L, 1L), x3_rank_4 = c(0L, 0L, 0L, 1L, 0L), x3_rank_3 = c(0L,
0L, 0L, 0L, 0L), x3_rank_2 = c(0L, 0L, 0L, 0L, 0L), x4_rank_1 = c(1L,
1L, 1L, 0L, 0L), x4_rank_5 = c(0L, 0L, 0L, 1L, 0L), x4_rank_2 = c(0L,
0L, 0L, 0L, 1L), x4_rank_4 = c(0L, 0L, 0L, 0L, 0L), x4_rank_3 = c(0L,
0L, 0L, 0L, 0L), x5_rank_1 = c(1L, 1L, 1L, 0L, 0L), x5_rank_4 = c(0L,
0L, 0L, 1L, 0L), x5_rank_2 = c(0L, 0L, 0L, 0L, 1L), x5_rank_3 = c(0L,
0L, 0L, 0L, 0L)), row.names = c(36L, 41L, 72L, 79L, 137L), class = "data.frame")
What I am trying to do is to replace columns names, which start with x1_rank, x2_rank .. and so forth, by the names such that "x1_rank" should be replaced with mosales, "x2_rank" should be replaced with sales123, "x3_rank" should be replaced with "totsales", "x4_rank" should be replaced with "totqty" and "x5_rank" should be replaced with "unqsales".
So the final column names would look like:
mosales, sale123, totsales, totqty, unqsales, mosales_1, mosales_2,... sale123_1, sale123_2,... totsales_1, totsales_2,... totqty_1, totqty_2,... unqsales_1, unqsales_2,...
I tried using a for loop and gsub, as in below. This code ran without any errors but I didn't get what I was looking for. Not sure where is the error.
df1 <- df
z <- names(df)
for (i in 1:length(z)){
gsub(paste0("x",i,"_rank"), z[i], names(df1))
}
df is dataframe which can be created using above dput data.
String manipulation functions in the stringi package are vectorized over pattern, replacement, and (optionally) string. This is very convenient for your case:
library(stringi)
orig <- c(stri_c("x", 1:5, "_rank"))
repl <- c("mosales", "sales123", "totsales", "totqty", "unqsales")
names(df) <- stri_replace_all_fixed(names(df), orig, repl,
vectorize_all = FALSE)
You can use this reg \\bx1_rank[a-zA-Z]*.
This will match x1_rank at the start of the string.
gsub lets you replace the found pattern with what you want. Repeat for all the cases you need.
In a loop:
repl <- c("mosales", "sales123", "totsales", "totqty", "unqsales")
for (i in 1:5) {
p = paste0("\\b^x", i, "_rank[a-zA-Z]*")
colnames(d) = gsub(pattern=p, repl[i], colnames(d))
}
colnames(d)
#
# [1] "mosales" "sale123" "totsales" "totqty" "unqsales" "mosales_1" "mosales_4"
# [8] "mosales_3" "mosales_2" "sales123_2" "sales123_1" "sales123_5" "sales123_4" "sales123_3"
# [15] "totsales_1" "totsales_4" "totsales_3" "totsales_2" "totqty_1" "totqty_5" "totqty_2"
# [22] "totqty_4" "totqty_3" "unqsales_1" "unqsales_4" "unqsales_2" "unqsales_3"

How to replace NAs with row means if proportion of row-wise NAs is below a certain threshold?

Apologies for the somewhat cumbersome question, but I am currently working on a mental health study. For one of the mental health screening tools there are 15 variables, each of which can have values of 0-3. The total score for each row/participant is then assigned by taking the sum of these 15 variables. The documentation for this tool states that if more than 20% of the values for a particular row/participant are missing, the total score should be taken as missing also, however if fewer than 20% of the values for a row are missing, each missing value should be assigned the mean of the remaining values for that row.
I decided that to do this I would have to calculate the proportion of NAs for each participant, calculate the mean of all 15 variables excluding NAs for each participant, and then use a conditional mutate statement (or something similar) that checked if the proportion of NAs was less than 20% and if so replaced NAs for the relevant columns with the mean value for that row, before finding the sum of all 15 variables for each row. The dataset also contains other columns besides these 15, so applying a function to all of the columns would not be useful.
To calculate the mean score without NAs I did the following:
mental$somatic_mean <- rowMeans(mental [, c("var1", "var2", "var3",
"var4", "var5", "var6", "var7", "var8", "var9", "var10", "var11",
"var12","var13", "var14", "var15")], na.rm=TRUE)
And to calculate the proportion of NAs for each variable:
mental$somatic_na <- rowMeans(is.na(mental [, c("var1", "var2",
"var3", "var4", "var5", "var6", "var7", "var8", "var9", "var10", "var11",
"var12", "var13", "var14", "var15")]))
However when I attempted the mutate() statement to alter the rows where fewer than 20% of values were NA I can't identify any code that works. I have tried a lot of permutations by this point, including the following for each variable:
mental_recode <- mental %>%
rowwise() %>%
mutate(var1 = if(somatic_na<0.2)
replace_na(list(var1= somatic_mean)))
Which returns:
"no applicable method for 'replace_na' applied to an object of class "list""
and attempting to do them all together without using mutate():
mental %>%
rowwise() %>%
if(somatic_na<0.2)
replace_na(list(var1 = somatic_mean, var2=
somatic_mean, var3 = somatic_mean, var4 = somatic_mean, var5 =
somatic_mean, var6 = somatic_mean, var7 = somatic_mean, var8 =
somatic_mean, var9 = somatic_mean, var10 = somatic_mean, var11 =
somatic_mean, var12 = somatic_mean, var13 = somatic_mean, var14 =
somatic_mean, var15 = somatic_mean ))
Which returns:
Error in if (.) somatic_na < 0.2 else replace_na(mental, list(var1 = somatic_mean, :
argument is not interpretable as logical
In addition: Warning message:
In if (.) somatic_na < 0.2 else replace_na(mental, list(var1 = somatic_mean, :
the condition has length > 1 and only the first element will be used
I also tried using if_else() in conjunction with mutate() and setting the value to NA if the condition was not met, but could not get that to work after various permutations and error messages either.
EDIT: Dummy data can be generated by the following:
mental <- structure(list(id = 1:21, var1 = c(0L, 0L, 1L, 1L, 1L, 0L, 0L,
NA, 0L, 0L, 0L, 0L, 0L, 0L, NA, 0L, 0L, 0L,
0L, 0L, 0L), var2 = c(0L,
0L, 1L, 1L, 1L, 0L, 0L, 2L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L,
2L, 0L, 1L, 1L), var3 = c(0L, 0L, 0L, 1L, 1L, 0L, 1L, 2L, 1L,
1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 2L, 0L, 1L, 1L), var4 = c(1L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, NA, 0L, 0L, 0L,
0L, 1L, 0L, 0L), var5 = c(0L, 0L, 0L, 1L, NA, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), var6 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), var7 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, NA, 0L, 0L, 0L, 0L, 0L, NA, 0L), var8 = c(0L,
0L, 0L, 0L, NA, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L), var9 = c(0L, 0L, 0L, 0L, 1L, 0L, 0L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L), var10 = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, NA, 0L, 0L, 0L,
0L, 0L, NA, 0L), var11 = c(1L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, NA, 0L), var12 = c(1L,
0L, 1L, 1L, NA, 0L, 0L, NA, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L,
1L, 0L, 1L, 1L), var13 = c(1L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 1L,
0L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L, NA, 0L), var14 = c(1L,
0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 0L,
2L, 0L, 1L, 0L), var15 = c(1L, 0L, 2L, NA, NA, 0L, NA, 0L, 0L,
0L, 0L, 0L, NA, NA, 0L, NA, NA, NA, NA, NA, 0L)), .Names = c("id",
"var1", "var2", "var3", "var4", "var5", "var6", "var7", "var8",
"var9", "var10", "var11", "var12", "var13", "var14", "var15"), class =
"data.frame", row.names = c(NA,
-21L))
Does anyone know of code that would work for this sort of situation?
Thanks in advance!
Here is a way to do it all in one chain using dplyr using your supplied data frame.
First create a vector of all column names of interest:
name_col <- colnames(mental)[2:16]
And now use dplyr
library(dplyr)
mental %>%
# First create the column of row means
mutate(somatic_mean = rowMeans(.[name_col], na.rm = TRUE)) %>%
# Now calculate the proportion of NAs
mutate(somatic_na = rowMeans(is.na(.[name_col]))) %>%
# Create this column for filtering out later
mutate(somatic_usable = ifelse(somatic_na < 0.2,
"yes", "no")) %>%
# Make the following replacement on a row basis
rowwise() %>%
mutate_at(vars(name_col), # Designate eligible columns to check for NAs
funs(replace(.,
is.na(.) & somatic_na < 0.2, # Both conditions need to be met
somatic_mean))) %>% # What we are subbing the NAs with
ungroup() # Now ungroup the 'rowwise' in case you need to modify further
Now, if you wanted to only select the entries that have less than 20% NAs, you can pipe the above into the following:
filter(somatic_usable == "yes")
Also of note, if you wanted to instead make the condition less than or equal to 20%, you would need to replace the two somatic_na < 0.2 with somatic_na <= 0.2.
Hope this helps!
Here's a way using just base R expressions and remember mathematical properties of sums and means:
# generate fake data
set.seed(123)
dat <- data.frame(
ID = 1:10,
matrix(sample(c(0:3, NA), 10 * 15, TRUE), nrow = 10, ncol = 15),
'another_var' = 'foo',
'second_var' = 'bar',
stringsAsFactors = FALSE
)
var_names <- paste0('X', 1:15)
# add number of NAs to data
dat$na_num <- rowSums(is.na(dat[var_names]))
# add row sum
dat$row_sum <- rowSums(dat[var_names], na.rm = TRUE)
# add row mean
dat$row_mean <- rowMeans(dat[var_names], na.rm = TRUE)
# add final sum
dat$final_sum <- dat$row_sum + dat$row_mean * dat$na_num
# recode final sum to be NA if prop > .2
dat$final_sum <- ifelse(rowMeans(is.na(dat[var_names])) > .2,
NA,
dat$final_sum)
Here's a function that does the same thing. Where you specify your data and then a character vector of your variable names.
total_sum_calculation <- function(data, var_names){
# add number of NAs to data
na_num <- rowSums(is.na(data[var_names]))
# add row sum
row_sum <- rowSums(data[var_names], na.rm = TRUE)
# add row mean
row_mean <- rowMeans(data[var_names], na.rm = TRUE)
# add final sum
final_sum <- row_sum + row_mean * na_num
# recode final sum to be NA if prop > .2
ifelse(rowMeans(is.na(data[var_names])) > .2,
NA,
final_sum)
}
v_names <- paste0('var', 1:15)
total_sum_calculation(data = mental, var_names = v_names)
[1] 6.000000 0.000000 8.000000 7.500000 NA 0.000000 3.214286 9.230769 6.000000 2.000000 1.000000 0.000000 4.285714
[14] NA 5.357143 5.357143 5.357143 9.642857 1.071429 NA 3.000000

Solving normal equation gives different coefficients from using `lm`?

I wanted to compute a simple regression using the lm and plain matrix algebra. However, my regression coefficients obtained from matrix algebra are only half of those obtained from using the lm and I have no clue why.
Here's the code
boot_example <- data.frame(
x1 = c(1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L),
x2 = c(0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L),
x3 = c(1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L),
x4 = c(0L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L),
x5 = c(1L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L),
x6 = c(0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L),
preference_rating = c(9L, 7L, 5L, 6L, 5L, 6L, 5L, 7L, 6L)
)
dummy_regression <- lm("preference_rating ~ x1+x2+x3+x4+x5+x6", data = boot_example)
dummy_regression
Call:
lm(formula = "preference_rating ~ x1+x2+x3+x4+x5+x6", data = boot_example)
Coefficients:
(Intercept) x1 x2 x3 x4 x5 x6
4.2222 1.0000 -0.3333 1.0000 0.6667 2.3333 1.3333
###The same by matrix algebra
X <- matrix(c(
c(1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L), #upper var
c(0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L), #upper var
c(1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L), #country var
c(0L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L), #country var
c(1L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L), #price var
c(0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L) #price var
),
nrow = 9, ncol=6)
Y <- c(9L, 7L, 5L, 6L, 5L, 6L, 5L, 7L, 6L)
#Using standardized (mean=0, std=1) "z" -transformation Z = (X-mean(X))/sd(X) for all predictors
X_std <- apply(X, MARGIN = 2, FUN = function(x){(x-mean(x))/sd(x)})
##If constant shall be computed as well, uncomment next line
#X_std <- cbind(c(rep(1,9)),X_std)
#using matrix algebra formula
solve(t(X_std) %*% X_std) %*% (t(X_std) %*% Y)
[,1]
[1,] 0.5000000
[2,] -0.1666667
[3,] 0.5000000
[4,] 0.3333333
[5,] 1.1666667
[6,] 0.6666667
Does anyone see the error in my matrix computation?
Thank you in advance!
lm is not performing standardization. If you want to obtain the same result by lm, you need:
X1 <- cbind(1, X) ## include intercept
solve(crossprod(X1), crossprod(X1,Y))
# [,1]
#[1,] 4.2222222
#[2,] 1.0000000
#[3,] -0.3333333
#[4,] 1.0000000
#[5,] 0.6666667
#[6,] 2.3333333
#[7,] 1.3333333
I don't want to repeat that we should use crossprod. See the "follow-up" part of Ridge regression with glmnet gives different coefficients than what I compute by “textbook definition”?

Resources