Assign NA to the name of table - r

In the following example, I want to extract NA as a level and display it in the table just as other levels. The levels() function doesn't work with NA value. Is there any other way to deal with this problem?
# Simulate n patients with a comorbidity label and a survival status, then
# build a summary frame of counts and proportions overall and by status.
n=1000
# Draw comorbidity labels with replacement; rep() weights the categories
comorbid<-sample(c(rep("diabetes",2),
rep("hypertension",5),
"cirrhosis","stroke","heartfailure",
"renalfailure",rep("COPD",3)),
n,
replace=T)
# Set 50 randomly chosen entries to missing
comorbid[sample(1:n,50)]<-NA
# Survival status, drawn with "alive" four times as likely as "dead"
mort<-sample(c(rep("alive",4),
"dead"),n,replace=T)
# NOTE(review): table.cat is not used anywhere in the code shown here
table.cat<-data.frame(matrix(rep(999,7),nrow=1))
# useNA="always" keeps NA as its own category in the counts
table<-table(comorbid,useNA="always")
per<-prop.table(table)
# Cross-tabulation of comorbidity by survival status
table.sub<-table(comorbid,mort,useNA="always")
# margin 2: proportions are computed within each mort column
per.sub<-prop.table(table.sub,2)
# Use the chi-squared p-value unless chisq.test() signals a warning, in which
# case the handler falls back to fisher.test()
p<-tryCatch({#using fisher's test when scarce data
chisq.test(table.sub)$p.value
}, warning = function(w) {
fisher.test(table.sub,
workspace = 10e7)$p.value
})
# One row per comorbidity category: total, "alive" and "dead" counts and
# proportions; the single p-value is repeated on every row
frame<-data.frame(No.tot=as.data.frame(table)[,"Freq"],
per.tot=as.data.frame(per)[,"Freq"],
No.1=as.data.frame.matrix(table.sub)[,"alive"],
per.1=as.data.frame.matrix(per.sub)[,"alive"],
No.2=as.data.frame.matrix(table.sub)[,"dead"],
per.2=as.data.frame.matrix(per.sub)[,"dead"],
p=p)
# NOTE: comorbid is a character vector, so levels(comorbid) is NULL and no
# category labels (NA included) reach the row names; the labels have to come
# from a factor, e.g. levels(factor(comorbid, exclude = NULL))
rownames(frame)<-paste("comorbid",levels(comorbid),sep="_")

levels() works just fine with NA values. What levels() requires however is a factor (or anything with a levels attribute). As per your code, comorbid is a character vector:
> class(comorbid)
[1] "character"
If you coerce comorbid to a factor and change the default so that NAs are not excluded from the factor levels, you get the desired behaviour:
# exclude = NULL keeps NA as a factor level instead of dropping it
fcomorbid <- factor(comorbid, exclude = NULL)
levels(fcomorbid)
# Build the row labels from the levels, NA included
paste("comorbid", levels(fcomorbid), sep = "_")
> levels(fcomorbid)
[1] "cirrhosis" "COPD" "diabetes" "heartfailure" "hypertension"
[6] "renalfailure" "stroke" NA
> paste("comorbid", levels(fcomorbid), sep = "_")
[1] "comorbid_cirrhosis" "comorbid_COPD" "comorbid_diabetes"
[4] "comorbid_heartfailure" "comorbid_hypertension" "comorbid_renalfailure"
[7] "comorbid_stroke" "comorbid_NA"
To complete your example then
rownames(frame) <- paste("comorbid", levels(fcomorbid), sep = "_")
and we have
> frame
No.tot per.tot No.1 per.1 No.2 per.2 p
comorbid_cirrhosis 69 0.069 57 0.07011070 12 0.06417112 0.3108409
comorbid_COPD 209 0.209 172 0.21156212 37 0.19786096 0.3108409
comorbid_diabetes 128 0.128 101 0.12423124 27 0.14438503 0.3108409
comorbid_heartfailure 57 0.057 45 0.05535055 12 0.06417112 0.3108409
comorbid_hypertension 334 0.334 267 0.32841328 67 0.35828877 0.3108409
comorbid_renalfailure 78 0.078 61 0.07503075 17 0.09090909 0.3108409
comorbid_stroke 75 0.075 63 0.07749077 12 0.06417112 0.3108409
comorbid_NA 50 0.050 47 0.05781058 3 0.01604278 0.3108409

Related

str_match based on vector with count issue

I haven't got a reprex, but my data are stored in a CSV file:
https://transcode.geo.data.gouv.fr/services/5e2a1fbefa4268bc25628f27/feature-types/drac:site?format=CSV&projection=WGS84
library(readr)
bzh_sites <- read_csv("site.csv")
I want to count row based on characters matching (column NATURE)
pattern<-c("allée|aqueduc|architecture|atelier|bas|carrière|caveau|chapelle|château|chemin|cimetière|coffre|dépôt|dolmen|eau|église|enceinte|enclos|éperon|espace|exploitation|fanum|ferme|funéraire|groupe|habitat|maison|manoir|menhir|monastère|motte|nécropole|occupation|organisation|parcellaire|pêcherie|prieuré|production|rue|sépulture|stèle|thermes|traitement|tumulus|villa")
test2 <- bzh_sites %>%
drop_na(NATURE) %>%
group_by(NATURE = str_match( NATURE, pattern )) %>%
summarise(n = n())
gives me :
NATURE n
1 allée 176
2 aqueduc 73
3 architecture 68
4 atelier 200
AND another test with the same data (NATURE)
pattern <- c("allée|aqueduc|architecture|atelier")
test2 <- bzh_sites %>%
drop_na(NATURE) %>%
group_by(NATURE = str_match( NATURE, pattern )) %>%
summarise(n = n())
gives me :
NATURE n
1 allée 178
2 aqueduc 74
3 architecture 79
4 atelier 248
I have no idea why the counts differ between the two runs.
I tried to find out where the discrepancy is for first group i.e "allée". This is what I found :
library(stringr)
pattern1<-c("allée|aqueduc|architecture|atelier|bas|carrière|caveau|chapelle|château|chemin|cimetière|coffre|dépôt|dolmen|eau|église|enceinte|enclos|éperon|espace|exploitation|fanum|ferme|funéraire|groupe|habitat|maison|manoir|menhir|monastère|motte|nécropole|occupation|organisation|parcellaire|pêcherie|prieuré|production|rue|sépulture|stèle|thermes|traitement|tumulus|villa")
# Get indices where 'allée' is the first match using pattern1
ind1 <- which(str_match(bzh_sites$NATURE, pattern1 )[, 1] == 'allée')
pattern2 <- c("allée|aqueduc|architecture|atelier")
# Get indices where 'allée' is the first match using pattern2
ind2 <- which(str_match(bzh_sites$NATURE, pattern2)[, 1] == 'allée')
# Indices which are present in ind2 but absent in ind1
setdiff(ind2, ind1)
#[1] 3093 10400
# Get the corresponding text
temp <- bzh_sites$NATURE[setdiff(ind2, ind1)]
temp
#[1] "dolmen allée couverte" "coffre funéraire allée couverte"
What happens when we use pattern1 and pattern2 on temp
str_match(temp, pattern1)
# [,1]
#[1,] "dolmen"
#[2,] "coffre"
str_match(temp, pattern2)
# [,1]
#[1,] "allée"
#[2,] "allée"
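As we can see, with pattern1 some strings are assigned to a different group because another alternative (here "dolmen" or "coffre") occurs earlier in the string than "allée"; with pattern2 those alternatives are absent, so the same strings are counted under "allée". Hence the mismatch.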
A similar explanation can be given for mismatches in other groups.
str_match only returns first match, to get all the matches in pattern we can use str_match_all
# str_match_all() returns every match in each string (not only the first);
# unlist() + table() then count how often each term occurs overall
table(unlist(str_match_all(bzh_sites$NATURE, pattern1)))
# allée aqueduc architecture atelier bas
# 178 76 79 252 62
# carrière caveau chapelle château chemin
# 46 35 226 205 350
# cimetière coffre dépôt dolmen eau
# 275 155 450 542 114
# église enceinte enclos éperon espace
# 360 655 338 114 102
#exploitation fanum ferme funéraire groupe
# 1856 38 196 1256 295
# habitat maison manoir menhir monastère
# 1154 65 161 1036 31
# motte nécropole occupation organisation parcellaire
# 566 312 5152 50 492
# pêcherie prieuré production rue sépulture
# 69 66 334 44 152
# stèle thermes traitement tumulus villa
# 651 50 119 1232 225

Creating a data set with paired data and converting it into a matrix

So, I'm using R to try and do a phylogenetic PCA on a dataset that I have using the phyl.pca function from the phytools package. However, I'm having issues organising my data in a way that the function will accept! And that's not all: I did a bit of experimenting and I know that there are more issues further down the line, which I will get into...
Getting straight to the issue, here's the data frame (with dummy data) that I'm using:
>all
Taxa Tibia Feather
1 Microraptor 138 101
2 Microraptor 139 114
3 Microraptor 145 141
4 Anchiornis 160 81
5 Anchiornis 14 NA
6 Archaeopteryx 134 82
7 Archaeopteryx 136 71
8 Archaeopteryx 132 NA
9 Archaeopteryx 14 NA
10 Scansoriopterygidae 120 85
11 Scansoriopterygidae 116 NA
12 Scansoriopterygidae 123 NA
13 Sapeornis 108 NA
14 Sapeornis 112 86
15 Sapeornis 118 NA
16 Sapeornis 103 NA
17 Confuciusornis 96 NA
18 Confuciusornis 107 30
19 Confuciusornis 148 33
20 Confuciusornis 128 61
The taxa are arranged into a tree (called "tree") with Microraptor being the most basal and then progressing in order through to Confuciusornis:
>summary(tree)
Phylogenetic tree: tree
Number of tips: 6
Number of nodes: 5
Branch lengths:
mean: 1
variance: 0
distribution summary:
Min. 1st Qu. Median 3rd Qu. Max.
1 1 1 1 1
No root edge.
Tip labels: Confuciusornis
Sapeornis
Scansoriopterygidae
Archaeopteryx
Anchiornis
Microraptor
No node labels.
And the function:
>phyl.pca(tree, all, method="BM", mode="corr")
And this is the error that is coming up:
Error in phyl.pca(tree, all, method = "BM", mode = "corr") :
number of rows in Y cannot be greater than number of taxa in your tree
Y being the "all" data frame. So I have 6 taxa in my tree (matching the 6 taxa in the data frame) but there are 20 rows in my data frame. So I used this function:
> all_agg <- aggregate(all[,-1],by=list(all$Taxa),mean,na.rm=TRUE)
And got this:
Group.1 Tibia Feather
1 Anchiornis 153 81
2 Archaeopteryx 136 77
3 Confuciusornis 120 41
4 Microraptor 141 119
5 Sapeornis 110 86
6 Scansoriopterygidae 120 85
It's a bit odd that the order of the taxa has changed... Is this ok?
In any case, I converted it into a matrix:
> all_agg_matrix <- as.matrix(all_agg)
> all_agg_matrix
Group.1 Tibia Feather
[1,] "Anchiornis" "153" "81"
[2,] "Archaeopteryx" "136" "77"
[3,] "Confuciusornis" "120" "41"
[4,] "Microraptor" "141" "119"
[5,] "Sapeornis" "110" "86"
[6,] "Scansoriopterygidae" "120" "85"
And then used the phyl.pca function:
> phyl.pca(tree, all_agg_matrix, method = "BM", mode = "corr")
[1] "Y has no names. function will assume that the row order of Y matches tree$tip.label"
Error in invC %*% X : requires numeric/complex matrix/vector arguments
So, now the order that the function is considering taxa in is all wrong (but I can fix that relatively easily). The issue is that phyl.pca doesn't seem to believe that my matrix is actually a matrix. Any ideas why?
I think you may have bigger problems. Most phylogenetic methods, I suspect including phyl.pca, assume that traits are fixed at the species level (i.e., they don't account for within-species variation). Thus, if you want to use phyl.pca, you probably need to collapse your data to a single value per species, e.g. via
dd_agg <- aggregate(dd[,-1],by=list(dd$Taxa),mean,na.rm=TRUE)
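Extract the numeric columns and label the rows properly so that phyl.pca can match them up with the tips correctly (calling as.matrix() on a data frame that still contains the character taxon-name column yields a character matrix, which is why `invC %*% X` complained about non-numeric arguments):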
dd_mat <- dd_agg[,-1]
rownames(dd_mat) <- dd_agg[,1]
Using these aggregated data, I can make up a tree (since you didn't give us one) and run phyl.pca ...
library(phytools)
tt <- rcoal(nrow(dd_agg),tip.label=dd_agg[,1])
phyl.pca(tt,dd_mat)
If you do need to do an analysis that takes within-species variation into account you might need to ask somewhere more specialized, e.g. the r-sig-phylo@r-project.org mailing list ...
The answer posted by Ben Bolker seems to work whereby the data (called "all") is collapsed into a single value per species before creating a matrix and running the function. As per so:
> all_agg <- aggregate(all[,-1],by=list(all$Taxa),mean,na.rm=TRUE)
> all_mat <- all_agg[,-1]
> rownames(all_mat) <- all_agg[,1]
> phyl.pca(tree,all_mat, method= "lambda", mode = "corr")
Thanks to everyone who contributed an answer and especially Ben! :)

Calculating mode with modeest package in R

I am using the below code for calculating the mode of a dataframe:
library(modeest)
apply(df[ ,2:length(df)], 1, mfv)
My data looks like this:
Item A B C
Book001 56 32 56
Book002 95 95 20
Book003 50 89 50
Book004 6 65 40
It gives me the following output:
[[1]]
[1] 56
[[2]]
[1] 95
[[3]]
[1] 50
[[4]]
[1] 6 40 65
This code is perfect only if the data contains a recurring term.
How can I display the mode as NA when there is no recurring term?
Let's try with a custom function:
# Most frequent value of x, or NA when mfv() reports more than one value
# (i.e. whenever several values share the highest frequency).
foo <- function(x) {
  modes <- mfv(x)
  if (length(modes) > 1) {
    return(NA)
  }
  modes
}
apply(df[ ,2:length(df)], 1, foo)
# [1] 56 95 50 NA

Filter rows based on values of multiple columns in R

Here is the data set, say name is DS.
Abc Def Ghi
1 41 190 67
2 36 118 72
3 12 149 74
4 18 313 62
5 NA NA 56
6 28 NA 66
7 23 299 65
8 19 99 59
9 8 19 61
10 NA 194 69
How to get a new dataset DSS where value of column Abc is greater than 25, and value of column Def is greater than 100. It should also ignore any row if the value of at least one column is NA.
I have tried few options but wasn't successful. Your help is appreciated.
There are multiple ways of doing it. I have given 5 methods, and the first 4 methods are faster than the subset function.
R Code:
# Keep the rows of DS where Abc > 25 and Def > 100, without rows holding NA.
# Methods 2, 4 and 5 only discard rows whose *condition* is NA (an NA in Abc
# or Def); wrap the result in na.omit() if other columns may contain NA too.

# Method 1: an NA condition yields an all-NA row, which na.omit() removes
DS_Filtered <- na.omit(DS[DS$Abc > 25 & DS$Def > 100, ])
# Method 2: which() returns only the indices where the condition is TRUE
DS_Filtered <- DS[which(DS$Abc > 25 & DS$Def > 100), ]
# Method 3: same as Method 1, with each comparison parenthesised
DS_Filtered <- na.omit(DS[(DS$Abc > 25) & (DS$Def > 100), ])
# Method 4: using dplyr package; filter() drops rows whose condition is NA.
# Columns are referenced by bare name, no DS$ prefix needed.
DS_Filtered <- filter(DS, Abc > 25, Def > 100)
DS_Filtered <- DS %>% filter(Abc > 25 & Def > 100)
# Method 5: subset() treats an NA condition as FALSE
DS_Filtered <- subset(DS, Abc > 25 & Def > 100)

How to grep a word exactly

I'd like to grep for "nitrogen" in the following character vector and want to get
back only the entry which is containing "nitrogen" and nothing of the rest (e.g. nitrogen fixation):
varnames=c("nitrogen", "dissolved organic nitrogen", "nitrogen fixation", "total dissolved nitrogen", "total nitrogen")
I tried something like this:
grepl(pattern= "![[:space:]]nitrogen![[:space:]]", varnames)
But this doesn't work.
Although Dason's answer is easier, you could do an exact match using grep via:
varnames=c("nitrogen", "dissolved organic nitrogen", "nitrogen fixation", "total dissolved nitrogen", "total nitrogen")
grep("^nitrogen$",varnames,value=TRUE)
[1] "nitrogen"
grep("^nitrogen$",varnames)
[1] 1
To get the indices that are exactly equal to "nitrogen" you could use
which(varnames == "nitrogen")
Depending on what you want to do you might not even need the 'which', as varnames == "nitrogen" gives a logical vector of TRUE/FALSE. If you just want to do something like replace all of the occurrences of "nitrogen" with "oxygen", this should suffice:
varnames[varnames == "nitrogen"] <- "oxygen"
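Or use fixed = TRUE if you want to match the literal string (no regex interpretation). Note that fixed = TRUE still matches substrings, so it is equivalent to an exact match only when, as in the vector below, no other element contains "nitrogen":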
v <- sample(c("nitrogen", "potassium", "hidrogen"), size = 100, replace = TRUE, prob = c(.8, .1, .1))
grep("nitrogen", v, fixed = TRUE)
# [1] 3 4 5 6 7 8 9 11 12 13 14 16 19 20 21 22 23 24 25
# [20] 26 27 29 31 32 35 36 38 39 40 41 43 44 46 47 48 49 50 51
# [39] 52 53 54 56 57 60 61 62 65 66 67 69 70 71 72 73 74 75 76
# [58] 78 79 80 81 82 83 84 85 86 87 88 89 91 92 93 94 95 96 97
# [77] 98 99 100
Dunno about the speed issues, I like to test stuff and claim that approach A is faster than approach B, but in theory, at least from my experience, indexing/binary operators should be the fastest, so I vote for @Dason's approach. Also note that regexes are always slower than fixed = TRUE grepping.
A little proof is attached below. Note that this is a lame test, and system.time should be put inside replicate to get (more) accurate differences, you should take outliers into account, etc. But surely this one proves that you should use which! =)
# Rough timing of 1e5 repetitions of each approach on v
# a0: anchored regular expression
(a0 <- system.time(replicate(1e5, grep("^nitrogen$", v))))
# user system elapsed
# 5.700 0.023 5.724
# a1: literal (fixed = TRUE) search, no regex interpretation
(a1 <- system.time(replicate(1e5, grep("nitrogen", v, fixed = TRUE))))
# user system elapsed
# 1.147 0.020 1.168
# a2: elementwise equality test with which()
(a2 <- system.time(replicate(1e5, which(v == "nitrogen"))))
# user system elapsed
# 1.013 0.020 1.033

Resources