names(score)
[1] "(Intercept)" "aado2_calc(20,180]" "aado2_calc(360,460]"
[4] "aado2_calc(460,629]" "albumin[1,1.8]" "albumin(1.8,2.2]"
[7] "albumin(2.2,2.8]" "aniongap(15,18]" "aniongap(18,20]"
[10] "aniongap(20,22]" "aniongap(22,25]" "aniongap(25,49]"
I want to extract the two numbers within parenthesis (numbers outside the parenthesis are not needed) and there are "(" or "[". the first number will be assigned to an object "low" and the second to "high".
You can use the readr package and the function parse_number for ease of use. For more power you'd want to use something like the base regular expression functions in r, or a package like stringi
Just like #jake-kaupp said - use stringi :) As you can see, stringi solution is shorter, easier to understand and much faster - up to 30 times!
Short answer:
arr <- stri_extract_all_regex(x, "(?<=[\\[\\(,])[0-9.]+(?=[\\]\\),])", simplify = NA)
data.frame(low = as.numeric(arr[,1]), high = as.numeric(arr[,2]))
Long answer:
require(stringi)
require(microbenchmark)
grepFun <- function(x){
mat <- regmatches(x,
gregexpr("(?<=[\\[\\(,])[0-9.]+(?=[\\]\\),])", x, perl = TRUE))
newnames <- lapply(mat, function(m) {
if (! length(m)) return(list(low = NA, high = NA))
setNames(as.list(as.numeric(m)), nm = c("low", "high"))
})
do.call(rbind.data.frame, newnames)
}
striFun <- function(x){
arr <- stri_extract_all_regex(x, "(?<=[\\[\\(,])[0-9.]+(?=[\\]\\),])", simplify = NA)
data.frame(low = as.numeric(arr[,1]), high = as.numeric(arr[,2]))
}
# both functions work the same
grepFun(scorenames)
low high
1 NA NA
2 20.0 180.0
3 360.0 460.0
4 460.0 629.0
...
12 25.0 49.0
striFun(scorenames)
low high
1 NA NA
2 20.0 180.0
3 360.0 460.0
4 460.0 629.0
...
12 25.0 49.0
# generating more complicated vector
n <- 10000
x <- stri_paste(stri_rand_strings(n, length = 1:10), sample(c("(","["),n,TRUE),
sample(1000,n,TRUE), ",", sample(1000,n,TRUE), sample(c(")","]"), n, TRUE))
head(x) # check first elements
[1] "O[68,434]" "Ql[783,151)" "Zk0(773,60)" "ETfV(446,518]" "Xixbr(576,855)" "G6QnHu(92,955)"
#short test using new data
grepFun(x[1:6])
low high
1 68 434
2 783 151
3 773 60
4 446 518
5 576 855
6 92 955
striFun(x[1:6])
low high
1 68 434
2 783 151
3 773 60
4 446 518
5 576 855
6 92 955
#and some benchmark to prove performance
microbenchmark(grepFun(x), striFun(x))
Unit: milliseconds
expr min lq mean median uq max neval
grepFun(x) 330.27733 366.09306 416.56330 406.08914 465.29829 568.15250 100
striFun(x) 11.57449 11.97825 13.38157 12.46927 13.67699 25.97455 100
scorenames <- c(
"(Intercept)" ,"aado2_calc(20,180]" ,"aado2_calc(360,460]"
,"aado2_calc(460,629]" ,"albumin[1,1.8]" ,"albumin(1.8,2.2]"
,"albumin(2.2,2.8]" ,"aniongap(15,18]" ,"aniongap(18,20]"
,"aniongap(20,22]" ,"aniongap(22,25]" ,"aniongap(25,49]"
)
The first step might be to extract everything within the "parens"-delimiters (to include (), [], and the comma ,).
mat <- regmatches(scorenames,
gregexpr("(?<=[\\[\\(,])[0-9.]+(?=[\\]\\),])", scorenames, perl = TRUE))
str(mat)
# List of 12
# $ : chr(0)
# $ : chr [1:2] "20" "180"
# $ : chr [1:2] "360" "460"
# $ : chr [1:2] "460" "629"
# $ : chr [1:2] "1" "1.8"
# $ : chr [1:2] "1.8" "2.2"
# $ : chr [1:2] "2.2" "2.8"
# $ : chr [1:2] "15" "18"
# $ : chr [1:2] "18" "20"
# $ : chr [1:2] "20" "22"
# $ : chr [1:2] "22" "25"
# $ : chr [1:2] "25" "49"
From here, we can see that (1) the first one is problematic (no surprise, you need to figure out what you want here), and (2) the rest look about right.
Here's one rough way to process this list. This is very trusting and naïve ... you should probably add checks to ensure the list is of length 2, that everything converts correctly (perhaps in a tryCatch), etc.
newnames <- lapply(mat, function(m) {
if (! length(m)) return(list(low = NA, high = NA))
setNames(as.list(as.numeric(m)), nm = c("low", "high"))
})
str(newnames)
# List of 12
# $ :List of 2
# ..$ low : logi NA
# ..$ high: logi NA
# $ :List of 2
# ..$ low : num 20
# ..$ high: num 180
# $ :List of 2
# ..$ low : num 360
# ..$ high: num 460
# ...snip...
You can turn this into a data.frame with:
head(do.call(rbind.data.frame, newnames))
# low high
# 1 NA NA
# 2 20.0 180.0
# 3 360.0 460.0
# 4 460.0 629.0
# 5 1.0 1.8
# 6 1.8 2.2
I want to create a data frame using describe() function. Dataset under consideration is iris. The data frame should look like this:
Variable n missing unique Info Mean 0.05 0.1 0.25 0.5 0.75 0.9 0.95
Sepal.Length 150 0 35 1 5.843 4.6 4.8 5.1 5.8 6.4 6.9 7.255
Sepal.Width 150 0 23 0.99 3.057 2.345 2.5 2.8 3 3.3 3.61 3.8
Petal.Length 150 0 43 1 3.758 1.3 1.4 1.6 4.35 5.1 5.8 6.1
Petal.Width 150 0 22 0.99 1.199 0.2 0.2 0.3 1.3 1.8 2.2 2.3
Species 150 0 3
Is there a way out to coerce the output of describe() to data.frame type? When I try to coerce, I get an error as shown below:
library(Hmisc)
statistics <- describe(iris)
statistics[1]
first_vec <- statistics[1]$Sepal.Length
as.data.frame(first_vec)
#Error in as.data.frame.default(first_vec) : cannot coerce class ""describe"" to a data.frame
Thanks
The way to figure this out is to examine the objects with str():
data(iris)
library(Hmisc)
di <- describe(iris)
di
# iris
#
# 5 Variables 150 Observations
# -------------------------------------------------------------
# Sepal.Length
# n missing unique Info Mean .05 .10 .25 .50 .75 .90 .95
# 150 0 35 1 5.843 4.600 4.800 5.100 5.800 6.400 6.900 7.255
#
# lowest : 4.3 4.4 4.5 4.6 4.7, highest: 7.3 7.4 7.6 7.7 7.9
# -------------------------------------------------------------
# ...
# -------------------------------------------------------------
# Species
# n missing unique
# 150 0 3
#
# setosa (50, 33%), versicolor (50, 33%)
# virginica (50, 33%)
# -------------------------------------------------------------
str(di)
# List of 5
# $ Sepal.Length:List of 6
# ..$ descript : chr "Sepal.Length"
# ..$ units : NULL
# ..$ format : NULL
# ..$ counts : Named chr [1:12] "150" "0" "35" "1" ...
# .. ..- attr(*, "names")= chr [1:12] "n" "missing" "unique" "Info" ...
# ..$ intervalFreq:List of 2
# .. ..$ range: atomic [1:2] 4.3 7.9
# .. .. ..- attr(*, "Csingle")= logi TRUE
# .. ..$ count: int [1:100] 1 0 3 0 0 1 0 0 4 0 ...
# ..$ values : Named chr [1:10] "4.3" "4.4" "4.5" "4.6" ...
# .. ..- attr(*, "names")= chr [1:10] "L1" "L2" "L3" "L4" ...
# ..- attr(*, "class")= chr "describe"
# $ Sepal.Width :List of 6
# ...
# $ Species :List of 5
# ..$ descript: chr "Species"
# ..$ units : NULL
# ..$ format : NULL
# ..$ counts : Named num [1:3] 150 0 3
# .. ..- attr(*, "names")= chr [1:3] "n" "missing" "unique"
# ..$ values : num [1:2, 1:3] 50 33 50 33 50 33
# .. ..- attr(*, "dimnames")=List of 2
# .. .. ..$ : chr [1:2] "Frequency" "%"
# .. .. ..$ : chr [1:3] "setosa" "versicolor" "virginica"
# ..- attr(*, "class")= chr "describe"
# - attr(*, "descript")= chr "iris"
# - attr(*, "dimensions")= int [1:2] 150 5
# - attr(*, "class")= chr "describe"
We see that di is a list of lists. We can take it apart by looking at just the first sublist. You can convert that into a vector:
unlist(di[[1]])
# descript counts.n
# "Sepal.Length" "150"
# counts.missing counts.unique
# "0" "35"
# counts.Info counts.Mean
# "1" "5.843"
# counts..05 counts..10
# "4.600" "4.800"
# counts..25 counts..50
# "5.100" "5.800"
# counts..75 counts..90
# "6.400" "6.900"
# counts..95 intervalFreq.range1
# "7.255" "4.3"
# intervalFreq.range2 intervalFreq.count1
# "7.9" "1"
# ...
# values.H3 values.H2
# "7.6" "7.7"
# values.H1
# "7.9"
str(unlist(di[[1]]))
# Named chr [1:125] "Sepal.Length" "150" "0" "35" ...
# - attr(*, "names")= chr [1:125] "descript" "counts.n" "counts.missing" "counts.unique" ...
It is very, very long (125). The elements have been coerced to all be of the same (and most inclusive) type, namely, character. It seems you want the 2nd through 12th elements:
unlist(di[[1]])[2:12]
# counts.n counts.missing counts.unique counts.Info
# "150" "0" "35" "1"
# counts.Mean counts..05 counts..10 counts..25
# "5.843" "4.600" "4.800" "5.100"
# counts..50 counts..75 counts..90
# "5.800" "6.400" "6.900"
Now you have something you can start to work with. But notice that this only seems to be the case for numerical variables; the factor variable species is different:
unlist(di[[5]])
# descript counts.n counts.missing counts.unique
# "Species" "150" "0" "3"
# values1 values2 values3 values4
# "50" "33" "50" "33"
# values5 values6
# "50" "33"
In that case, it seems you only want elements two through four.
Using this process of discovery and problem solving, you can see how you'd take the output of describe apart and put the information you want into a data frame. However, this will take a lot of work. You'll presumably need to use loops and lots of if(){ ... } else{ ... } blocks. You might just want to code your own dataset description function from scratch.
You can do this by using the stat.desc function from the pastecs package:
library(pastecs)
summary_df <- stat.desc(mydata)
The summary_df is the dataframe you wanted. See more info here.
In R, you just have to use the summary(iris) function instead of describe(iris) function in Python.
I used tm package and DocumentTermMatrix to create a DocumentTermMatrix and now I'd like to convert it to spare matrix for an ouput to glmnet function from glmnet package.
Any idea on how to do this?
The objects looks like this:
> str(yy)
List of 6
$ i : int [1:13864810] 2 2 2 2 2 2 2 2 2 2 ...
$ j : int [1:13864810] 320 334 339 346 347 348 355 360 362 363 ...
$ v : num [1:13864810] 1 1 1 1 1 1 1 1 1 1 ...
$ nrow : int 709678
$ ncol : int 371
$ dimnames:List of 2
..$ Docs : chr [1:709678] "1" "2" "3" "4" ...
..$ Terms: chr [1:371] "declarative_" "declarative_0" "declarative_0zc" "declarative_0zd" ...
- attr(*, "class")= chr [1:2] "DocumentTermMatrix" "simple_triplet_matrix"
- attr(*, "weighting")= chr [1:2] "term frequency" "tf"
> class(yy)
[1] "DocumentTermMatrix" "simple_triplet_matrix"
Is this the only way?
sparseYY <- sparseMatrix( i = yy$i, j=yy$j, x =yy$v)
Simply use as.matrix to convert to a sparse matrix:
> dtm_matrix <- as.matrix(dtm)
> class(dtm_matrix)
[1] "matrix"
I have managed to subset and lapply a list of data frames as follows:
subsetDeathHA <-
na.omit(subset(outcome,select = c("Hospital Name", "mortailityRate", "State"), ))
orderSubsetDeathHA <-
subsetDeathHA[order(subsetDeathHA$"mr" , subsetDeathHA$"Hospital Name", subsetDeathHA$'State' ),]
splitOrderSubsetDeahtHA <-
split(orderSubsetDeathHA, orderSubsetDeathHA$'State')
aa<- lapply(splitOrderSubsetDeahtHA, function(x) { x[num,] })
num is the ranking number on a per State basis.
Using str(aa) shows this object is a list of (54) data.frames, where each data.frame is one object of 3 variables as follows:
List of 54
$ AK:'data.frame': 1 obs. of 3 variables:
..$ Hospital Name : chr NA
..$ mortalityRate : num NA
..$ State : chr NA
..- attr(*, "na.action")=Class 'omit' Named int [1:1986] 4 5 6 10 13 17 19 23 27 28 ...
.. .. ..- attr(*, "names")= chr [1:1986] "4" "5" "6" "10" ...
$ AL:'data.frame': 1 obs. of 3 variables:
..$ Hospital Name : chr "D C H REGIONAL MEDICAL CENTER"
..$ mortalityRate : num 15.8
..$ State : chr "AL"
..- attr(*, "na.action")=Class 'omit' Named int [1:1986] 4 5 6 10 13 17 19 23 27 28 ...
.. .. ..- attr(*, "names")= chr [1:1986] "4" "5" "6" "10" ...
What I can't seem to do is the following
1) Subset out the Hospital Name and the State by removing the mortalityRate variable and return a list of the resulting 54 objects/data frames.
2) Place row.names =F appropriately to suppress the indexing that R provides.
3) Even though I thought I had 'na'd out' the NA values in the first sub-setting operation,
when I print(aa), what follows is a sample of the output.
$AK
Hospital Name mr State
NA NA <NA> NA <NA>
$AL
Hospital Name mr State
56 D C H REGIONAL MEDICAL CENTER 15.8 AL
etc...
Any help/suggestions appreciated
I do have to rename sublist titles within a main matrix list called l1. Each Name(n) is related to a value as a character string. Here is my code :
names(l1)[1] <- Name1
names(l1)[2] <- Name2
names(l1)[3] <- Name3
names(l1)[4] <- Name4
## ...
names(l1)[43] <- Name43
As you can see, I have 43 sublists. Is there a way do do that using an automated loop like for (i in 1:43) or something ? I tried to perform a loop but I am a beginner and that's very hard for now.
Edit : I would like to rename the elements of my list without having to type 43 lines manually. Here is the first three elements of my list :
str(l1)
List of 43
$ XXX : num [1:640, 1:3] -0.83 -0.925 -0.623 -0.191 0.155 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : NULL
.. ..$ : chr [1:3] "EV_BICYCLE" "HW_DISTANCE" "NO_ASSETS"
$ XXX : num [1:640, 1:2] -0.159 0.485 -0.686 -0.245 -3.361 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : NULL
.. ..$ : chr [1:2] "HOME_OWN" "METRO_DISTANCE"
$ XXX : num [1:640, 1:3] -0.79 1.15 0.224 0.388 -1.571 ...
..- attr(*, "dimnames")=List of 2
.. ..$ : NULL
.. ..$ : chr [1:3] "BICYCLE" "HOME_OWN_SC" "POP_SC"
That is to say, I would like to replace the 43 XXX by Name1, Name2 ... to Name43
Try
names(l1) <- unlist(mget(ls(pattern="^Nom_F")))
str(l1, list.len=2)
#List of 3
# $ Accessibility : int [1:5, 1:5] 10 10 3 9 7 6 8 2 7 8 ...
# ..- attr(*, "dimnames")=List of 2
# .. ..$ : NULL
# .. ..$ : chr [1:5] "A" "B" "C" "D" ...
# $ Access : int [1:5, 1:5] 6 4 10 5 9 8 9 4 7 1 ...
#..- attr(*, "dimnames")=List of 2
# .. ..$ : NULL
# .. ..$ : chr [1:5] "A" "B" "C" "D" ...
Instead of creating separate objects, you could create a vector of real titles. For example
v1 <- LETTERS[1:3]
names(l1) <- v1
data
set.seed(42)
l1 <- setNames(lapply(1:3, function(x)
matrix(sample(1:10, 5*5, replace=TRUE), ncol=5,
dimnames=list(NULL, LETTERS[1:5]))), rep('XXX',3))
Nom_F1 <- "Accessibility"
Nom_F2 <- "Access"
Nom_F3 <- "Poverty_and_SC"