How can I extract numbers from a string in R?

names(score)
[1] "(Intercept)" "aado2_calc(20,180]" "aado2_calc(360,460]"
[4] "aado2_calc(460,629]" "albumin[1,1.8]" "albumin(1.8,2.2]"
[7] "albumin(2.2,2.8]" "aniongap(15,18]" "aniongap(18,20]"
[10] "aniongap(20,22]" "aniongap(22,25]" "aniongap(25,49]"
I want to extract the two numbers within the parentheses (the numbers outside the parentheses are not needed); the opening delimiter can be either "(" or "[". The first number should be assigned to an object "low" and the second to "high".

You can use the readr package and its parse_number function for ease of use. For more power you'd want to use something like the base regular expression functions in R, or a package like stringi.
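For example (a minimal illustration on a made-up string; note that parse_number only returns the first number it finds in each string, so on its own it would give you "low" but not "high"):
library(readr)
parse_number("low: 20.5 units")   # drops everything before and after the first number
# [1] 20.5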

Just like @jake-kaupp said - use stringi :) As you can see, the stringi solution is shorter, easier to understand, and much faster - up to 30 times!
Short answer:
# extract numbers that sit between a "(", "[" or "," and a ")", "]" or ","
arr <- stri_extract_all_regex(x, "(?<=[\\[\\(,])[0-9.]+(?=[\\]\\),])", simplify = NA)
data.frame(low = as.numeric(arr[, 1]), high = as.numeric(arr[, 2]))
Long answer:
require(stringi)
require(microbenchmark)
grepFun <- function(x){
  mat <- regmatches(x,
                    gregexpr("(?<=[\\[\\(,])[0-9.]+(?=[\\]\\),])", x, perl = TRUE))
  newnames <- lapply(mat, function(m) {
    if (! length(m)) return(list(low = NA, high = NA))
    setNames(as.list(as.numeric(m)), nm = c("low", "high"))
  })
  do.call(rbind.data.frame, newnames)
}

striFun <- function(x){
  arr <- stri_extract_all_regex(x, "(?<=[\\[\\(,])[0-9.]+(?=[\\]\\),])", simplify = NA)
  data.frame(low = as.numeric(arr[, 1]), high = as.numeric(arr[, 2]))
}
# both functions work the same
grepFun(scorenames)
     low  high
1     NA    NA
2   20.0 180.0
3  360.0 460.0
4  460.0 629.0
...
12  25.0  49.0
striFun(scorenames)
     low  high
1     NA    NA
2   20.0 180.0
3  360.0 460.0
4  460.0 629.0
...
12  25.0  49.0
# generating more complicated vector
n <- 10000
x <- stri_paste(stri_rand_strings(n, length = 1:10), sample(c("(", "["), n, TRUE),
                sample(1000, n, TRUE), ",", sample(1000, n, TRUE), sample(c(")", "]"), n, TRUE))
head(x) # check first elements
[1] "O[68,434]" "Ql[783,151)" "Zk0(773,60)" "ETfV(446,518]" "Xixbr(576,855)" "G6QnHu(92,955)"
# short test using new data
grepFun(x[1:6])
  low high
1  68  434
2 783  151
3 773   60
4 446  518
5 576  855
6  92  955
striFun(x[1:6])
  low high
1  68  434
2 783  151
3 773   60
4 446  518
5 576  855
6  92  955
# and some benchmark to prove performance
microbenchmark(grepFun(x), striFun(x))
Unit: milliseconds
       expr       min        lq      mean    median        uq       max neval
 grepFun(x) 330.27733 366.09306 416.56330 406.08914 465.29829 568.15250   100
 striFun(x)  11.57449  11.97825  13.38157  12.46927  13.67699  25.97455   100

scorenames <- c(
  "(Intercept)",         "aado2_calc(20,180]",  "aado2_calc(360,460]",
  "aado2_calc(460,629]", "albumin[1,1.8]",      "albumin(1.8,2.2]",
  "albumin(2.2,2.8]",    "aniongap(15,18]",     "aniongap(18,20]",
  "aniongap(20,22]",     "aniongap(22,25]",     "aniongap(25,49]"
)
The first step might be to extract everything that sits within the "parens" delimiters (which here include (), [], and the comma ,).
mat <- regmatches(scorenames,
                  gregexpr("(?<=[\\[\\(,])[0-9.]+(?=[\\]\\),])", scorenames, perl = TRUE))
str(mat)
# List of 12
# $ : chr(0)
# $ : chr [1:2] "20" "180"
# $ : chr [1:2] "360" "460"
# $ : chr [1:2] "460" "629"
# $ : chr [1:2] "1" "1.8"
# $ : chr [1:2] "1.8" "2.2"
# $ : chr [1:2] "2.2" "2.8"
# $ : chr [1:2] "15" "18"
# $ : chr [1:2] "18" "20"
# $ : chr [1:2] "20" "22"
# $ : chr [1:2] "22" "25"
# $ : chr [1:2] "25" "49"
From here, we can see that (1) the first one is problematic (no surprise, you need to figure out what you want here), and (2) the rest look about right.
Here's one rough way to process this list. It is very trusting and naïve ... you should probably add checks to ensure each match is of length 2, that everything converts correctly (perhaps in a tryCatch), etc.
newnames <- lapply(mat, function(m) {
  if (! length(m)) return(list(low = NA, high = NA))
  setNames(as.list(as.numeric(m)), nm = c("low", "high"))
})
str(newnames)
# List of 12
# $ :List of 2
# ..$ low : logi NA
# ..$ high: logi NA
# $ :List of 2
# ..$ low : num 20
# ..$ high: num 180
# $ :List of 2
# ..$ low : num 360
# ..$ high: num 460
# ...snip...
You can turn this into a data.frame with:
head(do.call(rbind.data.frame, newnames))
#     low  high
# 1    NA    NA
# 2  20.0 180.0
# 3 360.0 460.0
# 4 460.0 629.0
# 5   1.0   1.8
# 6   1.8   2.2
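If you want the slightly more defensive version of that processing step hinted at above, a sketch (my own variant, reusing the same mat as before) could look like:
newnames <- lapply(mat, function(m) {
  vals <- suppressWarnings(as.numeric(m))
  # expect exactly two successfully parsed numbers; otherwise fall back to NA
  if (length(vals) != 2 || anyNA(vals)) return(list(low = NA, high = NA))
  setNames(as.list(vals), c("low", "high"))
})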

Related

Calculate the average of the value of each row with each successive row in one column in R in loop

In R I need the average of the first and second values, then the first and third values, and so on; then the average of the second and first values, and so on, up to 96, because that's how many values my file has (in total, I need 9216 such averages). It would be good to do this in an automated way, e.g. in a loop.
Could you at least supply what a sample output would look like for the example that you specified? It was hard to follow your word problem with "etc ..." as part of the description.
You should always provide reproducible data, for example:
set.seed(42)
N <- 96
X <- round(runif(N, 1000, 9999))
str(X)
# num [1:96] 9232 9433 3575 8473 6775 ...
Now you want the mean of all possible pairs:
pairs <- expand.grid(seq(N), seq(N))
str(pairs)
# 'data.frame': 9216 obs. of 2 variables:
# $ Var1: int 1 2 3 4 5 6 7 8 9 10 ...
# $ Var2: int 1 1 1 1 1 1 1 1 1 1 ...
# - attr(*, "out.attrs")=List of 2
# ..$ dim : int [1:2] 96 96
# ..$ dimnames:List of 2
# .. ..$ Var1: chr [1:96] "Var1= 1" "Var1= 2" "Var1= 3" "Var1= 4" ...
# .. ..$ Var2: chr [1:96] "Var2= 1" "Var2= 2" "Var2= 3" "Var2= 4" ...
Now compute the means:
X.mn <- apply(pairs, 1, function(x) mean(c(X[x[1]], X[x[2]])))
str(X.mn)
# num [1:9216] 9232 9332 6404 8852 8004 ...
summary(X.mn)
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 1002 4349 5707 5709 7154 9899
Once you know how to use vectorization, you can often eliminate the need for a loop.
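For example, one fully vectorized way to get the same result (a sketch equivalent to the apply() call above) is to index X directly with the two columns of pairs:
X.mn2 <- (X[pairs$Var1] + X[pairs$Var2]) / 2   # element-wise, no apply() needed
all.equal(X.mn, X.mn2)                         # should be TRUE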

How to use an API in R to be able to get data for storing into a db?

I am trying to figure out how to get data in R for the purposes of making it into a table that I can store into a database like sql.
API <- "https://covidtrackerapi.bsg.ox.ac.uk/api/v2/stringency/date-range/{2020-01-01}/{2020-06-30}"
oxford_covid <- GET(API)
I then try to parse this data and make it into a dataframe, but when I do so I get these errors:
"Error: Columns 4, 5, 6, 7, 8, and 178 more must be named. Use .name_repair to specify repair."
and
"Error: Tibble columns must have compatible sizes. * Size 2: Columns deaths, casesConfirmed, and stringency. * Size 176: Columns ..2020.12.27, ..2020.12.28, ..2020.12.29, and"
I am not sure if there is a better approach or how to parse this. Is there a method or approach? I am not having much luck online.
It looks like you're trying to take the JSON returned from that API and call read.table or something on it. Don't do that; JSON should be parsed by JSON tools (such as jsonlite::parse_json).
Here's some work on that URL.
js <- jsonlite::parse_json(url("https://covidtrackerapi.bsg.ox.ac.uk/api/v2/stringency/date-range/2020-01-01/2020-06-30"))
lengths(js)
# scale countries data
# 3 183 182
str(js, max.level = 2, list.len = 3)
# List of 3
# $ scale :List of 3
# ..$ deaths :List of 2
# ..$ casesConfirmed:List of 2
# ..$ stringency :List of 2
# $ countries:List of 183
# ..$ : chr "ABW"
# ..$ : chr "AFG"
# ..$ : chr "AGO"
# .. [list output truncated]
# $ data :List of 182
# ..$ 2020-01-01:List of 183
# ..$ 2020-01-02:List of 183
# ..$ 2020-01-03:List of 183
# .. [list output truncated]
So this is rather large. Since you're hoping for a data.frame, I'm going to look at js$data only; js$countries looks relatively uninteresting,
str(unlist(js$countries))
# chr [1:183] "ABW" "AFG" "AGO" "ALB" "AND" "ARE" "ARG" "AUS" "AUT" "AZE" "BDI" "BEL" "BEN" "BFA" "BGD" "BGR" "BHR" "BHS" "BIH" "BLR" "BLZ" "BMU" "BOL" "BRA" "BRB" "BRN" "BTN" "BWA" "CAF" "CAN" "CHE" "CHL" "CHN" "CIV" "CMR" "COD" "COG" "COL" "CPV" ...
and does not correlate with the js$data. The js$scale might be interesting, but I'll skip it for now.
My first go-to for joining data like this into a data.frame is one of the following, depending on your preference for R dialects:
do.call(rbind.data.frame, list_of_frames) # base R
dplyr::bind_rows(list_of_frames) # tidyverse
data.table::rbindlist(list_of_frames) # data.table
But we're going to run into problems. Namely, there are entries that are NULL, when R would prefer that they be something (such as NA).
str(js$data[[1]][1])
# List of 2
# $ ABW:List of 8
# ..$ date_value : chr "2020-01-01"
# ..$ country_code : chr "ABW"
# ..$ confirmed : NULL # <--- problem
# ..$ deaths : NULL
# ..$ stringency_actual : int 0
# ..$ stringency : int 0
# ..$ stringency_legacy : int 0
# ..$ stringency_legacy_disp: int 0
So we need to iterate over each of those and replace NULL with NA. Unfortunately, I don't know of an easy tool to recursively go through lists of lists (even rapply doesn't work well in my tests), so we'll be a little brute-force here with a triple-lapply:
Long-story-short,
str(js$data[[1]][[1]])
# List of 8
# $ date_value : chr "2020-01-01"
# $ country_code : chr "ABW"
# $ confirmed : NULL
# $ deaths : NULL
# $ stringency_actual : int 0
# $ stringency : int 0
# $ stringency_legacy : int 0
# $ stringency_legacy_disp: int 0
jsdata <-
  lapply(js$data, function(z) {
    lapply(z, function(y) {
      lapply(y, function(x) if (is.null(x)) NA else x)
    })
  })
str(jsdata[[1]][[1]])
# List of 8
# $ date_value : chr "2020-01-01"
# $ country_code : chr "ABW"
# $ confirmed : logi NA
# $ deaths : logi NA
# $ stringency_actual : int 0
# $ stringency : int 0
# $ stringency_legacy : int 0
# $ stringency_legacy_disp: int 0
(Technically, if we know that it's going to be integers, we should use NA_integer_. Fortunately, R and its dialects are able to work with this shortcut, as we'll see in a second.)
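As a quick illustration of that coercion (a trivial check with made-up values, not specific to this data), a logical NA combined with integers is silently promoted to an integer NA:
str(c(NA, 3L))
# int [1:2] NA 3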
After that, we can do a double-dive rbinding and get back to the frame-making I discussed a couple of steps ago. Choose one of the following, whichever dialect you prefer:
alldat <- do.call(rbind.data.frame,
                  lapply(jsdata, function(z) do.call(rbind.data.frame, z)))
alldat <- dplyr::bind_rows(purrr::map(jsdata, dplyr::bind_rows))
alldat <- data.table::rbindlist(lapply(jsdata, data.table::rbindlist))
For simplicity, I'll show the first (base R) version:
tail(alldat)
#                date_value country_code confirmed deaths stringency_actual stringency stringency_legacy stringency_legacy_disp
# 2020-06-30.AND 2020-06-30          AND       855     52             42.59      42.59             65.47                  65.47
# 2020-06-30.ARE 2020-06-30          ARE     48667    315             72.22      72.22             83.33                  83.33
# 2020-06-30.AGO 2020-06-30          AGO       284     13             75.93      75.93             83.33                  83.33
# 2020-06-30.ALB 2020-06-30          ALB      2535     62             68.52      68.52             78.57                  78.57
# 2020-06-30.ABW 2020-06-30          ABW       103      3             47.22      47.22             63.09                  63.09
# 2020-06-30.AFG 2020-06-30          AFG     31507    752             78.70      78.70             76.19                  76.19
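Since the stated goal was to store this in a SQL database, a minimal sketch of that final step (assuming the DBI and RSQLite packages; the file name and table name are placeholders) might be:
library(DBI)
con <- dbConnect(RSQLite::SQLite(), "covid_stringency.sqlite")  # any DBI backend works similarly
dbWriteTable(con, "stringency", alldat, overwrite = TRUE)
dbDisconnect(con)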
And if you're curious about the $scale,
do.call(rbind.data.frame, js$scale)
# min max
# deaths 0 127893
# casesConfirmed 0 2633466
# stringency 0 100
## or
data.table::rbindlist(js$scale, idcol="id")
# id min max
# <char> <int> <int>
# 1: deaths 0 127893
# 2: casesConfirmed 0 2633466
# 3: stringency 0 100
## or
dplyr::bind_rows(js$scale, .id = "id")

Adding a suffix to names when storing results in a loop

I am making some plots in R in a for-loop and would like to store them using a name to describe the function being plotted, but also which data it came from.
So when I have a list of 2 data sets "x" and "y" and the loop has a structure like this:
x = matrix(
  c(1,2,4,5,6,7,8,9),
  nrow = 3,
  ncol = 2)
y = matrix(
  c(20,40,60,80,100,120,140,160,180),
  nrow = 3,
  ncol = 2)
data <- list(x, y)
for (i in data){
  ??? <- boxplot(i)
}
I would like the ??? to be a name built from "plot", a "_" separator, and the name of the data set i. In this case the two plots would be called "plot_x" and "plot_y".
I tried some stuff with paste("plot", names(i), sep = "_"), but I'm not sure if this is what to use, or where and how to use it in this scenario.
We can create an empty list with the same length as 'data' and then store the corresponding output from the for loop by looping over the sequence of 'data':
out <- vector('list', length(data))
for(i in seq_along(data)) {
  out[[i]] <- boxplot(data[[i]])
}
str(out)
# List of 2
#  $ :List of 6
#   ..$ stats: num [1:5, 1:2] 1 1.5 2 3 4 5 5.5 6 6.5 7
#   ..$ n    : num [1:2] 3 3
#   ..$ conf : num [1:2, 1:2] 0.632 3.368 5.088 6.912
#   ..$ out  : num(0)
#   ..$ group: num(0)
#   ..$ names: chr [1:2] "1" "2"
#  $ :List of 6
#   ..$ stats: num [1:5, 1:2] 20 30 40 50 60 80 90 100 110 120
#   ..$ n    : num [1:2] 3 3
#   ..$ conf : num [1:2, 1:2] 21.8 58.2 81.8 118.2
#   ..$ out  : num(0)
#   ..$ group: num(0)
#   ..$ names: chr [1:2] "1" "2"
If required, set the names of the list elements with the object names:
names(out) <- paste0("plot_", c("x", "y"))
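The stored results can then be pulled out by name, e.g.:
out[["plot_x"]]$stats   # the five boxplot statistics for the x data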
It is better not to create multiple objects in the global environment. Instead, as shown above, place the objects in a list.
akrun is right: you should try to avoid setting names in the global environment. But if you really have to, you can try this:
> y = matrix(c(20,40,60,80,100,120,140,160,180),ncol=1)
> .GlobalEnv[[paste0("plot_","y")]] <- boxplot(y)
> str(plot_y)
List of 6
$ stats: num [1:5, 1] 20 60 100 140 180
$ n : num 9
$ conf : num [1:2, 1] 57.9 142.1
$ out : num(0)
$ group: num(0)
$ names: chr "1"
You can read up on .GlobalEnv by typing ?.GlobalEnv into the R command prompt.
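An equivalent way to do the same thing (a small sketch with the same y as above) is assign():
assign(paste0("plot_", "y"), boxplot(y), envir = .GlobalEnv)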

Calculation of Geometric Mean of Data that includes NAs

EDIT: The problem was not with the geoMean function, but with a wrong use of aggregate(), as explained in the comments.
I am trying to calculate the geometric mean of multiple measurements for several different species, which includes NAs. An example of my data looks like this:
species <- c("Ae", "Ae", "Ae", "Be", "Be")
phen <- c(2, NA, 3, 1, 2)
hveg <- c(NA, 15, 12, 60, 59)
df <- data.frame(species, phen, hveg)
When I try to calculate the geometric mean for the species Ae with the function geoMean from the package EnvStats like this
library("EnvStats")
aggregate(df[, 2:3], list(df$species), geoMean, na.rm = TRUE)
it works wonderfully and skips the NAs to give me the geometric means per species:
Group.1 phen hveg
1 Ae 4.238536 50.555696
2 Be 1.414214 1.414214
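For reference, geoMean() on its own also skips NAs when na.rm = TRUE; a quick check with made-up numbers:
library(EnvStats)
geoMean(c(2, NA, 3), na.rm = TRUE)
# [1] 2.44949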
When I do this with my large dataset, however, the function stumbles over NAs and returns NA as the result, even though there are, e.g., 10 numerical values and only one NA. This happens, for example, with the column SLA_mm2/mg.
My large data set looks like this:
> str(cut2trait1)
Classes ‘tbl_df’, ‘tbl’ and 'data.frame': 22 obs. of 19 variables:
$ Cut : chr "15_08" "15_08" "15_08" "15_08" ...
$ Block : num 1 1 1 1 1 1 1 1 1 1 ...
$ ID : num 451 512 431 531 591 432 551 393 511 452 ...
$ Plot : chr "1_1" "1_1" "1_1" "1_1" ...
$ Grazing : chr "n" "n" "n" "n" ...
$ Acro : chr "Leuc.vulg" "Dact.glom" "Cirs.arve" "Trif.prat" ...
$ Sp : chr "Lv" "Dg" "Ca" "Tp" ...
$ Label_neu : chr "Lv021" "Dg022" "Ca021" "Tp021" ...
$ PlantFunctionalType: chr "forb" "grass" "forb" "forb" ...
$ PlotClimate : chr "AC" "AC" "AC" "AC" ...
$ Season : chr "Aug" "Aug" "Aug" "Aug" ...
$ Year : num 2015 2015 2015 2015 2015 ...
$ Tiller : num 6 3 3 5 6 8 5 2 1 7 ...
$ Hveg : num 25 38 70 36 68 65 23 58 71 27 ...
$ Hrep : num 39 54 77 38 76 70 65 88 98 38 ...
$ Phen : num 8 8 7 8 8 7 6.5 8 8 8 ...
$ SPAD : num 40.7 42.4 48.7 43 31.3 ...
$ TDW_in_g : num 4.62 4.85 11.86 5.82 8.99 ...
$ SLA_mm2/mg : num 19.6 19.8 20.3 21.2 21.7 ...
and the result of my code
gm_cut2trait1 <- aggregate(cut2trait1[, 13:19], list(cut2trait1$Sp), geoMean, na.rm=TRUE)
is (only the first two rows):
  Group.1    Tiller     Hveg      Hrep     Phen     SPAD  TDW_in_g SLA_mm2/mg
1      Ae 13.521721 73.43485 106.67933       NA 28.17698 1.2602475         NA
2      Be  8.944272 43.95452  72.31182 5.477226 20.08880 0.7266361   9.309672
Here, the geometric mean of SLA for Ae is NA, even though there are 9 numeric measurements and only one NA in the column used to calculate the geometric mean.
I tried to use the geometric mean function suggested here:
Geometric Mean: is there a built-in?
But instead of NAs, this returned the value 1.000 when used with my big dataset, which doesn't solve my problem.
So my question is: What is the difference between my example df and the big dataset that throws the geoMean function off the rails?

converting string to numeric in R

I have a problem regarding data conversion using R language.
I have two data that being stored in variables named lung.X and lung.y, below are the description of my data.
> str(lung.X)
chr [1:86, 1:7129] " 170.0" " 104.0" " 53.7" " 119.0" " 105.5" " 130.0" ...
- attr(*, "dimnames")=List of 2
..$ : chr [1:86] "V3" "V4" "V5" "V6" ...
..$ : chr [1:7129] "A28102_at" "AB000114_at" "AB000115_at" "AB000220_at" ...
and
> str(lung.y)
num [1:86] -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
lung.X is a matrix (row: 86 col: 7129) and lung.y is an array of numbers (86 entries)
Do anyone know how to convert above data into the format below?
> str(lung.X)
num [1:86, 1:7129] 170 104 53.7 119 105.5 130...
I thought I should do like this
lung.X <- as.numeric(lung.X)
but I got this instead
> str(lung.X)
num [1:613094] 170 104 53.7 119 105.5 130...
The reason for doing this is that I need lung.X to be numeric only.
Thank you.
You could change the mode of your matrix to numeric:
## example data
m <- matrix(as.character(1:10), nrow = 2,
            dimnames = list(c("R1", "R2"), LETTERS[1:5]))
m
# A B C D E
# R1 "1" "3" "5" "7" "9"
# R2 "2" "4" "6" "8" "10"
str(m)
# chr [1:2, 1:5] "1" "2" "3" "4" ...
# - attr(*, "dimnames")=List of 2
# ..$ : chr [1:2] "R1" "R2"
# ..$ : chr [1:5] "A" "B" "C" "D" ...
# NULL
mode(m) <- "numeric"
str(m)
# num [1:2, 1:5] 1 2 3 4 5 6 7 8 9 10
# - attr(*, "dimnames")=List of 2
# ..$ : chr [1:2] "R1" "R2"
# ..$ : chr [1:5] "A" "B" "C" "D" ...
# NULL
m
# A B C D E
# R1 1 3 5 7 9
# R2 2 4 6 8 10
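Applied to the original data, that would simply be (unlike as.numeric(), this keeps the dim and dimnames attributes):
mode(lung.X) <- "numeric"
str(lung.X)
# num [1:86, 1:7129] 170 104 53.7 119 105.5 130 ...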
Give this a try: m <- matrix(as.numeric(lung.X), nrow = 86, ncol = 7129)
If you need it in dataframe/list format, df <- data.frame(m)
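One caveat with this approach: matrix() as written drops the row and column names. If you need them, a sketch that carries them over:
m <- matrix(as.numeric(lung.X), nrow = nrow(lung.X),
            dimnames = dimnames(lung.X))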
