How to get an element from a text string in R - r

> my_data <- "08,23,02.06.2022,5,7,THISPRODUCT,09.02.2022,yes,89,25"
> lengths(gregexpr(",", my_data))+1
[1] 10
I need to get each element individually. I tried with
print(gregexpr(",", my_data))[[1]][1]
> print(gregexpr(",", my_data))[[1]][1]
[[1]]
[1] 3 6 17 19 21 33 44 48 51
attr(,"match.length")
[1] 1 1 1 1 1 1 1 1 1
attr(,"index.type")
[1] "chars"
attr(,"useBytes")
[1] TRUE
[1] 3
However, the first element of my_data is "08", yet this displays 3. Can anyone give me the correct syntax to display every element?

library(tidyverse)
strings <- "08,23,02.06.2022,5,7,THISPRODUCT,09.02.2022,yes,89,25" %>%
str_split(pattern = ",") %>%
unlist()
strings[1]
#> [1] "08"
Created on 2022-06-29 by the reprex package (v2.0.1)

Let's try scan
> scan(text = my_data, what = "",sep = ",",quiet = TRUE)
[1] "08" "23" "02.06.2022" "5" "7"
[6] "THISPRODUCT" "09.02.2022" "yes" "89" "25"

Using lapply:
lapply(strsplit(my_data, ","), `[`)
Output:
[[1]]
[1] "08" "23" "02.06.2022" "5" "7" "THISPRODUCT" "09.02.2022" "yes"
[9] "89" "25"

You can simply do:
unlist(strsplit(my_data, split = ","))

Related

How to compare vectors with different structures

I have two vectors (fo, fo2) and I would like to compare if the numbers are matching between them (such as with intersect(fo,fo2)).
However, fo and fo2 can't be compared directly. fo is numeric (each element is typed into c() ) while fo2 is read from a string such as "1 3 6 7 8 10 11 13 14 15".
The output of the vectors are produced here for illustration. Any help is greatly appreciated!
# fo is a vector
> fo <- c(1,3,6,7,8,9,10,11)
> fo
[1] 1 3 6 7 8 9 10 11
> is.vector(fo)
[1] TRUE
# fo2 is also a vector
> library(stringr)
> fo2 <- str_split("1 3 6 7 8 10 11 13 14 15", " ")
> fo2
[[1]]
[1] "1" "3" "6" "7" "8" "10" "11" "13" "14" "15"
> is.vector(fo2)
[1] TRUE
> intersect(fo,fo2)
list()
Here fo2 is a list while fo is an atomic vector, so to get the intersection you need to extract the list's first element, e.g.
intersect(fo , fo2[[1]])
#> [1] "1" "3" "6" "7" "8" "10" "11"
To learn the difference, see Vectors.
Another option:
fo %in% fo2[[1]]
Output:
[1] TRUE TRUE TRUE TRUE TRUE FALSE TRUE TRUE
Check with setdiff:
setdiff(fo, fo2[[1]])
Output:
[1] 9

Creating a matrix by splitting a vector

The string:
f <- c("20-04-2018","15-07-2021","11-11-2022","08-12-2021","28-01-2020")
Allocate one column for the day, month and year.
Anyone who knows what code to use to solve this question?
One way:
> f <-c("20-04-2018","15-07-2021","11-11-2022","08-12-2021","28-01-2020")
> do.call(rbind,strsplit(f, "-"))
[,1] [,2] [,3]
[1,] "20" "04" "2018"
[2,] "15" "07" "2021"
[3,] "11" "11" "2022"
[4,] "08" "12" "2021"
[5,] "28" "01" "2020"
>
Another, better, way using date functions:
> D <- data.frame(date=as.Date(f, "%d-%m-%Y"))
> D$year <- as.integer(format(D$date, "%Y"))
> D$month <- as.integer(format(D$date, "%m"))
> D$day <- as.integer(format(D$date, "%d"))
> D
date year month day
1 2018-04-20 2018 4 20
2 2021-07-15 2021 7 15
3 2022-11-11 2022 11 11
4 2021-12-08 2021 12 8
5 2020-01-28 2020 1 28
>
Yet another way:
f |>
strsplit(split = "-") |>
unlist() |>
matrix(ncol = 3, byrow = TRUE)
[,1] [,2] [,3]
[1,] "20" "04" "2018"
[2,] "15" "07" "2021"
[3,] "11" "11" "2022"
[4,] "08" "12" "2021"
[5,] "28" "01" "2020"
Base R using regex and strcapture():
as.matrix(
strcapture(
pattern = "^(\\d{2})\\-(\\d{2})\\-(\\d{4})$",
x = f,
proto = list(
day = integer(),
month = integer(),
year = integer()
)
)
)
Base R option 2:
type.convert(
simplify2array(
within(
data.frame(f_date = as.Date(f, "%d-%m-%Y")),
{
day <- strftime(f_date, "%d")
month <- strftime(f_date, "%m")
year <- strftime(f_date, "%Y")
f_date <- NULL
}
),
higher = FALSE
),
as.is = TRUE
)

combining elements of vectors in pairs

My data frame:
df <- structure(list(g1 = 1:12, g2 = c(3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 11L, 12L, 13L, 67L)), class = "data.frame", row.names = c(NA,
-12L))
I would like to combine 2 columns of my data frame and get a list of vectors
What I want to get:
list(c("1","3"),c("2","4"),c("3","5"),c("4","6"),c("5","7"),c("6","8"),c("7","9"),c("8","10"),c("9","11"),c("10","12"),c("11","13"),c("12","67"))
What I tried:
mark.list <- list()
for(i in 1:length(bact$group1)){
x <- bact$group1[i]
y <- bact$group2[i]
df <- combn(paste(x,y))
mark.list <- c(mark.list,df)
}
Another base R option
> unclass(data.frame(t(df)))
$X1
[1] 1 3
$X2
[1] 2 4
$X3
[1] 3 5
$X4
[1] 4 6
$X5
[1] 5 7
$X6
[1] 6 8
$X7
[1] 7 9
$X8
[1] 8 10
$X9
[1] 9 11
$X10
[1] 10 12
$X11
[1] 11 13
$X12
[1] 12 67
attr(,"row.names")
[1] "g1" "g2"
If you want to have output with characters, you can try
> strsplit(do.call(paste, df), " ")
[[1]]
[1] "1" "3"
[[2]]
[1] "2" "4"
[[3]]
[1] "3" "5"
[[4]]
[1] "4" "6"
[[5]]
[1] "5" "7"
[[6]]
[1] "6" "8"
[[7]]
[1] "7" "9"
[[8]]
[1] "8" "10"
[[9]]
[1] "9" "11"
[[10]]
[1] "10" "12"
[[11]]
[1] "11" "13"
[[12]]
[1] "12" "67"
Here are a couple of base R options:
asplit
asplit(df, 1)
transpose and as.list.
t(df) |> as.data.frame() |> as.list() |> unname()
ls=list(c("1","3"),c("2","4"),c("3","5"),c("4","6"),c("5","7"),c("6","8"),c("7","9"),c("8","10"),c("9","11"),c("10","12"),c("11","13"),c("12","67"))
# Build one character vector c(g1, g2) per row of df and collect them in a list.
# The result is preallocated and the loop uses seq_len(), so a zero-row df
# yields an empty list instead of iterating over c(1, 0).
# NOTE: the name `ls` masks base::ls() in this session; it is kept because the
# output below prints `ls`.
ls <- vector("list", nrow(df))
for (k in seq_len(nrow(df))) {
  ls[[k]] <- c(as.character(df$g1[k]), as.character(df$g2[k]))
}
output
> ls
[[1]]
[1] "1" "3"
[[2]]
[1] "2" "4"
[[3]]
[1] "3" "5"
[[4]]
[1] "4" "6"
[[5]]
[1] "5" "7"
[[6]]
[1] "6" "8"
[[7]]
[1] "7" "9"
[[8]]
[1] "8" "10"
[[9]]
[1] "9" "11"
[[10]]
[1] "10" "12"
[[11]]
[1] "11" "13"
[[12]]
[1] "12" "67"
Using pmap
library(purrr)
pmap(df, ~ unname(c(...)))
-output
[[1]]
[1] 1 3
[[2]]
[1] 2 4
[[3]]
[1] 3 5
[[4]]
[1] 4 6
[[5]]
[1] 5 7
[[6]]
[1] 6 8
[[7]]
[1] 7 9
[[8]]
[1] 8 10
[[9]]
[1] 9 11
[[10]]
[1] 10 12
[[11]]
[1] 11 13
[[12]]
[1] 12 67

How to get intervals as a list of vectors?

I have a numeric vector and I need to get the intervals as a list of vectors.
I thought it was easy but I'm really struggling to find a good, simple way.
A bad, complex way would be to paste the vector and its lag, and then split the result.
Here is the working but ugly reprex:
library(tidyverse)
xx = c(1, 5, 10 ,15 ,20)
paste0(lag(xx), "-", xx-1) %>% str_split("-") #nevermind the first one, it cannot really make sense anyway
#> [[1]]
#> [1] "NA" "0"
#>
#> [[2]]
#> [1] "1" "4"
#>
#> [[3]]
#> [1] "5" "9"
#>
#> [[4]]
#> [1] "10" "14"
#>
#> [[5]]
#> [1] "15" "19"
Created on 2020-09-06 by the reprex package (v0.3.0)
Is there a cleaner way to do the same thing?
You can use Map :
Map(c, xx[-length(xx)], xx[-1] - 1)
#[[1]]
#[1] 1 4
#[[2]]
#[1] 5 9
#[[3]]
#[1] 10 14
#[[4]]
#[1] 15 19
We can also use lapply iterating over the length of the variable.
lapply(seq_along(xx[-1]), function(i) c(xx[i], xx[i+1] - 1))
We can use map2 from purrr
library(purrr)
map2(xx[-length(xx)], xx[-1] -1, c)

Find the min and max from an unstructured array

I have the following vector and it shows the possible values that a variable can take. As you can see, it's not user-friendly and I'm having a hard time finding a systematic way of going through and identifying the min and max values. Does anyone have any suggestions?
[211] "-1\n1-960" "-1\n1-960"
[213] "-1\n1-960" "-1\n1\n2\n3"
[215] "-1\n0\n1\n\n2\n3\n\n4\n\n5" "-1\nF\nG\nH\nP\nR\nS\nU"
[217] "-1\n0\n1\n2\n3" "-1\n0\n1"
[219] "-1\n0\n1\n2\n3\n4\n5\n6" "-1\n0-255"
[221] "-1\n0-255" "-1\n0-255"
[223] "-1\n0-255" "-1\n0-255"
[225] "-1\n0\n0.01–0.99\n1\n1.01–99.99" "-1\n0\n1\n2\n3\n4\n5\n\n6\n\n7\n8\n\n9\n10\n11\n12"
[227] "-1\n0\n1\n\n2\n\n3\n4\n5\n\n6" "-1\n0\n1\n2\n\n3\n\n4\n5\n6"
The value "-1\n1-960" refers to the possible range of values being between 1 and 960. -1 doesn't mean anything and should be disregarded, along with all letters.
For example:
"-1\n1-960"
"-1\n0\n1\n\n2\n\n3\n4\n5\n\n6" "-1\n0\n1\n2\n\n3\n\n4\n5\n6"
Should result in:
max min
960 1
6 0
6 0
After removing the leading -1, you can split on newlines. Then, since a - means a range, you can also split on - characters, as the two numbers give the min and max of the range. So here's some code:
# For every element of dat, return c(min, max) of the numeric values it lists.
# Elements with no numeric values at all (letters only) give c(NA, NA).
lapply(
  strsplit(
    # drop the leading "-1" marker, which carries no information
    gsub("^-1\n", "", dat),
    # split on newlines and on range separators: hyphen and en dash (U+2013)
    "\n|[-\u2013]"
  ),
  function(x) {
    # Keep only numeric tokens: this removes the "" produced by blank lines and
    # any letters. Convert before calling range(), because range() on strings
    # compares alphabetically (e.g. "9" > "12").
    x <- as.numeric(x[grepl("^[0-9]+(\\.[0-9]+)?$", x)])
    if (length(x) == 0L) c(NA_real_, NA_real_) else range(x)
  }
)
[[1]]
[1] 1 960
[[2]]
[1] 1 960
[[3]]
[1] 1 960
[[4]]
[1] 1 3
[[5]]
[1] 0 5
[[6]]
[1] NA NA
[[7]]
[1] 0 3
[[8]]
[1] 0 1
[[9]]
[1] 0 6
[[10]]
[1] 0 255
[[11]]
[1] 0 255
[[12]]
[1] 0 255
[[13]]
[1] 0 255
[[14]]
[1] 0 255
[[15]]
[1] 0.00 99.99
[[16]]
[1] 0 12
[[17]]
[1] 0 6
[[18]]
[1] 0 6
Expanding my comment with additional code which might or might not be a partial answer:
I'm guessing that -255 is some sort of missing value marker. Some of those character values (at the moment) could be parsed in R as "numeric" values, but others would throw an error if you tried to parse them as such. What were you expecting from 1-960? That's an expression, so neither numeric nor character.
# Example input: each element lists the allowed values of one variable,
# separated by newlines; some entries are ranges such as "1-960".
dat <- c( "-1\n1-960" , "-1\n1-960",
"-1\n1-960" , "-1\n1\n2\n3" ,
"-1\n0\n1\n\n2\n3\n\n4\n\n5" , "-1\nF\nG\nH\nP\nR\nS\nU",
"-1\n0\n1\n2\n3" , "-1\n0\n1" ,
"-1\n0\n1\n2\n3\n4\n5\n6" , "-1\n0-255" ,
"-1\n0-255" , "-1\n0-255" ,
"-1\n0-255" , "-1\n0-255" ,
"-1\n0\n0.01–0.99\n1\n1.01–99.99" , "-1\n0\n1\n2\n3\n4\n5\n\n6\n\n7\n8\n\n9\n10\n11\n12" ,
"-1\n0\n1\n\n2\n\n3\n4\n5\n\n6" , "-1\n0\n1\n2\n\n3\n\n4\n5\n6" )
# Try to read every element as numbers. Elements scan() cannot parse (ranges,
# letters) come back as "try-error" objects instead of stopping the run.
# scan(text = ) manages its own connection, so no text connection is left open
# when a parse fails. simplify = FALSE makes the result a list named by the
# input strings regardless of what parses.
scandat <- sapply(
  dat,
  function(x) try(scan(text = x, quiet = TRUE)),
  simplify = FALSE
)
# Lots of error messages, but wrapping the scan call in try lets it continue
# So these are the items that could be parsed as numeric:
> scandat[ sapply(scandat,class)=="numeric" ]
$`-1\n1\n2\n3`
[1] -1 1 2 3
$`-1\n0\n1\n\n2\n3\n\n4\n\n5`
[1] -1 0 1 2 3 4 5
$`-1\n0\n1\n2\n3`
[1] -1 0 1 2 3
$`-1\n0\n1`
[1] -1 0 1
$`-1\n0\n1\n2\n3\n4\n5\n6`
[1] -1 0 1 2 3 4 5 6
$`-1\n0\n1\n2\n3\n4\n5\n\n6\n\n7\n8\n\n9\n10\n11\n12`
[1] -1 0 1 2 3 4 5 6 7 8 9 10 11 12
$`-1\n0\n1\n\n2\n\n3\n4\n5\n\n6`
[1] -1 0 1 2 3 4 5 6
$`-1\n0\n1\n2\n\n3\n\n4\n5\n6`
[1] -1 0 1 2 3 4 5 6
I'm not cleaning this up, but you could replace the funky names with something else and it would print better:
> sapply( scandat[ sapply(scandat,class)=="numeric" ], function(x) list(minx=min(x), maxx=max(x) )
+ )
-1\n1\n2\n3 -1\n0\n1\n\n2\n3\n\n4\n\n5 -1\n0\n1\n2\n3 -1\n0\n1 -1\n0\n1\n2\n3\n4\n5\n6
minx -1 -1 -1 -1 -1
maxx 3 5 3 1 6
-1\n0\n1\n2\n3\n4\n5\n\n6\n\n7\n8\n\n9\n10\n11\n12 -1\n0\n1\n\n2\n\n3\n4\n5\n\n6 -1\n0\n1\n2\n\n3\n\n4\n5\n6
minx -1 -1 -1
maxx 12 6 6

Resources