splitting a vector with strings - r

I have a vector, which is filled with strings of length 2:
vec <-c( "00", "10", "00", "01", "11", "11", "10", "00",...)
I now want to split the strings of the vector into strings of length 1:
result <- "0", "0", "1", "0", "0", "0", "0",...)
The last step is to merge the vector entries to strings of length 8:
qpsk <- "00100001", "11111000",...)
I know there are the functions strsplit and paste, but I don't know how to use them in this case.
Is there an efficient way to do this?
Thanks

Try
# Concatenate all elements of vec into one string, then split that string
# after every 8 characters. The pattern is a zero-width lookbehind, so no
# characters are consumed by the split; lookbehind requires perl = TRUE.
strsplit(paste(vec, collapse=''), '(?<=.{8})', perl=TRUE)[[1]]
#[1] "00100001" "11111000" "0100"
data
vec <-c( "00", "10", "00", "01", "11", "11", "10", "00", "01", "00")

Related

How to convert a range of columns from Character to Number/Integer in R

I am trying to convert a few columns, which form a contiguous range, from character to integer. I don't want to write as.integer for each column separately.
I am trying to find a more effective way where I can pass the column names which I want to convert and then convert them into integer.
Is this doable in R? Or should I do it one column after the other?
The Expected output:
Convert a range of data which is in char to Integer.
Convert a few columns without using passing them as range but rather as individual columns.
The code I wrote is given below:
library(readxl)
Final <- read_excel("C:/X/X/X- X/Desktop/Final.xlsx")
First_Date <- colnames(Final)[4]
Last_Date <- tail(colnames(Final),1)
str(Final)
Final <- Final %>%
mutate_if(c(First_Date:Last_Date),as.numeric)
The data I am working with is given below:
structure(list(UniqueID = c("3F-FA|807905", "3F-FA|808005", "3F-FA|808006",
"3F-FA|808007", "Py_AuAriFa|761403", "3F-FA|761502", "AutoTheta|761602",
"3F-FA|318901", "3F-FA|339401"), Xreg = c("3F-FA", "3F-FA", "3F-FA",
"3F-FA", "Py_AuAriFa", "3F-FA", "AutoTheta", "3F-FA", "3F-FA"
), Row = c("807905", "808005", "808006", "808007", "761403",
"761502", "761602", "318901", "339401"), `2023-02-01` = c("0",
"0", "0", "0", "50", "1", "7", "0", "0"), `2023-03-01` = c("0",
"0", "0", "0", "32", "1", "7", "0", "0"), `2023-04-01` = c("0",
"0", "0", "0", "36", "1", "7", "0", "0"), `2023-05-01` = c("0",
"0", "0", "0", "41", "1", "7", "0", "0"), `2023-06-01` = c("0",
"0", "0", "0", "31", "1", "6", "0", "0"), `2023-07-01` = c("0",
"0", "0", "0", "38", "1", "6", "0", "0"), `2023-08-01` = c("0",
"0", "0", "0", "34", "1", "6", "0", "0"), `2023-09-01` = c("0",
"0", "0", "0", "32", "1", "6", "0", "0"), `2023-10-01` = c("0",
"0", "0", "0", "35", "1", "5", "0", "0")), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -9L))
The columns I am trying to convert are those from 2023-02-01 to 2023-10-01. I can't use mutate_if and pass it the whole data frame, because the column Row holds character data that could be converted to integer but should not be converted. Hence the selected few columns.
We can match a pattern in the column names to loop over those columns and modify their class:
library(dplyr)
Final <- Final %>%
mutate(across(matches("^\\d{4}-\\d{2}-\\d{2}$"), as.integer))
Or use the `:` operator to select the range of columns by name:
Final <- Final %>%
mutate(across("2023-02-01":"2023-10-01", as.integer))

Column retrieved from database must be cast to numeric to be usable

If I retrieve the data from a database (a MySQL database with either RMariaDB or ODBC) I get errors while using the data as-is with multiple R functions (hist, boxplot, but not sd or summary):
Error in hist.default(lockout_per_hour$alarm_count) :
some 'x' not counted; maybe 'breaks' do not span range of 'x'
In addition: Warning message:
In pretty.default(range(x), n = breaks, min.n = 1) :
Internal(pretty()): very small range.. corrected
If I just export that same data to a CSV file and import it in RStudio everything works, otherwise if I want to use the data from the database I have to cast it to numeric.
As requested, code:
library(DBI);
db <- DBI::dbConnect(odbc::odbc(), 'my-dns');
q_perHour = "SELECT
DATE_FORMAT(MIN(timestamp), '%H') hour, COUNT(*) count
FROM alarm
GROUP BY YEAR(timestamp), MONTH(timestamp), DAY(timestamp), HOUR(timestamp)
LIMIT 100";
rs = dbSendQuery(db, q_perHour);
data <- dbFetch(rs);
hist(data$count); # KO
sd(data$count); # OK
dput output:
structure(list(hour = c("18", "19", "20", "21", "22", "23", "00",
"01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11",
"12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22",
"23", "00", "01", "02", "03", "04", "05", "06", "07", "08", "09",
"10", "11", "12", "13", "14", "15", "16", "17", "18", "19", "20",
"21", "22", "23", "00", "01", "02", "03", "04", "05", "06", "07",
"08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18",
"19", "20", "21", "22", "23", "00", "01", "02", "03", "04", "05",
"06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16",
"17", "18", "19", "20", "21"), count = structure(c(2.47032822920623e-323,
4.94065645841247e-323, 3.95252516672997e-323, 3.95252516672997e-323,
3.45845952088873e-323, 3.95252516672997e-323, 8.39911597930119e-323,
1.48219693752374e-323, 3.95252516672997e-323, 3.45845952088873e-323,
5.92878775009496e-323, 5.92878775009496e-323, 4.94065645841247e-323,
5.43472210425371e-323, 2.47032822920623e-323, 1.97626258336499e-323,
5.43472210425371e-323, 5.43472210425371e-323, 4.44659081257122e-323,
9.38724727098368e-323, 5.92878775009496e-323, 6.91691904177745e-323,
6.42285339593621e-323, 2.47032822920623e-323, 4.94065645841247e-323,
8.89318162514244e-323, 4.44659081257122e-323, 8.39911597930119e-323,
1.08694442085074e-322, 1.33397724377137e-322, 2.02566914794911e-322,
1.13635098543487e-322, 1.24010477106153e-321, 9.40700989681733e-321,
1.43279037293961e-322, 1.67982319586024e-322, 1.08694442085074e-322,
4.44659081257122e-323, 7.90505033345994e-323, 5.92878775009496e-323,
7.4109846876187e-323, 6.91691904177745e-323, 8.89318162514244e-323,
5.92878775009496e-323, 9.88131291682493e-323, 7.90505033345994e-323,
9.38724727098368e-323, 1.18575755001899e-322, 7.4109846876187e-323,
1.23516411460312e-322, 1.23516411460312e-322, 1.13635098543487e-322,
1.72922976044436e-322, 1.28457067918724e-322, 1.67982319586024e-322,
1.72922976044436e-322, 9.38724727098368e-323, 2.12448227711736e-322,
2.99403781379795e-321, 1.13635098543487e-322, 1.13635098543487e-322,
7.90505033345994e-323, 8.39911597930119e-323, 9.38724727098368e-323,
7.4109846876187e-323, 6.91691904177745e-323, 5.92878775009496e-323,
8.89318162514244e-323, 6.42285339593621e-323, 6.91691904177745e-323,
1.13635098543487e-322, 7.90505033345994e-323, 1.67982319586024e-322,
2.27270197086973e-322, 1.87744945419674e-322, 7.90505033345994e-323,
1.43279037293961e-322, 8.89318162514244e-323, 1.13635098543487e-322,
1.23516411460312e-322, 1.03753785626662e-322, 1.28457067918724e-322,
1.03753785626662e-322, 7.4109846876187e-323, 9.88131291682493e-323,
1.08694442085074e-322, 3.45845952088873e-323, 7.4109846876187e-323,
4.44659081257122e-323, 4.94065645841247e-323, 3.45845952088873e-323,
2.96439387504748e-323, 5.43472210425371e-323, 5.43472210425371e-323,
7.90505033345994e-323, 6.91691904177745e-323, 5.43472210425371e-323,
7.90505033345994e-323, 8.39911597930119e-323, 7.11454530011395e-322
), class = "integer64")), class = "data.frame", row.names = c(NA,
-100L))
As suggested, the issue is remediated if I change the connection to:
db <- DBI::dbConnect(odbc::odbc(), 'my-dns', bigint='numeric');
It seems the class "integer64" does not work well with the hist() function. Try modifying both variables to numeric:
library(dplyr)
data = mutate(data, hour = as.numeric(hour), count = as.numeric(count))
This works, although a warning is thrown for hist(data$count):
Warning messages: 1: In pretty.default(range(x), n = breaks, min.n =
: Internal(pretty()): very small range.. corrected 2: In
plot.window(xlim, ylim, "", ...) : Internal(pretty()): very small
range.. corrected
This warning seems to be connected to the data itself, though.
Also, you can try using the bigint argument in dbConnect() set to "numeric". This governs how 64-bit integer data is returned.

applying if statement for list within list in r

I am trying to run over a list of lists. Each line has 29 lists, and each list has 6 numbers stored as strings. An example looks like the following
dput(M[6000])
list(list(c("0", "1", "19", "785", "-3150", "0.90"), c("4", "2", "-1", "5550", "4400", "0.00"),
c("1", "3", "6", "3319", "-2558", "1.49"), c("1", "4", "1", "4573", "-435", "1.24"),
c("0", "5", "6", "1137", "-2828", "2.28"), c("0", "6", "24", "1668", "-1143", "2.76"),
c("1", "7", "2", "2859", "-720", "1.40"), c("1", "8", "23", "420", "-3346", "1.57"),
c("1", "9", "26", "2290", "752", "1.23"), c("1", "10", "8", "1208", "-2842", "2.14"),
c("0", "11", "11", "-219", "-374", "1.26"), c("0", "12", "3", "-69", "-2403", "2.24"),
c("0", "13", "1", "-3488", "-830", "0.17"), c("1", "14", "7", "2102", "-1404", "1.24"),
c("1", "15", "3", "1746", "-3481", "1.59"), c("3", "16", "0", "720", "-1425", "0.47"),
c("1", "17", "9", "170", "-2257", "3.14"), c("0", "18", "5", "-351", "-1564", "1.08"),
c("4", "19", "-1", "5550", "4400", "0.00"), c("3", "20", "1", "3304", "-3448", "1.78"),
c("1", "21", "4", "2289", "-1873", "3.13"), c("0", "22", "2", "175", "-3080", "1.28"),
c("1", "23", "12", "877", "140", "1.52"), c("0", "24", "8", "871", "-1933", "4.11"),
c("0", "25", "9", "3185", "-2548", "1.50"), c("4", "26", "-1", "5550", "4400", "0.00"),
c("3", "27", "2", "-290", "3415", "0.56"), c("4", "28", "-1", "5550", "4400", "0.00"),
c("0", "29", "32", "2176", "-2145", "1.58")))
For each line, I am trying to run over the 29 lists and save only the lists that have the 3rd element equal to 4. For one line it would be:
if(as.numeric(M[[6000]][[1]][3]) == 4) M[[6000]][[1]]
I have tried something down the line of
MP4 <- lapply(M, function(x) if(as.numeric(x[[1]][3]) == 4) x[[1]])
without luck.
The purrr package is very good at those kinds of problems:
library(purrr)
M %>%
map(.f = keep, .p = ~ .x[[3]] == "4")
# [[1]]
# [[1]][[1]]
# [1] "1" "21" "4" "2289" "-1873" "3.13"
Edit per your comment:
Let's make another list, M_2, to illustrate the issue:
M_2 <- c(M, list(list()))
M_2 %>%
map(.f = keep, .p = ~ .x[[3]] == "4")
# [[1]]
# [[1]][[1]]
# [1] "1" "21" "4" "2289" "-1873" "3.13"
#
#
# [[2]]
# list()
Then simply discard lists that are equal to list():
M_2 %>%
map(.f = keep, .p = ~ .x[[3]] == "4") %>%
discard(identical, list())
# [[1]]
# [[1]][[1]]
# [1] "1" "21" "4" "2289" "-1873" "3.13"
To use base R, you can use Filter:
lapply(M, Filter, f = function(x){x[[3]] == '4'})
## [[1]]
## [[1]][[1]]
## [1] "1" "21" "4" "2289" "-1873" "3.13"
To filter out empty elements of a larger list, Filter twice:
# Using M_2 from @apom's answer above (M plus one empty list).
# Inner Filter: within each element of M_2, keep only the vectors whose third
# entry is '4'. Outer Filter: drop the elements left empty by the inner one.
Filter(function(x){length(x) != 0},
lapply(M_2, Filter, f = function(x){x[[3]] == '4'}))
## [[1]]
## [[1]][[1]]
## [1] "1" "21" "4" "2289" "-1873" "3.13"
Here is one way to loop through the list and create a new one which saves elements according to your criteria of third element equal to 4.
# Build new_dl with one entry per element of dl; each entry is a list holding
# only those inner vectors whose third value equals "4" (possibly empty).
new_dl <- vector("list", length(dl))
for (l in seq_along(dl)) {
  kept <- list()
  # Iterate over the inner vectors of the current element dl[[l]] (not
  # dl[[1]]), so outer elements of different lengths are handled correctly.
  for (i in seq_along(dl[[l]])) {
    # isTRUE() guards against NA, e.g. when a vector has fewer than 3 entries.
    if (isTRUE(dl[[l]][[i]][3] == "4")) {
      kept[[length(kept) + 1L]] <- dl[[l]][[i]]
    }
  }
  new_dl[[l]] <- kept
}

Iterate several operations over a list of files in a directory and save with new dynamic filename in R, Lapply?

I am new to R and would like to read in a list of files as separate data frames, perform several operations on each, and save them out as separate files with dynamic file names. I am thinking I should use lapply, but I am not sure.
Here is the code I wrote that works for one file:
# Read the fixed-width file: a 21-character leading field followed by
# 31 groups of four fields with widths 5, 1, 1 and 1.
df <- read.fwf("USC00011084.dly", widths = c(21, rep(c(5, 1, 1, 1), 31)))

# Keep the leading field plus the first (5-wide) field of every group,
# i.e. columns 1, 2, 6, 10, ..., 122; the three 1-wide fields that follow
# each of them are dropped.
value_cols <- seq(2L, by = 4L, length.out = 31L)
df2 <- df[c(1L, value_cols)]

# Treat -9999 as missing so it does not distort the row totals.
df2[df2 == "-9999"] <- NA

# Row total over the 31 value columns, ignoring missing values.
df2[["Total"]] <- rowSums(df2[, 2:32], na.rm = TRUE)

# Label the value columns "1" .. "31" (generated, so no label is duplicated).
colnames(df2) <- c("StationDateType", as.character(1:31), "TotalMonthly")

# Keep only the rows whose leading field contains "PRCP" and write them out.
Prcp <- df2[grep("PRCP", df2$StationDateType), ]
write.table(Prcp, "USC00011084Prcp.txt", sep = "\t", row.names = FALSE)
How can I do this for a list of files in a directory? Any ideas? Thank you.
You can try this...
You can get a list of your files:
# List only the .dly input files in the working directory, so that other
# files there (including the *Prcp.txt outputs written by the function below)
# are not picked up and processed.
files <- list.files(getwd(), pattern = "\\.dly$")
Write a function that performs the analysis you want and writes the results to table, as you have done. Here we use tools::file_path_sans_ext to extract the filename (without the file type extension), and at the end use it to name the table to be saved to txt.
# Process one fixed-width .dly file and write its PRCP rows to
# "<input name without extension>Prcp.txt" as a tab-separated table.
#
# files: path of a single input file (the function is called once per file).
# Returns the name of the file that was written, invisibly.
myFunction <- function(files) {
  fileName <- tools::file_path_sans_ext(files)
  df <- read.fwf(files, widths = c(21, rep(c(5, 1, 1, 1), 31)))

  # Processing steps taken from the single-file script in the question.
  # Keep the leading field plus the 5-wide field of each of the 31 groups
  # (columns 1, 2, 6, 10, ..., 122).
  value_cols <- seq(2L, by = 4L, length.out = 31L)
  df2 <- df[c(1L, value_cols)]

  # Treat -9999 as missing so it does not distort the row totals.
  df2[df2 == "-9999"] <- NA
  df2[["Total"]] <- rowSums(df2[, 2:32], na.rm = TRUE)

  # Value columns are labelled "1" .. "31" (generated, so none is duplicated).
  colnames(df2) <- c("StationDateType", as.character(1:31), "TotalMonthly")
  Prcp <- df2[grep("PRCP", df2$StationDateType), ]

  outFile <- paste0(fileName, "Prcp.txt")
  write.table(Prcp, outFile, sep = "\t", row.names = FALSE)
  invisible(outFile)
}
You can use lapply to run your function on each file in files.
lapply(files, function(x) myFunction(x))

invalid color name background in qgraph

I have been trying to use qgraph to generate a network graph. The code is as follows:
Gw <- qgraph(edgeList, diag = TRUE, labels = TRUE,legend.cex = 0.3, vsize = 1,edge.color=colorLabels,legend=TRUE,asize=1)
The figure can be generated, but the R command line gives an error message about an invalid color name 'background'. I do not know what the invalid color name 'background' means.
The dput result is shown as follows,
dput(edgeList)
structure(c("1", "2", "2", "3", "4", "5", "6", "7", "8", "1",
"9", "10", "11", "12", "13", "14", "15", "16", "17", "18", "16",
"4", "5", "7", "1", "9", "10", "19", "20", "2", "16", "21", "3",
"22", "5", "23", "8", "1", "20", "2", "13", "14", "17", "14",
"1", "19", "14", "2", "21", "14", "24", "1", ":499.3", "nk Transfe",
"de of tran", "up(non-US ", "up(non-US ", "up(non-US ", "up(non-US ",
"up(non-US ", "up(non-US ", "up(non-US ", "up(non-US ", "up(non-US ",
"ine:4121", "ine:3257.4", "ine:75.2", "ine:75.2", "ine:11615.",
"ine:10603", "ine:334.2", "ine:7256.8", "ine:7256.8", "ine:996.8",
"ine:884.6", "ine:364.9", "ine:6360", "ine:5640.9", "ine:2729.7",
"ine:5482.6", "ine:85", "ine:1474.9", "ine:700.8", "ine:2754.6",
"ine:3257.4", "ine:3257.4", "ine:7307.8", "ine:18560.", "ine:85.1",
"ine:364.8", ":700.1", ":5317", "l:4258.9", "l:4258.9", "l:1637.6",
"l:1637.6", "l:46.4", "l:3938.5", "l:3938.5", "l:2800.4", "l:2715.1",
"l:2715.1", "l:12708.2", "l:1042", ":499.3", "nk Transfe", "de of tran",
"up(non-US ", "up(non-US ", "up(non-US ", "up(non-US ", "up(non-US ",
"up(non-US ", "up(non-US ", "up(non-US ", "up(non-US ", "ine:4121",
"ine:3257.4", "ine:75.2", "ine:75.2", "ine:11615.", "ine:10603",
"ine:334.2", "ine:7256.8", "ine:7256.8", "ine:996.8", "ine:884.6",
"ine:364.9", "ine:6360", "ine:5640.9", "ine:2729.7", "ine:5482.6",
"ine:85", "ine:1474.9", "ine:700.8", "ine:2754.6", "ine:3257.4",
"ine:3257.4", "ine:7307.8", "ine:18560.", "ine:85.1", "ine:364.8",
":700.1", ":5317", "l:4258.9", "l:4258.9", "l:1637.6", "l:1637.6",
"l:46.4", "l:3938.5", "l:3938.5", "l:2800.4", "l:2715.1", "l:2715.1",
"l:12708.2", "l:1042", "25", "1", "1", "26", "27", "28", "29",
"30", "31", "25", "32", "33", "4", "4", "3", "3", "5", "5", "7",
"6", "6", "27", "28", "30", "25", "32", "33", "9", "8", "1",
"1", "10", "12", "12", "16", "16", "16", "16", "8", "1", "3",
"3", "7", "7", "25", "9", "9", "1", "10", "10", "14", "14"), .Dim = c(104L,
2L), .Dimnames = list(NULL, c("newsendId", "newtoId")))
The generated figure is as follows. I used the following command to generate it
Gw <- qgraph(edgeList, layout = "spring", diag = FALSE, labels = TRUE, cut = NULL, edge.color = "red",legend.cex = 0.5, vsize = 8)
Which nodes are problems? With your data and code you can modify label.cex. There are other variations of the arguments for the label and legend sizes. Here is one version, with the color blue.
library(qgraph)
Gw <- qgraph(edgeList, layout = "spring", diag = FALSE, labels = TRUE, cut = NULL, edge.color = "red", legend.cex = 0.3, vsize = 4, label.cex = 0.3, label.color = "blue")
Gw

Resources