Identify and strip characters from columns in R

I have a large dataset in which I want to identify and remove characters and symbols so that only the numeric value is kept.
For example, I want -£1125.91m to become -1125.91.
dataset
  Event                       var1       var2
  <fct>                       <chr>      <chr>
1 Labour Costs YoY            13.34m     0.026
2 Unemployment Change (000's) $16.91b    -0.449
3 Unemployment Rate           -£1125.91m 0.89k
4 Jobseekers Net Change       ¥1012.74b  9.56m
At the moment I know how to remove a single character from a column, like this:
dataset$var1 <- gsub("k", "", dataset$var1)
Doing this manually would be a lot of work because the dataset is really big. Is there a way to identify and remove all of these characters, including the currency symbols and the m's and b's, all at once?
To replicate the dataset:
dataset <- structure(list(Event = structure(2:5, .Label = c("Event", "Labour Costs YoY",
"Unemployment Change (000's)", "Unemployment Rate", "Jobseekers Net Change"),
.Names = c("", "", "", ""), class = "factor"), var1 = c("13.34m", "$16.91b", "-£1125.91m", "¥1012.74b"), var2 = c(0.026, -0.449, "0.89k", "9.56m")), row.names = c(NA,
-4L), class = c("tbl_df", "tbl", "data.frame"))

To remove everything except hyphens, digits and dots, you can use
dataset$var1 <- gsub("[^-0-9.]", "", dataset$var1)
The [^-0-9.] pattern is a negated character class that matches any character except the ones listed in the class.
A quick R demo:
gsub("[^-0-9.,]", "", dataset$var1)
## => [1] "13.34" "16.91" "-1125.91" "1012.74"


Number formatting: big.mark with "." and decimal.mark with "," gives NAs

I have the following table:
structure(list(grup_CNAE_Red = c("Aliments i begudes no alcohòliques",
"Begudes alcohòliques i tabac", "Vestit i calçat"), despesatotal = c(90668.3942513707,
9350.70966336, 21808.88895535), proporcio_total = c(16.4337534376051,
1.69482715937661, 3.9528868554426)), row.names = c(NA, -3L), class = c("tabyl",
"tbl_df", "tbl", "data.frame"), core = structure(list(grup_CNAE_Red = structure(1:12, .Label = c("Aliments i begudes no alcohòliques",
"Begudes alcohòliques i tabac", "Vestit i calçat", "Habitatge",
"Mobles i parament de la llar", "Salut", "Transport", "Comunicacions",
"Oci i cultura", "Educació", "Restaurants i hotels", "Altres béns i serveis"
), class = "factor"), despesatotal = c(90668.3942513707, 9350.70966336,
21808.88895535, 186645.44797435, 24265.96063364, 22709.57989815,
60941.60103453, 17670.69007283, 24405.55628082, 8274.5920409,
43167.89349882, 41811.23204347), despesamonetaria = c(89883.1722917393,
9339.26085002, 21794.62559391, 63341.3240054499, 24245.31065782,
22706.3181967, 60847.98764934, 17638.41977893, 24402.01538905,
8137.57745883, 41326.4730733121, 41519.99268768)), row.names = c(NA,
-12L), class = "data.frame"), tabyl_type = "two_way", totals = "row")
And I want to format despesatotal so that thousands are separated with "." and decimals with ",", rounded to 2 digits.
Also, proporcio_total should use "," for decimals and be rounded to 2 digits.
When I try to do it with format I only get either NA values or character output (but I need numeric values).
With dplyr::mutate() you could try this; however, the values will be characters, not numeric. I'm not sure you can format R numeric values with thousands separators and retain the numeric type.
You could round to two decimal places with round() and keep the numeric type. The decimal separator is, I think, linked to the operating system locale; you might try options(OutDec = ",") at the beginning of your script to control the decimal separator.
library(dplyr)

df1 |>
  mutate(despesatotal = format(despesatotal, big.mark = ".", decimal.mark = ","),
         proporcio_total = format(round(proporcio_total, 2), decimal.mark = ","))
#> # A tibble: 3 × 3
#> grup_CNAE_Red despesatotal proporcio_total
#> <chr> <chr> <chr>
#> 1 Aliments i begudes no alcohòliques "90.668,39" "16,43"
#> 2 Begudes alcohòliques i tabac " 9.350,71" " 1,69"
#> 3 Vestit i calçat "21.808,89" " 3,95"
Created on 2022-12-16 with reprex v2.0.2
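For completeness, a minimal sketch of the numeric-preserving route described above (df1 is the object name assumed by the answer; note that options(OutDec = ",") only changes how numbers are printed, and a thousands separator cannot be stored in a numeric value):
library(dplyr)

options(OutDec = ",")   # printing-only change; the columns stay numeric

df1 |>
  mutate(despesatotal = round(despesatotal, 2),
         proporcio_total = round(proporcio_total, 2))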

Compare data frame and list to find highest priority value using R

Edited as per request.
I need a suggestion for the request below.
I have a static list df2 = c("Maths/Science", "Science/Engg", "Maths/Engg", "Maths", "Science", "Engg"). I need to compare each column of df1 with df2 and check whether these combinations are present. They can appear on their own or in combination with other values.
The weightage is as follows:
df2= c("Maths/Science", "Science/Engg", "Maths/Engg", "Maths","Science","Engg")
Maths/Science= 6
Science/Engg=5
Maths/Engg = 4
Maths=3
Science=2
Engg=1
A new data frame df3 should contain the df1 data plus a new column Weightage that holds the highest-weighted value available in each row.
Please find the data below.
df1 (input):
dput(df1)
structure(list(Col_1 = c("Maths/Science", "Engg", "Commerce",
"Engg"), Col_2 = c("Science L", "Science/Maths", "English,",
"Science/Engg"), Col_3 = c("Commerce", "NA", "NA", "Science"),
Col_4 = c("CS/Engg", "NA", "NA", "NA")), row.names = c(NA,
-4L), class = c("tbl_df", "tbl", "data.frame"))
df2:
structure(list(Col_1 = "(\"Maths/Science\", \"Science/Engg\", \"Maths/Engg\", \"Maths\",\"Science\",\"Engg\")"), row.names = c(NA,
-1L), class = c("tbl_df", "tbl", "data.frame"))
df3 (expected output):
structure(list(Col_1 = c("Maths", "Engg", "Science", "Engg"),
Col_2 = c("Science L", "Science/Maths", "Engg", "Science/Engg"
), Col_3 = c("Commerce", "NA", "NA", "Science"), Col_4 = c("Maths/Science",
"NA", "NA", "NA"), Weightage = c("Maths/Science", "Science/Maths",
"Science/Engg", "Science/Engg")), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"))
It is not entirely clear how you want to handle subjects, but this may be helpful for you.
First, create a vector with your subjects ordered by weight:
vec <- c("Maths/Science", "Science/Engg", "Maths/Engg", "Maths", "Science", "Engg")
Then, you can convert your columns to factors, and use ordered levels based on your vector:
df_fac <- lapply(df1, factor, levels = vec, ordered = TRUE)
Finally, you can get the minimum factor level (in this case the highest weight, given the ordering of the vector) for each row:
do.call(pmin, c(df_fac, na.rm = TRUE))
You can assign the result to df1$Weightage and compare with your example; a full assembly is sketched below.
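A sketch putting the pieces together (df1 is the input shown above; as.character() is just one way to turn the factor result back into text):
vec <- c("Maths/Science", "Science/Engg", "Maths/Engg", "Maths", "Science", "Engg")

# Ordered factors: level 1 ("Maths/Science") has the highest weight
df_fac <- lapply(df1, factor, levels = vec, ordered = TRUE)

df3 <- df1
df3$Weightage <- as.character(do.call(pmin, c(df_fac, na.rm = TRUE)))
df3
Values that do not appear in vec (e.g. "Commerce") become NA during the factor conversion, so pmin() simply ignores them.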

How to add single quotes to a dataset?

A test dataset
structure(list(numero_certificado = c("1234", "5678"
), sitio_defuncion = c("HOSPITAL/CLINICA", "HOSPITAL/CLINICA"
), tipo_defuncion = c("NO FETAL", "NO FETAL"), fecha_defuncion = structure(c(1635861000,
1635874800), tzone = "", class = c("POSIXct", "POSIXt")), tipo_documento_fallecido = c("REGISTRO CIVIL",
"CEDULA DE CIUDADANIA"), documento_fallecido = c("1111",
"2222")), row.names = c(NA, -2L), class = c("tbl_df", "tbl",
"data.frame"))
I would like to be able to:
Add single quotes (') to each element of the entire dataset
Add single quotes (') to all elements in specific columns selected by index, since some data will be numeric or dates rather than strings
Expected output:
structure(list(numero_certificado = c("'1234'", "'5678'"
), sitio_defuncion = c("'HOSPITAL/CLINICA'", "'HOSPITAL/CLINICA'"
), tipo_defuncion = c("'NO FETAL'", "'NO FETAL'"), fecha_defuncion = structure(c(1635861000,
1635874800), tzone = "", class = c("POSIXct", "POSIXt")), tipo_documento_fallecido = c("'REGISTRO CIVIL'",
"'CEDULA DE CIUDADANIA'"), documento_fallecido = c("'1111'",
"'2222'")), row.names = c(NA, -2L), class = c("tbl_df", "tbl",
"data.frame"))
We may use sQuote, looping across the columns of the data with across(): use everything() if all the columns need to be changed, or one of the select helpers for specific columns; here we exclude fecha_defuncion by prefixing it with -.
library(dplyr)
df1 <- df1 %>%
  mutate(across(-fecha_defuncion, ~ sQuote(.x, q = FALSE)))
Output:
df1
# A tibble: 2 × 6
numero_certificado sitio_defuncion tipo_defuncion fecha_defuncion tipo_documento_fallecido documento_fallecido
<chr> <chr> <chr> <dttm> <chr> <chr>
1 '1234' 'HOSPITAL/CLINICA' 'NO FETAL' 2021-11-02 08:50:00 'REGISTRO CIVIL' '1111'
2 '5678' 'HOSPITAL/CLINICA' 'NO FETAL' 2021-11-02 12:40:00 'CEDULA DE CIUDADANIA' '2222'
Also, as @KonradRudolph mentioned in the comments, since the quotes produced by sQuote can depend on the locale, another option is glue, paste, or sprintf:
df1 <- df1 %>%
  mutate(across(-fecha_defuncion, ~ sprintf("'%s'", as.character(.))))
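For the second requirement (quoting only specific columns selected by index), a base R sketch along the same lines (the column positions in cols are hypothetical; adjust them to the columns you actually want quoted):
cols <- c(1, 2, 3, 5, 6)   # hypothetical indices of the character columns
df1[cols] <- lapply(df1[cols], function(x) sprintf("'%s'", as.character(x)))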

create data frame from nested entries

I have a data frame test like this:
dput(test)
structure(list(X = 1L, entityId = structure(1L, .Label = "HOST-123", class = "factor"),
displayName = structure(1L, .Label = "server1", class = "factor"),
discoveredName = structure(1L, .Label = "server1", class = "factor"),
firstSeenTimestamp = 1593860000000, lastSeenTimestamp = 1603210000000,
tags = structure(1L, .Label = "c(\"CONTEXTLESS\", \"CONTEXTLESS\", \"CONTEXTLESS\", \"CONTEXTLESS\", \"CONTEXTLESS\", \"CONTEXTLESS\", \"CONTEXTLESS\", \"CONTEXTLESS\"), c(\"app1\", \"client\", \"org\", \"app1\", \"DATA_CENTER\", \"PURPOSE\", \"REGION\", \"Test\"), c(NA, \"NONE\", \"Host:Environment:test123\", \"111\", \"222\", \"GENERAL\", \"444\", \"555\")", class = "factor")), .Names = c("X",
"entityId", "displayName", "discoveredName", "firstSeenTimestamp",
"lastSeenTimestamp", "tags"), class = "data.frame", row.names = c(NA,
-1L))
There is a column called tags which should become a data frame. I need to get rid of the first vector in tags (which keeps saying CONTEXTLESS), expand the second vector into columns, and insert the third vector's values under each expanded column.
For example, it needs to look like this:
structure(list(entityId = structure(1L, .Label = "HOST-123", class = "factor"),
displayName = structure(1L, .Label = "server1", class = "factor"),
discoveredName = structure(1L, .Label = "server1", class = "factor"),
firstSeenTimestamp = 1593860000000, lastSeenTimestamp = 1603210000000,
app1 = NA, client = structure(1L, .Label = "None", class = "factor"),
org = structure(1L, .Label = "Host:Environment:test123", class = "factor"),
app1.1 = 111L, data_center = 222L, purppose = structure(1L, .Label = "general", class = "factor"),
region = 444L, test = 555L), .Names = c("entityId", "displayName",
"discoveredName", "firstSeenTimestamp", "lastSeenTimestamp",
"app1", "client", "org", "app1.1", "data_center", "purppose",
"region", "test"), class = "data.frame", row.names = c(NA, -1L
))
To recap: remove the 1st vector that keeps saying "contextless", use the second vector's values as column names, and let the last vector supply the values of the newly added columns.
If you are willing to drop the first "row" of garbage and then do a little cleanup of the parse side effects, then this might be a good place to start:
read.table(text=gsub("\\),", ")\n", test$tags[1]), sep=",", skip=1, #drops line
header=TRUE)
c.app1 client org app1 DATA_CENTER PURPOSE REGION Test.
1 c(NA NONE Host:Environment:test123 111 222 GENERAL 444 555)
The read.table function uses the scan function, which doesn't know that "c(" and ")" are meaningful. The other alternative might be to try eval(parse(text = .)) (which would know that they are enclosing vectors) on the second and third lines, but I couldn't see a clean way to do that. I initially tried to separate the lines using strsplit, but that caused me to lose the parens.
Here's a stab at some cleanup via that addition of some more gsub operations:
read.table(text=gsub("c\\(|\\)","", # gets rid of enclosing "c(" and ")"
gsub("\\),", "\n", # inserts line breaks
test$tags[1])),
sep=",", #lets commas be parsed
skip=1, #drops line
header=TRUE) # converts to colnames
app1 client org app1.1 DATA_CENTER PURPOSE REGION Test
1 NA NONE Host:Environment:test123 111 222 GENERAL 444 555
The reason for the added ".1" in the second instance of app1 is that column names in R data frames need to be unique unless you override that with check.names = FALSE.
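For completeness, a rough sketch of the eval(parse()) route mentioned above, assuming the tags string always holds exactly three c(...) vectors of equal length (this is not part of the original answer):
# Wrap the three c(...) pieces in list(...) so they parse as three vectors,
# then use the 2nd vector as column names and the 3rd as the single row
pieces <- eval(parse(text = paste0("list(", as.character(test$tags[1]), ")")))
out <- as.data.frame(t(pieces[[3]]), stringsAsFactors = FALSE)
names(out) <- make.unique(pieces[[2]])
out
#   app1 client                      org app1.1 DATA_CENTER PURPOSE REGION Test
# 1 <NA>   NONE Host:Environment:test123    111         222 GENERAL    444  555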
Here is a tidyverse approach
library(dplyr)
library(tidyr)
str2dataframe <- function(txt, keep = "all") {
# If you can confirm that all vectors are of the same length, then we can make them into columns of a data.frame
out <- eval(parse(text = paste0("data.frame(", as.character(txt),")")))
# rename columns as X1, X2, ...
nms <- make.names(seq_along(out), unique = TRUE)
if (keep == "all")
keep <- nms
`names<-`(out, nms)[, keep]
}
df %>%
mutate(
tags = lapply(tags, str2dataframe, -1L),
tags = lapply(tags, function(d) within(d, X2 <- make.unique(X2)))
) %>%
unnest(tags) %>%
pivot_wider(names_from = "X2", values_from = "X3")
df looks like this
> df
X entityId displayName discoveredName firstSeenTimestamp lastSeenTimestamp
1 1 HOST-123 server1 server1 1.59386e+12 1.60321e+12
tags
1 c("CONTEXTLESS", "CONTEXTLESS", "CONTEXTLESS", "CONTEXTLESS", "CONTEXTLESS", "CONTEXTLESS", "CONTEXTLESS", "CONTEXTLESS"), c("app1", "client", "org", "app1", "DATA_CENTER", "PURPOSE", "REGION", "Test"), c(NA, "NONE", "Host:Environment:test123", "111", "222", "GENERAL", "444", "555")
Output looks like this
# A tibble: 1 x 14
X entityId displayName discoveredName firstSeenTimestamp lastSeenTimestamp app1 client org app1.1 DATA_CENTER PURPOSE REGION Test
<int> <fct> <fct> <fct> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 1 HOST-123 server1 server1 1593860000000 1603210000000 NA NONE Host:Environment:test123 111 222 GENERAL 444 555

merging outputs from a loop

I have two datasets, named E and eF respectively.
E <- structure(list(Inception_Date = structure(c(962323200, 962323200,
  810950400, 988675200, 1042502400, 1536624000), tzone = "UTC",
  class = c("POSIXct", "POSIXt")), Name = c("Calvert Social Index B",
  "Calvert US Large Cap Core Rspnb Idx A",
  "Green Century Equity Individual Investor", "Praxis Value Index A",
  "Vanguard FTSE Social Index I", "Amundi IS Amundi MSCI USA SRI ETF DR")),
  row.names = c(NA, -6L), class = c("tbl_df", "tbl", "data.frame"))

eF <- structure(list(Inception_Date = structure(c(760233600, 519868800,
  1380067200, 1101772800, 1325203200, 628473600, 1325203200, 1123804800),
  tzone = "UTC", class = c("POSIXct", "POSIXt")), Name = c("Amana Growth Investor",
  "Amana Income Investor", "Amana Income Institutional",
  "American Century Sustainable Equity A", "Ariel Appreciation Institutional",
  "Ariel Appreciation Investor", "Ariel Focus Institutional",
  "Baywood Socially Responsible Invs")), row.names = c(NA, -8L),
  class = c("tbl_df", "tbl", "data.frame"))
I applied the following code to E and eF:
for (k in 1:nrow(E)) {
  F_temp <- eF
  G_temp <- F_temp %>%
    filter(abs(F_temp$Inception_Date - E$Inception_Date[k]) <= 1500)
  print(G_temp)
}
As the "G_temp" under the "Global Environment" shows it as 0 obs. of 2 variables only (which must be the last components in the loop's list), how to make a .csv file that shows all the "G_temp" components merged together removing duplicates?
Thanks
Using your exact filter criteria, would this do it?
library(dplyr)

G_temp <- data.frame(Inception_Date = as.POSIXct(character()),
                     Name = character())

for (k in 1:nrow(E)) {
  G_temp_int <- eF %>%
    filter(abs(eF$Inception_Date - E$Inception_Date[k]) <= 1500)
  G_temp <- bind_rows(G_temp, G_temp_int)
}

G_temp <- G_temp %>%
  distinct(Inception_Date, Name)

write.csv(G_temp, "G_temp.csv")
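If you want to avoid growing a data frame inside a loop, a sketch of the same criteria in one pass: each eF row is kept if it falls within the threshold of any date in E, so the distinct() step is no longer needed. Note that subtracting POSIXct values yields a difftime whose unit R picks automatically, so the bare 1500 threshold is unit-dependent here just as it is in the loop; use difftime(..., units = ...) if you want that explicit.
# Keep an eF row if any E date is within the threshold of its Inception_Date
keep <- sapply(seq_len(nrow(eF)), function(i)
  any(abs(eF$Inception_Date[i] - E$Inception_Date) <= 1500))

G_temp <- eF[keep, ]
write.csv(G_temp, "G_temp.csv")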
