How to change dot to comma in R

library(rvest)      # read_html(), html_nodes(), html_text()
library(tidyverse)  # str_remove_all(), enframe(), left_join()

get_bike_data <- function(url) {
  html_bike_category <- read_html(url)
  # Get the names
  bike_name_tbl <- html_bike_category %>%
    html_nodes(css = ".catalog-category-bikes__title-text") %>%
    html_text() %>%
    str_remove_all(pattern = "\n") %>%
    enframe(name = "position", value = "name")
  # Get the prices
  bike_price_tbl <- html_bike_category %>%
    html_nodes(css = ".catalog-category-bikes__price-title") %>%
    html_text() %>%
    str_remove_all(pattern = "\\.") %>%
    extract_numeric() %>%  # note: extract_numeric() is deprecated in recent tidyr
    enframe(name = "position", value = "price_euro") %>%
    left_join(bike_name_tbl)
}
# 2.3.1b Alternative with a for loop
# Create an empty tibble, that we can populate
# Loop through all urls
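# (A minimal sketch of that loop, which is not shown in the question; it
#  assumes `bike_category_urls` is a character vector of category page URLs.)
bike_data_tbl <- tibble()
for (url in bike_category_urls) {
  bike_data_tbl <- bind_rows(bike_data_tbl, get_bike_data(url))
}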
bike_data_tbl <- bike_data_tbl %>%
  rename("model" = "name") %>%
  subset(nchar(price_euro) != 0)
bike_data_tbl
This is price and model data scraped from a website. I wanted to change values like 1.699 to 1,699. I tried several methods that I googled (format(decimal.mark = ","), parse_number(), sub(), etc.), but none of them worked.
What is the problem?

Below is a possible solution:
library(stringr)
text <- c('1231.1', '4343.5', '312312.0')
str_replace(string = text, pattern = "[.]", replacement = ",")
[1] "1231,1" "4343,5" "312312,0"
Another possible solution is:
num_text <- c(1231.1, 4343.5, 312312.0)
gsub("\\.", ",", num_text)
[1] "1231,1" "4343,5" "312312"

Both gsub and format should work:
#format
format(bike_data_tbl$price_euro, decimal.mark = ",")
#gsub
gsub(pattern = ".", x = bike_data_tbl$price_euro, replacement = ",", fixed = TRUE)
However, it seems that the prices are in thousands (e.g. 1.699 for the Ground Control model = 1699 euros), so the dot is a thousands separator rather than a decimal mark. You could try this:
as.numeric(gsub(pattern = ".", x = bike_data_tbl$price_euro, replacement = "", fixed = TRUE))
The last call replaces all dots with nothing and then converts the result to numeric.
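For completeness, a minimal sketch that combines both steps: parse the scraped strings as numbers with a dot thousands separator, then print them with a comma as the separator, which is what 1,699 looks like. The prices vector is made up; readr ships with the tidyverse:
library(readr)
prices <- c("1.699", "2.299", "12.499")  # hypothetical scraped price strings

# parse_number() with an explicit locale reads "1.699" as 1699
euros <- parse_number(prices, locale = locale(decimal_mark = ",", grouping_mark = "."))
euros
## [1]  1699  2299 12499

# format() can then render the numbers with a comma thousands separator
format(euros, big.mark = ",")
## [1] " 1,699" " 2,299" "12,499"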

Related

I'm trying to clean my data and the following error keeps appearing:

M_aff <- API_info$affil_info
names(API_info$affil_info) <- c("entry_number", "doi", "affiliation_name",
                                "affiliation_city", "affiliation_country", "affiliation_id")
filter(is.na(as.numeric(gsub("([0-9]+).*$", "\\1", affiliation_name)))) %>%
  mutate(affiliation_name = str_replace_all(affiliation_name, "&amp;", "&"),
         affiliation_name = stringi::stri_trans_general(affiliation_name, "ASCII"),
         affiliation_name = str_replace_all(affiliation_name, "\\.|\\(|\\)|\\-|\\\\", " "),
         affiliation_name = trimws(affiliation_name, which = "both")) %>%
  filter(str_detect(affiliation_name, "netflix")) %>%
  distinct(entry_number, affiliation_id, .keep_all = T) %>%
  left_join(cleaned_data %>% select(entry_number, citations, year), by = "entry_number")
ERROR: Error in is.factor(x) : object 'affiliation_name' not found
M_aff is the set of observations with 6 variables.
The "netflix" file is the file with the raw data.
I am also not sure if I should be using filter. I want to clean my data (remove the false positives) from the "netflix" file, which is a list of documents published by Netflix.
I extracted the Netflix file as follows:
api_key <- '26d72e4e014f3c18226cd4bf29725557'
set_api_key(api_key)
netflix <- scopus_search(api_key = '26d72e4e014f3c18226cd4bf29725557',
                         query = "affil(netflix)", count = 1)
df_netflix = gen_entries_to_df(netflix$entries)
save(df_netflix, file = paste(getwd(), 'netflix/input_data/df_netflix.RData', sep = '/'))
Then I applied the following code to clean:
cleaned_data <- df_netflix$df %>%
  select("eid", "dc:title", "dc:creator", "prism:publicationName", "prism:issn",
         "prism:eIssn", "prism:coverDate", "prism:doi",
         "citedby-count", "subtypeDescription", "entry_number") %>%
  rename(title = "dc:title",
         creator = "dc:creator",
         journal = "prism:publicationName",
         date = "prism:coverDate",
         doi = "prism:doi",
         citations = "citedby-count",
         doc_type = "subtypeDescription",
         issn_og = "prism:issn",
         eissn_og = "prism:eIssn") %>%
  mutate(year = str_sub(date, start = 1, end = 4),
         issn = case_when(is.na(issn_og) & !is.na(eissn_og) ~ eissn_og,
                          !is.na(issn_og) & is.na(eissn_og) ~ issn_og,
                          !is.na(issn_og) & !is.na(eissn_og) ~ paste(issn_og, eissn_og, sep = ', '),
                          TRUE ~ 'NA')) %>%
  left_join(sjrdata::sjr_journals %>% select(year, issn, sjr, sjr_best_quartile, h_index),
            by = c('year', 'issn'))
openxlsx::write.xlsx(cleaned_data, file= paste(getwd(), 'netflix/output_data/netflix.xlsx', sep='/'))
save(cleaned_data, file = paste(getwd(), 'netflix/input_data/cleaned_data.RData', sep='/'))
api_key <- '26d72e4e014f3c18226cd4bf29725557'
set_api_key(api_key)
doi_not_na <- cleaned_data %>%
  as_tibble() %>%
  select(entry_number, doi) %>%
  filter(!is.na(doi))
API_info <- get_api_info(doi_not_na)
save(API_info, file = paste(getwd(), "netflix/input_data/API_info_netflix.RData", sep = "/"))
Then I tried to continue cleaning the data as follows:
M_aff <- API_info$affil_info
names(API_info$affil_info) <- c("entry_number", "doi", "affiliation_name",
                                "affiliation_city", "affiliation_country", "affiliation_id")
filter(is.na(as.numeric(gsub("([0-9]+).*$", "\\1", affiliation_name)))) %>%
  mutate(affiliation_name = str_replace_all(affiliation_name, "&amp;", "&"),
         affiliation_name = stringi::stri_trans_general(affiliation_name, "ASCII"),
         affiliation_name = str_replace_all(affiliation_name, "\\.|\\(|\\)|\\-|\\\\", " "),
         affiliation_name = trimws(affiliation_name, which = "both")) %>%
  filter(str_detect(affiliation_name, "netflix")) %>%
  distinct(entry_number, affiliation_id, .keep_all = T) %>%
  left_join(cleaned_data %>% select(entry_number, citations, year),
            by = "entry_number")
true_positive <- M_aff %>%
  filter(grepl(pattern = 'netflix', x = tolower(affiliation_id))) %>%
  select(doi) %>%
  distinct() %>%
  pull(doi)
M_aff <- M_aff %>%
  filter(doi %in% true_positive)
API_info$author_info <- API_info$author_info %>%
  filter(doi %in% true_positive)
API_info$subject_info <- API_info$subject_info %>%
  filter(doi %in% true_positive)
API_info$authkey_info <- API_info$authkey_info %>%
  filter(doi %in% true_positive)
save(M_aff, file = paste(getwd(), "netflix/input_data/M_aff.RData", sep = "/"))
openxlsx::write.xlsx(M_aff, file= paste(getwd(), 'netflix/output_data/netflix_aff.xlsx', sep='/'))
openxlsx::write.xlsx(API_info$author_info, file= paste(getwd(), 'netflix/output_data/netflix_auth.xlsx', sep='/'))
openxlsx::write.xlsx(API_info$subject_info, file= paste(getwd(), 'netflix/output_data/netflix_subject.xlsx', sep='/'))
openxlsx::write.xlsx(API_info$authkey_info, file= paste(getwd(), 'netflix/output_data/netflix_authkey.xlsx', sep='/'))
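As a side note, file.path(getwd(), 'netflix/output_data/netflix.xlsx') is a more idiomatic way to build these paths than paste(..., sep = '/').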

If statement with pipe

I've seen a few articles on how to use if statements or conditionals with piping, but I'm not sure how to apply them to my situation. Along with a specific answer to my problem, I was hoping for a more general explanation of adding an if statement to a pipe, so I can handle most situations.
I tried to learn from the answer below (use if() to use select() within a dplyr pipe chain), but I don't understand why we are supplying "." as an argument on the third line, or when I should do so.
mtcars %>%
  group_by(cyl) %>%
  { if (cond) filter(., am == 1) else . } %>%
  summarise(m = mean(wt))
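Briefly: wrapping a pipeline step in { } suppresses magrittr's usual behaviour of inserting the left-hand side as the first argument, so inside the braces you must refer to the incoming data explicitly as ".". That is why the call is written filter(., am == 1), and why the else branch returns a bare "." to pass the data through unchanged.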
Here's a sample of my data:
df_parse <-
structure(list(value = c("HURESPLI\t2\tLINE NUMBER OF THE RESPONDENT\tCURRENT\t22 - 23",
"FILLER\t2\t\t27 - 28", "HUBUSL1\t2\tENTER LINE NUMBER\t81 - 82",
"GEDIV\t1\tDIVISION\t91 - 91", "GESTFIPS\t2\tFEDERAL INFORMATION\t93 - 94"
), starts_with_position = c(TRUE, TRUE, TRUE, TRUE, TRUE), missing_vars = c("HUFINAL\t FINAL OUTCOME CODE\t 24 - 26",
"HETENURE\t ARE YOUR LIVING QUARTERS... (READ ANSWER CATEGORIES)\t 29 - 30",
"FOR HUBUS = 1 VALID ENTRIES 83 - 84", " 92 - 92", " 95 - 95"
)), row.names = c(NA, 5L), class = "data.frame")
I'm trying to separate out the missing_vars column using extract (tidyr) and gsub as shown below:
df_parse <-
  df_parse %>%
  mutate(dup_value2 = missing_vars) %>%
  extract(col = dup_value2, into = "position2", regex = "(\\d+\\s*-\\s*\\d+)$") %>%
  mutate(id2 = gsub(pattern = "\\t.*", replacement = "", x = missing_vars)) %>%
  mutate(desc2 = gsub(".*\\\t\\d+\\\t", replacement = "", x = missing_vars)) %>%
  mutate(desc2 = gsub("(\\d+\\s*-\\s*\\d+)$", replacement = "", x = missing_vars))
This works fine, but I wanted to add a condition at the start of this pipe so it only applies where df_parse$starts_with_position == TRUE.
Something like this? (I know it doesn't work.)
df_parse %>% if(starts_with_position == TRUE){
  mutate(dup_value2 = missing_vars) %>%
    extract(col = dup_value2, into = "position2", regex = "(\\d+\\s*-\\s*\\d+)$") %>%
    mutate(id2 = gsub(pattern = "\\t.*", replacement = "", x = missing_vars)) %>%
    mutate(desc2 = gsub(".*\\\t\\d+\\\t", replacement = "", x = missing_vars)) %>%
    mutate(desc2 = gsub("(\\d+\\s*-\\s*\\d+)$", replacement = "", x = missing_vars))
} else ""

Don't understand Cannot Coerce type 'closure' Error

I see this is a common issue, but I can't understand what to do from reading other posts or from trying to understand functional programming, which is new to me. Are functions closures in R, encapsulating the environment they were created in? The code I have is:
# Remove numbers from text
minus_TextNum <- function(df, new.df){
  new.df <- mutate(df, text = gsub(x = text, pattern = "[0-9]+|\\(.*\\)", replacement = "")) %>% # and/or whatever's in brackets
    unnest_tokens(input = text, output = word) %>%
    filter(!word %in% c(stop_words$word, "patient")) %>%
    group_by(id) %>%
    summarise(text = paste(word, collapse = " "))
  return(new.df)
}
minus_TextNum(TidySymptoms)
Error is as follows:
Error: Problem with mutate() column text. ℹ text = gsub(x = text, pattern = "[0-9]+|\\(.*\\)", replacement = ""). x cannot coerce
type 'closure' to vector of type 'character'
I don't understand what type 'closure' is, and this is a simple function that works on a simple dataset I created to test it. The problem arises when I use the real-world dataset.
Any feedback appreciated. Reproducible sample below:
# Remove numbers and/or anything in brackets
# Test Data
mydata <- data.frame(id = 1:8,
                     text = c("112773 Nissan Micra, Car, (10 pcs)",
                              "112774 Nissan Micra, Car, (10 pcs)",
                              "112775 Nissan Micra, Car, (10 pcs)",
                              "112776 Volkswagon Beetle, Car, (3 pcs)",
                              "112777 Toyota Corolla, Car, (12 pcs)",
                              "112778 Nissan Micra, Car, (10 pcs)",
                              "112779 Toyota Prius, Car, (9 pcs)",
                              "112780 Toyota Corolla, Car, (12 pcs)"),
                     stringsAsFactors = F)
library(dplyr)
library(tidytext)
# remove numbers from text data
data(stop_words)
minus_TextNum <- function(df, new.df){
  new.df <- mutate(df, text = gsub(x = text, pattern = "[0-9]+|\\(.*\\)", replacement = "")) %>% # and/or whatever's in brackets
    unnest_tokens(input = text, output = word) %>%
    filter(!word %in% c(stop_words$word, "car")) %>%
    group_by(id) %>%
    summarise(text = paste(word, collapse = " "))
  return(new.df)
}
minus_TextNum(mydata)
dput(head(TidySymptoms, n = 10))
structure(list(word = c("epiglottis", "swelled", "hinder", "swallowing",
"pictures", "benadryl", "tylenol", "approximately", "30", "min"
)), row.names = c(NA, 10L), class = "data.frame")
The TidySymptoms data has no id column in it. Assuming that's a mistake and the column exists in your real data, you can make the following changes to the function:
There is no need to pass new.df to the function.
The column in TidySymptoms is called word, but you are using text in the function. That is also the source of the 'closure' error: because text is not a column of the data, mutate() looks it up in the enclosing environments and finds the function graphics::text() instead, and a function (a "closure") cannot be coerced to a character vector.
Try this code:
minus_TextNum <- function(df){
  df.new <- mutate(df, text = gsub(x = word, pattern = "[0-9]+|\\(.*\\)", replacement = "")) %>%
    unnest_tokens(input = text, output = word) %>%
    filter(!word %in% c(stop_words$word, "patient")) %>%
    group_by(id) %>%
    summarise(text = paste(word, collapse = " "))
  return(df.new)
}
minus_TextNum(TidySymptoms)

Create a loop in order to generate colnames in a large list

I would like to create a loop in order to change the column names as shown:
a <- c("day", "month", "year", "flow")
I've got a large list of 6937 elements that I managed to import into R:
library(tidyverse)
library(readtext)
txt_files_ls <- paste("C:/Users/obarresi/Desktop/doc osvaldo/ana_data_acquisition/data_flow-ANA/All",
                      list.files(path = "C:/Users/obarresi/Desktop/doc osvaldo/ana_data_acquisition/data_flow-ANA/All",
                                 pattern = "*.txt"), sep = "/")
txt_files_df_list <- vector("list", length(txt_files_ls))
txt_files_df_list <- lapply(txt_files_ls,
                            function(x){ data.frame(read.table(file = x, header = F,
                                                               sep = "", colnames(x))) })
How do I do this for all the data frames inside my list?
txt_files_df_list[[1]] <- colnames(c("day", "month", "year", "flow"))
Thank you for your help!
I would approach it like this:
txt_files_df_list %>%
  map(~ set_names(., c("day", "month", "year", "flow")))
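A base-R equivalent, in case purrr isn't loaded (this assumes every file really has exactly four columns):
txt_files_df_list <- lapply(txt_files_df_list, setNames,
                            c("day", "month", "year", "flow"))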

How to keep only the text after a specific tag and insert 0 into other rows

Data
data.frame(id = c(1, 2), text = c("something here <h1>my text</h1> also <h1>Keep it</h1>", "<h1>title</h1> another here"))
How is it possible to keep the text that follows the tag <h1>my text</h1> up to the start of the next tag, and to insert 0 when no such text exists in the row?
Example output
data.frame(id = c(1, 2), text = c("also", 0))
In regex you can use lookaheads and lookbehinds; see this link for more info. Naming the data df:
df$text <- str_extract(df$text, pattern = "(?<=</h1>)(.*)(?=<h1>)")
ifelse(is.na(df$text), "0", trimws(df$text))
[1] "also" "0"
You can do this in quanteda using several corpus_segment() calls:
df <- data.frame(
id = c(1, 2),
text = c(
"something here <h1>my text</h1> also <h1>Keep it</h1>",
"<h1>title</h1> another here"
)
)
library("quanteda", warn.conflicts = FALSE)
## Package version: 2.1.1
corp <- df %>%
corpus(docid_field = "id") %>%
corpus_segment("<h1>my text</h1>", pattern_position = "before") %>%
corpus_segment("<h1>", pattern_position = "after")
Now we can get your 0s from merging this with the sequence of IDs, and converting any non-matches (NAs) to 0:
library("dplyr", warn.conflicts = FALSE)
convert(corp, to = "data.frame") %>%
rename(id = doc_id) %>%
select(id, text) %>%
mutate(id = as.integer(id)) %>%
right_join(data.frame(id = 1:2)) %>%
tidyr::replace_na(list(text = 0))
## Joining, by = "id"
## id text
## 1 1 also
## 2 2 0
