I have data and want to split it into columns:
price_list <- c("Vegetables", " Garlic Desi<U+062A><U+06BE><U+0648><U+0645> <U+062F><U+06CC><U+0633><U+06CC> 140 per kg ",
" Fresh-bean<U+0641><U+0631><U+0627><U+0634><U+0628><U+06CC><U+0646> — per kg ",
"Fruits",
" Apple Kala Kolu Irani<U+0633><U+06CC><U+0628> <U+06A9><U+0627><U+0644><U+0627> <U+06A9><U+0648><U+0644><U+0648> <U+0627><U+06CC><U+0631><U+0627><U+0646><U+06CC> 168 per kg ",
" Apple golden 115 per kg ",
" Banana (I)<U+06A9><U+06CC><U+0644><U+0627> <U+0627><U+0646><U+0688><U+06CC><U+0646> 182 per dozen ",
"Others",
" Chicken<U+0645><U+0631><U+063A><U+06CC> <U+0634><U+06CC><U+0648><U+0631> 170 per kg ",
" Egg<U+0627><U+0646><U+0688><U+06D2> <U+0634><U+06CC><U+0648><U+0631> 95 per dozen "
)
I tried this, but the Unicode escapes are creating problems:
library(stringr)
regexp <- "[[:digit:]]+"
rprice <- str_extract(df$price_list, regexp)
df$price <- data.frame(rprice)
Desired output:
Name           Unicode      Price  Quantity
Vegetables
Fresh-bean     فراشبین      NA     kg
Fruits
Apple golden                115    kg
Others
Egg            انڈے شیور    95     dozen
This forum is really helpful and has saved hundreds of thousands of hours. Thanks!
url <- "https://ictadministration.gov.pk/services/price-list/"
Complete code:
library(rvest)
scraping_wiki <- read_html("https://ictadministration.gov.pk/services/price-list/")

library(magrittr)
price_date <- scraping_wiki %>%
  html_nodes(".tm-article-content > ol:nth-child(1) > div:nth-child(1)") %>%
  html_text() %>%
  strsplit(split = "\n") %>%
  unlist() %>%
  .[. != ""]
price_date <- gsub(":", "", price_date)

price_list <- scraping_wiki %>%
  html_nodes(".xl-tbl") %>%
  html_text() %>%
  strsplit(split = "\n") %>%
  unlist() %>%
  .[. != ""]
Wow, messy. This gets you close:
library(dplyr)
library(stringr)

unis  <- price_list %>% str_extract(pattern = "<[[:print:]]*>")   # the <U+....> block
words <- price_list %>% str_extract(pattern = "[A-Z a-z<]*") %>% gsub("<U", "", x = .)   # leading English name
price <- price_list %>% str_extract(pattern = "[0-9]* per") %>% gsub("per", "", x = .)   # digits before "per"
quant <- price_list %>% str_extract(pattern = "per [a-z]*")   # "per kg" / "per dozen"
df <- tibble(Name = words, Unicode = unis, Price = price, Quantity = quant)
Result:
> head(df)
# A tibble: 6 x 4
Name Unicode Price Quantity
<chr> <chr> <chr> <chr>
1 Vegetables NA NA NA
2 " Garlic Desi" <U+062A><U+06BE><U+0648><U+0645> <U+062F><U+06CC><U+0633><U+06CC> "140~ per kg
3 " Fresh" <U+0641><U+0631><U+0627><U+0634><U+0628><U+06CC><U+0646> " " per kg
4 Fruits NA NA NA
5 " Apple Kala Kolu Irani" <U+0633><U+06CC><U+0628> <U+06A9><U+0627><U+0644><U+0627> <U+06A9><U+~ "168~ per kg
6 " Apple golden " NA "115~ per kg
I'm not a regex genius, so I'm sure there must be a cleaner way.
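A small optional cleanup of that result might help too; this is only a sketch against the df built above (it trims the padding, converts Price to numeric, and strips the leading "per" from Quantity):
library(dplyr)
library(stringr)

df <- df %>%
  mutate(Name     = str_trim(Name),
         Price    = as.numeric(str_trim(Price)),   # rows without a price become NA (with a coercion warning)
         Quantity = str_trim(str_remove(Quantity, "^per\\s*")))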
Here's a functional approach. It's always good to learn to find a workaround with functions.
Following are the steps:
1. Clean price_list and keep the name, number, and quantity.
2. Write functions that do that.
3. Apply the functions to the new data frame.
library(data.table)
library(stringr)

# clean text
clean_list <- lapply(price_list, function(i) gsub("<[^>]+>", "", i))
clean_list <- lapply(clean_list, function(i) gsub('per', '', i))
clean_list <- lapply(clean_list, str_trim)

# convert list to data frame
df <- data.table(do.call('rbind', clean_list))
colnames(df) <- 'text'
# helper functions
get_number <- function(j)
{
p1 <- unlist(strsplit(j, ' '))
p2 <- grepl('\\d+',p1)
if(sum(as.integer(p2)) ==1) return (grep('\\d+',p1,value = T))
else return (0)
}
get_quantity <- function(j)
{
p1 <- unlist(strsplit(j, ' '))
p2 <- grepl('kg|dozen',p1)
if(sum(as.integer(p2)) ==1) return (grep('kg|dozen',p1,value = T))
else return (NA)
}
# apply functions and get output
df[,Name := sapply(text, function(i) unlist(strsplit(i, ' '))[1])]
df[,Price := sapply(text, get_number)]
df[,Quantity := sapply(text, get_quantity)]
df[,Unicode := sapply(price_list, function(x) str_extract(string = x, pattern = '<[[:print:]]*>'))]
head(df)
text Name Price Quantity Unicode
1 Vegetables Vegetables 0 NA NA
2 Garlic Desi 140 kg Garlic Desi 140 kg <U+062A><U+06BE><U+0648><U+0645> <U+062F><U+06CC><U+0633><U+06CC>
3 Fresh-bean — kg Fresh-bean 0 kg <U+0641><U+0631><U+0627><U+0634><U+0628><U+06CC><U+0646>
4 Fruits Fruits 0 NA NA
5 Apple Kala Kolu Irani 168 kg Apple Kala Kolu Irani 168 kg <U+0633><U+06CC><U+0628> <U+06A9><U+0627><U+0644><U+0627> <U+06A9><U+0648><U+0644><…
6 Apple golden 115 kg Apple golden 115 kg NA
Related
I have two datasets which I want to merge:
df1 <- data.frame( title =
c("residence mozart",
"les hesperides auteuil mirabeau",
"chaillot",
"jouvenet",
"retraite dosne"))
df2 <- data.frame(title = c("terrasses mozart", "chaillot",
"villa jules janin", "retraites dosne"))
And I would like to have something like this:
1 residence mozart NA (or terrasses mozart)
2 les hesperides auteuil mirabeau NA
3 chaillot chaillot
4 jouvenet NA
5 retraite dosne retraites dosne
Here is what I did:
x = data.frame(title_df2 = matrix(ncol = 1, nrow = nrow(df1)))
for (i in nbr){
x[i, ] <- grep(df1$title[i], df2$title, value = T)
}
It does not work at all! Even though grep(df1$title[3], df2$title, value = T) works and returns "chaillot"!
If I understand correctly
df1 <- data.frame( title =
c("residence mozart",
"les hesperides auteuil mirabeau",
"chaillot",
"jouvenet",
"retraite dosne"))
df2 <- data.frame(title = c("terrasses mozart", "chaillot",
"villa jules janin", "retraites dosne"))
library(dplyr)
library(fuzzyjoin)
stringdist_left_join(x = df1, y = df2, method = "jw", distance_col = "d") %>%
filter(d < 0.25) %>%
right_join(df1, by = c("title.x" = "title"))
#> Joining by: "title"
#> title.x title.y d
#> 1 residence mozart terrasses mozart 0.23863636
#> 2 chaillot chaillot 0.00000000
#> 3 retraite dosne retraites dosne 0.09206349
#> 4 les hesperides auteuil mirabeau <NA> NA
#> 5 jouvenet <NA> NA
Created on 2021-04-19 by the reprex package (v2.0.0)
The issue is that grep returns a vector of length 0 when there is no match.
grep('a', 'hello', value = TRUE)
#character(0)
If we want to keep the same for loop, adjust the code to return NA wherever there is no match (note that nbr also has to be defined):
nbr <- seq_len(nrow(df1))
for (i in nbr){
x[i, ] <- c(grep(df1$title[i], df2$title, value = TRUE), NA_character_)[1]
}
Output:
x
# title_df2
#1 <NA>
#2 <NA>
#3 chaillot
#4 <NA>
#5 <NA>
You could do:
a <-Vectorize(agrep, "pattern")(df1$title, df2$title, value=TRUE)
is.na(a)<- lengths(a) == 0
cbind(df1,df2_title=unlist(a, use.names = FALSE))
title df2_title
1 residence mozart <NA>
2 les hesperides auteuil mirabeau <NA>
3 chaillot chaillot
4 jouvenet <NA>
5 retraite dosne retraites dosne
To achieve your goal, you need to match on each individual word of the strings in df1's title.
As used in your example, grep returns an output only if there is a match on the full string.
To do that, you'll need to grep for the words in df1 that are also contained in df2. This can be achieved with an "or" condition over the full words contained in each string.
nbr <- 1:nrow(x)
for (i in nbr){
  # build a regex that checks whether one of the words in df1's title also appears in df2;
  # the \\b ... \\b escapes ensure a full match on the single word
  pattern <- paste("\\b", unlist(strsplit(as.character(df1$title[i]), " ")), "\\b", collapse = "|", sep = "")
  # grep on the constructed regex
  fitInDataFrame <- grep(pattern, as.character(df2$title), value = T)
  x[i, ] <- ifelse(length(fitInDataFrame) == 0, NA, fitInDataFrame)
}
Here is the output:
> x
title_df2
1 terrasses mozart
2 <NA>
3 chaillot
4 <NA>
5 retraites dosne
You can do a left_join(df1, df2, by = c('title' = 'title'), keep = TRUE), specifying keep = TRUE so it doesn't drop df2's join column.
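A minimal sketch of that call, using the df1/df2 defined above (an exact equi-join only catches the identical title, "chaillot"; the other rows get NA in title.y):
library(dplyr)

# keep = TRUE retains df2's join column alongside df1's, as title.x / title.y
left_join(df1, df2, by = c("title" = "title"), keep = TRUE)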
Or, for this particular case, you could do this:
df1$newcol <- ifelse(df1$title %in% df2$title, df1$title, NA)
This adds a new column to df1, filled by going through each title in df1 and checking whether that title is in df2: if so, that title is written in the new column; if not, NA is written in that row. You could put something else there instead, like:
df1$newcol <- ifelse(df1$title %in% df2$title, 'Title in DF2', 'Not in DF2')
I have a data frame where one of the columns has several pieces of information separated by ";", like the following:
DF = data.frame(a = c(1,1,1,2,2), b = c('aaa','aaa','aba','abc','ccc'),
extra_info = c(
'animal=horse;color=orange;shape=circle',
'animal=monkey;shape=square;value=532',
'animal=horse;color=blue;shape=square;value=321',
'animal=dog;color=green;value=678',
'color=pink;shape=triangle'
))
I can't use read.table because I'm already using a different function to read the data (and the content of extra_info differs from row to row, so the columns would get messed up). What I wish to do is separate all this information into different columns and assign proper names accordingly, such as:
a b animal color shape value
1 aaa horse orange circle NA
1 aaa monkey NA square 532
1 aba horse blue square 321
2 abc dog green NA 678
2 ccc NA pink triangle NA
So far, I've tried:
new_cols = DF %>% separate(extra_info, c(LETTERS[1:4]), sep = ";")
new_cols %>% separate(A, c("key","value"), sep = '=') %>%
separate(B, c("key","value"), sep = '=') %>%
separate(C, c("key","value"), sep = '=') %>%
separate(D, c("key","value"), sep = '=') %>%
pivot_wider(names_from = c("key"), values_from = c("value"))
But it doesn't work as expected.
Here's an approach where I change the syntax of your key-value pairs into valid JSON syntax and use jsonlite::fromJSON to parse it:
library(purrr)
library(dplyr)
library(stringr)
library(jsonlite)
DF %>%
mutate(
json = str_replace_all(extra_info, pattern = "\\b", replacement = '"'),
json = str_replace_all(json, pattern = fixed("="), replacement = ":"),
json = str_replace_all(json, pattern = fixed(";"), replacement = ","),
json = paste("{", json, "}"),
) %>%
pull(json) %>%
map(jsonlite::fromJSON) %>%
map(as.data.frame) %>%
bind_rows %>%
cbind(DF, .)
# a b extra_info animal color shape value
# 1 1 aaa animal=horse;color=orange;shape=circle horse orange circle <NA>
# 2 1 aaa animal=monkey;shape=square;value=532 monkey <NA> square 532
# 3 1 aba animal=horse;color=blue;shape=square;value=321 horse blue square 321
# 4 2 abc animal=dog;color=green;value=678 dog green <NA> 678
# 5 2 ccc color=pink;shape=triangle <NA> pink triangle <NA>
Here is a base R option using gsub + eval + str2expression
v <- DF$extra_info
p <- gsub(";", ",", gsub("(?<=\\=)(\\w+)", "'\\1'", v, perl = TRUE))
nms <- unique(unlist(regmatches(v, gregexpr("\\w+(?=\\=)", v, perl = TRUE))))
q <- unname(Map(function(x) setNames(eval(str2expression(x))[nms], nms), sprintf("c(%s)", p)))
cbind(DF[c("a","b")], type.convert(data.frame(do.call(rbind, q)), as.is = TRUE))
which gives
a b animal color shape value
1 1 aaa horse orange circle NA
2 1 aaa monkey <NA> square 532
3 1 aba horse blue square 321
4 2 abc dog green <NA> 678
5 2 ccc <NA> pink triangle NA
It's a bit neater with the stringr package, but if you just want base R you can use the following. In the pattern (?<=animal=)\\w+(?=\\b), the \\w+ is what's actually returned: one or more (+) word characters (\\w). It is swapped for \\d+ for 'value', since digits are required there; alternatively you could use [:alnum:]+ for both.
The (?<=animal=) structure specifies that the match must be preceded by "animal=", and (?=\\b) requires that it be followed by a word boundary (\\b). You could get a bit more specific and replace \\b with (,|;|$), which stands for a comma, a semicolon, or the end of the line (the original question had commas in some places). The variable names and patterns could also be built dynamically in a loop over the four keys; a sketch of such a loop follows the code below.
pattern <- "(?<=animal=)\\w+(?=\\b)"
DF$animal <- sapply(regmatches(DF$extra_info, regexec(pattern, DF$extra_info , perl=T)), "[", 1)
pattern <- "(?<=color=)\\w+(?=\\b)"
DF$color<- sapply(regmatches(DF$extra_info, regexec(pattern, DF$extra_info , perl=T)), "[", 1)
pattern <- "(?<=shape=)\\w+(?=\\b)"
DF$shape<- sapply(regmatches(DF$extra_info, regexec(pattern, DF$extra_info , perl=T)), "[", 1)
pattern <- "(?<=value=)\\d+(?=\\b)"
DF$value <- sapply(regmatches(DF$extra_info, regexec(pattern, DF$extra_info , perl=T)), "[", 1)
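As mentioned above, the variable names and patterns can also be built dynamically. Here is a sketch of such a loop, under the assumption that the four keys are known in advance:
keys <- c("animal", "color", "shape", "value")
for (k in keys) {
  # build the lookbehind pattern from the key name, e.g. "(?<=animal=)\\w+"
  pattern <- paste0("(?<=", k, "=)\\w+")
  # rows without the key yield character(0); indexing with "[" turns that into NA
  DF[[k]] <- sapply(
    regmatches(DF$extra_info, regexec(pattern, DF$extra_info, perl = TRUE)),
    "[", 1
  )
}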
If you're happy to use tidyverse/stringr, here is the code.
DF <- DF %>%
mutate(animal = str_extract(extra_info, "(?<=animal=)\\w+(?=\\b)" )) %>%
mutate(color = str_extract(extra_info, "(?<=color=)\\w+(?=\\b)" )) %>%
mutate(shape = str_extract(extra_info, "(?<=shape=)\\w+(?=\\b)" )) %>%
mutate(value = str_extract(extra_info, "(?<=value=)\\d+(?=\\b)" ))
For more info on string manipulation and regular expressions, see the stringr cheat sheet here: https://github.com/rstudio/cheatsheets/blob/master/strings.pdf
library(dplyr)
library(stringr)

col_names <- unlist(str_extract_all(DF$extra_info[3], "(?<=^|;)\\w+"))
DF %>%
  mutate(animal = str_extract(extra_info, paste0("(?<=", col_names[1], "=)\\w+")),
         color  = str_extract(extra_info, paste0("(?<=", col_names[2], "=)\\w+")),
         shape  = str_extract(extra_info, paste0("(?<=", col_names[3], "=)\\w+")),
         value  = str_extract(extra_info, paste0("(?<=", col_names[4], "=)\\w+")))
a b extra_info animal color shape value
1 1 aaa animal=horse;color=orange;shape=circle horse orange circle <NA>
2 1 aaa animal=monkey;shape=square;value=532 monkey <NA> square 532
3 1 aba animal=horse;color=blue;shape=square;value=321 horse blue square 321
4 2 abc animal=dog;color=green;value=678 dog green <NA> 678
5 2 ccc color=pink;shape=triangle <NA> pink triangle <NA>
In R, there is a long character string "mr" as below. How can I split "mr" by number (into three short strings)?
mr <- 'total amount 25.36 expense -2 promotion discount-2.56'
# 'total amount 25.36','expense -2','promotion discount-2.56'
An option with tidyverse
library(dplyr)
library(tidyr)
tibble(col1 = mr) %>%
separate_rows(col1, sep="(?<=\\d) ") %>%
separate(col1, into = c("Description", "Amount"),
sep = "(?<=[a-z])\\s*(?=[-0-9])", convert = TRUE)
# A tibble: 3 x 2
# Description Amount
# <chr> <dbl>
#1 total amount 25.4
#2 expense -2
#3 promotion discount -2.56
Adding to @rawr's comment, if you want to have it as a data frame:
mr <- 'total amount 25.36 expense -2 promotion discount-2.56'
splt <- strsplit(mr, '(?<=\\d) ', perl = TRUE)[[1]]
df <- data.frame("Desciption" = gsub("[^a-z ]", "", splt),
"Amount" = as.numeric(gsub("[^0-9.-]", "", splt)))
df
          Description Amount
1 total amount 25.36
2 expense -2.00
3 promotion discount -2.56
I have a couple of PDFs and I wish to extract the shareholders table. How can I specify that only the table appearing after the string 'TWENTY LARGEST SHAREHOLDERS' is extracted?
I tried the following but was not quite sure about the function part.
library("pdftools")
library("tidyverse")
url <- c("https://www.computershare.com/News/Annual%20Report%202019.pdf?2")
raw_text <- map(url, pdf_text)
clean_table <- function(table){
table <- str_split(table, "\n", simplify = TRUE)
table_start <- stringr::str_which(table, "TWENTY LARGEST SHAREHOLDERS")
table <- table[1, (table_start +1 ):(table_end - 1)]
table <- str_replace_all(table, "\\s{2,}", "|")
text_con <- textConnection(table)
data_table <- read.csv(text_con, sep = "|")
colnames(data_table) <- c("Name", "Number of Shares", "Percentage")
}
shares <- map_df(raw_text, clean_table)
Try this. Besides some minor issues, the main change is that I first get the page which contains the desired table. BTW: you have to search for "Twenty Largest Shareholders" and not "TWENTY LARGEST SHAREHOLDERS".
library(pdftools)
library(tidyverse)
# download pdf
url <- c("https://www.computershare.com/News/Annual%20Report%202019.pdf?2")
raw_text <- map(url, pdf_text)
clean_table1 <- function(raw) {
# Split the single pages
raw <- map(raw, ~ str_split(.x, "\\n") %>% unlist())
# Concatenate the splitted pages
raw <- reduce(raw, c)
table_start <- stringr::str_which(tolower(raw), "twenty largest shareholders")
table_end <- stringr::str_which(tolower(raw), "total")
table_end <- table_end[min(which(table_end > table_start))]
table <- raw[(table_start + 3 ):(table_end - 1)]
table <- str_replace_all(table, "\\s{2,}", "|")
text_con <- textConnection(table)
data_table <- read.csv(text_con, sep = "|")
colnames(data_table) <- c("Name", "Number of Shares", "Percentage")
data_table
}
shares <- map_df(raw_text, clean_table1)
head(shares)
#> Name Number of Shares
#> 1 J P Morgan Nominees Australia Pty Limited 109,500,852
#> 2 Citicorp Nominees Pty Limited 57,714,777
#> 3 Mr Chris Morris 32,231,000
#> 4 National Nominees Limited 19,355,892
#> 5 Welas Pty Ltd 18,950,000
#> 6 BNP Paribas Nominees Pty Ltd <Agency Lending DRP A/C> 11,520,882
#> Percentage
#> 1 20.17
#> 2 10.63
#> 3 5.94
#> 4 3.56
#> 5 3.49
#> 6 2.12
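If numeric share counts are needed downstream, the thousands separators can be stripped afterwards; a small sketch against the shares data frame built above:
# "Number of Shares" is read as character because of the commas
shares$`Number of Shares` <- as.numeric(gsub(",", "", shares$`Number of Shares`))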
I have two data frames with text data about users:
x <- data.frame("Address_line1" = c("123 Street","21 Hill drive"),
"City" = c("Chicago","London"), "Phone" = c("123","219"))
y <- data.frame("Address_line1" = c("461 road","PO Box 123","543 Highway"),
"City" = c("Dallas","Paris","New York" ), "Phone" = c("235","542","842"))
> x
Address_line1 City Phone
1 123 Street Chicago 123
2 21 Hill drive London 219
> y
Address_line1 City Phone
1 461 road Dallas 235
2 PO Box 123 Paris 542
3 543 Highway New York 842
For each row of the x dataframe, I want to iterate over all the rows in y, compare the corresponding columns (address to address, city to city etc.) and obtain the string distance for each.
So for the first row of x, I want an output like:
[16 20 20]
Where 16 is
stringdist("123 Street","461 road", method = "lv")+
stringdist("Chicago","Dallas", method = "lv")+
stringdist("123","235", method = "lv")
20 is the sum for the second row and 20 for the third.
Similarly, I want a list containing nrow(y) elements for each row of x.
We can use a for loop:
library(stringdist)

out <- c()
for(i in seq_len(nrow(x))) {
for(j in seq_len(nrow(y))) {
x1 <- x[i,]; y1 <- y[j,]
out <- c(out, sum(unlist(Map(stringdist, x1, y1,
MoreArgs = list(method = 'lv')))))
}
}
out
#[1] 16 20 20 19 20 21
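If the goal is literally a list with one element per row of x (each holding the nrow(y) distances), the flat vector can be split afterwards; a small sketch using the out just computed:
# out is filled row-of-x by row-of-x, so rep() recovers the grouping
split(out, rep(seq_len(nrow(x)), each = nrow(y)))
#> $`1`
#> [1] 16 20 20
#>
#> $`2`
#> [1] 19 20 21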
It is not clear exactly what the expected output is. We can also use tidyverse methods:
library(dplyr)
library(tidyr)
library(purrr)
library(stringdist)
library(stringr)
crossing(x, y, .name_repair = 'unique') %>%
rename_all(~ str_remove(., "\\.{2,}")) %>%
split.default(str_remove(names(.), "\\d+$")) %>%
map(~ pmap(.x, ~ stringdist(..1, ..2, method = 'lv'))) %>%
transpose %>%
map_dbl(~ flatten_dbl(.x) %>%
sum)
#[1] 16 20 20 19 21 20