Some data
# Sample URLs to classify, each paired with a running number.
example_df <- data.frame(
  url = c(
    "blog/blah",
    "blog/?utm_medium=foo",
    "blah",
    "subscription/apples",
    "UK/something"
  ),
  numbs = seq_len(5)
)

# Lookup table: the text to look for in a URL and the group it maps to.
lookup_df <- data.frame(
  string = c("blog", "subscription", "UK"),
  group = c("blog", "subs", "UK")
)
library(fuzzyjoin)
# Left-join lookup_df onto example_df, pairing `url` with `string` and using
# `%in%` as the match test between the two columns.
data_combined <- fuzzy_left_join(
  example_df,
  lookup_df,
  by = c("url" = "string"),
  match_fun = `%in%`
)
data_combined
url numbs string group
1 blog/blah 1 <NA> <NA>
2 blog/?utm_medium=foo 2 <NA> <NA>
3 blah 3 <NA> <NA>
4 subscription/apples 4 <NA> <NA>
5 UK/something 5 <NA> <NA>
I expected data_combined to have values for string and group where there's a match based on match_fun. Instead all NA.
For example, the first value of string in lookup_df is 'blog'. Since this appears in the first url value of example_df ('blog/blah'), I expected `%in%` to report a match, giving 'blog' and 'blog' in the string and group fields.
If we want to do a partial match with the word before the / in the 'url' with the 'string' column in 'lookup_df', we could extract that substring as a new column and then do a regex_left_join
library(dplyr)
library(fuzzyjoin)
library(stringr)
# Keep only the part of each URL before the first "/" as the join key.
url_keyed <- mutate(example_df, string = str_remove(url, "\\/.*"))

# Match that key against the patterns in lookup_df$string; rows without a
# match keep NA in `group`.
url_grouped <- regex_left_join(url_keyed, lookup_df, by = "string")

select(url_grouped, url, numbs, group)
-output
# url numbs group
#1 blog/blah 1 blog
#2 blog/?utm_medium=foo 2 blog
#3 blah 3 <NA>
#4 subscription/apples 4 subs
#5 UK/something 5 UK
Related
I have a tibble with a character column. The character in each row is a set of words like this: "type:mytype,variable:myvariable,variable:myothervariable:asubvariableofthisothervariable". Things like that. I want to either convert this into columns in my tibble (a column "type", a column "variable", and so on; but then I don't really know what to do with my 3rd level words), or convert it to a column list x, so that x has a structure of sublists: x$type, x$variable, x$variable$myothervariable.
I'm not sure which is the best approach, and I also don't know how to implement either of the two approaches that I suggest here. I should add that I have at most 3 levels, and more 1st-level words than just "type" and "variable".
Small Reproducible Example:
# Reproducible example: one row per document, with its keywords packed into a
# single comma-separated string of colon-delimited key:value[:subvalue] items.
# The columns are supplied to tibble() directly (assigning length-3 columns to
# an empty tibble fails), and the strings are separated by commas.
df <- tibble(
  id = 1:3,
  keywords = c(
    "type:novel,genre:humor:black,year:2010",
    "type:dictionary,language:english,type:bilingual,otherlang:french",
    "type:essay,topic:philosophy:purposeoflife,year:2005"
  )
)
# expected would be in idea 1:
colnames(df)
# id, keywords, type, genre, year,
# language, otherlang, topic
# on idea 2:
colnames(df)
# id, keywords, keywords.as.list
We can use separate_rows from tidyr to split the 'keywords' column by ,, then with cSplit, split the column 'keywords' into multiple columns at :, reshape to 'long' format with pivot_longer and then reshape back to 'wide' with pivot_wider
library(dplyr)
library(tidyr)
library(data.table)
library(splitstackshape)
# One row per comma-separated keyword item.
keyword_rows <- separate_rows(df, keywords, sep = ",")

# Split each item at ":" into keywords_1 (the key) and keywords_2 / keywords_3
# (its values).
keyword_parts <- cSplit(keyword_rows, "keywords", ":")

keyword_parts %>%
  # Stack the value columns, dropping the empty third level where absent.
  pivot_longer(cols = keywords_2:keywords_3, values_drop_na = TRUE) %>%
  select(-name) %>%
  # Number repeated keys within an id so each repeat lands on its own row.
  mutate(rn = rowid(id, keywords_1)) %>%
  pivot_wider(names_from = keywords_1, values_from = value) %>%
  select(-rn) %>%
  # Re-infer column types (e.g. year becomes integer) without making factors.
  type.convert(as.is = TRUE)
-output
# A tibble: 6 x 7
# id type genre year language otherlang topic
# <int> <chr> <chr> <int> <chr> <chr> <chr>
#1 1 novel humor 2010 <NA> <NA> <NA>
#2 1 <NA> black NA <NA> <NA> <NA>
#3 2 dictionary <NA> NA english french <NA>
#4 2 bilingual <NA> NA <NA> <NA> <NA>
#5 3 essay <NA> 2005 <NA> <NA> philosophy
#6 3 <NA> <NA> NA <NA> <NA> purposeoflife
data
# Input data for the keyword example, stored with the tibble class vector.
df <- structure(
  list(
    id = 1:3,
    keywords = c(
      "type:novel,genre:humor:black,year:2010",
      "type:dictionary,language:english,type:bilingual,otherlang:french",
      "type:essay,topic:philosophy:purposeoflife,year:2005"
    )
  ),
  row.names = c(NA, -3L),
  class = c("tbl_df", "tbl", "data.frame")
)
I have a method for replacing values in a dataframe by matching id values. This works well for small data sets but not well on large datasets. Does anyone have a suggestion on how I might make this process more computationally effective?
Below is an example of my R code. I am using the tidyverse package.
# Small test case for the delta replacement
test_df <- data.frame(
  ID = c(1, 2, 3, 4, 5, 6, 7, 8, 8, 9),
  val = c(1, NA, 3, 4, 5, 6, 7, 8, NA, 9)
)
delta_test <- data.frame(
  ID = c(2, 8, 9),
  val = c(2, 100, 50)
)

# Replacement value for each row of test_df, looked up by ID; NA where the ID
# has no entry in delta_test.
delta_val <- delta_test$val[match(test_df$ID, delta_test$ID)]

# Keep the existing value unless a replacement is available.
test_df$val <- ifelse(is.na(delta_val), test_df$val, delta_val)
test_df
You can try to join test_df with delta_test and select the first non-NA value using coalesce.
library(dplyr)
# Attach the replacement values by ID; the clashing `val` columns come back as
# val.x (from test_df) and val.y (from delta_test).
patched <- left_join(test_df, delta_test, by = "ID")

# Prefer the replacement when present, otherwise keep the original value.
patched <- mutate(patched, val = coalesce(val.y, val.x))

test_df <- select(patched, ID, val)
test_df
# ID val
#1 1 1
#2 2 2
#3 3 3
#4 4 4
#5 5 5
#6 6 6
#7 7 7
#8 8 100
#9 8 100
#10 9 50
In base R this can be implemented as :
# Base R equivalent: left-merge the replacement values by ID, prefer val.y
# (from delta_test) over val.x (from test_df) when it is not NA, then keep only
# ID and val so the helper columns val.x / val.y do not remain in the result.
test_df <- transform(
  merge(test_df, delta_test, by = "ID", all.x = TRUE),
  val = ifelse(is.na(val.y), val.x, val.y)
)[c("ID", "val")]
Trying to spread two column data to a format where there will be some NA values.
dataframe:
# Long-format input: each TXT row starts a record, and any LSL / USL rows that
# follow it belong to that record. Mixing strings and numbers in c() makes
# Values a character vector. `FALSE` is spelled out because `F` is an ordinary
# variable that can be reassigned.
df <- data.frame(
  Names = c("TXT", "LSL", "TXT", "TXT", "TXT", "USL", "LSL"),
  Values = c("apple", -2, "orange", "banana", "pear", 10, -1),
  stringsAsFactors = FALSE
)
If a row has the name TXT, the following rows named LSL or USL belong to that TXT row.
For ex:
In the first row, the Name is TXT and the Value is apple. The next row is LSL, so its value is apple's LSL; since there is no USL row before the next TXT, apple's USL will be NA.
If there is a TXT followed by another TXT, then LSL and USL values for that row will be NA
trying to create this:
I tried using spread with row numbers as unique identifier but that's not what I want:
# Attempt: number the rows within each Names value and use that number as the
# row identifier when spreading Names into columns.
by_name <- group_by(df, Names)
numbered <- mutate(by_name, row = row_number())
spread(numbered, key = Names, value = Values)
I guess I need to create following full table with NAs then spread but couldn't figure out how.
We can expand the dataset with complete after creating a grouping index based on the occurrence of 'TXT'
library(dplyr)
library(tidyr)
# Every Names value that should exist within each record.
name_levels <- unique(df$Names)

df %>%
  # A new record starts at each TXT row, so the running count of TXT rows
  # serves as the record index.
  mutate(grp = cumsum(Names == "TXT")) %>%
  group_by(grp) %>%
  # Add the missing Names rows to each record, with NA Values.
  complete(Names = name_levels) %>%
  ungroup() %>%
  spread(Names, Values) %>%
  select(TXT, LSL, USL)
# A tibble: 4 x 3
# TXT LSL USL
# <chr> <chr> <chr>
#1 apple -2 <NA>
#2 orange <NA> <NA>
#3 banana <NA> <NA>
#4 pear -1 10
In data.table, we can use dcast :
library(data.table)
# Cast to wide format with one row per record (running count of TXT rows) and
# one column per Names value. setDT() converts df to a data.table by reference.
wide <- dcast(setDT(df), cumsum(Names == "TXT") ~ Names, value.var = "Values")

# Drop the leading record-index column.
wide[, -1]
# LSL TXT USL
#1: -2 apple <NA>
#2: <NA> orange <NA>
#3: <NA> banana <NA>
#4: -1 pear 10
I have a dataframe with two columns which can contain literally any character of various formats and i would like to match them.
library(stringr)
library(fuzzyjoin)
# Two tables whose string columns should be matched against each other.
x <- data.frame(
  idX = 1:3,
  string = c("silver", "30BEDJE202AA", "30BEDJE2027")
)
y <- data.frame(
  idY = letters[1:3],
  seed = c("sliver", "30BEDJE202ABC", "30BEDJE2027BL")
)

# Make sure both join columns are plain character vectors.
x$string <- as.character(x$string)
y$seed <- as.character(y$seed)

# Left-join y onto x, using str_detect(string, seed) as the match test.
fuzzy_left_join(x, y, by = c(string = "seed"), match_fun = str_detect)
Here is the result i get when running the above code:
idX string idY seed
1 1 silver <NA> <NA>
2 2 30BEDJE202AA <NA> <NA>
3 3 30BEDJE2027 <NA> <NA>
And this is what i would like to have:
idX string idY seed
1 1 silver a sliver
2 2 30BEDJE202AA b 30BEDJE202ABC
3 3 30BEDJE2027 c 30BEDJE2027BL
Is there a way to get there?
I have a large dataframe and I would like to split a column into many columns based on two conditions the caret character ^ and the letter following IMM-. Based on the data below Column 1 would be split into columns named IMM-A, IMM-B, IMM-C, and IMM-W. I tried the separate function but it only works if you specify the column names and because my data is not uniform I don't always know what the column names should be.
SampleId Column1
1 IMM-A*010306+IMM-A*0209^IMM-B*6900+IMM-B*779999^IMM-C*1212+IMM-C*3333
2 IMM-A*010306+IMM-A*0209^IMM-C*6900+IMM-C*779999^IMM-W*1212+IMM-W*3333
3 IMM-B*010306+IMM-B*0209^IMM-C*6900+IMM-C*779999^IMM-W*1212+IMM-W*3333
The expected output would be;
SampleId IMM-A IMM-B IMM-C IMM-W
1 IMM-A*010306+IMM-A*0209 IMM-B*6900+IMM-B*779999 IMM-C*1212+IMM-C*3333
2 IMM-A*010306+IMM-A*0209 IMM-C*6900+IMM-C*779999 IMM-W*1212+IMM-W*3333
3 IMM-B*010306+IMM-B*0209 IMM-C*6900+IMM-C*779999 IMM-W*1212+IMM-W*3333
Not clear about the expected output. Based on the description, we may need
library(tidyverse)
# Break each Column1 string at "*", "+" and "^"; the pieces then alternate
# between a name (odd positions) and its value (even positions).
allele_tokens <- strsplit(df$Column1, "[*+^]")

per_sample <- map(allele_tokens, function(tokens) {
  loci <- tokens[c(TRUE, FALSE)]
  codes <- tokens[c(FALSE, TRUE)]
  # Two-column (values, ind) table, then one column per name; `rn` keeps
  # repeated names on separate rows.
  stack(setNames(as.list(codes), loci)) %>%
    group_by(ind) %>%
    mutate(rn = row_number()) %>%
    spread(ind, values)
})

# Label each per-sample table with its SampleId and stack them together.
per_sample %>%
  set_names(df$SampleId) %>%
  bind_rows(.id = "SampleId") %>%
  select(-rn)
# A tibble: 6 x 5
# SampleId `IMM-A` `IMM-B` `IMM-C` `IMM-W`
# <chr> <chr> <chr> <chr> <chr>
#1 1 010306 6900 1212 <NA>
#2 1 0209 779999 3333 <NA>
#3 2 010306 <NA> 6900 1212
#4 2 0209 <NA> 779999 3333
#5 3 <NA> 010306 6900 1212
#6 3 <NA> 0209 779999 3333
Update
Based on the OP's expected output, we expand the data by splitting the 'Column1' at the ^ delimiter, then separate the 'Column1' into 'colA', 'colB' at the delimiter *, remove the 'colB' and spread to 'wide' format
df %>%
  # One row per "^"-separated group.
  separate_rows(Column1, sep = "\\^") %>%
  # The target column name is the text before the first "*". It is derived
  # directly because each group contains two "*", so a two-piece separate()
  # warns about discarded pieces on every row.
  mutate(colA = sub("[*].*", "", Column1)) %>%
  # One column per name, holding the full group string; blank where absent.
  spread(colA, Column1, fill = "")
#SampleId IMM-A IMM-B IMM-C IMM-W
#1 1 IMM-A*010306+IMM-A*0209 IMM-B*6900+IMM-B*779999 IMM-C*1212+IMM-C*3333
#2 2 IMM-A*010306+IMM-A*0209 IMM-C*6900+IMM-C*779999 IMM-W*1212+IMM-W*3333
#3 3 IMM-B*010306+IMM-B*0209 IMM-C*6900+IMM-C*779999 IMM-W*1212+IMM-W*3333
data
# Input data for the IMM example: one "^"-delimited allele string per sample.
df <- data.frame(
  SampleId = 1:3,
  Column1 = c(
    "IMM-A*010306+IMM-A*0209^IMM-B*6900+IMM-B*779999^IMM-C*1212+IMM-C*3333",
    "IMM-A*010306+IMM-A*0209^IMM-C*6900+IMM-C*779999^IMM-W*1212+IMM-W*3333",
    "IMM-B*010306+IMM-B*0209^IMM-C*6900+IMM-C*779999^IMM-W*1212+IMM-W*3333"
  ),
  stringsAsFactors = FALSE
)