Match same strings over two different vectors - r

Say we have two different datasets:
Dataset A:
ids name price
1234 bread 1.5
245r7 butter 1.2
123984 red wine 5
43498 beer 1
235897 cream 1.8
Dataset B:
ids name price
24908 lait 1
1234,089 pain 1.7
77289,43498 bière 1.5
245r7 beurre 1.4
My goal is to match all the products sharing at least one ID and bring them together into a new dataset that should look as follows:
id a_name b_name a_price b_price
1234 bread pain 1.5 1.7
245r7 butter beurre 1.2 1.4
43498 beer bière 1 1.5
Is this feasible using stringr or any other R package?

You can create a long dataset with separate_rows and then do a join.
library(dplyr)
library(tidyr)
B %>%
separate_rows(ids, sep = ',') %>%
inner_join(A, by = 'ids')
# ids name.x price.x name.y price.y
# <chr> <chr> <dbl> <chr> <dbl>
#1 1234 pain 1.7 bread 1.5
#2 43498 bière 1.5 beer 1
#3 245r7 beurre 1.4 butter 1.2

We can use the sqldf package here:
library(sqldf)
# Join on "a.ids appears as one of the comma-separated entries of b.ids".
# Both sides are wrapped in commas before the LIKE comparison so that an id
# only matches a complete entry (e.g. '1234' matches inside ',1234,089,' but
# would not match inside ',12345,').
# NOTE(review): the query reads tables named df_a and df_b -- presumably the
# two data frames from the question; confirm the object names before running.
sql <- "SELECT a.ids AS id, a.name AS a_name, b.name AS b_name, a.price AS a_price,
b.price AS b_price
FROM df_a a
INNER JOIN df_b b
ON ',' || b.ids || ',' LIKE '%,' || a.ids || ',%'"
# Execute the query and keep the joined rows in `output`.
output <- sqldf(sql)

As separate_rows (my favorite) has already been provided by Ronak Shah,
here is another strategy using strsplit and unnest():
library(tidyr)
library(dplyr)
# Split each comma-separated id string into a list-column, expand that column
# to one row per id, then keep only the ids that also occur in df_A.
df_B %>%
  mutate(ids = strsplit(as.character(ids), ",")) %>%
  # Name the column explicitly: calling unnest() without `cols` is deprecated.
  unnest(cols = ids) %>%
  inner_join(df_A, by = "ids")
ids name.x price.x name.y price.y
<chr> <chr> <dbl> <chr> <chr>
1 1234 pain 1.7 bread 1.5
2 43498 bière 1.5 beer 1
3 245r7 beurre 1.4 butter 1.2
data:
# Dataset A as the answerer imported it.
# NOTE(review): the attached `problems` attribute reports row 3 as having
# 4 columns where 3 were expected, and that row holds name = "red" and
# price = "wine" -- apparently "red wine" was split on import, which would also
# explain why `price` is character here rather than numeric. Confirm before
# reusing this data.
df_A <- structure(list(ids = c("1234", "245r7", "123984", "43498", "235897"
), name = c("bread", "butter", "red", "beer", "cream"), price = c("1.5",
"1.2", "wine", "1", "1.8")), class = c("spec_tbl_df", "tbl_df",
"tbl", "data.frame"), row.names = c(NA, -5L), problems = structure(list(
row = 3L, col = NA_character_, expected = "3 columns", actual = "4 columns",
file = "'test'"), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame")))
# Dataset B from the question; `ids` may hold several comma-separated ids.
# The third name is "bière" (its "è" had been mangled to "??" by an encoding
# error when the data was pasted).
df_B <- structure(list(ids = c("24908", "1234,089", "77289,43498", "245r7"
), name = c("lait", "pain", "bière", "beurre"), price = c(1,
1.7, 1.5, 1.4)), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -4L))

Related

Replacing column with another data frame based on name matching

Hi, I am a bit new, so I am not sure if I am doing this right, but I looked around on Stack Overflow and couldn't find code or advice that worked for my case.
I have a dataframe mainDF that looks like this:
Person
ABG
SEP
CLC
XSP
APP
WED
GSH
SP-1
2.1
3.0
1.3
1.8
1.4
2.5
1.4
SP-2
2.5
2.1
2.0
1.9
1.2
1.2
2.1
SP-3
2.3
3.1
2.5
1.5
1.1
2.6
2.1
I have another dataframe, TranslateDF that has the converting info for the abbreviated column names. And I want to replace the abbreviated names with the real names here:
Do note that the translating data frame may contain extraneous entries, or may be missing entries for some columns; any column of mainDF that has no full name in TranslateDF should be dropped from the data.
Abbreviated
Full Naming
ABG
All barbecue grill
SEP
shake eel peel
CLC
cold loin cake
XSP
xylophone spear pint
APP
apple pot pie
HUM
hall united meat
LPL
lending porkloin
Ideally, I would get the resulting dataframe as:
Person
All barbecue grill
shake eel peel
cold loin cake
xylophone spear pint
apple pot pie
SP-1
2.1
3.0
1.3
1.8
1.4
SP-2
2.5
2.1
2.0
1.9
1.2
SP-3
2.3
3.1
2.5
1.5
1.1
I would appreciate any help on this thank you!
You can pass a named vector to select() which will rename and select in one step. Wrapping with any_of() ensures it won't fail if any columns don't exist in the main data frame:
library(dplyr)
# Build a named lookup (names = full titles, values = abbreviations) and hand
# it to select(): the selected columns are renamed to the vector's names, and
# any_of() skips abbreviations that are not columns of df1.
select(
  df1,
  Person,
  any_of(setNames(df2[["Abbreviated"]], df2[["Full_Naming"]]))
)
# A tibble: 3 x 6
Person `All barbecue grill` `shake eel peel` `cold loin cake` `xylophone spear pint` `apple pot pie`
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 SP-1 2.1 3 1.3 1.8 1.4
2 SP-2 2.5 2.1 2 1.9 1.2
3 SP-3 2.3 3.1 2.5 1.5 1.1
Data:
df1 <- structure(list(Person = c("SP-1", "SP-2", "SP-3"), ABG = c(2.1,
2.5, 2.3), SEP = c(3, 2.1, 3.1), CLC = c(1.3, 2, 2.5), XSP = c(1.8,
1.9, 1.5), APP = c(1.4, 1.2, 1.1), WED = c(2.5, 1.2, 2.6), GSH = c(1.4,
2.1, 2.1)), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -3L), spec = structure(list(cols = list(
Person = structure(list(), class = c("collector_character",
"collector")), ABG = structure(list(), class = c("collector_double",
"collector")), SEP = structure(list(), class = c("collector_double",
"collector")), CLC = structure(list(), class = c("collector_double",
"collector")), XSP = structure(list(), class = c("collector_double",
"collector")), APP = structure(list(), class = c("collector_double",
"collector")), WED = structure(list(), class = c("collector_double",
"collector")), GSH = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1L), class = "col_spec"))
df2 <- structure(list(Abbreviated = c("ABG", "SEP", "CLC", "XSP", "APP",
"HUM", "LPL"), Full_Naming = c("All barbecue grill", "shake eel peel",
"cold loin cake", "xylophone spear pint", "apple pot pie", "hall united meat",
"lending porkloin")), class = "data.frame", row.names = c(NA,
-7L))
How about this:
mainDF <- structure(list(Person = c("SP-1", "SP-2", "SP-3"), ABG = c(2.1,
2.5, 2.3), SEP = c(3, 2.1, 3.1), CLC = c(1.3, 2, 2.5), XSP = c(1.8,
1.9, 1.5), APP = c(1.4, 1.2, 1.1), WED = c(2.5, 1.2, 2.6), GSH = c(1.4,
2.1, 2.1)), row.names = c(NA, 3L), class = "data.frame")
translateDF <- structure(list(Abbreviated = c("ABG", "SEP", "CLC", "XSP", "APP",
"HUM", "LPL"), `Full Naming` = c("All barbecue grill", "shake eel peel",
"cold loin cake", "xylophone spear pint", "apple pot pie", "hall united meat",
"lending porkloin")), row.names = c(NA, 7L), class = "data.frame")
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(tidyr)
mainDF %>%
pivot_longer(-Person,
names_to="Abbreviated",
values_to = "vals") %>%
left_join(translateDF) %>%
select(-Abbreviated) %>%
na.omit() %>%
pivot_wider(names_from=`Full Naming`, values_from="vals")
#> Joining, by = "Abbreviated"
#> # A tibble: 3 × 6
#> Person `All barbecue grill` `shake eel peel` `cold loin cake` `xylophone spe…`
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 SP-1 2.1 3 1.3 1.8
#> 2 SP-2 2.5 2.1 2 1.9
#> 3 SP-3 2.3 3.1 2.5 1.5
#> # … with 1 more variable: `apple pot pie` <dbl>
Created on 2022-04-24 by the reprex package (v2.0.1)
library(tidyverse)
mainDF %>%
rename_with(~str_replace_all(., set_names(TranslateDF[, 2], TranslateDF[, 1]))) %>%
select(Person, which(!(names(.) %in% names(mainDF))))
Person All barbecue grill shake eel peel cold loin cake xylophone spear pint apple pot pie
1 SP-1 2.1 3.0 1.3 1.8 1.4
2 SP-2 2.5 2.1 2.0 1.9 1.2
3 SP-3 2.3 3.1 2.5 1.5 1.1

Group_by multiple columns and summarise unique column

I have a dataset below
family
type
inc
name
AA
success
30000
Bill
AA
ERROR
15000
Bess
CC
Pending
22000
Art
CC
Pending
18000
Amy
AA
Serve not respnding d
25000
Paul
ZZ
Success
50000
Pat
ZZ
Processing
50000
Pat
I want to group by multiple columns
Here is my code below:
df<-df1%>%
group_by(Family , type)%>%
summarise(Transaction_count = n(), Face_value = sum(Inc))%>%
mutate(Pct = Transaction_count/sum(Transaction_count))
What I want is that wherever the same Family value is repeated, it should be shown only once,
as in the result in the picture below.
Thank you
You can use duplicated to replace the repeating values with blank value.
library(dplyr)
df %>%
  group_by(family, type) %>%
  summarise(Transaction_count = n(), Face_value = sum(inc)) %>%
  # Still grouped by family here, so Pct is each type's share within its family.
  mutate(Pct = Transaction_count / sum(Transaction_count)) %>%
  ungroup() %>%
  # Blank out every repeat of a family value so each family is shown once.
  mutate(family = replace(family, duplicated(family), ""))
# family type Transaction_count Face_value Pct
# <chr> <chr> <int> <int> <dbl>
#1 "AA" ERROR 1 15000 0.333
#2 "" Serve not respnding d 1 25000 0.333
#3 "" success 1 30000 0.333
#4 "CC" Pending 2 40000 1
#5 "ZZ" Processing 1 50000 0.5
#6 "" Success 1 50000 0.5
If you want data for displaying purpose you may look into packages like formattable, kable etc.
data
It is easier to help if you provide data in a reproducible format
df <- structure(list(family = c("AA", "AA", "CC", "CC", "AA", "ZZ",
"ZZ"), type = c("success", "ERROR", "Pending", "Pending", "Serve not respnding d",
"Success", "Processing"), inc = c(30000L, 15000L, 22000L, 18000L,
25000L, 50000L, 50000L), name = c("Bill", "Bess", "Art", "Amy",
"Paul", "Pat", "Pat")), row.names = c(NA, -7L), class = "data.frame")

Converting from long to wide, using pivot_wider() on two columns in R

I would like to transform my data from long format to wide by the values in two columns. How can I do this using tidyverse?
Updated dput
structure(list(Country = c("Algeria", "Benin", "Ghana", "Algeria",
"Benin", "Ghana", "Algeria", "Benin", "Ghana"
), Indicator = c("Indicator 1",
"Indicator 1",
"Indicator 1",
"Indicator 2",
"Indicator 2",
"Indicator 2",
"Indicator 3",
"Indicator 3",
"Indicator 3"
), Status = c("Actual", "Forecast", "Target", "Actual", "Forecast",
"Target", "Actual", "Forecast", "Target"), Value = c(34, 15, 5,
28, 5, 2, 43, 5,
1)), row.names
= c(NA, -9L), class = c("tbl_df", "tbl", "data.frame"))
Country Indicator Status Value
<chr> <chr> <chr> <dbl>
1 Algeria Indicator 1 Actual 34
2 Benin Indicator 1 Forecast 15
3 Ghana Indicator 1 Target 5
4 Algeria Indicator 2 Actual 28
5 Benin Indicator 2 Forecast 5
6 Ghana Indicator 2 Target 2
7 Algeria Indicator 3 Actual 43
8 Benin Indicator 3 Forecast 5
9 Ghana Indicator 3 Target 1
Expected output
Country Indicator1_Actual Indicator1_Forecast Indicator1_Target Indicator2_Actual
Algeria 34 15 5 28
etc
Appreciate any tips!
foo <- data %>% pivot_wider(names_from = c("Indicator","Status"), values_from = "Value")
works perfectly!
I think the mistake is in your pivot_wider() command
data %>% pivot_wider(names_from = Indicator, values_from = c(Indicator, Status))
I bet you can't use the same column for both names and values.
Try this code
# Both Indicator and Status feed the new column names; Value fills the cells.
data %>% pivot_wider(names_from = c(Indicator, Status), values_from = Value)
Explanation: Since you want the column names to be Indicator 1_Actual, you need both columns indicator and status going into your names_from
It would be helpful if you provided example data and expected output. But I tested this on my dummy data and it gives the expected output -
Data:
# A tibble: 4 x 4
a1 a2 a3 a4
<int> <int> <chr> <dbl>
1 1 5 s 10
2 2 4 s 20
3 3 3 n 30
4 4 2 n 40
Call : a %>% pivot_wider(names_from = c(a2, a3), values_from = a4)
Output :
# A tibble: 4 x 5
a1 `5_s` `4_s` `3_n` `2_n`
<int> <dbl> <dbl> <dbl> <dbl>
1 1 10 NA NA NA
2 2 NA 20 NA NA
3 3 NA NA 30 NA
4 4 NA NA NA 40
Data here if you want to reproduce
structure(list(a1 = 1:4, a2 = 5:2, a3 = c("s", "s", "n", "n"),
a4 = c(10, 20, 30, 40)), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"))
Edit : For the edited question after trying out the correct pivot_wider() command - It looks like your data could actually have duplicates, in which case the output you are seeing would make sense - I would suggest you try to figure out if your data actually has duplicates by using filter(Country == .., Indicator == .., Status == ..)
This can be achieved by calling both your columns to pivot wider in the names_from argument in pivot_wider().
data %>%
pivot_wider(names_from = c("Indicator","Status"),
values_from = "Value")
Result
Country `Indicator 1_Ac… `Indicator 1_Fo… `Indicator 1_Ta… `Indicator 2_Ac… `Indicator 2_Fo…
<chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Algeria 34 15 5 28 5

Perform a series of mutations to columns in dataframe

I am trying to replace some text in my dataframe (a few rows given below)
> dput(Henry.longer[1:4,])
structure(list(N_l = c(4, 4, 4, 4), UG = c("100", "100", "100",
"100"), S = c(12, 12, 12, 12), Sample = c(NA, NA, NA, NA), EQ = c("Henry",
"Henry", "Henry", "Henry"), DF = c(0.798545454545455, 0.798545454545455,
0.798545454545455, 0.798545454545455), meow = c("Henry.Exterior.single",
"Multi", "Henry.Exterior.multi", "Henry.Interior.single"), Girder = c("Henry.Exterior.single",
"Henry.Interior.multi", "Henry.Exterior.multi", "Interior")), row.names = c(NA,
-4L), groups = structure(list(UG = "100", S = 12, .rows = list(
1:4)), row.names = c(NA, -1L), class = c("tbl_df", "tbl",
"data.frame"), .drop = FALSE), class = c("grouped_df", "tbl_df",
"tbl", "data.frame"))
I try to mutate the dataframe as:
Henry.longer <- Henry.longer %>%
mutate(Loading = str_replace(meow, "Henry.Exterior.single", "Single")) %>%
mutate(Loading = str_replace(meow, "Henry.Exterior.multi", "Multi")) %>%
mutate(Loading = str_replace(meow, "Henry.Interior.single", "Single")) %>%
mutate(Loading = str_replace(meow, "Henry.Interior.multi", "Multi")) %>%
mutate(Girder = str_replace(meow, "Henry.Exterior.multi", "Exterior")) %>%
mutate(Girder = str_replace(meow, "Henry.Exterior.single", "Exterior")) %>%
mutate(Girder = str_replace(meow, "Henry.Interior.multi", "Interior")) %>%
mutate(Girder = str_replace(meow, "Henry.Interior.single", "Interior")) %>%
select(-meow)
But for some reason the replacements are not applied to all the rows, and I only get:
N_l UG S Sample EQ DF Loading Girder
1 4 100 12 NA Henry 0.799 Henry.Exterior.single Henry.Exterior.single
2 4 100 12 NA Henry 0.799 Multi Henry.Interior.multi
3 4 100 12 NA Henry 0.799 Henry.Exterior.multi Henry.Exterior.multi
4 4 100 12 NA Henry 0.799 Henry.Interior.single Interior
I think we can use lookup vectors for this, if it's easy or safer to use static string lookups:
# Static lookup tables: names are the raw labels, values their replacements.
tr_vec <- c(Henry.Exterior.single = "Single", Henry.Exterior.multi = "Multi", Henry.Interior.single = "Single", Henry.Interior.multi = "Multi")
tr_vec2 <- c(Henry.Exterior.multi = "Exterior", Henry.Exterior.single = "Exterior", Henry.Interior.multi = "Interior", Henry.Interior.single = "Interior")
Henry.longer %>%
  mutate(
    # The posted data has no `Loading` column yet; it is derived from `meow`.
    # Labels missing from the lookup come back as NA, so coalesce() falls back
    # to the original value (e.g. an entry that is already "Multi").
    Loading = coalesce(unname(tr_vec[meow]), meow),
    Girder = coalesce(unname(tr_vec2[Girder]), Girder)
  ) %>%
  # Drop the raw label column now that Loading has been derived from it.
  select(-meow)
# # A tibble: 4 x 8
# # Groups: UG, S [1]
# N_l UG S Sample EQ DF Loading Girder
# <dbl> <chr> <dbl> <lgl> <chr> <dbl> <chr> <chr>
# 1 4 100 12 NA Henry 0.799 Single Exterior
# 2 4 100 12 NA Henry 0.799 Multi Interior
# 3 4 100 12 NA Henry 0.799 Multi Exterior
# 4 4 100 12 NA Henry 0.799 Single Interior
The advantage of RonakShah's regex solution is that it can very easily handle many of the types of substrings you appear to need. Regexes do carry a little risk, though, in that they may mismatch (unlikely in that answer, but possible).
Instead of using str_replace I guess it would be easier to extract what you want using regex.
library(dplyr)
Henry.longer %>%
mutate(Loading = sub('.*\\.', '', meow),
Girder = sub('.*\\.(\\w+)\\..*', '\\1', meow))
where
Loading - removes everything until last dot
Girder - extracts a word between two dots.
Oh boy, looks like you've got some answers here already but here's a super-simple one that uses stringr::str_extract:
# Pull the loading type and the girder position straight out of `meow`;
# str_extract() yields NA where neither alternative occurs in the string.
Henry.longer <- Henry.longer %>%
  mutate(
    Loading = str_extract(meow, "single|multi"),
    Girder = str_extract(meow, "Interior|Exterior")
  )
It's worth noting that the demo data has a weird entry for meow in one column, so it didn't run perfectly on my machine:

Calculate year-to-year absolute change in R

Given a dataframe df as follows:
df <- structure(list(year = c(2001, 2002, 2003, 2004), `1` = c(22.0775,
24.2460714285714, 29.4039285714286, 27.7110714285714), `2` = c(27.2535714285714,
35.9996428571429, 26.39, 27.8557142857143), `3` = c(24.7710714285714,
25.4428571428571, 15.1142857142857, 19.9657142857143)), row.names = c(NA,
-4L), groups = structure(list(year = c(2001, 2002, 2003, 2004
), .rows = structure(list(1L, 2L, 3L, 4L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, 4L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
Out:
year 1 2 3
0 2001 22.07750 27.25357 24.77107
1 2002 24.24607 35.99964 25.44286
2 2003 29.40393 26.39000 15.11429
3 2004 27.71107 27.85571 19.96571
For column 1, 2 and 3, how could I calculate year-to-year absolute change?
The expected result will look like this:
year 1 2 3
0 2002 2.16857 8.74607 0.67179
1 2003 5.15786 9.60964 10.32857
2 2004 1.69286 1.46571 4.85142
The final objective is to compare the values of columns 1, 2 and 3 across all years and find the year and column with the largest change; in this example, it should be 2003 and column 3.
How could I do that in R? Thanks.
You can use :
library(dplyr)
data <- df %>% ungroup %>% summarise(across(-1, ~abs(diff(.))))
data
# A tibble: 3 x 3
# `1` `2` `3`
# <dbl> <dbl> <dbl>
#1 2.17 8.75 0.672
#2 5.16 9.61 10.3
#3 1.69 1.47 4.85
To get max change
mat <- which(data == max(data), arr.ind = TRUE)
mat
# row col
#[1,] 2 3
#Year name
df$year[mat[, 1] + 1]
#[1] 2003
#Column name
mat[, 2]
#col
# 3
You can try:
library(reshape2)
library(dplyr)
#Melt
Melted <- reshape2::melt(df,id.vars = 'year')
#Group
Melted %>% group_by(variable) %>% mutate(Diff=c(0,abs(diff(value)))) %>% ungroup() %>%
filter(Diff==max(Diff))
# A tibble: 1 x 4
year variable value Diff
<dbl> <fct> <dbl> <dbl>
1 2003 3 15.1 10.3
We can apply the diff on the entire dataset by converting the numeric columns of interest to matrix in base R
cbind(year = df$year[-1], abs(diff(as.matrix(df[-1]))))
# year 1 2 3
#[1,] 2002 2.168571 8.746071 0.6717857
#[2,] 2003 5.157857 9.609643 10.3285714
#[3,] 2004 1.692857 1.465714 4.8514286

Resources